Add RAGAS evaluation web console (FastAPI + vanilla JS)
- webapp/: FastAPI backend with runs/scenarios/evaluations API routers; services for run_reader, report_builder, scenario_scanner, task_manager (lazy ragas import — server boots even without ragas); Pydantic models - webapp/static/: single-page console (layout A: left-nav + main area); report detail with metric cards, Chart.js distribution histogram, grouping table, lowest-score sample review; trigger evaluation + log polling - webmain.py: uvicorn entry point (alongside existing main.py CLI) - start.bat: Windows one-click launcher with env checks and auto-browser open - rag_eval/datasets/: implement missing loader + normalizer modules (load_dataset_records, normalize_records) required by evaluator - scripts/seed_sample_run.py: generate realistic demo run artifacts - .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
This commit is contained in:
236
scripts/seed_sample_run.py
Normal file
236
scripts/seed_sample_run.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""Generate a realistic sample evaluation run so the console has demo data.
|
||||
|
||||
This writes the standard run artifacts (metadata.json, scores.csv, invalid.csv, summary.md,
|
||||
scenario.snapshot.yaml) under outputs/, exactly mirroring what the reporting
|
||||
layer produces, but without needing ragas or any network calls. It lets the
|
||||
report board render immediately for demos and local development.
|
||||
|
||||
Usage:
|
||||
python scripts/seed_sample_run.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Two levels up from this file: the directory that contains scripts/.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Identity of the demo run; together these select the output directory.
SCENARIO_NAME = "kba-knowledge-base-offline-baseline"
RUN_ID = "2026-06-15T08-30-00+00-00"

# Model names recorded in the generated artifacts (no model is actually called).
JUDGE_MODEL = "deepseek-distill-qwen-32b"
EMBEDDING_MODEL = "text-embedding-v3"

# Metric columns, in the order they appear in scores.csv and summary.md.
METRICS = [
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
]
|
||||
|
||||
# Each row mirrors a scores.csv record: sample fields + metric scores + metadata.
|
||||
# Scores are hand-tuned to exercise the full UI (greens, yellows, reds, a long
|
||||
# tail in the distribution, and clear weak groups by difficulty).
|
||||
SAMPLES = [
    {
        "sample_id": "kba-001",
        "language": "zh",
        "difficulty": "easy",
        "question_type": "fact",
        "question": "员工入职满3年可享受多少天年休假?",
        "contexts": ["员工入司满1年不满10年的,年休假5天。", "年休假在每年1月1日起可申请。"],
        "answer": "根据规定,入职满3年的员工可享受5天年休假。",
        "ground_truth": "员工入司满1年不满10年的,年休假5天。",
        "faithfulness": 0.98,
        "answer_relevancy": 0.95,
        "context_recall": 0.97,
        "context_precision": 0.92,
    },
    {
        "sample_id": "kba-002",
        "language": "zh",
        "difficulty": "easy",
        "question_type": "fact",
        "question": "公司报销差旅费的截止提交时间是什么时候?",
        "contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"],
        "answer": "差旅费需在出差结束后30天内提交报销。",
        "ground_truth": "差旅费报销须在出差结束后30天内提交。",
        "faithfulness": 0.96,
        "answer_relevancy": 0.93,
        "context_recall": 0.90,
        "context_precision": 0.88,
    },
    {
        "sample_id": "kba-003",
        "language": "zh",
        "difficulty": "medium",
        "question_type": "procedure",
        "question": "申请远程办公需要经过哪些审批流程?",
        "contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"],
        "answer": "需先由直属主管审批,再提交人力资源部备案,每月不超过8天。",
        "ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案,每月上限8天。",
        "faithfulness": 0.91,
        "answer_relevancy": 0.88,
        "context_recall": 0.85,
        "context_precision": 0.79,
    },
    {
        "sample_id": "kba-004",
        "language": "en",
        "difficulty": "medium",
        "question_type": "fact",
        "question": "How many days of paternity leave are employees entitled to?",
        "contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."],
        "answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.",
        "ground_truth": "Employees are entitled to 15 days of paternity leave.",
        "faithfulness": 0.89,
        "answer_relevancy": 0.86,
        "context_recall": 0.82,
        "context_precision": 0.74,
    },
    {
        "sample_id": "kba-005",
        "language": "zh",
        "difficulty": "medium",
        "question_type": "comparison",
        "question": "正式员工与试用期员工在医疗保险待遇上有何区别?",
        "contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"],
        "answer": "正式员工额外享受补充医疗保险,试用期员工仅有基础医疗保险。",
        "ground_truth": "正式员工在基础医疗保险外另享补充医疗保险,试用期员工仅享基础医疗保险。",
        "faithfulness": 0.84,
        "answer_relevancy": 0.83,
        "context_recall": 0.78,
        "context_precision": 0.71,
    },
    {
        "sample_id": "kba-006",
        "language": "zh",
        "difficulty": "hard",
        "question_type": "summary",
        "question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。",
        "contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"],
        "answer": "第三方共享需签保密协议,敏感数据须经数据保护官批准,记录留存3年。",
        "ground_truth": "向第三方共享数据须签署保密协议,敏感数据共享须经数据保护官批准,且共享记录至少留存3年。",
        "faithfulness": 0.79,
        "answer_relevancy": 0.81,
        "context_recall": 0.70,
        "context_precision": 0.62,
    },
    {
        "sample_id": "kba-007",
        "language": "zh",
        "difficulty": "hard",
        "question_type": "procedure",
        "question": "跨部门项目预算超支时的审批升级路径是怎样的?",
        "contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"],
        "answer": "超支10%以内项目经理批,10%-20%需总监批,超20%需财务委员会审批。",
        "ground_truth": "超支10%以内由项目经理审批,10%-20%由部门总监审批,超过20%须提交财务委员会审批。",
        "faithfulness": 0.58,
        "answer_relevancy": 0.72,
        "context_recall": 0.55,
        "context_precision": 0.48,
    },
    {
        "sample_id": "kba-008",
        "language": "zh",
        "difficulty": "hard",
        "question_type": "fact",
        "question": "员工持股计划的最低锁定期是多少年?",
        "contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"],
        "answer": "员工持股计划的最低锁定期为3年。",
        "ground_truth": "员工持股计划的最低锁定期为4年。",
        "faithfulness": 0.22,
        "answer_relevancy": 0.65,
        "context_recall": 0.18,
        "context_precision": 0.30,
    },
    {
        "sample_id": "kba-009",
        "language": "en",
        "difficulty": "hard",
        "question_type": "comparison",
        "question": "What is the difference in notice period between voluntary and involuntary termination?",
        "contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."],
        "answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.",
        "ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.",
        "faithfulness": 0.35,
        "answer_relevancy": 0.70,
        "context_recall": 0.40,
        "context_precision": 0.33,
    },
    {
        "sample_id": "kba-010",
        "language": "zh",
        "difficulty": "medium",
        "question_type": "fact",
        "question": "公司规定的标准工作时间是每周多少小时?",
        "contexts": ["标准工作时间为每周40小时。", "加班需事先申请。"],
        "answer": "公司标准工作时间为每周40小时。",
        "ground_truth": "公司标准工作时间为每周40小时。",
        "faithfulness": 0.99,
        "answer_relevancy": 0.96,
        "context_recall": 0.95,
        "context_precision": 0.90,
    },
]
|
||||
|
||||
# Two samples that failed normalization, to exercise the invalid count display.
|
||||
INVALID_SAMPLES = [
    {
        "sample_id": "kba-011",
        "error": "missing ground_truth",
        "question": "公司年会在什么时候举办?",
    },
    {
        "sample_id": "kba-012",
        "error": "empty contexts after retrieval",
        "question": "停车场如何申请月卡?",
    },
]
|
||||
|
||||
|
||||
def _output_dir() -> Path:
    """Build the path of the run directory that receives the sample artifacts.

    The path is ``REPO_ROOT/outputs/<SCENARIO_NAME>/<RUN_ID>``. Nothing is
    created on disk here; the caller is responsible for ``mkdir``.
    """
    return REPO_ROOT.joinpath("outputs", SCENARIO_NAME, RUN_ID)
|
||||
|
||||
|
||||
def _write_scores_csv(path: Path) -> None:
    """Write scores.csv with sample fields, metric scores, and metadata columns.

    One row is written per entry in ``SAMPLES``. The ``contexts`` list is
    stored as a JSON string, the ``error`` column is left empty, and the
    run-level columns (scenario, models, run id) repeat on every row.

    Args:
        path: Destination file; created or overwritten as UTF-8.
    """
    fieldnames = [
        "sample_id", "question", "contexts", "answer", "ground_truth",
        "scenario", "language", "difficulty", "question_type",
        *METRICS, "error", "judge_model", "embedding_model", "run_id",
    ]
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for sample in SAMPLES:
            row = {
                "sample_id": sample["sample_id"],
                "question": sample["question"],
                # Serialize contexts as a JSON list, matching engine CSV output.
                "contexts": json.dumps(sample["contexts"], ensure_ascii=False),
                "answer": sample["answer"],
                "ground_truth": sample["ground_truth"],
                "scenario": SCENARIO_NAME,
                "language": sample["language"],
                "difficulty": sample["difficulty"],
                "question_type": sample["question_type"],
                "error": "",
                "judge_model": JUDGE_MODEL,
                "embedding_model": EMBEDDING_MODEL,
                # The run identifier, not the scenario name, so each row agrees
                # with the run_id recorded in metadata.json and summary.md.
                "run_id": RUN_ID,
            }
            for metric in METRICS:
                row[metric] = sample[metric]
            writer.writerow(row)
|
||||
|
||||
|
||||
def _write_invalid_csv(path: Path) -> None:
    """Dump the unscored samples from ``INVALID_SAMPLES`` into invalid.csv.

    The file has a header row followed by one row per invalid sample, with
    the columns ``sample_id``, ``error`` and ``question``.

    Args:
        path: Destination file; created or overwritten as UTF-8.
    """
    columns = ["sample_id", "error", "question"]
    with path.open("w", encoding="utf-8", newline="") as out_file:
        invalid_writer = csv.DictWriter(out_file, fieldnames=columns)
        invalid_writer.writeheader()
        for record in INVALID_SAMPLES:
            invalid_writer.writerow(record)
|
||||
|
||||
|
||||
def _metric_mean(metric: str) -> float:
    """Return the average of ``metric`` over ``SAMPLES``, rounded to 4 decimals.

    Args:
        metric: Key of the score to average; must be present in every sample.
    """
    values = [entry[metric] for entry in SAMPLES]
    average = sum(values) / len(values)
    return round(average, 4)
|
||||
|
||||
|
||||
def _write_metadata(path: Path) -> None:
    """Serialize the run-level metadata of the demo run to metadata.json.

    The document records the run and scenario identity, the model names,
    fixed start/finish timestamps, the dataset path, and the valid/invalid
    sample counts taken from ``SAMPLES`` and ``INVALID_SAMPLES``.

    Args:
        path: Destination file; created or overwritten as UTF-8.
    """
    valid_count = len(SAMPLES)
    invalid_count = len(INVALID_SAMPLES)

    # Key order is kept as-is because it determines the layout of the JSON file.
    payload = {
        "run_id": RUN_ID,
        "scenario_name": SCENARIO_NAME,
        "mode": "offline",
        "judge_model": JUDGE_MODEL,
        "embedding_model": EMBEDDING_MODEL,
        "started_at": "2026-06-15T08:29:12+00:00",
        "finished_at": "2026-06-15T08:31:45+00:00",
        "dataset": "datasets/normalized/kba_knowledge_base_baseline.csv",
        "valid_samples": valid_count,
        "invalid_samples": invalid_count,
    }

    # ensure_ascii=False keeps any non-ASCII text readable in the file.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def _write_summary(path: Path) -> None:
    """Render summary.md: run facts followed by the mean of every metric.

    The file is a Markdown document with the scenario name as its title, a
    bullet list of run facts (ids, mode, sample counts, judge model), and a
    "Metric Means" section with one bullet per entry in ``METRICS``.

    Args:
        path: Destination file; created or overwritten as UTF-8.
    """
    valid_count = len(SAMPLES)
    invalid_count = len(INVALID_SAMPLES)
    total_count = valid_count + invalid_count

    header = [
        f"# {SCENARIO_NAME}",
        "",
        f"- run_id: `{RUN_ID}`",
        "- mode: `offline`",
        f"- total_samples: `{total_count}`",
        f"- valid_samples: `{valid_count}`",
        f"- invalid_samples: `{invalid_count}`",
        f"- judge_model: `{JUDGE_MODEL}`",
        "",
        "## Metric Means",
        "",
    ]
    metric_lines = [
        f"- {metric}: `{_metric_mean(metric):.4f}`" for metric in METRICS
    ]

    # The document ends with a single trailing newline.
    document = "\n".join(header + metric_lines) + "\n"
    path.write_text(document, encoding="utf-8")
|
||||
|
||||
|
||||
def _write_scenario_snapshot(path: Path) -> None:
    """Emit scenario.snapshot.yaml describing the scenario used for the run.

    The snapshot carries the scenario name, mode, model names, and the metric
    list, which is what lets the reader resolve the metrics for this run.

    Args:
        path: Destination file; created or overwritten as UTF-8.
    """
    # Imported here rather than at module level, so the yaml package is only
    # required when this writer actually runs.
    import yaml

    # sort_keys=False below keeps this insertion order in the emitted YAML.
    scenario = {
        "scenario_name": SCENARIO_NAME,
        "mode": "offline",
        "judge_model": JUDGE_MODEL,
        "embedding_model": EMBEDDING_MODEL,
        "metrics": METRICS,
    }
    rendered = yaml.safe_dump(scenario, sort_keys=False, allow_unicode=True)
    path.write_text(rendered, encoding="utf-8")
|
||||
|
||||
|
||||
def main() -> None:
    """Create the run directory (if needed) and write every sample artifact.

    An existing run directory is reused, and files already in it are
    overwritten. The target path and a hint for starting the console are
    printed once all files have been written.
    """
    target_dir = _output_dir()
    target_dir.mkdir(parents=True, exist_ok=True)

    # (file name, writer) pairs, executed in this order.
    artifact_writers = (
        ("scores.csv", _write_scores_csv),
        ("invalid.csv", _write_invalid_csv),
        ("metadata.json", _write_metadata),
        ("summary.md", _write_summary),
        ("scenario.snapshot.yaml", _write_scenario_snapshot),
    )
    for file_name, write_artifact in artifact_writers:
        write_artifact(target_dir / file_name)

    print(f"Sample run written to: {target_dir}")
    print("Start the console with: python webmain.py")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user