Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers; services for run_reader, report_builder, scenario_scanner, task_manager (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area); report detail with metric cards, Chart.js distribution histogram, grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
2026-06-15 15:53:57 +08:00
parent 9cbdc1d95d
commit e89695e490
26 changed files with 2496 additions and 2 deletions
--- a/scripts/seed_sample_run.py
+++ b/scripts/seed_sample_run.py
@@ -0,0 +1,236 @@
+"""Generate a realistic sample evaluation run so the console has demo data.
+
+This writes the standard run artifacts (metadata.json, scores.csv, invalid.csv,
+summary.md, scenario.snapshot.yaml) under outputs/, exactly mirroring what the
+reporting layer produces, but without needing ragas or any network calls. It
+lets the report board render immediately for demos and local development.
+
+Usage:
+    python scripts/seed_sample_run.py
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]  # parent of this script's directory
+
+SCENARIO_NAME = "kba-knowledge-base-offline-baseline"  # also the directory name under outputs/
+RUN_ID = "2026-06-15T08-30-00+00-00"  # run directory name; presumably a timestamp with ':' swapped for '-' (confirm)
+JUDGE_MODEL = "deepseek-distill-qwen-32b"  # value of every judge_model field written below
+EMBEDDING_MODEL = "text-embedding-v3"  # value of every embedding_model field written below
+METRICS = ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]  # score columns, in output order
+
+# Each entry holds the per-sample scores.csv fields: id, grouping fields, the
+# QA text, and one score per METRICS name. Scores are hand-tuned to exercise the
+# full UI (greens, yellows, reds, a long tail, and weak groups by difficulty).
+SAMPLES = [
+    {
+        "sample_id": "kba-001", "language": "zh", "difficulty": "easy", "question_type": "fact",
+        "question": "员工入职满3年可享受多少天年休假？",
+        "contexts": ["员工入司满1年不满10年的，年休假5天。", "年休假在每年1月1日起可申请。"],
+        "answer": "根据规定，入职满3年的员工可享受5天年休假。",
+        "ground_truth": "员工入司满1年不满10年的，年休假5天。",
+        "faithfulness": 0.98, "answer_relevancy": 0.95, "context_recall": 0.97, "context_precision": 0.92,
+    },
+    {
+        "sample_id": "kba-002", "language": "zh", "difficulty": "easy", "question_type": "fact",
+        "question": "公司报销差旅费的截止提交时间是什么时候？",
+        "contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"],
+        "answer": "差旅费需在出差结束后30天内提交报销。",
+        "ground_truth": "差旅费报销须在出差结束后30天内提交。",
+        "faithfulness": 0.96, "answer_relevancy": 0.93, "context_recall": 0.90, "context_precision": 0.88,
+    },
+    {
+        "sample_id": "kba-003", "language": "zh", "difficulty": "medium", "question_type": "procedure",
+        "question": "申请远程办公需要经过哪些审批流程？",
+        "contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"],
+        "answer": "需先由直属主管审批，再提交人力资源部备案，每月不超过8天。",
+        "ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案，每月上限8天。",
+        "faithfulness": 0.91, "answer_relevancy": 0.88, "context_recall": 0.85, "context_precision": 0.79,
+    },
+    {
+        "sample_id": "kba-004", "language": "en", "difficulty": "medium", "question_type": "fact",
+        "question": "How many days of paternity leave are employees entitled to?",
+        "contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."],
+        "answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.",
+        "ground_truth": "Employees are entitled to 15 days of paternity leave.",
+        "faithfulness": 0.89, "answer_relevancy": 0.86, "context_recall": 0.82, "context_precision": 0.74,
+    },
+    {
+        "sample_id": "kba-005", "language": "zh", "difficulty": "medium", "question_type": "comparison",
+        "question": "正式员工与试用期员工在医疗保险待遇上有何区别？",
+        "contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"],
+        "answer": "正式员工额外享受补充医疗保险，试用期员工仅有基础医疗保险。",
+        "ground_truth": "正式员工在基础医疗保险外另享补充医疗保险，试用期员工仅享基础医疗保险。",
+        "faithfulness": 0.84, "answer_relevancy": 0.83, "context_recall": 0.78, "context_precision": 0.71,
+    },
+    {
+        "sample_id": "kba-006", "language": "zh", "difficulty": "hard", "question_type": "summary",
+        "question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。",
+        "contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"],
+        "answer": "第三方共享需签保密协议，敏感数据须经数据保护官批准，记录留存3年。",
+        "ground_truth": "向第三方共享数据须签署保密协议，敏感数据共享须经数据保护官批准，且共享记录至少留存3年。",
+        "faithfulness": 0.79, "answer_relevancy": 0.81, "context_recall": 0.70, "context_precision": 0.62,
+    },
+    {
+        "sample_id": "kba-007", "language": "zh", "difficulty": "hard", "question_type": "procedure",
+        "question": "跨部门项目预算超支时的审批升级路径是怎样的？",
+        "contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"],
+        "answer": "超支10%以内项目经理批，10%-20%需总监批，超20%需财务委员会审批。",
+        "ground_truth": "超支10%以内由项目经理审批，10%-20%由部门总监审批，超过20%须提交财务委员会审批。",
+        "faithfulness": 0.58, "answer_relevancy": 0.72, "context_recall": 0.55, "context_precision": 0.48,  # answer's >20% tier is absent from contexts
+    },
+    {
+        "sample_id": "kba-008", "language": "zh", "difficulty": "hard", "question_type": "fact",
+        "question": "员工持股计划的最低锁定期是多少年？",
+        "contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"],
+        "answer": "员工持股计划的最低锁定期为3年。",
+        "ground_truth": "员工持股计划的最低锁定期为4年。",
+        "faithfulness": 0.22, "answer_relevancy": 0.65, "context_recall": 0.18, "context_precision": 0.30,  # off-topic contexts; answer (3 years) contradicts ground_truth (4 years)
+    },
+    {
+        "sample_id": "kba-009", "language": "en", "difficulty": "hard", "question_type": "comparison",
+        "question": "What is the difference in notice period between voluntary and involuntary termination?",
+        "contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."],
+        "answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.",
+        "ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.",
+        "faithfulness": 0.35, "answer_relevancy": 0.70, "context_recall": 0.40, "context_precision": 0.33,  # answer's 60-day figure is absent from contexts
+    },
+    {
+        "sample_id": "kba-010", "language": "zh", "difficulty": "medium", "question_type": "fact",
+        "question": "公司规定的标准工作时间是每周多少小时？",
+        "contexts": ["标准工作时间为每周40小时。", "加班需事先申请。"],
+        "answer": "公司标准工作时间为每周40小时。",
+        "ground_truth": "公司标准工作时间为每周40小时。",
+        "faithfulness": 0.99, "answer_relevancy": 0.96, "context_recall": 0.95, "context_precision": 0.90,
+    },
+]
+
+# Two unscored records with an error reason; they feed invalid.csv and the invalid_samples counts.
+INVALID_SAMPLES = [
+    {"sample_id": "kba-011", "error": "missing ground_truth", "question": "公司年会在什么时候举办？"},
+    {"sample_id": "kba-012", "error": "empty contexts after retrieval", "question": "停车场如何申请月卡？"},
+]
+
+
+def _output_dir() -> Path:
+    """Return the run directory where sample artifacts are written."""
+    return REPO_ROOT / "outputs" / SCENARIO_NAME / RUN_ID
+
+
+def _write_scores_csv(path: Path) -> None:
+    """Write scores.csv with sample fields, metric scores, and metadata columns."""
+    fieldnames = [
+        "sample_id", "question", "contexts", "answer", "ground_truth",
+        "scenario", "language", "difficulty", "question_type",
+        *METRICS, "error", "judge_model", "embedding_model", "run_id",
+    ]
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+        for sample in SAMPLES:
+            row = {
+                "sample_id": sample["sample_id"],
+                "question": sample["question"],
+                # Serialize contexts as a JSON list, matching engine CSV output.
+                "contexts": json.dumps(sample["contexts"], ensure_ascii=False),
+                "answer": sample["answer"],
+                "ground_truth": sample["ground_truth"],
+                "scenario": SCENARIO_NAME,
+                "language": sample["language"],
+                "difficulty": sample["difficulty"],
+                "question_type": sample["question_type"],
+                "error": "",
+                "judge_model": JUDGE_MODEL,
+                "embedding_model": EMBEDDING_MODEL,
+                "run_id": SCENARIO_NAME,
+            }
+            for metric in METRICS:
+                row[metric] = sample[metric]
+            writer.writerow(row)
+
+
+def _write_invalid_csv(path: Path) -> None:
+    """Write invalid.csv with the small set of unscored samples."""
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=["sample_id", "error", "question"])
+        writer.writeheader()
+        writer.writerows(INVALID_SAMPLES)
+
+
+def _metric_mean(metric: str) -> float:
+    """Compute the mean of one metric across the valid samples."""
+    return round(sum(sample[metric] for sample in SAMPLES) / len(SAMPLES), 4)
+
+
+def _write_metadata(path: Path) -> None:
+    """Write metadata.json mirroring the reporting layer's schema."""
+    metadata = {
+        "run_id": RUN_ID,
+        "scenario_name": SCENARIO_NAME,
+        "mode": "offline",
+        "judge_model": JUDGE_MODEL,
+        "embedding_model": EMBEDDING_MODEL,
+        "started_at": "2026-06-15T08:29:12+00:00",
+        "finished_at": "2026-06-15T08:31:45+00:00",
+        "dataset": "datasets/normalized/kba_knowledge_base_baseline.csv",
+        "valid_samples": len(SAMPLES),
+        "invalid_samples": len(INVALID_SAMPLES),
+    }
+    path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def _write_summary(path: Path) -> None:
+    """Write a human-readable summary.md echoing the metric means."""
+    lines = [
+        f"# {SCENARIO_NAME}",
+        "",
+        f"- run_id: `{RUN_ID}`",
+        "- mode: `offline`",
+        f"- total_samples: `{len(SAMPLES) + len(INVALID_SAMPLES)}`",
+        f"- valid_samples: `{len(SAMPLES)}`",
+        f"- invalid_samples: `{len(INVALID_SAMPLES)}`",
+        f"- judge_model: `{JUDGE_MODEL}`",
+        "",
+        "## Metric Means",
+        "",
+    ]
+    for metric in METRICS:
+        lines.append(f"- {metric}: `{_metric_mean(metric):.4f}`")
+    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+def _write_scenario_snapshot(path: Path) -> None:
+    """Write scenario.snapshot.yaml so the reader resolves the metric list."""
+    import yaml
+
+    snapshot = {
+        "scenario_name": SCENARIO_NAME,
+        "mode": "offline",
+        "judge_model": JUDGE_MODEL,
+        "embedding_model": EMBEDDING_MODEL,
+        "metrics": METRICS,
+    }
+    path.write_text(yaml.safe_dump(snapshot, sort_keys=False, allow_unicode=True), encoding="utf-8")
+
+
+def main() -> None:
+    """Write all sample run artifacts into a fresh run directory."""
+    run_dir = _output_dir()
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    _write_scores_csv(run_dir / "scores.csv")
+    _write_invalid_csv(run_dir / "invalid.csv")
+    _write_metadata(run_dir / "metadata.json")
+    _write_summary(run_dir / "summary.md")
+    _write_scenario_snapshot(run_dir / "scenario.snapshot.yaml")
+
+    print(f"Sample run written to: {run_dir}")
+    print("Start the console with: python webmain.py")
+
+
+if __name__ == "__main__":
+    main()