Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 15:53:57 +08:00
parent 9cbdc1d95d
commit e89695e490
26 changed files with 2496 additions and 2 deletions

236
scripts/seed_sample_run.py Normal file
View File

@@ -0,0 +1,236 @@
"""Generate a realistic sample evaluation run so the console has demo data.
This writes the standard run artifacts (metadata.json, scores.csv, summary.md,
scenario.snapshot.yaml) plus an invalid.csv of unscored samples under
outputs/<scenario name>/<run id>/, exactly mirroring what the reporting
layer produces, but without needing ragas or any network calls. It lets the
report board render immediately for demos and local development.
Usage:
python scripts/seed_sample_run.py
"""
from __future__ import annotations
import csv
import json
from pathlib import Path
# This script lives in scripts/, so parents[0] is scripts/ and parents[1] is
# the repository root.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Scenario name and run id of the demo run; both are used as path components
# of the output directory and are recorded in the generated artifacts.
SCENARIO_NAME = "kba-knowledge-base-offline-baseline"
RUN_ID = "2026-06-15T08-30-00+00-00"
# Model names recorded in the artifacts. This script only writes them out; it
# never calls a model.
JUDGE_MODEL = "deepseek-distill-qwen-32b"
EMBEDDING_MODEL = "text-embedding-v3"
# Metric names. Each SAMPLES entry carries one score under each of these keys,
# and each becomes a scores.csv column and a line in summary.md.
METRICS = ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]
# Scored demo samples, one dict per scores.csv row. Every dict holds:
#   - sample_id, language, difficulty, question_type: identifying/grouping fields
#   - question, contexts (a list of strings), answer, ground_truth: sample content
#   - one float score per name in METRICS
# Scores are hand-tuned to exercise the full UI (greens, yellows, reds, a long
# tail in the distribution, and clear weak groups by difficulty).
SAMPLES = [
{
"sample_id": "kba-001", "language": "zh", "difficulty": "easy", "question_type": "fact",
"question": "员工入职满3年可享受多少天年休假",
"contexts": ["员工入司满1年不满10年的年休假5天。", "年休假在每年1月1日起可申请。"],
"answer": "根据规定入职满3年的员工可享受5天年休假。",
"ground_truth": "员工入司满1年不满10年的年休假5天。",
"faithfulness": 0.98, "answer_relevancy": 0.95, "context_recall": 0.97, "context_precision": 0.92,
},
{
"sample_id": "kba-002", "language": "zh", "difficulty": "easy", "question_type": "fact",
"question": "公司报销差旅费的截止提交时间是什么时候?",
"contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"],
"answer": "差旅费需在出差结束后30天内提交报销。",
"ground_truth": "差旅费报销须在出差结束后30天内提交。",
"faithfulness": 0.96, "answer_relevancy": 0.93, "context_recall": 0.90, "context_precision": 0.88,
},
{
"sample_id": "kba-003", "language": "zh", "difficulty": "medium", "question_type": "procedure",
"question": "申请远程办公需要经过哪些审批流程?",
"contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"],
"answer": "需先由直属主管审批再提交人力资源部备案每月不超过8天。",
"ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案每月上限8天。",
"faithfulness": 0.91, "answer_relevancy": 0.88, "context_recall": 0.85, "context_precision": 0.79,
},
{
"sample_id": "kba-004", "language": "en", "difficulty": "medium", "question_type": "fact",
"question": "How many days of paternity leave are employees entitled to?",
"contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."],
"answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.",
"ground_truth": "Employees are entitled to 15 days of paternity leave.",
"faithfulness": 0.89, "answer_relevancy": 0.86, "context_recall": 0.82, "context_precision": 0.74,
},
{
"sample_id": "kba-005", "language": "zh", "difficulty": "medium", "question_type": "comparison",
"question": "正式员工与试用期员工在医疗保险待遇上有何区别?",
"contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"],
"answer": "正式员工额外享受补充医疗保险,试用期员工仅有基础医疗保险。",
"ground_truth": "正式员工在基础医疗保险外另享补充医疗保险,试用期员工仅享基础医疗保险。",
"faithfulness": 0.84, "answer_relevancy": 0.83, "context_recall": 0.78, "context_precision": 0.71,
},
{
"sample_id": "kba-006", "language": "zh", "difficulty": "hard", "question_type": "summary",
"question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。",
"contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"],
"answer": "第三方共享需签保密协议敏感数据须经数据保护官批准记录留存3年。",
"ground_truth": "向第三方共享数据须签署保密协议敏感数据共享须经数据保护官批准且共享记录至少留存3年。",
"faithfulness": 0.79, "answer_relevancy": 0.81, "context_recall": 0.70, "context_precision": 0.62,
},
{
"sample_id": "kba-007", "language": "zh", "difficulty": "hard", "question_type": "procedure",
"question": "跨部门项目预算超支时的审批升级路径是怎样的?",
"contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"],
"answer": "超支10%以内项目经理批10%-20%需总监批超20%需财务委员会审批。",
"ground_truth": "超支10%以内由项目经理审批10%-20%由部门总监审批超过20%须提交财务委员会审批。",
"faithfulness": 0.58, "answer_relevancy": 0.72, "context_recall": 0.55, "context_precision": 0.48,
},
{
"sample_id": "kba-008", "language": "zh", "difficulty": "hard", "question_type": "fact",
"question": "员工持股计划的最低锁定期是多少年?",
"contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"],
"answer": "员工持股计划的最低锁定期为3年。",
"ground_truth": "员工持股计划的最低锁定期为4年。",
"faithfulness": 0.22, "answer_relevancy": 0.65, "context_recall": 0.18, "context_precision": 0.30,
},
{
"sample_id": "kba-009", "language": "en", "difficulty": "hard", "question_type": "comparison",
"question": "What is the difference in notice period between voluntary and involuntary termination?",
"contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."],
"answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.",
"ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.",
"faithfulness": 0.35, "answer_relevancy": 0.70, "context_recall": 0.40, "context_precision": 0.33,
},
{
"sample_id": "kba-010", "language": "zh", "difficulty": "medium", "question_type": "fact",
"question": "公司规定的标准工作时间是每周多少小时?",
"contexts": ["标准工作时间为每周40小时。", "加班需事先申请。"],
"answer": "公司标准工作时间为每周40小时。",
"ground_truth": "公司标准工作时间为每周40小时。",
"faithfulness": 0.99, "answer_relevancy": 0.96, "context_recall": 0.95, "context_precision": 0.90,
},
]
# Two unscored samples, each carrying the error that excluded it. They are
# written to invalid.csv and counted as invalid_samples in metadata.json and
# summary.md, to exercise the invalid count display.
INVALID_SAMPLES = [
{"sample_id": "kba-011", "error": "missing ground_truth", "question": "公司年会在什么时候举办?"},
{"sample_id": "kba-012", "error": "empty contexts after retrieval", "question": "停车场如何申请月卡?"},
]
def _output_dir() -> Path:
    """Build the path of the directory that receives the sample artifacts.

    Returns:
        ``<repo root>/outputs/<scenario name>/<run id>``. The directory is not
        created here.
    """
    return REPO_ROOT.joinpath("outputs", SCENARIO_NAME, RUN_ID)
def _write_scores_csv(path: Path) -> None:
    """Write scores.csv with sample fields, metric scores, and metadata columns.

    One row is written per entry in ``SAMPLES``. The per-run columns
    (``scenario``, ``judge_model``, ``embedding_model``, ``run_id``) are filled
    from the module constants, and ``error`` is always empty because every
    entry in ``SAMPLES`` carries scores.

    Args:
        path: Destination file; it is created or overwritten.

    Raises:
        KeyError: If a sample lacks one of the expected fields or metrics.
    """
    fieldnames = [
        "sample_id", "question", "contexts", "answer", "ground_truth",
        "scenario", "language", "difficulty", "question_type",
        *METRICS, "error", "judge_model", "embedding_model", "run_id",
    ]
    # newline="" leaves line-ending handling to the csv module.
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for sample in SAMPLES:
            row = {
                "sample_id": sample["sample_id"],
                "question": sample["question"],
                # contexts is a list, so it is stored as a JSON array string
                # inside its single CSV cell (non-ASCII text kept readable).
                "contexts": json.dumps(sample["contexts"], ensure_ascii=False),
                "answer": sample["answer"],
                "ground_truth": sample["ground_truth"],
                "scenario": SCENARIO_NAME,
                "language": sample["language"],
                "difficulty": sample["difficulty"],
                "question_type": sample["question_type"],
                "error": "",
                "judge_model": JUDGE_MODEL,
                "embedding_model": EMBEDDING_MODEL,
                # The run identifier, consistent with metadata.json; this was
                # previously filled with the scenario name by mistake.
                "run_id": RUN_ID,
            }
            for metric in METRICS:
                row[metric] = sample[metric]
            writer.writerow(row)
def _write_invalid_csv(path: Path) -> None:
    """Dump the unscored samples from ``INVALID_SAMPLES`` into invalid.csv.

    The file has a header row followed by one row per invalid sample, with
    the columns ``sample_id``, ``error`` and ``question``.

    Args:
        path: Destination file; it is created or overwritten.
    """
    columns = ["sample_id", "error", "question"]
    # newline="" leaves line-ending handling to the csv module.
    with path.open("w", encoding="utf-8", newline="") as out:
        invalid_writer = csv.DictWriter(out, fieldnames=columns)
        invalid_writer.writeheader()
        for record in INVALID_SAMPLES:
            invalid_writer.writerow(record)
def _metric_mean(metric: str, samples: list[dict] | None = None) -> float:
    """Compute the mean of one metric, rounded to four decimal places.

    Args:
        metric: Key of the score to average, e.g. ``"faithfulness"``.
        samples: Records to average over. When omitted, the module-level
            ``SAMPLES`` list is used.

    Returns:
        The rounded mean, or ``0.0`` when there are no samples (rather than
        raising ``ZeroDivisionError``).

    Raises:
        KeyError: If a sample has no value stored under ``metric``.
    """
    rows = SAMPLES if samples is None else samples
    if not rows:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return round(sum(row[metric] for row in rows) / len(rows), 4)
def _write_metadata(path: Path) -> None:
    """Serialize the demo run's metadata.json.

    The document records the run and scenario identifiers, the mode, the model
    names, fixed start/finish timestamps, the dataset path, and the valid and
    invalid sample counts taken from ``SAMPLES`` and ``INVALID_SAMPLES``.

    Args:
        path: Destination file; it is created or overwritten.
    """
    identity = {
        "run_id": RUN_ID,
        "scenario_name": SCENARIO_NAME,
        "mode": "offline",
        "judge_model": JUDGE_MODEL,
        "embedding_model": EMBEDDING_MODEL,
    }
    timing = {
        "started_at": "2026-06-15T08:29:12+00:00",
        "finished_at": "2026-06-15T08:31:45+00:00",
    }
    dataset_stats = {
        "dataset": "datasets/normalized/kba_knowledge_base_baseline.csv",
        "valid_samples": len(SAMPLES),
        "invalid_samples": len(INVALID_SAMPLES),
    }
    # Merge in this order so the keys appear in the JSON in the same sequence.
    document = {**identity, **timing, **dataset_stats}
    serialized = json.dumps(document, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
def _write_summary(path: Path) -> None:
    """Render summary.md: a run header followed by the per-metric means.

    The header lists the run id, mode, sample counts and judge model; the
    "Metric Means" section has one bullet per entry in ``METRICS`` with the
    mean formatted to four decimal places. The file ends with a newline.

    Args:
        path: Destination file; it is created or overwritten.
    """
    header = [
        f"# {SCENARIO_NAME}",
        "",
        f"- run_id: `{RUN_ID}`",
        "- mode: `offline`",
        f"- total_samples: `{len(SAMPLES) + len(INVALID_SAMPLES)}`",
        f"- valid_samples: `{len(SAMPLES)}`",
        f"- invalid_samples: `{len(INVALID_SAMPLES)}`",
        f"- judge_model: `{JUDGE_MODEL}`",
        "",
        "## Metric Means",
        "",
    ]
    metric_bullets = [f"- {metric}: `{_metric_mean(metric):.4f}`" for metric in METRICS]
    body = "\n".join(header + metric_bullets)
    path.write_text(body + "\n", encoding="utf-8")
def _write_scenario_snapshot(path: Path) -> None:
    """Write scenario.snapshot.yaml describing the demo scenario.

    The snapshot holds the scenario name, mode, model names and the metric
    list, dumped in that key order with non-ASCII characters left unescaped.

    Args:
        path: Destination file; it is created or overwritten.
    """
    # Imported here rather than at module level, so PyYAML is only required
    # when this function actually runs.
    import yaml

    scenario = {
        "scenario_name": SCENARIO_NAME,
        "mode": "offline",
        "judge_model": JUDGE_MODEL,
        "embedding_model": EMBEDDING_MODEL,
        "metrics": METRICS,
    }
    rendered = yaml.safe_dump(scenario, sort_keys=False, allow_unicode=True)
    path.write_text(rendered, encoding="utf-8")
def main() -> None:
    """Generate every sample artifact inside the run directory.

    The run directory is created if it does not exist yet (existing files are
    overwritten), then each artifact is written in turn and a short hint on
    how to start the console is printed.
    """
    run_dir = _output_dir()
    run_dir.mkdir(parents=True, exist_ok=True)

    # (file name, writer) pairs, processed in this order.
    artifact_writers = (
        ("scores.csv", _write_scores_csv),
        ("invalid.csv", _write_invalid_csv),
        ("metadata.json", _write_metadata),
        ("summary.md", _write_summary),
        ("scenario.snapshot.yaml", _write_scenario_snapshot),
    )
    for filename, write_artifact in artifact_writers:
        write_artifact(run_dir / filename)

    print(f"Sample run written to: {run_dir}")
    print("Start the console with: python webmain.py")


if __name__ == "__main__":
    main()