"""Generate a realistic sample evaluation run so the console has demo data. This writes the standard run artifacts (metadata.json, scores.csv, summary.md, scenario.snapshot.yaml) under outputs/, exactly mirroring what the reporting layer produces, but without needing ragas or any network calls. It lets the report board render immediately for demos and local development. Usage: python scripts/seed_sample_run.py """ from __future__ import annotations import csv import json from pathlib import Path REPO_ROOT = Path(__file__).resolve().parents[1] SCENARIO_NAME = "kba-knowledge-base-offline-baseline" RUN_ID = "2026-06-15T08-30-00+00-00" JUDGE_MODEL = "deepseek-distill-qwen-32b" EMBEDDING_MODEL = "text-embedding-v3" METRICS = ["faithfulness", "answer_relevancy", "context_recall", "context_precision"] # Each row mirrors a scores.csv record: sample fields + metric scores + metadata. # Scores are hand-tuned to exercise the full UI (greens, yellows, reds, a long # tail in the distribution, and clear weak groups by difficulty). SAMPLES = [ { "sample_id": "kba-001", "language": "zh", "difficulty": "easy", "question_type": "fact", "question": "员工入职满3年可享受多少天年休假?", "contexts": ["员工入司满1年不满10年的,年休假5天。", "年休假在每年1月1日起可申请。"], "answer": "根据规定,入职满3年的员工可享受5天年休假。", "ground_truth": "员工入司满1年不满10年的,年休假5天。", "faithfulness": 0.98, "answer_relevancy": 0.95, "context_recall": 0.97, "context_precision": 0.92, }, { "sample_id": "kba-002", "language": "zh", "difficulty": "easy", "question_type": "fact", "question": "公司报销差旅费的截止提交时间是什么时候?", "contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"], "answer": "差旅费需在出差结束后30天内提交报销。", "ground_truth": "差旅费报销须在出差结束后30天内提交。", "faithfulness": 0.96, "answer_relevancy": 0.93, "context_recall": 0.90, "context_precision": 0.88, }, { "sample_id": "kba-003", "language": "zh", "difficulty": "medium", "question_type": "procedure", "question": "申请远程办公需要经过哪些审批流程?", "contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"], "answer": "需先由直属主管审批,再提交人力资源部备案,每月不超过8天。", "ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案,每月上限8天。", "faithfulness": 0.91, "answer_relevancy": 0.88, "context_recall": 0.85, "context_precision": 0.79, }, { "sample_id": "kba-004", "language": "en", "difficulty": "medium", "question_type": "fact", "question": "How many days of paternity leave are employees entitled to?", "contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."], "answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.", "ground_truth": "Employees are entitled to 15 days of paternity leave.", "faithfulness": 0.89, "answer_relevancy": 0.86, "context_recall": 0.82, "context_precision": 0.74, }, { "sample_id": "kba-005", "language": "zh", "difficulty": "medium", "question_type": "comparison", "question": "正式员工与试用期员工在医疗保险待遇上有何区别?", "contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"], "answer": "正式员工额外享受补充医疗保险,试用期员工仅有基础医疗保险。", "ground_truth": "正式员工在基础医疗保险外另享补充医疗保险,试用期员工仅享基础医疗保险。", "faithfulness": 0.84, "answer_relevancy": 0.83, "context_recall": 0.78, "context_precision": 0.71, }, { "sample_id": "kba-006", "language": "zh", "difficulty": "hard", "question_type": "summary", "question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。", "contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"], "answer": "第三方共享需签保密协议,敏感数据须经数据保护官批准,记录留存3年。", "ground_truth": "向第三方共享数据须签署保密协议,敏感数据共享须经数据保护官批准,且共享记录至少留存3年。", "faithfulness": 0.79, "answer_relevancy": 0.81, "context_recall": 0.70, "context_precision": 0.62, }, { "sample_id": "kba-007", "language": "zh", "difficulty": "hard", "question_type": "procedure", "question": "跨部门项目预算超支时的审批升级路径是怎样的?", "contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"], "answer": "超支10%以内项目经理批,10%-20%需总监批,超20%需财务委员会审批。", "ground_truth": "超支10%以内由项目经理审批,10%-20%由部门总监审批,超过20%须提交财务委员会审批。", "faithfulness": 0.58, "answer_relevancy": 0.72, "context_recall": 0.55, "context_precision": 0.48, }, { "sample_id": "kba-008", "language": "zh", "difficulty": "hard", "question_type": "fact", "question": "员工持股计划的最低锁定期是多少年?", "contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"], "answer": "员工持股计划的最低锁定期为3年。", "ground_truth": "员工持股计划的最低锁定期为4年。", "faithfulness": 0.22, "answer_relevancy": 0.65, "context_recall": 0.18, "context_precision": 0.30, }, { "sample_id": "kba-009", "language": "en", "difficulty": "hard", "question_type": "comparison", "question": "What is the difference in notice period between voluntary and involuntary termination?", "contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."], "answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.", "ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.", "faithfulness": 0.35, "answer_relevancy": 0.70, "context_recall": 0.40, "context_precision": 0.33, }, { "sample_id": "kba-010", "language": "zh", "difficulty": "medium", "question_type": "fact", "question": "公司规定的标准工作时间是每周多少小时?", "contexts": ["标准工作时间为每周40小时。", "加班需事先申请。"], "answer": "公司标准工作时间为每周40小时。", "ground_truth": "公司标准工作时间为每周40小时。", "faithfulness": 0.99, "answer_relevancy": 0.96, "context_recall": 0.95, "context_precision": 0.90, }, ] # Two samples that failed normalization, to exercise the invalid count display. INVALID_SAMPLES = [ {"sample_id": "kba-011", "error": "missing ground_truth", "question": "公司年会在什么时候举办?"}, {"sample_id": "kba-012", "error": "empty contexts after retrieval", "question": "停车场如何申请月卡?"}, ] def _output_dir() -> Path: """Return the run directory where sample artifacts are written.""" return REPO_ROOT / "outputs" / SCENARIO_NAME / RUN_ID def _write_scores_csv(path: Path) -> None: """Write scores.csv with sample fields, metric scores, and metadata columns.""" fieldnames = [ "sample_id", "question", "contexts", "answer", "ground_truth", "scenario", "language", "difficulty", "question_type", *METRICS, "error", "judge_model", "embedding_model", "run_id", ] with path.open("w", encoding="utf-8", newline="") as handle: writer = csv.DictWriter(handle, fieldnames=fieldnames) writer.writeheader() for sample in SAMPLES: row = { "sample_id": sample["sample_id"], "question": sample["question"], # Serialize contexts as a JSON list, matching engine CSV output. "contexts": json.dumps(sample["contexts"], ensure_ascii=False), "answer": sample["answer"], "ground_truth": sample["ground_truth"], "scenario": SCENARIO_NAME, "language": sample["language"], "difficulty": sample["difficulty"], "question_type": sample["question_type"], "error": "", "judge_model": JUDGE_MODEL, "embedding_model": EMBEDDING_MODEL, "run_id": SCENARIO_NAME, } for metric in METRICS: row[metric] = sample[metric] writer.writerow(row) def _write_invalid_csv(path: Path) -> None: """Write invalid.csv with the small set of unscored samples.""" with path.open("w", encoding="utf-8", newline="") as handle: writer = csv.DictWriter(handle, fieldnames=["sample_id", "error", "question"]) writer.writeheader() writer.writerows(INVALID_SAMPLES) def _metric_mean(metric: str) -> float: """Compute the mean of one metric across the valid samples.""" return round(sum(sample[metric] for sample in SAMPLES) / len(SAMPLES), 4) def _write_metadata(path: Path) -> None: """Write metadata.json mirroring the reporting layer's schema.""" metadata = { "run_id": RUN_ID, "scenario_name": SCENARIO_NAME, "mode": "offline", "judge_model": JUDGE_MODEL, "embedding_model": EMBEDDING_MODEL, "started_at": "2026-06-15T08:29:12+00:00", "finished_at": "2026-06-15T08:31:45+00:00", "dataset": "datasets/normalized/kba_knowledge_base_baseline.csv", "valid_samples": len(SAMPLES), "invalid_samples": len(INVALID_SAMPLES), } path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8") def _write_summary(path: Path) -> None: """Write a human-readable summary.md echoing the metric means.""" lines = [ f"# {SCENARIO_NAME}", "", f"- run_id: `{RUN_ID}`", "- mode: `offline`", f"- total_samples: `{len(SAMPLES) + len(INVALID_SAMPLES)}`", f"- valid_samples: `{len(SAMPLES)}`", f"- invalid_samples: `{len(INVALID_SAMPLES)}`", f"- judge_model: `{JUDGE_MODEL}`", "", "## Metric Means", "", ] for metric in METRICS: lines.append(f"- {metric}: `{_metric_mean(metric):.4f}`") path.write_text("\n".join(lines) + "\n", encoding="utf-8") def _write_scenario_snapshot(path: Path) -> None: """Write scenario.snapshot.yaml so the reader resolves the metric list.""" import yaml snapshot = { "scenario_name": SCENARIO_NAME, "mode": "offline", "judge_model": JUDGE_MODEL, "embedding_model": EMBEDDING_MODEL, "metrics": METRICS, } path.write_text(yaml.safe_dump(snapshot, sort_keys=False, allow_unicode=True), encoding="utf-8") def main() -> None: """Write all sample run artifacts into a fresh run directory.""" run_dir = _output_dir() run_dir.mkdir(parents=True, exist_ok=True) _write_scores_csv(run_dir / "scores.csv") _write_invalid_csv(run_dir / "invalid.csv") _write_metadata(run_dir / "metadata.json") _write_summary(run_dir / "summary.md") _write_scenario_snapshot(run_dir / "scenario.snapshot.yaml") print(f"Sample run written to: {run_dir}") print("Start the console with: python webmain.py") if __name__ == "__main__": main()