# siemens_ragas/scripts/seed_sample_run.py

"""Generate a realistic sample evaluation run so the console has demo data.

This writes the standard run artifacts (scores.csv, invalid.csv, metadata.json,
summary.md, scenario.snapshot.yaml) under outputs/<scenario>/<run_id>/,
mirroring what the reporting layer produces, but without needing ragas or any
network calls (PyYAML is still required to write the scenario snapshot). It
lets the report board render immediately for demos and local development.

Usage:
    python scripts/seed_sample_run.py
"""

from __future__ import annotations

import csv
import json
from pathlib import Path

# Parent of the directory containing this script; outputs/ is created beneath it.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Identity of the seeded run. SCENARIO_NAME and RUN_ID also form the output
# directory path (outputs/<SCENARIO_NAME>/<RUN_ID>).
SCENARIO_NAME = "kba-knowledge-base-offline-baseline"
RUN_ID = "2026-06-15T08-30-00+00-00"
# Model names recorded in the artifacts; nothing is actually called here.
JUDGE_MODEL = "deepseek-distill-qwen-32b"
EMBEDDING_MODEL = "text-embedding-v3"
# Metric column names; every SAMPLES row carries one score per entry.
METRICS = ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]

# Each row supplies the per-sample columns of a scores.csv record: the sample
# fields, the grouping tags (language, difficulty, question_type), and one
# score per entry in METRICS. Run-level columns (scenario, models, run id) are
# added when the CSV is written.
# Scores are hand-tuned to exercise the full UI (greens, yellows, reds, a long
# tail in the distribution, and clear weak groups by difficulty).
SAMPLES = [
    {
        "sample_id": "kba-001", "language": "zh", "difficulty": "easy", "question_type": "fact",
        "question": "员工入职满3年可享受多少天年休假？",
        "contexts": ["员工入司满1年不满10年的，年休假5天。", "年休假在每年1月1日起可申请。"],
        "answer": "根据规定，入职满3年的员工可享受5天年休假。",
        "ground_truth": "员工入司满1年不满10年的，年休假5天。",
        "faithfulness": 0.98, "answer_relevancy": 0.95, "context_recall": 0.97, "context_precision": 0.92,
    },
    {
        "sample_id": "kba-002", "language": "zh", "difficulty": "easy", "question_type": "fact",
        "question": "公司报销差旅费的截止提交时间是什么时候？",
        "contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"],
        "answer": "差旅费需在出差结束后30天内提交报销。",
        "ground_truth": "差旅费报销须在出差结束后30天内提交。",
        "faithfulness": 0.96, "answer_relevancy": 0.93, "context_recall": 0.90, "context_precision": 0.88,
    },
    {
        "sample_id": "kba-003", "language": "zh", "difficulty": "medium", "question_type": "procedure",
        "question": "申请远程办公需要经过哪些审批流程？",
        "contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"],
        "answer": "需先由直属主管审批，再提交人力资源部备案，每月不超过8天。",
        "ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案，每月上限8天。",
        "faithfulness": 0.91, "answer_relevancy": 0.88, "context_recall": 0.85, "context_precision": 0.79,
    },
    {
        "sample_id": "kba-004", "language": "en", "difficulty": "medium", "question_type": "fact",
        "question": "How many days of paternity leave are employees entitled to?",
        "contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."],
        "answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.",
        "ground_truth": "Employees are entitled to 15 days of paternity leave.",
        "faithfulness": 0.89, "answer_relevancy": 0.86, "context_recall": 0.82, "context_precision": 0.74,
    },
    {
        "sample_id": "kba-005", "language": "zh", "difficulty": "medium", "question_type": "comparison",
        "question": "正式员工与试用期员工在医疗保险待遇上有何区别？",
        "contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"],
        "answer": "正式员工额外享受补充医疗保险，试用期员工仅有基础医疗保险。",
        "ground_truth": "正式员工在基础医疗保险外另享补充医疗保险，试用期员工仅享基础医疗保险。",
        "faithfulness": 0.84, "answer_relevancy": 0.83, "context_recall": 0.78, "context_precision": 0.71,
    },
    {
        "sample_id": "kba-006", "language": "zh", "difficulty": "hard", "question_type": "summary",
        "question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。",
        "contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"],
        "answer": "第三方共享需签保密协议，敏感数据须经数据保护官批准，记录留存3年。",
        "ground_truth": "向第三方共享数据须签署保密协议，敏感数据共享须经数据保护官批准，且共享记录至少留存3年。",
        "faithfulness": 0.79, "answer_relevancy": 0.81, "context_recall": 0.70, "context_precision": 0.62,
    },
    {
        "sample_id": "kba-007", "language": "zh", "difficulty": "hard", "question_type": "procedure",
        "question": "跨部门项目预算超支时的审批升级路径是怎样的？",
        "contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"],
        "answer": "超支10%以内项目经理批，10%-20%需总监批，超20%需财务委员会审批。",
        "ground_truth": "超支10%以内由项目经理审批，10%-20%由部门总监审批，超过20%须提交财务委员会审批。",
        "faithfulness": 0.58, "answer_relevancy": 0.72, "context_recall": 0.55, "context_precision": 0.48,
    },
    {
        "sample_id": "kba-008", "language": "zh", "difficulty": "hard", "question_type": "fact",
        "question": "员工持股计划的最低锁定期是多少年？",
        "contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"],
        "answer": "员工持股计划的最低锁定期为3年。",
        "ground_truth": "员工持股计划的最低锁定期为4年。",
        "faithfulness": 0.22, "answer_relevancy": 0.65, "context_recall": 0.18, "context_precision": 0.30,
    },
    {
        "sample_id": "kba-009", "language": "en", "difficulty": "hard", "question_type": "comparison",
        "question": "What is the difference in notice period between voluntary and involuntary termination?",
        "contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."],
        "answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.",
        "ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.",
        "faithfulness": 0.35, "answer_relevancy": 0.70, "context_recall": 0.40, "context_precision": 0.33,
    },
    {
        "sample_id": "kba-010", "language": "zh", "difficulty": "medium", "question_type": "fact",
        "question": "公司规定的标准工作时间是每周多少小时？",
        "contexts": ["标准工作时间为每周40小时。", "加班需事先申请。"],
        "answer": "公司标准工作时间为每周40小时。",
        "ground_truth": "公司标准工作时间为每周40小时。",
        "faithfulness": 0.99, "answer_relevancy": 0.96, "context_recall": 0.95, "context_precision": 0.90,
    },
]

# Two samples that failed normalization, to exercise the invalid count display.
# The keys of each record are exactly the invalid.csv columns
# (sample_id, error, question).
INVALID_SAMPLES = [
    {"sample_id": "kba-011", "error": "missing ground_truth", "question": "公司年会在什么时候举办？"},
    {"sample_id": "kba-012", "error": "empty contexts after retrieval", "question": "停车场如何申请月卡？"},
]


def _output_dir() -> Path:
    """Build the path outputs/<scenario>/<run id> beneath the repository root.

    The directory is only named here, not created.
    """
    scenario_dir = REPO_ROOT.joinpath("outputs", SCENARIO_NAME)
    return scenario_dir / RUN_ID


def _write_scores_csv(path: Path) -> None:
    """Write scores.csv with sample fields, metric scores, and metadata columns.

    One row is emitted per entry in ``SAMPLES``. Per-sample values come from
    the sample dict; the run-level columns (scenario, models, run id) are the
    same on every row, and ``error`` is left empty because every listed sample
    has scores.

    Args:
        path: Destination file; it is created or overwritten.
    """
    fieldnames = [
        "sample_id", "question", "contexts", "answer", "ground_truth",
        "scenario", "language", "difficulty", "question_type",
        *METRICS, "error", "judge_model", "embedding_model", "run_id",
    ]
    # Columns whose value is identical for every row of this run.
    run_columns = {
        "scenario": SCENARIO_NAME,
        "error": "",
        "judge_model": JUDGE_MODEL,
        "embedding_model": EMBEDDING_MODEL,
        # The run identifier, consistent with metadata.json and summary.md
        # (previously the scenario name was written here by mistake).
        "run_id": RUN_ID,
    }
    # newline="" lets the csv module control line endings itself.
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for sample in SAMPLES:
            row = {
                "sample_id": sample["sample_id"],
                "question": sample["question"],
                # Serialize contexts as a JSON list, matching engine CSV output.
                "contexts": json.dumps(sample["contexts"], ensure_ascii=False),
                "answer": sample["answer"],
                "ground_truth": sample["ground_truth"],
                "language": sample["language"],
                "difficulty": sample["difficulty"],
                "question_type": sample["question_type"],
            }
            row.update(run_columns)
            row.update((metric, sample[metric]) for metric in METRICS)
            writer.writerow(row)


def _write_invalid_csv(path: Path) -> None:
    """Dump every INVALID_SAMPLES record to the given CSV file under a header row."""
    columns = ["sample_id", "error", "question"]
    with path.open("w", encoding="utf-8", newline="") as out_file:
        invalid_writer = csv.DictWriter(out_file, fieldnames=columns)
        invalid_writer.writeheader()
        for record in INVALID_SAMPLES:
            invalid_writer.writerow(record)


def _metric_mean(metric: str) -> float:
    """Average one metric's score over SAMPLES, rounded to four decimals."""
    total = sum(record[metric] for record in SAMPLES)
    average = total / len(SAMPLES)
    return round(average, 4)


def _write_metadata(path: Path) -> None:
    """Serialize the run's metadata record to ``path`` as indented UTF-8 JSON.

    The record holds the run and scenario identifiers, mode, model names,
    fixed start/finish timestamps, the dataset path, and the valid/invalid
    sample counts.
    """
    valid_count = len(SAMPLES)
    invalid_count = len(INVALID_SAMPLES)
    # Keyword order fixes the key order of the emitted JSON document.
    payload = dict(
        run_id=RUN_ID,
        scenario_name=SCENARIO_NAME,
        mode="offline",
        judge_model=JUDGE_MODEL,
        embedding_model=EMBEDDING_MODEL,
        started_at="2026-06-15T08:29:12+00:00",
        finished_at="2026-06-15T08:31:45+00:00",
        dataset="datasets/normalized/kba_knowledge_base_baseline.csv",
        valid_samples=valid_count,
        invalid_samples=invalid_count,
    )
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    path.write_text(serialized, encoding="utf-8")


def _write_summary(path: Path) -> None:
    """Render summary.md: run header, sample counts, and per-metric means."""
    valid_count = len(SAMPLES)
    invalid_count = len(INVALID_SAMPLES)
    header = [
        f"# {SCENARIO_NAME}",
        "",
        f"- run_id: `{RUN_ID}`",
        "- mode: `offline`",
        f"- total_samples: `{valid_count + invalid_count}`",
        f"- valid_samples: `{valid_count}`",
        f"- invalid_samples: `{invalid_count}`",
        f"- judge_model: `{JUDGE_MODEL}`",
        "",
        "## Metric Means",
        "",
    ]
    # One bullet per metric, in METRICS order, formatted to four decimals.
    metric_lines = [f"- {name}: `{_metric_mean(name):.4f}`" for name in METRICS]
    document = "\n".join(header + metric_lines)
    path.write_text(document + "\n", encoding="utf-8")


def _write_scenario_snapshot(path: Path) -> None:
    """Dump the scenario name, mode, model names, and metric list as YAML."""
    # Imported here so PyYAML is only required when this writer actually runs.
    import yaml

    fields = [
        ("scenario_name", SCENARIO_NAME),
        ("mode", "offline"),
        ("judge_model", JUDGE_MODEL),
        ("embedding_model", EMBEDDING_MODEL),
        ("metrics", METRICS),
    ]
    snapshot = dict(fields)
    # sort_keys=False keeps the key order above in the emitted document.
    rendered = yaml.safe_dump(snapshot, sort_keys=False, allow_unicode=True)
    path.write_text(rendered, encoding="utf-8")


def main() -> None:
    """Ensure the run directory exists, write every artifact into it, and print next steps."""
    target = _output_dir()
    # exist_ok=True: re-running the script overwrites the artifacts in place.
    target.mkdir(parents=True, exist_ok=True)

    # (filename, writer) pairs, in the order the artifacts are produced.
    artifact_writers = (
        ("scores.csv", _write_scores_csv),
        ("invalid.csv", _write_invalid_csv),
        ("metadata.json", _write_metadata),
        ("summary.md", _write_summary),
        ("scenario.snapshot.yaml", _write_scenario_snapshot),
    )
    for filename, write_artifact in artifact_writers:
        write_artifact(target / filename)

    print(f"Sample run written to: {target}")
    print("Start the console with: python webmain.py")


# Seed the sample run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()