Add Siemens CT document evaluation scenario (three-step pipeline)

- scenarios/siemens_build/siemens-pdf-build.yaml: dataset build for all 17 Siemens medical-imaging PDFs (aliyun_docmind parser, 10 questions/doc, failure_mode=skip, ~170 questions total) - scenarios/offline/siemens-pdf-offline-smoke.yaml: offline evaluation using source chunks as contexts and ground_truth as answer (up to 30 samples) - scenarios/online/siemens-pdf-question-bank-online.yaml: online evaluation calling siemens_pdf_qa adapter, batch_size=4, up to 50 samples - apps/siemens_pdf_qa/adapter.py: Siemens-specific adapter with bilingual (zh/en) system prompt and strict evidence-grounding for CT domain - scripts/build_siemens_offline_smoke.py: helper to derive offline smoke CSV from completed dataset build artifacts (run after dataset build step) - docs/superpowers/specs/2026-06-15-siemens-scenario-design.md: design spec All three scenarios are automatically discovered by the web console. Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
2026-06-15 17:00:52 +08:00
parent 1288a366d1
commit 75ae7927ad
7 changed files with 361 additions and 0 deletions
--- a/apps/siemens_pdf_qa/__init__.py
+++ b/apps/siemens_pdf_qa/__init__.py
@@ -0,0 +1,6 @@
 """Siemens PDF question bank adapter for online evaluation.
 Mirrors the generic pdf_question_bank adapter but uses a Siemens-specific system
 prompt that instructs the model to answer in the same language as the question
 (Chinese for Chinese CT documentation) and to answer only from the provided evidence.
 """
--- a/apps/siemens_pdf_qa/adapter.py
+++ b/apps/siemens_pdf_qa/adapter.py
@@ -0,0 +1,170 @@
 """Online evaluation adapter for the Siemens medical-imaging PDF question bank.
 Functionally identical to apps/pdf_question_bank/adapter.py but uses a
 Siemens-specific system prompt that:
  - Instructs the model to answer in the same language as the question
    (important for Chinese CT documentation).
  - Emphasises citation of source chunks and refusal when evidence is absent.
  - Adds domain context (medical imaging / CT terminology).
 The adapter contract is the same as all other adapters:
  run(question, **kwargs) -> {"answer": str, "contexts": [str], "raw_response": {}}
 """
 from __future__ import annotations
 import json
 from pathlib import Path
 from typing import Any
 from openai import OpenAI
 from rag_eval.settings import EvaluationSettings
 from rag_eval.shared.utils import parse_contexts
 # ── chunk cache (module-level, lives for the process lifetime) ────────────────
 # Maps a resolved source_chunks.jsonl path to its {chunk_id: chunk payload}
 # lookup so repeated calls for the same artifact reuse the parsed result.
 # Nothing in this module evicts or refreshes an entry once it is stored.
 _CHUNK_CACHE: dict[Path, dict[str, dict[str, Any]]] = {}
 def _resolve_source_chunks_path(source_chunks_path: str) -> Path:
    """Return an existing chunk artifact path for *source_chunks_path*.

    The path is made absolute first and returned as-is when it exists.
    Otherwise, only a path whose parent directory is named ``latest`` gets a
    fallback: sibling run directories are tried from the highest name to the
    lowest, and the first one containing a file with the same name wins.

    Raises:
        FileNotFoundError: no fallback applies or none of the run directories
            contains the file. The exception argument is the resolved path.
    """
    target = Path(source_chunks_path).resolve()
    if target.exists():
        return target
    alias_dir = target.parent
    runs_root = alias_dir.parent
    # Only the "latest" alias may fall back to a concrete run directory.
    if alias_dir.name != "latest" or not runs_root.exists():
        raise FileNotFoundError(target)
    run_names = sorted(
        (
            entry.name
            for entry in runs_root.iterdir()
            if entry.is_dir() and entry.name != "latest"
        ),
        reverse=True,
    )
    for run_name in run_names:
        fallback = runs_root / run_name / target.name
        if fallback.exists():
            return fallback
    raise FileNotFoundError(target)
 def _load_source_chunks(source_chunks_path: str) -> dict[str, dict[str, Any]]:
    """Load a source-chunk JSONL artifact and index its rows by ``chunk_id``.

    The path is resolved through ``_resolve_source_chunks_path`` and the parsed
    lookup is stored in ``_CHUNK_CACHE`` under that resolved path, so later
    calls for the same artifact return the cached dict. Blank lines are
    skipped. When two rows share a ``chunk_id`` the later row wins.

    Args:
        source_chunks_path: Location of the ``source_chunks.jsonl`` artifact.

    Returns:
        Mapping from stripped ``chunk_id`` string to the decoded row.

    Raises:
        ValueError: a row is not a JSON object, or its ``chunk_id`` is absent,
            null, or blank. Malformed JSON raises ``json.JSONDecodeError``,
            which is itself a ``ValueError``.
        FileNotFoundError: the artifact cannot be resolved.
    """
    resolved = _resolve_source_chunks_path(source_chunks_path)
    cached = _CHUNK_CACHE.get(resolved)
    if cached is not None:
        return cached
    lookup: dict[str, dict[str, Any]] = {}
    with resolved.open(encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, 1):
            text = line.strip()
            if not text:
                continue
            payload = json.loads(text)
            if not isinstance(payload, dict):
                raise ValueError(
                    f"source_chunks.jsonl row {lineno} is not a JSON object: {resolved}"
                )
            raw_id = payload.get("chunk_id")
            # A JSON null must count as missing; str(None) would yield the
            # truthy id "None" and slip past the check below.
            chunk_id = "" if raw_id is None else str(raw_id).strip()
            if not chunk_id:
                raise ValueError(f"source_chunks.jsonl row {lineno} missing chunk_id: {resolved}")
            lookup[chunk_id] = payload
    _CHUNK_CACHE[resolved] = lookup
    return lookup
 def _resolve_chunk_ids(raw: Any) -> list[str]:
    """Turn the raw ``source_chunk_ids`` value into a list of truthy ids.

    The value is handed to ``parse_contexts`` and falsy entries are dropped.

    Raises:
        ValueError: nothing remains after dropping falsy entries.
    """
    chunk_ids = list(filter(None, parse_contexts(raw)))
    if chunk_ids:
        return chunk_ids
    raise ValueError("source_chunk_ids is required for siemens_pdf_qa adapter.")
 def _build_messages(
    question: str,
    contexts: list[str],
    metadata: dict[str, Any],
 ) -> list[dict[str, str]]:
    """Assemble the system and user chat messages for one grounded question.

    The system message is the fixed Siemens prompt (answer only from the
    evidence, refuse when it is insufficient, reply in the question's
    language). The user message lists the question, then ``doc_name``,
    ``section_path`` and ``page_start``–``page_end`` read from *metadata*
    (empty string when a key is absent), then each context numbered from 1
    as ``[chunk N]``.

    Returns:
        ``[system_message, user_message]``, each a ``{"role", "content"}`` dict.
    """
    system_prompt = "".join((
        "你是西门子医疗影像知识库的问答助手（Siemens Healthineers CT Knowledge Base QA）。",
        "请严格根据下方【证据片段】回答问题，不得使用片段之外的任何知识。",
        "若证据不足以回答，请明确说明「根据现有资料无法回答」。",
        "请用与问题相同的语言（中文或英文）作答，简洁准确，必要时引用片段编号。",
    ))

    def meta(key: str) -> Any:
        return metadata.get(key, "")

    sections = ["【问题】", question, "", "【文档元信息】"]
    sections.append(f"doc_name: {meta('doc_name')}")
    sections.append(f"section_path: {meta('section_path')}")
    sections.append(f"page_range: {meta('page_start')}–{meta('page_end')}")
    sections += ["", "【证据片段】"]
    for number, snippet in enumerate(contexts, start=1):
        sections.append(f"[chunk {number}] {snippet}")
    sections += ["", "请基于以上证据片段作答。"]
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": "\n".join(sections)}
    return [system_message, user_message]
 def run(
    question: str,
    *,
    source_chunks_path: str,
    model: str | None = None,
    client: OpenAI | None = None,
    **kwargs: Any,
 ) -> dict[str, Any]:
    """Answer one question from its cited chunks via an OpenAI-compatible model.

    This is the adapter contract entry point used by the online evaluation runner.

    Args:
        question: The question text to answer.
        source_chunks_path: Location of the ``source_chunks.jsonl`` artifact.
        model: Answer model name; when falsy, ``settings.ragas_judge_model``
            from a fresh ``EvaluationSettings`` is used.
        client: Pre-built client; when omitted, one is created from
            ``settings.openai_client_kwargs``.
        **kwargs: Remaining row fields. ``source_chunk_ids`` is required;
            ``doc_id`` and ``doc_name`` are echoed into ``raw_response``; the
            whole mapping is passed to the prompt builder as metadata.

    Returns:
        ``{"answer": str, "contexts": [str], "raw_response": {...}}`` where
        ``contexts`` holds the non-empty stripped chunk texts in cited order.

    Raises:
        ValueError: no chunk ids were given, a cited id is not in the
            artifact, no cited chunk has usable text, or no model name is
            available.
    """
    chunk_ids = _resolve_chunk_ids(kwargs.get("source_chunk_ids"))
    chunk_lookup = _load_source_chunks(source_chunks_path)
    missing = [cid for cid in chunk_ids if cid not in chunk_lookup]
    if missing:
        raise ValueError(
            "source_chunk_ids not found in artifact: " + ", ".join(map(str, missing))
        )
    contexts: list[str] = []
    for cid in chunk_ids:
        raw_text = chunk_lookup[cid].get("text")
        # A JSON null must not be sent to the model as the literal evidence "None".
        text = "" if raw_text is None else str(raw_text).strip()
        if text:
            contexts.append(text)
    if not contexts:
        raise ValueError("resolved source chunks contain no usable text.")
    settings = EvaluationSettings()
    # Coerce through str so an unset model on both sides reaches the ValueError
    # below instead of failing on None.strip().
    target_model = str(model or settings.ragas_judge_model or "").strip()
    if not target_model:
        raise ValueError("A model name is required for siemens_pdf_qa adapter.")
    llm_client = client or OpenAI(**settings.openai_client_kwargs)
    completion = llm_client.chat.completions.create(
        model=target_model,
        messages=_build_messages(question, contexts, kwargs),
        temperature=0,
    )
    # message.content may be None; normalise it to an empty answer.
    answer = str(completion.choices[0].message.content or "").strip()
    return {
        "answer": answer,
        "contexts": contexts,
        "raw_response": {
            "resolved_chunk_ids": chunk_ids,
            "doc_id": kwargs.get("doc_id", ""),
            "doc_name": kwargs.get("doc_name", ""),
            "model": target_model,
            "response_text": answer,
        },
    }
--- a/docs/superpowers/specs/2026-06-15-siemens-scenario-design.md
+++ b/docs/superpowers/specs/2026-06-15-siemens-scenario-design.md
@@ -0,0 +1,59 @@
 # Siemens PDF 场景设计 Spec
 - 日期：2026-06-15
 - 状态：已确认，进入实现。
 ## 1. 目标
 基于 `datasets/siemens-pdfs/`（17 个西门子医疗 CT 中文 PDF），跑通完整三步流水线：
 ```
 dataset_build（PDF→题库）→ offline smoke 评估 → online 评估
 ```
 完全镜像现有 `sample-pdf-*` 模式（方案 A），不改动任何现有文件。
 ## 2. 参数决策
 | 项目 | 值 |
 |---|---|
 | 输入 PDF | `datasets/siemens-pdfs/*.pdf`（17 个） |
 | failure_mode | `skip`（单个文档解析失败不中断整批） |
 | max_questions_per_document | 10（共 ~170 题） |
 | max_source_chunks_per_question | 3 |
 | generation model | `.env` 的 `DATASET_GENERATOR_MODEL`（qwen3.6-plus） |
 | judge model | `.env` 的 `RAGAS_JUDGE_MODEL`（deepseek-v4-flash） |
 | embedding model | `.env` 的 `RAGAS_EMBEDDING_MODEL`（text-embedding-v3） |
 | online answer model | `.env` 的 `RAGAS_JUDGE_MODEL` |
 | metrics | faithfulness / answer_relevancy / context_recall / context_precision |
 ## 3. 新增文件（5 个）
 ```
 scenarios/siemens_build/siemens-pdf-build.yaml
 scenarios/offline/siemens-pdf-offline-smoke.yaml
 scenarios/online/siemens-pdf-question-bank-online.yaml
 apps/siemens_pdf_qa/__init__.py
 apps/siemens_pdf_qa/adapter.py
 ```
 加上辅助脚本：
 ```
 scripts/build_siemens_offline_smoke.py   ← 从 build 产物生成 offline smoke CSV
 ```
 ## 4. 运行顺序
 ```
 # 步骤 1：dataset build（PDF → 题库草稿 + source_chunks.jsonl）
 python main.py --dataset-build-config scenarios/siemens_build/siemens-pdf-build.yaml
 # 步骤 2：生成 offline smoke 数据集（一次性脚本，build 跑完后执行）
 python scripts/build_siemens_offline_smoke.py
 # 步骤 3：offline 评估（用 source chunks 作为 contexts，ground_truth 作为 answer）
 python main.py --scenario scenarios/offline/siemens-pdf-offline-smoke.yaml
 # 步骤 4：online 评估（实时调用 LLM 生成 answer，再评分）
 python main.py --scenario scenarios/online/siemens-pdf-question-bank-online.yaml
 ```
--- a/scenarios/offline/siemens-pdf-offline-smoke.yaml
+++ b/scenarios/offline/siemens-pdf-offline-smoke.yaml
@@ -0,0 +1,15 @@
 # Offline smoke scenario for the Siemens PDF question bank.
 # app_adapter is null: no application adapter is configured for this run.
 scenario_name: siemens-pdf-offline-smoke
 mode: offline
 app_adapter: null
 # CSV written by scripts/build_siemens_offline_smoke.py; run that script
 # after the dataset build and before this scenario.
 dataset: ../../datasets/normalized/siemens_pdf_offline_smoke.csv
 judge_model: deepseek-v4-flash
 embedding_model: text-embedding-v3
 metrics:
  - faithfulness
  - answer_relevancy
  - context_recall
  - context_precision
 output_dir: ../../outputs/siemens-pdf-offline-smoke
 runtime:
  batch_size: 4
  # Smoke run: evaluates at most 30 samples.
  max_samples: 30
--- a/scenarios/online/siemens-pdf-question-bank-online.yaml
+++ b/scenarios/online/siemens-pdf-question-bank-online.yaml
@@ -0,0 +1,22 @@
 # Online scenario: each question is answered through apps.siemens_pdf_qa.adapter:run.
 scenario_name: siemens-pdf-question-bank-online
 mode: online
 # Question bank written by scenarios/siemens_build/siemens-pdf-build.yaml
 # (its output.dataset_path).
 dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
 judge_model: deepseek-v4-flash
 embedding_model: text-embedding-v3
 metrics:
  - faithfulness
  - answer_relevancy
  - context_recall
  - context_precision
 output_dir: ../../outputs/online/siemens-pdf-question-bank
 runtime:
  batch_size: 4
  app_concurrency: 4
  metric_concurrency: 4
  max_samples: 50
 app_adapter:
  type: python
  callable: apps.siemens_pdf_qa.adapter:run
  # static_kwargs are passed to the adapter as keyword arguments.
  static_kwargs:
    # If this latest/ path does not exist, the adapter falls back to the run
    # directory under the same artifact root whose name sorts last and that
    # contains source_chunks.jsonl.
    source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
    # Answer model; when omitted the adapter uses settings.ragas_judge_model.
    model: deepseek-v4-flash
--- a/scenarios/siemens_build/siemens-pdf-build.yaml
+++ b/scenarios/siemens_build/siemens-pdf-build.yaml
@@ -0,0 +1,17 @@
 # Dataset build job: Siemens medical-imaging PDFs -> draft question bank + source chunks.
 job_name: siemens-pdf-question-bank
 input:
  path: ../../datasets/siemens-pdfs
  glob: "*.pdf"
 parser:
  provider: aliyun_docmind
  # skip: one document failing to parse does not abort the whole batch
  # (see the design spec, section 2).
  failure_mode: skip
 generation:
  output_type: online_question_bank
  review_mode: draft_with_manual_review
  # 10 questions x 17 documents gives roughly 170 questions in total.
  max_questions_per_document: 10
  max_source_chunks_per_question: 3
 output:
  # Read by scenarios/online/siemens-pdf-question-bank-online.yaml as its dataset.
  dataset_path: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
  # scripts/build_siemens_offline_smoke.py and the online adapter read files
  # from the latest/ directory under this artifact root.
  artifact_dir: ../../outputs/dataset-builds/siemens-pdf-question-bank
 runtime:
  max_documents: 17
--- a/scripts/build_siemens_offline_smoke.py
+++ b/scripts/build_siemens_offline_smoke.py
@@ -0,0 +1,72 @@
 """Build the Siemens offline smoke dataset from a completed dataset_build run.
 Must be run AFTER `python main.py --dataset-build-config
 scenarios/siemens_build/siemens-pdf-build.yaml` has completed successfully.
 It uses the stable `latest/` alias so you don't need to know the run_id.
 Usage:
    python scripts/build_siemens_offline_smoke.py
 Output:
    datasets/normalized/siemens_pdf_offline_smoke.csv
    (referenced by scenarios/offline/siemens-pdf-offline-smoke.yaml)
 """
 from __future__ import annotations
 from pathlib import Path
 # ---------------------------------------------------------------------------
 # Paths — all relative to the siemens_ragas/ repository root
 # ---------------------------------------------------------------------------
 # The script lives one directory below the repository root.
 REPO_ROOT = Path(__file__).resolve().parents[1]
 # Both build artefacts are read through the `latest` alias of the Siemens build job.
 _LATEST_BUILD_DIR = REPO_ROOT.joinpath(
    "outputs", "dataset-builds", "siemens-pdf-question-bank", "latest"
 )
 DRAFT_DATASET_PATH = _LATEST_BUILD_DIR / "dataset_draft.csv"
 SOURCE_CHUNKS_PATH = _LATEST_BUILD_DIR / "source_chunks.jsonl"
 # Destination CSV, referenced by scenarios/offline/siemens-pdf-offline-smoke.yaml.
 OUTPUT_PATH = REPO_ROOT.joinpath("datasets", "normalized", "siemens_pdf_offline_smoke.csv")
 def main() -> None:
    """Build the offline smoke CSV from the latest Siemens build artefacts.

    Checks that the draft dataset and the source chunks exist, calls
    ``build_offline_smoke_dataset`` with them, then reads the written CSV
    back and prints its row count plus the ``language`` and ``difficulty``
    value counts (an empty dict when a column is absent).

    Raises:
        FileNotFoundError: the draft dataset or the source chunks are missing.
    """
    build_hint = (
        "Run the dataset build first:\n"
        "  python main.py --dataset-build-config "
        "scenarios/siemens_build/siemens-pdf-build.yaml"
    )
    prerequisites = (
        (DRAFT_DATASET_PATH, f"Draft dataset not found: {DRAFT_DATASET_PATH}\n" + build_hint),
        (
            SOURCE_CHUNKS_PATH,
            f"Source chunks not found: {SOURCE_CHUNKS_PATH}\n" + "Run the dataset build first.",
        ),
    )
    for required_path, message in prerequisites:
        if not required_path.exists():
            raise FileNotFoundError(message)
    # Import here so the script is importable even before rag_eval is fully set up.
    from rag_eval.dataset_builder.offline_converter import build_offline_smoke_dataset
    written_path = build_offline_smoke_dataset(
        draft_dataset_path=DRAFT_DATASET_PATH,
        source_chunks_path=SOURCE_CHUNKS_PATH,
        output_path=OUTPUT_PATH,
    )
    import pandas as pd
    table = pd.read_csv(written_path)
    print(f"Offline smoke dataset written to: {written_path}")
    print(f"Total rows: {len(table)}")
    if len(table) == 0:
        return

    def distribution(column: str) -> dict:
        # Columns absent from the CSV report an empty distribution.
        if column in table.columns:
            return table[column].value_counts().to_dict()
        return {}

    language_counts = distribution("language")
    difficulty_counts = distribution("difficulty")
    print(f"Language distribution: {language_counts}")
    print(f"Difficulty distribution: {difficulty_counts}")
 # Run the conversion only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()