Add Siemens CT document evaluation scenario (three-step pipeline)

- scenarios/siemens_build/siemens-pdf-build.yaml: dataset build for all 17
  Siemens medical-imaging PDFs (aliyun_docmind parser, 10 questions/doc,
  failure_mode=skip, ~170 questions total)
- scenarios/offline/siemens-pdf-offline-smoke.yaml: offline evaluation using
  source chunks as contexts and ground_truth as answer (up to 30 samples)
- scenarios/online/siemens-pdf-question-bank-online.yaml: online evaluation
  calling siemens_pdf_qa adapter, batch_size=4, up to 50 samples
- apps/siemens_pdf_qa/adapter.py: Siemens-specific adapter with bilingual
  (zh/en) system prompt and strict evidence-grounding for CT domain
- scripts/build_siemens_offline_smoke.py: helper to derive offline smoke CSV
  from completed dataset build artifacts (run after dataset build step)
- docs/superpowers/specs/2026-06-15-siemens-scenario-design.md: design spec

All three scenarios are automatically discovered by the web console.

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 17:00:52 +08:00
parent 1288a366d1
commit 75ae7927ad
7 changed files with 361 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
"""Siemens PDF question bank adapter for online evaluation.
Provides a Siemens-specific counterpart to the generic pdf_question_bank
adapter: its system prompt instructs the model to answer in the same language
as the question (Chinese for Chinese CT documentation) and to rely only on the
provided evidence.
"""

View File

@@ -0,0 +1,170 @@
"""Online evaluation adapter for the Siemens medical-imaging PDF question bank.
Functionally identical to apps/pdf_question_bank/adapter.py but uses a
Siemens-specific system prompt that:
- Instructs the model to answer in the same language as the question
(important for Chinese CT documentation).
- Emphasises citation of source chunks and refusal when evidence is absent.
- Adds domain context (medical imaging / CT terminology).
The adapter contract is the same as all other adapters:
run(question, **kwargs) -> {"answer": str, "contexts": [str], "raw_response": {}}
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from openai import OpenAI
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.utils import parse_contexts
# ── chunk cache (module-level, lives for the process lifetime) ────────────────
# Keyed by the resolved chunk-artifact path; each value is that file's
# {chunk_id: chunk payload} lookup, so repeated calls can reuse the parsed file
# instead of re-reading it. Entries are never evicted.
_CHUNK_CACHE: dict[Path, dict[str, dict[str, Any]]] = {}
def _resolve_source_chunks_path(source_chunks_path: str) -> Path:
    """Locate the chunk artifact on disk.

    The given path is returned (resolved) when it exists. Otherwise, if its
    parent directory is named ``latest``, the sibling run directories are
    searched in descending name order and the first one containing a file
    with the same name wins.

    Raises:
        FileNotFoundError: When the path is missing and no fallback applies
            or no sibling run directory contains the file.
    """
    target = Path(source_chunks_path).resolve()
    if target.exists():
        return target

    alias_dir = target.parent
    runs_root = alias_dir.parent
    # Only the "latest" alias gets a fallback; any other missing path is an error.
    if alias_dir.name != "latest" or not runs_root.exists():
        raise FileNotFoundError(target)

    run_dirs = [
        entry
        for entry in runs_root.iterdir()
        if entry.name != "latest" and entry.is_dir()
    ]
    # Newest first, relying on run directory names sorting chronologically.
    run_dirs.sort(key=lambda entry: entry.name, reverse=True)

    fallback = next(
        (run / target.name for run in run_dirs if (run / target.name).exists()),
        None,
    )
    if fallback is None:
        raise FileNotFoundError(target)
    return fallback
def _load_source_chunks(source_chunks_path: str) -> dict[str, dict[str, Any]]:
    """Return the ``chunk_id -> payload`` lookup for a JSONL chunk artifact.

    The artifact path is resolved first; a lookup already stored in
    ``_CHUNK_CACHE`` for that resolved path is returned as-is. Otherwise the
    file is read line by line (blank lines ignored), each line is parsed as
    JSON, and the result is cached before being returned.

    Raises:
        ValueError: When a row has no non-blank ``chunk_id``.
    """
    artifact = _resolve_source_chunks_path(source_chunks_path)
    try:
        return _CHUNK_CACHE[artifact]
    except KeyError:
        pass  # first request for this artifact: parse it below

    by_id: dict[str, dict[str, Any]] = {}
    with artifact.open(encoding="utf-8") as stream:
        for row_number, raw_line in enumerate(stream, start=1):
            stripped = raw_line.strip()
            if stripped:
                record = json.loads(stripped)
                key = str(record.get("chunk_id", "")).strip()
                if not key:
                    raise ValueError(
                        f"source_chunks.jsonl row {row_number} missing chunk_id: {artifact}"
                    )
                # A later row with the same id replaces the earlier one.
                by_id[key] = record

    _CHUNK_CACHE[artifact] = by_id
    return by_id
def _resolve_chunk_ids(raw: Any) -> list[str]:
    """Turn the raw ``source_chunk_ids`` value into a list of truthy ids.

    The value is handed to ``parse_contexts`` and falsy entries are dropped.

    Raises:
        ValueError: When nothing remains after filtering.
    """
    chunk_ids = list(filter(None, parse_contexts(raw)))
    if chunk_ids:
        return chunk_ids
    raise ValueError("source_chunk_ids is required for siemens_pdf_qa adapter.")
def _build_messages(
    question: str,
    contexts: list[str],
    metadata: dict[str, Any],
) -> list[dict[str, str]]:
    """Build the grounded chat prompt sent to the answer model.

    Args:
        question: Question text, placed verbatim in the user message.
        contexts: Evidence texts; each is rendered as ``[chunk N] <text>``
            with N starting at 1.
        metadata: Mapping read for ``doc_name``, ``section_path``,
            ``page_start`` and ``page_end``. Missing keys render as empty.

    Returns:
        A two-element list: the system message followed by the user message.
    """
    evidence_lines = [f"[chunk {i}] {ctx}" for i, ctx in enumerate(contexts, 1)]

    # Join only the page bounds that are present, with an explicit separator,
    # so pages 3 and 5 render as "3-5" instead of the ambiguous "35".
    page_bounds = (metadata.get("page_start"), metadata.get("page_end"))
    page_range = "-".join(
        str(bound) for bound in page_bounds if bound is not None and bound != ""
    )
    meta_lines = [
        f"doc_name: {metadata.get('doc_name', '')}",
        f"section_path: {metadata.get('section_path', '')}",
        f"page_range: {page_range}",
    ]

    # Siemens-specific system prompt: bilingual awareness, medical domain, strict grounding
    system_prompt = (
        "你是西门子医疗影像知识库的问答助手Siemens Healthineers CT Knowledge Base QA"
        "请严格根据下方【证据片段】回答问题，不得使用片段之外的任何知识。"
        "若证据不足以回答，请明确说明「根据现有资料无法回答」。"
        "请用与问题相同的语言（中文或英文）作答，简洁准确，必要时引用片段编号。"
    )
    user_prompt = "\n".join([
        "【问题】",
        question,
        "",
        "【文档元信息】",
        *meta_lines,
        "",
        "【证据片段】",
        *evidence_lines,
        "",
        "请基于以上证据片段作答。",
    ])
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
def run(
    question: str,
    *,
    source_chunks_path: str,
    model: str | None = None,
    client: OpenAI | None = None,
    **kwargs: Any,
) -> dict[str, Any]:
    """Adapter entry point: answer one question from its cited source chunks.

    Args:
        question: The question to answer.
        source_chunks_path: Location of the JSONL chunk artifact.
        model: Answer model name; when falsy, ``ragas_judge_model`` from
            ``EvaluationSettings`` is used instead.
        client: Pre-built client to use; when falsy, one is created from
            ``EvaluationSettings.openai_client_kwargs``.
        **kwargs: Per-row fields. ``source_chunk_ids`` is required; the whole
            mapping is also passed to the prompt builder as document metadata,
            and ``doc_id`` / ``doc_name`` are echoed in ``raw_response``.

    Returns:
        A dict with ``answer`` (stripped model text), ``contexts`` (the
        non-blank chunk texts, in the order of the requested ids) and
        ``raw_response`` (bookkeeping fields).

    Raises:
        ValueError: When a requested chunk id is absent from the artifact,
            none of the resolved chunks has text, or no model name is set.
    """
    requested_ids = _resolve_chunk_ids(kwargs.get("source_chunk_ids"))
    chunks_by_id = _load_source_chunks(source_chunks_path)

    unknown_ids = [cid for cid in requested_ids if cid not in chunks_by_id]
    if unknown_ids:
        raise ValueError(
            "source_chunk_ids not found in artifact: " + ", ".join(unknown_ids)
        )

    # Keep only chunks that actually carry text, preserving the requested order.
    contexts: list[str] = []
    for cid in requested_ids:
        body = str(chunks_by_id[cid].get("text", "")).strip()
        if body:
            contexts.append(body)
    if not contexts:
        raise ValueError("resolved source chunks contain no usable text.")

    settings = EvaluationSettings()
    target_model = (model or settings.ragas_judge_model).strip()
    if not target_model:
        raise ValueError("A model name is required for siemens_pdf_qa adapter.")

    llm_client = client or OpenAI(**settings.openai_client_kwargs)
    prompt_messages = _build_messages(question, contexts, kwargs)
    completion = llm_client.chat.completions.create(
        model=target_model,
        messages=prompt_messages,
        temperature=0,
    )
    # The message content may be None; normalise it to an empty string.
    answer = str(completion.choices[0].message.content or "").strip()

    bookkeeping = {
        "resolved_chunk_ids": requested_ids,
        "doc_id": kwargs.get("doc_id", ""),
        "doc_name": kwargs.get("doc_name", ""),
        "model": target_model,
        "response_text": answer,
    }
    return {"answer": answer, "contexts": contexts, "raw_response": bookkeeping}

View File

@@ -0,0 +1,59 @@
# Siemens PDF 场景设计 Spec
- 日期：2026-06-15
- 状态:已确认,进入实现。
## 1. 目标
基于 `datasets/siemens-pdfs/`（17 个西门子医疗 CT 中文 PDF），跑通完整三步流水线：
```
dataset_build（PDF→题库）→ offline smoke 评估 → online 评估
```
完全镜像现有 `sample-pdf-*` 模式（方案 A），不改动任何现有文件。
## 2. 参数决策
| 项目 | 值 |
|---|---|
| 输入 PDF | `datasets/siemens-pdfs/*.pdf`（17 个） |
| failure_mode | `skip`（单个文档解析失败不中断整批） |
| max_questions_per_document | 10（共 ~170 题） |
| max_source_chunks_per_question | 3 |
| generation model | `.env` 中的 `DATASET_GENERATOR_MODEL`（qwen3.6-plus） |
| judge model | `.env` 中的 `RAGAS_JUDGE_MODEL`（deepseek-v4-flash） |
| embedding model | `.env` 中的 `RAGAS_EMBEDDING_MODEL`（text-embedding-v3） |
| online answer model | `.env` 中的 `RAGAS_JUDGE_MODEL` |
| metrics | faithfulness / answer_relevancy / context_recall / context_precision |
## 3. 新增文件（5 个）
```
scenarios/siemens_build/siemens-pdf-build.yaml
scenarios/offline/siemens-pdf-offline-smoke.yaml
scenarios/online/siemens-pdf-question-bank-online.yaml
apps/siemens_pdf_qa/__init__.py
apps/siemens_pdf_qa/adapter.py
```
加上辅助脚本:
```
scripts/build_siemens_offline_smoke.py ← 从 build 产物生成 offline smoke CSV
```
## 4. 运行顺序
```
# 步骤 1：dataset build（PDF → 题库草稿 + source_chunks.jsonl）
python main.py --dataset-build-config scenarios/siemens_build/siemens-pdf-build.yaml
# 步骤 2：生成 offline smoke 数据集（一次性脚本，build 跑完后执行）
python scripts/build_siemens_offline_smoke.py
# 步骤 3：offline 评估（用 source chunks 作为 contexts，ground_truth 作为 answer）
python main.py --scenario scenarios/offline/siemens-pdf-offline-smoke.yaml
# 步骤 4：online 评估（实时调用 LLM 生成 answer，再评分）
python main.py --scenario scenarios/online/siemens-pdf-question-bank-online.yaml
```

View File

@@ -0,0 +1,15 @@
scenario_name: siemens-pdf-offline-smoke
mode: offline
app_adapter: null
dataset: ../../datasets/normalized/siemens_pdf_offline_smoke.csv
judge_model: deepseek-v4-flash
embedding_model: text-embedding-v3
metrics:
- faithfulness
- answer_relevancy
- context_recall
- context_precision
output_dir: ../../outputs/siemens-pdf-offline-smoke
runtime:
batch_size: 4
max_samples: 30

View File

@@ -0,0 +1,22 @@
scenario_name: siemens-pdf-question-bank-online
mode: online
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
judge_model: deepseek-v4-flash
embedding_model: text-embedding-v3
metrics:
- faithfulness
- answer_relevancy
- context_recall
- context_precision
output_dir: ../../outputs/online/siemens-pdf-question-bank
runtime:
batch_size: 4
app_concurrency: 4
metric_concurrency: 4
max_samples: 50
app_adapter:
type: python
callable: apps.siemens_pdf_qa.adapter:run
static_kwargs:
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
model: deepseek-v4-flash

View File

@@ -0,0 +1,17 @@
job_name: siemens-pdf-question-bank
input:
path: ../../datasets/siemens-pdfs
glob: "*.pdf"
parser:
provider: aliyun_docmind
failure_mode: skip
generation:
output_type: online_question_bank
review_mode: draft_with_manual_review
max_questions_per_document: 10
max_source_chunks_per_question: 3
output:
dataset_path: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
artifact_dir: ../../outputs/dataset-builds/siemens-pdf-question-bank
runtime:
max_documents: 17

View File

@@ -0,0 +1,72 @@
"""Build the Siemens offline smoke dataset from a completed dataset_build run.
Must be run AFTER `python main.py --dataset-build-config
scenarios/siemens_build/siemens-pdf-build.yaml` has completed successfully.
It uses the stable `latest/` alias so you don't need to know the run_id.
Usage:
python scripts/build_siemens_offline_smoke.py
Output:
datasets/normalized/siemens_pdf_offline_smoke.csv
(referenced by scenarios/offline/siemens-pdf-offline-smoke.yaml)
"""
from __future__ import annotations
from pathlib import Path
# ---------------------------------------------------------------------------
# Filesystem locations, each anchored at the repository root (the parent of
# the directory that contains this script).
# ---------------------------------------------------------------------------
REPO_ROOT = Path(__file__).resolve().parents[1]
# Draft question bank from the dataset build, read via the "latest" alias.
DRAFT_DATASET_PATH = REPO_ROOT.joinpath(
    "outputs",
    "dataset-builds",
    "siemens-pdf-question-bank",
    "latest",
    "dataset_draft.csv",
)
# Chunk artifact produced by the same build run.
SOURCE_CHUNKS_PATH = REPO_ROOT.joinpath(
    "outputs",
    "dataset-builds",
    "siemens-pdf-question-bank",
    "latest",
    "source_chunks.jsonl",
)
# Destination CSV for the offline smoke dataset.
OUTPUT_PATH = REPO_ROOT.joinpath(
    "datasets", "normalized", "siemens_pdf_offline_smoke.csv"
)
def main() -> None:
    """Produce the offline smoke CSV from the build artefacts and print a summary.

    Both build artefacts must already exist. After conversion the output is
    read back with pandas to report the row count and, when there is at least
    one row, the value counts of the ``language`` and ``difficulty`` columns
    (an empty dict is printed for a column that is absent).

    Raises:
        FileNotFoundError: When the draft dataset or the source chunks file
            is missing.
    """
    prerequisites = (
        (
            DRAFT_DATASET_PATH,
            f"Draft dataset not found: {DRAFT_DATASET_PATH}\n"
            "Run the dataset build first:\n"
            " python main.py --dataset-build-config "
            "scenarios/siemens_build/siemens-pdf-build.yaml",
        ),
        (
            SOURCE_CHUNKS_PATH,
            f"Source chunks not found: {SOURCE_CHUNKS_PATH}\n"
            "Run the dataset build first.",
        ),
    )
    for required_path, complaint in prerequisites:
        if not required_path.exists():
            raise FileNotFoundError(complaint)

    # Deferred import: keeps the module importable when rag_eval is not yet set up.
    from rag_eval.dataset_builder.offline_converter import build_offline_smoke_dataset

    output = build_offline_smoke_dataset(
        draft_dataset_path=DRAFT_DATASET_PATH,
        source_chunks_path=SOURCE_CHUNKS_PATH,
        output_path=OUTPUT_PATH,
    )

    import pandas as pd

    frame = pd.read_csv(output)
    row_total = len(frame)
    print(f"Offline smoke dataset written to: {output}")
    print(f"Total rows: {row_total}")
    if row_total == 0:
        return

    # Compute both distributions before printing either one.
    counts = {
        column: (
            frame[column].value_counts().to_dict() if column in frame.columns else {}
        )
        for column in ("language", "difficulty")
    }
    print(f"Language distribution: {counts['language']}")
    print(f"Difficulty distribution: {counts['difficulty']}")


if __name__ == "__main__":
    main()