Add Siemens CT document evaluation scenario (three-step pipeline)
- scenarios/siemens_build/siemens-pdf-build.yaml: dataset build for all 17 Siemens medical-imaging PDFs (aliyun_docmind parser, 10 questions/doc, failure_mode=skip, ~170 questions total) - scenarios/offline/siemens-pdf-offline-smoke.yaml: offline evaluation using source chunks as contexts and ground_truth as answer (up to 30 samples) - scenarios/online/siemens-pdf-question-bank-online.yaml: online evaluation calling siemens_pdf_qa adapter, batch_size=4, up to 50 samples - apps/siemens_pdf_qa/adapter.py: Siemens-specific adapter with bilingual (zh/en) system prompt and strict evidence-grounding for CT domain - scripts/build_siemens_offline_smoke.py: helper to derive offline smoke CSV from completed dataset build artifacts (run after dataset build step) - docs/superpowers/specs/2026-06-15-siemens-scenario-design.md: design spec All three scenarios are automatically discovered by the web console. Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
This commit is contained in:
6
apps/siemens_pdf_qa/__init__.py
Normal file
6
apps/siemens_pdf_qa/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""Siemens PDF question bank adapter for online evaluation.
|
||||
|
||||
Wraps the generic pdf_question_bank adapter with a Siemens-specific system
|
||||
prompt that instructs the model to answer in the same language as the question
|
||||
(Chinese for Chinese CT documentation) and to cite only the provided evidence.
|
||||
"""
|
||||
170
apps/siemens_pdf_qa/adapter.py
Normal file
170
apps/siemens_pdf_qa/adapter.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""Online evaluation adapter for the Siemens medical-imaging PDF question bank.
|
||||
|
||||
Functionally identical to apps/pdf_question_bank/adapter.py but uses a
|
||||
Siemens-specific system prompt that:
|
||||
- Instructs the model to answer in the same language as the question
|
||||
(important for Chinese CT documentation).
|
||||
- Emphasises citation of source chunks and refusal when evidence is absent.
|
||||
- Adds domain context (medical imaging / CT terminology).
|
||||
|
||||
The adapter contract is the same as all other adapters:
|
||||
run(question, **kwargs) -> {"answer": str, "contexts": [str], "raw_response": {}}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
from rag_eval.settings import EvaluationSettings
|
||||
from rag_eval.shared.utils import parse_contexts
|
||||
|
||||
|
||||
# ── chunk cache (module-level, lives for the process lifetime) ────────────────
|
||||
# Keyed by the resolved artifact path; each value maps chunk_id -> decoded JSONL row.
# This module never evicts or refreshes an entry once it has been stored.
_CHUNK_CACHE: dict[Path, dict[str, dict[str, Any]]] = {}
|
||||
|
||||
|
||||
def _resolve_source_chunks_path(source_chunks_path: str) -> Path:
|
||||
"""Resolve the chunk artifact path; fall back to the latest timestamped run."""
|
||||
resolved = Path(source_chunks_path).resolve()
|
||||
if resolved.exists():
|
||||
return resolved
|
||||
if resolved.parent.name != "latest":
|
||||
raise FileNotFoundError(resolved)
|
||||
artifact_root = resolved.parent.parent
|
||||
if not artifact_root.exists():
|
||||
raise FileNotFoundError(resolved)
|
||||
candidates = sorted(
|
||||
[d for d in artifact_root.iterdir() if d.is_dir() and d.name != "latest"],
|
||||
key=lambda p: p.name,
|
||||
reverse=True,
|
||||
)
|
||||
for run_dir in candidates:
|
||||
candidate = run_dir / resolved.name
|
||||
if candidate.exists():
|
||||
return candidate
|
||||
raise FileNotFoundError(resolved)
|
||||
|
||||
|
||||
def _load_source_chunks(source_chunks_path: str) -> dict[str, dict[str, Any]]:
    """Load the source-chunk JSONL artifact and index its rows by ``chunk_id``.

    The path is resolved with ``_resolve_source_chunks_path`` and the parsed
    lookup is stored in ``_CHUNK_CACHE`` under the resolved path, so a given
    artifact file is parsed at most once per process.

    Args:
        source_chunks_path: Path to a UTF-8 JSONL file holding one JSON object
            per non-blank line. Blank lines are ignored.

    Returns:
        Mapping of stripped ``chunk_id`` to the full decoded row. When an id
        occurs more than once, the last row wins.

    Raises:
        FileNotFoundError: Propagated from path resolution.
        ValueError: A row is not a JSON object, or its ``chunk_id`` is
            missing, null or blank. ``json.JSONDecodeError`` (a ``ValueError``
            subclass) propagates unchanged for malformed JSON.
    """
    resolved = _resolve_source_chunks_path(source_chunks_path)
    cached = _CHUNK_CACHE.get(resolved)
    if cached is not None:
        return cached

    lookup: dict[str, dict[str, Any]] = {}
    with resolved.open(encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, 1):
            text = line.strip()
            if not text:
                continue
            payload = json.loads(text)
            if not isinstance(payload, dict):
                # Arrays/scalars have no .get(); fail with a row-specific message
                # instead of an opaque AttributeError.
                raise ValueError(
                    f"source_chunks.jsonl row {lineno} is not a JSON object: {resolved}"
                )
            raw_id = payload.get("chunk_id")
            # A JSON null must count as missing: str(None) would otherwise
            # register the row under the literal id "None".
            chunk_id = "" if raw_id is None else str(raw_id).strip()
            if not chunk_id:
                raise ValueError(f"source_chunks.jsonl row {lineno} missing chunk_id: {resolved}")
            lookup[chunk_id] = payload

    # Cache only after the whole file parsed successfully.
    _CHUNK_CACHE[resolved] = lookup
    return lookup
|
||||
|
||||
|
||||
def _resolve_chunk_ids(raw: Any) -> list[str]:
    """Extract the usable chunk ids from the raw ``source_chunk_ids`` value.

    The value is handed to ``parse_contexts`` and every falsy entry of the
    result (for example an empty string) is discarded.

    Args:
        raw: The ``source_chunk_ids`` value exactly as supplied by the caller;
            its accepted formats are whatever ``parse_contexts`` supports.

    Returns:
        The remaining ids, in their original order.

    Raises:
        ValueError: No id is left after filtering.
    """
    chunk_ids = list(filter(None, parse_contexts(raw)))
    if chunk_ids:
        return chunk_ids
    raise ValueError("source_chunk_ids is required for siemens_pdf_qa adapter.")
|
||||
|
||||
|
||||
def _build_messages(
|
||||
question: str,
|
||||
contexts: list[str],
|
||||
metadata: dict[str, Any],
|
||||
) -> list[dict[str, str]]:
|
||||
"""Build a Siemens-domain grounded prompt for the answer model."""
|
||||
evidence_lines = [f"[chunk {i}] {ctx}" for i, ctx in enumerate(contexts, 1)]
|
||||
meta_lines = [
|
||||
f"doc_name: {metadata.get('doc_name', '')}",
|
||||
f"section_path: {metadata.get('section_path', '')}",
|
||||
f"page_range: {metadata.get('page_start', '')}–{metadata.get('page_end', '')}",
|
||||
]
|
||||
# Siemens-specific system prompt: bilingual awareness, medical domain, strict grounding
|
||||
system_prompt = (
|
||||
"你是西门子医疗影像知识库的问答助手(Siemens Healthineers CT Knowledge Base QA)。"
|
||||
"请严格根据下方【证据片段】回答问题,不得使用片段之外的任何知识。"
|
||||
"若证据不足以回答,请明确说明「根据现有资料无法回答」。"
|
||||
"请用与问题相同的语言(中文或英文)作答,简洁准确,必要时引用片段编号。"
|
||||
)
|
||||
user_prompt = "\n".join([
|
||||
"【问题】",
|
||||
question,
|
||||
"",
|
||||
"【文档元信息】",
|
||||
*meta_lines,
|
||||
"",
|
||||
"【证据片段】",
|
||||
*evidence_lines,
|
||||
"",
|
||||
"请基于以上证据片段作答。",
|
||||
])
|
||||
return [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt},
|
||||
]
|
||||
|
||||
|
||||
def run(
    question: str,
    *,
    source_chunks_path: str,
    model: str | None = None,
    client: OpenAI | None = None,
    **kwargs: Any,
) -> dict[str, Any]:
    """Answer one question from its cited chunks via an OpenAI-compatible model.

    This is the adapter contract entry point used by the online evaluation runner.

    Args:
        question: The question to answer.
        source_chunks_path: Path to the ``source_chunks.jsonl`` artifact.
        model: Model name to call. When falsy, ``ragas_judge_model`` from
            ``EvaluationSettings`` is used instead.
        client: Pre-built client to use. When omitted, one is created from
            ``settings.openai_client_kwargs``.
        **kwargs: Dataset row fields. ``source_chunk_ids`` is required;
            ``doc_id`` and ``doc_name`` are echoed into ``raw_response``. The
            whole mapping is also passed to ``_build_messages`` as metadata.

    Returns:
        ``{"answer": str, "contexts": [str], "raw_response": {...}}`` where
        ``contexts`` holds the stripped, non-empty text of each cited chunk.

    Raises:
        ValueError: ``source_chunk_ids`` is empty, names an id absent from the
            artifact, resolves to chunks without usable text, or no model name
            is available.
        FileNotFoundError: The chunk artifact cannot be located.
    """
    chunk_ids = _resolve_chunk_ids(kwargs.get("source_chunk_ids"))
    chunk_lookup = _load_source_chunks(source_chunks_path)

    missing = [cid for cid in chunk_ids if cid not in chunk_lookup]
    if missing:
        raise ValueError("source_chunk_ids not found in artifact: " + ", ".join(missing))

    contexts: list[str] = []
    for cid in chunk_ids:
        raw_text = chunk_lookup[cid].get("text")
        # A JSON null text must be skipped: str(None) would otherwise hand the
        # literal string "None" to the model as evidence.
        text = "" if raw_text is None else str(raw_text).strip()
        if text:
            contexts.append(text)
    if not contexts:
        raise ValueError("resolved source chunks contain no usable text.")

    settings = EvaluationSettings()
    # The trailing `or ""` keeps an unset default model from crashing on
    # .strip(), so the explicit ValueError below is what the caller sees.
    target_model = (model or settings.ragas_judge_model or "").strip()
    if not target_model:
        raise ValueError("A model name is required for siemens_pdf_qa adapter.")

    llm_client = client or OpenAI(**settings.openai_client_kwargs)
    completion = llm_client.chat.completions.create(
        model=target_model,
        messages=_build_messages(question, contexts, kwargs),
        temperature=0,
    )
    answer = str(completion.choices[0].message.content or "").strip()

    return {
        "answer": answer,
        "contexts": contexts,
        "raw_response": {
            "resolved_chunk_ids": chunk_ids,
            "doc_id": kwargs.get("doc_id", ""),
            "doc_name": kwargs.get("doc_name", ""),
            "model": target_model,
            "response_text": answer,
        },
    }
|
||||
59
docs/superpowers/specs/2026-06-15-siemens-scenario-design.md
Normal file
59
docs/superpowers/specs/2026-06-15-siemens-scenario-design.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# Siemens PDF 场景设计 Spec
|
||||
|
||||
- 日期:2026-06-15
|
||||
- 状态:已确认,进入实现。
|
||||
|
||||
## 1. 目标
|
||||
|
||||
基于 `datasets/siemens-pdfs/`(17 个西门子医疗 CT 中文 PDF),跑通完整三步流水线:
|
||||
|
||||
```
|
||||
dataset_build(PDF→题库)→ offline smoke 评估 → online 评估
|
||||
```
|
||||
|
||||
完全镜像现有 `sample-pdf-*` 模式(方案 A),不改动任何现有文件。
|
||||
|
||||
## 2. 参数决策
|
||||
|
||||
| 项目 | 值 |
|
||||
|---|---|
|
||||
| 输入 PDF | `datasets/siemens-pdfs/*.pdf`(17 个) |
|
||||
| failure_mode | `skip`(单个文档解析失败不中断整批) |
|
||||
| max_questions_per_document | 10(共 ~170 题) |
|
||||
| max_source_chunks_per_question | 3 |
|
||||
| generation model | `.env` 的 `DATASET_GENERATOR_MODEL`(qwen3.6-plus) |
|
||||
| judge model | `.env` 的 `RAGAS_JUDGE_MODEL`(deepseek-v4-flash) |
|
||||
| embedding model | `.env` 的 `RAGAS_EMBEDDING_MODEL`(text-embedding-v3) |
|
||||
| online answer model | `.env` 的 `RAGAS_JUDGE_MODEL` |
|
||||
| metrics | faithfulness / answer_relevancy / context_recall / context_precision |
|
||||
|
||||
## 3. 新增文件(5 个)
|
||||
|
||||
```
|
||||
scenarios/siemens_build/siemens-pdf-build.yaml
|
||||
scenarios/offline/siemens-pdf-offline-smoke.yaml
|
||||
scenarios/online/siemens-pdf-question-bank-online.yaml
|
||||
apps/siemens_pdf_qa/__init__.py
|
||||
apps/siemens_pdf_qa/adapter.py
|
||||
```
|
||||
|
||||
加上辅助脚本:
|
||||
```
|
||||
scripts/build_siemens_offline_smoke.py ← 从 build 产物生成 offline smoke CSV
|
||||
```
|
||||
|
||||
## 4. 运行顺序
|
||||
|
||||
```
|
||||
# 步骤 1:dataset build(PDF → 题库草稿 + source_chunks.jsonl)
|
||||
python main.py --dataset-build-config scenarios/siemens_build/siemens-pdf-build.yaml
|
||||
|
||||
# 步骤 2:生成 offline smoke 数据集(一次性脚本,build 跑完后执行)
|
||||
python scripts/build_siemens_offline_smoke.py
|
||||
|
||||
# 步骤 3:offline 评估(用 source chunks 作为 contexts,ground_truth 作为 answer)
|
||||
python main.py --scenario scenarios/offline/siemens-pdf-offline-smoke.yaml
|
||||
|
||||
# 步骤 4:online 评估(实时调用 LLM 生成 answer,再评分)
|
||||
python main.py --scenario scenarios/online/siemens-pdf-question-bank-online.yaml
|
||||
```
|
||||
15
scenarios/offline/siemens-pdf-offline-smoke.yaml
Normal file
15
scenarios/offline/siemens-pdf-offline-smoke.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
scenario_name: siemens-pdf-offline-smoke
|
||||
mode: offline
|
||||
app_adapter: null
|
||||
dataset: ../../datasets/normalized/siemens_pdf_offline_smoke.csv
|
||||
judge_model: deepseek-v4-flash
|
||||
embedding_model: text-embedding-v3
|
||||
metrics:
|
||||
- faithfulness
|
||||
- answer_relevancy
|
||||
- context_recall
|
||||
- context_precision
|
||||
output_dir: ../../outputs/siemens-pdf-offline-smoke
|
||||
runtime:
|
||||
batch_size: 4
|
||||
max_samples: 30
|
||||
22
scenarios/online/siemens-pdf-question-bank-online.yaml
Normal file
22
scenarios/online/siemens-pdf-question-bank-online.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
scenario_name: siemens-pdf-question-bank-online
|
||||
mode: online
|
||||
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
|
||||
judge_model: deepseek-v4-flash
|
||||
embedding_model: text-embedding-v3
|
||||
metrics:
|
||||
- faithfulness
|
||||
- answer_relevancy
|
||||
- context_recall
|
||||
- context_precision
|
||||
output_dir: ../../outputs/online/siemens-pdf-question-bank
|
||||
runtime:
|
||||
batch_size: 4
|
||||
app_concurrency: 4
|
||||
metric_concurrency: 4
|
||||
max_samples: 50
|
||||
app_adapter:
|
||||
type: python
|
||||
callable: apps.siemens_pdf_qa.adapter:run
|
||||
static_kwargs:
|
||||
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
|
||||
model: deepseek-v4-flash
|
||||
17
scenarios/siemens_build/siemens-pdf-build.yaml
Normal file
17
scenarios/siemens_build/siemens-pdf-build.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
job_name: siemens-pdf-question-bank
|
||||
input:
|
||||
path: ../../datasets/siemens-pdfs
|
||||
glob: "*.pdf"
|
||||
parser:
|
||||
provider: aliyun_docmind
|
||||
failure_mode: skip
|
||||
generation:
|
||||
output_type: online_question_bank
|
||||
review_mode: draft_with_manual_review
|
||||
max_questions_per_document: 10
|
||||
max_source_chunks_per_question: 3
|
||||
output:
|
||||
dataset_path: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
|
||||
artifact_dir: ../../outputs/dataset-builds/siemens-pdf-question-bank
|
||||
runtime:
|
||||
max_documents: 17
|
||||
72
scripts/build_siemens_offline_smoke.py
Normal file
72
scripts/build_siemens_offline_smoke.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Build the Siemens offline smoke dataset from a completed dataset_build run.
|
||||
|
||||
Must be run AFTER `python main.py --dataset-build-config
|
||||
scenarios/siemens_build/siemens-pdf-build.yaml` has completed successfully.
|
||||
|
||||
It uses the stable `latest/` alias so you don't need to know the run_id.
|
||||
|
||||
Usage:
|
||||
python scripts/build_siemens_offline_smoke.py
|
||||
|
||||
Output:
|
||||
datasets/normalized/siemens_pdf_offline_smoke.csv
|
||||
(referenced by scenarios/offline/siemens-pdf-offline-smoke.yaml)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Paths — all relative to the siemens_ragas/ repository root
|
||||
# ---------------------------------------------------------------------------
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]

# Both build artefacts are read from the ``latest/`` directory of the
# siemens-pdf-question-bank dataset build.
_LATEST_BUILD_DIR = REPO_ROOT.joinpath(
    "outputs", "dataset-builds", "siemens-pdf-question-bank", "latest"
)

DRAFT_DATASET_PATH = _LATEST_BUILD_DIR / "dataset_draft.csv"
SOURCE_CHUNKS_PATH = _LATEST_BUILD_DIR / "source_chunks.jsonl"
OUTPUT_PATH = REPO_ROOT.joinpath("datasets", "normalized", "siemens_pdf_offline_smoke.csv")
|
||||
|
||||
|
||||
def main() -> None:
    """Turn the Siemens build artefacts into the offline smoke dataset.

    Checks that the draft dataset and the source chunks exist, delegates the
    conversion to ``build_offline_smoke_dataset``, then reads the written CSV
    back and prints its row count plus, when the columns are present, the
    ``language`` and ``difficulty`` value distributions.

    Raises:
        FileNotFoundError: Either build artefact is missing.
    """
    prerequisites = (
        (
            DRAFT_DATASET_PATH,
            f"Draft dataset not found: {DRAFT_DATASET_PATH}\n"
            "Run the dataset build first:\n"
            " python main.py --dataset-build-config "
            "scenarios/siemens_build/siemens-pdf-build.yaml",
        ),
        (
            SOURCE_CHUNKS_PATH,
            f"Source chunks not found: {SOURCE_CHUNKS_PATH}\n"
            "Run the dataset build first.",
        ),
    )
    for artefact, complaint in prerequisites:
        if not artefact.exists():
            raise FileNotFoundError(complaint)

    # Import here so the script is importable even before rag_eval is fully set up.
    from rag_eval.dataset_builder.offline_converter import build_offline_smoke_dataset

    output = build_offline_smoke_dataset(
        draft_dataset_path=DRAFT_DATASET_PATH,
        source_chunks_path=SOURCE_CHUNKS_PATH,
        output_path=OUTPUT_PATH,
    )

    import pandas as pd

    frame = pd.read_csv(output)
    row_count = len(frame)
    print(f"Offline smoke dataset written to: {output}")
    print(f"Total rows: {row_count}")
    if not row_count:
        return

    def _distribution(column: str) -> dict:
        # Optional columns: report an empty distribution when absent.
        if column in frame.columns:
            return frame[column].value_counts().to_dict()
        return {}

    lang_counts = _distribution("language")
    diff_counts = _distribution("difficulty")
    print(f"Language distribution: {lang_counts}")
    print(f"Difficulty distribution: {diff_counts}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user