diff --git a/apps/siemens_pdf_qa/__init__.py b/apps/siemens_pdf_qa/__init__.py new file mode 100644 index 0000000..2691dc0 --- /dev/null +++ b/apps/siemens_pdf_qa/__init__.py @@ -0,0 +1,6 @@ +"""Siemens PDF question bank adapter for online evaluation. + +Wraps the generic pdf_question_bank adapter with a Siemens-specific system +prompt that instructs the model to answer in the same language as the question +(Chinese for Chinese CT documentation) and to cite only the provided evidence. +""" diff --git a/apps/siemens_pdf_qa/adapter.py b/apps/siemens_pdf_qa/adapter.py new file mode 100644 index 0000000..3a04e2e --- /dev/null +++ b/apps/siemens_pdf_qa/adapter.py @@ -0,0 +1,170 @@ +"""Online evaluation adapter for the Siemens medical-imaging PDF question bank. + +Functionally identical to apps/pdf_question_bank/adapter.py but uses a +Siemens-specific system prompt that: + - Instructs the model to answer in the same language as the question + (important for Chinese CT documentation). + - Emphasises citation of source chunks and refusal when evidence is absent. + - Adds domain context (medical imaging / CT terminology). 
# Chunk cache (module-level, lives for the process lifetime), keyed by the
# resolved artifact path so different spellings of one file share an entry.
_CHUNK_CACHE: dict[Path, dict[str, dict[str, Any]]] = {}


def _resolve_source_chunks_path(source_chunks_path: str) -> Path:
    """Resolve the chunk artifact path; fall back to the newest timestamped run.

    Relative paths are resolved against the current working directory. When the
    requested file is missing and its parent directory is named ``latest``, the
    sibling run directories are scanned in descending name order and the first
    one containing a file of the same name wins.

    Args:
        source_chunks_path: Path to ``source_chunks.jsonl`` (possibly under a
            ``latest/`` alias directory).

    Returns:
        The path of an existing regular file.

    Raises:
        FileNotFoundError: if neither the requested file nor a fallback exists.
    """
    resolved = Path(source_chunks_path).resolve()
    # is_file() rather than exists(): a directory here would only fail later
    # at open() with a less helpful error.
    if resolved.is_file():
        return resolved
    if resolved.parent.name != "latest":
        raise FileNotFoundError(resolved)

    artifact_root = resolved.parent.parent
    if not artifact_root.is_dir():
        raise FileNotFoundError(resolved)

    run_dirs = sorted(
        (d for d in artifact_root.iterdir() if d.is_dir() and d.name != "latest"),
        key=lambda p: p.name,
        reverse=True,
    )
    for run_dir in run_dirs:
        candidate = run_dir / resolved.name
        if candidate.is_file():
            return candidate
    raise FileNotFoundError(resolved)


def _load_source_chunks(source_chunks_path: str) -> dict[str, dict[str, Any]]:
    """Load source chunks from a JSONL artifact and cache them by ``chunk_id``.

    Blank lines are skipped. Each remaining line must be a JSON object with a
    non-empty ``chunk_id``; ids are stringified and stripped before being used
    as keys. If an id occurs more than once, the last row wins.

    Args:
        source_chunks_path: Path handed to ``_resolve_source_chunks_path``.

    Returns:
        Mapping of chunk id to the parsed row. The same dict object is returned
        on subsequent calls for the same resolved path.

    Raises:
        FileNotFoundError: if the artifact cannot be located.
        ValueError: if a row is not valid JSON, is not a JSON object, or has a
            missing / null / blank ``chunk_id``.
    """
    resolved = _resolve_source_chunks_path(source_chunks_path)
    cached = _CHUNK_CACHE.get(resolved)
    if cached is not None:
        return cached

    lookup: dict[str, dict[str, Any]] = {}
    with resolved.open(encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, 1):
            text = line.strip()
            if not text:
                continue
            try:
                payload = json.loads(text)
            except json.JSONDecodeError as exc:
                # JSONDecodeError is a ValueError subclass; re-raise with the
                # file and row so a broken artifact can be located.
                raise ValueError(
                    f"source_chunks.jsonl row {lineno} is not valid JSON: {resolved}"
                ) from exc
            if not isinstance(payload, dict):
                raise ValueError(
                    f"source_chunks.jsonl row {lineno} is not a JSON object: {resolved}"
                )
            raw_id = payload.get("chunk_id")
            # str(None) would yield the bogus id "None"; treat null as missing.
            chunk_id = "" if raw_id is None else str(raw_id).strip()
            if not chunk_id:
                raise ValueError(f"source_chunks.jsonl row {lineno} missing chunk_id: {resolved}")
            lookup[chunk_id] = payload

    # Publish only after the whole file parsed, so a failed load is never cached.
    _CHUNK_CACHE[resolved] = lookup
    return lookup
def _resolve_chunk_ids(raw: Any) -> list[str]:
    """Parse the ``source_chunk_ids`` column into a list of non-empty id strings.

    Ids are stringified and stripped so that they compare equal to the keys of
    the chunk lookup, which are stored as ``str(chunk_id).strip()``.

    Raises:
        ValueError: if no usable id remains after normalisation.
    """
    parsed = parse_contexts(raw) or []
    stripped = (str(item).strip() for item in parsed if item is not None)
    normalized = [chunk_id for chunk_id in stripped if chunk_id]
    if not normalized:
        raise ValueError("source_chunk_ids is required for siemens_pdf_qa adapter.")
    return normalized


def _format_meta_value(value: Any) -> str:
    """Render one metadata value for the prompt; missing values become ''."""
    if value is None:
        return ""
    # NaN is the only float that differs from itself; an empty CSV cell may
    # arrive as NaN when the dataset is read through pandas (TODO confirm).
    if isinstance(value, float) and value != value:
        return ""
    return str(value)


def _build_messages(
    question: str,
    contexts: list[str],
    metadata: dict[str, Any],
) -> list[dict[str, str]]:
    """Build a Siemens-domain grounded prompt for the answer model.

    Args:
        question: The question text, inserted verbatim.
        contexts: Evidence texts; numbered from 1 as ``[chunk N]``.
        metadata: Mapping read for ``doc_name``, ``section_path``,
            ``page_start`` and ``page_end``; other keys are ignored.

    Returns:
        A two-message chat payload: system prompt, then user prompt.
    """
    evidence_lines = [f"[chunk {i}] {ctx}" for i, ctx in enumerate(contexts, 1)]
    page_start = _format_meta_value(metadata.get("page_start"))
    page_end = _format_meta_value(metadata.get("page_end"))
    meta_lines = [
        f"doc_name: {_format_meta_value(metadata.get('doc_name'))}",
        f"section_path: {_format_meta_value(metadata.get('section_path'))}",
        f"page_range: {page_start}–{page_end}",
    ]
    # Siemens-specific system prompt: bilingual awareness, medical domain,
    # strict grounding in the supplied evidence.
    system_prompt = (
        "你是西门子医疗影像知识库的问答助手(Siemens Healthineers CT Knowledge Base QA)。"
        "请严格根据下方【证据片段】回答问题,不得使用片段之外的任何知识。"
        "若证据不足以回答,请明确说明「根据现有资料无法回答」。"
        "请用与问题相同的语言(中文或英文)作答,简洁准确,必要时引用片段编号。"
    )
    user_prompt = "\n".join([
        "【问题】",
        question,
        "",
        "【文档元信息】",
        *meta_lines,
        "",
        "【证据片段】",
        *evidence_lines,
        "",
        "请基于以上证据片段作答。",
    ])
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]


def run(
    question: str,
    *,
    source_chunks_path: str,
    model: str | None = None,
    client: OpenAI | None = None,
    **kwargs: Any,
) -> dict[str, Any]:
    """Answer one question by resolving cited chunks and calling an OpenAI-compatible model.

    This is the adapter contract entry point used by the online evaluation runner.

    Args:
        question: The question to answer.
        source_chunks_path: Path to the ``source_chunks.jsonl`` artifact.
        model: Answer model name; falls back to ``settings.ragas_judge_model``.
        client: Pre-built OpenAI-compatible client; one is created from
            ``settings.openai_client_kwargs`` when omitted.
        **kwargs: Dataset row fields. ``source_chunk_ids`` is required;
            ``doc_id`` / ``doc_name`` are echoed in ``raw_response`` and the
            whole mapping is passed to the prompt builder as metadata.

    Returns:
        ``{"answer": str, "contexts": [str], "raw_response": {...}}``.

    Raises:
        ValueError: if chunk ids are missing or unknown, the resolved chunks
            carry no text, or no model name is available.
        RuntimeError: if the model response contains no choices.
    """
    chunk_ids = _resolve_chunk_ids(kwargs.get("source_chunk_ids"))
    chunk_lookup = _load_source_chunks(source_chunks_path)

    missing = [cid for cid in chunk_ids if cid not in chunk_lookup]
    if missing:
        raise ValueError("source_chunk_ids not found in artifact: " + ", ".join(missing))

    # Chunks with empty text are dropped from ``contexts``; note that
    # ``resolved_chunk_ids`` below still lists every requested id.
    contexts: list[str] = []
    for cid in chunk_ids:
        raw_text = chunk_lookup[cid].get("text")
        # str(None) would inject the literal "None" as evidence.
        text = "" if raw_text is None else str(raw_text).strip()
        if text:
            contexts.append(text)
    if not contexts:
        raise ValueError("resolved source chunks contain no usable text.")

    settings = EvaluationSettings()
    # Guard against an unset judge model so the ValueError below is reached
    # instead of an AttributeError on ``None.strip()``.
    target_model = str(model or settings.ragas_judge_model or "").strip()
    if not target_model:
        raise ValueError("A model name is required for siemens_pdf_qa adapter.")

    llm_client = client if client is not None else OpenAI(**settings.openai_client_kwargs)
    completion = llm_client.chat.completions.create(
        model=target_model,
        messages=_build_messages(question, contexts, kwargs),
        temperature=0,
    )
    if not completion.choices:
        raise RuntimeError("answer model returned no choices.")
    answer = str(completion.choices[0].message.content or "").strip()

    return {
        "answer": answer,
        "contexts": contexts,
        "raw_response": {
            "resolved_chunk_ids": chunk_ids,
            "doc_id": kwargs.get("doc_id", ""),
            "doc_name": kwargs.get("doc_name", ""),
            "model": target_model,
            "response_text": answer,
        },
    }
参数决策 + +| 项目 | 值 | +|---|---| +| 输入 PDF | `datasets/siemens-pdfs/*.pdf`(17 个) | +| failure_mode | `skip`(单个文档解析失败不中断整批) | +| max_questions_per_document | 10(共 ~170 题) | +| max_source_chunks_per_question | 3 | +| generation model | `.env` 的 `DATASET_GENERATOR_MODEL`(qwen3.6-plus) | +| judge model | `.env` 的 `RAGAS_JUDGE_MODEL`(deepseek-v4-flash) | +| embedding model | `.env` 的 `RAGAS_EMBEDDING_MODEL`(text-embedding-v3) | +| online answer model | `.env` 的 `RAGAS_JUDGE_MODEL` | +| metrics | faithfulness / answer_relevancy / context_recall / context_precision | + +## 3. 新增文件(5 个) + +``` +scenarios/siemens_build/siemens-pdf-build.yaml +scenarios/offline/siemens-pdf-offline-smoke.yaml +scenarios/online/siemens-pdf-question-bank-online.yaml +apps/siemens_pdf_qa/__init__.py +apps/siemens_pdf_qa/adapter.py +``` + +加上辅助脚本: +``` +scripts/build_siemens_offline_smoke.py ← 从 build 产物生成 offline smoke CSV +``` + +## 4. 运行顺序 + +``` +# 步骤 1:dataset build(PDF → 题库草稿 + source_chunks.jsonl) +python main.py --dataset-build-config scenarios/siemens_build/siemens-pdf-build.yaml + +# 步骤 2:生成 offline smoke 数据集(一次性脚本,build 跑完后执行) +python scripts/build_siemens_offline_smoke.py + +# 步骤 3:offline 评估(用 source chunks 作为 contexts,ground_truth 作为 answer) +python main.py --scenario scenarios/offline/siemens-pdf-offline-smoke.yaml + +# 步骤 4:online 评估(实时调用 LLM 生成 answer,再评分) +python main.py --scenario scenarios/online/siemens-pdf-question-bank-online.yaml +``` diff --git a/scenarios/offline/siemens-pdf-offline-smoke.yaml b/scenarios/offline/siemens-pdf-offline-smoke.yaml new file mode 100644 index 0000000..9494bd2 --- /dev/null +++ b/scenarios/offline/siemens-pdf-offline-smoke.yaml @@ -0,0 +1,15 @@ +scenario_name: siemens-pdf-offline-smoke +mode: offline +app_adapter: null +dataset: ../../datasets/normalized/siemens_pdf_offline_smoke.csv +judge_model: deepseek-v4-flash +embedding_model: text-embedding-v3 +metrics: + - faithfulness + - answer_relevancy + - context_recall + - context_precision 
+output_dir: ../../outputs/siemens-pdf-offline-smoke +runtime: + batch_size: 4 + max_samples: 30 diff --git a/scenarios/online/siemens-pdf-question-bank-online.yaml b/scenarios/online/siemens-pdf-question-bank-online.yaml new file mode 100644 index 0000000..defc90d --- /dev/null +++ b/scenarios/online/siemens-pdf-question-bank-online.yaml @@ -0,0 +1,22 @@ +scenario_name: siemens-pdf-question-bank-online +mode: online +dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv +judge_model: deepseek-v4-flash +embedding_model: text-embedding-v3 +metrics: + - faithfulness + - answer_relevancy + - context_recall + - context_precision +output_dir: ../../outputs/online/siemens-pdf-question-bank +runtime: + batch_size: 4 + app_concurrency: 4 + metric_concurrency: 4 + max_samples: 50 +app_adapter: + type: python + callable: apps.siemens_pdf_qa.adapter:run + static_kwargs: + source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl + model: deepseek-v4-flash diff --git a/scenarios/siemens_build/siemens-pdf-build.yaml b/scenarios/siemens_build/siemens-pdf-build.yaml new file mode 100644 index 0000000..356b1ab --- /dev/null +++ b/scenarios/siemens_build/siemens-pdf-build.yaml @@ -0,0 +1,17 @@ +job_name: siemens-pdf-question-bank +input: + path: ../../datasets/siemens-pdfs + glob: "*.pdf" +parser: + provider: aliyun_docmind + failure_mode: skip +generation: + output_type: online_question_bank + review_mode: draft_with_manual_review + max_questions_per_document: 10 + max_source_chunks_per_question: 3 +output: + dataset_path: ../../datasets/raw/generated/siemens-pdf-question-bank.csv + artifact_dir: ../../outputs/dataset-builds/siemens-pdf-question-bank +runtime: + max_documents: 17 diff --git a/scripts/build_siemens_offline_smoke.py b/scripts/build_siemens_offline_smoke.py new file mode 100644 index 0000000..82a995b --- /dev/null +++ b/scripts/build_siemens_offline_smoke.py @@ -0,0 +1,72 @@ +"""Build the Siemens offline smoke 
# ---------------------------------------------------------------------------
# Paths — all relative to the siemens_ragas/ repository root
# ---------------------------------------------------------------------------
REPO_ROOT = Path(__file__).resolve().parents[1]

DRAFT_DATASET_PATH = REPO_ROOT.joinpath(
    "outputs", "dataset-builds", "siemens-pdf-question-bank", "latest", "dataset_draft.csv"
)
SOURCE_CHUNKS_PATH = REPO_ROOT.joinpath(
    "outputs", "dataset-builds", "siemens-pdf-question-bank", "latest", "source_chunks.jsonl"
)
OUTPUT_PATH = REPO_ROOT.joinpath("datasets", "normalized", "siemens_pdf_offline_smoke.csv")


def main() -> None:
    """Turn the Siemens build artefacts into an offline-evaluable dataset.

    Checks that both build outputs exist, delegates the conversion to
    ``build_offline_smoke_dataset``, then prints the output location, the row
    count and, for a non-empty result, the value counts of the ``language``
    and ``difficulty`` columns (an empty dict when a column is absent).

    Raises:
        FileNotFoundError: if the draft dataset or the source chunks are missing.
    """
    if not DRAFT_DATASET_PATH.exists():
        draft_message = (
            f"Draft dataset not found: {DRAFT_DATASET_PATH}\n"
            "Run the dataset build first:\n"
            "  python main.py --dataset-build-config "
            "scenarios/siemens_build/siemens-pdf-build.yaml"
        )
        raise FileNotFoundError(draft_message)
    if not SOURCE_CHUNKS_PATH.exists():
        chunks_message = (
            f"Source chunks not found: {SOURCE_CHUNKS_PATH}\n"
            "Run the dataset build first."
        )
        raise FileNotFoundError(chunks_message)

    # Deferred import: keeps the script importable before rag_eval is set up.
    from rag_eval.dataset_builder.offline_converter import build_offline_smoke_dataset

    written_path = build_offline_smoke_dataset(
        draft_dataset_path=DRAFT_DATASET_PATH,
        source_chunks_path=SOURCE_CHUNKS_PATH,
        output_path=OUTPUT_PATH,
    )

    import pandas as pd

    table = pd.read_csv(written_path)
    print(f"Offline smoke dataset written to: {written_path}")
    print(f"Total rows: {len(table)}")
    if len(table) == 0:
        return

    if "language" in table.columns:
        lang_counts = table["language"].value_counts().to_dict()
    else:
        lang_counts = {}
    if "difficulty" in table.columns:
        diff_counts = table["difficulty"].value_counts().to_dict()
    else:
        diff_counts = {}
    print(f"Language distribution: {lang_counts}")
    print(f"Difficulty distribution: {diff_counts}")


if __name__ == "__main__":
    main()