first commit

2026-06-12 14:02:15 +08:00
commit 9cbdc1d95d
69 changed files with 9486 additions and 0 deletions
--- a/apps/pdf_question_bank/adapter.py
+++ b/apps/pdf_question_bank/adapter.py
@@ -0,0 +1,161 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from openai import OpenAI
+
+from rag_eval.settings import EvaluationSettings
+from rag_eval.shared.utils import parse_contexts
+
+
+_CHUNK_CACHE: dict[Path, dict[str, dict[str, Any]]] = {}
+
+
+def _resolve_source_chunks_path(source_chunks_path: str) -> Path:
+    """Resolve the configured chunk artifact path, with fallback for missing latest aliases."""
+    resolved_path = Path(source_chunks_path).resolve()
+    if resolved_path.exists():
+        return resolved_path
+
+    if resolved_path.parent.name != "latest":
+        raise FileNotFoundError(resolved_path)
+
+    artifact_root = resolved_path.parent.parent
+    if not artifact_root.exists():
+        raise FileNotFoundError(resolved_path)
+
+    candidate_runs = sorted(
+        [
+            entry for entry in artifact_root.iterdir()
+            if entry.is_dir() and entry.name != "latest"
+        ],
+        key=lambda path: path.name,
+        reverse=True,
+    )
+    for run_dir in candidate_runs:
+        candidate = run_dir / resolved_path.name
+        if candidate.exists():
+            return candidate
+
+    raise FileNotFoundError(resolved_path)
+
+
+def _load_source_chunks(source_chunks_path: str) -> dict[str, dict[str, Any]]:
+    """Load source chunk rows from JSONL and cache them by absolute file path."""
+    resolved_path = _resolve_source_chunks_path(source_chunks_path)
+    cached = _CHUNK_CACHE.get(resolved_path)
+    if cached is not None:
+        return cached
+
+    chunk_lookup: dict[str, dict[str, Any]] = {}
+    with resolved_path.open(encoding="utf-8") as handle:
+        for line_number, line in enumerate(handle, start=1):
+            text = line.strip()
+            if not text:
+                continue
+            payload = json.loads(text)
+            chunk_id = str(payload.get("chunk_id", "")).strip()
+            if not chunk_id:
+                raise ValueError(
+                    f"source_chunks.jsonl row {line_number} is missing chunk_id: {resolved_path}"
+                )
+            chunk_lookup[chunk_id] = payload
+
+    _CHUNK_CACHE[resolved_path] = chunk_lookup
+    return chunk_lookup
+
+
+def _resolve_chunk_ids(raw_chunk_ids: Any) -> list[str]:
+    """Parse the serialized source chunk id column into a non-empty list."""
+    chunk_ids = parse_contexts(raw_chunk_ids)
+    normalized = [chunk_id for chunk_id in chunk_ids if chunk_id]
+    if not normalized:
+        raise ValueError("source_chunk_ids is required for pdf question bank samples.")
+    return normalized
+
+
+def _build_messages(question: str, contexts: list[str], metadata: dict[str, Any]) -> list[dict[str, str]]:
+    """Construct an evidence-grounded prompt for answer generation."""
+    evidence_lines = [
+        f"[chunk {index}] {context}"
+        for index, context in enumerate(contexts, start=1)
+    ]
+    metadata_lines = [
+        f"doc_id: {metadata.get('doc_id', '')}",
+        f"doc_name: {metadata.get('doc_name', '')}",
+        f"section_path: {metadata.get('section_path', '')}",
+    ]
+    system_prompt = (
+        "You answer questions only from the provided evidence chunks. "
+        "Do not use outside knowledge. If the evidence is insufficient, say so plainly. "
+        "Do not invent missing facts, citations, steps, or numbers."
+    )
+    user_prompt = "\n".join(
+        [
+            "Question:",
+            question,
+            "",
+            "Sample metadata:",
+            *metadata_lines,
+            "",
+            "Evidence chunks:",
+            *evidence_lines,
+            "",
+            "Return a concise answer grounded only in the evidence above.",
+        ]
+    )
+    return [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
+
+
+def run(
+    question: str,
+    *,
+    source_chunks_path: str,
+    model: str | None = None,
+    client: OpenAI | None = None,
+    **kwargs: Any,
+) -> dict[str, Any]:
+    """Answer one question by resolving cited chunks and querying an OpenAI-compatible model."""
+    chunk_ids = _resolve_chunk_ids(kwargs.get("source_chunk_ids"))
+    chunk_lookup = _load_source_chunks(source_chunks_path)
+
+    missing_ids = [chunk_id for chunk_id in chunk_ids if chunk_id not in chunk_lookup]
+    if missing_ids:
+        raise ValueError(
+            "source_chunk_ids not found in source chunks artifact: " + ", ".join(missing_ids)
+        )
+
+    resolved_chunks = [chunk_lookup[chunk_id] for chunk_id in chunk_ids]
+    contexts = [str(chunk.get("text", "")).strip() for chunk in resolved_chunks if str(chunk.get("text", "")).strip()]
+    if not contexts:
+        raise ValueError("resolved source chunks did not contain usable text contexts.")
+
+    settings = EvaluationSettings()
+    target_model = (model or settings.ragas_judge_model).strip()
+    if not target_model:
+        raise ValueError("A model name is required for pdf question bank adapter.")
+
+    llm_client = client or OpenAI(**settings.openai_client_kwargs)
+    completion = llm_client.chat.completions.create(
+        model=target_model,
+        messages=_build_messages(question, contexts, kwargs),
+        temperature=0,
+    )
+    answer = str(completion.choices[0].message.content or "").strip()
+
+    return {
+        "answer": answer,
+        "contexts": contexts,
+        "raw_response": {
+            "resolved_chunk_ids": chunk_ids,
+            "resolved_doc_id": kwargs.get("doc_id", ""),
+            "resolved_doc_name": kwargs.get("doc_name", ""),
+            "model": target_model,
+            "response_text": answer,
+        },
+    }