first commit

2026-06-12 14:02:15 +08:00
commit 9cbdc1d95d
69 changed files with 9486 additions and 0 deletions
--- a/rag_eval/__init__.py
+++ b/rag_eval/__init__.py
@@ -0,0 +1,5 @@
+"""Public package exports for the RAG evaluation toolkit."""
+
+from .execution.runner import run_scenario
+
+__all__ = ["run_scenario"]
--- a/rag_eval/adapters/__init__.py
+++ b/rag_eval/adapters/__init__.py
@@ -0,0 +1,7 @@
+"""Adapter implementations that connect evaluation flows to target applications."""
+
+from .base import AppAdapter
+from .http import HttpAppAdapter
+from .python import PythonFunctionAdapter
+
+__all__ = ["AppAdapter", "HttpAppAdapter", "PythonFunctionAdapter"]
--- a/rag_eval/adapters/base.py
+++ b/rag_eval/adapters/base.py
@@ -0,0 +1,37 @@
+"""Shared adapter interfaces for online application execution."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from rag_eval.shared.models import NormalizedSample
+
+
+class AppAdapter(ABC):
+    """Abstract base class for adapters that fetch answers and contexts from apps.
+
+    Subclasses implement ``run``; ``enrich_sample`` converts the mapping it
+    returns into a new ``NormalizedSample``.
+    """
+
+    @abstractmethod
+    async def run(self, question: str, **kwargs: Any) -> dict[str, Any]:
+        """Execute the target application for a single question.
+
+        Args:
+            question: The question text to send to the application.
+            **kwargs: Extra per-sample arguments; ``enrich_sample`` forwards the
+                sample metadata here.
+
+        Returns:
+            A mapping from which ``enrich_sample`` reads the optional keys
+            ``answer``, ``contexts`` and ``raw_response``.
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def _clean_text(value: Any) -> str:
+        """Return ``value`` as stripped text, treating ``None`` as empty."""
+        return "" if value is None else str(value).strip()
+
+    async def enrich_sample(self, sample: NormalizedSample) -> NormalizedSample:
+        """Merge adapter output into an existing normalized sample.
+
+        The adapter is run with the sample's question and its metadata as
+        keyword arguments. The answer and contexts of the returned sample come
+        from the adapter response; every other field is copied from ``sample``,
+        and the response's ``raw_response`` is stored under that key in the
+        metadata.
+
+        Args:
+            sample: The sample to enrich; it is not modified.
+
+        Returns:
+            A new ``NormalizedSample`` carrying the adapter's answer and contexts.
+        """
+        response = await self.run(question=sample.question, **sample.metadata)
+        # A null answer must become "", not the literal text "None".
+        answer = self._clean_text(response.get("answer"))
+        contexts = response.get("contexts") or []
+        if isinstance(contexts, str):
+            # A bare string is one context, not a sequence of characters.
+            contexts = [contexts]
+        # Drop empty context fragments so downstream metrics receive clean lists.
+        normalized_contexts = [text for text in map(self._clean_text, contexts) if text]
+        return NormalizedSample(
+            sample_id=sample.sample_id,
+            question=sample.question,
+            contexts=normalized_contexts,
+            answer=answer,
+            ground_truth=sample.ground_truth,
+            scenario=sample.scenario,
+            language=sample.language,
+            retrieval_config=sample.retrieval_config,
+            metadata={**sample.metadata, "raw_response": response.get("raw_response")},
+            raw=sample.raw,
+        )
--- a/rag_eval/adapters/http.py
+++ b/rag_eval/adapters/http.py
@@ -0,0 +1,45 @@
+"""HTTP adapter implementation for online evaluation scenarios."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import httpx
+
+from rag_eval.shared.models import AppAdapterConfig
+
+from .base import AppAdapter
+
+
+class HttpAppAdapter(AppAdapter):
+    """Call an HTTP endpoint and map its JSON response into the normalized adapter shape."""
+
+    def __init__(self, config: AppAdapterConfig):
+        """Store the HTTP adapter configuration for later requests."""
+        self.config = config
+
+    async def run(self, question: str, **kwargs: Any) -> dict[str, Any]:
+        """Send one HTTP request and return the normalized response payload.
+
+        The JSON request body is layered in this order, later layers winning:
+        ``request_template``, the question, ``static_kwargs``, then ``kwargs``.
+
+        Args:
+            question: Question text placed under the ``question`` key.
+            **kwargs: Per-call values merged last into the request body.
+
+        Returns:
+            A dict with ``answer`` and ``contexts`` (read through the optional
+            ``response_mapping`` key renames) plus the full body as
+            ``raw_response``.
+
+        Raises:
+            httpx.HTTPStatusError: If the endpoint answers with an error status.
+            TypeError: If the response body is not a JSON object.
+        """
+        payload = dict(self.config.request_template)
+        payload["question"] = question
+        payload.update(self.config.static_kwargs)
+        payload.update(kwargs)
+
+        # A fresh client per call keeps the adapter stateless between requests.
+        async with httpx.AsyncClient(timeout=self.config.timeout_seconds) as client:
+            response = await client.request(
+                self.config.method.upper(),
+                self.config.endpoint or "",
+                json=payload,
+            )
+            response.raise_for_status()
+            body = response.json()
+
+        if not isinstance(body, dict):
+            # Fail with a clear contract error instead of an AttributeError on ``.get``.
+            raise TypeError(
+                f"HTTP adapter response body must be a JSON object, got {type(body).__name__}."
+            )
+
+        # Allow scenario config to rename answer/context fields without custom code.
+        mapping = self.config.response_mapping or {}
+        answer_key = mapping.get("answer", "answer")
+        contexts_key = mapping.get("contexts", "contexts")
+        return {
+            "answer": body.get(answer_key, ""),
+            "contexts": body.get(contexts_key, []),
+            "raw_response": body,
+        }
--- a/rag_eval/adapters/python.py
+++ b/rag_eval/adapters/python.py
@@ -0,0 +1,38 @@
+"""Python callable adapter for in-process application integrations."""
+
+from __future__ import annotations
+
+from importlib import import_module
+from typing import Any, Callable
+
+from rag_eval.shared.models import AppAdapterConfig
+
+from .base import AppAdapter
+
+
+class PythonFunctionAdapter(AppAdapter):
+    """Wrap a configured Python callable so it can participate in online evaluation."""
+
+    def __init__(self, config: AppAdapterConfig):
+        """Load and cache the configured callable during adapter initialization."""
+        self.config = config
+        self._callable = self._load_callable(config.callable or "")
+
+    @staticmethod
+    def _load_callable(target: str) -> Callable[..., dict[str, Any]]:
+        """Resolve a `module:function` target into a callable object.
+
+        Args:
+            target: Import target written as ``module.path:attribute``.
+
+        Returns:
+            The attribute named after the colon, looked up on the imported module.
+
+        Raises:
+            ValueError: If either side of the colon is missing.
+            ImportError: If the module cannot be imported.
+            AttributeError: If the module has no such attribute.
+            TypeError: If the resolved attribute is not callable.
+        """
+        module_name, _, attr_name = target.partition(":")
+        if not module_name or not attr_name:
+            raise ValueError("Python adapter callable must use module:function syntax.")
+        module = import_module(module_name)
+        fn = getattr(module, attr_name)
+        if not callable(fn):
+            raise TypeError(f"Configured callable is not callable: {target}")
+        return fn
+
+    async def run(self, question: str, **kwargs: Any) -> dict[str, Any]:
+        """Invoke the configured callable and enforce the adapter response contract.
+
+        ``static_kwargs`` and the per-call ``kwargs`` are merged with per-call
+        values winning, the same precedence the HTTP adapter applies. Both plain
+        functions and coroutine functions are supported.
+
+        Raises:
+            TypeError: If the callable's (awaited) result is not a dict.
+        """
+        import inspect
+
+        # Merge first so an overlapping key does not raise "multiple values".
+        call_kwargs = {**self.config.static_kwargs, **kwargs}
+        result = self._callable(question=question, **call_kwargs)
+        if inspect.isawaitable(result):
+            # Async callables return an awaitable; resolve it before validating.
+            result = await result
+        if not isinstance(result, dict):
+            raise TypeError("Python adapter callable must return a dict.")
+        return result
--- a/rag_eval/compat.py
+++ b/rag_eval/compat.py
@@ -0,0 +1,39 @@
+"""Compatibility helpers for optional third-party import paths."""
+
+from __future__ import annotations
+
+import sys
+import types
+
+
+def ensure_ragas_import_compat() -> None:
+    """Make ``langchain_community.chat_models.vertexai`` importable for ragas.
+
+    Ragas imports that chat module at import time even when only OpenAI is
+    used, while the local ``langchain_community`` build still exposes
+    ``langchain_community.llms.vertexai`` but no longer ships the chat module.
+    When the real module cannot be found, a minimal stand-in is registered in
+    ``sys.modules`` so site-packages is never modified.
+    """
+
+    target = "langchain_community.chat_models.vertexai"
+    if target not in sys.modules:
+        try:
+            import langchain_community.chat_models.vertexai  # type: ignore  # noqa: F401
+        except ModuleNotFoundError:
+            # The real module is missing: register a placeholder under its name.
+            class ChatVertexAI:  # pragma: no cover - only used for import compatibility
+                """Compatibility shim for environments that do not ship ChatVertexAI."""
+
+            placeholder = types.ModuleType(target)
+            placeholder.ChatVertexAI = ChatVertexAI
+            sys.modules[target] = placeholder
--- a/rag_eval/config/__init__.py
+++ b/rag_eval/config/__init__.py
@@ -0,0 +1,5 @@
+"""Scenario configuration loading utilities."""
+
+from .loader import load_scenario
+
+__all__ = ["load_scenario"]
--- a/rag_eval/config/loader.py
+++ b/rag_eval/config/loader.py
@@ -0,0 +1,67 @@
+"""Scenario file loading and conversion into internal runtime models."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import yaml
+
+from rag_eval.shared.models import AppAdapterConfig, DatasetConfig, RuntimeConfig, Scenario
+
+from .schema import ScenarioModel
+from .validators import validate_scenario
+
+
+def _resolve_static_kwargs_paths(base_dir: Path, raw_kwargs: dict[str, object]) -> dict[str, object]:
+    """Turn path-like static kwargs into ``Path`` objects anchored at ``base_dir``.
+
+    Only string values whose key ends in ``_path`` are converted: absolute
+    paths are kept as given, relative ones are joined to ``base_dir`` and
+    resolved. Every other entry is copied through unchanged.
+    """
+
+    def _anchor(name: str, item: object) -> object:
+        """Convert one entry, leaving non-path entries untouched."""
+        if not (name.endswith("_path") and isinstance(item, str)):
+            return item
+        location = Path(item)
+        if location.is_absolute():
+            return location
+        return (base_dir / location).resolve()
+
+    return {name: _anchor(name, item) for name, item in raw_kwargs.items()}
+
+
+def load_scenario(path: str | Path) -> Scenario:
+    """Read a scenario YAML file and build the validated runtime ``Scenario``.
+
+    The file is parsed with ``yaml.safe_load``, validated against
+    ``ScenarioModel``, converted into the runtime dataclasses with relative
+    paths resolved against the scenario file's directory, and finally passed
+    through ``validate_scenario``.
+    """
+    scenario_path = Path(path).resolve()
+    base_dir = scenario_path.parent
+    raw_text = scenario_path.read_text(encoding="utf-8")
+    # An empty YAML document parses to None; validate an empty mapping instead.
+    model = ScenarioModel.model_validate(yaml.safe_load(raw_text) or {})
+
+    adapter_model = model.app_adapter
+    if adapter_model is None:
+        app_adapter = None
+    else:
+        # Convert the validated Pydantic model into the lightweight runtime dataclass.
+        app_adapter = AppAdapterConfig(
+            type=adapter_model.type,
+            endpoint=adapter_model.endpoint,
+            method=adapter_model.method,
+            timeout_seconds=adapter_model.timeout_seconds,
+            callable=adapter_model.callable,
+            request_template=adapter_model.request_template,
+            response_mapping=adapter_model.response_mapping,
+            static_kwargs=_resolve_static_kwargs_paths(base_dir, adapter_model.static_kwargs),
+        )
+
+    runtime_model = model.runtime
+    runtime = RuntimeConfig(
+        batch_size=runtime_model.batch_size,
+        app_concurrency=runtime_model.app_concurrency,
+        metric_concurrency=runtime_model.metric_concurrency,
+        max_samples=runtime_model.max_samples,
+    )
+
+    scenario = Scenario(
+        scenario_name=model.scenario_name,
+        mode=model.mode,
+        app_adapter=app_adapter,
+        dataset=DatasetConfig(path=model.resolve_path(base_dir, model.dataset)),
+        judge_model=model.judge_model,
+        embedding_model=model.embedding_model,
+        metrics=model.metrics,
+        output_dir=model.resolve_path(base_dir, model.output_dir),
+        runtime=runtime,
+        source_path=scenario_path,
+    )
+    # Cross-field checks run last, once every relative path has been resolved.
+    validate_scenario(scenario)
+    return scenario
--- a/rag_eval/config/schema.py
+++ b/rag_eval/config/schema.py
@@ -0,0 +1,78 @@
+"""Pydantic schemas used to validate raw scenario configuration files."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+
+
+class RuntimeConfigModel(BaseModel):
+    """Schema for runtime concurrency and sampling settings.
+
+    Every field has a default, so the whole ``runtime`` section may be omitted
+    from a scenario file.
+    """
+    # Unknown keys in the runtime section are ignored instead of rejected.
+    model_config = ConfigDict(extra="ignore")
+
+    # Must be >= 1; that check lives in ``validate_scenario``, not in this schema.
+    batch_size: int = 4
+    # NOTE(review): presumably caps concurrent adapter calls; how ``None`` is
+    # interpreted is decided by the runner, which is not visible here - confirm.
+    app_concurrency: int | None = None
+    # NOTE(review): presumably caps concurrent metric evaluations - confirm.
+    metric_concurrency: int | None = None
+    # NOTE(review): presumably limits how many dataset rows are evaluated - confirm.
+    max_samples: int | None = None
+
+
+class AppAdapterConfigModel(BaseModel):
+    """Schema for the ``app_adapter`` section used by online scenarios."""
+    # Unrecognized adapter keys are ignored rather than rejected.
+    model_config = ConfigDict(extra="ignore")
+
+    type: Literal["http", "python"]
+    endpoint: str | None = None
+    method: str = "POST"
+    timeout_seconds: int = 30
+    callable: str | None = None
+    request_template: dict[str, Any] = Field(default_factory=dict)
+    response_mapping: dict[str, str] = Field(default_factory=dict)
+    static_kwargs: dict[str, Any] = Field(default_factory=dict)
+
+    @model_validator(mode="after")
+    def validate_shape(self) -> "AppAdapterConfigModel":
+        """Check that the field each adapter type depends on is present."""
+        # Adapter type -> (field that must be non-empty, error raised otherwise).
+        requirements = {
+            "http": ("endpoint", "HTTP adapter requires endpoint."),
+            "python": ("callable", "Python adapter requires callable."),
+        }
+        requirement = requirements.get(self.type)
+        if requirement is not None:
+            field_name, message = requirement
+            if not getattr(self, field_name):
+                raise ValueError(message)
+        return self
+
+
+class ScenarioModel(BaseModel):
+    """Schema describing one user-written evaluation scenario file."""
+    # Unrecognized top-level keys are ignored rather than rejected.
+    model_config = ConfigDict(extra="ignore")
+
+    scenario_name: str
+    mode: Literal["offline", "online"]
+    app_adapter: AppAdapterConfigModel | None = None
+    dataset: str
+    judge_model: str
+    embedding_model: str
+    metrics: list[str]
+    output_dir: str
+    runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
+
+    @field_validator("metrics")
+    @classmethod
+    def ensure_metrics_not_empty(cls, value: list[str]) -> list[str]:
+        """Fail validation when the scenario lists no metrics at all."""
+        if value:
+            return value
+        raise ValueError("metrics must not be empty.")
+
+    @model_validator(mode="after")
+    def validate_mode_requirements(self) -> "ScenarioModel":
+        """Require an ``app_adapter`` section whenever the mode is online."""
+        adapter_missing = self.app_adapter is None
+        if adapter_missing and self.mode == "online":
+            raise ValueError("online mode requires app_adapter.")
+        return self
+
+    def resolve_path(self, base_dir: Path, raw_path: str) -> Path:
+        """Return ``raw_path`` unchanged if absolute, else resolved under ``base_dir``."""
+        candidate = Path(raw_path)
+        return candidate if candidate.is_absolute() else (base_dir / candidate).resolve()
--- a/rag_eval/config/validators.py
+++ b/rag_eval/config/validators.py
@@ -0,0 +1,20 @@
+"""Cross-field validation helpers for resolved runtime scenarios."""
+
+from __future__ import annotations
+
+from rag_eval.metrics.registry import SUPPORTED_METRICS
+from rag_eval.shared.models import Scenario
+
+
+def validate_scenario(scenario: Scenario) -> None:
+    """Reject scenarios with unknown metrics or inconsistent mode/runtime settings.
+
+    Raises:
+        ValueError: If a requested metric is not in ``SUPPORTED_METRICS``, if an
+            offline scenario defines an app adapter, or if the batch size is
+            below one.
+    """
+    unknown_metrics = [metric for metric in scenario.metrics if metric not in SUPPORTED_METRICS]
+    if unknown_metrics:
+        listed_unknown = ", ".join(unknown_metrics)
+        listed_supported = ", ".join(sorted(SUPPORTED_METRICS))
+        raise ValueError(
+            f"Unsupported metrics: {listed_unknown}. Supported metrics: {listed_supported}"
+        )
+
+    has_adapter = scenario.app_adapter is not None
+    if has_adapter and scenario.mode == "offline":
+        raise ValueError("offline mode should not define app_adapter.")
+
+    batch_size = scenario.runtime.batch_size
+    if batch_size < 1:
+        raise ValueError("runtime.batch_size must be >= 1.")
--- a/rag_eval/dataset_builder/__init__.py
+++ b/rag_eval/dataset_builder/__init__.py
@@ -0,0 +1,5 @@
+"""Dataset build workflow for converting PDFs into reviewable online question banks."""
+
+from .runner import run_dataset_build
+
+__all__ = ["run_dataset_build"]
--- a/rag_eval/dataset_builder/generator/__init__.py
+++ b/rag_eval/dataset_builder/generator/__init__.py
@@ -0,0 +1,5 @@
+"""Question generation components for draft online datasets."""
+
+from .question_generator import OpenAIQuestionGenerator, QuestionGenerator
+
+__all__ = ["OpenAIQuestionGenerator", "QuestionGenerator"]
--- a/rag_eval/dataset_builder/generator/question_generator.py
+++ b/rag_eval/dataset_builder/generator/question_generator.py
@@ -0,0 +1,173 @@
+"""LLM-backed question generator for dataset build jobs."""
+
+from __future__ import annotations
+
+import json
+from abc import ABC, abstractmethod
+from typing import Any
+
+from openai import OpenAI
+
+from rag_eval.dataset_builder.models import DraftQuestionSample, ParsedDocument, SourceChunk
+from rag_eval.settings import EvaluationSettings
+
+
+class QuestionGenerator(ABC):
+    """Abstract interface for generating draft questions from parsed documents."""
+
+    @abstractmethod
+    def generate(
+        self,
+        document: ParsedDocument,
+        *,
+        max_questions: int,
+        max_chunks_per_question: int,
+        job_name: str,
+    ) -> list[DraftQuestionSample]:
+        """Generate draft question samples for one parsed document.
+
+        Args:
+            document: Parsed document whose source chunks back the questions.
+            max_questions: Upper bound on the number of samples to produce.
+            max_chunks_per_question: Upper bound on chunk ids cited per sample.
+            job_name: Name of the build job; the OpenAI implementation in this
+                module stores it as each sample's ``scenario``.
+
+        Returns:
+            The generated draft samples.
+        """
+        raise NotImplementedError
+
+
+class OpenAIQuestionGenerator(QuestionGenerator):
+    """Generate draft questions with an OpenAI-compatible chat completion API."""
+
+    def __init__(self, settings: EvaluationSettings, model: str, client: OpenAI | None = None):
+        """Initialize the OpenAI-compatible client and target generation model.
+
+        Args:
+            settings: Supplies ``openai_api_key`` and ``openai_client_kwargs``.
+            model: Model name sent with every chat completion request.
+            client: Optional pre-built client; when omitted one is created from
+                ``settings.openai_client_kwargs``.
+
+        Raises:
+            EnvironmentError: If ``settings.openai_api_key`` is empty.
+        """
+        if not settings.openai_api_key:
+            raise EnvironmentError("OPENAI_API_KEY must be set before generating draft questions.")
+        self.client = client or OpenAI(**settings.openai_client_kwargs)
+        self.model = model
+
+    def _build_prompt(
+        self,
+        document: ParsedDocument,
+        *,
+        max_questions: int,
+        max_chunks_per_question: int,
+    ) -> str:
+        """Build a constrained JSON-generation prompt for one document.
+
+        The prompt is a single JSON document holding the task description, the
+        generation rules, the expected output schema and the document's chunks.
+        """
+        # NOTE(review): each chunk is serialized to its own JSON string and then
+        # escaped again by the outer ``json.dumps`` - confirm this is intended.
+        chunk_lines = [
+            json.dumps(
+                {
+                    "chunk_id": chunk.chunk_id,
+                    "section_path": chunk.section_path,
+                    "page_start": chunk.page_start,
+                    "page_end": chunk.page_end,
+                    "text": chunk.text,
+                },
+                ensure_ascii=False,
+            )
+            for chunk in document.source_chunks
+        ]
+
+        instructions = {
+            "task": "Generate reviewable online evaluation draft questions from one document only.",
+            "rules": [
+                "Return JSON only.",
+                f"Generate at most {max_questions} samples.",
+                f"Each sample may cite at most {max_chunks_per_question} chunk ids.",
+                "Every sample must stay within this document and use existing chunk ids only.",
+                "Allowed question_type values: fact, summary, procedure, comparison.",
+                "Allowed difficulty values: easy, medium, hard.",
+            ],
+            "output_schema": {
+                "samples": [
+                    {
+                        "question": "string",
+                        "ground_truth": "string",
+                        "source_chunk_ids": ["chunk-id"],
+                        "question_type": "fact|summary|procedure|comparison",
+                        "difficulty": "easy|medium|hard",
+                    }
+                ]
+            },
+            "document": {
+                "doc_id": document.doc_id,
+                "doc_name": document.doc_name,
+                "chunks": chunk_lines,
+            },
+        }
+        return json.dumps(instructions, ensure_ascii=False, indent=2)
+
+    @staticmethod
+    def _as_text(value: Any) -> str:
+        """Return ``value`` as stripped text, treating ``None`` as empty."""
+        return "" if value is None else str(value).strip()
+
+    @classmethod
+    def _as_chunk_ids(cls, value: Any) -> list[str]:
+        """Normalize the model's ``source_chunk_ids`` value into clean id strings."""
+        if isinstance(value, str):
+            # A bare string is a single id, not a sequence of characters.
+            value = [value]
+        elif not isinstance(value, (list, tuple)):
+            # Null or any other shape yields no ids; validation reports that later.
+            return []
+        return [text for text in map(cls._as_text, value) if text]
+
+    def _build_sample(
+        self,
+        *,
+        document: ParsedDocument,
+        payload: dict[str, Any],
+        index: int,
+        job_name: str,
+        chunk_lookup: dict[str, SourceChunk] | None = None,
+    ) -> DraftQuestionSample:
+        """Convert one model output object into the internal draft sample model.
+
+        Null or non-string fields are coerced to text instead of raising, so a
+        malformed sample reaches validation rather than aborting the document.
+
+        Args:
+            document: Document the sample was generated from.
+            payload: One entry of the model's ``samples`` list.
+            index: 1-based position used to build the sample id.
+            job_name: Stored as the sample's ``scenario``.
+            chunk_lookup: Optional prebuilt ``chunk_id -> SourceChunk`` mapping;
+                built from ``document`` when omitted.
+
+        Returns:
+            The draft sample. Section path and page range are derived from the
+            cited chunks that exist in the document; they default to ``""`` and
+            ``0`` when none of the cited chunks is known.
+        """
+        if chunk_lookup is None:
+            chunk_lookup = {item.chunk_id: item for item in document.source_chunks}
+        source_chunk_ids = self._as_chunk_ids(payload.get("source_chunk_ids"))
+        chunks = [chunk_lookup[item] for item in source_chunk_ids if item in chunk_lookup]
+
+        section_path = chunks[0].section_path if chunks else ""
+        page_start = min((chunk.page_start for chunk in chunks), default=0)
+        page_end = max((chunk.page_end for chunk in chunks), default=0)
+        question = self._as_text(payload.get("question"))
+        # Any CJK unified ideograph marks the question as Chinese.
+        language = "zh" if any("\u4e00" <= char <= "\u9fff" for char in question) else "en"
+        return DraftQuestionSample(
+            sample_id=f"{document.doc_id}-q{index}",
+            question=question,
+            ground_truth=self._as_text(payload.get("ground_truth")),
+            scenario=job_name,
+            language=language,
+            doc_id=document.doc_id,
+            doc_name=document.doc_name,
+            section_path=section_path,
+            page_start=page_start,
+            page_end=page_end,
+            source_chunk_ids=source_chunk_ids,
+            question_type=self._as_text(payload.get("question_type")) or "fact",
+            difficulty=self._as_text(payload.get("difficulty")) or "medium",
+        )
+
+    @staticmethod
+    def _parse_response_payload(content: str) -> list[dict[str, Any]]:
+        """Parse the model response into a list of sample payload dictionaries.
+
+        Entries of ``samples`` that are not JSON objects are dropped.
+
+        Raises:
+            ValueError: If the content is not valid JSON, is not a JSON object,
+                or its ``samples`` field is not a list.
+        """
+        try:
+            payload = json.loads(content or "{}")
+        except json.JSONDecodeError as exc:
+            raise ValueError("Question generator returned invalid JSON.") from exc
+
+        if not isinstance(payload, dict):
+            raise ValueError("Question generator response must be a JSON object.")
+        samples = payload.get("samples") or []
+        if not isinstance(samples, list):
+            raise ValueError("Question generator response field 'samples' must be a list.")
+        return [item for item in samples if isinstance(item, dict)]
+
+    def generate(
+        self,
+        document: ParsedDocument,
+        *,
+        max_questions: int,
+        max_chunks_per_question: int,
+        job_name: str,
+    ) -> list[DraftQuestionSample]:
+        """Generate draft questions for one parsed document.
+
+        Sends one chat completion request in JSON mode and converts at most
+        ``max_questions`` returned entries into draft samples.
+        """
+        prompt = self._build_prompt(
+            document,
+            max_questions=max_questions,
+            max_chunks_per_question=max_chunks_per_question,
+        )
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You generate structured draft question banks from source documents."},
+                {"role": "user", "content": prompt},
+            ],
+            response_format={"type": "json_object"},
+        )
+        content = response.choices[0].message.content or "{}"
+        payload = self._parse_response_payload(content)
+        # Build the chunk index once instead of once per generated sample.
+        chunk_lookup = {item.chunk_id: item for item in document.source_chunks}
+        return [
+            self._build_sample(
+                document=document,
+                payload=item,
+                index=index,
+                job_name=job_name,
+                chunk_lookup=chunk_lookup,
+            )
+            for index, item in enumerate(payload[:max_questions], start=1)
+        ]
--- a/rag_eval/dataset_builder/generator/validators.py
+++ b/rag_eval/dataset_builder/generator/validators.py
@@ -0,0 +1,87 @@
+"""Validation and deduplication helpers for generated draft question samples."""
+
+from __future__ import annotations
+
+import re
+from difflib import SequenceMatcher
+
+from rag_eval.dataset_builder.models import DraftQuestionSample, ParsedDocument
+
+
+# Closed vocabularies that ``validate_draft_sample`` accepts for generated samples.
+ALLOWED_QUESTION_TYPES = {"fact", "summary", "procedure", "comparison"}
+ALLOWED_DIFFICULTIES = {"easy", "medium", "hard"}
+
+
+def validate_draft_sample(
+    sample: DraftQuestionSample,
+    *,
+    document: ParsedDocument,
+    max_source_chunks_per_question: int | None = None,
+) -> list[str]:
+    """Collect every problem found in one generated sample.
+
+    Checks that the text fields and chunk list are non-empty, that the chunk
+    count respects the optional limit, that every cited chunk exists in
+    ``document``, that the document ids agree, and that the enum-like fields
+    hold allowed values.
+
+    Returns:
+        Human-readable error messages; an empty list means the sample is valid.
+    """
+    problems: list[str] = []
+    cited_ids = sample.source_chunk_ids
+    limit = max_source_chunks_per_question
+
+    if not sample.question.strip():
+        problems.append("question is empty")
+    if not sample.ground_truth.strip():
+        problems.append("ground_truth is empty")
+    if not cited_ids:
+        problems.append("source_chunk_ids is empty")
+    if limit is not None and len(cited_ids) > limit:
+        problems.append(f"source_chunk_ids exceeds limit: {len(cited_ids)} > {limit}")
+
+    known_ids = {chunk.chunk_id for chunk in document.source_chunks}
+    problems.extend(
+        f"unknown source chunk: {chunk_id}" for chunk_id in cited_ids if chunk_id not in known_ids
+    )
+
+    if document.doc_id != sample.doc_id:
+        problems.append("sample doc_id does not match source document")
+    if sample.question_type not in ALLOWED_QUESTION_TYPES:
+        problems.append(f"unsupported question_type: {sample.question_type}")
+    if sample.difficulty not in ALLOWED_DIFFICULTIES:
+        problems.append(f"unsupported difficulty: {sample.difficulty}")
+    return problems
+
+
+def normalize_question_text(text: str) -> str:
+    """Return ``text`` lower-cased, trimmed, and with whitespace runs collapsed."""
+    single_spaced = re.compile(r"\s+").sub(" ", text)
+    trimmed = single_spaced.strip()
+    return trimmed.lower()
+
+
+def dedupe_samples(samples: list[DraftQuestionSample]) -> list[DraftQuestionSample]:
+    """Drop duplicate questions and enforce one output per chunk group per document.
+
+    Samples are visited in order and the first occurrence wins. A sample is
+    dropped when, within the same document, an already kept sample has the
+    same normalized question text or cites exactly the same ordered chunk ids.
+
+    The chunk-group rule already removes every sample that shares a chunk
+    group with a kept one, so no additional answer-similarity comparison is
+    needed: it could only ever compare samples that were skipped beforehand.
+
+    Args:
+        samples: Draft samples, possibly from several documents.
+
+    Returns:
+        The kept samples in their original relative order.
+    """
+    deduped: list[DraftQuestionSample] = []
+    seen_questions: set[tuple[str, str]] = set()
+    seen_chunk_groups: set[tuple[str, tuple[str, ...]]] = set()
+
+    for sample in samples:
+        question_key = (sample.doc_id, normalize_question_text(sample.question))
+        if question_key in seen_questions:
+            continue
+
+        # Chunk order is part of the key: the same ids in another order are a different group.
+        chunk_group_key = (sample.doc_id, tuple(sample.source_chunk_ids))
+        if chunk_group_key in seen_chunk_groups:
+            continue
+
+        seen_questions.add(question_key)
+        seen_chunk_groups.add(chunk_group_key)
+        deduped.append(sample)
+    return deduped
--- a/rag_eval/dataset_builder/models.py
+++ b/rag_eval/dataset_builder/models.py
@@ -0,0 +1,203 @@
+"""Internal data models for the PDF-to-dataset build workflow."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Literal
+
+
+# Closed string vocabularies used as field types by the dataclasses in this module.
+ReviewStatus = Literal["draft", "approved", "rejected", "needs_edit"]
+QuestionType = Literal["fact", "summary", "procedure", "comparison"]
+Difficulty = Literal["easy", "medium", "hard"]
+FailureMode = Literal["fail", "skip"]
+
+
+@dataclass(slots=True)
+class DatasetBuildRuntime:
+    """Runtime controls for one dataset build job."""
+
+    # NOTE(review): presumably caps how many input documents are processed, with
+    # ``None`` meaning no cap; the consumer is not visible here - confirm.
+    max_documents: int | None = None
+
+
+@dataclass(slots=True)
+class DatasetBuildJob:
+    """Fully resolved settings for one dataset build, as used by the build runner."""
+
+    job_name: str
+    input_path: Path
+    input_glob: str
+    parser_provider: str
+    failure_mode: FailureMode
+    generation_model: str
+    output_type: str
+    review_mode: str
+    max_questions_per_document: int
+    max_source_chunks_per_question: int
+    dataset_path: Path
+    artifact_dir: Path
+    runtime: DatasetBuildRuntime = field(default_factory=DatasetBuildRuntime)
+    source_path: Path | None = None
+
+    def snapshot(self) -> dict[str, Any]:
+        """Return the job as a dict whose path fields are POSIX-style strings."""
+        payload = asdict(self)
+        # ``asdict`` keeps ``Path`` objects as they are; replace them with strings.
+        for name in ("input_path", "dataset_path", "artifact_dir"):
+            payload[name] = getattr(self, name).as_posix()
+        origin = self.source_path
+        if origin is not None:
+            payload["source_path"] = origin.as_posix()
+        return payload
+
+
+@dataclass(slots=True)
+class StructureNode:
+    """One normalized structure heading extracted from layout results."""
+
+    node_id: str
+    # NOTE(review): presumably the heading depth in the outline - confirm.
+    level: int
+    title: str
+    # Page range of the heading; NOTE(review): numbering base is not visible here.
+    page_start: int
+    page_end: int
+    section_path: str
+
+
+@dataclass(slots=True)
+class SemanticBlock:
+    """One merged semantic block used as an intermediate artifact before chunking."""
+
+    block_id: str
+    doc_id: str
+    doc_name: str
+    text: str
+    page_start: int
+    page_end: int
+    section_path: str
+    section_title: str
+    # NOTE(review): presumably ids of the layout elements merged into this block.
+    source_layout_ids: list[str]
+
+    def to_record(self) -> dict[str, Any]:
+        """Convert the block into a flat artifact record.
+
+        Returns:
+            A dict with one key per field, produced by ``dataclasses.asdict``.
+        """
+        return asdict(self)
+
+
+@dataclass(slots=True)
+class SourceChunk:
+    """Evidence chunk used for question generation and human review."""
+
+    # Id that generated samples cite in their ``source_chunk_ids``.
+    chunk_id: str
+    doc_id: str
+    doc_name: str
+    text: str
+    page_start: int
+    page_end: int
+    section_path: str
+    section_title: str
+    # NOTE(review): presumably ids of the layout elements this chunk came from.
+    source_layout_ids: list[str]
+
+    def to_record(self) -> dict[str, Any]:
+        """Convert the chunk into a flat artifact record.
+
+        Returns:
+            A dict with one key per field, produced by ``dataclasses.asdict``.
+        """
+        return asdict(self)
+
+
+@dataclass(slots=True)
+class ParsedDocument:
+    """A parsed document in normalized form, ready for question generation."""
+
+    doc_id: str
+    doc_name: str
+    raw_text: str
+    structure_nodes: list[StructureNode]
+    semantic_blocks: list[SemanticBlock]
+    source_chunks: list[SourceChunk]
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    def to_record(self) -> dict[str, Any]:
+        """Summarize the document as an artifact record.
+
+        Structure nodes are expanded to dicts, while semantic blocks and source
+        chunks are reported only as counts.
+        """
+        node_records = list(map(asdict, self.structure_nodes))
+        record: dict[str, Any] = {
+            "doc_id": self.doc_id,
+            "doc_name": self.doc_name,
+            "raw_text": self.raw_text,
+        }
+        record["structure_nodes"] = node_records
+        record["metadata"] = self.metadata
+        record["semantic_block_count"] = len(self.semantic_blocks)
+        record["source_chunk_count"] = len(self.source_chunks)
+        return record
+
+
+@dataclass(slots=True)
+class DraftQuestionSample:
+    """A generated online evaluation sample that still awaits manual review."""
+
+    sample_id: str
+    question: str
+    ground_truth: str
+    scenario: str
+    language: str
+    doc_id: str
+    doc_name: str
+    section_path: str
+    page_start: int
+    page_end: int
+    source_chunk_ids: list[str]
+    question_type: QuestionType
+    difficulty: Difficulty
+    review_status: ReviewStatus = "draft"
+    review_notes: str = ""
+
+    def to_record(self) -> dict[str, Any]:
+        """Return the sample as a flat row keyed by column name.
+
+        Values are passed through as stored, so ``source_chunk_ids`` stays the
+        same list object rather than a copy.
+        """
+        columns = (
+            "sample_id",
+            "question",
+            "ground_truth",
+            "scenario",
+            "language",
+            "doc_id",
+            "doc_name",
+            "section_path",
+            "page_start",
+            "page_end",
+            "source_chunk_ids",
+            "question_type",
+            "difficulty",
+            "review_status",
+            "review_notes",
+        )
+        return {column: getattr(self, column) for column in columns}
+
+
+@dataclass(slots=True)
+class ParseFailure:
+    """One document parse failure recorded for reporting and skip-mode execution."""
+
+    # Path of the input file that failed, stored as text.
+    file_path: str
+    # Error description, stored as text.
+    error: str
+
+    def to_record(self) -> dict[str, str]:
+        """Convert the failure into a flat CSV row.
+
+        Returns:
+            A dict with the keys ``file_path`` and ``error``.
+        """
+        return asdict(self)
+
+
+@dataclass(slots=True)
+class DatasetBuildArtifactPaths:
+    """Canonical file paths produced by one dataset build run."""
+
+    # NOTE(review): presumably the directory containing the files below - confirm.
+    root_dir: Path
+    # One path per artifact; the field name states the content and file format.
+    documents_jsonl: Path
+    semantic_blocks_jsonl: Path
+    source_chunks_jsonl: Path
+    dataset_draft_csv: Path
+    parse_failures_csv: Path
+    metadata_json: Path
+
+
+@dataclass(slots=True)
+class DatasetBuildResult:
+    """Aggregate result object returned after a dataset build completes."""
+
+    # The job configuration the build ran with.
+    job: DatasetBuildJob
+    # NOTE(review): presumably a unique identifier for this run; how it is
+    # generated is not visible here - confirm.
+    run_id: str
+    artifact_paths: DatasetBuildArtifactPaths
+    documents: list[ParsedDocument]
+    draft_samples: list[DraftQuestionSample]
+    parse_failures: list[ParseFailure]
--- a/rag_eval/dataset_builder/offline_converter.py
+++ b/rag_eval/dataset_builder/offline_converter.py
@@ -0,0 +1,78 @@
+"""Utilities for converting draft online datasets into offline smoke-test datasets."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+
+from rag_eval.shared.utils import ensure_directory
+
+
+def _load_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Read a UTF-8 JSON Lines file and return its decoded rows in file order.
+
+    Lines that are empty or whitespace-only are ignored; every other line is
+    decoded with ``json.loads``.
+    """
+    with path.open("r", encoding="utf-8") as stream:
+        stripped_lines = (raw_line.strip() for raw_line in stream)
+        return [json.loads(entry) for entry in stripped_lines if entry]
+
+
+def build_offline_smoke_dataset(
+    *,
+    draft_dataset_path: Path,
+    source_chunks_path: Path,
+    output_path: Path,
+) -> Path:
+    """Derive an offline-evaluable dataset from a draft dataset and its source chunks.
+
+    Every draft row is copied to the output with two synthesized columns:
+    ``answer`` (a copy of the row's ground truth) and ``contexts`` (a JSON list
+    holding the non-empty texts of the chunks named in ``source_chunk_ids``).
+
+    Args:
+        draft_dataset_path: CSV file containing the draft samples.
+        source_chunks_path: JSONL file whose rows carry ``chunk_id`` and ``text``.
+        output_path: Destination CSV path; its parent directory is created first.
+
+    Returns:
+        ``output_path``, after the CSV has been written.
+
+    Raises:
+        json.JSONDecodeError: If a ``source_chunk_ids`` cell is a string that is
+            not valid JSON.
+        KeyError: If a chunk row lacks ``chunk_id`` or a referenced chunk lacks
+            ``text``.
+    """
+    draft_frame = pd.read_csv(draft_dataset_path)
+    chunk_lookup = {str(row["chunk_id"]): row for row in _load_jsonl(source_chunks_path)}
+
+    output_rows: list[dict[str, Any]] = []
+    for _, row in draft_frame.iterrows():
+        chunk_ids = row.get("source_chunk_ids")
+        parsed_chunk_ids = json.loads(chunk_ids) if isinstance(chunk_ids, str) else chunk_ids
+        if not isinstance(parsed_chunk_ids, list):
+            # Empty cells are read as NaN and a JSON scalar is not a list of ids;
+            # in both cases there is nothing to look up.
+            parsed_chunk_ids = []
+
+        contexts: list[str] = []
+        for chunk_id in parsed_chunk_ids:
+            # The lookup keys are stringified, so stringify the reference as well.
+            chunk = chunk_lookup.get(str(chunk_id))
+            if chunk is None:
+                continue
+            chunk_text = str(chunk["text"]).strip()
+            if chunk_text:
+                contexts.append(chunk_text)
+
+        # An empty CSV cell is read as NaN; str(NaN) would leak the literal
+        # text "nan" into both the answer and the ground truth.
+        raw_ground_truth = row.get("ground_truth")
+        ground_truth = "" if pd.isna(raw_ground_truth) else str(raw_ground_truth).strip()
+        output_rows.append(
+            {
+                "sample_id": row.get("sample_id", ""),
+                "question": row.get("question", ""),
+                "contexts": json.dumps(contexts, ensure_ascii=False),
+                "answer": ground_truth,
+                "ground_truth": ground_truth,
+                "scenario": row.get("scenario", ""),
+                "language": row.get("language", ""),
+                "retrieval_config": "offline-smoke-from-pdf-build",
+                "doc_id": row.get("doc_id", ""),
+                "doc_name": row.get("doc_name", ""),
+                "section_path": row.get("section_path", ""),
+                "page_start": row.get("page_start", ""),
+                "page_end": row.get("page_end", ""),
+                "source_chunk_ids": row.get("source_chunk_ids", ""),
+                "question_type": row.get("question_type", ""),
+                "difficulty": row.get("difficulty", ""),
+                "review_status": row.get("review_status", ""),
+                "review_notes": row.get("review_notes", ""),
+            }
+        )
+
+    ensure_directory(output_path.parent)
+    pd.DataFrame(output_rows).to_csv(output_path, index=False)
+    return output_path
--- a/rag_eval/dataset_builder/parser/init.py
+++ b/rag_eval/dataset_builder/parser/init.py
@@ -0,0 +1,7 @@
+"""Parser integrations and layout normalization helpers for dataset build jobs."""
+
+from .aliyun_document_parser import AliyunDocumentParser
+from .aliyun_docmind_gateway import AliyunDocmindGateway
+from .aliyun_layout_normalizer import normalize_layouts
+
+__all__ = ["AliyunDocumentParser", "AliyunDocmindGateway", "normalize_layouts"]
--- a/rag_eval/dataset_builder/parser/aliyun_docmind_gateway.py
+++ b/rag_eval/dataset_builder/parser/aliyun_docmind_gateway.py
@@ -0,0 +1,202 @@
+"""Gateway abstraction for Alibaba Cloud document parsing workflows."""
+
+from __future__ import annotations
+
+import time
+from pathlib import Path
+from typing import Any
+
+try:
+    from alibabacloud_docmind_api20220711 import models as docmind_models
+    from alibabacloud_docmind_api20220711.client import Client as DocmindClient
+    from alibabacloud_tea_openapi import models as openapi_models
+    from alibabacloud_tea_util import models as runtime_models
+except ImportError:
+    # Keep Alibaba SDK optional so offline flows and tests can import this module.
+    DocmindClient = None
+    docmind_models = None
+    openapi_models = None
+    runtime_models = None
+
+try:
+    from alibabacloud_credentials.client import Client as CredentialClient
+except ImportError:
+    CredentialClient = None
+
+from rag_eval.settings import EvaluationSettings
+
+
+class AliyunDocmindGateway:
+    """Thin gateway around the external Alibaba Cloud Docmind parser service.
+
+    The SDK client is created on first use and cached on the instance, so
+    constructing a gateway never requires the optional Alibaba SDK packages.
+    """
+
+    # Lower-cased task states that end polling successfully.
+    _SUCCESS_STATES = frozenset({"succeeded", "success", "finished"})
+    # Lower-cased task states that end polling with an error. "fail" is listed
+    # because the status API appears to report a failed task as "Fail"
+    # (NOTE(review): confirm against the QueryDocParserStatus reference);
+    # without it a failed task would be polled until the timeout expires.
+    _FAILURE_STATES = frozenset({"failed", "fail", "error"})
+
+    def __init__(self, settings: EvaluationSettings):
+        """Store the settings and start without an SDK client.
+
+        Args:
+            settings: Evaluation settings supplying the credentials, endpoint,
+                and parser tuning values read by the other methods.
+        """
+        self.settings = settings
+        self._client = None
+        self._models = None
+        self._runtime_models = None
+
+    def _load_sdk(self) -> tuple[Any, Any, Any, Any]:
+        """Return the optional SDK entry points, failing if any is missing.
+
+        Returns:
+            ``(client class, openapi models, docmind models, runtime models)``.
+
+        Raises:
+            ImportError: If one of the module-level SDK imports did not succeed.
+        """
+        sdk_parts = (DocmindClient, openapi_models, docmind_models, runtime_models)
+        if any(part is None for part in sdk_parts):
+            raise ImportError(
+                "Alibaba Cloud Docmind SDK is not installed. "
+                "Install alibabacloud-docmind-api20220711, "
+                "alibabacloud-tea-openapi, alibabacloud-tea-util, and "
+                "alibabacloud-credentials."
+            )
+        return sdk_parts
+
+    def _resolve_credentials(self) -> tuple[str, str]:
+        """Resolve the AccessKey pair from settings or the credentials client.
+
+        Explicit settings win; otherwise the Alibaba credentials client is asked.
+
+        Raises:
+            ImportError: If no explicit key pair is configured and the
+                credentials SDK is not installed.
+        """
+        key_id = self.settings.alibaba_access_key_id
+        key_secret = self.settings.alibaba_access_key_secret
+        if key_id and key_secret:
+            return key_id, key_secret
+
+        if CredentialClient is None:
+            raise ImportError(
+                "Alibaba Cloud credentials SDK is not installed and no explicit "
+                "ALIBABA_ACCESS_KEY_ID / ALIBABA_ACCESS_KEY_SECRET were provided."
+            )
+
+        credential = CredentialClient().get_credential()
+        return credential.get_access_key_id(), credential.get_access_key_secret()
+
+    def _init_client(self) -> Any:
+        """Create the SDK client on first call and return the cached instance after."""
+        if self._client is not None:
+            return self._client
+
+        client_class, openapi, docmind, runtime = self._load_sdk()
+        access_key_id, access_key_secret = self._resolve_credentials()
+        endpoint = (self.settings.alibaba_endpoint or "docmind-api.cn-hangzhou.aliyuncs.com").strip()
+        config = openapi.Config(
+            access_key_id=access_key_id,
+            access_key_secret=access_key_secret,
+        )
+        config.endpoint = endpoint
+        # The region is fixed here even when a custom endpoint is configured.
+        config.region_id = "cn-hangzhou"
+        config.type = "access_key"
+
+        self._client = client_class(config)
+        self._models = docmind
+        self._runtime_models = runtime
+        return self._client
+
+    @staticmethod
+    def _to_plain_dict(value: Any) -> dict[str, Any]:
+        """Convert an SDK response object into an ordinary dictionary.
+
+        ``None`` becomes ``{}``; a dict is returned as-is (same object); objects
+        with ``to_map`` use it; other objects contribute their public instance
+        attributes; anything else becomes ``{}``.
+        """
+        if value is None:
+            return {}
+        if isinstance(value, dict):
+            return value
+        if hasattr(value, "to_map"):
+            return value.to_map()
+        if hasattr(value, "__dict__"):
+            return {
+                key: getattr(value, key)
+                for key in vars(value)
+                if not key.startswith("_")
+            }
+        return {}
+
+    @staticmethod
+    def _extract_layouts(payload: Any) -> list[dict[str, Any]]:
+        """Return the layout items of *payload* as plain dictionaries.
+
+        Both ``layouts`` and ``Layouts`` spellings are accepted, as dictionary
+        keys or as attributes. A missing payload or collection yields ``[]``.
+        """
+        if payload is None:
+            return []
+        if isinstance(payload, dict):
+            layouts = payload.get("layouts") or payload.get("Layouts") or []
+        else:
+            layouts = getattr(payload, "layouts", None) or getattr(payload, "Layouts", None) or []
+        return [AliyunDocmindGateway._to_plain_dict(item) for item in layouts]
+
+    def submit_parse_task(self, pdf_path: Path) -> str:
+        """Submit one PDF parse task and return the remote task identifier.
+
+        Raises:
+            RuntimeError: If the service response carries no task id.
+        """
+        client = self._init_client()
+        runtime = self._runtime_models.RuntimeOptions()
+        # The request is sent while the file is still open because the SDK
+        # request object holds the file handle itself.
+        with pdf_path.open("rb") as handle:
+            request = self._models.SubmitDocParserJobAdvanceRequest(
+                file_url_object=handle,
+                file_name=pdf_path.name,
+                file_name_extension=pdf_path.suffix.lstrip(".").lower() or "pdf",
+                llm_enhancement=self.settings.aliyun_llm_enhancement,
+                enhancement_mode=self.settings.aliyun_enhancement_mode,
+            )
+            response = client.submit_doc_parser_job_advance(request, runtime)
+
+        payload = self._to_plain_dict(getattr(getattr(response, "body", None), "data", None))
+        task_id = payload.get("id") or payload.get("Id")
+        if not task_id:
+            raise RuntimeError(f"Aliyun submit_doc_parser_job_advance returned no task id for {pdf_path.name}")
+        return str(task_id)
+
+    def get_task_status(self, task_id: str) -> dict[str, Any]:
+        """Fetch the current status payload of a parse task.
+
+        The returned dictionary always exposes the state under the lower-case
+        ``status`` key when the service reported one under either spelling.
+        """
+        client = self._init_client()
+        request = self._models.QueryDocParserStatusRequest(id=task_id)
+        response = client.query_doc_parser_status(request)
+        payload = self._to_plain_dict(getattr(getattr(response, "body", None), "data", None))
+        status = payload.get("status") or payload.get("Status")
+        if status is not None and "status" not in payload:
+            payload["status"] = status
+        return payload
+
+    def fetch_layouts(self, task_id: str) -> list[dict[str, Any]]:
+        """Fetch every layout item of a completed parse task, page by page.
+
+        Paging stops at the first empty page or the first page shorter than
+        the requested step size.
+        """
+        client = self._init_client()
+        layout_num = 0
+        # Clamp the configured page size into the 1..3000 range.
+        layout_step_size = min(max(1, self.settings.aliyun_parse_layout_step_size), 3000)
+        collected: list[dict[str, Any]] = []
+
+        while True:
+            request = self._models.GetDocParserResultRequest(
+                id=task_id,
+                layout_step_size=layout_step_size,
+                layout_num=layout_num,
+            )
+            response = client.get_doc_parser_result(request)
+            payload = getattr(getattr(response, "body", None), "data", None)
+            layouts = self._extract_layouts(payload)
+            if not layouts:
+                break
+            collected.extend(layouts)
+            layout_num += len(layouts)
+            if len(layouts) < layout_step_size:
+                break
+        return collected
+
+    def parse_document(self, pdf_path: Path) -> dict[str, Any]:
+        """Run the submit/poll/fetch cycle and return a raw parse payload.
+
+        Returns:
+            A dictionary with ``task_id``, ``status``, ``doc_id``, ``doc_name``,
+            ``layouts``, and ``metadata`` (the final status payload).
+
+        Raises:
+            RuntimeError: If the task reaches a failure state.
+            TimeoutError: If the task is still unfinished after the configured
+                timeout.
+        """
+        task_id = self.submit_parse_task(pdf_path)
+        started_at = time.monotonic()
+        # Both values are floored at one second.
+        poll_interval = max(1, self.settings.aliyun_parse_poll_interval_seconds)
+        timeout_seconds = max(1, self.settings.aliyun_parse_timeout_seconds)
+
+        while True:
+            status = self.get_task_status(task_id)
+            state = str(status.get("status", "")).lower()
+            if state in self._SUCCESS_STATES:
+                layouts = self.fetch_layouts(task_id)
+                return {
+                    "task_id": task_id,
+                    "status": state,
+                    "doc_id": status.get("doc_id") or pdf_path.stem,
+                    "doc_name": status.get("doc_name") or pdf_path.name,
+                    "layouts": layouts,
+                    "metadata": status,
+                }
+            if state in self._FAILURE_STATES:
+                raise RuntimeError(f"Aliyun parse task failed for {pdf_path.name}: {status}")
+            if time.monotonic() - started_at > timeout_seconds:
+                raise TimeoutError(f"Aliyun parse task timed out for {pdf_path.name}")
+            time.sleep(poll_interval)
--- a/rag_eval/dataset_builder/parser/aliyun_document_parser.py
+++ b/rag_eval/dataset_builder/parser/aliyun_document_parser.py
@@ -0,0 +1,38 @@
+"""Document parser that normalizes Alibaba layout results into internal models."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from rag_eval.dataset_builder.models import ParsedDocument
+
+from .aliyun_docmind_gateway import AliyunDocmindGateway
+from .aliyun_layout_normalizer import normalize_layouts
+
+
+class AliyunDocumentParser:
+    """Turn PDFs into normalized parsed-document models via the Alibaba gateway."""
+
+    def __init__(self, gateway: AliyunDocmindGateway):
+        """Keep a reference to the gateway that performs the remote parse."""
+        self.gateway = gateway
+
+    def parse(self, pdf_path: Path) -> ParsedDocument:
+        """Parse one PDF and return its normalized document model.
+
+        The document id and name come from the gateway payload when present
+        and fall back to the file stem and file name otherwise.
+
+        Raises:
+            ValueError: If the gateway payload contains no layouts.
+        """
+        payload = self.gateway.parse_document(pdf_path)
+        layouts = payload.get("layouts")
+        if not layouts:
+            raise ValueError(f"No layouts returned for document: {pdf_path.name}")
+
+        resolved_doc_id = str(payload.get("doc_id") or pdf_path.stem)
+        resolved_doc_name = str(payload.get("doc_name") or pdf_path.name)
+        document = normalize_layouts(
+            doc_id=resolved_doc_id,
+            doc_name=resolved_doc_name,
+            layouts=list(layouts),
+        )
+        # Record where the document came from alongside the normalizer's counts.
+        document.metadata["task_id"] = payload.get("task_id")
+        document.metadata["provider"] = "aliyun_docmind"
+        return document
--- a/rag_eval/dataset_builder/parser/aliyun_layout_normalizer.py
+++ b/rag_eval/dataset_builder/parser/aliyun_layout_normalizer.py
@@ -0,0 +1,181 @@
+"""Normalization helpers that convert raw layout results into source chunks."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from rag_eval.dataset_builder.models import ParsedDocument, SemanticBlock, SourceChunk, StructureNode
+
+
+def _clean_text(value: Any) -> str:
+    """Return *value* as text with whitespace runs collapsed and ends trimmed.
+
+    ``None`` yields an empty string; any other value is converted with ``str``
+    before every run of whitespace is replaced by a single space.
+    """
+    raw_text = "" if value is None else str(value)
+    collapsed = re.sub(r"\s+", " ", raw_text)
+    return collapsed.strip()
+
+
+def _is_catalog_entry(item_type: str, text: str) -> bool:
+    """Detect table-of-contents style entries that should be skipped.
+
+    An item counts as a catalog entry when its type is ``toc``, when its text
+    is the title "目录" (whitespace ignored), or when its text starts with
+    "table of contents" (case-insensitive).
+
+    Args:
+        item_type: Lower-cased layout item type.
+        text: Cleaned text of the layout item.
+    """
+    if item_type == "toc":
+        return True
+    # Match the title as a whole instead of searching for it as a substring:
+    # "目录" also means "directory", so a substring test would discard ordinary
+    # body paragraphs that merely mention it.
+    if "".join(text.split()) == "目录":
+        return True
+    return text.lower().startswith("table of contents")
+
+
+def _flatten_table(item: dict[str, Any]) -> str:
+    """Convert a table layout node into a searchable plain-text representation.
+
+    Each row under ``item["rows"]`` becomes one line whose non-empty cells are
+    joined with ``" | "``. Missing (``None``) rows and cells are skipped, as
+    are cells that are blank after trimming; rows left without cells are
+    omitted.
+
+    Returns:
+        The row lines joined with newlines, or ``""`` when nothing remains.
+    """
+    flattened_rows: list[str] = []
+    for row in item.get("rows") or []:
+        cells: list[str] = []
+        for cell in row or []:
+            # A missing cell must not be rendered as the literal text "None".
+            if cell is None:
+                continue
+            cell_text = str(cell).strip()
+            if cell_text:
+                cells.append(cell_text)
+        if cells:
+            flattened_rows.append(" | ".join(cells))
+    return "\n".join(flattened_rows)
+
+
+def _split_text(text: str, max_chars: int = 1200, overlap: int = 150) -> list[str]:
+    """Split long text into overlapping windows so each chunk stays reviewable.
+
+    Text no longer than ``max_chars`` is returned unchanged as a single window.
+    Otherwise consecutive windows of up to ``max_chars`` characters are taken,
+    each starting ``max_chars - overlap`` characters after the previous one
+    (at least one character, so the scan always advances). Windows are trimmed
+    and blank ones are dropped.
+
+    Args:
+        text: The text to split.
+        max_chars: Maximum window length; must be positive.
+        overlap: Characters shared by neighbouring windows; negative values
+            are treated as zero so no text is skipped between windows.
+
+    Raises:
+        ValueError: If ``max_chars`` is not positive.
+    """
+    if max_chars <= 0:
+        # A non-positive size would otherwise yield no windows and lose the text.
+        raise ValueError("max_chars must be a positive integer.")
+    if len(text) <= max_chars:
+        return [text]
+
+    step = max(1, max_chars - max(0, overlap))
+    windows: list[str] = []
+    for start in range(0, len(text), step):
+        end = start + max_chars
+        window = text[start:end].strip()
+        if window:
+            windows.append(window)
+        if end >= len(text):
+            # The tail is covered; later starts would only repeat it.
+            break
+    return windows
+
+
+def normalize_layouts(
+    *,
+    doc_id: str,
+    doc_name: str,
+    layouts: list[dict[str, Any]],
+    max_chunk_chars: int = 1200,
+    overlap_chars: int = 150,
+) -> ParsedDocument:
+    """Convert raw layouts into structure nodes, semantic blocks, and source chunks.
+
+    Layout items are processed in order. A heading closes the block in
+    progress, updates the section stack, and adds a structure node. Every
+    other non-empty, non-catalog item is appended to the block in progress.
+    Each finished block is split into overlapping source chunks that are
+    prefixed with the current section title unless they already start with it.
+
+    Args:
+        doc_id: Identifier used for the document and as the prefix of all
+            generated node, block, and chunk ids.
+        doc_name: Document name copied onto blocks and chunks.
+        layouts: Layout items; the keys read are ``type`` (default
+            ``"paragraph"``), ``page`` (default 1), ``level`` (default 1),
+            ``layout_id`` (default ``layout-<position>``), ``text``, and, for
+            tables, ``rows``.
+        max_chunk_chars: Maximum characters per chunk before the title prefix.
+        overlap_chars: Characters shared by neighbouring chunks of one block.
+
+    Returns:
+        The parsed document with its raw text (chunk texts joined by newlines)
+        and a metadata dictionary of element counts.
+
+    Raises:
+        ValueError: If a non-null ``page`` or ``level`` is not integer-like.
+    """
+    structure_nodes: list[StructureNode] = []
+    semantic_blocks: list[SemanticBlock] = []
+    source_chunks: list[SourceChunk] = []
+    section_stack: list[tuple[int, str]] = []
+
+    current_block_text: list[str] = []
+    current_block_layout_ids: list[str] = []
+    current_page_start: int | None = None
+    current_page_end: int | None = None
+    current_section_path = ""
+    current_section_title = ""
+
+    def int_or_default(value: Any, default: int) -> int:
+        """Convert *value* with ``int``, treating an explicit ``None`` as *default*."""
+        return default if value is None else int(value)
+
+    def flush_block() -> None:
+        """Finalize the in-progress semantic block, emit its chunks, and reset."""
+        nonlocal current_block_text, current_block_layout_ids, current_page_start, current_page_end
+
+        text = _clean_text(" ".join(current_block_text))
+        if text and current_page_start is not None and current_page_end is not None:
+            block_id = f"{doc_id}-block-{len(semantic_blocks) + 1}"
+            semantic_blocks.append(
+                SemanticBlock(
+                    block_id=block_id,
+                    doc_id=doc_id,
+                    doc_name=doc_name,
+                    text=text,
+                    page_start=current_page_start,
+                    page_end=current_page_end,
+                    section_path=current_section_path,
+                    section_title=current_section_title,
+                    source_layout_ids=list(current_block_layout_ids),
+                )
+            )
+
+            heading_prefix = current_section_title.strip()
+            chunk_parts = _split_text(text, max_chars=max_chunk_chars, overlap=overlap_chars)
+            for index, part in enumerate(chunk_parts, start=1):
+                # Prefix the section title so each chunk carries its context,
+                # unless the chunk already begins with that title.
+                needs_prefix = bool(heading_prefix) and not part.startswith(heading_prefix)
+                chunk_text = f"{heading_prefix}\n{part}".strip() if needs_prefix else part
+                source_chunks.append(
+                    SourceChunk(
+                        chunk_id=f"{block_id}-chunk-{index}",
+                        doc_id=doc_id,
+                        doc_name=doc_name,
+                        text=chunk_text,
+                        page_start=current_page_start,
+                        page_end=current_page_end,
+                        section_path=current_section_path,
+                        section_title=current_section_title,
+                        source_layout_ids=list(current_block_layout_ids),
+                    )
+                )
+
+        # Always start the next block from a clean slate, emitted or not.
+        current_block_text = []
+        current_block_layout_ids = []
+        current_page_start = None
+        current_page_end = None
+
+    for index, item in enumerate(layouts, start=1):
+        item_type = str(item.get("type", "paragraph")).lower()
+        # A key that is present with a null value falls back to the same
+        # default as a missing key instead of raising inside int().
+        page = int_or_default(item.get("page"), 1)
+        layout_id = str(item.get("layout_id") or f"layout-{index}")
+        level = int_or_default(item.get("level"), 1)
+
+        if item_type == "table":
+            text = _flatten_table(item)
+        else:
+            text = _clean_text(item.get("text"))
+
+        if not text or _is_catalog_entry(item_type, text):
+            continue
+
+        if item_type == "heading":
+            flush_block()
+            # Drop headings at the same or a deeper level before pushing this one.
+            while section_stack and section_stack[-1][0] >= level:
+                section_stack.pop()
+            section_stack.append((level, text))
+            current_section_title = text
+            current_section_path = " > ".join(title for _, title in section_stack)
+            structure_nodes.append(
+                StructureNode(
+                    node_id=f"{doc_id}-node-{len(structure_nodes) + 1}",
+                    level=level,
+                    title=text,
+                    page_start=page,
+                    page_end=page,
+                    section_path=current_section_path,
+                )
+            )
+            continue
+
+        if item_type == "caption":
+            text = f"图注: {text}"
+
+        if current_page_start is None:
+            current_page_start = page
+        current_page_end = page
+        current_block_text.append(text)
+        current_block_layout_ids.append(layout_id)
+
+    flush_block()
+    raw_text = "\n".join(chunk.text for chunk in source_chunks)
+    metadata = {
+        "layout_count": len(layouts),
+        "structure_node_count": len(structure_nodes),
+        "semantic_block_count": len(semantic_blocks),
+        "source_chunk_count": len(source_chunks),
+    }
+    return ParsedDocument(
+        doc_id=doc_id,
+        doc_name=doc_name,
+        raw_text=raw_text,
+        structure_nodes=structure_nodes,
+        semantic_blocks=semantic_blocks,
+        source_chunks=source_chunks,
+        metadata=metadata,
+    )
--- a/rag_eval/dataset_builder/runner.py
+++ b/rag_eval/dataset_builder/runner.py
@@ -0,0 +1,142 @@
+"""Orchestration layer for PDF-to-dataset build jobs."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import yaml
+
+from rag_eval.settings import EvaluationSettings
+from rag_eval.shared.utils import ensure_directory, utc_now_iso
+
+from .generator.question_generator import OpenAIQuestionGenerator, QuestionGenerator
+from .generator.validators import dedupe_samples, validate_draft_sample
+from .models import DatasetBuildJob, DatasetBuildResult, DatasetBuildRuntime, ParseFailure
+from .parser.aliyun_document_parser import AliyunDocumentParser
+from .parser.aliyun_docmind_gateway import AliyunDocmindGateway
+from .schema import DatasetBuildConfigModel
+from .sources import discover_pdf_files
+from .writers import build_artifact_paths, write_dataset_build_artifacts
+
+
+def load_dataset_build_job(path: str | Path, settings: EvaluationSettings | None = None) -> DatasetBuildJob:
+    """Read a dataset build YAML file, validate it, and return the job object.
+
+    Relative paths in the config are resolved against the directory of the
+    config file. The generation model and the parser failure mode fall back
+    from the config to the settings and finally to built-in defaults.
+    """
+    effective_settings = settings or EvaluationSettings()
+    config_file = Path(path).resolve()
+    raw_config = yaml.safe_load(config_file.read_text(encoding="utf-8")) or {}
+    config = DatasetBuildConfigModel.model_validate(raw_config)
+    config_dir = config_file.parent
+    generation = config.generation
+
+    generation_model = generation.model or effective_settings.dataset_generator_model or "qwen3.6-plus"
+
+    # The failure mode is read from the raw parser section of the payload.
+    parser_section = raw_config.get("parser") or {}
+    failure_mode = parser_section.get("failure_mode") or effective_settings.parser_failure_mode or "fail"
+
+    def resolve(raw_path: str) -> Path:
+        """Resolve one configured path against the config file directory."""
+        return config.resolve_path(config_dir, raw_path)
+
+    return DatasetBuildJob(
+        job_name=config.job_name,
+        input_path=resolve(config.input.path),
+        input_glob=config.input.glob,
+        parser_provider=config.parser.provider,
+        failure_mode=failure_mode,
+        generation_model=generation_model,
+        output_type=generation.output_type,
+        review_mode=generation.review_mode,
+        max_questions_per_document=generation.max_questions_per_document,
+        max_source_chunks_per_question=generation.max_source_chunks_per_question,
+        dataset_path=resolve(config.output.dataset_path),
+        artifact_dir=resolve(config.output.artifact_dir),
+        runtime=DatasetBuildRuntime(max_documents=config.runtime.max_documents),
+        source_path=config_file,
+    )
+
+
+def _create_parser(job: DatasetBuildJob, settings: EvaluationSettings) -> AliyunDocumentParser:
+    """Build the document parser selected by the job's parser provider.
+
+    Raises:
+        ValueError: If the provider is anything other than ``aliyun_docmind``.
+    """
+    provider = job.parser_provider
+    if provider == "aliyun_docmind":
+        return AliyunDocumentParser(AliyunDocmindGateway(settings))
+    raise ValueError(f"Unsupported parser provider: {provider}")
+
+
+def _create_generator(job: DatasetBuildJob, settings: EvaluationSettings) -> QuestionGenerator:
+    """Build the draft question generator for the job's generation model."""
+    model_name = job.generation_model
+    return OpenAIQuestionGenerator(settings=settings, model=model_name)
+
+
+def run_dataset_build(
+    config_path: str | Path,
+    *,
+    settings: EvaluationSettings | None = None,
+    parser: AliyunDocumentParser | None = None,
+    generator: QuestionGenerator | None = None,
+) -> DatasetBuildResult:
+    """Execute one dataset build job from config file to written artifacts.
+
+    Each discovered PDF is parsed; for every parsed document, questions are
+    generated, validated, de-duplicated, and capped per document. A parse
+    error is recorded as a failure. In ``fail`` mode the artifacts collected
+    so far are written and the error is re-raised; otherwise the file is
+    skipped. Artifacts are written once more when the run finishes.
+    """
+    settings = settings or EvaluationSettings()
+    job = load_dataset_build_job(config_path, settings=settings)
+    pdf_files = discover_pdf_files(job.input_path, job.input_glob)
+    document_limit = job.runtime.max_documents
+    if document_limit is not None:
+        pdf_files = pdf_files[:document_limit]
+
+    active_parser = parser or _create_parser(job, settings)
+    active_generator = generator or _create_generator(job, settings)
+
+    # Colons are replaced with dashes before the timestamp is used as a directory name.
+    run_id = utc_now_iso().replace(":", "-")
+    artifact_root = job.artifact_dir / run_id
+    ensure_directory(artifact_root)
+    artifact_paths = build_artifact_paths(artifact_root)
+
+    documents = []
+    failures: list[ParseFailure] = []
+    draft_samples = []
+
+    def assemble_result() -> DatasetBuildResult:
+        """Bundle the state collected so far into a result object."""
+        return DatasetBuildResult(
+            job=job,
+            run_id=run_id,
+            artifact_paths=artifact_paths,
+            documents=documents,
+            draft_samples=draft_samples,
+            parse_failures=failures,
+        )
+
+    for pdf_path in pdf_files:
+        try:
+            document = active_parser.parse(pdf_path)
+        except Exception as exc:
+            failures.append(ParseFailure(file_path=pdf_path.as_posix(), error=str(exc)))
+            if job.failure_mode != "fail":
+                continue
+            # Persist what was collected before surfacing the original error.
+            write_dataset_build_artifacts(assemble_result())
+            raise
+
+        documents.append(document)
+        candidates = active_generator.generate(
+            document,
+            max_questions=job.max_questions_per_document,
+            max_chunks_per_question=job.max_source_chunks_per_question,
+            job_name=job.job_name,
+        )
+        # Keep only the samples for which validation reports no errors.
+        accepted = [
+            sample
+            for sample in candidates
+            if not validate_draft_sample(
+                sample,
+                document=document,
+                max_source_chunks_per_question=job.max_source_chunks_per_question,
+            )
+        ]
+        unique_samples = dedupe_samples(accepted)
+        draft_samples.extend(unique_samples[: job.max_questions_per_document])
+
+    result = assemble_result()
+    write_dataset_build_artifacts(result)
+    return result
--- a/rag_eval/dataset_builder/schema.py
+++ b/rag_eval/dataset_builder/schema.py
@@ -0,0 +1,82 @@
+"""Pydantic schemas for dataset build YAML configuration files."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+
+
+class DatasetBuildInputModel(BaseModel):
+    """Schema for input PDF discovery settings."""
+
+    # Unknown keys in this section are ignored rather than rejected.
+    model_config = ConfigDict(extra="ignore")
+
+    # Required location of the input, given as a string.
+    path: str
+    # Glob pattern for the input; defaults to "*.pdf".
+    glob: str = "*.pdf"
+
+
+class DatasetBuildParserModel(BaseModel):
+    """Schema for parser selection and failure handling."""
+
+    # Unknown keys in this section are ignored rather than rejected.
+    model_config = ConfigDict(extra="ignore")
+
+    # Required; "aliyun_docmind" is the only accepted provider.
+    provider: Literal["aliyun_docmind"]
+    # Optional "fail" or "skip"; defaults to None when the key is omitted.
+    failure_mode: Literal["fail", "skip"] | None = None
+
+
+class DatasetBuildGenerationModel(BaseModel):
+    """Schema for question generation controls."""
+
+    # Unknown keys in this section are ignored rather than rejected.
+    model_config = ConfigDict(extra="ignore")
+
+    # Optional generator model name; defaults to None when the key is omitted.
+    model: str | None = None
+    # Required; "online_question_bank" is the only accepted value.
+    output_type: Literal["online_question_bank"]
+    # Required; "draft_with_manual_review" is the only accepted value.
+    review_mode: Literal["draft_with_manual_review"]
+    # Must be greater than zero; defaults to 10.
+    max_questions_per_document: int = Field(default=10, gt=0)
+    # Must be greater than zero; defaults to 3.
+    max_source_chunks_per_question: int = Field(default=3, gt=0)
+
+
+class DatasetBuildOutputModel(BaseModel):
+    """Schema for dataset build output locations."""
+
+    # Unknown keys in this section are ignored rather than rejected.
+    model_config = ConfigDict(extra="ignore")
+
+    # Required path of the dataset file, given as a string.
+    dataset_path: str
+    # Required path of the artifact directory, given as a string.
+    artifact_dir: str
+
+
+class DatasetBuildRuntimeModel(BaseModel):
+    """Schema for runtime throttling and document limits."""
+
+    # Unknown keys in this section are ignored rather than rejected.
+    model_config = ConfigDict(extra="ignore")
+
+    # Optional document limit; must be greater than zero when set, defaults to None.
+    max_documents: int | None = Field(default=None, gt=0)
+
+
+class DatasetBuildConfigModel(BaseModel):
+    """Root schema describing one dataset build job.
+
+    Unknown top-level keys are ignored. The ``runtime`` section is optional
+    and defaults to an empty runtime model; every other section is required.
+    """
+
+    model_config = ConfigDict(extra="ignore")
+
+    job_name: str
+    input: DatasetBuildInputModel
+    parser: DatasetBuildParserModel
+    generation: DatasetBuildGenerationModel
+    output: DatasetBuildOutputModel
+    runtime: DatasetBuildRuntimeModel = Field(default_factory=DatasetBuildRuntimeModel)
+
+    @model_validator(mode="after")
+    def validate_job_name(self) -> "DatasetBuildConfigModel":
+        """Refuse job names that are empty or consist only of whitespace."""
+        if self.job_name.strip():
+            return self
+        raise ValueError("job_name must not be empty.")
+
+    def resolve_path(self, base_dir: Path, raw_path: str) -> Path:
+        """Return *raw_path* as a path, anchoring relative values at *base_dir*.
+
+        Absolute paths are returned as given; relative ones are joined onto
+        ``base_dir`` and resolved.
+        """
+        candidate = Path(raw_path)
+        return candidate if candidate.is_absolute() else (base_dir / candidate).resolve()
--- a/rag_eval/dataset_builder/sources.py
+++ b/rag_eval/dataset_builder/sources.py
@@ -0,0 +1,21 @@
+"""Input source discovery helpers for dataset build jobs."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def discover_pdf_files(input_path: Path, pattern: str = "*.pdf") -> list[Path]:
+    """Collect the PDF files named by a file path or found in a directory.
+
+    A file path must itself have a ``.pdf`` suffix (case-insensitive) and is
+    returned as a one-element list. A directory is scanned with ``pattern``
+    and the matching PDF files are returned in sorted order.
+
+    Raises:
+        FileNotFoundError: If ``input_path`` does not exist.
+        ValueError: If a single input file is not a PDF, or a directory scan
+            finds no PDF files.
+    """
+
+    def has_pdf_suffix(candidate: Path) -> bool:
+        """Tell whether the suffix is ``.pdf`` regardless of letter case."""
+        return candidate.suffix.lower() == ".pdf"
+
+    if not input_path.exists():
+        raise FileNotFoundError(f"Input path does not exist: {input_path}")
+
+    if input_path.is_file():
+        if has_pdf_suffix(input_path):
+            return [input_path]
+        raise ValueError(f"Input file is not a PDF: {input_path}")
+
+    matches = [
+        entry
+        for entry in input_path.glob(pattern)
+        if entry.is_file() and has_pdf_suffix(entry)
+    ]
+    matches.sort()
+    if matches:
+        return matches
+    raise ValueError(f"No PDF files found under {input_path} with pattern {pattern}")
--- a/rag_eval/dataset_builder/writers.py
+++ b/rag_eval/dataset_builder/writers.py
@@ -0,0 +1,147 @@
+"""Artifact writers for dataset build runs."""
+
+from __future__ import annotations
+
+import csv
+import json
+import shutil
+from pathlib import Path
+from typing import Any
+
+from rag_eval.shared.utils import ensure_directory
+
+from .models import DatasetBuildArtifactPaths, DatasetBuildResult
+
+
+def build_artifact_paths(root_dir: Path) -> DatasetBuildArtifactPaths:
+    """Derive every artifact file location for one dataset build from its root directory."""
+    file_names = {
+        "documents_jsonl": "documents.jsonl",
+        "semantic_blocks_jsonl": "semantic_blocks.jsonl",
+        "source_chunks_jsonl": "source_chunks.jsonl",
+        "dataset_draft_csv": "dataset_draft.csv",
+        "parse_failures_csv": "parse_failures.csv",
+        "metadata_json": "metadata.json",
+    }
+    return DatasetBuildArtifactPaths(root_dir=root_dir, **{attr: root_dir / name for attr, name in file_names.items()})
+
+
+def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
+    """Serialize each dictionary as one JSON document per line, keeping non-ASCII text as-is."""
+    lines = (json.dumps(record, ensure_ascii=False) + "\n" for record in rows)
+    with path.open("w", encoding="utf-8") as stream:
+        stream.writelines(lines)
+
+
+def _write_csv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str] | None = None) -> None:
+    """Write records as CSV; list values are stored as JSON text and unseen keys extend the header."""
+    header = list(fieldnames or [])
+    prepared: list[dict[str, Any]] = []
+    for record in rows:
+        # Columns missing from the requested header are appended in first-seen order.
+        header += [key for key in record if key not in header]
+        prepared.append(
+            {
+                key: json.dumps(value, ensure_ascii=False) if isinstance(value, list) else value
+                for key, value in record.items()
+            }
+        )
+
+    # An entirely empty header falls back to a single "placeholder" column.
+    columns = header or ["placeholder"]
+    with path.open("w", encoding="utf-8", newline="") as stream:
+        csv_writer = csv.DictWriter(stream, fieldnames=columns)
+        csv_writer.writeheader()
+        csv_writer.writerows(prepared)
+
+
+def _write_latest_alias_assets(result: DatasetBuildResult) -> None:
+    """Copy the source-chunk JSONL, draft CSV and metadata JSON into ``<artifact_dir>/latest``."""
+    latest_dir = result.job.artifact_dir / "latest"
+    ensure_directory(latest_dir)
+
+    # The originals stay where they are; the copies get fixed file names under "latest".
+    shutil.copyfile(result.artifact_paths.source_chunks_jsonl, latest_dir / "source_chunks.jsonl")
+    shutil.copyfile(result.artifact_paths.dataset_draft_csv, latest_dir / "dataset_draft.csv")
+    shutil.copyfile(result.artifact_paths.metadata_json, latest_dir / "metadata.json")
+
+
+def write_dataset_build_artifacts(result: DatasetBuildResult) -> None:
+    """Write every output of a dataset build run to disk.
+
+    Files are produced in this order:
+
+    1. The ``documents``, ``semantic_blocks`` and ``source_chunks`` JSONL files.
+    2. The draft dataset CSV, written twice with identical content: under the
+       artifact root and at ``result.job.dataset_path``.
+    3. The parse-failure CSV with ``file_path`` and ``error`` columns.
+    4. ``metadata.json`` holding the run id, job snapshot and counters.
+    5. Alias copies under the job's ``latest`` directory.
+
+    The artifact root and the parent directory of the dataset path are passed
+    to ``ensure_directory`` before anything is written.
+
+    Args:
+        result: Completed build result whose documents, draft samples and
+            parse failures are serialized.
+    """
+    paths = result.artifact_paths
+    documents = result.documents
+    ensure_directory(paths.root_dir)
+    ensure_directory(result.job.dataset_path.parent)
+
+    # JSONL artifacts: one record per document, per semantic block and per source chunk.
+    _write_jsonl(paths.documents_jsonl, [document.to_record() for document in documents])
+    block_records = [
+        block.to_record()
+        for document in documents
+        for block in document.semantic_blocks
+    ]
+    _write_jsonl(paths.semantic_blocks_jsonl, block_records)
+    chunk_records = [
+        chunk.to_record()
+        for document in documents
+        for chunk in document.source_chunks
+    ]
+    _write_jsonl(paths.source_chunks_jsonl, chunk_records)
+
+    # Both draft CSV copies share this column order; _write_csv appends any
+    # record keys that are not listed here.
+    draft_columns = [
+        "sample_id",
+        "question",
+        "ground_truth",
+        "scenario",
+        "language",
+        "doc_id",
+        "doc_name",
+        "section_path",
+        "page_start",
+        "page_end",
+        "source_chunk_ids",
+        "question_type",
+        "difficulty",
+        "review_status",
+        "review_notes",
+    ]
+    draft_rows = [sample.to_record() for sample in result.draft_samples]
+    for destination in (paths.dataset_draft_csv, result.job.dataset_path):
+        _write_csv(destination, draft_rows, fieldnames=draft_columns)
+
+    failure_rows = [failure.to_record() for failure in result.parse_failures]
+    _write_csv(paths.parse_failures_csv, failure_rows, fieldnames=["file_path", "error"])
+
+    # metadata.json records the run id, the job snapshot and simple counters.
+    metadata = {
+        "run_id": result.run_id,
+        "job": result.job.snapshot(),
+        "stats": {
+            "documents_processed": len(documents),
+            "draft_samples": len(result.draft_samples),
+            "parse_failures": len(result.parse_failures),
+        },
+    }
+    metadata_text = json.dumps(metadata, ensure_ascii=False, indent=2)
+    paths.metadata_json.write_text(metadata_text, encoding="utf-8")
+    # Refresh the stable "latest" copies only after all files above exist.
+    _write_latest_alias_assets(result)
--- a/rag_eval/execution/init.py
+++ b/rag_eval/execution/init.py
@@ -0,0 +1,5 @@
+"""Execution entrypoints for running evaluation scenarios."""
+
+from .runner import run_scenario
+
+__all__ = ["run_scenario"]
--- a/rag_eval/execution/concurrency.py
+++ b/rag_eval/execution/concurrency.py
@@ -0,0 +1,23 @@
+"""Async helpers for executing bounded concurrent workloads."""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Awaitable, Callable, TypeVar
+
+T = TypeVar("T")
+
+
+async def gather_with_limit(
+    factories: list[Callable[[], Awaitable[T]]],
+    limit: int,
+) -> list[T]:
+    """Await every factory's result, letting at most ``max(1, limit)`` of them run at once."""
+    gate = asyncio.Semaphore(max(1, limit))
+
+    async def run_one(make_awaitable: Callable[[], Awaitable[T]]) -> T:
+        """Invoke a single factory only while holding a semaphore slot."""
+        async with gate:
+            return await make_awaitable()
+
+    return await asyncio.gather(*map(run_one, factories))
--- a/rag_eval/execution/errors.py
+++ b/rag_eval/execution/errors.py
@@ -0,0 +1,6 @@
+"""Custom exceptions raised during scenario execution."""
+
+class ScenarioExecutionError(RuntimeError):
+    """Error type for scenario execution failures; adds nothing beyond ``RuntimeError``."""
+
+    pass
--- a/rag_eval/execution/evaluator.py
+++ b/rag_eval/execution/evaluator.py
@@ -0,0 +1,125 @@
+"""Core evaluation workflow for offline and online scenarios."""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Any
+
+from rag_eval.adapters.base import AppAdapter
+from rag_eval.datasets.loader import load_dataset_records
+from rag_eval.datasets.normalizers import normalize_records
+from rag_eval.execution.concurrency import gather_with_limit
+from rag_eval.metrics.pipeline import MetricPipeline
+from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
+from rag_eval.shared.utils import utc_now_iso
+
+
+class Evaluator:
+    """Coordinate dataset loading, optional app execution, and metric scoring."""
+
+    def __init__(
+        self,
+        scenario: Scenario,
+        metric_pipeline: MetricPipeline,
+        app_adapter: AppAdapter | None = None,
+    ):
+        """Store the scenario, metric pipeline and optional adapter (required for online mode)."""
+        self.scenario = scenario
+        self.metric_pipeline = metric_pipeline
+        self.app_adapter = app_adapter
+
+    def evaluate(self) -> EvaluationResult:
+        """Run the whole flow synchronously via ``asyncio.run``; do not call from a running loop."""
+        started_at = utc_now_iso()
+        raw_records = load_dataset_records(self.scenario.dataset.path)
+        samples, invalid_samples = normalize_records(
+            raw_records,
+            mode=self.scenario.mode,
+            max_samples=self.scenario.runtime.max_samples,
+        )
+
+        if self.scenario.mode == "online":
+            # Online mode enriches each sample by calling the target application first.
+            samples, online_invalids = asyncio.run(self._enrich_online_samples(samples))
+            invalid_samples.extend(online_invalids)
+
+        metric_scores = asyncio.run(
+            self.metric_pipeline.score_samples(
+                samples,
+                max_concurrency=self.scenario.runtime.metric_limit(),
+            )
+        )
+        finished_at = utc_now_iso()
+        run_id = finished_at.replace(":", "-")  # derived first so every score row can carry it
+        score_rows = [self._merge_score(sample, score, run_id) for sample, score in zip(samples, metric_scores)]
+        return EvaluationResult(
+            scenario=self.scenario,
+            run_id=run_id,
+            started_at=started_at,
+            finished_at=finished_at,
+            valid_samples=samples,
+            invalid_samples=invalid_samples,
+            score_rows=score_rows,
+        )
+
+    async def _enrich_online_samples(
+        self,
+        samples: list[NormalizedSample],
+    ) -> tuple[list[NormalizedSample], list[InvalidSample]]:
+        """Call the adapter for every sample and split the outcomes into valid and invalid lists."""
+        if self.app_adapter is None:
+            raise ValueError("online mode requires an app adapter.")
+
+        valid: list[NormalizedSample] = []
+        invalid: list[InvalidSample] = []
+
+        async def enrich_with_capture(sample: NormalizedSample) -> NormalizedSample | InvalidSample:
+            """Convert adapter exceptions into invalid samples instead of aborting the run."""
+            try:
+                return await self.app_adapter.enrich_sample(sample)
+            except Exception as exc:
+                error_type = type(exc).__name__
+                return InvalidSample(
+                    sample_id=sample.sample_id,
+                    error=f"adapter failed [{error_type}]: {exc}",
+                    raw=sample.raw,
+                )
+
+        factories = [
+            (lambda sample=sample: enrich_with_capture(sample))
+            for sample in samples
+        ]
+        results = await gather_with_limit(factories, self.scenario.runtime.app_limit())
+
+        for sample in results:
+            if isinstance(sample, InvalidSample):
+                invalid.append(sample)
+                continue
+            # Treat incomplete adapter payloads as invalid so reporting stays explicit.
+            errors: list[str] = []
+            if not sample.answer:
+                errors.append("adapter returned empty answer")
+            if not sample.contexts:
+                errors.append("adapter returned empty contexts")
+            if errors:
+                invalid.append(
+                    InvalidSample(
+                        sample_id=sample.sample_id,
+                        error="; ".join(errors),
+                        raw=sample.raw,
+                    )
+                )
+                continue
+            valid.append(sample)
+        return valid, invalid
+
+    def _merge_score(self, sample: NormalizedSample, score: Any, run_id: str | None = None) -> dict[str, Any]:
+        """Build one output row; ``run_id`` defaults to the scenario name, the value used before."""
+        record = sample.to_record()
+        record["contexts"] = sample.contexts
+        record.update(score.metrics)
+        record["error"] = score.error
+        record["judge_model"] = self.scenario.judge_model
+        record["embedding_model"] = self.scenario.embedding_model
+        record["run_id"] = self.scenario.scenario_name if run_id is None else run_id
+        return record
--- a/rag_eval/execution/runner.py
+++ b/rag_eval/execution/runner.py
@@ -0,0 +1,42 @@
+"""High-level scenario runner used by the package and CLI entrypoints."""
+
+from __future__ import annotations
+
+from rag_eval.adapters.http import HttpAppAdapter
+from rag_eval.adapters.python import PythonFunctionAdapter
+from rag_eval.config.loader import load_scenario
+from rag_eval.metrics.factory import build_metric_pipeline
+from rag_eval.reporting.writers import write_run_artifacts
+from rag_eval.settings import EvaluationSettings
+from rag_eval.shared.models import Scenario
+
+from .evaluator import Evaluator
+
+
+def build_adapter(scenario: Scenario):
+    """Return the adapter matching ``scenario.app_adapter.type``, or ``None`` when none is configured."""
+    config = scenario.app_adapter
+    if config is None:
+        return None
+    adapter_classes = {"http": HttpAppAdapter, "python": PythonFunctionAdapter}
+    if config.type not in adapter_classes:
+        raise ValueError(f"Unsupported adapter type: {config.type}")
+    return adapter_classes[config.type](config)
+
+
+def run_scenario(
+    scenario_path: str,
+    settings: EvaluationSettings | None = None,
+):
+    """Load a scenario, evaluate it, write its run artifacts and return the evaluation result."""
+    active_settings = settings if settings else EvaluationSettings()
+    if not active_settings.openai_api_key:
+        raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")
+    scenario = load_scenario(scenario_path)
+    outcome = Evaluator(
+        scenario=scenario,
+        app_adapter=build_adapter(scenario),
+        metric_pipeline=build_metric_pipeline(scenario, active_settings),
+    ).evaluate()
+    write_run_artifacts(outcome)
+    return outcome
--- a/rag_eval/metrics/init.py
+++ b/rag_eval/metrics/init.py
@@ -0,0 +1,5 @@
+"""Metric pipeline construction helpers."""
+
+from .factory import build_metric_pipeline
+
+__all__ = ["build_metric_pipeline"]
--- a/rag_eval/metrics/factory.py
+++ b/rag_eval/metrics/factory.py
@@ -0,0 +1,59 @@
+"""Factories for OpenAI-backed RAGAS models and metric pipelines."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from openai import AsyncOpenAI
+
+from rag_eval.compat import ensure_ragas_import_compat
+from rag_eval.settings import EvaluationSettings
+from rag_eval.shared.models import Scenario
+
+ensure_ragas_import_compat()
+
+from ragas.embeddings.base import embedding_factory
+from ragas.llms import llm_factory
+from ragas.metrics.collections import (
+    AnswerRelevancy,
+    ContextPrecision,
+    ContextRecall,
+    Faithfulness,
+)
+
+from .pipeline import MetricPipeline
+
+
+def build_models(
+    judge_model: str,
+    embedding_model: str,
+    settings: EvaluationSettings,
+) -> tuple[Any, Any]:
+    """Build the judge LLM and the embedding model, both backed by one shared AsyncOpenAI client."""
+    shared_client = AsyncOpenAI(**settings.openai_client_kwargs)
+    # The LLM is created before the embeddings, matching the order of the returned pair.
+    judge = llm_factory(judge_model, client=shared_client)
+    return judge, embedding_factory(provider="openai", model=embedding_model, client=shared_client)
+
+
+def build_metric_pipeline(
+    scenario: Scenario,
+    settings: EvaluationSettings,
+) -> MetricPipeline:
+    """Assemble a pipeline holding the scenario's metrics, in the order the scenario lists them."""
+    judge, embedder = build_models(
+        scenario.judge_model,
+        scenario.embedding_model,
+        settings,
+    )
+    # Every supported metric is instantiated up front; the scenario then picks a subset by name.
+    available: dict[str, Any] = {
+        "faithfulness": Faithfulness(llm=judge),
+        "answer_relevancy": AnswerRelevancy(llm=judge, embeddings=embedder),
+        "context_recall": ContextRecall(llm=judge),
+        "context_precision": ContextPrecision(llm=judge),
+    }
+    selected = {metric_name: available[metric_name] for metric_name in scenario.metrics}
+    return MetricPipeline(
+        metrics=selected, metric_timeout_seconds=settings.ragas_metric_timeout_seconds
+    )
--- a/rag_eval/metrics/pipeline.py
+++ b/rag_eval/metrics/pipeline.py
@@ -0,0 +1,82 @@
+"""Execution pipeline for scoring normalized samples with RAGAS metrics."""
+
+from __future__ import annotations
+
+import asyncio
+import math
+from dataclasses import dataclass
+from typing import Any
+
+from rag_eval.shared.models import MetricScore, NormalizedSample
+
+
+@dataclass(slots=True)
+class MetricPipeline:
+    """Score one or many normalized samples against a configured metric set."""
+
+    metrics: dict[str, Any]
+    metric_timeout_seconds: float | None = None
+
+    async def score_sample(self, sample: NormalizedSample) -> MetricScore:
+        """Score one sample; a failing metric stays NaN and adds ``name: [ErrorType] message``."""
+        results = {name: math.nan for name in self.metrics}
+        errors: list[str] = []
+
+        for name, metric in self.metrics.items():
+            try:
+                result = await self._run_metric(name, metric, sample)
+                results[name] = float(result.value)
+            except Exception as exc:
+                errors.append(f"{name}: [{type(exc).__name__}] {exc}")  # type kept: timeouts have no text
+        return MetricScore(metrics=results, error=" | ".join(errors))
+
+    async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
+        """Call ``metric.ascore`` with the keyword set for ``name``, bounded by the optional timeout."""
+        timeout = None
+        if self.metric_timeout_seconds is not None:
+            timeout = max(1.0, float(self.metric_timeout_seconds))
+
+        if name == "faithfulness":
+            coroutine = metric.ascore(
+                user_input=sample.question,
+                response=sample.answer,
+                retrieved_contexts=sample.contexts,
+            )
+        elif name == "answer_relevancy":
+            coroutine = metric.ascore(
+                user_input=sample.question,
+                response=sample.answer,
+            )
+        elif name == "context_recall":
+            coroutine = metric.ascore(
+                user_input=sample.question,
+                retrieved_contexts=sample.contexts,
+                reference=sample.ground_truth,
+            )
+        elif name == "context_precision":
+            coroutine = metric.ascore(
+                user_input=sample.question,
+                reference=sample.ground_truth,
+                retrieved_contexts=sample.contexts,
+            )
+        else:
+            raise ValueError(f"Unsupported metric: {name}")
+
+        if timeout is None:
+            return await coroutine
+        return await asyncio.wait_for(coroutine, timeout=timeout)
+
+    async def score_samples(
+        self,
+        samples: list[NormalizedSample],
+        max_concurrency: int,
+    ) -> list[MetricScore]:
+        """Score all samples, at most ``max(1, max_concurrency)`` at a time, in input order."""
+        semaphore = asyncio.Semaphore(max(1, max_concurrency))
+
+        async def guarded(sample: NormalizedSample) -> MetricScore:
+            """Throttle a single sample-scoring coroutine with the shared semaphore."""
+            async with semaphore:
+                return await self.score_sample(sample)
+
+        return await asyncio.gather(*(guarded(sample) for sample in samples))
--- a/rag_eval/metrics/registry.py
+++ b/rag_eval/metrics/registry.py
@@ -0,0 +1,8 @@
+"""Supported metric names recognized by scenario validation and pipeline setup."""
+
+SUPPORTED_METRICS = {  # same four names key the registry in metrics/factory.py and pipeline dispatch
+    "faithfulness",
+    "answer_relevancy",
+    "context_recall",
+    "context_precision",
+}
--- a/rag_eval/reporting/init.py
+++ b/rag_eval/reporting/init.py
@@ -0,0 +1,5 @@
+"""Reporting helpers that write evaluation outputs to disk."""
+
+from .writers import write_run_artifacts
+
+__all__ = ["write_run_artifacts"]
--- a/rag_eval/reporting/artifacts.py
+++ b/rag_eval/reporting/artifacts.py
@@ -0,0 +1,20 @@
+"""Helpers for deriving file-system paths for run artifacts."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from rag_eval.shared.models import RunArtifactPaths
+
+
+def build_artifact_paths(output_dir: Path, run_id: str) -> RunArtifactPaths:
+    """Place all artifact files of one evaluation run inside ``output_dir / run_id``."""
+    root = output_dir / run_id
+    file_names = {
+        "scenario_snapshot": "scenario.snapshot.yaml",
+        "scores_csv": "scores.csv",
+        "invalid_csv": "invalid.csv",
+        "summary_md": "summary.md",
+        "metadata_json": "metadata.json",
+    }
+    return RunArtifactPaths(root_dir=root, **{attr: root / name for attr, name in file_names.items()})
--- a/rag_eval/reporting/summary.py
+++ b/rag_eval/reporting/summary.py
@@ -0,0 +1,78 @@
+"""Markdown summary generation for completed evaluation runs."""
+
+from __future__ import annotations
+
+import math
+
+import pandas as pd
+
+from rag_eval.shared.models import EvaluationResult
+
+
+def _table_from_frame(frame: pd.DataFrame) -> str:
+    """Format a dataframe as left-aligned, pipe-separated text; an empty frame yields ``No rows.``"""
+    if frame.empty:
+        return "No rows."
+
+    titles = [str(column) for column in frame.columns]
+    cells = [[str(value) for value in record] for record in frame.astype(object).values.tolist()]
+    # Each column is as wide as its longest entry, counting the title itself.
+    widths = [
+        max(len(title), max((len(record[position]) for record in cells), default=0))
+        for position, title in enumerate(titles)
+    ]
+
+    def render(texts: list[str], joiner: str = " | ") -> str:
+        """Pad every text to its column width and join the padded pieces."""
+        return joiner.join(text.ljust(width) for text, width in zip(texts, widths))
+
+    rule = render(["-" * width for width in widths], joiner="-|-")
+    lines = [render(titles), rule, *(render(record) for record in cells)]
+    return "\n".join(lines)
+
+
+def build_summary_markdown(result: EvaluationResult) -> str:
+    """Render the markdown summary of one evaluation run.
+
+    The text opens with a heading and run overview (run id, mode, sample
+    counts, model names) followed by a "Metric Means" section. Without score
+    rows that section only states that nothing was scored; otherwise it lists
+    each scenario metric's mean and a fenced per-sample score table.
+
+    Args:
+        result: Evaluation result supplying the scenario, run id, samples and score rows.
+
+    Returns:
+        The markdown document, ending with a newline.
+    """
+    scenario = result.scenario
+    valid_count, invalid_count = len(result.valid_samples), len(result.invalid_samples)
+    scores = pd.DataFrame(result.score_rows)
+
+    lines = [
+        f"# {scenario.scenario_name}",
+        "",
+        f"- run_id: `{result.run_id}`",
+        f"- mode: `{scenario.mode}`",
+        f"- total_samples: `{valid_count + invalid_count}`",
+        f"- valid_samples: `{valid_count}`",
+        f"- invalid_samples: `{invalid_count}`",
+        f"- judge_model: `{scenario.judge_model}`",
+        f"- embedding_model: `{scenario.embedding_model}`",
+        "",
+        "## Metric Means",
+        "",
+    ]
+
+    if scores.empty:
+        return "\n".join([*lines, "No valid samples were scored."]) + "\n"
+
+    for metric in scenario.metrics:
+        average = scores[metric].mean(numeric_only=True)
+        # Means that are not real floats (NaN included) are shown as n/a.
+        usable = isinstance(average, float) and not math.isnan(average)
+        lines.append(f"- {metric}: `{average:.4f}`" if usable else f"- {metric}: `n/a`")
+
+    table = _table_from_frame(scores[["sample_id", *scenario.metrics, "error"]])
+    lines += ["", "## Per-sample Scores", "", "```text", table, "```"]
+    return "\n".join(lines) + "\n"
--- a/rag_eval/reporting/writers.py
+++ b/rag_eval/reporting/writers.py
@@ -0,0 +1,52 @@
+"""Writers that persist evaluation outputs as local run artifacts."""
+
+from __future__ import annotations
+
+import json
+
+import pandas as pd
+import yaml
+
+from rag_eval.reporting.artifacts import build_artifact_paths
+from rag_eval.reporting.summary import build_summary_markdown
+from rag_eval.shared.models import EvaluationResult
+from rag_eval.shared.utils import ensure_directory
+
+
+def write_run_artifacts(result: EvaluationResult) -> None:
+    """Persist all standard artifacts of a finished evaluation run.
+
+    Writes the scenario snapshot (YAML), both CSV tables, the markdown summary and the metadata JSON.
+    """
+    scenario = result.scenario
+    paths = build_artifact_paths(scenario.output_dir, result.run_id)
+    ensure_directory(paths.root_dir)
+
+    snapshot_yaml = yaml.safe_dump(scenario.snapshot(), sort_keys=False, allow_unicode=True)
+    paths.scenario_snapshot.write_text(snapshot_yaml, encoding="utf-8")
+
+    # Both tables are written without the dataframe index column.
+    score_frame = pd.DataFrame(result.score_rows)
+    score_frame.to_csv(paths.scores_csv, index=False)
+    invalid_frame = pd.DataFrame([sample.to_record() for sample in result.invalid_samples])
+    invalid_frame.to_csv(paths.invalid_csv, index=False)
+
+    paths.summary_md.write_text(build_summary_markdown(result), encoding="utf-8")
+
+    # metadata.json is a short machine-readable digest of the run.
+    metadata = {
+        "run_id": result.run_id,
+        "scenario_name": scenario.scenario_name,
+        "mode": scenario.mode,
+        "judge_model": scenario.judge_model,
+        "embedding_model": scenario.embedding_model,
+        "started_at": result.started_at,
+        "finished_at": result.finished_at,
+        "dataset": scenario.dataset.path.as_posix(),
+        "valid_samples": len(result.valid_samples),
+        "invalid_samples": len(result.invalid_samples),
+    }
+    paths.metadata_json.write_text(
+        json.dumps(metadata, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
--- a/rag_eval/sample_rag_eval_dataset.csv
+++ b/rag_eval/sample_rag_eval_dataset.csv
@@ -0,0 +1,3 @@
+sample_id,question,contexts,answer,ground_truth,scenario,language,retrieval_config
+leave-policy-001,How many annual leave days does an employee with 6 years of service receive?,"[""Employees with 1 to 9 completed years of service receive 5 days of annual leave."",""Employees with 10 to 19 completed years of service receive 10 days of annual leave.""]","An employee with 6 years of service receives 5 annual leave days.","Employees with 1 to 9 completed years of service receive 5 annual leave days.",policy,en,"top_k=2;chunk_size=300"
+leave-policy-002,入职满12年的员工年假有几天？,"[""员工入司满1年不满10年的，年休假5天。"", ""员工入司满10年不满20年的，年休假10天。""]","根据规定，入职满12年的员工有10天年假。","员工入司满10年不满20年的，年休假10天。",policy,zh,"top_k=2;chunk_size=300"
--- a/rag_eval/settings.py
+++ b/rag_eval/settings.py
@@ -0,0 +1,68 @@
+"""Runtime settings loaded from environment variables for evaluation runs."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+
+
+class EvaluationSettings(BaseSettings):
+    """Environment-backed settings; values are read from variables or the repo-root ``.env`` file."""
+    model_config = SettingsConfigDict(
+        env_file=REPO_ROOT / ".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+    )
+
+    openai_api_key: str | None = Field(default=None, alias="OPENAI_API_KEY")
+    openai_base_url: str = Field(default="http://6.86.80.4:30080/v1", alias="OPENAI_BASE_URL")
+    ragas_judge_model: str = Field(default="deepseek-v4-flash", alias="RAGAS_JUDGE_MODEL")
+    ragas_embedding_model: str = Field(
+        default="text-embedding-v3",
+        alias="RAGAS_EMBEDDING_MODEL",
+    )
+    openai_timeout_seconds: float = Field(default=30.0, alias="OPENAI_TIMEOUT_SECONDS")
+    ragas_metric_timeout_seconds: float = Field(default=45.0, alias="RAGAS_METRIC_TIMEOUT_SECONDS")
+    batch_size: int = Field(default=8, alias="BATCH_SIZE")
+    alibaba_access_key_id: str | None = Field(default=None, alias="ALIBABA_ACCESS_KEY_ID")
+    alibaba_access_key_secret: str | None = Field(default=None, alias="ALIBABA_ACCESS_KEY_SECRET")
+    alibaba_endpoint: str | None = Field(default=None, alias="ALIBABA_ENDPOINT")
+    aliyun_parse_poll_interval_seconds: int = Field(
+        default=5,
+        alias="ALIYUN_PARSE_POLL_INTERVAL_SECONDS",
+    )
+    aliyun_parse_timeout_seconds: int = Field(
+        default=600,
+        alias="ALIYUN_PARSE_TIMEOUT_SECONDS",
+    )
+    aliyun_parse_layout_step_size: int = Field(
+        default=50,
+        alias="ALIYUN_PARSE_LAYOUT_STEP_SIZE",
+    )
+    aliyun_llm_enhancement: bool = Field(default=False, alias="ALIYUN_LLM_ENHANCEMENT")
+    aliyun_enhancement_mode: str = Field(default="balanced", alias="ALIYUN_ENHANCEMENT_MODE")
+    document_parse_artifact_prefix: str = Field(
+        default="outputs/dataset-builds",
+        alias="DOCUMENT_PARSE_ARTIFACT_PREFIX",
+    )
+    parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
+    dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL")
+
+    @property
+    def openai_client_kwargs(self) -> dict[str, str | float]:
+        """Build OpenAI client keyword arguments; the result is empty when no API key is configured."""
+        if not self.openai_api_key:
+            return {}
+
+        # The timeout is floored at one second; base_url is only passed when it is non-blank.
+        base_url = self.openai_base_url.strip()
+        kwargs: dict[str, str | float] = {
+            "api_key": self.openai_api_key,
+            "timeout": max(1.0, float(self.openai_timeout_seconds)),
+        }
+        return {**kwargs, "base_url": base_url} if base_url else kwargs
--- a/rag_eval/shared/init.py
+++ b/rag_eval/shared/init.py
@@ -0,0 +1,25 @@
+"""Shared data models and utilities used across evaluation subsystems."""
+
+from .models import (
+    AppAdapterConfig,
+    DatasetConfig,
+    EvaluationResult,
+    InvalidSample,
+    MetricScore,
+    NormalizedSample,
+    RunArtifactPaths,
+    RuntimeConfig,
+    Scenario,
+)
+
+__all__ = [
+    "AppAdapterConfig",
+    "DatasetConfig",
+    "EvaluationResult",
+    "InvalidSample",
+    "MetricScore",
+    "NormalizedSample",
+    "RunArtifactPaths",
+    "RuntimeConfig",
+    "Scenario",
+]
--- a/rag_eval/shared/models.py
+++ b/rag_eval/shared/models.py
@@ -0,0 +1,161 @@
+"""Shared runtime data models exchanged across the evaluation pipeline."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Literal
+
+
+Mode = Literal["offline", "online"]
+AdapterType = Literal["http", "python"]
+
+
+def _serialize_paths(value: Any) -> Any:
+    """Return a copy of *value* with nested ``Path`` objects as POSIX strings.
+
+    Only dicts (their values, not their keys) and lists are traversed; any
+    other object, including tuples and sets, is returned unchanged.
+    """
+    match value:
+        case Path():
+            return value.as_posix()
+        case dict():
+            return {name: _serialize_paths(entry) for name, entry in value.items()}
+        case list():
+            return [_serialize_paths(entry) for entry in value]
+        case _:
+            return value
+
+
+@dataclass(slots=True)
+class RuntimeConfig:
+    """Concurrency and sampling controls for one evaluation run.
+
+    ``app_concurrency`` and ``metric_concurrency`` are optional overrides;
+    when one is unset (``None``) or zero, ``batch_size`` is used in its place.
+    """
+
+    batch_size: int = 4
+    app_concurrency: int | None = None
+    metric_concurrency: int | None = None
+    max_samples: int | None = None
+
+    def metric_limit(self) -> int:
+        """Return the metric-scoring concurrency, defaulting to ``batch_size``."""
+        override = self.metric_concurrency
+        return override if override else self.batch_size
+
+    def app_limit(self) -> int:
+        """Return the application-call concurrency, defaulting to ``batch_size``."""
+        override = self.app_concurrency
+        return override if override else self.batch_size
+
+
+@dataclass(slots=True)
+class AppAdapterConfig:
+    """Resolved adapter configuration used by online scenarios.
+
+    Only ``type`` is required; every other field carries a default, and the
+    three dict fields each default to a fresh empty dict per instance.
+    """
+
+    # Adapter kind; the AdapterType alias restricts it to "http" or "python".
+    type: AdapterType
+    # NOTE(review): endpoint/method/timeout_seconds look HTTP-specific and
+    # `callable` looks python-adapter-specific — confirm against the adapters.
+    endpoint: str | None = None
+    method: str = "POST"
+    timeout_seconds: int = 30
+    callable: str | None = None
+    # NOTE(review): presumably request-shaping / response-extraction settings
+    # consumed by the adapter implementations — verify their schema there.
+    request_template: dict[str, Any] = field(default_factory=dict)
+    response_mapping: dict[str, str] = field(default_factory=dict)
+    static_kwargs: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass(slots=True)
+class DatasetConfig:
+    """Dataset location information for a scenario."""
+
+    # Location of the dataset on disk (required).
+    path: Path
+    # Optional format name; None when not specified.
+    # NOTE(review): presumably the loader infers the format when None — confirm.
+    format: str | None = None
+
+
+@dataclass(slots=True)
+class Scenario:
+    """Fully resolved description of one evaluation scenario.
+
+    ``runtime`` defaults to a fresh ``RuntimeConfig``; ``app_adapter`` and
+    ``source_path`` are optional and default to ``None``.
+    """
+
+    scenario_name: str
+    mode: Mode
+    dataset: DatasetConfig
+    judge_model: str
+    embedding_model: str
+    metrics: list[str]
+    output_dir: Path
+    runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
+    app_adapter: AppAdapterConfig | None = None
+    source_path: Path | None = None
+
+    def snapshot(self) -> dict[str, Any]:
+        """Return the scenario as a plain dict with every Path as a POSIX string."""
+        # asdict() also expands the nested dataclass fields into dicts.
+        payload = asdict(self)
+        return _serialize_paths(payload)
+
+
+@dataclass(slots=True)
+class NormalizedSample:
+    """Canonical sample shape shared by adapters, metrics, and reporting.
+
+    The first five fields are required; the remaining string fields default
+    to ``""`` and ``metadata`` / ``raw`` default to fresh empty dicts.
+    """
+
+    sample_id: str
+    question: str
+    contexts: list[str]
+    answer: str
+    ground_truth: str
+    scenario: str = ""
+    language: str = ""
+    retrieval_config: str = ""
+    metadata: dict[str, Any] = field(default_factory=dict)
+    raw: dict[str, Any] = field(default_factory=dict)
+
+    def to_record(self) -> dict[str, Any]:
+        """Flatten the sample into one row for CSV and artifact generation.
+
+        The canonical columns come first, followed by the ``metadata``
+        entries. A metadata key that matches a canonical column name replaces
+        that column's value. ``raw`` is not included.
+        """
+        canonical_columns = (
+            "sample_id",
+            "question",
+            "contexts",
+            "answer",
+            "ground_truth",
+            "scenario",
+            "language",
+            "retrieval_config",
+        )
+        row: dict[str, Any] = {name: getattr(self, name) for name in canonical_columns}
+        row.update(self.metadata)
+        return row
+
+
+@dataclass(slots=True)
+class InvalidSample:
+    """A dataset or adapter sample that could not be evaluated.
+
+    ``sample_id`` and ``error`` identify the failure; ``raw`` holds the
+    original payload that could not be processed.
+    """
+
+    sample_id: str
+    error: str
+    raw: dict[str, Any]
+
+    def to_record(self) -> dict[str, Any]:
+        """Flatten the invalid sample into one reporting row.
+
+        The row starts with ``sample_id`` and ``error`` and is followed by the
+        entries of ``raw``. A raw key named ``sample_id`` or ``error`` is
+        ignored, so the payload can never overwrite the diagnostic columns
+        (for example with a blank id or an unrelated ``error`` field).
+        """
+        record: dict[str, Any] = {"sample_id": self.sample_id, "error": self.error}
+        for key, value in self.raw.items():
+            # setdefault keeps the diagnostic values already in the row.
+            record.setdefault(key, value)
+        return record
+
+
+@dataclass(slots=True)
+class MetricScore:
+    """Metric values and accumulated errors for one evaluated sample."""
+
+    # Maps a metric name to its score; None is allowed as a value.
+    # NOTE(review): presumably None marks a metric that failed to compute — confirm.
+    metrics: dict[str, float | None]
+    # Error text for the sample; defaults to the empty string.
+    error: str = ""
+
+
+@dataclass(slots=True)
+class EvaluationResult:
+    """Aggregate result object returned after a scenario completes.
+
+    All fields are required; none has a default.
+    """
+
+    # The scenario this result belongs to.
+    scenario: Scenario
+    run_id: str
+    # NOTE(review): timestamps are stored as strings, presumably ISO 8601 — confirm.
+    started_at: str
+    finished_at: str
+    # Samples split by whether they could be evaluated.
+    valid_samples: list[NormalizedSample]
+    invalid_samples: list[InvalidSample]
+    # NOTE(review): presumably one flat row per scored sample — verify in the runner.
+    score_rows: list[dict[str, Any]]
+
+
+@dataclass(slots=True)
+class RunArtifactPaths:
+    """Canonical file-system paths for all artifacts produced by one run.
+
+    Every field is a required ``Path``; this class only carries the paths and
+    performs no file-system access itself.
+    """
+
+    # NOTE(review): presumably the directory containing the files below — confirm.
+    root_dir: Path
+    scenario_snapshot: Path
+    scores_csv: Path
+    invalid_csv: Path
+    summary_md: Path
+    metadata_json: Path
+    metadata_json: Path
--- a/rag_eval/shared/utils.py
+++ b/rag_eval/shared/utils.py
@@ -0,0 +1,49 @@
+"""General-purpose helpers shared across configuration, datasets, and reporting."""
+
+from __future__ import annotations
+
+import ast
+import json
+import math
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+
+def utc_now_iso() -> str:
+    """Return the current time as a timezone-aware UTC ISO 8601 string."""
+    current = datetime.now(tz=timezone.utc)
+    return current.isoformat()
+
+
+def ensure_directory(path: Path) -> None:
+    """Create *path* and any missing parents; do nothing if it already exists."""
+    path.mkdir(exist_ok=True, parents=True)
+
+
+def parse_contexts(value: Any) -> list[str]:
+    """Normalize a context payload into a list of non-empty strings.
+
+    Accepted inputs, tried in this order:
+
+    * a ``list`` -- each item is stringified and stripped, blanks dropped;
+    * ``None`` or a float NaN -- treated as "no contexts";
+    * text holding a serialized list (JSON first, then a Python literal);
+    * text containing blank lines -- split into one context per paragraph;
+    * any other non-blank text -- returned as a single context.
+
+    Text that merely resembles a literal but cannot be parsed is kept as
+    plain text; this function does not raise for malformed strings.
+    """
+    if isinstance(value, list):
+        return [text for item in value if (text := str(item).strip())]
+    if value is None or (isinstance(value, float) and math.isnan(value)):
+        return []
+
+    text = str(value).strip()
+    if not text:
+        return []
+
+    # Accept serialized lists from CSV exports before falling back to plain text.
+    for parser in (json.loads, ast.literal_eval):
+        try:
+            parsed = parser(text)
+        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
+            # json.JSONDecodeError is a ValueError. ast.literal_eval can also
+            # raise TypeError (e.g. "{[1]}", an unhashable set element),
+            # RecursionError or MemoryError on malformed or deeply nested
+            # input; all of these simply mean "not a serialized list".
+            continue
+        if isinstance(parsed, list):
+            return [entry for item in parsed if (entry := str(item).strip())]
+
+    # Preserve paragraph-style context dumps by splitting on blank lines first.
+    if "\n\n" in text:
+        chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
+        if chunks:
+            return chunks
+
+    return [text]