first commit

This commit is contained in:
2026-06-12 14:02:15 +08:00
commit 9cbdc1d95d
69 changed files with 9486 additions and 0 deletions

5
rag_eval/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
"""Public package exports for the RAG evaluation toolkit."""
from .execution.runner import run_scenario
__all__ = ["run_scenario"]

View File

@@ -0,0 +1,7 @@
"""Adapter implementations that connect evaluation flows to target applications."""
from .base import AppAdapter
from .http import HttpAppAdapter
from .python import PythonFunctionAdapter
__all__ = ["AppAdapter", "HttpAppAdapter", "PythonFunctionAdapter"]

37
rag_eval/adapters/base.py Normal file
View File

@@ -0,0 +1,37 @@
"""Shared adapter interfaces for online application execution."""
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any
from rag_eval.shared.models import NormalizedSample
class AppAdapter(ABC):
    """Abstract base class for adapters that fetch answers and contexts from apps.

    Subclasses implement :meth:`run`. :meth:`enrich_sample` calls it and folds
    the response into a new ``NormalizedSample``.
    """

    @abstractmethod
    async def run(self, question: str, **kwargs: Any) -> dict[str, Any]:
        """Execute the target application for a single question.

        Returns:
            A dict. ``enrich_sample`` reads its ``answer``, ``contexts`` and
            ``raw_response`` keys; all of them are optional.
        """
        raise NotImplementedError

    @staticmethod
    def _normalize_response(response: dict[str, Any]) -> tuple[str, list[str]]:
        """Extract a clean ``(answer, contexts)`` pair from an adapter response.

        Args:
            response: The dict returned by :meth:`run`.

        Returns:
            The stripped answer text and the list of non-empty, stripped
            context fragments.
        """
        raw_answer = response.get("answer")
        # A null answer must stay empty; str(None) would yield the text "None".
        answer = "" if raw_answer is None else str(raw_answer).strip()

        raw_contexts = response.get("contexts") or []
        if isinstance(raw_contexts, str):
            # A single passage returned as a bare string is one fragment,
            # not a sequence of one-character fragments.
            raw_contexts = [raw_contexts]

        # Drop empty context fragments so downstream metrics receive clean lists.
        contexts: list[str] = []
        for item in raw_contexts:
            if item is None:
                continue
            text = str(item).strip()
            if text:
                contexts.append(text)
        return answer, contexts

    async def enrich_sample(self, sample: NormalizedSample) -> NormalizedSample:
        """Merge adapter output into an existing normalized sample.

        The sample's metadata is forwarded to :meth:`run` as keyword
        arguments. The returned sample keeps every field of ``sample`` except
        ``answer`` and ``contexts``, which come from the adapter, and
        ``metadata``, which additionally carries ``raw_response``.
        """
        response = await self.run(question=sample.question, **sample.metadata)
        answer, normalized_contexts = self._normalize_response(response)
        return NormalizedSample(
            sample_id=sample.sample_id,
            question=sample.question,
            contexts=normalized_contexts,
            answer=answer,
            ground_truth=sample.ground_truth,
            scenario=sample.scenario,
            language=sample.language,
            retrieval_config=sample.retrieval_config,
            metadata={**sample.metadata, "raw_response": response.get("raw_response")},
            raw=sample.raw,
        )

45
rag_eval/adapters/http.py Normal file
View File

@@ -0,0 +1,45 @@
"""HTTP adapter implementation for online evaluation scenarios."""
from __future__ import annotations
from typing import Any
import httpx
from rag_eval.shared.models import AppAdapterConfig
from .base import AppAdapter
class HttpAppAdapter(AppAdapter):
    """Call an HTTP endpoint and map its JSON response into the normalized adapter shape."""

    def __init__(self, config: AppAdapterConfig):
        """Store the HTTP adapter configuration for later requests."""
        self.config = config

    async def run(self, question: str, **kwargs: Any) -> dict[str, Any]:
        """Send one HTTP request and return the normalized response payload.

        The JSON request body is layered in this order, later entries winning:
        ``request_template``, the ``question``, ``static_kwargs``, then the
        per-call ``kwargs``.

        Returns:
            A dict with ``answer``, ``contexts`` and ``raw_response`` (the
            decoded JSON body).

        Raises:
            httpx.HTTPStatusError: If the endpoint answers with an error status.
            TypeError: If the response body is not a JSON object.
        """
        # Local import keeps this fix self-contained within the block.
        from pathlib import PurePath

        payload = dict(self.config.request_template)
        payload["question"] = question
        payload.update(self.config.static_kwargs)
        payload.update(kwargs)
        # The scenario loader resolves ``*_path`` static kwargs into Path
        # objects, which the JSON encoder cannot serialize.
        payload = {
            key: str(value) if isinstance(value, PurePath) else value
            for key, value in payload.items()
        }

        async with httpx.AsyncClient(timeout=self.config.timeout_seconds) as client:
            response = await client.request(
                self.config.method.upper(),
                self.config.endpoint or "",
                json=payload,
            )
            response.raise_for_status()
            body = response.json()

        if not isinstance(body, dict):
            # Fail with a clear message instead of an AttributeError on .get().
            raise TypeError(
                f"HTTP adapter expected a JSON object response, got {type(body).__name__}."
            )

        # Allow scenario config to rename answer/context fields without custom code.
        mapping = self.config.response_mapping or {}
        answer_key = mapping.get("answer", "answer")
        contexts_key = mapping.get("contexts", "contexts")
        return {
            "answer": body.get(answer_key, ""),
            "contexts": body.get(contexts_key, []),
            "raw_response": body,
        }

View File

@@ -0,0 +1,38 @@
"""Python callable adapter for in-process application integrations."""
from __future__ import annotations
from importlib import import_module
from typing import Any, Callable
from rag_eval.shared.models import AppAdapterConfig
from .base import AppAdapter
class PythonFunctionAdapter(AppAdapter):
    """Wrap a configured Python callable so it can participate in online evaluation."""

    def __init__(self, config: AppAdapterConfig):
        """Load and cache the configured callable during adapter initialization."""
        self.config = config
        self._callable = self._load_callable(config.callable or "")

    @staticmethod
    def _load_callable(target: str) -> Callable[..., dict[str, Any]]:
        """Resolve a `module:function` target into a callable object.

        Raises:
            ValueError: If ``target`` is not of the form ``module:function``.
            TypeError: If the resolved attribute is not callable.
        """
        module_name, _, attr_name = target.partition(":")
        if not module_name or not attr_name:
            raise ValueError("Python adapter callable must use module:function syntax.")
        module = import_module(module_name)
        fn = getattr(module, attr_name)
        if not callable(fn):
            raise TypeError(f"Configured callable is not callable: {target}")
        return fn

    async def run(self, question: str, **kwargs: Any) -> dict[str, Any]:
        """Invoke the configured callable and enforce the adapter response contract.

        Per-call ``kwargs`` override ``static_kwargs`` on key collisions, the
        same precedence the HTTP adapter applies to its request payload.
        Callables that return an awaitable (for example ``async def``
        functions) are awaited before the result is checked.

        Raises:
            TypeError: If the final result is not a dict.
        """
        # Local import keeps this fix self-contained within the block.
        import inspect

        # Merge first so a key present in both mappings does not raise
        # "got multiple values for keyword argument".
        call_kwargs = {**self.config.static_kwargs, **kwargs}
        result = self._callable(question=question, **call_kwargs)
        if inspect.isawaitable(result):
            result = await result
        if not isinstance(result, dict):
            raise TypeError("Python adapter callable must return a dict.")
        return result

39
rag_eval/compat.py Normal file
View File

@@ -0,0 +1,39 @@
"""Compatibility helpers for optional third-party import paths."""
from __future__ import annotations
import sys
import types
def ensure_ragas_import_compat() -> None:
    """Make `langchain_community.chat_models.vertexai` importable for ragas.

    Ragas imports that chat module eagerly, even when only OpenAI is used,
    while some `langchain_community` builds no longer ship it. When the module
    is neither already loaded nor importable, a stand-in module exposing an
    empty `ChatVertexAI` class is registered in `sys.modules`, so nothing in
    site-packages has to be modified.
    """
    target = "langchain_community.chat_models.vertexai"
    if target not in sys.modules:
        try:
            import langchain_community.chat_models.vertexai  # type: ignore # noqa: F401
        except ModuleNotFoundError:
            # The real module is unavailable: register a minimal shim instead.
            class ChatVertexAI:  # pragma: no cover - only used for import compatibility
                """Compatibility shim for environments that do not ship ChatVertexAI."""

            placeholder = types.ModuleType(target)
            placeholder.ChatVertexAI = ChatVertexAI
            sys.modules[target] = placeholder

View File

@@ -0,0 +1,5 @@
"""Scenario configuration loading utilities."""
from .loader import load_scenario
__all__ = ["load_scenario"]

67
rag_eval/config/loader.py Normal file
View File

@@ -0,0 +1,67 @@
"""Scenario file loading and conversion into internal runtime models."""
from __future__ import annotations
from pathlib import Path
import yaml
from rag_eval.shared.models import AppAdapterConfig, DatasetConfig, RuntimeConfig, Scenario
from .schema import ScenarioModel
from .validators import validate_scenario
def _resolve_static_kwargs_paths(base_dir: Path, raw_kwargs: dict[str, object]) -> dict[str, object]:
"""Resolve adapter static kwargs that look like relative file-system paths."""
resolved: dict[str, object] = {}
for key, value in raw_kwargs.items():
if key.endswith("_path") and isinstance(value, str):
candidate = Path(value)
resolved[key] = candidate if candidate.is_absolute() else (base_dir / candidate).resolve()
continue
resolved[key] = value
return resolved
def load_scenario(path: str | Path) -> Scenario:
    """Read a scenario YAML file and build the validated runtime ``Scenario``.

    The file is parsed with ``yaml.safe_load`` (an empty file is treated as an
    empty mapping), validated through ``ScenarioModel``, and converted into
    the runtime dataclasses. Relative dataset, output and ``*_path`` adapter
    entries are resolved against the directory containing the scenario file.
    ``validate_scenario`` is applied to the result before it is returned.
    """
    source = Path(path).resolve()
    anchor = source.parent
    raw = yaml.safe_load(source.read_text(encoding="utf-8"))
    parsed = ScenarioModel.model_validate(raw or {})

    adapter_model = parsed.app_adapter
    if adapter_model is None:
        adapter_config = None
    else:
        # Convert the validated Pydantic model into the lightweight runtime dataclass.
        adapter_config = AppAdapterConfig(
            type=adapter_model.type,
            endpoint=adapter_model.endpoint,
            method=adapter_model.method,
            timeout_seconds=adapter_model.timeout_seconds,
            callable=adapter_model.callable,
            request_template=adapter_model.request_template,
            response_mapping=adapter_model.response_mapping,
            static_kwargs=_resolve_static_kwargs_paths(anchor, adapter_model.static_kwargs),
        )

    runtime_model = parsed.runtime
    runtime_config = RuntimeConfig(
        batch_size=runtime_model.batch_size,
        app_concurrency=runtime_model.app_concurrency,
        metric_concurrency=runtime_model.metric_concurrency,
        max_samples=runtime_model.max_samples,
    )

    resolved = Scenario(
        scenario_name=parsed.scenario_name,
        mode=parsed.mode,
        app_adapter=adapter_config,
        dataset=DatasetConfig(path=parsed.resolve_path(anchor, parsed.dataset)),
        judge_model=parsed.judge_model,
        embedding_model=parsed.embedding_model,
        metrics=parsed.metrics,
        output_dir=parsed.resolve_path(anchor, parsed.output_dir),
        runtime=runtime_config,
        source_path=source,
    )
    # Cross-field checks run only once every relative path has been resolved.
    validate_scenario(resolved)
    return resolved

78
rag_eval/config/schema.py Normal file
View File

@@ -0,0 +1,78 @@
"""Pydantic schemas used to validate raw scenario configuration files."""
from __future__ import annotations
from pathlib import Path
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
class RuntimeConfigModel(BaseModel):
    """Runtime section of a scenario file: batching, concurrency and sampling."""

    # Keys not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    batch_size: int = 4
    # The three settings below are optional and default to None.
    app_concurrency: int | None = None
    metric_concurrency: int | None = None
    max_samples: int | None = None
class AppAdapterConfigModel(BaseModel):
    """Adapter section of an online scenario file.

    ``type`` selects the adapter; ``endpoint`` is mandatory for ``http`` and
    ``callable`` is mandatory for ``python``.
    """

    # Keys not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    type: Literal["http", "python"]
    endpoint: str | None = None
    method: str = "POST"
    timeout_seconds: int = 30
    callable: str | None = None
    request_template: dict[str, Any] = Field(default_factory=dict)
    response_mapping: dict[str, str] = Field(default_factory=dict)
    static_kwargs: dict[str, Any] = Field(default_factory=dict)

    @model_validator(mode="after")
    def validate_shape(self) -> "AppAdapterConfigModel":
        """Reject a configuration that lacks the field its adapter type needs."""
        # adapter type -> (required attribute, error raised when it is empty)
        requirements = {
            "http": ("endpoint", "HTTP adapter requires endpoint."),
            "python": ("callable", "Python adapter requires callable."),
        }
        required = requirements.get(self.type)
        if required is not None:
            attribute, message = required
            if not getattr(self, attribute):
                raise ValueError(message)
        return self
class ScenarioModel(BaseModel):
    """Top-level schema of a user-authored evaluation scenario file."""

    # Keys not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    scenario_name: str
    mode: Literal["offline", "online"]
    app_adapter: AppAdapterConfigModel | None = None
    dataset: str
    judge_model: str
    embedding_model: str
    metrics: list[str]
    output_dir: str
    runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)

    @field_validator("metrics")
    @classmethod
    def ensure_metrics_not_empty(cls, value: list[str]) -> list[str]:
        """Require at least one requested metric."""
        if value:
            return value
        raise ValueError("metrics must not be empty.")

    @model_validator(mode="after")
    def validate_mode_requirements(self) -> "ScenarioModel":
        """Require an adapter whenever the scenario runs in online mode."""
        adapter_missing = self.app_adapter is None
        if adapter_missing and self.mode == "online":
            raise ValueError("online mode requires app_adapter.")
        return self

    def resolve_path(self, base_dir: Path, raw_path: str) -> Path:
        """Return ``raw_path`` as given when absolute, else resolved against ``base_dir``."""
        target = Path(raw_path)
        return target if target.is_absolute() else (base_dir / target).resolve()

View File

@@ -0,0 +1,20 @@
"""Cross-field validation helpers for resolved runtime scenarios."""
from __future__ import annotations
from rag_eval.metrics.registry import SUPPORTED_METRICS
from rag_eval.shared.models import Scenario
def validate_scenario(scenario: Scenario) -> None:
    """Check a resolved scenario for unsupported metrics and invalid settings.

    Raises:
        ValueError: If a requested metric is not in ``SUPPORTED_METRICS``, if
            an offline scenario defines an adapter, or if
            ``runtime.batch_size`` is below 1.
    """
    unknown = [metric for metric in scenario.metrics if metric not in SUPPORTED_METRICS]
    if unknown:
        listing = ", ".join(sorted(SUPPORTED_METRICS))
        raise ValueError(
            f"Unsupported metrics: {', '.join(unknown)}. Supported metrics: {listing}"
        )

    has_adapter = scenario.app_adapter is not None
    if has_adapter and scenario.mode == "offline":
        raise ValueError("offline mode should not define app_adapter.")

    if scenario.runtime.batch_size < 1:
        raise ValueError("runtime.batch_size must be >= 1.")

View File

@@ -0,0 +1,5 @@
"""Dataset build workflow for converting PDFs into reviewable online question banks."""
from .runner import run_dataset_build
__all__ = ["run_dataset_build"]

View File

@@ -0,0 +1,5 @@
"""Question generation components for draft online datasets."""
from .question_generator import OpenAIQuestionGenerator, QuestionGenerator
__all__ = ["OpenAIQuestionGenerator", "QuestionGenerator"]

View File

@@ -0,0 +1,173 @@
"""LLM-backed question generator for dataset build jobs."""
from __future__ import annotations
import json
from abc import ABC, abstractmethod
from typing import Any
from openai import OpenAI
from rag_eval.dataset_builder.models import DraftQuestionSample, ParsedDocument, SourceChunk
from rag_eval.settings import EvaluationSettings
class QuestionGenerator(ABC):
    """Interface implemented by components that draft questions from a parsed document."""

    @abstractmethod
    def generate(
        self,
        document: ParsedDocument,
        *,
        max_questions: int,
        max_chunks_per_question: int,
        job_name: str,
    ) -> list[DraftQuestionSample]:
        """Return the draft question samples produced for ``document``.

        All limits and the job name are keyword-only.
        """
        raise NotImplementedError
class OpenAIQuestionGenerator(QuestionGenerator):
    """Generate draft questions with an OpenAI-compatible chat completion API."""

    def __init__(self, settings: EvaluationSettings, model: str, client: OpenAI | None = None):
        """Initialize the OpenAI-compatible client and target generation model.

        Args:
            settings: Provides ``openai_api_key`` and ``openai_client_kwargs``.
            model: Model name sent with every completion request.
            client: Optional pre-built client; one is created from
                ``settings.openai_client_kwargs`` when omitted.

        Raises:
            EnvironmentError: If ``settings.openai_api_key`` is empty.
        """
        if not settings.openai_api_key:
            raise EnvironmentError("OPENAI_API_KEY must be set before generating draft questions.")
        self.client = client or OpenAI(**settings.openai_client_kwargs)
        self.model = model

    @staticmethod
    def _clean_text(value: Any) -> str:
        """Return ``value`` as stripped text, mapping ``None`` to an empty string."""
        return "" if value is None else str(value).strip()

    def _build_prompt(
        self,
        document: ParsedDocument,
        *,
        max_questions: int,
        max_chunks_per_question: int,
    ) -> str:
        """Build a constrained JSON-generation prompt for one document."""
        # Each chunk is serialized on its own, so the final prompt carries
        # "chunks" as a list of JSON-encoded strings.
        chunk_lines: list[str] = [
            json.dumps(
                {
                    "chunk_id": chunk.chunk_id,
                    "section_path": chunk.section_path,
                    "page_start": chunk.page_start,
                    "page_end": chunk.page_end,
                    "text": chunk.text,
                },
                ensure_ascii=False,
            )
            for chunk in document.source_chunks
        ]
        instructions = {
            "task": "Generate reviewable online evaluation draft questions from one document only.",
            "rules": [
                "Return JSON only.",
                f"Generate at most {max_questions} samples.",
                f"Each sample may cite at most {max_chunks_per_question} chunk ids.",
                "Every sample must stay within this document and use existing chunk ids only.",
                "Allowed question_type values: fact, summary, procedure, comparison.",
                "Allowed difficulty values: easy, medium, hard.",
            ],
            "output_schema": {
                "samples": [
                    {
                        "question": "string",
                        "ground_truth": "string",
                        "source_chunk_ids": ["chunk-id"],
                        "question_type": "fact|summary|procedure|comparison",
                        "difficulty": "easy|medium|hard",
                    }
                ]
            },
            "document": {
                "doc_id": document.doc_id,
                "doc_name": document.doc_name,
                "chunks": chunk_lines,
            },
        }
        return json.dumps(instructions, ensure_ascii=False, indent=2)

    def _build_sample(
        self,
        *,
        document: ParsedDocument,
        payload: dict[str, Any],
        index: int,
        job_name: str,
    ) -> DraftQuestionSample:
        """Convert one model output object into the internal draft sample model.

        Null or non-string fields in the model output are tolerated: ``None``
        becomes an empty string (or the field default) instead of the literal
        text "None", and a scalar ``source_chunk_ids`` is treated as a single
        id. Page range and section path are derived from the cited chunks
        that exist in ``document``.
        """
        question = self._clean_text(payload.get("question"))
        ground_truth = self._clean_text(payload.get("ground_truth"))

        raw_chunk_ids = payload.get("source_chunk_ids") or []
        if not isinstance(raw_chunk_ids, (list, tuple)):
            # A bare id must not be split into single characters.
            raw_chunk_ids = [raw_chunk_ids]
        source_chunk_ids = [
            text for text in (self._clean_text(item) for item in raw_chunk_ids) if text
        ]

        chunk_lookup: dict[str, SourceChunk] = {item.chunk_id: item for item in document.source_chunks}
        chunks = [chunk_lookup[item] for item in source_chunk_ids if item in chunk_lookup]
        section_path = chunks[0].section_path if chunks else ""
        page_start = min((chunk.page_start for chunk in chunks), default=0)
        page_end = max((chunk.page_end for chunk in chunks), default=0)

        # Any character in U+4E00..U+9FFF marks the question as Chinese.
        language = "zh" if any("\u4e00" <= char <= "\u9fff" for char in question) else "en"

        return DraftQuestionSample(
            sample_id=f"{document.doc_id}-q{index}",
            question=question,
            ground_truth=ground_truth,
            scenario=job_name,
            language=language,
            doc_id=document.doc_id,
            doc_name=document.doc_name,
            section_path=section_path,
            page_start=page_start,
            page_end=page_end,
            source_chunk_ids=source_chunk_ids,
            question_type=self._clean_text(payload.get("question_type")) or "fact",
            difficulty=self._clean_text(payload.get("difficulty")) or "medium",
        )

    @staticmethod
    def _parse_response_payload(content: str) -> list[dict[str, Any]]:
        """Parse the model response into a list of sample payload dictionaries.

        Entries of ``samples`` that are not JSON objects are dropped.

        Raises:
            ValueError: If the content is not valid JSON, is not a JSON
                object, or its ``samples`` field is not a list.
        """
        try:
            payload = json.loads(content or "{}")
        except json.JSONDecodeError as exc:
            raise ValueError("Question generator returned invalid JSON.") from exc
        if not isinstance(payload, dict):
            raise ValueError("Question generator response must be a JSON object.")
        samples = payload.get("samples") or []
        if not isinstance(samples, list):
            raise ValueError("Question generator response field 'samples' must be a list.")
        return [item for item in samples if isinstance(item, dict)]

    def generate(
        self,
        document: ParsedDocument,
        *,
        max_questions: int,
        max_chunks_per_question: int,
        job_name: str,
    ) -> list[DraftQuestionSample]:
        """Generate draft questions for one parsed document.

        Sends a single chat completion request in JSON-object mode and
        converts at most ``max_questions`` returned objects into samples,
        numbered from 1.
        """
        prompt = self._build_prompt(
            document,
            max_questions=max_questions,
            max_chunks_per_question=max_chunks_per_question,
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You generate structured draft question banks from source documents."},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
        )
        # A response without choices is treated like an empty JSON object.
        choices = response.choices or []
        content = (choices[0].message.content if choices else None) or "{}"
        payload = self._parse_response_payload(content)
        return [
            self._build_sample(document=document, payload=item, index=index, job_name=job_name)
            for index, item in enumerate(payload[:max_questions], start=1)
        ]

View File

@@ -0,0 +1,87 @@
"""Validation and deduplication helpers for generated draft question samples."""
from __future__ import annotations
import re
from difflib import SequenceMatcher
from rag_eval.dataset_builder.models import DraftQuestionSample, ParsedDocument
# Closed vocabularies accepted for generated samples.
ALLOWED_QUESTION_TYPES = {"fact", "summary", "procedure", "comparison"}
ALLOWED_DIFFICULTIES = {"easy", "medium", "hard"}


def validate_draft_sample(
    sample: DraftQuestionSample,
    *,
    document: ParsedDocument,
    max_source_chunks_per_question: int | None = None,
) -> list[str]:
    """Collect every problem found in one generated sample.

    Checks, in order: non-blank question and ground truth, at least one cited
    chunk, the optional cap on cited chunks, that each cited chunk exists in
    ``document``, that the sample belongs to ``document``, and that question
    type and difficulty are allowed values.

    Returns:
        Human-readable error messages; an empty list means the sample is valid.
    """
    problems: list[str] = []
    cited = sample.source_chunk_ids

    if not sample.question.strip():
        problems.append("question is empty")
    if not sample.ground_truth.strip():
        problems.append("ground_truth is empty")
    if not cited:
        problems.append("source_chunk_ids is empty")

    over_limit = (
        max_source_chunks_per_question is not None
        and len(cited) > max_source_chunks_per_question
    )
    if over_limit:
        problems.append(
            f"source_chunk_ids exceeds limit: {len(sample.source_chunk_ids)} > {max_source_chunks_per_question}"
        )

    known_ids = {chunk.chunk_id for chunk in document.source_chunks}
    problems.extend(
        f"unknown source chunk: {chunk_id}" for chunk_id in cited if chunk_id not in known_ids
    )

    if sample.doc_id != document.doc_id:
        problems.append("sample doc_id does not match source document")
    if sample.question_type not in ALLOWED_QUESTION_TYPES:
        problems.append(f"unsupported question_type: {sample.question_type}")
    if sample.difficulty not in ALLOWED_DIFFICULTIES:
        problems.append(f"unsupported difficulty: {sample.difficulty}")
    return problems
def normalize_question_text(text: str) -> str:
    """Return ``text`` lowercased, with whitespace runs collapsed and the ends trimmed."""
    collapsed = re.sub(r"\s+", " ", text)
    trimmed = collapsed.strip()
    return trimmed.lower()
def dedupe_samples(samples: list[DraftQuestionSample]) -> list[DraftQuestionSample]:
    """Drop duplicate questions and enforce one output per chunk group per document.

    A sample is kept only when, within its document, neither its normalized
    question text nor its exact ordered tuple of source chunk ids has been
    kept before. The first occurrence wins and input order is preserved.

    Args:
        samples: Draft samples, possibly spanning several documents.

    Returns:
        The retained samples in their original relative order.
    """
    # The earlier fuzzy ground-truth comparison was removed: it only inspected
    # entries sharing the sample's (doc_id, chunk tuple), and every such entry
    # had already caused a rejection through ``seen_chunk_groups``, so it
    # could never match.
    deduped: list[DraftQuestionSample] = []
    seen_questions: set[tuple[str, str]] = set()
    seen_chunk_groups: set[tuple[str, tuple[str, ...]]] = set()
    for sample in samples:
        question_key = (sample.doc_id, normalize_question_text(sample.question))
        if question_key in seen_questions:
            continue
        chunk_group_key = (sample.doc_id, tuple(sample.source_chunk_ids))
        if chunk_group_key in seen_chunk_groups:
            continue
        seen_questions.add(question_key)
        seen_chunk_groups.add(chunk_group_key)
        deduped.append(sample)
    return deduped

View File

@@ -0,0 +1,203 @@
"""Internal data models for the PDF-to-dataset build workflow."""
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Literal
ReviewStatus = Literal["draft", "approved", "rejected", "needs_edit"]
QuestionType = Literal["fact", "summary", "procedure", "comparison"]
Difficulty = Literal["easy", "medium", "hard"]
FailureMode = Literal["fail", "skip"]
@dataclass(slots=True)
class DatasetBuildRuntime:
"""Runtime controls for one dataset build job."""
max_documents: int | None = None
@dataclass(slots=True)
class DatasetBuildJob:
    """Fully resolved configuration for one dataset build, as consumed by the runner."""

    job_name: str
    input_path: Path
    input_glob: str
    parser_provider: str
    failure_mode: FailureMode
    generation_model: str
    output_type: str
    review_mode: str
    max_questions_per_document: int
    max_source_chunks_per_question: int
    dataset_path: Path
    artifact_dir: Path
    runtime: DatasetBuildRuntime = field(default_factory=DatasetBuildRuntime)
    source_path: Path | None = None

    def snapshot(self) -> dict[str, Any]:
        """Return the job as a dict whose path fields are POSIX-style strings."""
        record = asdict(self)
        # The three required paths are always present.
        for name in ("input_path", "dataset_path", "artifact_dir"):
            record[name] = getattr(self, name).as_posix()
        # source_path is optional and stays None when unset.
        if self.source_path is not None:
            record["source_path"] = self.source_path.as_posix()
        return record
@dataclass(slots=True)
class StructureNode:
"""One normalized structure heading extracted from layout results."""
node_id: str
level: int
title: str
page_start: int
page_end: int
section_path: str
@dataclass(slots=True)
class SemanticBlock:
"""One merged semantic block used as an intermediate artifact before chunking."""
block_id: str
doc_id: str
doc_name: str
text: str
page_start: int
page_end: int
section_path: str
section_title: str
source_layout_ids: list[str]
def to_record(self) -> dict[str, Any]:
"""Convert the block into a flat artifact record."""
return asdict(self)
@dataclass(slots=True)
class SourceChunk:
"""Evidence chunk used for question generation and human review."""
chunk_id: str
doc_id: str
doc_name: str
text: str
page_start: int
page_end: int
section_path: str
section_title: str
source_layout_ids: list[str]
def to_record(self) -> dict[str, Any]:
"""Convert the chunk into a flat artifact record."""
return asdict(self)
@dataclass(slots=True)
class ParsedDocument:
"""Normalized parsed document ready for question generation."""
doc_id: str
doc_name: str
raw_text: str
structure_nodes: list[StructureNode]
semantic_blocks: list[SemanticBlock]
source_chunks: list[SourceChunk]
metadata: dict[str, Any] = field(default_factory=dict)
def to_record(self) -> dict[str, Any]:
"""Convert the parsed document into a summary artifact record."""
return {
"doc_id": self.doc_id,
"doc_name": self.doc_name,
"raw_text": self.raw_text,
"structure_nodes": [asdict(item) for item in self.structure_nodes],
"metadata": self.metadata,
"semantic_block_count": len(self.semantic_blocks),
"source_chunk_count": len(self.source_chunks),
}
@dataclass(slots=True)
class DraftQuestionSample:
"""One draft online evaluation sample pending manual review."""
sample_id: str
question: str
ground_truth: str
scenario: str
language: str
doc_id: str
doc_name: str
section_path: str
page_start: int
page_end: int
source_chunk_ids: list[str]
question_type: QuestionType
difficulty: Difficulty
review_status: ReviewStatus = "draft"
review_notes: str = ""
def to_record(self) -> dict[str, Any]:
"""Convert the draft sample into a flat CSV row."""
return {
"sample_id": self.sample_id,
"question": self.question,
"ground_truth": self.ground_truth,
"scenario": self.scenario,
"language": self.language,
"doc_id": self.doc_id,
"doc_name": self.doc_name,
"section_path": self.section_path,
"page_start": self.page_start,
"page_end": self.page_end,
"source_chunk_ids": self.source_chunk_ids,
"question_type": self.question_type,
"difficulty": self.difficulty,
"review_status": self.review_status,
"review_notes": self.review_notes,
}
@dataclass(slots=True)
class ParseFailure:
"""One document parse failure recorded for reporting and skip-mode execution."""
file_path: str
error: str
def to_record(self) -> dict[str, str]:
"""Convert the failure into a flat CSV row."""
return asdict(self)
@dataclass(slots=True)
class DatasetBuildArtifactPaths:
"""Canonical file paths produced by one dataset build run."""
root_dir: Path
documents_jsonl: Path
semantic_blocks_jsonl: Path
source_chunks_jsonl: Path
dataset_draft_csv: Path
parse_failures_csv: Path
metadata_json: Path
@dataclass(slots=True)
class DatasetBuildResult:
"""Aggregate result object returned after a dataset build completes."""
job: DatasetBuildJob
run_id: str
artifact_paths: DatasetBuildArtifactPaths
documents: list[ParsedDocument]
draft_samples: list[DraftQuestionSample]
parse_failures: list[ParseFailure]

View File

@@ -0,0 +1,78 @@
"""Utilities for converting draft online datasets into offline smoke-test datasets."""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
import pandas as pd
from rag_eval.shared.utils import ensure_directory
def _load_jsonl(path: Path) -> list[dict[str, Any]]:
"""Load a JSONL file into a list of dictionaries."""
rows: list[dict[str, Any]] = []
with path.open("r", encoding="utf-8") as handle:
for line in handle:
text = line.strip()
if not text:
continue
rows.append(json.loads(text))
return rows
def _parse_chunk_ids(value: Any) -> list[str]:
    """Decode a ``source_chunk_ids`` cell into a list of chunk id strings.

    Accepts an actual list, a JSON array string, a Python list repr (the form
    produced when a list is written to CSV without JSON encoding), or a
    single bare id. Anything else, including an empty or NaN cell, yields an
    empty list.
    """
    # Local import keeps this fix self-contained within the block.
    import ast

    if isinstance(value, list):
        candidates: Any = value
    elif isinstance(value, str) and value.strip():
        try:
            candidates = json.loads(value)
        except json.JSONDecodeError:
            try:
                # Handles single-quoted reprs such as "['chunk-1', 'chunk-2']".
                candidates = ast.literal_eval(value)
            except (ValueError, TypeError, SyntaxError):
                candidates = value
        if not isinstance(candidates, (list, tuple)):
            candidates = [candidates]
    else:
        candidates = []
    return [str(item).strip() for item in candidates if item is not None and str(item).strip()]


def build_offline_smoke_dataset(
    *,
    draft_dataset_path: Path,
    source_chunks_path: Path,
    output_path: Path,
) -> Path:
    """Derive an offline-evaluable dataset by reusing ground truth as answer and chunk text as contexts.

    Args:
        draft_dataset_path: CSV of draft samples.
        source_chunks_path: JSONL of source chunks carrying ``chunk_id`` and ``text``.
        output_path: Destination CSV; its parent directory is created if needed.

    Returns:
        ``output_path``, after the CSV has been written.
    """
    draft_frame = pd.read_csv(draft_dataset_path)
    chunk_rows = _load_jsonl(source_chunks_path)
    chunk_lookup = {str(row["chunk_id"]): row for row in chunk_rows}
    output_rows: list[dict[str, Any]] = []
    for _, row in draft_frame.iterrows():
        parsed_chunk_ids = _parse_chunk_ids(row.get("source_chunk_ids"))
        # Cited chunks that are unknown or have blank text contribute no context.
        contexts = [
            str(chunk_lookup[chunk_id]["text"]).strip()
            for chunk_id in parsed_chunk_ids
            if chunk_id in chunk_lookup and str(chunk_lookup[chunk_id]["text"]).strip()
        ]
        raw_ground_truth = row.get("ground_truth", "")
        # read_csv yields NaN for empty cells; str(NaN) would store "nan".
        ground_truth = "" if pd.isna(raw_ground_truth) else str(raw_ground_truth).strip()
        output_rows.append(
            {
                "sample_id": row.get("sample_id", ""),
                "question": row.get("question", ""),
                "contexts": json.dumps(contexts, ensure_ascii=False),
                "answer": ground_truth,
                "ground_truth": ground_truth,
                "scenario": row.get("scenario", ""),
                "language": row.get("language", ""),
                "retrieval_config": "offline-smoke-from-pdf-build",
                "doc_id": row.get("doc_id", ""),
                "doc_name": row.get("doc_name", ""),
                "section_path": row.get("section_path", ""),
                "page_start": row.get("page_start", ""),
                "page_end": row.get("page_end", ""),
                "source_chunk_ids": row.get("source_chunk_ids", ""),
                "question_type": row.get("question_type", ""),
                "difficulty": row.get("difficulty", ""),
                "review_status": row.get("review_status", ""),
                "review_notes": row.get("review_notes", ""),
            }
        )
    ensure_directory(output_path.parent)
    pd.DataFrame(output_rows).to_csv(output_path, index=False)
    return output_path

View File

@@ -0,0 +1,7 @@
"""Parser integrations and layout normalization helpers for dataset build jobs."""
from .aliyun_document_parser import AliyunDocumentParser
from .aliyun_docmind_gateway import AliyunDocmindGateway
from .aliyun_layout_normalizer import normalize_layouts
__all__ = ["AliyunDocumentParser", "AliyunDocmindGateway", "normalize_layouts"]

View File

@@ -0,0 +1,202 @@
"""Gateway abstraction for Alibaba Cloud document parsing workflows."""
from __future__ import annotations
import time
from pathlib import Path
from typing import Any
try:
from alibabacloud_docmind_api20220711 import models as docmind_models
from alibabacloud_docmind_api20220711.client import Client as DocmindClient
from alibabacloud_tea_openapi import models as openapi_models
from alibabacloud_tea_util import models as runtime_models
except ImportError:
# Keep Alibaba SDK optional so offline flows and tests can import this module.
DocmindClient = None
docmind_models = None
openapi_models = None
runtime_models = None
try:
from alibabacloud_credentials.client import Client as CredentialClient
except ImportError:
CredentialClient = None
from rag_eval.settings import EvaluationSettings
class AliyunDocmindGateway:
"""Thin gateway interface around the external Alibaba document parser service."""
def __init__(self, settings: EvaluationSettings):
"""Store parser-related settings needed by the gateway implementation."""
self.settings = settings
self._client = None
self._models = None
self._runtime_models = None
def _load_sdk(self) -> tuple[Any, Any, Any, Any]:
    """Return the SDK entry points captured by the module's optional import.

    Returns:
        ``(DocmindClient, openapi_models, docmind_models, runtime_models)``.

    Raises:
        ImportError: If any of the four names is ``None``.
    """
    sdk_parts = (DocmindClient, openapi_models, docmind_models, runtime_models)
    if any(part is None for part in sdk_parts):
        raise ImportError(
            "Alibaba Cloud Docmind SDK is not installed. "
            "Install alibabacloud-docmind-api20220711, "
            "alibabacloud-tea-openapi, alibabacloud-tea-util, and "
            "alibabacloud-credentials."
        )
    return sdk_parts
def _resolve_credentials(self) -> tuple[str, str]:
"""Resolve AccessKey credentials from settings or the Alibaba credentials client."""
if self.settings.alibaba_access_key_id and self.settings.alibaba_access_key_secret:
return self.settings.alibaba_access_key_id, self.settings.alibaba_access_key_secret
if CredentialClient is None:
raise ImportError(
"Alibaba Cloud credentials SDK is not installed and no explicit "
"ALIBABA_ACCESS_KEY_ID / ALIBABA_ACCESS_KEY_SECRET were provided."
)
credential_client = CredentialClient()
credential = credential_client.get_credential()
return credential.get_access_key_id(), credential.get_access_key_secret()
def _init_client(self) -> Any:
"""Create and cache the underlying Alibaba SDK client."""
if self._client is not None:
return self._client
client_class, openapi_models, docmind_models, runtime_models = self._load_sdk()
access_key_id, access_key_secret = self._resolve_credentials()
endpoint = (self.settings.alibaba_endpoint or "docmind-api.cn-hangzhou.aliyuncs.com").strip()
config = openapi_models.Config(
access_key_id=access_key_id,
access_key_secret=access_key_secret,
)
config.endpoint = endpoint
config.region_id = "cn-hangzhou"
config.type = "access_key"
self._client = client_class(config)
self._models = docmind_models
self._runtime_models = runtime_models
return self._client
@staticmethod
def _to_plain_dict(value: Any) -> dict[str, Any]:
"""Convert SDK response objects into ordinary dictionaries."""
if value is None:
return {}
if isinstance(value, dict):
return value
if hasattr(value, "to_map"):
return value.to_map()
if hasattr(value, "__dict__"):
return {
key: getattr(value, key)
for key in vars(value)
if not key.startswith("_")
}
return {}
@staticmethod
def _extract_layouts(payload: Any) -> list[dict[str, Any]]:
"""Convert layout collections from SDK payloads into plain dictionaries."""
if payload is None:
return []
if isinstance(payload, dict):
layouts = payload.get("layouts") or payload.get("Layouts") or []
else:
layouts = getattr(payload, "layouts", None) or getattr(payload, "Layouts", None) or []
normalized: list[dict[str, Any]] = []
for item in layouts:
normalized.append(AliyunDocmindGateway._to_plain_dict(item))
return normalized
def submit_parse_task(self, pdf_path: Path) -> str:
    """Upload one PDF to the parse service and return the remote task id.

    Raises:
        RuntimeError: If the response data carries neither ``id`` nor ``Id``.
    """
    sdk_client = self._init_client()
    runtime_options = self._runtime_models.RuntimeOptions()
    display_name = pdf_path.name
    extension = pdf_path.suffix.lstrip(".").lower() or "pdf"

    with pdf_path.open("rb") as stream:
        job_request = self._models.SubmitDocParserJobAdvanceRequest(
            file_url_object=stream,
            file_name=display_name,
            file_name_extension=extension,
            llm_enhancement=self.settings.aliyun_llm_enhancement,
            enhancement_mode=self.settings.aliyun_enhancement_mode,
        )
        # The request only holds the open stream, so the call is issued
        # while the file handle is still open.
        response = sdk_client.submit_doc_parser_job_advance(job_request, runtime_options)

    response_body = getattr(response, "body", None)
    payload = self._to_plain_dict(getattr(response_body, "data", None))
    task_id = payload.get("id") or payload.get("Id")
    if task_id:
        return str(task_id)
    raise RuntimeError(f"Aliyun submit_doc_parser_job_advance returned no task id for {pdf_path.name}")
def get_task_status(self, task_id: str) -> dict[str, Any]:
    """Query the remote service for a parse task and return its status payload.

    The returned dict always exposes the state under the lower-case
    ``status`` key when the response provided it as ``Status`` instead.
    """
    sdk_client = self._init_client()
    status_request = self._models.QueryDocParserStatusRequest(id=task_id)
    response = sdk_client.query_doc_parser_status(status_request)

    response_body = getattr(response, "body", None)
    payload = self._to_plain_dict(getattr(response_body, "data", None))

    # Mirror a capitalised "Status" onto "status" so callers read one key.
    if "status" not in payload:
        capitalised = payload.get("Status")
        if capitalised is not None:
            payload["status"] = capitalised
    return payload
def fetch_layouts(self, task_id: str) -> list[dict[str, Any]]:
    """Page through the parse result of a task and return every layout.

    Pages are requested with a step size taken from the settings and
    clamped to the range 1..3000. Paging stops at the first page that is
    empty or shorter than the step size.
    """
    sdk_client = self._init_client()
    page_size = min(max(1, self.settings.aliyun_parse_layout_step_size), 3000)

    gathered: list[dict[str, Any]] = []
    offset = 0
    more_pages = True
    while more_pages:
        page_request = self._models.GetDocParserResultRequest(
            id=task_id,
            layout_step_size=page_size,
            layout_num=offset,
        )
        response = sdk_client.get_doc_parser_result(page_request)
        response_body = getattr(response, "body", None)
        page = self._extract_layouts(getattr(response_body, "data", None))

        gathered.extend(page)
        offset += len(page)
        # An empty or short page means the service has nothing further.
        more_pages = len(page) >= page_size
    return gathered
def parse_document(self, pdf_path: Path) -> dict[str, Any]:
    """Run the submit/poll/fetch cycle and return a raw parse payload.

    The task is submitted once and its status is polled until it reaches a
    terminal state or the configured timeout elapses. Poll interval and
    timeout come from the settings and are both floored at one second.

    Returns:
        A dict with ``task_id``, the lower-cased ``status``, ``doc_id`` and
        ``doc_name`` (falling back to the file stem / name), the fetched
        ``layouts`` and the full status payload under ``metadata``.

    Raises:
        RuntimeError: If the remote task reports a failure state.
        TimeoutError: If no terminal state is reached within the timeout.
    """
    task_id = self.submit_parse_task(pdf_path)
    started_at = time.monotonic()
    poll_interval = max(1, self.settings.aliyun_parse_poll_interval_seconds)
    timeout_seconds = max(1, self.settings.aliyun_parse_timeout_seconds)

    success_states = {"succeeded", "success", "finished"}
    # "fail" is included because the status is compared after lower-casing;
    # without it a task reported as "Fail" would be polled until the timeout
    # and surface as a misleading TimeoutError.
    failure_states = {"failed", "fail", "error"}

    while True:
        status = self.get_task_status(task_id)
        state = str(status.get("status", "")).lower()
        if state in success_states:
            layouts = self.fetch_layouts(task_id)
            return {
                "task_id": task_id,
                "status": state,
                "doc_id": status.get("doc_id") or pdf_path.stem,
                "doc_name": status.get("doc_name") or pdf_path.name,
                "layouts": layouts,
                "metadata": status,
            }
        if state in failure_states:
            raise RuntimeError(f"Aliyun parse task failed for {pdf_path.name}: {status}")
        if time.monotonic() - started_at > timeout_seconds:
            raise TimeoutError(f"Aliyun parse task timed out for {pdf_path.name}")
        time.sleep(poll_interval)

View File

@@ -0,0 +1,38 @@
"""Document parser that normalizes Alibaba layout results into internal models."""
from __future__ import annotations
from pathlib import Path
from rag_eval.dataset_builder.models import ParsedDocument
from .aliyun_docmind_gateway import AliyunDocmindGateway
from .aliyun_layout_normalizer import normalize_layouts
class AliyunDocumentParser:
    """Turn PDFs into parsed documents using the Alibaba gateway plus layout normalization."""

    def __init__(self, gateway: AliyunDocmindGateway):
        """Keep a reference to the gateway that performs the remote parse."""
        self.gateway = gateway

    def parse(self, pdf_path: Path) -> ParsedDocument:
        """Parse one PDF remotely and normalize its layouts into a document.

        The document id and name come from the gateway payload, falling back
        to the file stem and file name. The returned document's metadata is
        extended with the remote task id and the provider name.

        Raises:
            ValueError: If the gateway payload contains no layouts.
        """
        payload = self.gateway.parse_document(pdf_path)
        layouts = payload.get("layouts") or []
        if not layouts:
            raise ValueError(f"No layouts returned for document: {pdf_path.name}")

        resolved_id = str(payload.get("doc_id") or pdf_path.stem)
        resolved_name = str(payload.get("doc_name") or pdf_path.name)
        document = normalize_layouts(
            doc_id=resolved_id,
            doc_name=resolved_name,
            layouts=list(layouts),
        )

        provenance = {
            "task_id": payload.get("task_id"),
            "provider": "aliyun_docmind",
        }
        document.metadata.update(provenance)
        return document

View File

@@ -0,0 +1,181 @@
"""Normalization helpers that convert raw layout results into source chunks."""
from __future__ import annotations
import re
from typing import Any
from rag_eval.dataset_builder.models import ParsedDocument, SemanticBlock, SourceChunk, StructureNode
def _clean_text(value: Any) -> str:
"""Normalize free-form layout text into a compact string."""
if value is None:
return ""
return re.sub(r"\s+", " ", str(value)).strip()
def _is_catalog_entry(item_type: str, text: str) -> bool:
"""Detect table-of-contents style entries that should be skipped."""
lowered = text.lower()
return item_type == "toc" or "目录" in text or lowered.startswith("table of contents")
def _flatten_table(item: dict[str, Any]) -> str:
"""Convert a table layout node into a searchable plain-text representation."""
rows = item.get("rows") or []
flattened_rows: list[str] = []
for row in rows:
cells = [str(cell).strip() for cell in row if str(cell).strip()]
if cells:
flattened_rows.append(" | ".join(cells))
return "\n".join(flattened_rows)
def _split_text(text: str, max_chars: int = 1200, overlap: int = 150) -> list[str]:
"""Split long text into overlapping windows so each chunk stays reviewable."""
if len(text) <= max_chars:
return [text]
windows: list[str] = []
start = 0
while start < len(text):
end = min(len(text), start + max_chars)
windows.append(text[start:end].strip())
if end >= len(text):
break
start = max(end - overlap, start + 1)
return [window for window in windows if window]
def normalize_layouts(
    *,
    doc_id: str,
    doc_name: str,
    layouts: list[dict[str, Any]],
    max_chunk_chars: int = 1200,
    overlap_chars: int = 150,
) -> ParsedDocument:
    """Convert raw layouts into structure nodes, semantic blocks, and source chunks.

    Each layout dict is read through the keys ``type``, ``page``,
    ``layout_id``, ``level`` and ``text`` (tables are flattened through
    ``_flatten_table``). Headings open a new section and close the block in
    progress; every other non-empty, non-catalogue item is appended to the
    current block. A finished block is emitted as one ``SemanticBlock`` and
    split into one or more ``SourceChunk`` objects of at most
    ``max_chunk_chars`` characters that overlap by ``overlap_chars``.

    Returns:
        A ``ParsedDocument`` whose ``raw_text`` is the newline-joined chunk
        text and whose metadata holds the layout, node, block and chunk counts.
    """
    structure_nodes: list[StructureNode] = []
    semantic_blocks: list[SemanticBlock] = []
    source_chunks: list[SourceChunk] = []
    section_stack: list[tuple[int, str]] = []
    current_block_text: list[str] = []
    current_block_layout_ids: list[str] = []
    current_page_start: int | None = None
    current_page_end: int | None = None
    current_section_path = ""
    current_section_title = ""

    def int_field(item: dict[str, Any], key: str, default: int = 1) -> int:
        """Read an integer field, treating a missing, ``None`` or blank value as ``default``."""
        raw = item.get(key)
        if raw is None or raw == "":
            return default
        return int(raw)

    def flush_block() -> None:
        """Finalize the in-progress semantic block and emit source chunks."""
        nonlocal current_block_text, current_block_layout_ids, current_page_start, current_page_end

        text = _clean_text(" ".join(current_block_text))
        if text and current_page_start is not None and current_page_end is not None:
            block_id = f"{doc_id}-block-{len(semantic_blocks) + 1}"
            semantic_blocks.append(
                SemanticBlock(
                    block_id=block_id,
                    doc_id=doc_id,
                    doc_name=doc_name,
                    text=text,
                    page_start=current_page_start,
                    page_end=current_page_end,
                    section_path=current_section_path,
                    section_title=current_section_title,
                    source_layout_ids=list(current_block_layout_ids),
                )
            )
            heading_prefix = current_section_title.strip()
            chunk_parts = _split_text(text, max_chars=max_chunk_chars, overlap=overlap_chars)
            for index, part in enumerate(chunk_parts, start=1):
                # Prefix the section title unless the chunk already begins with it.
                if heading_prefix and not part.startswith(heading_prefix):
                    chunk_text = f"{heading_prefix}\n{part}".strip()
                else:
                    chunk_text = part
                # Every chunk carries the page range and layout ids of its whole block.
                source_chunks.append(
                    SourceChunk(
                        chunk_id=f"{block_id}-chunk-{index}",
                        doc_id=doc_id,
                        doc_name=doc_name,
                        text=chunk_text,
                        page_start=current_page_start,
                        page_end=current_page_end,
                        section_path=current_section_path,
                        section_title=current_section_title,
                        source_layout_ids=list(current_block_layout_ids),
                    )
                )

        # Reset the accumulator whether or not a block was emitted.
        current_block_text = []
        current_block_layout_ids = []
        current_page_start = None
        current_page_end = None

    for index, item in enumerate(layouts, start=1):
        item_type = str(item.get("type", "paragraph")).lower()
        # An explicit null page/level falls back to 1 instead of raising,
        # matching how a null layout_id is tolerated just below.
        page = int_field(item, "page")
        layout_id = str(item.get("layout_id") or f"layout-{index}")
        level = int_field(item, "level")

        if item_type == "table":
            text = _flatten_table(item)
        else:
            text = _clean_text(item.get("text"))
        if not text or _is_catalog_entry(item_type, text):
            continue

        if item_type == "heading":
            # Close the previous block under the old section before switching.
            flush_block()
            # Drop headings at the same or a deeper level so the stack is the ancestor path.
            while section_stack and section_stack[-1][0] >= level:
                section_stack.pop()
            section_stack.append((level, text))
            current_section_title = text
            current_section_path = " > ".join(title for _, title in section_stack)
            structure_nodes.append(
                StructureNode(
                    node_id=f"{doc_id}-node-{len(structure_nodes) + 1}",
                    level=level,
                    title=text,
                    page_start=page,
                    page_end=page,
                    section_path=current_section_path,
                )
            )
            continue

        if item_type == "caption":
            text = f"图注: {text}"
        if current_page_start is None:
            current_page_start = page
        current_page_end = page
        current_block_text.append(text)
        current_block_layout_ids.append(layout_id)

    # Emit whatever is still accumulated after the last layout.
    flush_block()

    raw_text = "\n".join(chunk.text for chunk in source_chunks)
    metadata = {
        "layout_count": len(layouts),
        "structure_node_count": len(structure_nodes),
        "semantic_block_count": len(semantic_blocks),
        "source_chunk_count": len(source_chunks),
    }
    return ParsedDocument(
        doc_id=doc_id,
        doc_name=doc_name,
        raw_text=raw_text,
        structure_nodes=structure_nodes,
        semantic_blocks=semantic_blocks,
        source_chunks=source_chunks,
        metadata=metadata,
    )

View File

@@ -0,0 +1,142 @@
"""Orchestration layer for PDF-to-dataset build jobs."""
from __future__ import annotations
from pathlib import Path
import yaml
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.utils import ensure_directory, utc_now_iso
from .generator.question_generator import OpenAIQuestionGenerator, QuestionGenerator
from .generator.validators import dedupe_samples, validate_draft_sample
from .models import DatasetBuildJob, DatasetBuildResult, DatasetBuildRuntime, ParseFailure
from .parser.aliyun_document_parser import AliyunDocumentParser
from .parser.aliyun_docmind_gateway import AliyunDocmindGateway
from .schema import DatasetBuildConfigModel
from .sources import discover_pdf_files
from .writers import build_artifact_paths, write_dataset_build_artifacts
def load_dataset_build_job(path: str | Path, settings: EvaluationSettings | None = None) -> DatasetBuildJob:
    """Read a dataset build YAML file, validate it, and build the job description.

    Relative input and output paths are resolved against the directory of
    the config file. The generation model falls back from the config to the
    settings and finally to ``"qwen3.6-plus"``; the parser failure mode
    falls back from the config to the settings and finally to ``"fail"``.
    """
    settings = settings or EvaluationSettings()
    config_path = Path(path).resolve()
    raw_yaml = config_path.read_text(encoding="utf-8")
    payload = yaml.safe_load(raw_yaml) or {}
    model = DatasetBuildConfigModel.model_validate(payload)
    base_dir = config_path.parent

    generation_model = model.generation.model or settings.dataset_generator_model or "qwen3.6-plus"
    parser_section = payload.get("parser") or {}
    failure_mode = parser_section.get("failure_mode") or settings.parser_failure_mode or "fail"

    generation = model.generation
    return DatasetBuildJob(
        job_name=model.job_name,
        input_path=model.resolve_path(base_dir, model.input.path),
        input_glob=model.input.glob,
        parser_provider=model.parser.provider,
        failure_mode=failure_mode,
        generation_model=generation_model,
        output_type=generation.output_type,
        review_mode=generation.review_mode,
        max_questions_per_document=generation.max_questions_per_document,
        max_source_chunks_per_question=generation.max_source_chunks_per_question,
        dataset_path=model.resolve_path(base_dir, model.output.dataset_path),
        artifact_dir=model.resolve_path(base_dir, model.output.artifact_dir),
        runtime=DatasetBuildRuntime(max_documents=model.runtime.max_documents),
        source_path=config_path,
    )
def _create_parser(job: DatasetBuildJob, settings: EvaluationSettings) -> AliyunDocumentParser:
"""Create the configured document parser implementation."""
if job.parser_provider != "aliyun_docmind":
raise ValueError(f"Unsupported parser provider: {job.parser_provider}")
gateway = AliyunDocmindGateway(settings)
return AliyunDocumentParser(gateway)
def _create_generator(job: DatasetBuildJob, settings: EvaluationSettings) -> QuestionGenerator:
    """Build the draft question generator for the job's generation model."""
    model_name = job.generation_model
    return OpenAIQuestionGenerator(settings=settings, model=model_name)
def run_dataset_build(
    config_path: str | Path,
    *,
    settings: EvaluationSettings | None = None,
    parser: AliyunDocumentParser | None = None,
    generator: QuestionGenerator | None = None,
) -> DatasetBuildResult:
    """Execute a dataset build job from config to written artifacts.

    PDFs are discovered, optionally capped by ``runtime.max_documents``,
    parsed one by one, and turned into draft samples that are validated and
    de-duplicated per document. A parse error is recorded as a failure; when
    the job's failure mode is ``"fail"`` the artifacts collected so far are
    written and the error is re-raised, otherwise the file is skipped.

    ``parser`` and ``generator`` may be injected; when omitted they are
    created from the job and settings.
    """
    settings = settings or EvaluationSettings()
    job = load_dataset_build_job(config_path, settings=settings)

    pdf_files = discover_pdf_files(job.input_path, job.input_glob)
    document_cap = job.runtime.max_documents
    if document_cap is not None:
        pdf_files = pdf_files[:document_cap]

    parser = parser or _create_parser(job, settings)
    generator = generator or _create_generator(job, settings)

    run_id = utc_now_iso().replace(":", "-")
    artifact_root = job.artifact_dir / run_id
    ensure_directory(artifact_root)
    artifact_paths = build_artifact_paths(artifact_root)

    documents = []
    failures: list[ParseFailure] = []
    draft_samples = []

    def snapshot() -> DatasetBuildResult:
        """Bundle the state collected so far into a result object."""
        return DatasetBuildResult(
            job=job,
            run_id=run_id,
            artifact_paths=artifact_paths,
            documents=documents,
            draft_samples=draft_samples,
            parse_failures=failures,
        )

    for pdf_path in pdf_files:
        try:
            document = parser.parse(pdf_path)
        except Exception as exc:
            failures.append(ParseFailure(file_path=pdf_path.as_posix(), error=str(exc)))
            if job.failure_mode == "fail":
                # Persist the partial run before propagating the parse error.
                write_dataset_build_artifacts(snapshot())
                raise
            continue

        documents.append(document)
        generated = generator.generate(
            document,
            max_questions=job.max_questions_per_document,
            max_chunks_per_question=job.max_source_chunks_per_question,
            job_name=job.job_name,
        )
        # Keep only the samples for which validation reports no errors.
        accepted = [
            sample
            for sample in generated
            if not validate_draft_sample(
                sample,
                document=document,
                max_source_chunks_per_question=job.max_source_chunks_per_question,
            )
        ]
        draft_samples.extend(dedupe_samples(accepted)[: job.max_questions_per_document])

    result = snapshot()
    write_dataset_build_artifacts(result)
    return result

View File

@@ -0,0 +1,82 @@
"""Pydantic schemas for dataset build YAML configuration files."""
from __future__ import annotations
from pathlib import Path
from typing import Literal
from pydantic import BaseModel, ConfigDict, Field, model_validator
class DatasetBuildInputModel(BaseModel):
    """Where to look for input PDFs: a path plus a glob pattern."""

    model_config = ConfigDict(extra="ignore")

    # Input file or directory, as written in the config.
    path: str
    # Pattern used when scanning a directory.
    glob: str = Field(default="*.pdf")
class DatasetBuildParserModel(BaseModel):
    """Which parser provider to use and how parse failures are handled."""

    model_config = ConfigDict(extra="ignore")

    provider: Literal["aliyun_docmind"]
    # Left unset (None) when the config does not choose a failure mode.
    failure_mode: Literal["fail", "skip"] | None = Field(default=None)
class DatasetBuildGenerationModel(BaseModel):
    """Controls for draft question generation."""

    model_config = ConfigDict(extra="ignore")

    # Optional generation model name; None when the config does not set one.
    model: str | None = Field(default=None)
    output_type: Literal["online_question_bank"]
    review_mode: Literal["draft_with_manual_review"]
    # Both limits must be strictly positive.
    max_questions_per_document: int = Field(10, gt=0)
    max_source_chunks_per_question: int = Field(3, gt=0)
class DatasetBuildOutputModel(BaseModel):
    """Output locations of a dataset build: the dataset file and the artifact directory."""

    model_config = ConfigDict(extra="ignore")

    # Both values are kept as the raw strings from the config.
    dataset_path: str
    artifact_dir: str
class DatasetBuildRuntimeModel(BaseModel):
    """Runtime limits for a dataset build."""

    model_config = ConfigDict(extra="ignore")

    # Optional cap on processed documents; must be positive when given.
    max_documents: int | None = Field(None, gt=0)
class DatasetBuildConfigModel(BaseModel):
    """Root schema of a dataset build job file."""

    model_config = ConfigDict(extra="ignore")

    job_name: str
    input: DatasetBuildInputModel
    parser: DatasetBuildParserModel
    generation: DatasetBuildGenerationModel
    output: DatasetBuildOutputModel
    # The runtime section is optional and defaults to an empty runtime model.
    runtime: DatasetBuildRuntimeModel = Field(default_factory=DatasetBuildRuntimeModel)

    @model_validator(mode="after")
    def validate_job_name(self) -> "DatasetBuildConfigModel":
        """Refuse job names that are empty or whitespace-only."""
        if self.job_name.strip():
            return self
        raise ValueError("job_name must not be empty.")

    def resolve_path(self, base_dir: Path, raw_path: str) -> Path:
        """Return ``raw_path`` unchanged when absolute, else resolved under ``base_dir``."""
        candidate = Path(raw_path)
        return candidate if candidate.is_absolute() else (base_dir / candidate).resolve()

View File

@@ -0,0 +1,21 @@
"""Input source discovery helpers for dataset build jobs."""
from __future__ import annotations
from pathlib import Path
def discover_pdf_files(input_path: Path, pattern: str = "*.pdf") -> list[Path]:
    """List the PDF files designated by ``input_path``.

    A file path must itself be a PDF and is returned as a one-element list.
    A directory is scanned with ``pattern``; only regular files with a
    ``.pdf`` suffix (any letter case) are kept, in sorted order.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If a file input is not a PDF or a directory scan finds none.
    """

    def has_pdf_suffix(candidate: Path) -> bool:
        return candidate.suffix.lower() == ".pdf"

    if not input_path.exists():
        raise FileNotFoundError(f"Input path does not exist: {input_path}")

    if input_path.is_file():
        if has_pdf_suffix(input_path):
            return [input_path]
        raise ValueError(f"Input file is not a PDF: {input_path}")

    matches = [
        candidate
        for candidate in input_path.glob(pattern)
        if candidate.is_file() and has_pdf_suffix(candidate)
    ]
    matches.sort()
    if matches:
        return matches
    raise ValueError(f"No PDF files found under {input_path} with pattern {pattern}")

View File

@@ -0,0 +1,147 @@
"""Artifact writers for dataset build runs."""
from __future__ import annotations
import csv
import json
import shutil
from pathlib import Path
from typing import Any
from rag_eval.shared.utils import ensure_directory
from .models import DatasetBuildArtifactPaths, DatasetBuildResult
def build_artifact_paths(root_dir: Path) -> DatasetBuildArtifactPaths:
    """Derive the fixed set of artifact file paths located under ``root_dir``."""
    file_names = {
        "documents_jsonl": "documents.jsonl",
        "semantic_blocks_jsonl": "semantic_blocks.jsonl",
        "source_chunks_jsonl": "source_chunks.jsonl",
        "dataset_draft_csv": "dataset_draft.csv",
        "parse_failures_csv": "parse_failures.csv",
        "metadata_json": "metadata.json",
    }
    located = {field: root_dir / file_name for field, file_name in file_names.items()}
    return DatasetBuildArtifactPaths(root_dir=root_dir, **located)
def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
"""Write a list of dictionaries as JSON Lines."""
with path.open("w", encoding="utf-8") as handle:
for row in rows:
handle.write(json.dumps(row, ensure_ascii=False) + "\n")
def _write_csv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str] | None = None) -> None:
"""Write flat records into a CSV file, including list values as JSON strings."""
normalized_rows: list[dict[str, Any]] = []
resolved_fieldnames = list(fieldnames or [])
for row in rows:
normalized_row: dict[str, Any] = {}
for key, value in row.items():
if key not in resolved_fieldnames:
resolved_fieldnames.append(key)
if isinstance(value, list):
normalized_row[key] = json.dumps(value, ensure_ascii=False)
else:
normalized_row[key] = value
normalized_rows.append(normalized_row)
with path.open("w", encoding="utf-8", newline="") as handle:
writer = csv.DictWriter(handle, fieldnames=resolved_fieldnames or ["placeholder"])
writer.writeheader()
if normalized_rows:
writer.writerows(normalized_rows)
def _write_latest_alias_assets(result: DatasetBuildResult) -> None:
    """Copy selected run artifacts into ``<artifact_dir>/latest`` under stable names.

    The run directory itself is left untouched; the copies give tutorials
    and sample scenarios a fixed location for the most recent build output.
    """
    latest_dir = result.job.artifact_dir / "latest"
    ensure_directory(latest_dir)

    paths = result.artifact_paths
    aliases = (
        (paths.source_chunks_jsonl, "source_chunks.jsonl"),
        (paths.dataset_draft_csv, "dataset_draft.csv"),
        (paths.metadata_json, "metadata.json"),
    )
    for source_file, alias_name in aliases:
        shutil.copyfile(source_file, latest_dir / alias_name)
def write_dataset_build_artifacts(result: DatasetBuildResult) -> None:
    """Write every output file of a dataset build run.

    In order: the documents, semantic blocks and source chunks as JSON
    Lines; the draft samples as CSV both inside the run directory and at the
    job's dataset path; the parse failures as CSV; a metadata JSON with the
    run id, job snapshot and counts; and finally the ``latest`` alias copies.
    """
    artifact_paths = result.artifact_paths
    ensure_directory(artifact_paths.root_dir)
    ensure_directory(result.job.dataset_path.parent)

    documents = result.documents
    _write_jsonl(artifact_paths.documents_jsonl, [document.to_record() for document in documents])
    _write_jsonl(
        artifact_paths.semantic_blocks_jsonl,
        [block.to_record() for document in documents for block in document.semantic_blocks],
    )
    _write_jsonl(
        artifact_paths.source_chunks_jsonl,
        [chunk.to_record() for document in documents for chunk in document.source_chunks],
    )

    # Both draft CSV files share the same leading column order.
    draft_columns = [
        "sample_id",
        "question",
        "ground_truth",
        "scenario",
        "language",
        "doc_id",
        "doc_name",
        "section_path",
        "page_start",
        "page_end",
        "source_chunk_ids",
        "question_type",
        "difficulty",
        "review_status",
        "review_notes",
    ]
    draft_rows = [sample.to_record() for sample in result.draft_samples]
    for draft_target in (artifact_paths.dataset_draft_csv, result.job.dataset_path):
        _write_csv(draft_target, draft_rows, fieldnames=draft_columns)

    _write_csv(
        artifact_paths.parse_failures_csv,
        [failure.to_record() for failure in result.parse_failures],
        fieldnames=["file_path", "error"],
    )

    stats = {
        "documents_processed": len(result.documents),
        "draft_samples": len(result.draft_samples),
        "parse_failures": len(result.parse_failures),
    }
    metadata = {
        "run_id": result.run_id,
        "job": result.job.snapshot(),
        "stats": stats,
    }
    artifact_paths.metadata_json.write_text(
        json.dumps(metadata, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    _write_latest_alias_assets(result)

View File

@@ -0,0 +1,5 @@
"""Execution entrypoints for running evaluation scenarios."""
from .runner import run_scenario
__all__ = ["run_scenario"]

View File

@@ -0,0 +1,23 @@
"""Async helpers for executing bounded concurrent workloads."""
from __future__ import annotations
import asyncio
from typing import Awaitable, Callable, TypeVar
T = TypeVar("T")
async def gather_with_limit(
    factories: list[Callable[[], Awaitable[T]]],
    limit: int,
) -> list[T]:
    """Await every factory's result while running at most ``limit`` at a time.

    ``limit`` is floored at 1. Results are returned in the order of
    ``factories``.
    """
    gate = asyncio.Semaphore(max(1, limit))

    async def run_one(make_awaitable: Callable[[], Awaitable[T]]) -> T:
        """Invoke one factory only after acquiring a slot."""
        async with gate:
            return await make_awaitable()

    pending = [run_one(make_awaitable) for make_awaitable in factories]
    return await asyncio.gather(*pending)

View File

@@ -0,0 +1,6 @@
"""Custom exceptions raised during scenario execution."""
class ScenarioExecutionError(RuntimeError):
    """Signals that executing a scenario did not complete successfully."""

View File

@@ -0,0 +1,125 @@
"""Core evaluation workflow for offline and online scenarios."""
from __future__ import annotations
import asyncio
from typing import Any
from rag_eval.adapters.base import AppAdapter
from rag_eval.datasets.loader import load_dataset_records
from rag_eval.datasets.normalizers import normalize_records
from rag_eval.execution.concurrency import gather_with_limit
from rag_eval.metrics.pipeline import MetricPipeline
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
from rag_eval.shared.utils import utc_now_iso
class Evaluator:
    """Coordinate dataset loading, optional app execution, and metric scoring."""

    def __init__(
        self,
        scenario: Scenario,
        metric_pipeline: MetricPipeline,
        app_adapter: AppAdapter | None = None,
    ):
        """Create an evaluator for one resolved scenario.

        ``app_adapter`` is only required when the scenario runs in online mode.
        """
        self.scenario = scenario
        self.metric_pipeline = metric_pipeline
        self.app_adapter = app_adapter

    def evaluate(self) -> EvaluationResult:
        """Execute the full evaluation flow and return the collected results.

        Records are loaded and normalized, enriched through the adapter in
        online mode, and scored by the metric pipeline. The run id is derived
        from the finish timestamp and is stamped on the result as well as on
        every score row.

        Raises:
            ValueError: In online mode when no app adapter was provided.
        """
        started_at = utc_now_iso()
        raw_records = load_dataset_records(self.scenario.dataset.path)
        samples, invalid_samples = normalize_records(
            raw_records,
            mode=self.scenario.mode,
            max_samples=self.scenario.runtime.max_samples,
        )
        if self.scenario.mode == "online":
            # Online mode enriches each sample by calling the target application first.
            samples, online_invalids = asyncio.run(self._enrich_online_samples(samples))
            invalid_samples.extend(online_invalids)
        metric_scores = asyncio.run(
            self.metric_pipeline.score_samples(
                samples,
                max_concurrency=self.scenario.runtime.metric_limit(),
            )
        )
        finished_at = utc_now_iso()
        # Computed before the rows are merged so each row carries the same
        # identifier as the result instead of the scenario name.
        run_id = finished_at.replace(":", "-")
        score_rows = [
            self._merge_score(sample, score, run_id=run_id)
            for sample, score in zip(samples, metric_scores)
        ]
        return EvaluationResult(
            scenario=self.scenario,
            run_id=run_id,
            started_at=started_at,
            finished_at=finished_at,
            valid_samples=samples,
            invalid_samples=invalid_samples,
            score_rows=score_rows,
        )

    async def _enrich_online_samples(
        self,
        samples: list[NormalizedSample],
    ) -> tuple[list[NormalizedSample], list[InvalidSample]]:
        """Populate answers and contexts by calling the configured application adapter.

        Returns:
            The samples that came back with a non-empty answer and contexts,
            and the invalid samples for adapter errors or incomplete payloads.

        Raises:
            ValueError: If no app adapter is configured.
        """
        if self.app_adapter is None:
            raise ValueError("online mode requires an app adapter.")
        valid: list[NormalizedSample] = []
        invalid: list[InvalidSample] = []

        async def enrich_with_capture(sample: NormalizedSample) -> NormalizedSample | InvalidSample:
            """Convert adapter exceptions into invalid samples instead of aborting the run."""
            try:
                return await self.app_adapter.enrich_sample(sample)
            except Exception as exc:
                error_type = type(exc).__name__
                return InvalidSample(
                    sample_id=sample.sample_id,
                    error=f"adapter failed [{error_type}]: {exc}",
                    raw=sample.raw,
                )

        # Bind each sample as a default argument so every factory keeps its own sample.
        factories = [
            (lambda sample=sample: enrich_with_capture(sample))
            for sample in samples
        ]
        results = await gather_with_limit(factories, self.scenario.runtime.app_limit())
        for sample in results:
            if isinstance(sample, InvalidSample):
                invalid.append(sample)
                continue
            # Treat incomplete adapter payloads as invalid so reporting stays explicit.
            errors: list[str] = []
            if not sample.answer:
                errors.append("adapter returned empty answer")
            if not sample.contexts:
                errors.append("adapter returned empty contexts")
            if errors:
                invalid.append(
                    InvalidSample(
                        sample_id=sample.sample_id,
                        error="; ".join(errors),
                        raw=sample.raw,
                    )
                )
                continue
            valid.append(sample)
        return valid, invalid

    def _merge_score(
        self,
        sample: NormalizedSample,
        score: Any,
        run_id: str | None = None,
    ) -> dict[str, Any]:
        """Combine sample data, metric results, and run metadata into one output row.

        Args:
            sample: The scored sample; its record and contexts seed the row.
            score: Object exposing ``metrics`` (a mapping) and ``error``.
            run_id: Identifier of the current run. When omitted, the scenario
                name is used, which is what earlier callers received.
        """
        record = sample.to_record()
        record["contexts"] = sample.contexts
        record.update(score.metrics)
        record["error"] = score.error
        record["judge_model"] = self.scenario.judge_model
        record["embedding_model"] = self.scenario.embedding_model
        record["run_id"] = run_id if run_id is not None else self.scenario.scenario_name
        return record

View File

@@ -0,0 +1,42 @@
"""High-level scenario runner used by the package and CLI entrypoints."""
from __future__ import annotations
from rag_eval.adapters.http import HttpAppAdapter
from rag_eval.adapters.python import PythonFunctionAdapter
from rag_eval.config.loader import load_scenario
from rag_eval.metrics.factory import build_metric_pipeline
from rag_eval.reporting.writers import write_run_artifacts
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import Scenario
from .evaluator import Evaluator
def build_adapter(scenario: Scenario):
    """Create the app adapter described by the scenario, or ``None`` when it has none.

    Raises:
        ValueError: If the adapter type is neither ``http`` nor ``python``.
    """
    adapter_config = scenario.app_adapter
    if adapter_config is None:
        return None

    adapter_type = adapter_config.type
    if adapter_type == "http":
        return HttpAppAdapter(adapter_config)
    if adapter_type == "python":
        return PythonFunctionAdapter(adapter_config)
    raise ValueError(f"Unsupported adapter type: {adapter_type}")
def run_scenario(
    scenario_path: str,
    settings: EvaluationSettings | None = None,
):
    """Load a scenario, evaluate it, write its artifacts, and return the result.

    Raises:
        EnvironmentError: If the settings carry no OpenAI API key. This is
            checked before the scenario file is loaded.
    """
    settings = settings or EvaluationSettings()
    if not settings.openai_api_key:
        raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")

    scenario = load_scenario(scenario_path)
    evaluator = Evaluator(
        scenario=scenario,
        metric_pipeline=build_metric_pipeline(scenario, settings),
        app_adapter=build_adapter(scenario),
    )
    result = evaluator.evaluate()
    write_run_artifacts(result)
    return result

View File

@@ -0,0 +1,5 @@
"""Metric pipeline construction helpers."""
from .factory import build_metric_pipeline
__all__ = ["build_metric_pipeline"]

View File

@@ -0,0 +1,59 @@
"""Factories for OpenAI-backed RAGAS models and metric pipelines."""
from __future__ import annotations
from typing import Any
from openai import AsyncOpenAI
from rag_eval.compat import ensure_ragas_import_compat
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import Scenario
ensure_ragas_import_compat()
from ragas.embeddings.base import embedding_factory
from ragas.llms import llm_factory
from ragas.metrics.collections import (
AnswerRelevancy,
ContextPrecision,
ContextRecall,
Faithfulness,
)
from .pipeline import MetricPipeline
def build_models(
    judge_model: str,
    embedding_model: str,
    settings: EvaluationSettings,
) -> tuple[Any, Any]:
    """Build the judge LLM and the embedding model on one shared OpenAI client.

    The client is created from ``settings.openai_client_kwargs``.

    Returns:
        A ``(llm, embeddings)`` pair.
    """
    openai_client = AsyncOpenAI(**settings.openai_client_kwargs)
    judge_llm = llm_factory(judge_model, client=openai_client)
    embedder = embedding_factory(provider="openai", model=embedding_model, client=openai_client)
    return judge_llm, embedder
def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
) -> MetricPipeline:
    """Assemble a pipeline holding exactly the metrics the scenario lists.

    All four known metrics are instantiated first; the scenario's metric
    names then select from them, so an unknown name raises ``KeyError``.
    """
    llm, embeddings = build_models(
        scenario.judge_model,
        scenario.embedding_model,
        settings,
    )

    # Every known metric is created up front, then filtered by name.
    available: dict[str, Any] = {
        "faithfulness": Faithfulness(llm=llm),
        "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": ContextRecall(llm=llm),
        "context_precision": ContextPrecision(llm=llm),
    }
    selected: dict[str, Any] = {}
    for metric_name in scenario.metrics:
        selected[metric_name] = available[metric_name]

    return MetricPipeline(
        metrics=selected,
        metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
    )

View File

@@ -0,0 +1,82 @@
"""Execution pipeline for scoring normalized samples with RAGAS metrics."""
from __future__ import annotations
import asyncio
import math
from dataclasses import dataclass
from typing import Any
from rag_eval.shared.models import MetricScore, NormalizedSample
@dataclass(slots=True)
class MetricPipeline:
"""Score one or many normalized samples against a configured metric set."""
metrics: dict[str, Any]
metric_timeout_seconds: float | None = None
async def score_sample(self, sample: NormalizedSample) -> MetricScore:
"""Score a single sample and capture metric-level failures without aborting."""
results = {name: math.nan for name in self.metrics}
errors: list[str] = []
for name, metric in self.metrics.items():
try:
result = await self._run_metric(name, metric, sample)
results[name] = float(result.value)
except Exception as exc:
errors.append(f"{name}: {exc}")
return MetricScore(metrics=results, error=" | ".join(errors))
async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
"""Dispatch one metric call with the argument shape expected by that metric."""
timeout = None
if self.metric_timeout_seconds is not None:
timeout = max(1.0, float(self.metric_timeout_seconds))
if name == "faithfulness":
coroutine = metric.ascore(
user_input=sample.question,
response=sample.answer,
retrieved_contexts=sample.contexts,
)
elif name == "answer_relevancy":
coroutine = metric.ascore(
user_input=sample.question,
response=sample.answer,
)
elif name == "context_recall":
coroutine = metric.ascore(
user_input=sample.question,
retrieved_contexts=sample.contexts,
reference=sample.ground_truth,
)
elif name == "context_precision":
coroutine = metric.ascore(
user_input=sample.question,
reference=sample.ground_truth,
retrieved_contexts=sample.contexts,
)
else:
raise ValueError(f"Unsupported metric: {name}")
if timeout is None:
return await coroutine
return await asyncio.wait_for(coroutine, timeout=timeout)
async def score_samples(
    self,
    samples: list[NormalizedSample],
    max_concurrency: int,
) -> list[MetricScore]:
    """Score every sample concurrently, capped at ``max_concurrency`` in flight.

    Args:
        samples: Samples to score; results come back in the same order.
        max_concurrency: Upper bound on simultaneous ``score_sample`` calls.
            Values below 1 are treated as 1.

    Returns:
        One score per input sample, in input order.
    """
    gate = asyncio.Semaphore(max(1, max_concurrency))

    async def score_with_gate(item: NormalizedSample) -> MetricScore:
        """Hold one semaphore slot for the duration of a single scoring call."""
        await gate.acquire()
        try:
            return await self.score_sample(item)
        finally:
            gate.release()

    pending = [score_with_gate(item) for item in samples]
    return await asyncio.gather(*pending)

View File

@@ -0,0 +1,8 @@
"""Supported metric names recognized by scenario validation and pipeline setup."""
# Set of metric identifiers accepted in a scenario's metric list.
# NOTE(review): these are the same four names the metric pipeline dispatches on;
# keep the two in sync when adding a metric.
SUPPORTED_METRICS = {
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
}

View File

@@ -0,0 +1,5 @@
"""Reporting helpers that write evaluation outputs to disk."""
from .writers import write_run_artifacts
__all__ = ["write_run_artifacts"]

View File

@@ -0,0 +1,20 @@
"""Helpers for deriving file-system paths for run artifacts."""
from __future__ import annotations
from pathlib import Path
from rag_eval.shared.models import RunArtifactPaths
def build_artifact_paths(output_dir: Path, run_id: str) -> RunArtifactPaths:
    """Derive the artifact locations for one run.

    Args:
        output_dir: Base directory under which each run gets its own folder.
        run_id: Identifier used as the run's folder name.

    Returns:
        The run folder plus the fixed file paths inside it. Nothing is created
        on disk by this function.
    """
    root = output_dir / run_id
    return RunArtifactPaths(
        root_dir=root,
        scenario_snapshot=root.joinpath("scenario.snapshot.yaml"),
        scores_csv=root.joinpath("scores.csv"),
        invalid_csv=root.joinpath("invalid.csv"),
        summary_md=root.joinpath("summary.md"),
        metadata_json=root.joinpath("metadata.json"),
    )

View File

@@ -0,0 +1,78 @@
"""Markdown summary generation for completed evaluation runs."""
from __future__ import annotations
import math
import pandas as pd
from rag_eval.shared.models import EvaluationResult
def _table_from_frame(frame: pd.DataFrame) -> str:
"""Render a small dataframe as a fixed-width markdown-friendly text table."""
if frame.empty:
return "No rows."
columns = list(frame.columns)
rows = [[str(value) for value in row] for row in frame.astype(object).values.tolist()]
widths = []
for index, column in enumerate(columns):
column_width = len(str(column))
row_width = max((len(row[index]) for row in rows), default=0)
widths.append(max(column_width, row_width))
header = " | ".join(str(column).ljust(widths[idx]) for idx, column in enumerate(columns))
separator = "-|-".join("-" * widths[idx] for idx in range(len(columns)))
body = [
" | ".join(row[idx].ljust(widths[idx]) for idx in range(len(columns)))
for row in rows
]
return "\n".join([header, separator, *body])
def build_summary_markdown(result: EvaluationResult) -> str:
    """Render the markdown report for one evaluation run.

    The report has a header with run facts, a "Metric Means" section, and,
    when at least one score row exists, a "Per-sample Scores" text table.

    Args:
        result: The finished run, including its scenario and score rows.

    Returns:
        The markdown document as a single string ending in a newline.
    """
    valid_count = len(result.valid_samples)
    invalid_count = len(result.invalid_samples)
    score_frame = pd.DataFrame(result.score_rows)
    scenario = result.scenario

    report = [
        f"# {scenario.scenario_name}",
        "",
        f"- run_id: `{result.run_id}`",
        f"- mode: `{scenario.mode}`",
        f"- total_samples: `{valid_count + invalid_count}`",
        f"- valid_samples: `{valid_count}`",
        f"- invalid_samples: `{invalid_count}`",
        f"- judge_model: `{scenario.judge_model}`",
        f"- embedding_model: `{scenario.embedding_model}`",
        "",
        "## Metric Means",
        "",
    ]

    if score_frame.empty:
        report.append("No valid samples were scored.")
        return "\n".join(report) + "\n"

    for metric_name in scenario.metrics:
        average = score_frame[metric_name].mean(numeric_only=True)
        # A NaN or non-float mean means there is nothing meaningful to report.
        has_value = isinstance(average, float) and not math.isnan(average)
        shown = f"{average:.4f}" if has_value else "n/a"
        report.append(f"- {metric_name}: `{shown}`")

    # The per-sample table makes the summary readable without opening the CSV.
    per_sample = score_frame[["sample_id", *scenario.metrics, "error"]]
    report += [
        "",
        "## Per-sample Scores",
        "",
        "```text",
        _table_from_frame(per_sample),
        "```",
    ]
    return "\n".join(report) + "\n"

View File

@@ -0,0 +1,52 @@
"""Writers that persist evaluation outputs as local run artifacts."""
from __future__ import annotations
import json
import pandas as pd
import yaml
from rag_eval.reporting.artifacts import build_artifact_paths
from rag_eval.reporting.summary import build_summary_markdown
from rag_eval.shared.models import EvaluationResult
from rag_eval.shared.utils import ensure_directory
def write_run_artifacts(result: EvaluationResult) -> None:
    """Persist every standard artifact of a finished run in its run folder.

    Written in order: the scenario snapshot (YAML), the score rows (CSV), the
    invalid-sample rows (CSV), the markdown summary, and a JSON metadata file.
    Text files are written as UTF-8.

    Args:
        result: The completed evaluation whose outputs should be stored.
    """
    scenario = result.scenario
    paths = build_artifact_paths(scenario.output_dir, result.run_id)
    ensure_directory(paths.root_dir)

    snapshot_yaml = yaml.safe_dump(
        scenario.snapshot(),
        sort_keys=False,
        allow_unicode=True,
    )
    paths.scenario_snapshot.write_text(snapshot_yaml, encoding="utf-8")

    score_frame = pd.DataFrame(result.score_rows)
    score_frame.to_csv(paths.scores_csv, index=False)

    invalid_records = [entry.to_record() for entry in result.invalid_samples]
    pd.DataFrame(invalid_records).to_csv(paths.invalid_csv, index=False)

    summary_text = build_summary_markdown(result)
    paths.summary_md.write_text(summary_text, encoding="utf-8")

    # A small machine-readable digest to sit next to the CSV and markdown files.
    metadata = {
        "run_id": result.run_id,
        "scenario_name": scenario.scenario_name,
        "mode": scenario.mode,
        "judge_model": scenario.judge_model,
        "embedding_model": scenario.embedding_model,
        "started_at": result.started_at,
        "finished_at": result.finished_at,
        "dataset": scenario.dataset.path.as_posix(),
        "valid_samples": len(result.valid_samples),
        "invalid_samples": len(result.invalid_samples),
    }
    metadata_text = json.dumps(metadata, ensure_ascii=False, indent=2)
    paths.metadata_json.write_text(metadata_text, encoding="utf-8")

View File

@@ -0,0 +1,3 @@
sample_id,question,contexts,answer,ground_truth,scenario,language,retrieval_config
leave-policy-001,How many annual leave days does an employee with 6 years of service receive?,"[""Employees with 1 to 9 completed years of service receive 5 days of annual leave."",""Employees with 10 to 19 completed years of service receive 10 days of annual leave.""]","An employee with 6 years of service receives 5 annual leave days.","Employees with 1 to 9 completed years of service receive 5 annual leave days.",policy,en,"top_k=2;chunk_size=300"
leave-policy-002,入职满12年的员工年假有几天,"[""员工入司满1年不满10年的年休假5天。"", ""员工入司满10年不满20年的年休假10天。""]","根据规定入职满12年的员工有10天年假。","员工入司满10年不满20年的年休假10天。",policy,zh,"top_k=2;chunk_size=300"
1 sample_id question contexts answer ground_truth scenario language retrieval_config
2 leave-policy-001 How many annual leave days does an employee with 6 years of service receive? ["Employees with 1 to 9 completed years of service receive 5 days of annual leave.","Employees with 10 to 19 completed years of service receive 10 days of annual leave."] An employee with 6 years of service receives 5 annual leave days. Employees with 1 to 9 completed years of service receive 5 annual leave days. policy en top_k=2;chunk_size=300
3 leave-policy-002 入职满12年的员工年假有几天? ["员工入司满1年不满10年的,年休假5天。", "员工入司满10年不满20年的,年休假10天。"] 根据规定,入职满12年的员工有10天年假。 员工入司满10年不满20年的,年休假10天。 policy zh top_k=2;chunk_size=300

68
rag_eval/settings.py Normal file
View File

@@ -0,0 +1,68 @@
"""Runtime settings loaded from environment variables for evaluation runs."""
from __future__ import annotations
from pathlib import Path
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
REPO_ROOT = Path(__file__).resolve().parents[1]
class EvaluationSettings(BaseSettings):
    """Environment-driven settings for the CLI, adapters, and metric pipeline.

    Each field is populated from the environment variable named by its alias,
    with the repository-level ``.env`` file as an additional source. Unknown
    variables are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=REPO_ROOT / ".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # --- OpenAI-compatible endpoint and model selection ---
    openai_api_key: str | None = Field(default=None, alias="OPENAI_API_KEY")
    # NOTE(review): the default targets a fixed IP address - confirm it is intended
    # as a shared default rather than a developer-local value.
    openai_base_url: str = Field(
        default="http://6.86.80.4:30080/v1",
        alias="OPENAI_BASE_URL",
    )
    ragas_judge_model: str = Field(
        default="deepseek-v4-flash",
        alias="RAGAS_JUDGE_MODEL",
    )
    ragas_embedding_model: str = Field(
        default="text-embedding-v3",
        alias="RAGAS_EMBEDDING_MODEL",
    )
    openai_timeout_seconds: float = Field(
        default=30.0,
        alias="OPENAI_TIMEOUT_SECONDS",
    )
    ragas_metric_timeout_seconds: float = Field(
        default=45.0,
        alias="RAGAS_METRIC_TIMEOUT_SECONDS",
    )
    batch_size: int = Field(default=8, alias="BATCH_SIZE")

    # --- Alibaba / Aliyun credentials and document-parsing options ---
    alibaba_access_key_id: str | None = Field(
        default=None,
        alias="ALIBABA_ACCESS_KEY_ID",
    )
    alibaba_access_key_secret: str | None = Field(
        default=None,
        alias="ALIBABA_ACCESS_KEY_SECRET",
    )
    alibaba_endpoint: str | None = Field(default=None, alias="ALIBABA_ENDPOINT")
    aliyun_parse_poll_interval_seconds: int = Field(
        default=5,
        alias="ALIYUN_PARSE_POLL_INTERVAL_SECONDS",
    )
    aliyun_parse_timeout_seconds: int = Field(
        default=600,
        alias="ALIYUN_PARSE_TIMEOUT_SECONDS",
    )
    aliyun_parse_layout_step_size: int = Field(
        default=50,
        alias="ALIYUN_PARSE_LAYOUT_STEP_SIZE",
    )
    aliyun_llm_enhancement: bool = Field(
        default=False,
        alias="ALIYUN_LLM_ENHANCEMENT",
    )
    aliyun_enhancement_mode: str = Field(
        default="balanced",
        alias="ALIYUN_ENHANCEMENT_MODE",
    )

    # --- Dataset-building options ---
    document_parse_artifact_prefix: str = Field(
        default="outputs/dataset-builds",
        alias="DOCUMENT_PARSE_ARTIFACT_PREFIX",
    )
    parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
    dataset_generator_model: str | None = Field(
        default=None,
        alias="DATASET_GENERATOR_MODEL",
    )

    @property
    def openai_client_kwargs(self) -> dict[str, str | float]:
        """Build the keyword arguments for an OpenAI client.

        Returns:
            An empty dict when no API key is configured. Otherwise ``api_key``
            and ``timeout`` (never below one second), plus ``base_url`` when
            the configured URL is non-blank.
        """
        api_key = self.openai_api_key
        if not api_key:
            return {}

        client_kwargs: dict[str, str | float] = {
            "api_key": api_key,
            "timeout": max(1.0, float(self.openai_timeout_seconds)),
        }
        base_url = self.openai_base_url.strip()
        if base_url:
            client_kwargs["base_url"] = base_url
        return client_kwargs

View File

@@ -0,0 +1,25 @@
"""Shared data models and utilities used across evaluation subsystems."""
from .models import (
AppAdapterConfig,
DatasetConfig,
EvaluationResult,
InvalidSample,
MetricScore,
NormalizedSample,
RunArtifactPaths,
RuntimeConfig,
Scenario,
)
__all__ = [
"AppAdapterConfig",
"DatasetConfig",
"EvaluationResult",
"InvalidSample",
"MetricScore",
"NormalizedSample",
"RunArtifactPaths",
"RuntimeConfig",
"Scenario",
]

161
rag_eval/shared/models.py Normal file
View File

@@ -0,0 +1,161 @@
"""Shared runtime data models exchanged across the evaluation pipeline."""
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Literal
Mode = Literal["offline", "online"]
AdapterType = Literal["http", "python"]
def _serialize_paths(value: Any) -> Any:
"""Convert Path instances nested inside snapshot payloads into POSIX strings."""
if isinstance(value, Path):
return value.as_posix()
if isinstance(value, dict):
return {key: _serialize_paths(item) for key, item in value.items()}
if isinstance(value, list):
return [_serialize_paths(item) for item in value]
return value
@dataclass(slots=True)
class RuntimeConfig:
"""Concurrency and sampling controls for one evaluation run."""
batch_size: int = 4
app_concurrency: int | None = None
metric_concurrency: int | None = None
max_samples: int | None = None
def metric_limit(self) -> int:
"""Return the effective metric-scoring concurrency limit."""
return self.metric_concurrency or self.batch_size
def app_limit(self) -> int:
"""Return the effective application-call concurrency limit."""
return self.app_concurrency or self.batch_size
@dataclass(slots=True)
class AppAdapterConfig:
    """Resolved adapter configuration used by online scenarios.

    This class only declares the fields, their types and their defaults; how
    each field is consumed is up to the adapter implementations.
    """

    # Adapter kind; constrained by the AdapterType literal ("http" or "python").
    type: AdapterType
    # NOTE(review): presumably only meaningful for the "http" adapter - confirm.
    endpoint: str | None = None
    method: str = "POST"
    timeout_seconds: int = 30
    # NOTE(review): presumably a reference to a Python callable for the "python"
    # adapter - confirm the expected format against the adapter code.
    callable: str | None = None
    # Mutable defaults use default_factory so instances never share one dict.
    request_template: dict[str, Any] = field(default_factory=dict)
    response_mapping: dict[str, str] = field(default_factory=dict)
    static_kwargs: dict[str, Any] = field(default_factory=dict)
@dataclass(slots=True)
class DatasetConfig:
    """Dataset location information for a scenario."""

    # Location of the dataset file.
    path: Path
    # Optional explicit format name; None when not specified.
    # NOTE(review): presumably the loader infers the format when this is None - confirm.
    format: str | None = None
@dataclass(slots=True)
class Scenario:
    """Resolved evaluation scenario consumed by the execution pipeline."""

    scenario_name: str
    # Either "offline" or "online", as constrained by the Mode literal.
    mode: Mode
    dataset: DatasetConfig
    judge_model: str
    embedding_model: str
    # Names of the metrics to compute for this scenario.
    metrics: list[str]
    output_dir: Path
    # default_factory gives every scenario its own RuntimeConfig instance.
    runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
    app_adapter: AppAdapterConfig | None = None
    # NOTE(review): presumably the file this scenario was loaded from - confirm.
    source_path: Path | None = None

    def snapshot(self) -> dict[str, Any]:
        """Serialize the scenario into a reporting-friendly dictionary snapshot.

        ``asdict`` flattens this dataclass and its nested dataclasses into
        plain dicts; ``_serialize_paths`` then converts ``Path`` values into
        POSIX strings.
        """
        return _serialize_paths(asdict(self))
@dataclass(slots=True)
class NormalizedSample:
"""Canonical sample shape used by adapters, metrics, and reporting."""
sample_id: str
question: str
contexts: list[str]
answer: str
ground_truth: str
scenario: str = ""
language: str = ""
retrieval_config: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
raw: dict[str, Any] = field(default_factory=dict)
def to_record(self) -> dict[str, Any]:
"""Convert the sample into a flat record for CSV and artifact generation."""
record = {
"sample_id": self.sample_id,
"question": self.question,
"contexts": self.contexts,
"answer": self.answer,
"ground_truth": self.ground_truth,
"scenario": self.scenario,
"language": self.language,
"retrieval_config": self.retrieval_config,
}
record.update(self.metadata)
return record
@dataclass(slots=True)
class InvalidSample:
"""A dataset or adapter sample that could not be evaluated."""
sample_id: str
error: str
raw: dict[str, Any]
def to_record(self) -> dict[str, Any]:
"""Convert the invalid sample into a flat reporting row."""
record = {"sample_id": self.sample_id, "error": self.error}
record.update(self.raw)
return record
@dataclass(slots=True)
class MetricScore:
    """Metric values and accumulated errors for one evaluated sample."""

    # Metric name -> score; the annotation also allows None for a missing value.
    metrics: dict[str, float | None]
    # Failure descriptions for this sample; empty string when nothing failed.
    error: str = ""
@dataclass(slots=True)
class EvaluationResult:
    """Aggregate result object returned after a scenario completes."""

    scenario: Scenario
    run_id: str
    # Run start/end timestamps, carried as strings.
    # NOTE(review): presumably ISO 8601 UTC timestamps - confirm against the runner.
    started_at: str
    finished_at: str
    valid_samples: list[NormalizedSample]
    invalid_samples: list[InvalidSample]
    # One dict per scored sample.
    # NOTE(review): presumably keyed by "sample_id", each metric name and
    # "error" - confirm against the code that builds these rows.
    score_rows: list[dict[str, Any]]
@dataclass(slots=True)
class RunArtifactPaths:
    """Canonical file-system paths for all artifacts produced by one run."""

    # Directory that holds every artifact of the run.
    root_dir: Path
    # YAML snapshot of the scenario.
    scenario_snapshot: Path
    # CSV of per-sample scores.
    scores_csv: Path
    # CSV of samples that could not be evaluated.
    invalid_csv: Path
    # Human-readable markdown summary.
    summary_md: Path
    # Machine-readable JSON metadata.
    metadata_json: Path

49
rag_eval/shared/utils.py Normal file
View File

@@ -0,0 +1,49 @@
"""General-purpose helpers shared across configuration, datasets, and reporting."""
from __future__ import annotations
import ast
import json
import math
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
def utc_now_iso() -> str:
    """Return the current time as a timezone-aware UTC ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
def ensure_directory(path: Path) -> None:
    """Make sure ``path`` exists as a directory, creating missing parents.

    An already existing directory is left untouched.
    """
    path.mkdir(exist_ok=True, parents=True)
def parse_contexts(value: Any) -> list[str]:
    """Normalize a context payload into a list of non-empty, stripped strings.

    Accepted inputs, in order of precedence:

    * a list - each item is stringified and stripped; blanks are dropped;
    * ``None`` or a float NaN - treated as "no contexts";
    * a string holding a serialized list (JSON or a Python literal);
    * text containing blank lines - split into paragraph chunks;
    * any other non-blank text - returned as a single context.

    Args:
        value: The raw context payload, e.g. a cell read from a CSV file.

    Returns:
        The cleaned contexts; an empty list when there is nothing usable.
    """
    if isinstance(value, list):
        return [cleaned for item in value if (cleaned := str(item).strip())]
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return []
    text = str(value).strip()
    if not text:
        return []
    # Accept serialized lists from CSV exports before falling back to plain text.
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
            # json.JSONDecodeError is a ValueError. ast.literal_eval can also
            # raise TypeError (e.g. an unhashable dict key), RecursionError or
            # MemoryError on free-form text; none of these mean the input is
            # unusable, only that it is not a serialized list.
            continue
        if isinstance(parsed, list):
            return [cleaned for item in parsed if (cleaned := str(item).strip())]
    # Preserve paragraph-style context dumps by splitting on blank lines first.
    if "\n\n" in text:
        chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
        if chunks:
            return chunks
    return [text]