first commit

This commit is contained in:
2026-06-12 14:02:15 +08:00
commit 9cbdc1d95d
69 changed files with 9486 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
"""Metric pipeline construction helpers."""
from .factory import build_metric_pipeline
# Only the pipeline factory imported above is re-exported as this package's public API.
__all__ = ["build_metric_pipeline"]

View File

@@ -0,0 +1,59 @@
"""Factories for OpenAI-backed RAGAS models and metric pipelines."""
from __future__ import annotations
from typing import Any
from openai import AsyncOpenAI
from rag_eval.compat import ensure_ragas_import_compat
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import Scenario
ensure_ragas_import_compat()
from ragas.embeddings.base import embedding_factory
from ragas.llms import llm_factory
from ragas.metrics.collections import (
AnswerRelevancy,
ContextPrecision,
ContextRecall,
Faithfulness,
)
from .pipeline import MetricPipeline
def build_models(
    judge_model: str,
    embedding_model: str,
    settings: EvaluationSettings,
) -> tuple[Any, Any]:
    """Return the ``(llm, embeddings)`` pair used by the RAGAS metrics.

    A single ``AsyncOpenAI`` client is constructed from
    ``settings.openai_client_kwargs`` and handed to both factories, so the
    judge LLM and the embedding model share one client instance.

    Args:
        judge_model: Model name passed to ``llm_factory``.
        embedding_model: Model name passed to ``embedding_factory``.
        settings: Evaluation settings supplying the OpenAI client kwargs.

    Returns:
        A two-tuple ``(llm, embeddings)``.
    """
    shared_client = AsyncOpenAI(**settings.openai_client_kwargs)
    # The LLM is created first, then the embeddings, both on the same client.
    return (
        llm_factory(judge_model, client=shared_client),
        embedding_factory(
            provider="openai",
            model=embedding_model,
            client=shared_client,
        ),
    )
def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
) -> MetricPipeline:
    """Build a metric pipeline containing only the metrics requested by the scenario.

    Only the metrics named in ``scenario.metrics`` are instantiated, so a
    constructor problem in a metric the scenario did not ask for cannot break
    the build.

    Args:
        scenario: Supplies ``judge_model``, ``embedding_model`` and the list of
            metric names in ``metrics``.
        settings: Supplies the OpenAI client configuration (via
            ``build_models``) and ``ragas_metric_timeout_seconds``.

    Returns:
        A ``MetricPipeline`` whose ``metrics`` mapping is keyed by the
        requested metric names.

    Raises:
        KeyError: If ``scenario.metrics`` contains a name with no builder below.
    """
    llm, embeddings = build_models(
        scenario.judge_model,
        scenario.embedding_model,
        settings,
    )
    # Zero-argument builders keep construction lazy: a metric object is only
    # created when the scenario actually requests it.
    builders: dict[str, Any] = {
        "faithfulness": lambda: Faithfulness(llm=llm),
        "answer_relevancy": lambda: AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": lambda: ContextRecall(llm=llm),
        "context_precision": lambda: ContextPrecision(llm=llm),
    }
    return MetricPipeline(
        metrics={name: builders[name]() for name in scenario.metrics},
        metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
    )

View File

@@ -0,0 +1,82 @@
"""Execution pipeline for scoring normalized samples with RAGAS metrics."""
from __future__ import annotations
import asyncio
import math
from dataclasses import dataclass
from typing import Any
from rag_eval.shared.models import MetricScore, NormalizedSample
@dataclass(slots=True)
class MetricPipeline:
"""Score one or many normalized samples against a configured metric set."""
metrics: dict[str, Any]
metric_timeout_seconds: float | None = None
async def score_sample(self, sample: NormalizedSample) -> MetricScore:
"""Score a single sample and capture metric-level failures without aborting."""
results = {name: math.nan for name in self.metrics}
errors: list[str] = []
for name, metric in self.metrics.items():
try:
result = await self._run_metric(name, metric, sample)
results[name] = float(result.value)
except Exception as exc:
errors.append(f"{name}: {exc}")
return MetricScore(metrics=results, error=" | ".join(errors))
async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
"""Dispatch one metric call with the argument shape expected by that metric."""
timeout = None
if self.metric_timeout_seconds is not None:
timeout = max(1.0, float(self.metric_timeout_seconds))
if name == "faithfulness":
coroutine = metric.ascore(
user_input=sample.question,
response=sample.answer,
retrieved_contexts=sample.contexts,
)
elif name == "answer_relevancy":
coroutine = metric.ascore(
user_input=sample.question,
response=sample.answer,
)
elif name == "context_recall":
coroutine = metric.ascore(
user_input=sample.question,
retrieved_contexts=sample.contexts,
reference=sample.ground_truth,
)
elif name == "context_precision":
coroutine = metric.ascore(
user_input=sample.question,
reference=sample.ground_truth,
retrieved_contexts=sample.contexts,
)
else:
raise ValueError(f"Unsupported metric: {name}")
if timeout is None:
return await coroutine
return await asyncio.wait_for(coroutine, timeout=timeout)
async def score_samples(
self,
samples: list[NormalizedSample],
max_concurrency: int,
) -> list[MetricScore]:
"""Score all samples while respecting the configured concurrency limit."""
semaphore = asyncio.Semaphore(max(1, max_concurrency))
async def guarded(sample: NormalizedSample) -> MetricScore:
"""Throttle a single sample-scoring coroutine with the shared semaphore."""
async with semaphore:
return await self.score_sample(sample)
return await asyncio.gather(*(guarded(sample) for sample in samples))

View File

@@ -0,0 +1,8 @@
"""Supported metric names recognized by scenario validation and pipeline setup."""
# These four names coincide with the registry keys built in build_metric_pipeline
# and with the dispatch branches in MetricPipeline._run_metric; keep all three in sync
# when adding or removing a metric.
SUPPORTED_METRICS = {
"faithfulness",
"answer_relevancy",
"context_recall",
"context_precision",
}