first commit

2026-06-12 14:02:15 +08:00
commit 9cbdc1d95d
69 changed files with 9486 additions and 0 deletions
--- a/rag_eval/metrics/init.py
+++ b/rag_eval/metrics/init.py
@@ -0,0 +1,5 @@
+"""Metric pipeline construction helpers."""
+
+from .factory import build_metric_pipeline
+
+__all__ = ["build_metric_pipeline"]
--- a/rag_eval/metrics/factory.py
+++ b/rag_eval/metrics/factory.py
@@ -0,0 +1,59 @@
+"""Factories for OpenAI-backed RAGAS models and metric pipelines."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from openai import AsyncOpenAI
+
+from rag_eval.compat import ensure_ragas_import_compat
+from rag_eval.settings import EvaluationSettings
+from rag_eval.shared.models import Scenario
+
+# NOTE(review): this call sits before the ragas imports below, presumably so
+# the compatibility shim is active when ragas is first imported — confirm
+# against rag_eval.compat before letting an import sorter move these lines.
+ensure_ragas_import_compat()
+
+from ragas.embeddings.base import embedding_factory
+from ragas.llms import llm_factory
+from ragas.metrics.collections import (
+    AnswerRelevancy,
+    ContextPrecision,
+    ContextRecall,
+    Faithfulness,
+)
+
+from .pipeline import MetricPipeline
+
+
+def build_models(
+    judge_model: str,
+    embedding_model: str,
+    settings: EvaluationSettings,
+) -> tuple[Any, Any]:
+    """Instantiate the judge LLM and the embedding model used by the metrics.
+
+    One ``AsyncOpenAI`` client, configured from
+    ``settings.openai_client_kwargs``, is shared by both RAGAS wrappers.
+
+    Returns:
+        A ``(llm, embeddings)`` pair, in that order.
+    """
+    shared_client = AsyncOpenAI(**settings.openai_client_kwargs)
+    return (
+        llm_factory(judge_model, client=shared_client),
+        embedding_factory(
+            provider="openai",
+            model=embedding_model,
+            client=shared_client,
+        ),
+    )
+
+
+def build_metric_pipeline(
+    scenario: Scenario,
+    settings: EvaluationSettings,
+) -> MetricPipeline:
+    """Build a metric pipeline containing only the metrics requested by the scenario.
+
+    Args:
+        scenario: Supplies ``judge_model``, ``embedding_model`` and the
+            metric names in ``metrics``.
+        settings: Supplies the OpenAI client kwargs (via ``build_models``)
+            and ``ragas_metric_timeout_seconds``.
+
+    Returns:
+        A ``MetricPipeline`` whose ``metrics`` dict is keyed by metric name,
+        in the order given by ``scenario.metrics``.
+
+    Raises:
+        KeyError: If ``scenario.metrics`` contains a name with no
+            constructor here; the message lists the offending names.
+    """
+    llm, embeddings = build_models(
+        scenario.judge_model,
+        scenario.embedding_model,
+        settings,
+    )
+    # Constructors are deferred so only the requested metrics are instantiated.
+    constructors: dict[str, Any] = {
+        "faithfulness": lambda: Faithfulness(llm=llm),
+        "answer_relevancy": lambda: AnswerRelevancy(llm=llm, embeddings=embeddings),
+        "context_recall": lambda: ContextRecall(llm=llm),
+        "context_precision": lambda: ContextPrecision(llm=llm),
+    }
+    requested = list(scenario.metrics)
+    unsupported = [name for name in requested if name not in constructors]
+    if unsupported:
+        # Same exception type a plain dict lookup would raise, but with
+        # enough context to fix the scenario file.
+        raise KeyError(
+            f"Unsupported metric(s): {', '.join(map(str, unsupported))}; "
+            f"supported: {', '.join(sorted(constructors))}"
+        )
+    return MetricPipeline(
+        metrics={name: constructors[name]() for name in requested},
+        metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
+    )
--- a/rag_eval/metrics/pipeline.py
+++ b/rag_eval/metrics/pipeline.py
@@ -0,0 +1,82 @@
+"""Execution pipeline for scoring normalized samples with RAGAS metrics."""
+
+from __future__ import annotations
+
+import asyncio
+import math
+from dataclasses import dataclass
+from typing import Any
+
+from rag_eval.shared.models import MetricScore, NormalizedSample
+
+
+@dataclass(slots=True)
+class MetricPipeline:
+    """Score one or many normalized samples against a configured metric set."""
+
+    # Metric objects keyed by name; the name selects the ``ascore`` argument
+    # shape in ``_run_metric``.
+    metrics: dict[str, Any]
+    # Per-metric timeout in seconds. ``None`` disables the timeout; values
+    # below one second are raised to 1.0 when applied.
+    metric_timeout_seconds: float | None = None
+
+    async def score_sample(self, sample: NormalizedSample) -> MetricScore:
+        """Score a single sample and capture metric-level failures without aborting.
+
+        Metrics are awaited one after another. A metric that raises keeps its
+        initial NaN score and contributes a ``"<name>: <detail>"`` entry to
+        the ``error`` string (entries joined with ``" | "``; empty when no
+        metric failed).
+        """
+        results = {name: math.nan for name in self.metrics}
+        errors: list[str] = []
+
+        for name, metric in self.metrics.items():
+            try:
+                result = await self._run_metric(name, metric, sample)
+                results[name] = float(result.value)
+            except Exception as exc:
+                # Some exceptions stringify to "" (e.g. a bare TimeoutError);
+                # fall back to the type name so the entry is never blank.
+                detail = str(exc) or type(exc).__name__
+                errors.append(f"{name}: {detail}")
+        return MetricScore(metrics=results, error=" | ".join(errors))
+
+    async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
+        """Dispatch one metric call with the argument shape expected by that metric.
+
+        Raises:
+            ValueError: If ``name`` is not one of the four handled metrics.
+            asyncio.TimeoutError: If a timeout is configured and the metric
+                does not finish in time; the message states the limit.
+        """
+        timeout = None
+        if self.metric_timeout_seconds is not None:
+            # Clamp to at least one second.
+            timeout = max(1.0, float(self.metric_timeout_seconds))
+
+        if name == "faithfulness":
+            coroutine = metric.ascore(
+                user_input=sample.question,
+                response=sample.answer,
+                retrieved_contexts=sample.contexts,
+            )
+        elif name == "answer_relevancy":
+            coroutine = metric.ascore(
+                user_input=sample.question,
+                response=sample.answer,
+            )
+        elif name == "context_recall":
+            coroutine = metric.ascore(
+                user_input=sample.question,
+                retrieved_contexts=sample.contexts,
+                reference=sample.ground_truth,
+            )
+        elif name == "context_precision":
+            coroutine = metric.ascore(
+                user_input=sample.question,
+                reference=sample.ground_truth,
+                retrieved_contexts=sample.contexts,
+            )
+        else:
+            raise ValueError(f"Unsupported metric: {name}")
+
+        if timeout is None:
+            return await coroutine
+        try:
+            return await asyncio.wait_for(coroutine, timeout=timeout)
+        except asyncio.TimeoutError as exc:
+            if exc.args:
+                # Already carries a message (raised inside the metric itself).
+                raise
+            # wait_for raises a message-less TimeoutError; attach the limit so
+            # the recorded error explains what happened.
+            raise asyncio.TimeoutError(f"timed out after {timeout:g}s") from exc
+
+    async def score_samples(
+        self,
+        samples: list[NormalizedSample],
+        max_concurrency: int,
+    ) -> list[MetricScore]:
+        """Score all samples while respecting the configured concurrency limit.
+
+        At most ``max(1, max_concurrency)`` samples are scored at a time.
+        Results are returned in the same order as ``samples``.
+        """
+        semaphore = asyncio.Semaphore(max(1, max_concurrency))
+
+        async def guarded(sample: NormalizedSample) -> MetricScore:
+            """Throttle a single sample-scoring coroutine with the shared semaphore."""
+            async with semaphore:
+                return await self.score_sample(sample)
+
+        return await asyncio.gather(*(guarded(sample) for sample in samples))
--- a/rag_eval/metrics/registry.py
+++ b/rag_eval/metrics/registry.py
@@ -0,0 +1,8 @@
+"""Supported metric names recognized by scenario validation and pipeline setup."""
+
+# Kept as a set: only membership matters, ordering does not.
+SUPPORTED_METRICS = {
+    "answer_relevancy",
+    "context_precision",
+    "context_recall",
+    "faithfulness",
+}