first commit
This commit is contained in:
5
rag_eval/metrics/__init__.py
Normal file
5
rag_eval/metrics/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Metric pipeline construction helpers."""
|
||||
|
||||
from .factory import build_metric_pipeline
|
||||
|
||||
__all__ = ["build_metric_pipeline"]
|
||||
59
rag_eval/metrics/factory.py
Normal file
59
rag_eval/metrics/factory.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Factories for OpenAI-backed RAGAS models and metric pipelines."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from rag_eval.compat import ensure_ragas_import_compat
|
||||
from rag_eval.settings import EvaluationSettings
|
||||
from rag_eval.shared.models import Scenario
|
||||
|
||||
ensure_ragas_import_compat()
|
||||
|
||||
from ragas.embeddings.base import embedding_factory
|
||||
from ragas.llms import llm_factory
|
||||
from ragas.metrics.collections import (
|
||||
AnswerRelevancy,
|
||||
ContextPrecision,
|
||||
ContextRecall,
|
||||
Faithfulness,
|
||||
)
|
||||
|
||||
from .pipeline import MetricPipeline
|
||||
|
||||
|
||||
def build_models(
    judge_model: str,
    embedding_model: str,
    settings: EvaluationSettings,
) -> tuple[Any, Any]:
    """Instantiate the judge LLM and embedding model used by the RAGAS metrics.

    One ``AsyncOpenAI`` client is built from ``settings.openai_client_kwargs``
    and handed to both factories.

    Args:
        judge_model: Model name passed to ``llm_factory``.
        embedding_model: Model name passed to ``embedding_factory``.
        settings: Evaluation settings supplying the OpenAI client kwargs.

    Returns:
        A ``(llm, embeddings)`` pair.
    """
    openai_client = AsyncOpenAI(**settings.openai_client_kwargs)
    return (
        llm_factory(judge_model, client=openai_client),
        embedding_factory(
            provider="openai",
            model=embedding_model,
            client=openai_client,
        ),
    )
|
||||
|
||||
|
||||
def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
) -> MetricPipeline:
    """Assemble a ``MetricPipeline`` holding only the metrics named by the scenario.

    Args:
        scenario: Supplies ``judge_model``, ``embedding_model`` and the iterable
            of metric names in ``metrics``.
        settings: Supplies the OpenAI client configuration (via ``build_models``)
            and ``ragas_metric_timeout_seconds``.

    Returns:
        A pipeline whose ``metrics`` mapping contains one entry per requested name.

    Raises:
        KeyError: If ``scenario.metrics`` contains a name with no registered metric.
    """
    judge, embedder = build_models(
        scenario.judge_model,
        scenario.embedding_model,
        settings,
    )

    # Every known metric is instantiated up front; the scenario's names then
    # select the subset that ends up in the pipeline.
    available: dict[str, Any] = {
        "faithfulness": Faithfulness(llm=judge),
        "answer_relevancy": AnswerRelevancy(llm=judge, embeddings=embedder),
        "context_recall": ContextRecall(llm=judge),
        "context_precision": ContextPrecision(llm=judge),
    }

    selected: dict[str, Any] = {}
    for metric_name in scenario.metrics:
        selected[metric_name] = available[metric_name]

    return MetricPipeline(
        metrics=selected,
        metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
    )
|
||||
82
rag_eval/metrics/pipeline.py
Normal file
82
rag_eval/metrics/pipeline.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Execution pipeline for scoring normalized samples with RAGAS metrics."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from rag_eval.shared.models import MetricScore, NormalizedSample
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class MetricPipeline:
|
||||
"""Score one or many normalized samples against a configured metric set."""
|
||||
|
||||
metrics: dict[str, Any]
|
||||
metric_timeout_seconds: float | None = None
|
||||
|
||||
async def score_sample(self, sample: NormalizedSample) -> MetricScore:
|
||||
"""Score a single sample and capture metric-level failures without aborting."""
|
||||
results = {name: math.nan for name in self.metrics}
|
||||
errors: list[str] = []
|
||||
|
||||
for name, metric in self.metrics.items():
|
||||
try:
|
||||
result = await self._run_metric(name, metric, sample)
|
||||
results[name] = float(result.value)
|
||||
except Exception as exc:
|
||||
errors.append(f"{name}: {exc}")
|
||||
return MetricScore(metrics=results, error=" | ".join(errors))
|
||||
|
||||
async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
|
||||
"""Dispatch one metric call with the argument shape expected by that metric."""
|
||||
timeout = None
|
||||
if self.metric_timeout_seconds is not None:
|
||||
timeout = max(1.0, float(self.metric_timeout_seconds))
|
||||
|
||||
if name == "faithfulness":
|
||||
coroutine = metric.ascore(
|
||||
user_input=sample.question,
|
||||
response=sample.answer,
|
||||
retrieved_contexts=sample.contexts,
|
||||
)
|
||||
elif name == "answer_relevancy":
|
||||
coroutine = metric.ascore(
|
||||
user_input=sample.question,
|
||||
response=sample.answer,
|
||||
)
|
||||
elif name == "context_recall":
|
||||
coroutine = metric.ascore(
|
||||
user_input=sample.question,
|
||||
retrieved_contexts=sample.contexts,
|
||||
reference=sample.ground_truth,
|
||||
)
|
||||
elif name == "context_precision":
|
||||
coroutine = metric.ascore(
|
||||
user_input=sample.question,
|
||||
reference=sample.ground_truth,
|
||||
retrieved_contexts=sample.contexts,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported metric: {name}")
|
||||
|
||||
if timeout is None:
|
||||
return await coroutine
|
||||
return await asyncio.wait_for(coroutine, timeout=timeout)
|
||||
|
||||
async def score_samples(
|
||||
self,
|
||||
samples: list[NormalizedSample],
|
||||
max_concurrency: int,
|
||||
) -> list[MetricScore]:
|
||||
"""Score all samples while respecting the configured concurrency limit."""
|
||||
semaphore = asyncio.Semaphore(max(1, max_concurrency))
|
||||
|
||||
async def guarded(sample: NormalizedSample) -> MetricScore:
|
||||
"""Throttle a single sample-scoring coroutine with the shared semaphore."""
|
||||
async with semaphore:
|
||||
return await self.score_sample(sample)
|
||||
|
||||
return await asyncio.gather(*(guarded(sample) for sample in samples))
|
||||
8
rag_eval/metrics/registry.py
Normal file
8
rag_eval/metrics/registry.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""Supported metric names recognized by scenario validation and pipeline setup."""
|
||||
|
||||
# Names of the metrics this package recognizes.
SUPPORTED_METRICS = {"faithfulness", "answer_relevancy", "context_recall", "context_precision"}
|
||||
Reference in New Issue
Block a user