Files
siemens_ragas/rag_eval/metrics/factory.py

78 lines
2.7 KiB
Python
Raw Normal View History

2026-06-12 14:02:15 +08:00
"""Factories for OpenAI-backed RAGAS models and metric pipelines."""
from __future__ import annotations
from typing import Any
from openai import AsyncOpenAI
from rag_eval.compat import ensure_ragas_import_compat
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import Scenario
ensure_ragas_import_compat()
from ragas.embeddings.base import embedding_factory
from ragas.llms import llm_factory
from ragas.metrics.collections import (
AnswerRelevancy,
ContextPrecision,
ContextRecall,
FactualCorrectness,
2026-06-12 14:02:15 +08:00
Faithfulness,
NoiseSensitivity,
SemanticSimilarity,
2026-06-12 14:02:15 +08:00
)
from .pipeline import MetricPipeline
def build_models(
    judge_model: str,
    embedding_model: str,
    settings: EvaluationSettings,
) -> tuple[Any, Any]:
    """Create the judge LLM and the embedding model used by the RAGAS metrics.

    A single ``AsyncOpenAI`` client is constructed from
    ``settings.openai_client_kwargs`` and shared by both models.

    Args:
        judge_model: Model name handed to ``llm_factory``.
        embedding_model: Model name handed to ``embedding_factory``.
        settings: Evaluation settings supplying the OpenAI client keyword arguments.

    Returns:
        A ``(llm, embeddings)`` tuple, in that order.
    """
    # One client instance backs both the LLM wrapper and the embedding wrapper.
    openai_client = AsyncOpenAI(**settings.openai_client_kwargs)
    judge_llm = llm_factory(judge_model, client=openai_client)
    embedder = embedding_factory(
        provider="openai",
        model=embedding_model,
        client=openai_client,
    )
    return judge_llm, embedder
def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
    llm: Any | None = None,
    embeddings: Any | None = None,
) -> MetricPipeline:
    """Build a metric pipeline containing only the metrics requested by the scenario.

    Any model supplied by the caller is reused as-is. If ``llm`` and/or
    ``embeddings`` is missing, models are created from ``scenario.judge_model``,
    ``scenario.embedding_model`` and ``settings``, and only the missing one(s)
    are filled in, so a caller-provided model is never replaced.

    Args:
        scenario: Supplies the judge/embedding model names and the metric names
            to include (``scenario.metrics``).
        settings: Supplies the OpenAI client configuration (via ``build_models``)
            and ``ragas_metric_timeout_seconds``.
        llm: Optional pre-built judge LLM.
        embeddings: Optional pre-built embedding model.

    Returns:
        A ``MetricPipeline`` holding the requested metrics keyed by name, with
        the timeout taken from ``settings.ragas_metric_timeout_seconds``.

    Raises:
        KeyError: If ``scenario.metrics`` names a metric that is not registered.
    """
    if llm is None or embeddings is None:
        built_llm, built_embeddings = build_models(
            scenario.judge_model,
            scenario.embedding_model,
            settings,
        )
        # Keep whatever the caller passed in; only fill the gap(s).
        if llm is None:
            llm = built_llm
        if embeddings is None:
            embeddings = built_embeddings

    # Build the full registry once, then slice it by configured metric names.
    registry: dict[str, Any] = {
        "faithfulness": Faithfulness(llm=llm),
        "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": ContextRecall(llm=llm),
        "context_precision": ContextPrecision(llm=llm),
        # Robustness / end-to-end metrics (architecture design doc, section 10.2).
        # NOTE(review): no ``mode`` is passed to NoiseSensitivity, so the library
        # default applies (intended: 'relevant') - confirm against the installed
        # ragas version.
        "noise_sensitivity": NoiseSensitivity(llm=llm),
        # NOTE(review): no ``mode`` is passed to FactualCorrectness either
        # (intended: 'f1') - confirm the library default.
        "factual_correctness": FactualCorrectness(llm=llm),
        # Embedding-based comparison; constructed without an LLM.
        "semantic_similarity": SemanticSimilarity(embeddings=embeddings),
    }

    # Materialize once so the names can be validated and then consumed.
    requested = list(scenario.metrics)
    unknown = [name for name in requested if name not in registry]
    if unknown:
        # Same exception type as a failed dict lookup, but with an actionable message.
        raise KeyError(
            f"Unknown metric(s) {unknown!r}; supported metrics: {sorted(registry)!r}"
        )

    return MetricPipeline(
        metrics={name: registry[name] for name in requested},
        metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
    )