2026-06-12 14:02:15 +08:00
|
|
|
"""Factories for OpenAI-backed RAGAS models and metric pipelines."""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
from openai import AsyncOpenAI
|
|
|
|
|
|
|
|
|
|
from rag_eval.compat import ensure_ragas_import_compat
|
|
|
|
|
from rag_eval.settings import EvaluationSettings
|
|
|
|
|
from rag_eval.shared.models import Scenario
|
|
|
|
|
|
|
|
|
|
ensure_ragas_import_compat()
|
|
|
|
|
|
|
|
|
|
from ragas.embeddings.base import embedding_factory
|
|
|
|
|
from ragas.llms import llm_factory
|
|
|
|
|
from ragas.metrics.collections import (
|
|
|
|
|
AnswerRelevancy,
|
|
|
|
|
ContextPrecision,
|
|
|
|
|
ContextRecall,
|
2026-06-16 17:06:19 +08:00
|
|
|
FactualCorrectness,
|
2026-06-12 14:02:15 +08:00
|
|
|
Faithfulness,
|
2026-06-16 17:06:19 +08:00
|
|
|
NoiseSensitivity,
|
|
|
|
|
SemanticSimilarity,
|
2026-06-12 14:02:15 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
from .pipeline import MetricPipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_models(
    judge_model: str,
    embedding_model: str,
    settings: EvaluationSettings,
) -> tuple[Any, Any]:
    """Return the ``(llm, embeddings)`` pair used by the RAGAS metrics.

    A single ``AsyncOpenAI`` client, configured from
    ``settings.openai_client_kwargs``, is shared by the judge LLM and the
    embedding model.

    Args:
        judge_model: Model name handed to ``llm_factory``.
        embedding_model: Model name handed to ``embedding_factory``.
        settings: Evaluation settings supplying the OpenAI client kwargs.

    Returns:
        A ``(llm, embeddings)`` tuple, in that order.
    """
    shared_client = AsyncOpenAI(**settings.openai_client_kwargs)
    # The LLM is built before the embeddings, matching the tuple order.
    return (
        llm_factory(judge_model, client=shared_client),
        embedding_factory(provider="openai", model=embedding_model, client=shared_client),
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
    llm: Any | None = None,
    embeddings: Any | None = None,
) -> MetricPipeline:
    """Build a metric pipeline containing only the metrics requested by the scenario.

    Any model the caller supplies (``llm`` and/or ``embeddings``) is reused
    as-is. Whichever one is missing is created from
    ``scenario.judge_model`` / ``scenario.embedding_model`` and ``settings``.

    Args:
        scenario: Provides ``judge_model``, ``embedding_model`` and the
            iterable of metric names in ``metrics``.
        settings: Provides the OpenAI client configuration (via
            ``build_models``) and ``ragas_metric_timeout_seconds``.
        llm: Optional pre-built judge LLM.
        embeddings: Optional pre-built embedding model.

    Returns:
        A ``MetricPipeline`` holding the selected metrics, keyed by name.

    Raises:
        KeyError: If ``scenario.metrics`` contains a name that is not in the
            metric registry.
    """
    if llm is None or embeddings is None:
        built_llm, built_embeddings = build_models(
            scenario.judge_model,
            scenario.embedding_model,
            settings,
        )
        # Fill in only what the caller omitted, so an explicitly supplied
        # model is never silently replaced.
        if llm is None:
            llm = built_llm
        if embeddings is None:
            embeddings = built_embeddings

    # Build the full registry once, then slice it by configured metric names.
    registry: dict[str, Any] = {
        "faithfulness": Faithfulness(llm=llm),
        "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": ContextRecall(llm=llm),
        "context_precision": ContextPrecision(llm=llm),
        # Robustness / end-to-end metrics (architecture design doc, section 10.2).
        # NoiseSensitivity uses the library's default mode; the original note
        # says mode='relevant' -- TODO confirm against the installed ragas version.
        "noise_sensitivity": NoiseSensitivity(llm=llm),
        # FactualCorrectness uses the library's default mode; the original note
        # says mode='f1' -- TODO confirm against the installed ragas version.
        "factual_correctness": FactualCorrectness(llm=llm),
        # SemanticSimilarity is built from embeddings only (no LLM is passed).
        "semantic_similarity": SemanticSimilarity(embeddings=embeddings),
    }

    # Fail with an actionable message instead of a bare KeyError('name').
    unknown = [name for name in scenario.metrics if name not in registry]
    if unknown:
        raise KeyError(
            f"Unknown metric name(s) {unknown!r}; "
            f"available metrics: {sorted(registry)!r}"
        )

    return MetricPipeline(
        metrics={name: registry[name] for name in scenario.metrics},
        metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
    )
|