"""Factories for OpenAI-backed RAGAS models and metric pipelines."""

from __future__ import annotations

import logging
from collections.abc import Callable
from typing import Any

from openai import AsyncOpenAI

from rag_eval.compat import ensure_ragas_import_compat
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import Scenario

# Called before the ragas imports below; presumably it patches import-time
# compatibility issues, so keep this ordering.
ensure_ragas_import_compat()

from ragas.embeddings.base import embedding_factory
from ragas.llms import llm_factory
from ragas.metrics.collections import (
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
    FactualCorrectness,
    Faithfulness,
    NoiseSensitivity,
    SemanticSimilarity,
)

from .pipeline import MetricPipeline

logger = logging.getLogger(__name__)

# Metric name -> builder taking ``(llm, embeddings)``. Builders are only
# invoked for the metrics a scenario actually requests, so unrequested
# metrics are never instantiated.
_METRIC_BUILDERS: dict[str, Callable[[Any, Any], Any]] = {
    "faithfulness": lambda llm, embeddings: Faithfulness(llm=llm),
    "answer_relevancy": lambda llm, embeddings: AnswerRelevancy(
        llm=llm, embeddings=embeddings
    ),
    "context_recall": lambda llm, embeddings: ContextRecall(llm=llm),
    "context_precision": lambda llm, embeddings: ContextPrecision(llm=llm),
    # Robustness / end-to-end metrics (architecture design doc, section 10.2).
    # NoiseSensitivity is built with library defaults; the original note says
    # this corresponds to mode='relevant' (sensitivity to noise from relevant
    # contexts) -- confirm against the installed ragas version.
    "noise_sensitivity": lambda llm, embeddings: NoiseSensitivity(llm=llm),
    # FactualCorrectness is built with library defaults; the original note
    # says this corresponds to mode='f1' (balances claim precision and recall
    # against the ground truth) -- confirm against the installed ragas version.
    "factual_correctness": lambda llm, embeddings: FactualCorrectness(llm=llm),
    # SemanticSimilarity only receives the embeddings client (no LLM).
    "semantic_similarity": lambda llm, embeddings: SemanticSimilarity(
        embeddings=embeddings
    ),
}


def _profile_client_kwargs(profile: Any) -> dict[str, Any]:
    """Translate a stored LLM Profile into AsyncOpenAI constructor kwargs."""
    kwargs: dict[str, Any] = {
        # A placeholder key is substituted when the profile has none.
        "api_key": profile.api_key or "sk-placeholder",
        "timeout": float(profile.timeout_seconds or 30),
    }
    base_url = (profile.base_url or "").strip()
    if base_url:
        kwargs["base_url"] = base_url
    return kwargs


def _resolve_openai_client_kwargs(
    judge_model: str,
    settings: EvaluationSettings,
) -> dict[str, Any]:
    """Return AsyncOpenAI kwargs, preferring a matching LLM Profile over .env settings.

    Lookup order:
      1. LLM Profile whose model name equals ``judge_model`` (exact match)
      2. Fall back to ``settings.openai_client_kwargs`` (.env)

    The profile lookup is best-effort: any failure is logged and the .env
    settings are returned instead of raising.
    """
    try:
        # Lazy import to avoid circular dependency (webapp -> rag_eval is one-way).
        from webapp.services.profile_manager import profile_manager

        for profile in profile_manager.list_all():
            if profile.model == judge_model:
                return _profile_client_kwargs(profile)
    except ImportError:
        # The webapp package may simply be unavailable in this environment.
        logger.debug(
            "LLM profile manager unavailable; using .env settings for %r",
            judge_model,
        )
    except Exception:  # noqa: BLE001
        # Deliberately broad: profile lookup must never block evaluation, but
        # the failure is recorded instead of being silently discarded.
        logger.warning(
            "LLM profile lookup failed for judge model %r; "
            "falling back to .env settings",
            judge_model,
            exc_info=True,
        )
    return settings.openai_client_kwargs


def build_models(
    judge_model: str,
    embedding_model: str,
    settings: EvaluationSettings,
) -> tuple[Any, Any]:
    """Create the LLM and embedding clients required by the selected RAGAS metrics.

    Dynamically resolves connection settings from the stored LLM Profiles first
    (matched by model name), falling back to .env settings when no profile
    matches.

    Returns:
        A ``(llm, embeddings)`` tuple; both objects share one ``AsyncOpenAI``
        client.
    """
    client_kwargs = _resolve_openai_client_kwargs(judge_model, settings)
    client = AsyncOpenAI(**client_kwargs)
    llm = llm_factory(judge_model, client=client)
    embeddings = embedding_factory(
        provider="openai", model=embedding_model, client=client
    )
    return llm, embeddings


def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
    llm: Any | None = None,
    embeddings: Any | None = None,
) -> MetricPipeline:
    """Build a metric pipeline containing only the metrics requested by the scenario.

    Any of ``llm`` / ``embeddings`` supplied by the caller is reused as-is.
    Whichever one is missing is created from ``scenario`` + ``settings``; a
    caller-supplied object is never replaced.

    Raises:
        KeyError: if ``scenario.metrics`` names a metric that is not supported.
    """
    # Materialize once so the names can be validated and then iterated.
    requested = list(scenario.metrics)
    unknown = [name for name in requested if name not in _METRIC_BUILDERS]
    if unknown:
        # Fail before any client is created, and say what is supported.
        raise KeyError(
            f"Unsupported metric(s) {unknown!r}; "
            f"supported: {sorted(_METRIC_BUILDERS)!r}"
        )

    if llm is None or embeddings is None:
        built_llm, built_embeddings = build_models(
            scenario.judge_model,
            scenario.embedding_model,
            settings,
        )
        # Only fill in what the caller did not provide.
        if llm is None:
            llm = built_llm
        if embeddings is None:
            embeddings = built_embeddings

    return MetricPipeline(
        metrics={
            name: _METRIC_BUILDERS[name](llm, embeddings) for name in requested
        },
        metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
    )