# siemens_ragas/rag_eval/metrics/factory.py

"""Factories for OpenAI-backed RAGAS models and metric pipelines."""

from __future__ import annotations

from typing import Any

from openai import AsyncOpenAI

from rag_eval.compat import ensure_ragas_import_compat
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import Scenario

# NOTE(review): invoked before the ragas imports below, presumably so that any
# compatibility shims are in place when ragas is first imported — confirm
# against rag_eval.compat before reordering these imports.
ensure_ragas_import_compat()

from ragas.embeddings.base import embedding_factory
from ragas.llms import llm_factory
from ragas.metrics.collections import (
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
    FactualCorrectness,
    Faithfulness,
    NoiseSensitivity,
    SemanticSimilarity,
)

from .pipeline import MetricPipeline


def _resolve_openai_client_kwargs(
    judge_model: str,
    settings: EvaluationSettings,
) -> dict[str, Any]:
    """Return AsyncOpenAI kwargs, preferring a matching LLM Profile over .env settings.

    Lookup order:
      1. The first LLM Profile whose ``model`` equals ``judge_model`` (exact match).
      2. Fall back to ``settings.openai_client_kwargs`` (.env).

    Profile lookup is best-effort: it never raises. When the lookup cannot be
    performed the reason is logged and the .env settings are returned.

    Args:
        judge_model: Model name used to select a stored LLM Profile.
        settings: Evaluation settings providing the fallback client kwargs.

    Returns:
        Keyword arguments for ``AsyncOpenAI``. For a matching profile this is
        ``api_key`` (``"sk-placeholder"`` when the profile has none),
        ``timeout`` (30 seconds when the profile has none) and, only when the
        profile defines a non-blank one, ``base_url``.
    """
    import logging

    log = logging.getLogger(__name__)

    try:
        # Lazy import to avoid circular dependency (webapp -> rag_eval is one-way).
        from webapp.services.profile_manager import profile_manager

        for profile in profile_manager.list_all():
            if profile.model != judge_model:
                continue
            kwargs: dict[str, Any] = {
                "api_key": profile.api_key or "sk-placeholder",
                "timeout": float(profile.timeout_seconds or 30),
            }
            base_url = (profile.base_url or "").strip()
            if base_url:
                kwargs["base_url"] = base_url
            return kwargs
    except ImportError:
        # The webapp package is not importable here; treated as "no profiles".
        log.debug(
            "LLM profile manager unavailable; using .env settings for judge model %r",
            judge_model,
        )
    except Exception:  # noqa: BLE001
        # Deliberate best-effort: a broken profile store must not block
        # evaluation, but the failure should be visible rather than silent.
        log.warning(
            "LLM profile lookup failed for judge model %r; using .env settings",
            judge_model,
            exc_info=True,
        )

    return settings.openai_client_kwargs


def build_models(
    judge_model: str,
    embedding_model: str,
    settings: EvaluationSettings,
) -> tuple[Any, Any]:
    """Create the judge LLM and the embedding model used by the RAGAS metrics.

    Connection settings come from the stored LLM Profile matching
    ``judge_model`` when one exists, otherwise from the .env settings. A single
    ``AsyncOpenAI`` client is created from them and shared by both models, so
    the embedding model uses the connection resolved for the judge model.

    Args:
        judge_model: Model name for the judge LLM (also the profile lookup key).
        embedding_model: Model name passed to the embedding factory.
        settings: Evaluation settings (fallback connection, token budget).

    Returns:
        A ``(llm, embeddings)`` tuple.
    """
    openai_client = AsyncOpenAI(**_resolve_openai_client_kwargs(judge_model, settings))

    # Structured-output judge calls can get truncated by the upstream default
    # completion budget of 1024 tokens (notably faithfulness and GPT-5 family
    # models), so pass the configured budget explicitly, clamped to >= 1.
    completion_budget = int(settings.ragas_llm_max_tokens)
    if completion_budget < 1:
        completion_budget = 1

    judge_llm = llm_factory(
        judge_model,
        client=openai_client,
        max_tokens=completion_budget,
    )
    embedder = embedding_factory(
        provider="openai",
        model=embedding_model,
        client=openai_client,
    )
    return judge_llm, embedder


def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
    llm: Any | None = None,
    embeddings: Any | None = None,
) -> MetricPipeline:
    """Build a metric pipeline containing only the metrics requested by the scenario.

    Only the metrics named in ``scenario.metrics`` are instantiated. Metric
    names are validated before any model client is created, so a misconfigured
    scenario fails fast with a descriptive error.

    Args:
        scenario: Supplies ``metrics`` (names to include) and, when models must
            be built, ``judge_model`` and ``embedding_model``.
        settings: Supplies ``ragas_metric_timeout_seconds`` and is forwarded to
            ``build_models`` when models must be built.
        llm: Optional pre-built judge LLM.
        embeddings: Optional pre-built embedding model.

    Returns:
        A ``MetricPipeline`` over the requested metrics.

    Raises:
        KeyError: If ``scenario.metrics`` contains an unsupported metric name.

    Note:
        ``llm`` and ``embeddings`` are reused only when BOTH are provided; if
        either is ``None``, both are rebuilt from ``scenario`` + ``settings``.
    """
    # Factories instead of instances: metrics that were not requested are never
    # constructed. Each factory takes (judge_llm, embedder).
    factories: dict[str, Any] = {
        "faithfulness": lambda judge, embedder: Faithfulness(llm=judge),
        "answer_relevancy": lambda judge, embedder: AnswerRelevancy(
            llm=judge, embeddings=embedder
        ),
        "context_recall": lambda judge, embedder: ContextRecall(llm=judge),
        "context_precision": lambda judge, embedder: ContextPrecision(llm=judge),
        # Robustness / end-to-end metrics (architecture design doc, section 10.2).
        # NOTE(review): no ``mode`` is passed to NoiseSensitivity or
        # FactualCorrectness, so the library defaults apply (expected to be
        # 'relevant' and 'f1' respectively) — confirm against the installed
        # ragas version.
        "noise_sensitivity": lambda judge, embedder: NoiseSensitivity(llm=judge),
        "factual_correctness": lambda judge, embedder: FactualCorrectness(llm=judge),
        # SemanticSimilarity is constructed with embeddings only (no LLM).
        "semantic_similarity": lambda judge, embedder: SemanticSimilarity(
            embeddings=embedder
        ),
    }

    # Materialize once: the names are iterated twice (validation, construction).
    requested = list(scenario.metrics)
    unknown = [name for name in requested if name not in factories]
    if unknown:
        # KeyError keeps the exception type callers already see for bad names.
        raise KeyError(
            f"Unsupported metric name(s): {', '.join(map(repr, unknown))}. "
            f"Supported metrics: {', '.join(sorted(factories))}"
        )

    if llm is None or embeddings is None:
        llm, embeddings = build_models(
            scenario.judge_model,
            scenario.embedding_model,
            settings,
        )

    return MetricPipeline(
        metrics={name: factories[name](llm, embeddings) for name in requested},
        metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
    )