siemens_ragas/rag_eval/metrics/factory.py

"""Factories for OpenAI-backed RAGAS models and metric pipelines."""

from __future__ import annotations

from typing import Any

from openai import AsyncOpenAI

from rag_eval.compat import ensure_ragas_import_compat
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import Scenario

ensure_ragas_import_compat()

from ragas.embeddings.base import embedding_factory
from ragas.llms import llm_factory
from ragas.metrics.collections import (
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
    FactualCorrectness,
    Faithfulness,
    NoiseSensitivity,
    SemanticSimilarity,
)

from .pipeline import MetricPipeline


def build_models(
    judge_model: str,
    embedding_model: str,
    settings: EvaluationSettings,
) -> tuple[Any, Any]:
    """Construct the judge LLM and the embedding model used by RAGAS metrics.

    One ``AsyncOpenAI`` client is created from ``settings.openai_client_kwargs``
    and handed to both factories, so the two models share that client.

    Args:
        judge_model: Model name passed to ``llm_factory``.
        embedding_model: Model name passed to ``embedding_factory``.
        settings: Provides the keyword arguments for the OpenAI client.

    Returns:
        A ``(llm, embeddings)`` pair, in that order.
    """
    openai_client = AsyncOpenAI(**settings.openai_client_kwargs)
    return (
        llm_factory(judge_model, client=openai_client),
        embedding_factory(
            provider="openai",
            model=embedding_model,
            client=openai_client,
        ),
    )


def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
    llm: Any | None = None,
    embeddings: Any | None = None,
) -> MetricPipeline:
    """Build a metric pipeline containing only the metrics requested by the scenario.

    Any model the caller passes in is reused as-is. A model that is left as
    ``None`` is created via ``build_models`` from ``scenario.judge_model``,
    ``scenario.embedding_model`` and ``settings``.

    Args:
        scenario: Supplies ``metrics`` (the metric names to include) and the
            model names used when a model has to be built.
        settings: Supplies ``ragas_metric_timeout_seconds`` and is forwarded to
            ``build_models`` when a model is missing.
        llm: Optional pre-built judge LLM.
        embeddings: Optional pre-built embedding model.

    Returns:
        A ``MetricPipeline`` holding the requested metrics keyed by name.

    Raises:
        KeyError: If ``scenario.metrics`` contains a name that is not in the
            registry below.
    """
    if llm is None or embeddings is None:
        built_llm, built_embeddings = build_models(
            scenario.judge_model,
            scenario.embedding_model,
            settings,
        )
        # Fill in only what the caller left out, so a caller-supplied model is
        # never silently replaced by a freshly built one.
        if llm is None:
            llm = built_llm
        if embeddings is None:
            embeddings = built_embeddings

    # Build the full registry once, then slice it by configured metric names.
    registry: dict[str, Any] = {
        "faithfulness": Faithfulness(llm=llm),
        "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": ContextRecall(llm=llm),
        "context_precision": ContextPrecision(llm=llm),
        # Robustness / end-to-end metrics (architecture design doc, section 10.2).
        # NOTE(review): no ``mode`` is passed to the two metrics below, so they use
        # the library defaults - presumably mode='relevant' for NoiseSensitivity
        # and mode='f1' for FactualCorrectness; confirm against the installed
        # RAGAS version.
        "noise_sensitivity": NoiseSensitivity(llm=llm),
        "factual_correctness": FactualCorrectness(llm=llm),
        # SemanticSimilarity is constructed from the embedding model only.
        "semantic_similarity": SemanticSimilarity(embeddings=embeddings),
    }

    try:
        selected = {name: registry[name] for name in scenario.metrics}
    except KeyError as exc:
        # Same exception type as a plain lookup failure, but the message tells
        # the user which names are actually available.
        raise KeyError(
            f"Unknown metric {exc.args[0]!r}; supported metrics: {sorted(registry)}"
        ) from None

    return MetricPipeline(
        metrics=selected,
        metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
    )
first commit 2026-06-12 14:02:15 +08:00			`"""Factories for OpenAI-backed RAGAS models and metric pipelines."""`

			`from __future__ import annotations`

			`from typing import Any`

			`from openai import AsyncOpenAI`

			`from rag_eval.compat import ensure_ragas_import_compat`
			`from rag_eval.settings import EvaluationSettings`
			`from rag_eval.shared.models import Scenario`

			`ensure_ragas_import_compat()`

			`from ragas.embeddings.base import embedding_factory`
			`from ragas.llms import llm_factory`
			`from ragas.metrics.collections import (`
			`AnswerRelevancy,`
			`ContextPrecision,`
			`ContextRecall,`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`FactualCorrectness,`
first commit 2026-06-12 14:02:15 +08:00			`Faithfulness,`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`NoiseSensitivity,`
			`SemanticSimilarity,`
first commit 2026-06-12 14:02:15 +08:00			`)`

			`from .pipeline import MetricPipeline`


			`def build_models(`
			`judge_model: str,`
			`embedding_model: str,`
			`settings: EvaluationSettings,`
			`) -> tuple[Any, Any]:`
			`"""Create the LLM and embedding clients required by the selected RAGAS metrics."""`
			`client = AsyncOpenAI(**settings.openai_client_kwargs)`
			`llm = llm_factory(judge_model, client=client)`
			`embeddings = embedding_factory(provider="openai", model=embedding_model, client=client)`
			`return llm, embeddings`


			`def build_metric_pipeline(`
			`scenario: Scenario,`
			`settings: EvaluationSettings,`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`llm: Any \| None = None,`
			`embeddings: Any \| None = None,`
first commit 2026-06-12 14:02:15 +08:00			`) -> MetricPipeline:`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`"""Build a metric pipeline containing only the metrics requested by the scenario.`

			`If llm and embeddings are provided (pre-built by the caller), they are reused.`
			`Otherwise, new instances are created from scenario + settings.`
			`"""`
			`if llm is None or embeddings is None:`
			`llm, embeddings = build_models(`
			`scenario.judge_model,`
			`scenario.embedding_model,`
			`settings,`
			`)`

first commit 2026-06-12 14:02:15 +08:00			`# Build the full registry once, then slice it by configured metric names.`
			`registry: dict[str, Any] = {`
			`"faithfulness": Faithfulness(llm=llm),`
			`"answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),`
			`"context_recall": ContextRecall(llm=llm),`
			`"context_precision": ContextPrecision(llm=llm),`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`# Robustness / end-to-end metrics (架构设计 §10.2).`
			`# NoiseSensitivity mode='relevant': sensitivity to noise from relevant contexts.`
			`"noise_sensitivity": NoiseSensitivity(llm=llm),`
			`# FactualCorrectness mode='f1': balances claim precision and recall vs. ground truth.`
			`"factual_correctness": FactualCorrectness(llm=llm),`
			`# SemanticSimilarity: embedding cosine between answer and ground truth (no LLM call).`
			`"semantic_similarity": SemanticSimilarity(embeddings=embeddings),`
first commit 2026-06-12 14:02:15 +08:00			`}`
			`return MetricPipeline(`
			`metrics={name: registry[name] for name in scenario.metrics},`
			`metric_timeout_seconds=settings.ragas_metric_timeout_seconds,`
			`)`