From e4d4e4968b208e3c41370e0f28475d2288a0e108 Mon Sep 17 00:00:00 2001 From: wangwei Date: Mon, 22 Jun 2026 15:03:43 +0800 Subject: [PATCH] feat: add InlineScorer service with LLM client caching Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/webapp/test_score_api.py | 59 +++++++++++++++++ webapp/services/inline_scorer.py | 109 +++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 webapp/services/inline_scorer.py diff --git a/tests/webapp/test_score_api.py b/tests/webapp/test_score_api.py index 8f52d5c..8076208 100644 --- a/tests/webapp/test_score_api.py +++ b/tests/webapp/test_score_api.py @@ -126,3 +126,62 @@ class TestScoreResponse: assert resp.scores["faithfulness"] == 0.85 assert resp.scores["answer_relevancy"] is None assert resp.latency_ms == 1200 + + +class TestInlineScorer: + def test_score_returns_dict_with_requested_metrics(self): + """InlineScorer.score returns a dict keyed by the requested metrics.""" + from unittest.mock import AsyncMock, MagicMock, patch + from webapp.services.inline_scorer import InlineScorer + from rag_eval.settings import EvaluationSettings + + mock_score = MagicMock() + mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8} + mock_score.error = "" + + mock_pipeline = MagicMock() + mock_pipeline.score_sample = AsyncMock(return_value=mock_score) + + with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())): + with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline): + with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}): + scorer = InlineScorer() + result = scorer.score( + question="q", answer="a", + contexts=["ctx1"], + ground_truth=None, + metrics=["faithfulness", "answer_relevancy"], + judge_model="test-model", + embedding_model="test-embed", + settings=EvaluationSettings(_env_file=None), + ) + assert "faithfulness" in result + assert "answer_relevancy" in result + assert result["faithfulness"] == pytest.approx(0.9) + + def test_score_converts_nan_to_none(self): + """NaN scores are converted to None in the returned dict.""" + import math + from unittest.mock import AsyncMock, MagicMock, patch + from webapp.services.inline_scorer import InlineScorer + from rag_eval.settings import EvaluationSettings + + mock_score = MagicMock() + mock_score.metrics = {"faithfulness": float("nan")} + mock_score.error = "" + + mock_pipeline = MagicMock() + mock_pipeline.score_sample = AsyncMock(return_value=mock_score) + + with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())): + with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline): + with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}): + scorer = InlineScorer() + result = scorer.score( + question="q", answer="a", contexts=["c"], + ground_truth=None, + metrics=["faithfulness"], + judge_model="m", embedding_model="e", + settings=EvaluationSettings(_env_file=None), + ) + assert result["faithfulness"] is None diff --git a/webapp/services/inline_scorer.py b/webapp/services/inline_scorer.py new file mode 100644 index 0000000..9d0d5ed --- /dev/null +++ b/webapp/services/inline_scorer.py @@ -0,0 +1,109 @@ +"""LLM-cached inline RAGAS scorer for the real-time /api/score endpoint. + +A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by +(judge_model, embedding_model), so repeated Dify Tool calls with the same +models reuse existing AsyncOpenAI connections instead of creating new ones. +""" + +from __future__ import annotations + +import asyncio +import math +import threading +from typing import Any + +from rag_eval.compat import ensure_ragas_import_compat +from rag_eval.metrics.factory import build_models +from rag_eval.metrics.pipeline import MetricPipeline +from rag_eval.settings import EvaluationSettings +from rag_eval.shared.models import NormalizedSample + +ensure_ragas_import_compat() + +from ragas.metrics.collections import ( # noqa: E402 + AnswerRelevancy, + ContextPrecision, + ContextRecall, + FactualCorrectness, + Faithfulness, + NoiseSensitivity, + SemanticSimilarity, +) + + +def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]: + """Instantiate only the RAGAS metric objects requested.""" + registry: dict[str, Any] = { + "faithfulness": Faithfulness(llm=llm), + "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings), + "context_recall": ContextRecall(llm=llm), + "context_precision": ContextPrecision(llm=llm), + "noise_sensitivity": NoiseSensitivity(llm=llm), + "factual_correctness": FactualCorrectness(llm=llm), + "semantic_similarity": SemanticSimilarity(embeddings=embeddings), + } + return {name: registry[name] for name in metrics if name in registry} + + +class InlineScorer: + """Thread-safe single-sample RAGAS scorer with LLM client caching.""" + + def __init__(self) -> None: + """Initialize the scorer cache and synchronization primitives.""" + # Cache keyed by (judge_model, embedding_model) -> (llm, embeddings) + self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {} + self._lock = threading.Lock() + + def _get_models( + self, + judge_model: str, + embedding_model: str, + settings: EvaluationSettings, + ) -> tuple[Any, Any]: + """Return cached LLM/embedding clients, building them on first use.""" + cache_key = (judge_model, embedding_model) + with self._lock: + if cache_key not in self._model_cache: + llm, embeddings = build_models(judge_model, embedding_model, settings) + self._model_cache[cache_key] = (llm, embeddings) + return self._model_cache[cache_key] + + def score( + self, + question: str, + answer: str, + contexts: list[str], + ground_truth: str | None, + metrics: list[str], + judge_model: str, + embedding_model: str, + settings: EvaluationSettings, + ) -> dict[str, float | None]: + """Score one sample synchronously and return {metric_name: score | None}.""" + llm, embeddings = self._get_models(judge_model, embedding_model, settings) + metric_instances = _build_metric_instances(metrics, llm, embeddings) + + pipeline = MetricPipeline( + metrics=metric_instances, + metric_timeout_seconds=settings.ragas_metric_timeout_seconds, + ) + + sample = NormalizedSample( + sample_id="inline-score", + question=question, + answer=answer, + contexts=contexts, + ground_truth=ground_truth or "", + ) + + metric_score = asyncio.run(pipeline.score_sample(sample)) + + # Convert NaN and Inf into None for clean JSON output. + return { + name: (None if math.isnan(value) or math.isinf(value) else round(value, 4)) + for name, value in metric_score.metrics.items() + } + + +# Module-level singleton shared by FastAPI routes. +inline_scorer = InlineScorer()