siemens_ragas/tests/webapp/test_score_api.py

"""Tests for POST /api/score endpoint."""
from __future__ import annotations

import pytest
from pydantic import ValidationError

from webapp.models import ScoreRequest, ScoreResponse


class TestScoreRequest:
    """Checks for ``ScoreRequest``: defaults, context splitting, validation and metric filtering."""

    def test_minimal_valid_request(self):
        """Only required fields — question, answer, contexts."""
        required = {
            "question": "What is CT?",
            "answer": "CT is imaging.",
            "contexts": "CT uses X-rays.",
        }
        request = ScoreRequest(**required)

        # Supplied values are stored unchanged.
        assert request.question == "What is CT?"
        assert request.contexts == "CT uses X-rays."

        # Fields that were not supplied must carry these defaults.
        assert request.ground_truth is None
        assert request.context_separator == " |||| "
        expected_default_metrics = [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ]
        assert request.metrics == expected_default_metrics

    def test_contexts_split_by_separator(self):
        """contexts_as_list() splits on context_separator."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="ctx1 |||| ctx2 |||| ctx3",
            context_separator=" |||| ",
        )
        pieces = request.contexts_as_list()
        assert pieces == ["ctx1", "ctx2", "ctx3"]

    def test_contexts_split_custom_separator(self):
        """A caller-supplied separator is the one used for splitting."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="a---b---c",
            context_separator="---",
        )
        pieces = request.contexts_as_list()
        assert pieces == ["a", "b", "c"]

    def test_contexts_split_single_item(self):
        """A contexts string without the separator yields a one-element list."""
        request = ScoreRequest(question="q", answer="a", contexts="only one")
        pieces = request.contexts_as_list()
        assert pieces == ["only one"]

    def test_missing_question_raises(self):
        """Leaving out ``question`` is rejected with a ValidationError."""
        without_question = {"answer": "a", "contexts": "c"}
        with pytest.raises(ValidationError):
            ScoreRequest(**without_question)

    def test_missing_answer_raises(self):
        """Leaving out ``answer`` is rejected with a ValidationError."""
        without_answer = {"question": "q", "contexts": "c"}
        with pytest.raises(ValidationError):
            ScoreRequest(**without_answer)

    def test_missing_contexts_raises(self):
        """Leaving out ``contexts`` is rejected with a ValidationError."""
        without_contexts = {"question": "q", "answer": "a"}
        with pytest.raises(ValidationError):
            ScoreRequest(**without_contexts)

    def test_custom_metrics_accepted(self):
        """An explicit metrics list replaces the default one."""
        chosen = ["faithfulness"]
        request = ScoreRequest(question="q", answer="a", contexts="c", metrics=chosen)
        assert request.metrics == ["faithfulness"]

    def test_invalid_metric_name_raises(self):
        """An unrecognised metric name is rejected with a ValidationError."""
        payload = {
            "question": "q",
            "answer": "a",
            "contexts": "c",
            "metrics": ["not_a_metric"],
        }
        with pytest.raises(ValidationError):
            ScoreRequest(**payload)

    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
        """Without ground_truth, GT-dependent metrics are excluded."""
        needs_ground_truth = (
            "context_recall",
            "factual_correctness",
            "semantic_similarity",
            "noise_sensitivity",
        )
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            metrics=["faithfulness", *needs_ground_truth],
        )
        kept = request.effective_metrics()

        assert "faithfulness" in kept
        for metric_name in needs_ground_truth:
            assert metric_name not in kept

    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
        """With ground_truth supplied, the requested metrics come back unchanged and in order."""
        requested = ["faithfulness", "context_recall", "factual_correctness"]
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            ground_truth="gt",
            metrics=requested,
        )
        kept = request.effective_metrics()
        assert kept == [
            "faithfulness",
            "context_recall",
            "factual_correctness",
        ]


class TestScoreResponse:
    """Construction check for the ``ScoreResponse`` model."""

    def test_score_response_structure(self):
        """Per-metric scores (including a ``None`` entry) and latency are readable after construction."""
        payload = {
            "scores": {"faithfulness": 0.85, "answer_relevancy": None},
            "weighted_score": 0.85,
            "latency_ms": 1200,
        }
        response = ScoreResponse(**payload)

        assert response.scores["faithfulness"] == 0.85
        assert response.scores["answer_relevancy"] is None
        assert response.latency_ms == 1200


class TestInlineScorer:
    """Tests for ``InlineScorer.score`` with model building and the metric pipeline mocked out."""

    @staticmethod
    def _score_with_stubbed_pipeline(metric_values, **score_kwargs):
        """Call ``InlineScorer().score`` with its collaborators replaced by mocks.

        Args:
            metric_values: Mapping exposed as ``.metrics`` on the object that
                the mocked ``score_sample`` coroutine resolves to.
            **score_kwargs: Keyword arguments forwarded to
                ``InlineScorer.score``. ``settings`` is supplied here and must
                not be passed by the caller.

        Returns:
            Whatever ``InlineScorer.score`` returns for the stubbed pipeline.
        """
        # Kept as function-local imports, matching how these tests were
        # originally written.
        from unittest.mock import AsyncMock, MagicMock, patch
        from webapp.services.inline_scorer import InlineScorer
        from rag_eval.settings import EvaluationSettings

        sample_score = MagicMock()
        sample_score.metrics = metric_values
        sample_score.error = ""

        pipeline = MagicMock()
        pipeline.score_sample = AsyncMock(return_value=sample_score)

        # Build the three patchers up front so they can share one flat ``with``
        # statement instead of three nested ones.
        models_patch = patch(
            "webapp.services.inline_scorer.build_models",
            return_value=(MagicMock(), MagicMock()),
        )
        pipeline_patch = patch(
            "webapp.services.inline_scorer.MetricPipeline",
            return_value=pipeline,
        )
        metric_instances_patch = patch(
            "webapp.services.inline_scorer._build_metric_instances",
            return_value={},
        )

        with models_patch, pipeline_patch, metric_instances_patch:
            scorer = InlineScorer()
            return scorer.score(
                **score_kwargs,
                # NOTE(review): ``_env_file=None`` presumably stops a local
                # .env file from influencing the settings — confirm.
                settings=EvaluationSettings(_env_file=None),
            )

    def test_score_returns_dict_with_requested_metrics(self):
        """InlineScorer.score returns a dict keyed by the requested metrics."""
        result = self._score_with_stubbed_pipeline(
            {"faithfulness": 0.9, "answer_relevancy": 0.8},
            question="q",
            answer="a",
            contexts=["ctx1"],
            ground_truth=None,
            metrics=["faithfulness", "answer_relevancy"],
            judge_model="test-model",
            embedding_model="test-embed",
        )
        assert "faithfulness" in result
        assert "answer_relevancy" in result
        assert result["faithfulness"] == pytest.approx(0.9)

    def test_score_converts_nan_to_none(self):
        """NaN scores are converted to None in the returned dict."""
        result = self._score_with_stubbed_pipeline(
            {"faithfulness": float("nan")},
            question="q",
            answer="a",
            contexts=["c"],
            ground_truth=None,
            metrics=["faithfulness"],
            judge_model="m",
            embedding_model="e",
        )
        assert result["faithfulness"] is None
feat: add ScoreRequest/ScoreResponse models and SCORE_API_TOKEN setting Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> 2026-06-22 15:00:05 +08:00			`"""Tests for POST /api/score endpoint."""`
			`from __future__ import annotations`

			`import pytest`
			`from pydantic import ValidationError`

			`from webapp.models import ScoreRequest, ScoreResponse`


			`class TestScoreRequest:`
			`def test_minimal_valid_request(self):`
			`"""Only required fields — question, answer, contexts."""`
			`req = ScoreRequest(`
			`question="What is CT?",`
			`answer="CT is imaging.",`
			`contexts="CT uses X-rays.",`
			`)`
			`assert req.question == "What is CT?"`
			`assert req.contexts == "CT uses X-rays."`
			`assert req.ground_truth is None`
			`assert req.context_separator == " \|\|\|\| "`
			`assert req.metrics == [`
			`"faithfulness",`
			`"answer_relevancy",`
			`"context_recall",`
			`"context_precision",`
			`]`

			`def test_contexts_split_by_separator(self):`
			`"""contexts_as_list() splits on context_separator."""`
			`req = ScoreRequest(`
			`question="q",`
			`answer="a",`
			`contexts="ctx1 \|\|\|\| ctx2 \|\|\|\| ctx3",`
			`context_separator=" \|\|\|\| ",`
			`)`
			`assert req.contexts_as_list() == ["ctx1", "ctx2", "ctx3"]`

			`def test_contexts_split_custom_separator(self):`
			`req = ScoreRequest(`
			`question="q",`
			`answer="a",`
			`contexts="a---b---c",`
			`context_separator="---",`
			`)`
			`assert req.contexts_as_list() == ["a", "b", "c"]`

			`def test_contexts_split_single_item(self):`
			`req = ScoreRequest(question="q", answer="a", contexts="only one")`
			`assert req.contexts_as_list() == ["only one"]`

			`def test_missing_question_raises(self):`
			`with pytest.raises(ValidationError):`
			`ScoreRequest(answer="a", contexts="c") # type: ignore[call-arg]`

			`def test_missing_answer_raises(self):`
			`with pytest.raises(ValidationError):`
			`ScoreRequest(question="q", contexts="c") # type: ignore[call-arg]`

			`def test_missing_contexts_raises(self):`
			`with pytest.raises(ValidationError):`
			`ScoreRequest(question="q", answer="a") # type: ignore[call-arg]`

			`def test_custom_metrics_accepted(self):`
			`req = ScoreRequest(`
			`question="q",`
			`answer="a",`
			`contexts="c",`
			`metrics=["faithfulness"],`
			`)`
			`assert req.metrics == ["faithfulness"]`

			`def test_invalid_metric_name_raises(self):`
			`with pytest.raises(ValidationError):`
			`ScoreRequest(`
			`question="q",`
			`answer="a",`
			`contexts="c",`
			`metrics=["not_a_metric"],`
			`)`

			`def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):`
			`"""Without ground_truth, GT-dependent metrics are excluded."""`
			`req = ScoreRequest(`
			`question="q",`
			`answer="a",`
			`contexts="c",`
			`metrics=[`
			`"faithfulness",`
			`"context_recall",`
			`"factual_correctness",`
			`"semantic_similarity",`
			`"noise_sensitivity",`
			`],`
			`)`
			`effective = req.effective_metrics()`
			`assert "faithfulness" in effective`
			`assert "context_recall" not in effective`
			`assert "factual_correctness" not in effective`
			`assert "semantic_similarity" not in effective`
			`assert "noise_sensitivity" not in effective`

			`def test_effective_metrics_keeps_all_when_ground_truth_present(self):`
			`req = ScoreRequest(`
			`question="q",`
			`answer="a",`
			`contexts="c",`
			`ground_truth="gt",`
			`metrics=["faithfulness", "context_recall", "factual_correctness"],`
			`)`
			`effective = req.effective_metrics()`
			`assert effective == [`
			`"faithfulness",`
			`"context_recall",`
			`"factual_correctness",`
			`]`


			`class TestScoreResponse:`
			`def test_score_response_structure(self):`
			`resp = ScoreResponse(`
			`scores={"faithfulness": 0.85, "answer_relevancy": None},`
			`weighted_score=0.85,`
			`latency_ms=1200,`
			`)`
			`assert resp.scores["faithfulness"] == 0.85`
			`assert resp.scores["answer_relevancy"] is None`
			`assert resp.latency_ms == 1200`
feat: add InlineScorer service with LLM client caching Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> 2026-06-22 15:03:43 +08:00

			`class TestInlineScorer:`
			`def test_score_returns_dict_with_requested_metrics(self):`
			`"""InlineScorer.score returns a dict keyed by the requested metrics."""`
			`from unittest.mock import AsyncMock, MagicMock, patch`
			`from webapp.services.inline_scorer import InlineScorer`
			`from rag_eval.settings import EvaluationSettings`

			`mock_score = MagicMock()`
			`mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8}`
			`mock_score.error = ""`

			`mock_pipeline = MagicMock()`
			`mock_pipeline.score_sample = AsyncMock(return_value=mock_score)`

			`with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):`
			`with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):`
			`with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):`
			`scorer = InlineScorer()`
			`result = scorer.score(`
			`question="q", answer="a",`
			`contexts=["ctx1"],`
			`ground_truth=None,`
			`metrics=["faithfulness", "answer_relevancy"],`
			`judge_model="test-model",`
			`embedding_model="test-embed",`
			`settings=EvaluationSettings(_env_file=None),`
			`)`
			`assert "faithfulness" in result`
			`assert "answer_relevancy" in result`
			`assert result["faithfulness"] == pytest.approx(0.9)`

			`def test_score_converts_nan_to_none(self):`
			`"""NaN scores are converted to None in the returned dict."""`
			`import math`
			`from unittest.mock import AsyncMock, MagicMock, patch`
			`from webapp.services.inline_scorer import InlineScorer`
			`from rag_eval.settings import EvaluationSettings`

			`mock_score = MagicMock()`
			`mock_score.metrics = {"faithfulness": float("nan")}`
			`mock_score.error = ""`

			`mock_pipeline = MagicMock()`
			`mock_pipeline.score_sample = AsyncMock(return_value=mock_score)`

			`with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):`
			`with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):`
			`with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):`
			`scorer = InlineScorer()`
			`result = scorer.score(`
			`question="q", answer="a", contexts=["c"],`
			`ground_truth=None,`
			`metrics=["faithfulness"],`
			`judge_model="m", embedding_model="e",`
			`settings=EvaluationSettings(_env_file=None),`
			`)`
			`assert result["faithfulness"] is None`