"""Tests for POST /api/score endpoint.""" from __future__ import annotations import pytest from pydantic import ValidationError from webapp.models import ScoreRequest, ScoreResponse class TestScoreRequest: def test_minimal_valid_request(self): """Only required fields — question, answer, contexts.""" req = ScoreRequest( question="What is CT?", answer="CT is imaging.", contexts="CT uses X-rays.", ) assert req.question == "What is CT?" assert req.contexts == "CT uses X-rays." assert req.ground_truth is None assert req.context_separator == " |||| " assert req.metrics == [ "faithfulness", "answer_relevancy", "context_recall", "context_precision", ] def test_contexts_split_by_separator(self): """contexts_as_list() splits on context_separator.""" req = ScoreRequest( question="q", answer="a", contexts="ctx1 |||| ctx2 |||| ctx3", context_separator=" |||| ", ) assert req.contexts_as_list() == ["ctx1", "ctx2", "ctx3"] def test_contexts_split_custom_separator(self): req = ScoreRequest( question="q", answer="a", contexts="a---b---c", context_separator="---", ) assert req.contexts_as_list() == ["a", "b", "c"] def test_contexts_split_single_item(self): req = ScoreRequest(question="q", answer="a", contexts="only one") assert req.contexts_as_list() == ["only one"] def test_missing_question_raises(self): with pytest.raises(ValidationError): ScoreRequest(answer="a", contexts="c") # type: ignore[call-arg] def test_missing_answer_raises(self): with pytest.raises(ValidationError): ScoreRequest(question="q", contexts="c") # type: ignore[call-arg] def test_missing_contexts_raises(self): with pytest.raises(ValidationError): ScoreRequest(question="q", answer="a") # type: ignore[call-arg] def test_custom_metrics_accepted(self): req = ScoreRequest( question="q", answer="a", contexts="c", metrics=["faithfulness"], ) assert req.metrics == ["faithfulness"] def test_invalid_metric_name_raises(self): with pytest.raises(ValidationError): ScoreRequest( question="q", answer="a", contexts="c", metrics=["not_a_metric"], ) def test_effective_metrics_drops_ground_truth_dependent_when_missing(self): """Without ground_truth, GT-dependent metrics are excluded.""" req = ScoreRequest( question="q", answer="a", contexts="c", metrics=[ "faithfulness", "context_recall", "factual_correctness", "semantic_similarity", "noise_sensitivity", ], ) effective = req.effective_metrics() assert "faithfulness" in effective assert "context_recall" not in effective assert "factual_correctness" not in effective assert "semantic_similarity" not in effective assert "noise_sensitivity" not in effective def test_effective_metrics_keeps_all_when_ground_truth_present(self): req = ScoreRequest( question="q", answer="a", contexts="c", ground_truth="gt", metrics=["faithfulness", "context_recall", "factual_correctness"], ) effective = req.effective_metrics() assert effective == [ "faithfulness", "context_recall", "factual_correctness", ] class TestScoreResponse: def test_score_response_structure(self): resp = ScoreResponse( scores={"faithfulness": 0.85, "answer_relevancy": None}, weighted_score=0.85, latency_ms=1200, ) assert resp.scores["faithfulness"] == 0.85 assert resp.scores["answer_relevancy"] is None assert resp.latency_ms == 1200 class TestInlineScorer: def test_score_returns_dict_with_requested_metrics(self): """InlineScorer.score returns a dict keyed by the requested metrics.""" from unittest.mock import AsyncMock, MagicMock, patch from webapp.services.inline_scorer import InlineScorer from rag_eval.settings import EvaluationSettings mock_score = MagicMock() mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8} mock_score.error = "" mock_pipeline = MagicMock() mock_pipeline.score_sample = AsyncMock(return_value=mock_score) with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())): with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline): with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}): scorer = InlineScorer() result = scorer.score( question="q", answer="a", contexts=["ctx1"], ground_truth=None, metrics=["faithfulness", "answer_relevancy"], judge_model="test-model", embedding_model="test-embed", settings=EvaluationSettings(_env_file=None), ) assert "faithfulness" in result assert "answer_relevancy" in result assert result["faithfulness"] == pytest.approx(0.9) def test_score_converts_nan_to_none(self): """NaN scores are converted to None in the returned dict.""" import math from unittest.mock import AsyncMock, MagicMock, patch from webapp.services.inline_scorer import InlineScorer from rag_eval.settings import EvaluationSettings mock_score = MagicMock() mock_score.metrics = {"faithfulness": float("nan")} mock_score.error = "" mock_pipeline = MagicMock() mock_pipeline.score_sample = AsyncMock(return_value=mock_score) with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())): with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline): with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}): scorer = InlineScorer() result = scorer.score( question="q", answer="a", contexts=["c"], ground_truth=None, metrics=["faithfulness"], judge_model="m", embedding_model="e", settings=EvaluationSettings(_env_file=None), ) assert result["faithfulness"] is None # ── Endpoint integration tests ──────────────────────────────────────────────── @pytest.fixture() def client(monkeypatch): """TestClient with mocked InlineScorer.""" import webapp.api.score as score_mod from unittest.mock import MagicMock mock_scorer = MagicMock() mock_scorer.score.return_value = { "faithfulness": 0.85, "answer_relevancy": 0.90, } monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer) from webapp.server import create_app return TestClient(create_app()) from fastapi.testclient import TestClient class TestScoreEndpoint: def test_post_score_returns_200(self, client): resp = client.post("/api/score", json={ "question": "What is CT?", "answer": "CT is imaging.", "contexts": "CT uses X-rays.", }) assert resp.status_code == 200 data = resp.json() assert "scores" in data assert "latency_ms" in data assert data["scores"]["faithfulness"] == pytest.approx(0.85) def test_weighted_score_computed(self, client): resp = client.post("/api/score", json={ "question": "q", "answer": "a", "contexts": "c", }) assert resp.status_code == 200 data = resp.json() assert data["weighted_score"] is not None def test_missing_required_fields_returns_422(self, client): resp = client.post("/api/score", json={"question": "q"}) assert resp.status_code == 422 def test_invalid_metric_name_returns_422(self, client): resp = client.post("/api/score", json={ "question": "q", "answer": "a", "contexts": "c", "metrics": ["not_a_metric"], }) assert resp.status_code == 422 def test_skipped_metrics_returned_when_no_ground_truth(self, client): resp = client.post("/api/score", json={ "question": "q", "answer": "a", "contexts": "c", "metrics": ["faithfulness", "context_recall"], }) assert resp.status_code == 200 data = resp.json() assert "context_recall" in data["skipped_metrics"] def test_contexts_split_on_separator(self, monkeypatch): """contexts string is split before passing to scorer.""" import webapp.api.score as score_mod from unittest.mock import MagicMock calls = [] def capture(**kwargs): calls.append(kwargs.get("contexts", [])) return {"faithfulness": 0.9} mock_scorer = MagicMock() mock_scorer.score.side_effect = lambda **kw: capture(**kw) monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer) from webapp.server import create_app from fastapi.testclient import TestClient tc = TestClient(create_app()) tc.post("/api/score", json={ "question": "q", "answer": "a", "contexts": "ctx1 |||| ctx2", "context_separator": " |||| ", }) assert len(calls) == 1 assert calls[0] == ["ctx1", "ctx2"] def test_bearer_token_auth_required_when_configured(self, monkeypatch): """When SCORE_API_TOKEN is set, requests without token get 401.""" import webapp.api.score as score_mod from rag_eval.settings import EvaluationSettings from unittest.mock import MagicMock mock_settings = EvaluationSettings(_env_file=None) object.__setattr__(mock_settings, "score_api_token", "secret-token") monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings) mock_scorer = MagicMock() mock_scorer.score.return_value = {"faithfulness": 0.9} monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer) from webapp.server import create_app from fastapi.testclient import TestClient tc = TestClient(create_app()) # No auth header -> 401 resp = tc.post("/api/score", json={ "question": "q", "answer": "a", "contexts": "c", }) assert resp.status_code == 401 # Correct token -> 200 resp = tc.post("/api/score", json={"question": "q", "answer": "a", "contexts": "c"}, headers={"Authorization": "Bearer secret-token"}, ) assert resp.status_code == 200 def test_wrong_bearer_token_returns_401(self, monkeypatch): import webapp.api.score as score_mod from rag_eval.settings import EvaluationSettings from unittest.mock import MagicMock mock_settings = EvaluationSettings(_env_file=None) object.__setattr__(mock_settings, "score_api_token", "correct-token") monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings) mock_scorer = MagicMock() mock_scorer.score.return_value = {} monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer) from webapp.server import create_app from fastapi.testclient import TestClient tc = TestClient(create_app()) resp = tc.post("/api/score", json={"question": "q", "answer": "a", "contexts": "c"}, headers={"Authorization": "Bearer wrong-token"}, ) assert resp.status_code == 401