- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing
chore(weighted-score): comment out 综合加权得分 (overall weighted score) display and computation
- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
342 lines
13 KiB
Python
342 lines
13 KiB
Python
"""Tests for POST /api/score endpoint."""
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
from pydantic import ValidationError
|
|
|
|
from webapp.models import ScoreRequest, ScoreResponse
|
|
|
|
|
|
class TestScoreRequest:
    """Checks on ``ScoreRequest``: defaults, context splitting, validation, metric filtering."""

    def test_minimal_valid_request(self):
        """A request built from question, answer and contexts exposes the default settings."""
        expected_default_metrics = [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ]
        request = ScoreRequest(
            question="What is CT?",
            answer="CT is imaging.",
            contexts="CT uses X-rays.",
        )
        assert request.question == "What is CT?"
        assert request.contexts == "CT uses X-rays."
        assert request.ground_truth is None
        assert request.context_separator == " |||| "
        assert request.metrics == expected_default_metrics

    def test_contexts_split_by_separator(self):
        """``contexts_as_list()`` breaks the contexts string on ``context_separator``."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="ctx1 |||| ctx2 |||| ctx3",
            context_separator=" |||| ",
        )
        pieces = request.contexts_as_list()
        assert pieces == ["ctx1", "ctx2", "ctx3"]

    def test_contexts_split_custom_separator(self):
        """A caller-supplied separator is honoured when splitting contexts."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="a---b---c",
            context_separator="---",
        )
        pieces = request.contexts_as_list()
        assert pieces == ["a", "b", "c"]

    def test_contexts_split_single_item(self):
        """A contexts string without the separator yields a one-element list."""
        request = ScoreRequest(question="q", answer="a", contexts="only one")
        pieces = request.contexts_as_list()
        assert pieces == ["only one"]

    def test_missing_question_raises(self):
        """Omitting ``question`` is rejected with a ``ValidationError``."""
        fields_without_question = {"answer": "a", "contexts": "c"}
        with pytest.raises(ValidationError):
            ScoreRequest(**fields_without_question)

    def test_missing_answer_raises(self):
        """Omitting ``answer`` is rejected with a ``ValidationError``."""
        fields_without_answer = {"question": "q", "contexts": "c"}
        with pytest.raises(ValidationError):
            ScoreRequest(**fields_without_answer)

    def test_missing_contexts_defaults_to_none(self):
        """``contexts`` is optional: it defaults to ``None`` and splits to an empty list."""
        request = ScoreRequest(question="q", answer="a")
        assert request.contexts is None
        assert request.contexts_as_list() == []

    def test_custom_metrics_accepted(self):
        """An explicit ``metrics`` list is stored unchanged."""
        chosen_metrics = ["faithfulness"]
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            metrics=chosen_metrics,
        )
        assert request.metrics == ["faithfulness"]

    def test_invalid_metric_name_raises(self):
        """An unknown metric name is rejected with a ``ValidationError``."""
        fields_with_bad_metric = {
            "question": "q",
            "answer": "a",
            "contexts": "c",
            "metrics": ["not_a_metric"],
        }
        with pytest.raises(ValidationError):
            ScoreRequest(**fields_with_bad_metric)

    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
        """With no ground_truth, the ground-truth-dependent metrics are filtered out."""
        ground_truth_dependent = (
            "context_recall",
            "factual_correctness",
            "semantic_similarity",
            "noise_sensitivity",
        )
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            metrics=["faithfulness", *ground_truth_dependent],
        )
        kept = request.effective_metrics()
        assert "faithfulness" in kept
        for metric_name in ground_truth_dependent:
            assert metric_name not in kept

    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
        """With ground_truth supplied, the requested metrics come back in the same order."""
        requested = ["faithfulness", "context_recall", "factual_correctness"]
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            ground_truth="gt",
            metrics=list(requested),
        )
        kept = request.effective_metrics()
        assert kept == requested

    def test_effective_metrics_drops_context_dependent_when_contexts_absent(self):
        """With no contexts, the context-dependent metrics are filtered out."""
        request = ScoreRequest(
            question="q",
            answer="a",
            metrics=["faithfulness", "answer_relevancy", "context_precision"],
        )
        kept = request.effective_metrics()
        assert "answer_relevancy" in kept
        for metric_name in ("faithfulness", "context_precision"):
            assert metric_name not in kept
|
|
|
|
|
|
class TestScoreResponse:
    """Checks on the ``ScoreResponse`` model."""

    def test_score_response_structure(self):
        """Per-metric scores (including a ``None`` entry) and latency read back as given."""
        payload = {
            "scores": {"faithfulness": 0.85, "answer_relevancy": None},
            "weighted_score": 0.85,
            "latency_ms": 1200,
        }
        response = ScoreResponse(**payload)
        per_metric = response.scores
        assert per_metric["faithfulness"] == 0.85
        assert per_metric["answer_relevancy"] is None
        assert response.latency_ms == 1200
|
|
|
|
|
|
class TestInlineScorer:
    """Unit tests for ``InlineScorer.score`` with model building and the pipeline mocked."""

    @staticmethod
    def _score_with_mocked_pipeline(raw_metrics, **score_kwargs):
        """Call ``InlineScorer().score`` against a stubbed pipeline and return its result.

        ``build_models``, ``MetricPipeline`` and ``_build_metric_instances`` are
        patched inside ``webapp.services.inline_scorer``; the stub pipeline's
        ``score_sample`` resolves to an object whose ``metrics`` attribute is
        *raw_metrics* and whose ``error`` is the empty string.

        Args:
            raw_metrics: Mapping the stubbed pipeline reports as the sample's metrics.
            **score_kwargs: Forwarded to ``InlineScorer.score`` together with
                ``ground_truth=None`` and default ``EvaluationSettings``.
        """
        from unittest.mock import AsyncMock, MagicMock, patch

        from webapp.services.inline_scorer import InlineScorer

        from rag_eval.settings import EvaluationSettings

        sample_score = MagicMock()
        sample_score.metrics = raw_metrics
        sample_score.error = ""

        pipeline = MagicMock()
        pipeline.score_sample = AsyncMock(return_value=sample_score)

        # Build the patchers first so a single flat ``with`` can enter them in
        # the same order as before (models, pipeline, metric instances).
        models_patch = patch(
            "webapp.services.inline_scorer.build_models",
            return_value=(MagicMock(), MagicMock()),
        )
        pipeline_patch = patch(
            "webapp.services.inline_scorer.MetricPipeline",
            return_value=pipeline,
        )
        instances_patch = patch(
            "webapp.services.inline_scorer._build_metric_instances",
            return_value={},
        )
        with models_patch, pipeline_patch, instances_patch:
            scorer = InlineScorer()
            return scorer.score(
                ground_truth=None,
                settings=EvaluationSettings(_env_file=None),
                **score_kwargs,
            )

    def test_score_returns_dict_with_requested_metrics(self):
        """InlineScorer.score returns a dict keyed by the requested metrics."""
        result = self._score_with_mocked_pipeline(
            {"faithfulness": 0.9, "answer_relevancy": 0.8},
            question="q",
            answer="a",
            contexts=["ctx1"],
            metrics=["faithfulness", "answer_relevancy"],
            judge_model="test-model",
            embedding_model="test-embed",
        )
        assert "faithfulness" in result
        assert "answer_relevancy" in result
        assert result["faithfulness"] == pytest.approx(0.9)

    def test_score_converts_nan_to_none(self):
        """NaN scores are converted to None in the returned dict."""
        result = self._score_with_mocked_pipeline(
            {"faithfulness": float("nan")},
            question="q",
            answer="a",
            contexts=["c"],
            metrics=["faithfulness"],
            judge_model="m",
            embedding_model="e",
        )
        assert result["faithfulness"] is None
|
|
|
|
|
|
# ── Endpoint integration tests ────────────────────────────────────────────────
|
|
|
|
@pytest.fixture()
def client(monkeypatch):
    """Return a ``TestClient`` for the app with ``inline_scorer`` replaced by a mock.

    The mock's ``score`` always returns ``{"faithfulness": 0.85,
    "answer_relevancy": 0.90}``. ``monkeypatch`` restores the real scorer
    after the test.
    """
    # Imported locally so the fixture does not depend on a module-level import
    # that appears later in the file; the sibling endpoint tests do the same.
    from fastapi.testclient import TestClient
    from unittest.mock import MagicMock

    import webapp.api.score as score_mod

    mock_scorer = MagicMock()
    mock_scorer.score.return_value = {
        "faithfulness": 0.85,
        "answer_relevancy": 0.90,
    }
    monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)

    # Import the app factory only after the scorer has been swapped out,
    # matching the original ordering.
    from webapp.server import create_app

    return TestClient(create_app())
|
|
|
|
|
|
from fastapi.testclient import TestClient
|
|
|
|
|
|
class TestScoreEndpoint:
    """Integration tests for ``POST /api/score`` with the scorer mocked out."""

    @staticmethod
    def _client_for(monkeypatch, mock_scorer, api_token=None):
        """Build a ``TestClient`` whose app uses *mock_scorer* and, optionally, a token.

        Args:
            monkeypatch: pytest's ``monkeypatch`` fixture, used so every patch
                is undone after the test.
            mock_scorer: Object installed as ``webapp.api.score.inline_scorer``.
            api_token: When not ``None``, ``webapp.api.score._get_settings`` is
                replaced by a function returning settings whose
                ``score_api_token`` equals this value.
        """
        import webapp.api.score as score_mod

        if api_token is not None:
            from rag_eval.settings import EvaluationSettings

            settings = EvaluationSettings(_env_file=None)
            # NOTE(review): the attribute is set via object.__setattr__,
            # presumably to bypass the settings model's own attribute
            # handling — confirm against EvaluationSettings.
            object.__setattr__(settings, "score_api_token", api_token)
            monkeypatch.setattr(score_mod, "_get_settings", lambda: settings)

        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)

        # The app is created only after all patches are in place.
        from webapp.server import create_app
        from fastapi.testclient import TestClient

        return TestClient(create_app())

    def test_post_score_returns_200(self, client):
        """A valid request yields 200 with ``scores`` and ``latency_ms`` in the body."""
        resp = client.post("/api/score", json={
            "question": "What is CT?",
            "answer": "CT is imaging.",
            "contexts": "CT uses X-rays.",
        })
        assert resp.status_code == 200
        data = resp.json()
        assert "scores" in data
        assert "latency_ms" in data
        assert data["scores"]["faithfulness"] == pytest.approx(0.85)

    def test_weighted_score_computed(self, client):
        """``weighted_score`` is present in the response but is currently null.

        The test name predates the change: the overall weighted score is
        temporarily disabled, so the endpoint always returns null for it.
        """
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
        })
        assert resp.status_code == 200
        data = resp.json()
        # The overall weighted score is temporarily disabled; always null.
        assert data["weighted_score"] is None

    def test_missing_required_fields_returns_422(self, client):
        """A body containing only ``question`` is rejected with 422."""
        resp = client.post("/api/score", json={"question": "q"})
        assert resp.status_code == 422

    def test_invalid_metric_name_returns_422(self, client):
        """An unknown metric name in ``metrics`` is rejected with 422."""
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["not_a_metric"],
        })
        assert resp.status_code == 422

    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
        """Without ground_truth, ``context_recall`` is listed in ``skipped_metrics``."""
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["faithfulness", "context_recall"],
        })
        assert resp.status_code == 200
        data = resp.json()
        assert "context_recall" in data["skipped_metrics"]

    def test_contexts_split_on_separator(self, monkeypatch):
        """contexts string is split before passing to scorer."""
        from unittest.mock import MagicMock

        calls = []

        def capture(**kwargs):
            # Record the ``contexts`` keyword the endpoint hands to the scorer.
            calls.append(kwargs.get("contexts", []))
            return {"faithfulness": 0.9}

        mock_scorer = MagicMock()
        # ``capture`` is used directly as the side effect; no wrapper needed.
        mock_scorer.score.side_effect = capture

        tc = self._client_for(monkeypatch, mock_scorer)
        tc.post("/api/score", json={
            "question": "q", "answer": "a",
            "contexts": "ctx1 |||| ctx2",
            "context_separator": " |||| ",
        })
        assert len(calls) == 1
        assert calls[0] == ["ctx1", "ctx2"]

    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
        """When SCORE_API_TOKEN is set, requests without token get 401."""
        from unittest.mock import MagicMock

        mock_scorer = MagicMock()
        mock_scorer.score.return_value = {"faithfulness": 0.9}
        tc = self._client_for(monkeypatch, mock_scorer, api_token="secret-token")

        # No auth header -> 401
        resp = tc.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
        })
        assert resp.status_code == 401

        # Correct token -> 200
        resp = tc.post(
            "/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer secret-token"},
        )
        assert resp.status_code == 200

    def test_wrong_bearer_token_returns_401(self, monkeypatch):
        """A bearer token that differs from the configured one is rejected with 401."""
        from unittest.mock import MagicMock

        mock_scorer = MagicMock()
        mock_scorer.score.return_value = {}
        tc = self._client_for(monkeypatch, mock_scorer, api_token="correct-token")

        resp = tc.post(
            "/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer wrong-token"},
        )
        assert resp.status_code == 401
|