feat: add ScoreRequest/ScoreResponse models and SCORE_API_TOKEN setting

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-06-22 15:00:05 +08:00
parent 9ad6ad4ebc
commit 761faf9c42
3 changed files with 266 additions and 1 deletions

View File

@@ -52,6 +52,11 @@ class EvaluationSettings(BaseSettings):
) )
parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE") parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL") dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL")
# Optional bearer token for the /api/score endpoint, populated through the
# SCORE_API_TOKEN alias. Defaults to None; per the field description an empty
# value means the endpoint is served without authentication.
# NOTE(review): the code that enforces the token is not visible here — confirm
# that both None and "" are treated as "auth disabled".
score_api_token: str | None = Field(
default=None,
alias="SCORE_API_TOKEN",
description="Bearer token for /api/score endpoint. Empty = no auth.",
)
@property @property
def openai_client_kwargs(self) -> dict[str, str | float]: def openai_client_kwargs(self) -> dict[str, str | float]:

View File

@@ -0,0 +1,128 @@
"""Tests for the ScoreRequest/ScoreResponse models used by POST /api/score."""
from __future__ import annotations
import pytest
from pydantic import ValidationError
from webapp.models import ScoreRequest, ScoreResponse
class TestScoreRequest:
    """Validation and helper-method tests for the ScoreRequest model."""

    def test_minimal_valid_request(self):
        """Only required fields — question, answer, contexts."""
        req = ScoreRequest(
            question="What is CT?",
            answer="CT is imaging.",
            contexts="CT uses X-rays.",
        )
        assert req.question == "What is CT?"
        assert req.answer == "CT is imaging."
        assert req.contexts == "CT uses X-rays."
        assert req.ground_truth is None
        assert req.context_separator == " |||| "
        assert req.metrics == [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ]

    def test_contexts_split_by_separator(self):
        """contexts_as_list() splits on context_separator."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="ctx1 |||| ctx2 |||| ctx3",
            context_separator=" |||| ",
        )
        assert req.contexts_as_list() == ["ctx1", "ctx2", "ctx3"]

    def test_contexts_split_custom_separator(self):
        """A caller-supplied separator replaces the default one."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="a---b---c",
            context_separator="---",
        )
        assert req.contexts_as_list() == ["a", "b", "c"]

    def test_contexts_split_single_item(self):
        """A string without any separator yields a one-element list."""
        req = ScoreRequest(question="q", answer="a", contexts="only one")
        assert req.contexts_as_list() == ["only one"]

    def test_contexts_split_drops_blank_fragments(self):
        """Fragments that are empty after stripping are not returned."""
        req = ScoreRequest(
            question="q",
            answer="a",
            # Two adjacent separators produce an empty fragment in between.
            contexts="ctx1 ||||  |||| ctx2",
        )
        assert req.contexts_as_list() == ["ctx1", "ctx2"]

    def test_missing_question_raises(self):
        """question is a required field."""
        with pytest.raises(ValidationError):
            ScoreRequest(answer="a", contexts="c")  # type: ignore[call-arg]

    def test_missing_answer_raises(self):
        """answer is a required field."""
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", contexts="c")  # type: ignore[call-arg]

    def test_missing_contexts_raises(self):
        """contexts is a required field."""
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", answer="a")  # type: ignore[call-arg]

    def test_custom_metrics_accepted(self):
        """A subset of the supported metrics can be requested explicitly."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            metrics=["faithfulness"],
        )
        assert req.metrics == ["faithfulness"]

    def test_invalid_metric_name_raises(self):
        """Metric names outside the supported registry are rejected."""
        with pytest.raises(ValidationError):
            ScoreRequest(
                question="q",
                answer="a",
                contexts="c",
                metrics=["not_a_metric"],
            )

    def test_empty_metrics_list_raises(self):
        """An explicitly empty metrics list is rejected by the validator."""
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", answer="a", contexts="c", metrics=[])

    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
        """Without ground_truth, GT-dependent metrics are excluded."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            metrics=[
                "faithfulness",
                "context_recall",
                "factual_correctness",
                "semantic_similarity",
                "noise_sensitivity",
            ],
        )
        effective = req.effective_metrics()
        assert "faithfulness" in effective
        assert "context_recall" not in effective
        assert "factual_correctness" not in effective
        assert "semantic_similarity" not in effective
        assert "noise_sensitivity" not in effective

    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
        """With ground_truth supplied, the requested metrics pass through unchanged."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            ground_truth="gt",
            metrics=["faithfulness", "context_recall", "factual_correctness"],
        )
        effective = req.effective_metrics()
        assert effective == [
            "faithfulness",
            "context_recall",
            "factual_correctness",
        ]
class TestScoreResponse:
    """Structure and default-value tests for the ScoreResponse model."""

    def test_score_response_structure(self):
        """Explicit fields round-trip and optional fields fall back to defaults."""
        resp = ScoreResponse(
            scores={"faithfulness": 0.85, "answer_relevancy": None},
            weighted_score=0.85,
            latency_ms=1200,
        )
        assert resp.scores["faithfulness"] == 0.85
        assert resp.scores["answer_relevancy"] is None
        assert resp.weighted_score == 0.85
        assert resp.latency_ms == 1200
        # Fields not passed above must take their declared defaults.
        assert resp.skipped_metrics == []
        assert resp.error is None

    def test_score_response_error_payload(self):
        """An error response may carry empty scores and no weighted score."""
        resp = ScoreResponse(scores={}, latency_ms=5, error="boom")
        assert resp.scores == {}
        assert resp.weighted_score is None
        assert resp.error == "boom"

View File

@@ -5,7 +5,7 @@ from __future__ import annotations
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Any from typing import Any
from pydantic import BaseModel, ConfigDict, Field from pydantic import BaseModel, ConfigDict, Field, field_validator
def _utcnow_iso() -> str: def _utcnow_iso() -> str:
@@ -370,3 +370,135 @@ class PipelineJobResponse(BaseModel):
job_id: str = Field(description="任务唯一标识符,用于后续轮询状态。") job_id: str = Field(description="任务唯一标识符,用于后续轮询状态。")
job_name: str = Field(description="任务显示名称。") job_name: str = Field(description="任务显示名称。")
status: str = Field(default="queued", description="初始状态,通常为 queued。") status: str = Field(default="queued", description="初始状态,通常为 queued。")
# ---------------------------------------------------------------------------
# Dify real-time scoring API models
# ---------------------------------------------------------------------------

# Metrics that can only be computed when a ground_truth reference is supplied.
_GT_DEPENDENT_METRICS: frozenset[str] = frozenset(
    (
        "context_recall",
        "factual_correctness",
        "semantic_similarity",
        "noise_sensitivity",
    )
)

# Every metric name accepted by the scoring API: the ground-truth-dependent
# metrics plus the ones that need no reference answer.
_VALID_METRICS: frozenset[str] = _GT_DEPENDENT_METRICS | frozenset(
    (
        "faithfulness",
        "answer_relevancy",
        "context_precision",
    )
)

# Metrics evaluated when a request does not name any explicitly.
_DEFAULT_SCORE_METRICS: list[str] = [
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
]
class ScoreRequest(BaseModel):
    """Request body for the real-time single-sample scoring endpoint.

    Carries one question/answer pair together with its retrieved contexts
    (joined into a single string) and the list of metrics to evaluate.
    """

    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "summary": "基础评分请求",
                    "value": {
                        "question": "双源CT的时间分辨率是多少?",
                        "answer": "双源CT的单扇区时间分辨率为75ms。",
                        "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
                        "ground_truth": "双源CT单扇区时间分辨率为75ms需旋转135度。",
                        "context_separator": " |||| ",
                        "metrics": [
                            "faithfulness",
                            "answer_relevancy",
                            "context_recall",
                            "context_precision",
                        ],
                        "judge_model": "deepseek-v4-flash",
                        "embedding_model": "text-embedding-v3",
                    },
                }
            ]
        }
    )

    # The question text.
    question: str = Field(description="问题文本。")
    # The answer to be scored.
    answer: str = Field(description="待评分的回答。")
    # Retrieved context passages joined into one string with context_separator.
    contexts: str = Field(
        description="检索上下文字符串,多段之间用 context_separator 拼接。"
    )
    # Optional reference answer; metrics that need it are skipped when absent.
    ground_truth: str | None = Field(
        default=None,
        description="标准参考答案(可选)。缺失时自动跳过需要它的指标。",
    )
    # Delimiter between passages inside `contexts`.
    context_separator: str = Field(
        default=" |||| ",
        description="contexts 字段中段落分隔符,默认为四个竖线两侧各一空格。",
    )
    # Metric names to compute; a fresh copy of the default list per instance.
    metrics: list[str] = Field(
        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
        description="需要计算的 RAGAS 指标列表。",
    )
    # Optional judge-LLM model name override.
    judge_model: str | None = Field(
        default=None,
        description="Judge LLM 模型名称;为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
    )
    # Optional embedding model name override.
    embedding_model: str | None = Field(
        default=None,
        description="Embedding 模型名称;为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
    )

    @field_validator("metrics")
    @classmethod
    def validate_metric_names(cls, value: list[str]) -> list[str]:
        """Validate the requested metric names.

        Args:
            value: Metric names supplied by the caller.

        Returns:
            The metric names with duplicates removed, first occurrence order kept.

        Raises:
            ValueError: If the list is empty or contains a name that is not in
                ``_VALID_METRICS``.
        """
        if not value:
            raise ValueError("metrics 不能为空列表。")
        invalid = [name for name in value if name not in _VALID_METRICS]
        if invalid:
            # The two clauses are separated explicitly so the message does not
            # run the offending names straight into the list of legal values.
            raise ValueError(
                f"不支持的指标名称:{invalid}。"
                f"合法值:{sorted(_VALID_METRICS)}"
            )
        # Drop repeated names so the same metric is not requested twice.
        return list(dict.fromkeys(value))

    def contexts_as_list(self) -> list[str]:
        """Split the contexts string into a list of non-empty fragments.

        Returns:
            The whitespace-stripped fragments of ``contexts``; fragments that
            are empty after stripping are dropped.
        """
        # str.split rejects an empty separator, so fall back to the default.
        separator = self.context_separator or " |||| "
        return [part.strip() for part in self.contexts.split(separator) if part.strip()]

    def effective_metrics(self) -> list[str]:
        """Return the metrics that can actually be computed for this request.

        Returns:
            A copy of ``metrics`` when a usable ``ground_truth`` is present;
            otherwise ``metrics`` without the entries listed in
            ``_GT_DEPENDENT_METRICS``.
        """
        # An empty or whitespace-only reference is as unusable as a missing
        # one, so it must not enable the ground-truth-dependent metrics.
        has_ground_truth = bool(self.ground_truth and self.ground_truth.strip())
        if has_ground_truth:
            return list(self.metrics)
        return [name for name in self.metrics if name not in _GT_DEPENDENT_METRICS]
class ScoreResponse(BaseModel):
    """Response payload for the real-time scoring endpoint."""

    # Per-metric score keyed by metric name; None where no value is available.
    # The description punctuation is repaired so the two clauses no longer run together.
    scores: dict[str, float | None] = Field(
        description="各指标得分;NaN 或计算失败时为 null。"
    )
    # Equal-weight mean over the non-null metric scores, per the description.
    weighted_score: float | None = Field(
        default=None,
        description="等权加权综合得分(仅对非 null 指标求均值)。",
    )
    # Server-side scoring time in milliseconds.
    latency_ms: int = Field(description="服务端打分耗时(毫秒)。")
    # Names of metrics skipped because ground_truth was not provided.
    skipped_metrics: list[str] = Field(
        default_factory=list,
        description="因缺少 ground_truth 而跳过的指标名称列表。",
    )
    # Error message when scoring failed. The description previously had an
    # unmatched closing parenthesis and run-together clauses.
    error: str | None = Field(
        default=None,
        description="打分异常时的错误信息(HTTP 200 仍返回,scores 为空)。",
    )