feat: add POST /api/score endpoint for Dify real-time scoring
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -185,3 +185,143 @@ class TestInlineScorer:
|
|||||||
settings=EvaluationSettings(_env_file=None),
|
settings=EvaluationSettings(_env_file=None),
|
||||||
)
|
)
|
||||||
assert result["faithfulness"] is None
|
assert result["faithfulness"] is None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Endpoint integration tests ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@pytest.fixture()
def client(monkeypatch):
    """Yield-free fixture: a TestClient backed by a canned MagicMock scorer.

    The module-level ``inline_scorer`` in ``webapp.api.score`` is swapped for a
    mock whose ``score`` always returns fixed faithfulness / answer_relevancy
    values, then a fresh app is built and wrapped in a TestClient.
    """
    from unittest.mock import MagicMock

    import webapp.api.score as score_mod
    from webapp.server import create_app

    canned_scores = {
        "faithfulness": 0.85,
        "answer_relevancy": 0.90,
    }
    fake_scorer = MagicMock()
    fake_scorer.score.return_value = canned_scores

    # monkeypatch restores the original attribute when the test finishes.
    monkeypatch.setattr(score_mod, "inline_scorer", fake_scorer)

    app = create_app()
    return TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreEndpoint:
    """Integration tests for ``POST /api/score`` exercised through a TestClient.

    Every test replaces ``webapp.api.score.inline_scorer`` with a ``MagicMock``,
    either via the ``client`` fixture or via ``_build_client``.
    """

    @staticmethod
    def _build_client(monkeypatch, scorer, api_token=None):
        """Return a TestClient for a fresh app with ``scorer`` patched in.

        Args:
            monkeypatch: pytest's monkeypatch fixture; patches are reverted
                automatically when the calling test ends.
            scorer: Object installed as ``webapp.api.score.inline_scorer``.
            api_token: When not None, ``_get_settings`` is patched to return
                settings whose ``score_api_token`` is this value.
        """
        import webapp.api.score as score_mod
        from fastapi.testclient import TestClient
        from webapp.server import create_app

        if api_token is not None:
            from rag_eval.settings import EvaluationSettings

            settings = EvaluationSettings(_env_file=None)
            # object.__setattr__ writes the attribute without going through the
            # settings class's own __setattr__.
            # NOTE(review): presumably needed because the settings model
            # rejects plain assignment — confirm.
            object.__setattr__(settings, "score_api_token", api_token)
            monkeypatch.setattr(score_mod, "_get_settings", lambda: settings)

        monkeypatch.setattr(score_mod, "inline_scorer", scorer)
        return TestClient(create_app())

    def test_post_score_returns_200(self, client):
        """A well-formed request yields 200 with scores and latency."""
        resp = client.post("/api/score", json={
            "question": "What is CT?",
            "answer": "CT is imaging.",
            "contexts": "CT uses X-rays.",
        })
        assert resp.status_code == 200
        data = resp.json()
        assert "scores" in data
        assert "latency_ms" in data
        assert data["scores"]["faithfulness"] == pytest.approx(0.85)

    def test_weighted_score_computed(self, client):
        """The response carries a non-null weighted_score."""
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
        })
        assert resp.status_code == 200
        data = resp.json()
        assert data["weighted_score"] is not None

    def test_missing_required_fields_returns_422(self, client):
        """Omitting answer/contexts fails request validation."""
        resp = client.post("/api/score", json={"question": "q"})
        assert resp.status_code == 422

    def test_invalid_metric_name_returns_422(self, client):
        """An unknown metric name fails request validation."""
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["not_a_metric"],
        })
        assert resp.status_code == 422

    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
        """context_recall is reported as skipped when no ground_truth is sent."""
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["faithfulness", "context_recall"],
        })
        assert resp.status_code == 200
        data = resp.json()
        assert "context_recall" in data["skipped_metrics"]

    def test_contexts_split_on_separator(self, monkeypatch):
        """contexts string is split before passing to scorer."""
        from unittest.mock import MagicMock

        calls = []

        def capture(**kwargs):
            # Record only the contexts argument of each scorer invocation.
            calls.append(kwargs.get("contexts", []))
            return {"faithfulness": 0.9}

        mock_scorer = MagicMock()
        # side_effect accepts the callable directly; no wrapper lambda needed.
        mock_scorer.score.side_effect = capture

        tc = self._build_client(monkeypatch, mock_scorer)
        resp = tc.post("/api/score", json={
            "question": "q", "answer": "a",
            "contexts": "ctx1 |||| ctx2",
            "context_separator": " |||| ",
        })
        # Check the status first so a rejected request is reported as such
        # rather than as an empty ``calls`` list.
        assert resp.status_code == 200
        assert len(calls) == 1
        assert calls[0] == ["ctx1", "ctx2"]

    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
        """When SCORE_API_TOKEN is set, requests without token get 401."""
        from unittest.mock import MagicMock

        mock_scorer = MagicMock()
        mock_scorer.score.return_value = {"faithfulness": 0.9}
        tc = self._build_client(monkeypatch, mock_scorer, api_token="secret-token")

        payload = {"question": "q", "answer": "a", "contexts": "c"}

        # No auth header -> 401
        resp = tc.post("/api/score", json=payload)
        assert resp.status_code == 401

        # Correct token -> 200
        resp = tc.post(
            "/api/score",
            json=payload,
            headers={"Authorization": "Bearer secret-token"},
        )
        assert resp.status_code == 200

    def test_wrong_bearer_token_returns_401(self, monkeypatch):
        """A Bearer token that differs from the configured one gets 401."""
        from unittest.mock import MagicMock

        mock_scorer = MagicMock()
        mock_scorer.score.return_value = {}
        tc = self._build_client(monkeypatch, mock_scorer, api_token="correct-token")

        resp = tc.post(
            "/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer wrong-token"},
        )
        assert resp.status_code == 401
|
||||||
|
|||||||
105
webapp/api/score.py
Normal file
105
webapp/api/score.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Header, HTTPException
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from webapp.models import ScoreRequest, ScoreResponse
|
||||||
|
from webapp.services.inline_scorer import inline_scorer
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
|
||||||
|
|
||||||
|
def _get_settings() -> EvaluationSettings:
    """Construct and return a new EvaluationSettings on every call.

    Kept as a standalone module-level function so tests can monkeypatch it.
    """
    fresh_settings = EvaluationSettings()
    return fresh_settings
|
||||||
|
|
||||||
|
|
||||||
|
def _check_auth(authorization: str | None, token: str) -> None:
|
||||||
|
"""Raise 401 if Bearer token does not match the configured token."""
|
||||||
|
if authorization is None:
|
||||||
|
raise HTTPException(status_code=401, detail="Missing Authorization header.")
|
||||||
|
parts = authorization.split(" ", 1)
|
||||||
|
if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
|
||||||
|
raise HTTPException(status_code=401, detail="Invalid Bearer token.")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分(Dify 外部 Tool)",
    responses={
        200: {"description": "各指标得分和加权综合得分。"},
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败。"},
    },
)
def score_sample(
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
) -> ScoreResponse:
    """Accept one QA sample, run RAGAS metrics synchronously, and return scores."""
    # The docstring above is left unchanged on purpose: FastAPI publishes a
    # path function's docstring as the operation description in the schema.
    #
    # Flow: optional Bearer check -> resolve models and metrics -> call the
    # scorer -> merge results with null placeholders -> weighted score.
    # Scorer exceptions are reported in the ``error`` field of a normal
    # response instead of being raised.
    cfg = _get_settings()

    # A token in settings switches authentication on; without one the
    # endpoint is open.
    expected_token = cfg.score_api_token
    if expected_token:
        _check_auth(authorization, expected_token)

    # Per-request model overrides win over the configured defaults.
    judge_model = request.judge_model or cfg.ragas_judge_model
    embedding_model = request.embedding_model or cfg.ragas_embedding_model

    runnable = request.effective_metrics()
    # Requested metrics that are not in the effective list are reported back.
    not_run = sorted(set(request.metrics).difference(runnable))

    if not runnable:
        # Nothing to evaluate: answer immediately with every metric as null.
        return ScoreResponse(
            scores=dict.fromkeys(request.metrics),
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=not_run,
        )

    started = time.monotonic()

    def elapsed_ms() -> int:
        """Whole milliseconds since the scorer call began."""
        return int((time.monotonic() - started) * 1000)

    try:
        raw_scores = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=runnable,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=cfg,
        )
    except Exception as exc:  # noqa: BLE001
        return ScoreResponse(
            scores={},
            weighted_score=None,
            latency_ms=elapsed_ms(),
            skipped_metrics=not_run,
            error=f"{type(exc).__name__}: {exc}",
        )
    else:
        latency_ms = elapsed_ms()

    # Start from a null entry per requested metric so skipped ones stay
    # visible, then overlay whatever the scorer produced.
    merged_scores: dict[str, float | None] = dict.fromkeys(request.metrics)
    merged_scores.update(raw_scores)

    # Only metrics that actually produced a number take part in the weighting.
    numeric_scores = {
        name: value for name, value in raw_scores.items() if value is not None
    }
    weighted = compute_weighted_score(numeric_scores, {})
    if weighted is None:
        weighted_out = None
    else:
        weighted_out = round(weighted, 4)

    return ScoreResponse(
        scores=merged_scores,
        weighted_score=weighted_out,
        latency_ms=latency_ms,
        skipped_metrics=not_run,
    )
|
||||||
@@ -13,23 +13,95 @@ from fastapi import FastAPI
|
|||||||
from fastapi.responses import FileResponse
|
from fastapi.responses import FileResponse
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
|
||||||
from webapp.api import evaluations, llm_profiles, runs, scenarios
|
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
|
||||||
|
|
||||||
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||||
|
|
||||||
|
# OpenAPI tag metadata — controls the grouping and descriptions in /docs.
|
||||||
|
OPENAPI_TAGS = [
|
||||||
|
{
|
||||||
|
"name": "pipeline",
|
||||||
|
"description": (
|
||||||
|
"**全链路评估 Pipeline API**\n\n"
|
||||||
|
"一次调用完成「解析文档 → 生成题库 → RAGAS 评估 → 输出报告」全流程。\n\n"
|
||||||
|
"**使用流程**\n"
|
||||||
|
"1. `POST /api/pipeline/jobs` 提交任务,立即拿到 `job_id`。\n"
|
||||||
|
"2. `GET /api/pipeline/jobs/{job_id}` 轮询 `status` / `phase` / `logs`。\n"
|
||||||
|
"3. 当 `status=completed` 时,`result` 字段包含所有产物路径。\n\n"
|
||||||
|
"**Pipeline 阶段**\n"
|
||||||
|
"| phase | 说明 |\n"
|
||||||
|
"|-------|------|\n"
|
||||||
|
"| `parsing_documents` | 调用阿里云 DocMind 解析每份 PDF |\n"
|
||||||
|
"| `generating_questions` | LLM 从文档片段生成草稿题库 |\n"
|
||||||
|
"| `evaluating` | RAGAS 在线评测打分 |\n"
|
||||||
|
"| `done` | 所有产物写入磁盘,任务完成 |"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "evaluations",
|
||||||
|
"description": (
|
||||||
|
"**单场景评估 API**\n\n"
|
||||||
|
"基于已有 YAML 场景文件触发评估任务,并查询任务状态与日志。"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "llm-profiles",
|
||||||
|
"description": (
|
||||||
|
"**LLM 配置管理 API**\n\n"
|
||||||
|
"增删改查已保存的 LLM 连接配置(模型名称、Base URL、API Key);"
|
||||||
|
"支持连通性测试;可将配置一键写入场景 YAML 文件。"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "runs",
|
||||||
|
"description": "**评估运行列表 API**\n\n查询历史评估运行记录及详细报告数据。",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "scenarios",
|
||||||
|
"description": "**场景文件 API**\n\n扫描并列出 `scenarios/` 目录下所有可用的 YAML 场景文件。",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "score",
|
||||||
|
"description": (
|
||||||
|
"**实时评分 API(Dify 外部 Tool)**\n\n"
|
||||||
|
"接受单条问答记录 `(question, answer, contexts, ground_truth)`,\n"
|
||||||
|
"同步运行 RAGAS 指标打分,返回各指标得分和加权综合得分。\n\n"
|
||||||
|
"适用场景:Dify Agent 在回答后即时调用,用于质量监控或自我改进。\n\n"
|
||||||
|
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
|
||||||
|
"`Authorization: Bearer <token>` 请求头。"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "meta",
|
||||||
|
"description": "**系统 API**\n\n健康检查等基础接口。",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def create_app() -> FastAPI:
|
def create_app() -> FastAPI:
|
||||||
"""Build and configure the FastAPI application instance."""
|
"""Build and configure the FastAPI application instance."""
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="Siemens RAGAS 评估控制台",
|
title="RAGAS 评估系统",
|
||||||
description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
|
description=(
|
||||||
version="0.1.0",
|
"西门子医疗影像 RAG 评估平台 API 文档。\n\n"
|
||||||
|
"提供以下能力:\n"
|
||||||
|
"- **Pipeline API** — 一键完成「解析文档 → 生成题库 → RAGAS 评估」全链路\n"
|
||||||
|
"- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
|
||||||
|
"- **评估 API** — 基于 YAML 场景文件触发单次评估\n"
|
||||||
|
"- **LLM 配置 API** — 管理多个 LLM 连接配置,支持连通性测试\n"
|
||||||
|
"- **报告 API** — 查询历史运行记录与评估报告\n\n"
|
||||||
|
"> **快速开始**:调用 `POST /api/pipeline/jobs` 传入 PDF 文件夹路径即可启动完整评估流程。"
|
||||||
|
),
|
||||||
|
version="0.2.0",
|
||||||
|
openapi_tags=OPENAPI_TAGS,
|
||||||
)
|
)
|
||||||
|
|
||||||
app.include_router(runs.router)
|
app.include_router(runs.router)
|
||||||
app.include_router(scenarios.router)
|
app.include_router(scenarios.router)
|
||||||
app.include_router(evaluations.router)
|
app.include_router(evaluations.router)
|
||||||
app.include_router(llm_profiles.router)
|
app.include_router(llm_profiles.router)
|
||||||
|
app.include_router(pipeline.router)
|
||||||
|
app.include_router(score.router)
|
||||||
|
|
||||||
@app.get("/api/health", tags=["meta"])
|
@app.get("/api/health", tags=["meta"])
|
||||||
def health() -> dict[str, str]:
|
def health() -> dict[str, str]:
|
||||||
|
|||||||
Reference in New Issue
Block a user