From a03a24be4ea5d26d7bda4feadc187f3f8a265e59 Mon Sep 17 00:00:00 2001
From: wangwei
Date: Mon, 22 Jun 2026 15:14:19 +0800
Subject: [PATCH] feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/webapp/test_score_api.py | 161 ++++++++++++++++++++++++++++++++
 webapp/api/score.py            | 151 ++++++++++++++++++++++++++++++
 webapp/server.py               |  80 +++++++++++++++-
 3 files changed, 388 insertions(+), 4 deletions(-)
 create mode 100644 webapp/api/score.py

diff --git a/tests/webapp/test_score_api.py b/tests/webapp/test_score_api.py
index 8076208..8fa7a12 100644
--- a/tests/webapp/test_score_api.py
+++ b/tests/webapp/test_score_api.py
@@ -185,3 +185,164 @@ class TestInlineScorer:
             settings=EvaluationSettings(_env_file=None),
         )
         assert result["faithfulness"] is None
+
+
+# ── Endpoint integration tests ────────────────────────────────────────────────
+
+@pytest.fixture()
+def client(monkeypatch):
+    """TestClient with mocked InlineScorer."""
+    import webapp.api.score as score_mod
+    from unittest.mock import MagicMock
+
+    mock_scorer = MagicMock()
+    mock_scorer.score.return_value = {
+        "faithfulness": 0.85,
+        "answer_relevancy": 0.90,
+    }
+    monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+    from webapp.server import create_app
+    return TestClient(create_app())
+
+
+from fastapi.testclient import TestClient
+
+
+class TestScoreEndpoint:
+    def test_post_score_returns_200(self, client):
+        resp = client.post("/api/score", json={
+            "question": "What is CT?",
+            "answer": "CT is imaging.",
+            "contexts": "CT uses X-rays.",
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert "scores" in data
+        assert "latency_ms" in data
+        assert data["scores"]["faithfulness"] == pytest.approx(0.85)
+
+    def test_weighted_score_computed(self, client):
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["weighted_score"] is not None
+
+    def test_missing_required_fields_returns_422(self, client):
+        resp = client.post("/api/score", json={"question": "q"})
+        assert resp.status_code == 422
+
+    def test_invalid_metric_name_returns_422(self, client):
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+            "metrics": ["not_a_metric"],
+        })
+        assert resp.status_code == 422
+
+    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+            "metrics": ["faithfulness", "context_recall"],
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert "context_recall" in data["skipped_metrics"]
+
+    def test_contexts_split_on_separator(self, monkeypatch):
+        """contexts string is split before passing to scorer."""
+        import webapp.api.score as score_mod
+        from unittest.mock import MagicMock
+        calls = []
+        def capture(**kwargs):
+            calls.append(kwargs.get("contexts", []))
+            return {"faithfulness": 0.9}
+        mock_scorer = MagicMock()
+        mock_scorer.score.side_effect = lambda **kw: capture(**kw)
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        from fastapi.testclient import TestClient
+        tc = TestClient(create_app())
+        tc.post("/api/score", json={
+            "question": "q", "answer": "a",
+            "contexts": "ctx1 |||| ctx2",
+            "context_separator": " |||| ",
+        })
+        assert len(calls) == 1
+        assert calls[0] == ["ctx1", "ctx2"]
+
+    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
+        """When SCORE_API_TOKEN is set, requests without token get 401."""
+        import webapp.api.score as score_mod
+        from rag_eval.settings import EvaluationSettings
+        from unittest.mock import MagicMock
+
+        mock_settings = EvaluationSettings(_env_file=None)
+        object.__setattr__(mock_settings, "score_api_token", "secret-token")
+        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
+
+        mock_scorer = MagicMock()
+        mock_scorer.score.return_value = {"faithfulness": 0.9}
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        from fastapi.testclient import TestClient
+        tc = TestClient(create_app())
+
+        # No auth header -> 401
+        resp = tc.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+        })
+        assert resp.status_code == 401
+        assert resp.headers["www-authenticate"] == "Bearer"
+
+        # Correct token -> 200
+        resp = tc.post("/api/score",
+            json={"question": "q", "answer": "a", "contexts": "c"},
+            headers={"Authorization": "Bearer secret-token"},
+        )
+        assert resp.status_code == 200
+
+    def test_wrong_bearer_token_returns_401(self, monkeypatch):
+        import webapp.api.score as score_mod
+        from rag_eval.settings import EvaluationSettings
+        from unittest.mock import MagicMock
+
+        mock_settings = EvaluationSettings(_env_file=None)
+        object.__setattr__(mock_settings, "score_api_token", "correct-token")
+        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
+
+        mock_scorer = MagicMock()
+        mock_scorer.score.return_value = {}
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        from fastapi.testclient import TestClient
+        tc = TestClient(create_app())
+        resp = tc.post("/api/score",
+            json={"question": "q", "answer": "a", "contexts": "c"},
+            headers={"Authorization": "Bearer wrong-token"},
+        )
+        assert resp.status_code == 401
+
+    def test_scorer_failure_reports_error_and_null_scores(self, monkeypatch):
+        """A scorer exception becomes an in-band error with null scores."""
+        import webapp.api.score as score_mod
+        from unittest.mock import MagicMock
+
+        mock_scorer = MagicMock()
+        mock_scorer.score.side_effect = RuntimeError("judge unavailable")
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        tc = TestClient(create_app())
+        resp = tc.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+            "metrics": ["faithfulness"],
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["error"] == "RuntimeError: judge unavailable"
+        assert data["scores"] == {"faithfulness": None}
diff --git a/webapp/api/score.py b/webapp/api/score.py
new file mode 100644
index 0000000..fa46335
--- /dev/null
+++ b/webapp/api/score.py
@@ -0,0 +1,151 @@
+"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
+
+from __future__ import annotations
+
+import logging
+import secrets
+import time
+from typing import Annotated
+
+from fastapi import APIRouter, Header, HTTPException
+
+from rag_eval.metrics.weights import compute_weighted_score
+from rag_eval.settings import EvaluationSettings
+from webapp.models import ScoreRequest, ScoreResponse
+from webapp.services.inline_scorer import inline_scorer
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/score", tags=["score"])
+
+# RFC 6750 section 3: a 401 for a Bearer-protected resource carries this challenge.
+_BEARER_CHALLENGE = {"WWW-Authenticate": "Bearer"}
+
+
+def _get_settings() -> EvaluationSettings:
+    """Return a fresh EvaluationSettings instance (overridable in tests)."""
+    return EvaluationSettings()
+
+
+def _check_auth(authorization: str | None, token: str) -> None:
+    """Reject the request unless it carries ``Authorization: Bearer <token>``.
+
+    Args:
+        authorization: Raw ``Authorization`` header value, or ``None`` if absent.
+        token: Shared secret that must follow the ``Bearer`` scheme.
+
+    Raises:
+        HTTPException: 401 when the header is missing, uses another scheme,
+            or carries a different token.
+    """
+    if authorization is None:
+        raise HTTPException(
+            status_code=401,
+            detail="Missing Authorization header.",
+            headers=_BEARER_CHALLENGE,
+        )
+    scheme, _, credentials = authorization.partition(" ")
+    # Constant-time comparison so response timing does not reveal how much of the
+    # token matched; compare bytes because compare_digest rejects non-ASCII str.
+    token_matches = secrets.compare_digest(
+        credentials.strip().encode("utf-8"),
+        token.encode("utf-8"),
+    )
+    if scheme.lower() != "bearer" or not token_matches:
+        raise HTTPException(
+            status_code=401,
+            detail="Invalid Bearer token.",
+            headers=_BEARER_CHALLENGE,
+        )
+
+
+@router.post(
+    "",
+    response_model=ScoreResponse,
+    summary="单题实时评分(Dify 外部 Tool)",
+    responses={
+        200: {"description": "各指标得分和加权综合得分。"},
+        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
+        422: {"description": "请求参数校验失败。"},
+    },
+)
+def score_sample(
+    request: ScoreRequest,
+    authorization: Annotated[str | None, Header()] = None,
+) -> ScoreResponse:
+    """Accept one QA sample, run RAGAS metrics synchronously, and return scores.
+
+    Args:
+        request: The sample (question, answer, contexts, optional ground truth)
+            plus optional metric and model overrides.
+        authorization: ``Authorization`` header; only checked when the settings
+            define ``score_api_token``.
+
+    Returns:
+        A ``ScoreResponse`` with one entry per requested metric (``None`` for
+        metrics that were skipped or could not be scored), the weighted score,
+        the scorer latency in milliseconds and, on scorer failure, ``error``.
+
+    Raises:
+        HTTPException: 401 when a token is configured and the header is invalid.
+    """
+    settings = _get_settings()
+
+    # Require Bearer auth only when the deployment configured a shared token.
+    if settings.score_api_token:
+        _check_auth(authorization, settings.score_api_token)
+
+    judge_model = request.judge_model or settings.ragas_judge_model
+    embedding_model = request.embedding_model or settings.ragas_embedding_model
+    effective = request.effective_metrics()
+    skipped = sorted(set(request.metrics) - set(effective))
+    # Every requested metric is reported; skipped or failed ones stay null.
+    null_scores: dict[str, float | None] = {name: None for name in request.metrics}
+
+    if not effective:
+        return ScoreResponse(
+            scores=null_scores,
+            weighted_score=None,
+            latency_ms=0,
+            skipped_metrics=skipped,
+        )
+
+    t0 = time.monotonic()
+    try:
+        raw_scores = inline_scorer.score(
+            question=request.question,
+            answer=request.answer,
+            contexts=request.contexts_as_list(),
+            ground_truth=request.ground_truth,
+            metrics=effective,
+            judge_model=judge_model,
+            embedding_model=embedding_model,
+            settings=settings,
+        )
+    except Exception as exc:  # noqa: BLE001
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        # The failure is reported in-band, so log the traceback before it is lost.
+        logger.exception("Inline scoring failed")
+        return ScoreResponse(
+            scores=null_scores,
+            weighted_score=None,
+            latency_ms=latency_ms,
+            skipped_metrics=skipped,
+            error=f"{type(exc).__name__}: {exc}",
+        )
+
+    latency_ms = int((time.monotonic() - t0) * 1000)
+
+    all_scores: dict[str, float | None] = {**null_scores, **raw_scores}
+
+    weighted = compute_weighted_score(
+        {key: value for key, value in raw_scores.items() if value is not None},
+        {},
+    )
+
+    return ScoreResponse(
+        scores=all_scores,
+        weighted_score=round(weighted, 4) if weighted is not None else None,
+        latency_ms=latency_ms,
+        skipped_metrics=skipped,
+    )
diff --git a/webapp/server.py b/webapp/server.py
index 06bdfe1..ff2a7e9 100644
--- a/webapp/server.py
+++ b/webapp/server.py
@@ -13,23 +13,95 @@ from fastapi import FastAPI
 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
 
-from webapp.api import evaluations, llm_profiles, runs, scenarios
+from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
 
 STATIC_DIR = Path(__file__).resolve().parent / "static"
 
+# OpenAPI tag metadata — controls the grouping and descriptions in /docs.
+OPENAPI_TAGS = [
+    {
+        "name": "pipeline",
+        "description": (
+            "**全链路评估 Pipeline API**\n\n"
+            "一次调用完成「解析文档 → 生成题库 → RAGAS 评估 → 输出报告」全流程。\n\n"
+            "**使用流程**\n"
+            "1. `POST /api/pipeline/jobs` 提交任务,立即拿到 `job_id`。\n"
+            "2. `GET /api/pipeline/jobs/{job_id}` 轮询 `status` / `phase` / `logs`。\n"
+            "3. 当 `status=completed` 时,`result` 字段包含所有产物路径。\n\n"
+            "**Pipeline 阶段**\n"
+            "| phase | 说明 |\n"
+            "|-------|------|\n"
+            "| `parsing_documents` | 调用阿里云 DocMind 解析每份 PDF |\n"
+            "| `generating_questions` | LLM 从文档片段生成草稿题库 |\n"
+            "| `evaluating` | RAGAS 在线评测打分 |\n"
+            "| `done` | 所有产物写入磁盘,任务完成 |"
+        ),
+    },
+    {
+        "name": "evaluations",
+        "description": (
+            "**单场景评估 API**\n\n"
+            "基于已有 YAML 场景文件触发评估任务,并查询任务状态与日志。"
+        ),
+    },
+    {
+        "name": "llm-profiles",
+        "description": (
+            "**LLM 配置管理 API**\n\n"
+            "增删改查已保存的 LLM 连接配置(模型名称、Base URL、API Key);"
+            "支持连通性测试;可将配置一键写入场景 YAML 文件。"
+        ),
+    },
+    {
+        "name": "runs",
+        "description": "**评估运行列表 API**\n\n查询历史评估运行记录及详细报告数据。",
+    },
+    {
+        "name": "scenarios",
+        "description": "**场景文件 API**\n\n扫描并列出 `scenarios/` 目录下所有可用的 YAML 场景文件。",
+    },
+    {
+        "name": "score",
+        "description": (
+            "**实时评分 API(Dify 外部 Tool)**\n\n"
+            "接受单条问答记录 `(question, answer, contexts, ground_truth)`,\n"
+            "同步运行 RAGAS 指标打分,返回各指标得分和加权综合得分。\n\n"
+            "适用场景:Dify Agent 在回答后即时调用,用于质量监控或自我改进。\n\n"
+            "**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
+            "`Authorization: Bearer <token>` 请求头。"
+        ),
+    },
+    {
+        "name": "meta",
+        "description": "**系统 API**\n\n健康检查等基础接口。",
+    },
+]
+
 
 def create_app() -> FastAPI:
     """Build and configure the FastAPI application instance."""
     app = FastAPI(
-        title="Siemens RAGAS 评估控制台",
-        description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
-        version="0.1.0",
+        title="RAGAS 评估系统",
+        description=(
+            "西门子医疗影像 RAG 评估平台 API 文档。\n\n"
+            "提供以下能力:\n"
+            "- **Pipeline API** — 一键完成「解析文档 → 生成题库 → RAGAS 评估」全链路\n"
+            "- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
+            "- **评估 API** — 基于 YAML 场景文件触发单次评估\n"
+            "- **LLM 配置 API** — 管理多个 LLM 连接配置,支持连通性测试\n"
+            "- **报告 API** — 查询历史运行记录与评估报告\n\n"
+            "> **快速开始**:调用 `POST /api/pipeline/jobs` 传入 PDF 文件夹路径即可启动完整评估流程。"
+        ),
+        version="0.2.0",
+        openapi_tags=OPENAPI_TAGS,
     )
     app.include_router(runs.router)
     app.include_router(scenarios.router)
     app.include_router(evaluations.router)
     app.include_router(llm_profiles.router)
+    app.include_router(pipeline.router)
+    app.include_router(score.router)
 
     @app.get("/api/health", tags=["meta"])
     def health() -> dict[str, str]: