From a03a24be4ea5d26d7bda4feadc187f3f8a265e59 Mon Sep 17 00:00:00 2001
From: wangwei <Wei.Wang@t-systems.com>
Date: Mon, 22 Jun 2026 15:14:19 +0800
Subject: [PATCH] feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/webapp/test_score_api.py | 140 +++++++++++++++++++++++++++++++++
 webapp/api/score.py            | 105 +++++++++++++++++++++++++
 webapp/server.py               |  80 ++++++++++++++++++-
 3 files changed, 321 insertions(+), 4 deletions(-)
 create mode 100644 webapp/api/score.py

diff --git a/tests/webapp/test_score_api.py b/tests/webapp/test_score_api.py
index 8076208..8fa7a12 100644
--- a/tests/webapp/test_score_api.py
+++ b/tests/webapp/test_score_api.py
@@ -185,3 +185,143 @@ class TestInlineScorer:
                         settings=EvaluationSettings(_env_file=None),
                     )
         assert result["faithfulness"] is None
+
+
+# ── Endpoint integration tests ────────────────────────────────────────────────
+
+@pytest.fixture()
+def client(monkeypatch):
+    """Build a TestClient for the app with ``inline_scorer`` swapped for a canned MagicMock."""
+    from unittest.mock import MagicMock
+    import webapp.api.score as score_mod
+
+    canned_scores = {"faithfulness": 0.85, "answer_relevancy": 0.90}
+    stub_scorer = MagicMock()
+    stub_scorer.score.return_value = canned_scores
+    # Patch the module attribute that the route looks up when it handles a request.
+    monkeypatch.setattr(score_mod, "inline_scorer", stub_scorer)
+
+    from webapp.server import create_app
+    app = create_app()
+    return TestClient(app)
+
+
+from fastapi.testclient import TestClient
+
+
+class TestScoreEndpoint:
+    def test_post_score_returns_200(self, client):
+        resp = client.post("/api/score", json={
+            "question": "What is CT?",
+            "answer": "CT is imaging.",
+            "contexts": "CT uses X-rays.",
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert "scores" in data
+        assert "latency_ms" in data
+        assert data["scores"]["faithfulness"] == pytest.approx(0.85)
+
+    def test_weighted_score_computed(self, client):
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["weighted_score"] is not None
+
+    def test_missing_required_fields_returns_422(self, client):
+        resp = client.post("/api/score", json={"question": "q"})
+        assert resp.status_code == 422
+
+    def test_invalid_metric_name_returns_422(self, client):
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+            "metrics": ["not_a_metric"],
+        })
+        assert resp.status_code == 422
+
+    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+            "metrics": ["faithfulness", "context_recall"],
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert "context_recall" in data["skipped_metrics"]
+
+    def test_contexts_split_on_separator(self, monkeypatch):
+        """The ``contexts`` string is split on ``context_separator`` before reaching the scorer."""
+        import webapp.api.score as score_mod
+        from unittest.mock import MagicMock
+        calls = []
+        def capture(**kwargs):
+            calls.append(kwargs.get("contexts", []))
+            return {"faithfulness": 0.9}
+        mock_scorer = MagicMock()
+        mock_scorer.score.side_effect = capture  # the mock forwards the call's kwargs as-is
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        from fastapi.testclient import TestClient
+        tc = TestClient(create_app())
+        resp = tc.post("/api/score", json={
+            "question": "q", "answer": "a",
+            "contexts": "ctx1 |||| ctx2",
+            "context_separator": " |||| ",
+        })
+        assert resp.status_code == 200
+        assert calls == [["ctx1", "ctx2"]]
+
+    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
+        """With ``score_api_token`` configured: no header -> 401, matching Bearer token -> 200."""
+        import webapp.api.score as score_mod
+        from rag_eval.settings import EvaluationSettings
+        from unittest.mock import MagicMock
+
+        mock_settings = EvaluationSettings(_env_file=None)
+        object.__setattr__(mock_settings, "score_api_token", "secret-token")
+        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
+
+        mock_scorer = MagicMock()
+        mock_scorer.score.return_value = {"faithfulness": 0.9}
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        from fastapi.testclient import TestClient
+        tc = TestClient(create_app())
+
+        # No auth header -> 401
+        resp = tc.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+        })
+        assert resp.status_code == 401
+
+        # Correct token -> 200
+        resp = tc.post("/api/score",
+            json={"question": "q", "answer": "a", "contexts": "c"},
+            headers={"Authorization": "Bearer secret-token"},
+        )
+        assert resp.status_code == 200
+
+    def test_wrong_bearer_token_returns_401(self, monkeypatch):
+        import webapp.api.score as score_mod
+        from rag_eval.settings import EvaluationSettings
+        from unittest.mock import MagicMock
+
+        mock_settings = EvaluationSettings(_env_file=None)
+        object.__setattr__(mock_settings, "score_api_token", "correct-token")
+        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
+
+        mock_scorer = MagicMock()
+        mock_scorer.score.return_value = {}
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        from fastapi.testclient import TestClient
+        tc = TestClient(create_app())
+        resp = tc.post("/api/score",
+            json={"question": "q", "answer": "a", "contexts": "c"},
+            headers={"Authorization": "Bearer wrong-token"},
+        )
+        assert resp.status_code == 401
diff --git a/webapp/api/score.py b/webapp/api/score.py
new file mode 100644
index 0000000..fa46335
--- /dev/null
+++ b/webapp/api/score.py
@@ -0,0 +1,105 @@
+"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
+
+from __future__ import annotations
+
+import time
+from typing import Annotated
+
+from fastapi import APIRouter, Header, HTTPException
+
+from rag_eval.metrics.weights import compute_weighted_score
+from rag_eval.settings import EvaluationSettings
+from webapp.models import ScoreRequest, ScoreResponse
+from webapp.services.inline_scorer import inline_scorer
+
+router = APIRouter(prefix="/api/score", tags=["score"])
+
+
+def _get_settings() -> EvaluationSettings:
+    """Construct a new EvaluationSettings on each call; tests monkeypatch this hook."""
+    return EvaluationSettings()
+
+
+def _check_auth(authorization: str | None, token: str) -> None:
+    """Raise HTTP 401 unless *authorization* is ``Bearer <token>``; compared in constant time."""
+    import hmac  # local import: only this helper needs it
+    scheme, _, supplied = (authorization or "").partition(" ")
+    if scheme.lower() != "bearer" or not hmac.compare_digest(supplied.encode(), token.encode()):
+        msg = "Missing Authorization header." if authorization is None else "Invalid Bearer token."
+        raise HTTPException(status_code=401, detail=msg)
+
+
+@router.post(
+    "",
+    response_model=ScoreResponse,
+    summary="单题实时评分（Dify 外部 Tool）",
+    responses={
+        200: {"description": "各指标得分和加权综合得分。"},
+        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
+        422: {"description": "请求参数校验失败。"},
+    },
+)
+def score_sample(
+    request: ScoreRequest,
+    authorization: Annotated[str | None, Header()] = None,
+) -> ScoreResponse:
+    """Score one QA sample synchronously; scorer failures are reported in ``error``, not raised."""
+    settings = _get_settings()
+
+    # Require Bearer auth only when the deployment configured a shared token.
+    if settings.score_api_token:
+        _check_auth(authorization, settings.score_api_token)
+
+    judge_model = request.judge_model or settings.ragas_judge_model
+    embedding_model = request.embedding_model or settings.ragas_embedding_model
+    effective = request.effective_metrics()
+    requested = set(request.metrics)
+    skipped = sorted(requested - set(effective))  # requested metrics absent from `effective`
+
+    if not effective:
+        return ScoreResponse(
+            scores={metric_name: None for metric_name in request.metrics},
+            weighted_score=None,
+            latency_ms=0,  # the scorer was never invoked
+            skipped_metrics=skipped,
+        )
+
+    t0 = time.monotonic()
+    try:
+        raw_scores = inline_scorer.score(
+            question=request.question,
+            answer=request.answer,
+            contexts=request.contexts_as_list(),
+            ground_truth=request.ground_truth,
+            metrics=effective,
+            judge_model=judge_model,
+            embedding_model=embedding_model,
+            settings=settings,
+        )
+    except Exception as exc:  # noqa: BLE001
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        return ScoreResponse(
+            scores={metric_name: None for metric_name in request.metrics},  # same shape as other paths
+            weighted_score=None,
+            latency_ms=latency_ms,
+            skipped_metrics=skipped,
+            error=f"{type(exc).__name__}: {exc}",
+        )
+
+    latency_ms = int((time.monotonic() - t0) * 1000)
+
+    # Keep skipped metrics visible to callers by emitting them as null scores.
+    all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
+    all_scores.update(raw_scores)
+
+    weighted = compute_weighted_score(
+        {key: value for key, value in raw_scores.items() if value is not None},
+        {},
+    )
+
+    return ScoreResponse(
+        scores=all_scores,
+        weighted_score=round(weighted, 4) if weighted is not None else None,
+        latency_ms=latency_ms,
+        skipped_metrics=skipped,
+    )
diff --git a/webapp/server.py b/webapp/server.py
index 06bdfe1..ff2a7e9 100644
--- a/webapp/server.py
+++ b/webapp/server.py
@@ -13,23 +13,95 @@ from fastapi import FastAPI
 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
 
-from webapp.api import evaluations, llm_profiles, runs, scenarios
+from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
 
 STATIC_DIR = Path(__file__).resolve().parent / "static"
 
+# OpenAPI tag metadata — controls the grouping and descriptions in /docs.
+OPENAPI_TAGS = [
+    {
+        "name": "pipeline",
+        "description": (
+            "**全链路评估 Pipeline API**\n\n"
+            "一次调用完成「解析文档 → 生成题库 → RAGAS 评估 → 输出报告」全流程。\n\n"
+            "**使用流程**\n"
+            "1. `POST /api/pipeline/jobs` 提交任务，立即拿到 `job_id`。\n"
+            "2. `GET /api/pipeline/jobs/{job_id}` 轮询 `status` / `phase` / `logs`。\n"
+            "3. 当 `status=completed` 时，`result` 字段包含所有产物路径。\n\n"
+            "**Pipeline 阶段**\n"
+            "| phase | 说明 |\n"
+            "|-------|------|\n"
+            "| `parsing_documents` | 调用阿里云 DocMind 解析每份 PDF |\n"
+            "| `generating_questions` | LLM 从文档片段生成草稿题库 |\n"
+            "| `evaluating` | RAGAS 在线评测打分 |\n"
+            "| `done` | 所有产物写入磁盘，任务完成 |"
+        ),
+    },
+    {
+        "name": "evaluations",
+        "description": (
+            "**单场景评估 API**\n\n"
+            "基于已有 YAML 场景文件触发评估任务，并查询任务状态与日志。"
+        ),
+    },
+    {
+        "name": "llm-profiles",
+        "description": (
+            "**LLM 配置管理 API**\n\n"
+            "增删改查已保存的 LLM 连接配置（模型名称、Base URL、API Key）；"
+            "支持连通性测试；可将配置一键写入场景 YAML 文件。"
+        ),
+    },
+    {
+        "name": "runs",
+        "description": "**评估运行列表 API**\n\n查询历史评估运行记录及详细报告数据。",
+    },
+    {
+        "name": "scenarios",
+        "description": "**场景文件 API**\n\n扫描并列出 `scenarios/` 目录下所有可用的 YAML 场景文件。",
+    },
+    {
+        "name": "score",
+        "description": (
+            "**实时评分 API（Dify 外部 Tool）**\n\n"
+            "接受单条问答记录 `(question, answer, contexts, ground_truth)`，\n"
+            "同步运行 RAGAS 指标打分，返回各指标得分和加权综合得分。\n\n"
+            "适用场景：Dify Agent 在回答后即时调用，用于质量监控或自我改进。\n\n"
+            "**鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需携带 "
+            "`Authorization: Bearer <token>` 请求头。"
+        ),
+    },
+    {
+        "name": "meta",
+        "description": "**系统 API**\n\n健康检查等基础接口。",
+    },
+]
+
 
 def create_app() -> FastAPI:
     """Build and configure the FastAPI application instance."""
     app = FastAPI(
-        title="Siemens RAGAS 评估控制台",
-        description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
-        version="0.1.0",
+        title="RAGAS 评估系统",
+        description=(
+            "西门子医疗影像 RAG 评估平台 API 文档。\n\n"
+            "提供以下能力：\n"
+            "- **Pipeline API** — 一键完成「解析文档 → 生成题库 → RAGAS 评估」全链路\n"
+            "- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
+            "- **评估 API** — 基于 YAML 场景文件触发单次评估\n"
+            "- **LLM 配置 API** — 管理多个 LLM 连接配置，支持连通性测试\n"
+            "- **报告 API** — 查询历史运行记录与评估报告\n\n"
+            "> **快速开始**：调用 `POST /api/pipeline/jobs` 传入 PDF 文件夹路径即可启动完整评估流程。"
+        ),
+        version="0.2.0",
+        openapi_tags=OPENAPI_TAGS,
     )
 
     app.include_router(runs.router)
     app.include_router(scenarios.router)
     app.include_router(evaluations.router)
     app.include_router(llm_profiles.router)
+    app.include_router(pipeline.router)
+    app.include_router(score.router)
 
     @app.get("/api/health", tags=["meta"])
     def health() -> dict[str, str]: