feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-22 15:14:19 +08:00
parent e4d4e4968b
commit a03a24be4e
3 changed files with 321 additions and 4 deletions
--- a/webapp/api/score.py
+++ b/webapp/api/score.py
@@ -0,0 +1,105 @@
+"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
+
+from __future__ import annotations
+
+import hmac
+import time
+from typing import Annotated
+
+from fastapi import APIRouter, Header, HTTPException
+
+from rag_eval.metrics.weights import compute_weighted_score
+from rag_eval.settings import EvaluationSettings
+from webapp.models import ScoreRequest, ScoreResponse
+from webapp.services.inline_scorer import inline_scorer
+
+# Router for the scoring endpoint; routes registered on it live under the /api/score prefix.
+router = APIRouter(prefix="/api/score", tags=["score"])
+
+
+def _get_settings() -> EvaluationSettings:
+    """Build a new EvaluationSettings object on every call.
+
+    Kept as its own module-level function so tests can override it.
+    """
+    fresh_settings = EvaluationSettings()
+    return fresh_settings
+
+
+def _check_auth(authorization: str | None, token: str) -> None:
+    """Validate an ``Authorization: Bearer <credential>`` header value.
+
+    Args:
+        authorization: Raw ``Authorization`` header value, or ``None`` when the
+            request did not carry one.
+        token: The shared secret the presented credential must equal.
+
+    Raises:
+        HTTPException: 401 when the header is missing, when it is not of the
+            form ``<scheme> <credential>`` with a case-insensitive ``bearer``
+            scheme, or when the credential differs from ``token``.
+    """
+    if authorization is None:
+        raise HTTPException(status_code=401, detail="Missing Authorization header.")
+
+    # Split on the first space only, so the credential itself may contain spaces.
+    scheme, separator, credential = authorization.partition(" ")
+
+    # Compare in constant time: a plain ``!=`` stops at the first differing
+    # character, which lets a caller recover the secret from response timing.
+    # Both sides are encoded to bytes because ``hmac.compare_digest`` rejects
+    # ``str`` arguments that contain non-ASCII characters.
+    credential_matches = hmac.compare_digest(
+        credential.encode("utf-8", "surrogatepass"),
+        token.encode("utf-8", "surrogatepass"),
+    )
+
+    if not separator or scheme.lower() != "bearer" or not credential_matches:
+        raise HTTPException(status_code=401, detail="Invalid Bearer token.")
+
+
+@router.post(
+    "",
+    response_model=ScoreResponse,
+    summary="单题实时评分（Dify 外部 Tool）",
+    responses={
+        200: {"description": "各指标得分和加权综合得分。"},
+        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
+        422: {"description": "请求参数校验失败。"},
+    },
+)
+def score_sample(
+    request: ScoreRequest,
+    authorization: Annotated[str | None, Header()] = None,
+) -> ScoreResponse:
+    """Accept one QA sample, run RAGAS metrics synchronously, and return scores."""
+    # NOTE: the docstring above is kept verbatim because FastAPI publishes a
+    # route function's docstring as the operation description in OpenAPI.
+    #
+    # Flow: optional Bearer check -> resolve models -> short-circuit when no
+    # metric can run -> score -> add null entries for skipped metrics -> weight.
+    config = _get_settings()
+
+    # An unset/empty token leaves the endpoint open; a configured one is enforced.
+    shared_token = config.score_api_token
+    if shared_token:
+        _check_auth(authorization, shared_token)
+
+    # Per-request model choices take precedence over the configured defaults.
+    judge = request.judge_model or config.ragas_judge_model
+    embedder = request.embedding_model or config.ragas_embedding_model
+
+    runnable = request.effective_metrics()
+    # Requested metrics that are not in the effective list, in sorted order.
+    not_run = sorted(set(request.metrics).difference(runnable))
+
+    if not runnable:
+        # Nothing to evaluate: reply immediately with a null score per request.
+        return ScoreResponse(
+            scores=dict.fromkeys(request.metrics),
+            weighted_score=None,
+            latency_ms=0,
+            skipped_metrics=not_run,
+        )
+
+    started = time.monotonic()
+
+    def elapsed_ms() -> int:
+        """Return whole milliseconds elapsed since scoring started."""
+        return int((time.monotonic() - started) * 1000)
+
+    try:
+        measured = inline_scorer.score(
+            question=request.question,
+            answer=request.answer,
+            contexts=request.contexts_as_list(),
+            ground_truth=request.ground_truth,
+            metrics=runnable,
+            judge_model=judge,
+            embedding_model=embedder,
+            settings=config,
+        )
+    except Exception as exc:  # noqa: BLE001
+        # Scoring failures are reported inside the payload instead of raised.
+        return ScoreResponse(
+            scores={},
+            weighted_score=None,
+            latency_ms=elapsed_ms(),
+            skipped_metrics=not_run,
+            error=f"{type(exc).__name__}: {exc}",
+        )
+
+    took_ms = elapsed_ms()
+
+    # Start from a null entry for every requested metric so skipped ones stay
+    # visible to callers, then overlay whatever the scorer actually produced.
+    merged: dict[str, float | None] = {**dict.fromkeys(request.metrics), **measured}
+
+    # Only metrics that yielded a value take part in the weighted aggregate.
+    present = {name: score for name, score in measured.items() if score is not None}
+    combined = compute_weighted_score(present, {})
+    if combined is not None:
+        combined = round(combined, 4)
+
+    return ScoreResponse(
+        scores=merged,
+        weighted_score=combined,
+        latency_ms=took_ms,
+        skipped_metrics=not_run,
+    )