"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint).""" from __future__ import annotations import logging import time from typing import Annotated from fastapi import APIRouter, Header, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse from rag_eval.metrics.weights import compute_weighted_score from rag_eval.settings import EvaluationSettings from webapp.models import ScoreRequest, ScoreResponse from webapp.services.inline_scorer import inline_scorer router = APIRouter(prefix="/api/score", tags=["score"]) logger = logging.getLogger("webapp.api.score") def _get_settings() -> EvaluationSettings: """Return a fresh EvaluationSettings instance (overridable in tests).""" return EvaluationSettings() def _check_auth(authorization: str | None, token: str) -> None: """Raise 401 if Bearer token does not match the configured token.""" if authorization is None: raise HTTPException(status_code=401, detail="Missing Authorization header.") parts = authorization.split(" ", 1) if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token: raise HTTPException(status_code=401, detail="Invalid Bearer token.") @router.post( "", response_model=ScoreResponse, summary="单题实时评分(Dify 外部 Tool)", responses={ 200: { "description": "各指标得分、加权综合得分及耗时。", "content": { "application/json": { "example": { "scores": { "faithfulness": 0.875, "answer_relevancy": 0.920, "context_recall": 0.810, "context_precision": 0.850, }, "weighted_score": 0.8638, "latency_ms": 3420, "skipped_metrics": [], "error": None, } } }, }, 401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"}, 422: {"description": "请求参数校验失败(必填字段缺失或 metrics 名称不合法)。"}, }, ) def score_sample( raw_request: Request, request: ScoreRequest, authorization: Annotated[str | None, Header()] = None, ) -> ScoreResponse: """接受单条问答记录,同步运行 RAGAS 指标打分,实时返回各指标得分。 **主要用途**:供 Dify 外部 Tool 调用。Dify Agent 在生成回答后,将 `(question, answer, contexts)` 发送到此端点,即可获得 RAGAS 质量评分, 用于日志记录、质量监控或触发 Agent 自我改进流程。 **contexts 格式**:多个检索片段用 `context_separator`(默认 `" |||| "`)拼接为一个字符串, 服务端自动拆分后传入 RAGAS 管道。**contexts 为可选字段**,缺失时自动跳过依赖检索内容的指标 (`faithfulness`、`context_recall`、`context_precision`、`noise_sensitivity`)。 **ground_truth 可选**: - 提供时:所有指定指标均参与计算。 - 缺失时:自动跳过依赖参考答案的指标(`context_recall`、 `factual_correctness`、`semantic_similarity`、`noise_sensitivity`), 跳过的指标在响应的 `skipped_metrics` 列表中列出,对应 `scores` 值为 `null`。 **支持的 RAGAS 指标**: - `faithfulness` — 回答与检索片段的事实一致性 - `answer_relevancy` — 回答与问题的相关性 - `context_recall` — 参考答案覆盖到的检索内容比例(需 ground_truth) - `context_precision` — 检索片段中与答案相关的部分占比 - `noise_sensitivity` — 对无关噪声片段的敏感度(需 ground_truth) - `factual_correctness` — 回答与参考答案的事实准确性(需 ground_truth) - `semantic_similarity` — 回答与参考答案的语义相似度(需 ground_truth) **推荐模型配置**: - `judge_model`: `gpt-5` - `embedding_model`: `text-embedding-3-small` **鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需在请求头携带 `Authorization: Bearer `;留空则无需鉴权(适合内网部署)。 """ client = f"{raw_request.client.host}:{raw_request.client.port}" if raw_request.client else "unknown" logger.info( "[score] incoming client=%s method=%s content_type=%s metrics=%s has_gt=%s has_ctx=%s", client, raw_request.method, raw_request.headers.get("content-type", ""), request.metrics, request.ground_truth is not None, bool(request.contexts), ) settings = _get_settings() # Require Bearer auth only when the deployment configured a shared token. if settings.score_api_token: _check_auth(authorization, settings.score_api_token) judge_model = request.judge_model or settings.ragas_judge_model embedding_model = request.embedding_model or settings.ragas_embedding_model effective = request.effective_metrics() requested = set(request.metrics) skipped = sorted(requested - set(effective)) if not effective: return ScoreResponse( scores={metric_name: None for metric_name in request.metrics}, weighted_score=None, latency_ms=0, skipped_metrics=skipped, ) t0 = time.monotonic() try: raw_scores = inline_scorer.score( question=request.question, answer=request.answer, contexts=request.contexts_as_list(), ground_truth=request.ground_truth, metrics=effective, judge_model=judge_model, embedding_model=embedding_model, settings=settings, ) except Exception as exc: # noqa: BLE001 latency_ms = int((time.monotonic() - t0) * 1000) return ScoreResponse( scores={}, weighted_score=None, latency_ms=latency_ms, skipped_metrics=skipped, error=f"{type(exc).__name__}: {exc}", ) latency_ms = int((time.monotonic() - t0) * 1000) # Keep skipped metrics visible to callers by emitting them as null scores. all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics} all_scores.update(raw_scores) # 综合加权得分计算(已暂时禁用) # weighted = compute_weighted_score( # {key: value for key, value in raw_scores.items() if value is not None}, # {}, # ) logger.info( "[score] done latency=%dms skipped=%s scores=%s", latency_ms, skipped, {k: (round(v, 4) if v is not None else None) for k, v in all_scores.items()}, ) return ScoreResponse( scores=all_scores, weighted_score=None, # 综合加权得分已暂时禁用 latency_ms=latency_ms, skipped_metrics=skipped, )