2026-06-22 15:14:19 +08:00
|
|
|
|
"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
2026-06-22 18:14:01 +08:00
|
|
|
|
import logging
|
2026-06-22 15:14:19 +08:00
|
|
|
|
import time
|
|
|
|
|
|
from typing import Annotated
|
|
|
|
|
|
|
2026-06-22 18:14:01 +08:00
|
|
|
|
from fastapi import APIRouter, Header, HTTPException, Request
|
|
|
|
|
|
from fastapi.exceptions import RequestValidationError
|
|
|
|
|
|
from fastapi.responses import JSONResponse
|
2026-06-22 15:14:19 +08:00
|
|
|
|
|
|
|
|
|
|
from rag_eval.metrics.weights import compute_weighted_score
|
|
|
|
|
|
from rag_eval.settings import EvaluationSettings
|
|
|
|
|
|
from webapp.models import ScoreRequest, ScoreResponse
|
|
|
|
|
|
from webapp.services.inline_scorer import inline_scorer
|
|
|
|
|
|
|
|
|
|
|
|
# Router for the scoring endpoint; routes declared on it live under /api/score
# and carry the "score" tag.
router = APIRouter(prefix="/api/score", tags=["score"])
|
2026-06-22 18:14:01 +08:00
|
|
|
|
# Module-level logger used by the score route for its request/result log lines.
logger = logging.getLogger("webapp.api.score")
|
2026-06-22 15:14:19 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_settings() -> EvaluationSettings:
    """Build a new EvaluationSettings object on every call.

    Kept as a standalone function so tests can override it.
    """
    fresh_settings = EvaluationSettings()
    return fresh_settings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _check_auth(authorization: str | None, token: str) -> None:
|
|
|
|
|
|
"""Raise 401 if Bearer token does not match the configured token."""
|
|
|
|
|
|
if authorization is None:
|
|
|
|
|
|
raise HTTPException(status_code=401, detail="Missing Authorization header.")
|
|
|
|
|
|
parts = authorization.split(" ", 1)
|
|
|
|
|
|
if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
|
|
|
|
|
|
raise HTTPException(status_code=401, detail="Invalid Bearer token.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分(Dify 外部 Tool)",
    responses={
        200: {
            "description": "各指标得分、加权综合得分及耗时。",
            "content": {
                "application/json": {
                    "example": {
                        "scores": {
                            "faithfulness": 0.875,
                            "answer_relevancy": 0.920,
                            "context_recall": 0.810,
                            "context_precision": 0.850,
                        },
                        "weighted_score": 0.8638,
                        "latency_ms": 3420,
                        "skipped_metrics": [],
                        "error": None,
                    }
                }
            },
        },
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败(必填字段缺失或 metrics 名称不合法)。"},
    },
)
def score_sample(
    raw_request: Request,
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
) -> ScoreResponse:
    """Accept one question/answer record, run the RAGAS metrics synchronously, and return the per-metric scores.

    **Primary use**: called as a Dify external Tool. After the Dify Agent has
    generated an answer it sends `(question, answer, contexts)` to this endpoint
    to obtain RAGAS quality scores, which can be used for logging, quality
    monitoring, or triggering an Agent self-improvement flow.

    **contexts format**: multiple retrieved chunks are joined into a single string
    with `context_separator` (default `" |||| "`); the server splits them again
    before passing them to the RAGAS pipeline.

    **ground_truth is optional**:
    - When provided: every requested metric is computed.
    - When missing: metrics that depend on a reference answer (`context_recall`,
      `factual_correctness`, `semantic_similarity`, `noise_sensitivity`) are
      skipped automatically. Skipped metrics are listed in the response's
      `skipped_metrics`, and their value in `scores` is `null`.

    **Supported RAGAS metrics**:
    - `faithfulness` — factual consistency of the answer with the retrieved chunks
    - `answer_relevancy` — relevance of the answer to the question
    - `context_recall` — share of the reference answer covered by the retrieved content (needs ground_truth)
    - `context_precision` — share of the retrieved chunks that is relevant to the answer
    - `noise_sensitivity` — sensitivity to irrelevant noisy chunks (needs ground_truth)
    - `factual_correctness` — factual accuracy of the answer against the reference answer (needs ground_truth)
    - `semantic_similarity` — semantic similarity between the answer and the reference answer (needs ground_truth)

    **Recommended model configuration**:
    - `judge_model`: `gpt-5.4`
    - `embedding_model`: `text-embedding-3-small`

    **Authentication**: if `SCORE_API_TOKEN` is configured in `.env`, the request
    must carry `Authorization: Bearer <token>`; leave it empty to disable
    authentication (suitable for intranet deployments).
    """
    client = f"{raw_request.client.host}:{raw_request.client.port}" if raw_request.client else "unknown"
    logger.info(
        "[score] incoming client=%s method=%s content_type=%s metrics=%s has_gt=%s",
        client,
        raw_request.method,
        raw_request.headers.get("content-type", ""),
        request.metrics,
        request.ground_truth is not None,
    )
    settings = _get_settings()

    # Require Bearer auth only when the deployment configured a shared token.
    if settings.score_api_token:
        _check_auth(authorization, settings.score_api_token)

    # Per-request model overrides win over the configured defaults.
    judge_model = request.judge_model or settings.ragas_judge_model
    embedding_model = request.embedding_model or settings.ragas_embedding_model

    # Metrics that were requested but are not in the effective set are
    # reported back to the caller as skipped.
    effective = request.effective_metrics()
    requested = set(request.metrics)
    skipped = sorted(requested - set(effective))

    # Nothing left to compute: answer immediately without calling the scorer.
    if not effective:
        return ScoreResponse(
            scores={metric_name: None for metric_name in request.metrics},
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=skipped,
        )

    t0 = time.monotonic()
    try:
        raw_scores = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=effective,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=settings,
        )
    except Exception as exc:  # noqa: BLE001
        # Scorer failures are reported in the response's `error` field rather
        # than raised. Log the traceback so the failure is not lost
        # server-side.
        latency_ms = int((time.monotonic() - t0) * 1000)
        logger.exception(
            "[score] failed latency=%dms metrics=%s",
            latency_ms,
            effective,
        )
        return ScoreResponse(
            scores={},
            weighted_score=None,
            latency_ms=latency_ms,
            skipped_metrics=skipped,
            error=f"{type(exc).__name__}: {exc}",
        )

    latency_ms = int((time.monotonic() - t0) * 1000)

    # Keep skipped metrics visible to callers by emitting them as null scores.
    all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
    all_scores.update(raw_scores)

    # Only metrics that actually produced a value take part in the weighted score.
    weighted = compute_weighted_score(
        {key: value for key, value in raw_scores.items() if value is not None},
        {},
    )

    logger.info(
        "[score] done latency=%dms skipped=%s scores=%s",
        latency_ms,
        skipped,
        {k: (round(v, 4) if v is not None else None) for k, v in all_scores.items()},
    )
    return ScoreResponse(
        scores=all_scores,
        weighted_score=round(weighted, 4) if weighted is not None else None,
        latency_ms=latency_ms,
        skipped_metrics=skipped,
    )
|