Files
siemens_ragas/webapp/api/score.py

150 lines
5.8 KiB
Python
Raw Normal View History

"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
from __future__ import annotations

import hmac
import logging
import time
from typing import Annotated

from fastapi import APIRouter, Header, HTTPException

from rag_eval.metrics.weights import compute_weighted_score
from rag_eval.settings import EvaluationSettings
from webapp.models import ScoreRequest, ScoreResponse
from webapp.services.inline_scorer import inline_scorer
# Routes registered on this router share the "/api/score" prefix and the "score" tag.
router = APIRouter(prefix="/api/score", tags=["score"])
def _get_settings() -> EvaluationSettings:
    """Build the evaluation settings used by the scoring route.

    A new ``EvaluationSettings`` object is constructed on every call; the
    indirection exists so tests can override this function.
    """
    fresh_settings = EvaluationSettings()
    return fresh_settings
def _check_auth(authorization: str | None, token: str) -> None:
"""Raise 401 if Bearer token does not match the configured token."""
if authorization is None:
raise HTTPException(status_code=401, detail="Missing Authorization header.")
parts = authorization.split(" ", 1)
if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
raise HTTPException(status_code=401, detail="Invalid Bearer token.")
@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分Dify 外部 Tool",
    responses={
        200: {
            "description": "各指标得分、加权综合得分及耗时。",
            "content": {
                "application/json": {
                    "example": {
                        "scores": {
                            "faithfulness": 0.875,
                            "answer_relevancy": 0.920,
                            "context_recall": 0.810,
                            "context_precision": 0.850,
                        },
                        "weighted_score": 0.8638,
                        "latency_ms": 3420,
                        "skipped_metrics": [],
                        "error": None,
                    }
                }
            },
        },
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败(必填字段缺失或 metrics 名称不合法)。"},
    },
)
def score_sample(
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
) -> ScoreResponse:
    """Score a single Q&A record synchronously with RAGAS and return per-metric scores.

    **Primary use**: called as a Dify external Tool. After generating an answer,
    a Dify Agent sends `(question, answer, contexts)` to this endpoint to obtain
    RAGAS quality scores, which can be used for logging, quality monitoring, or
    triggering an Agent self-improvement flow.

    **contexts format**: multiple retrieved chunks are joined into one string
    with `context_separator` (default `" |||| "`); the server splits them
    before passing them to the RAGAS pipeline.

    **ground_truth (optional)**:

    - When provided, all requested metrics are computed.
    - When missing, metrics that depend on a reference answer are skipped
      automatically (`context_recall`, `factual_correctness`,
      `semantic_similarity`, `noise_sensitivity`). Skipped metrics are listed
      in the response's `skipped_metrics`, and their `scores` value is `null`.

    **Supported RAGAS metrics**:

    - `faithfulness`: factual consistency of the answer with the retrieved chunks
    - `answer_relevancy`: relevance of the answer to the question
    - `context_recall`: share of the reference answer covered by the retrieved
      content (requires ground_truth)
    - `context_precision`: share of the retrieved chunks relevant to the answer
    - `noise_sensitivity`: sensitivity to irrelevant noisy chunks
      (requires ground_truth)
    - `factual_correctness`: factual accuracy of the answer against the
      reference answer (requires ground_truth)
    - `semantic_similarity`: semantic similarity between the answer and the
      reference answer (requires ground_truth)

    **Authentication**: when `SCORE_API_TOKEN` is configured in `.env`, the
    request must carry `Authorization: Bearer <token>`; leave it empty to
    disable authentication (suitable for intranet deployments).

    Returns:
        A `ScoreResponse` with one `scores` entry per requested metric (`null`
        when no value was computed), the weighted score rounded to 4 decimals,
        the scoring latency in milliseconds, and the skipped metrics. If
        scoring raises, `error` carries the exception type and message and
        every requested metric is reported as `null`.

    Raises:
        HTTPException: 401 when a token is configured and the Bearer
            credentials are missing or invalid.
    """
    settings = _get_settings()
    # Require Bearer auth only when the deployment configured a shared token.
    if settings.score_api_token:
        _check_auth(authorization, settings.score_api_token)

    # Per-request model overrides win over the deployment defaults.
    judge_model = request.judge_model or settings.ragas_judge_model
    embedding_model = request.embedding_model or settings.ragas_embedding_model

    effective = request.effective_metrics()
    skipped = sorted(set(request.metrics) - set(effective))

    # Every requested metric appears in the response on every path; metrics
    # that were skipped or could not be computed stay null.
    null_scores: dict[str, float | None] = dict.fromkeys(request.metrics)

    if not effective:
        # Nothing can be computed, so answer immediately without calling the scorer.
        return ScoreResponse(
            scores=null_scores,
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=skipped,
        )

    t0 = time.monotonic()
    try:
        raw_scores = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=effective,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=settings,
        )
    except Exception as exc:  # noqa: BLE001
        # Best-effort endpoint: report the failure in the payload instead of a
        # 5xx, but keep the traceback in the server log for diagnosis.
        latency_ms = int((time.monotonic() - t0) * 1000)
        logging.getLogger(__name__).exception("Inline RAGAS scoring failed")
        return ScoreResponse(
            scores=null_scores,
            weighted_score=None,
            latency_ms=latency_ms,
            skipped_metrics=skipped,
            error=f"{type(exc).__name__}: {exc}",
        )
    latency_ms = int((time.monotonic() - t0) * 1000)

    # Keep skipped metrics visible to callers by emitting them as null scores.
    all_scores: dict[str, float | None] = dict(null_scores)
    all_scores.update(raw_scores)

    # Only metrics with an actual value take part in the weighted score.
    weighted = compute_weighted_score(
        {key: value for key, value in raw_scores.items() if value is not None},
        {},
    )
    return ScoreResponse(
        scores=all_scores,
        weighted_score=round(weighted, 4) if weighted is not None else None,
        latency_ms=latency_ms,
        skipped_metrics=skipped,
    )