Files
siemens_ragas/webapp/api/score.py
wangwei a781ba1e4a config: set default judge_model=gpt-5, embedding_model=text-embedding-3-small
gpt-5.4/5.5/5.2/5.4-mini/5.4-nano are incompatible with RAGAS 0.4.3
because they require max_completion_tokens instead of max_tokens.
gpt-5 / gpt-4.1 support max_tokens and json_object mode required by RAGAS.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-23 15:29:01 +08:00

174 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
from __future__ import annotations

import hmac
import logging
import time
from typing import Annotated

from fastapi import APIRouter, Header, HTTPException, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse

from rag_eval.metrics.weights import compute_weighted_score
from rag_eval.settings import EvaluationSettings
from webapp.models import ScoreRequest, ScoreResponse
from webapp.services.inline_scorer import inline_scorer
# Router for the scoring endpoint; routes registered on it live under /api/score.
router = APIRouter(prefix="/api/score", tags=["score"])
# Module logger, registered under an explicit name rather than __name__.
logger = logging.getLogger("webapp.api.score")
def _get_settings() -> EvaluationSettings:
    """Build the evaluation settings used while handling one request.

    A new ``EvaluationSettings`` object is constructed on every call instead
    of being cached; the function also serves as a seam tests can override.
    """
    settings = EvaluationSettings()
    return settings
def _check_auth(authorization: str | None, token: str) -> None:
"""Raise 401 if Bearer token does not match the configured token."""
if authorization is None:
raise HTTPException(status_code=401, detail="Missing Authorization header.")
parts = authorization.split(" ", 1)
if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
raise HTTPException(status_code=401, detail="Invalid Bearer token.")
@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分Dify 外部 Tool",
    responses={
        200: {
            "description": "各指标得分、加权综合得分及耗时。",
            "content": {
                "application/json": {
                    "example": {
                        "scores": {
                            "faithfulness": 0.875,
                            "answer_relevancy": 0.920,
                            "context_recall": 0.810,
                            "context_precision": 0.850,
                        },
                        "weighted_score": 0.8638,
                        "latency_ms": 3420,
                        "skipped_metrics": [],
                        "error": None,
                    }
                }
            },
        },
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败(必填字段缺失或 metrics 名称不合法)。"},
    },
)
def score_sample(
    raw_request: Request,
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
) -> ScoreResponse:
    """Score one question/answer record with RAGAS and return the scores synchronously.

    **Primary use**: called as a Dify external Tool. After producing an answer,
    a Dify Agent sends `(question, answer, contexts)` to this endpoint to obtain
    RAGAS quality scores for logging, quality monitoring, or triggering an
    agent self-improvement flow.

    **contexts format**: multiple retrieved chunks are joined into a single
    string with `context_separator` (default `" |||| "`); the server splits the
    string before handing the chunks to the RAGAS pipeline.

    **ground_truth is optional**:

    - When provided: every requested metric is computed.
    - When missing: metrics that depend on a reference answer are skipped
      automatically (`context_recall`, `factual_correctness`,
      `semantic_similarity`, `noise_sensitivity`). Skipped metrics are listed
      in the response's `skipped_metrics`, and their `scores` value is `null`.

    **Supported RAGAS metrics**:

    - `faithfulness` — factual consistency of the answer with the retrieved chunks
    - `answer_relevancy` — relevance of the answer to the question
    - `context_recall` — share of the reference answer covered by the retrieved content (needs ground_truth)
    - `context_precision` — share of the retrieved chunks that is relevant to the answer
    - `noise_sensitivity` — sensitivity to irrelevant, noisy chunks (needs ground_truth)
    - `factual_correctness` — factual accuracy of the answer against the reference answer (needs ground_truth)
    - `semantic_similarity` — semantic similarity of the answer to the reference answer (needs ground_truth)

    **Recommended model configuration**:

    - `judge_model`: `gpt-5`
    - `embedding_model`: `text-embedding-3-small`

    **Authentication**: when `SCORE_API_TOKEN` is set in `.env`, the request
    must carry `Authorization: Bearer <token>`; when it is left empty no
    authentication is required (suitable for intranet deployments).

    **Failures**: if the scorer raises, the error is logged and a normal
    response is returned with empty `scores`, a `null` `weighted_score`, and
    the exception summarised in `error`.
    """
    if raw_request.client:
        client = f"{raw_request.client.host}:{raw_request.client.port}"
    else:
        client = "unknown"
    logger.info(
        "[score] incoming client=%s method=%s content_type=%s metrics=%s has_gt=%s",
        client,
        raw_request.method,
        raw_request.headers.get("content-type", ""),
        request.metrics,
        request.ground_truth is not None,
    )

    settings = _get_settings()
    # Require Bearer auth only when the deployment configured a shared token.
    if settings.score_api_token:
        _check_auth(authorization, settings.score_api_token)

    # Per-request model names win over the configured defaults.
    judge_model = request.judge_model or settings.ragas_judge_model
    embedding_model = request.embedding_model or settings.ragas_embedding_model

    # Metrics that were requested but are not in the effective set are
    # reported back as skipped.
    effective = request.effective_metrics()
    requested = set(request.metrics)
    skipped = sorted(requested - set(effective))

    if not effective:
        # Nothing can be scored: answer immediately without calling the scorer.
        return ScoreResponse(
            scores={metric_name: None for metric_name in request.metrics},
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=skipped,
        )

    t0 = time.monotonic()
    try:
        raw_scores = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=effective,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=settings,
        )
    except Exception as exc:  # noqa: BLE001
        latency_ms = int((time.monotonic() - t0) * 1000)
        # The failure is reported to the caller in the response body instead
        # of being raised, so record the traceback here; otherwise scorer
        # errors would leave no trace on the server.
        logger.exception(
            "[score] failed latency=%dms metrics=%s judge_model=%s embedding_model=%s",
            latency_ms,
            effective,
            judge_model,
            embedding_model,
        )
        return ScoreResponse(
            scores={},
            weighted_score=None,
            latency_ms=latency_ms,
            skipped_metrics=skipped,
            error=f"{type(exc).__name__}: {exc}",
        )
    latency_ms = int((time.monotonic() - t0) * 1000)

    # Keep skipped metrics visible to callers by emitting them as null scores.
    all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
    all_scores.update(raw_scores)

    # Only metrics that produced a value are passed to the weighted score.
    weighted = compute_weighted_score(
        {key: value for key, value in raw_scores.items() if value is not None},
        {},
    )
    logger.info(
        "[score] done latency=%dms skipped=%s scores=%s",
        latency_ms,
        skipped,
        {k: (round(v, 4) if v is not None else None) for k, v in all_scores.items()},
    )
    return ScoreResponse(
        scores=all_scores,
        weighted_score=round(weighted, 4) if weighted is not None else None,
        latency_ms=latency_ms,
        skipped_metrics=skipped,
    )