# siemens_ragas/webapp/api/score.py

"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""

from __future__ import annotations

import logging
import time
from typing import Annotated

from fastapi import APIRouter, Header, HTTPException, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse

from rag_eval.metrics.weights import compute_weighted_score
from rag_eval.settings import EvaluationSettings
from webapp.models import ScoreRequest, ScoreResponse
from webapp.services.inline_scorer import inline_scorer

# Router for this module: every route is served under /api/score and tagged "score".
router = APIRouter(prefix="/api/score", tags=["score"])
# Module logger, registered under an explicit fixed name rather than __name__.
logger = logging.getLogger("webapp.api.score")


def _get_settings() -> EvaluationSettings:
    """Build a new ``EvaluationSettings`` object on every call and return it.

    Kept as a standalone function so tests can override it.
    """
    fresh_settings = EvaluationSettings()
    return fresh_settings


def _check_auth(authorization: str | None, token: str) -> None:
    """Raise 401 if Bearer token does not match the configured token."""
    if authorization is None:
        raise HTTPException(status_code=401, detail="Missing Authorization header.")
    parts = authorization.split(" ", 1)
    if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
        raise HTTPException(status_code=401, detail="Invalid Bearer token.")


# Text published as the route description in the OpenAPI schema / Swagger UI.
# It is passed explicitly via ``description=`` so the published documentation
# stays exactly as authored while the function docstring is written in English.
_SCORE_DESCRIPTION = """接受单条问答记录，同步运行 RAGAS 指标打分，实时返回各指标得分。

**主要用途**：供 Dify 外部 Tool 调用。Dify Agent 在生成回答后，将
`(question, answer, contexts)` 发送到此端点，即可获得 RAGAS 质量评分，
用于日志记录、质量监控或触发 Agent 自我改进流程。

**contexts 格式**：多个检索片段用 `context_separator`（默认 `" |||| "`）拼接为一个字符串，
服务端自动拆分后传入 RAGAS 管道。

**ground_truth 可选**：
- 提供时：所有指定指标均参与计算。
- 缺失时：自动跳过依赖参考答案的指标（`context_recall`、
  `factual_correctness`、`semantic_similarity`、`noise_sensitivity`），
  跳过的指标在响应的 `skipped_metrics` 列表中列出，对应 `scores` 值为 `null`。

**支持的 RAGAS 指标**：
- `faithfulness` — 回答与检索片段的事实一致性
- `answer_relevancy` — 回答与问题的相关性
- `context_recall` — 参考答案覆盖到的检索内容比例（需 ground_truth）
- `context_precision` — 检索片段中与答案相关的部分占比
- `noise_sensitivity` — 对无关噪声片段的敏感度（需 ground_truth）
- `factual_correctness` — 回答与参考答案的事实准确性（需 ground_truth）
- `semantic_similarity` — 回答与参考答案的语义相似度（需 ground_truth）

**推荐模型配置**：
- `judge_model`: `gpt-5.4`
- `embedding_model`: `text-embedding-3-small`

**鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需在请求头携带
`Authorization: Bearer <token>`；留空则无需鉴权（适合内网部署）。
"""


@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分（Dify 外部 Tool）",
    description=_SCORE_DESCRIPTION,
    responses={
        200: {
            "description": "各指标得分、加权综合得分及耗时。",
            "content": {
                "application/json": {
                    "example": {
                        "scores": {
                            "faithfulness": 0.875,
                            "answer_relevancy": 0.920,
                            "context_recall": 0.810,
                            "context_precision": 0.850,
                        },
                        "weighted_score": 0.8638,
                        "latency_ms": 3420,
                        "skipped_metrics": [],
                        "error": None,
                    }
                }
            },
        },
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败（必填字段缺失或 metrics 名称不合法）。"},
    },
)
def score_sample(
    raw_request: Request,
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
) -> ScoreResponse:
    """Score a single question/answer sample synchronously and return the scores.

    Steps, in order:

    1. Log request metadata (client address, method, content type, requested
       metrics, whether a ground truth was supplied).
    2. Load settings; when ``settings.score_api_token`` is truthy, require a
       matching ``Authorization: Bearer <token>`` header.
    3. Resolve the judge and embedding models: the request value wins,
       otherwise the settings default is used.
    4. Ask the request for its effective metrics; requested metrics that are
       not effective are reported in ``skipped_metrics``.
    5. Run ``inline_scorer.score`` and compute the weighted score over the
       non-null results.

    Args:
        raw_request: The underlying HTTP request, used only for logging.
        request: The validated scoring payload.
        authorization: Value of the ``Authorization`` header, if present.

    Returns:
        A ``ScoreResponse``. Every requested metric appears in ``scores``
        (``None`` for metrics that were skipped) on the success and
        nothing-to-score paths. If the scorer raises, ``scores`` is empty and
        ``error`` carries ``"<ExceptionType>: <message>"``.

    Raises:
        HTTPException: 401 when a token is configured and the header is
            missing or does not match.
    """
    peer = raw_request.client
    client = f"{peer.host}:{peer.port}" if peer else "unknown"
    logger.info(
        "[score] incoming  client=%s  method=%s  content_type=%s  metrics=%s  has_gt=%s",
        client,
        raw_request.method,
        raw_request.headers.get("content-type", ""),
        request.metrics,
        request.ground_truth is not None,
    )
    settings = _get_settings()

    # Require Bearer auth only when the deployment configured a shared token.
    if settings.score_api_token:
        _check_auth(authorization, settings.score_api_token)

    judge_model = request.judge_model or settings.ragas_judge_model
    embedding_model = request.embedding_model or settings.ragas_embedding_model
    effective = request.effective_metrics()
    skipped = sorted(set(request.metrics) - set(effective))

    # Nothing left to evaluate: answer immediately with null scores.
    if not effective:
        return ScoreResponse(
            scores={metric_name: None for metric_name in request.metrics},
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=skipped,
        )

    t0 = time.monotonic()
    try:
        raw_scores = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=effective,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=settings,
        )
    except Exception as exc:  # noqa: BLE001
        latency_ms = int((time.monotonic() - t0) * 1000)
        # The failure is reported in the response body instead of being
        # raised, so record the traceback here; otherwise it would be lost.
        logger.exception(
            "[score] failed  latency=%dms  skipped=%s",
            latency_ms,
            skipped,
        )
        return ScoreResponse(
            scores={},
            weighted_score=None,
            latency_ms=latency_ms,
            skipped_metrics=skipped,
            error=f"{type(exc).__name__}: {exc}",
        )

    latency_ms = int((time.monotonic() - t0) * 1000)

    # Keep skipped metrics visible to callers by emitting them as null scores.
    all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
    all_scores.update(raw_scores)

    # Only metrics that actually produced a value take part in the weighted score.
    weighted = compute_weighted_score(
        {key: value for key, value in raw_scores.items() if value is not None},
        {},
    )

    logger.info(
        "[score] done  latency=%dms  skipped=%s  scores=%s",
        latency_ms,
        skipped,
        {k: (round(v, 4) if v is not None else None) for k, v in all_scores.items()},
    )
    return ScoreResponse(
        scores=all_scores,
        weighted_score=round(weighted, 4) if weighted is not None else None,
        latency_ms=latency_ms,
        skipped_metrics=skipped,
    )
-												feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:14:19 +08:00
+								"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
 								from __future__ import annotations
-												feat: add detailed request logging to /api/score and global 422 handler

- Log incoming request (client, content-type, metrics, has_gt) on each /api/score call
- Log scoring result (latency, skipped metrics, scores) on success
- Register global RequestValidationError handler: logs url/content-type/errors
  so 422 causes are visible in server log without checking HTTP response body
- Fix jsonable_encoder for exc.errors() to handle non-serializable ctx objects

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 18:14:01 +08:00
+								import logging
-												feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:14:19 +08:00
+								import time
 								from typing import Annotated
-												feat: add detailed request logging to /api/score and global 422 handler

- Log incoming request (client, content-type, metrics, has_gt) on each /api/score call
- Log scoring result (latency, skipped metrics, scores) on success
- Register global RequestValidationError handler: logs url/content-type/errors
  so 422 causes are visible in server log without checking HTTP response body
- Fix jsonable_encoder for exc.errors() to handle non-serializable ctx objects

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 18:14:01 +08:00
+								from fastapi import APIRouter, Header, HTTPException, Request
 								from fastapi.exceptions import RequestValidationError
 								from fastapi.responses import JSONResponse
-												feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:14:19 +08:00
 								from rag_eval.metrics.weights import compute_weighted_score
 								from rag_eval.settings import EvaluationSettings
 								from webapp.models import ScoreRequest, ScoreResponse
 								from webapp.services.inline_scorer import inline_scorer
 								router = APIRouter(prefix="/api/score", tags=["score"])
-												feat: add detailed request logging to /api/score and global 422 handler

- Log incoming request (client, content-type, metrics, has_gt) on each /api/score call
- Log scoring result (latency, skipped metrics, scores) on success
- Register global RequestValidationError handler: logs url/content-type/errors
  so 422 causes are visible in server log without checking HTTP response body
- Fix jsonable_encoder for exc.errors() to handle non-serializable ctx objects

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 18:14:01 +08:00
+								logger = logging.getLogger("webapp.api.score")
-												feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:14:19 +08:00
 								def _get_settings() -> EvaluationSettings:
 								    """Return a fresh EvaluationSettings instance (overridable in tests)."""
 								    return EvaluationSettings()
 								def _check_auth(authorization: str | None, token: str) -> None:
 								    """Raise 401 if Bearer token does not match the configured token."""
 								    if authorization is None:
 								        raise HTTPException(status_code=401, detail="Missing Authorization header.")
 								    parts = authorization.split(" ", 1)
 								    if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
 								        raise HTTPException(status_code=401, detail="Invalid Bearer token.")
 								@router.post(
 								    "",
 								    response_model=ScoreResponse,
 								    summary="单题实时评分（Dify 外部 Tool）",
 								    responses={
-												docs: enhance /api/score OpenAPI docs with full Chinese docstring and response example

- Add detailed Chinese route docstring covering all 7 metrics, contexts format,
  ground_truth optional behavior, and Bearer auth instructions
- Add 200 response content example for Swagger UI Try-it-out
- Bump app version to 0.3.0

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:52:30 +08:00
+: {
 								            "description": "各指标得分、加权综合得分及耗时。",
 								            "content": {
 								                "application/json": {
 								                    "example": {
 								                        "scores": {
 								                            "faithfulness": 0.875,
 								                            "answer_relevancy": 0.920,
 								                            "context_recall": 0.810,
 								                            "context_precision": 0.850,
 								                        },
 								                        "weighted_score": 0.8638,
 								                        "latency_ms": 3420,
 								                        "skipped_metrics": [],
 								                        "error": None,
 								                    }
 								                }
 								            },
 								        },
-												feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:14:19 +08:00
+: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
-												docs: enhance /api/score OpenAPI docs with full Chinese docstring and response example

- Add detailed Chinese route docstring covering all 7 metrics, contexts format,
  ground_truth optional behavior, and Bearer auth instructions
- Add 200 response content example for Swagger UI Try-it-out
- Bump app version to 0.3.0

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:52:30 +08:00
+: {"description": "请求参数校验失败（必填字段缺失或 metrics 名称不合法）。"},
-												feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:14:19 +08:00
+								    },
 								)
 								def score_sample(
-												feat: add detailed request logging to /api/score and global 422 handler

- Log incoming request (client, content-type, metrics, has_gt) on each /api/score call
- Log scoring result (latency, skipped metrics, scores) on success
- Register global RequestValidationError handler: logs url/content-type/errors
  so 422 causes are visible in server log without checking HTTP response body
- Fix jsonable_encoder for exc.errors() to handle non-serializable ctx objects

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 18:14:01 +08:00
+								    raw_request: Request,
-												feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:14:19 +08:00
+								    request: ScoreRequest,
 								    authorization: Annotated[str | None, Header()] = None,
 								) -> ScoreResponse:
-												docs: enhance /api/score OpenAPI docs with full Chinese docstring and response example

- Add detailed Chinese route docstring covering all 7 metrics, contexts format,
  ground_truth optional behavior, and Bearer auth instructions
- Add 200 response content example for Swagger UI Try-it-out
- Bump app version to 0.3.0

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:52:30 +08:00
+								    """接受单条问答记录，同步运行 RAGAS 指标打分，实时返回各指标得分。
 								    **主要用途**：供 Dify 外部 Tool 调用。Dify Agent 在生成回答后，将
 								    `(question, answer, contexts)` 发送到此端点，即可获得 RAGAS 质量评分，
 								    用于日志记录、质量监控或触发 Agent 自我改进流程。
 								    **contexts 格式**：多个检索片段用 `context_separator`（默认 `" |||| "`）拼接为一个字符串，
 								    服务端自动拆分后传入 RAGAS 管道。
 								    **ground_truth 可选**：
 								    - 提供时：所有指定指标均参与计算。
 								    - 缺失时：自动跳过依赖参考答案的指标（`context_recall`、
 								      `factual_correctness`、`semantic_similarity`、`noise_sensitivity`），
 								      跳过的指标在响应的 `skipped_metrics` 列表中列出，对应 `scores` 值为 `null`。
 								    **支持的 RAGAS 指标**：
 								    - `faithfulness` — 回答与检索片段的事实一致性
 								    - `answer_relevancy` — 回答与问题的相关性
 								    - `context_recall` — 参考答案覆盖到的检索内容比例（需 ground_truth）
 								    - `context_precision` — 检索片段中与答案相关的部分占比
 								    - `noise_sensitivity` — 对无关噪声片段的敏感度（需 ground_truth）
 								    - `factual_correctness` — 回答与参考答案的事实准确性（需 ground_truth）
 								    - `semantic_similarity` — 回答与参考答案的语义相似度（需 ground_truth）
-												docs: update /api/score example to use gpt-5.4 and text-embedding-3-small

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-23 15:11:34 +08:00
+								    **推荐模型配置**：
 								    - `judge_model`: `gpt-5.4`
 								    - `embedding_model`: `text-embedding-3-small`
-												docs: enhance /api/score OpenAPI docs with full Chinese docstring and response example

- Add detailed Chinese route docstring covering all 7 metrics, contexts format,
  ground_truth optional behavior, and Bearer auth instructions
- Add 200 response content example for Swagger UI Try-it-out
- Bump app version to 0.3.0

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:52:30 +08:00
+								    **鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需在请求头携带
 								    `Authorization: Bearer <token>`；留空则无需鉴权（适合内网部署）。
 								    """
-												feat: add detailed request logging to /api/score and global 422 handler

- Log incoming request (client, content-type, metrics, has_gt) on each /api/score call
- Log scoring result (latency, skipped metrics, scores) on success
- Register global RequestValidationError handler: logs url/content-type/errors
  so 422 causes are visible in server log without checking HTTP response body
- Fix jsonable_encoder for exc.errors() to handle non-serializable ctx objects

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 18:14:01 +08:00
+								    client = f"{raw_request.client.host}:{raw_request.client.port}" if raw_request.client else "unknown"
 								    logger.info(
 								        "[score] incoming  client=%s  method=%s  content_type=%s  metrics=%s  has_gt=%s",
 								        client,
 								        raw_request.method,
 								        raw_request.headers.get("content-type", ""),
 								        request.metrics,
 								        request.ground_truth is not None,
 								    )
-												feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:14:19 +08:00
+								    settings = _get_settings()
 								    # Require Bearer auth only when the deployment configured a shared token.
 								    if settings.score_api_token:
 								        _check_auth(authorization, settings.score_api_token)
 								    judge_model = request.judge_model or settings.ragas_judge_model
 								    embedding_model = request.embedding_model or settings.ragas_embedding_model
 								    effective = request.effective_metrics()
 								    requested = set(request.metrics)
 								    skipped = sorted(requested - set(effective))
 								    if not effective:
 								        return ScoreResponse(
 								            scores={metric_name: None for metric_name in request.metrics},
 								            weighted_score=None,
 								            latency_ms=0,
 								            skipped_metrics=skipped,
 								        )
 								    t0 = time.monotonic()
 								    try:
 								        raw_scores = inline_scorer.score(
 								            question=request.question,
 								            answer=request.answer,
 								            contexts=request.contexts_as_list(),
 								            ground_truth=request.ground_truth,
 								            metrics=effective,
 								            judge_model=judge_model,
 								            embedding_model=embedding_model,
 								            settings=settings,
 								        )
 								    except Exception as exc:  # noqa: BLE001
 								        latency_ms = int((time.monotonic() - t0) * 1000)
 								        return ScoreResponse(
 								            scores={},
 								            weighted_score=None,
 								            latency_ms=latency_ms,
 								            skipped_metrics=skipped,
 								            error=f"{type(exc).__name__}: {exc}",
 								        )
 								    latency_ms = int((time.monotonic() - t0) * 1000)
 								    # Keep skipped metrics visible to callers by emitting them as null scores.
 								    all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
 								    all_scores.update(raw_scores)
 								    weighted = compute_weighted_score(
 								        {key: value for key, value in raw_scores.items() if value is not None},
 								        {},
 								    )
-												feat: add detailed request logging to /api/score and global 422 handler

- Log incoming request (client, content-type, metrics, has_gt) on each /api/score call
- Log scoring result (latency, skipped metrics, scores) on success
- Register global RequestValidationError handler: logs url/content-type/errors
  so 422 causes are visible in server log without checking HTTP response body
- Fix jsonable_encoder for exc.errors() to handle non-serializable ctx objects

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 18:14:01 +08:00
+								    logger.info(
 								        "[score] done  latency=%dms  skipped=%s  scores=%s",
 								        latency_ms,
 								        skipped,
 								        {k: (round(v, 4) if v is not None else None) for k, v in all_scores.items()},
 								    )
-												feat: add POST /api/score endpoint for Dify real-time scoring

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-22 15:14:19 +08:00
+								    return ScoreResponse(
 								        scores=all_scores,
 								        weighted_score=round(weighted, 4) if weighted is not None else None,
 								        latency_ms=latency_ms,
 								        skipped_metrics=skipped,
 								    )