Files
siemens_ragas/webapp/api/session_score_jobs.py

207 lines
8.9 KiB
Python
Raw Normal View History

"""Routes for session-grouped async RAGAS scoring (Dify multi-call integration).
Use case: Dify evaluates multiple Q&A pairs in a session. Each pair gets its own
`POST /api/score/session_async` call with a shared `session_id`. All results are
accumulated into one report, visible in 运行列表 → 报告详情 (Run List → Report Details).
Key behaviour:
- Deterministic run_id: derived from session_id, so the same session always maps to the
same report directory (outputs/score-session/session-<id>/).
- Append semantics: each call adds a new sample row. Previous rows are preserved.
- Advisor regeneration: optimization_advice.md is regenerated after every call
using the full set of accumulated rows.
- Each call returns its own `job_id` for individual status polling, plus the
shared `run_id` and `session_id`.
Endpoints:
POST /api/score/session_async Submit one call (returns job_id + run_id)
GET /api/score/sessions List all sessions
GET /api/score/sessions/{session_id} Session aggregate (call_count, metric_means, jobs)
GET /api/score/session/jobs/{job_id} Status of one individual call
"""
from __future__ import annotations
import logging
from fastapi import APIRouter, HTTPException
from webapp.models import (
AsyncScoreJobStatus,
ScoreRequest,
SessionScoreJobResponse,
SessionScoreRequest,
SessionStatus,
)
from webapp.services.session_score_manager import session_score_manager
# Router for the session-scoring endpoints; every route declared in this module
# is registered on it and therefore served under the /api/score prefix.
router = APIRouter(prefix="/api/score", tags=["score"])
# Module logger, created with an explicit fixed name rather than __name__.
logger = logging.getLogger("webapp.api.session_score_jobs")
@router.post(
    "/session_async",
    status_code=202,
    response_model=SessionScoreJobResponse,
    summary="提交 Session 异步评分(多样本批量聚合)",
    description=(
        "**用途**\n"
        "- 适合 Dify 循环节点、批量问答评测、同一对话多轮累计评分。\n"
        "- 相同 `session_id` 的多次调用不会生成多个独立报告,而是持续追加到同一个 session 报告。\n\n"
        "**请求字段说明**\n"
        "- `session_id`:会话唯一标识,同一会话必须保持一致。\n"
        "- `question` / `answer`:本次待评分的问答对。\n"
        "- `contexts`:检索片段拼接字符串,按 `context_separator` 拆分。\n"
        "- `ground_truth`:标准答案,可选;缺失时会自动跳过依赖它的指标。\n"
        "- `metrics`:本次需要计算的指标列表。\n"
        "- `judge_model` / `embedding_model`:可选;为空时回退到系统默认配置。\n\n"
        "**处理行为**\n"
        "1. 服务端立即返回 `202 Accepted`,并生成本次调用的 `job_id`。\n"
        "2. 系统根据 `session_id` 计算固定 `run_id`,格式为 `session-<sanitized-session_id>`。\n"
        "3. 本次评分完成后,会向该 session 的 `scores.csv` 追加一行样本数据。\n"
        "4. 系统会基于当前 session 的全量样本重写 `summary.md`,并重新生成 `optimization_advice.md`。\n"
        "5. 报告可在「运行列表」中按 `run_id` 查看;同一 session 的后续调用会持续增量更新该报告。\n\n"
        "**后续查询接口**\n"
        "- `GET /api/score/session/jobs/{job_id}`:查询本次调用状态与得分。\n"
        "- `GET /api/score/sessions/{session_id}`:查询整个 session 的累计调用次数、指标均值、所有作业记录。\n"
        "- `GET /api/runs/{run_id}`:查看完整评估报告内容。\n\n"
        "**典型请求示例**\n"
        "```json\n"
        "{\n"
        " \"session_id\": \"dify-session-001\",\n"
        " \"question\": \"单源CT与双源CT在球管配置上有何本质区别\",\n"
        " \"answer\": \"单源CT只有一套球管-探测器系统双源CT有两套独立的球管-探测器系统。\",\n"
        " \"contexts\": \"双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管\",\n"
        " \"context_separator\": \" |||| \",\n"
        " \"metrics\": [\"answer_relevancy\", \"faithfulness\"],\n"
        " \"judge_model\": \"gpt-5.5\",\n"
        " \"embedding_model\": \"text-embedding-3-small\"\n"
        "}\n"
        "```"
    ),
    responses={
        202: {
            "description": (
                # Parentheses around "202 Accepted" and before "run_id" restored:
                # the text previously read "run_id202 Accepted" and had an
                # unmatched closing parenthesis in the rendered API docs.
                "调用已排队,立即返回 job_id + run_id(202 Accepted)\n\n"
                "相同 `session_id` 的多次调用合并为同一报告,每次调用新增一个样本行。\n"
                "评分完成后,`summary.md` 和 `optimization_advice.md` 增量更新。\n"
                "通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态,"
                "通过 `GET /api/score/session/jobs/{job_id}` 查询单次调用状态,"
                "在「运行列表」中查看完整报告(run_id 即 `session-<session_id>` 形式)。"
            ),
            "content": {
                "application/json": {
                    "example": {
                        "job_id": "abc123def456",
                        "session_id": "dify-session-001",
                        "run_id": "session-dify-session-001",
                        "status": "queued",
                        "call_count": 1,
                    }
                }
            },
        },
    },
)
def submit_session_async_score(request: SessionScoreRequest) -> SessionScoreJobResponse:
    """Queue one scoring call for a session and return its job handle.

    The request is converted into a plain ``ScoreRequest`` (every field except
    ``session_id``) and handed to ``session_score_manager.submit`` together with
    the session id. The route is declared with ``status_code=202``, so the
    response is sent as soon as the manager has accepted the call.

    According to the route description, calls sharing a ``session_id`` are
    accumulated into one report (a new row in ``scores.csv``, a rewritten
    ``summary.md`` and a regenerated ``optimization_advice.md``); that work is
    performed by the manager, not by this function.

    Args:
        request: The session-scoped scoring request.

    Returns:
        A ``SessionScoreJobResponse`` carrying the new ``job_id``, the echoed
        ``session_id``, the ``run_id`` returned by the manager, the job's
        current status and the session's ``call_count``.
    """
    logger.info(
        "[session_async] submit session_id=%s metrics=%s has_ctx=%s has_gt=%s",
        request.session_id,
        request.metrics,
        bool(request.contexts),
        bool(request.ground_truth),
    )

    # Strip session_id to build a plain ScoreRequest for the manager.
    score_request = ScoreRequest(
        question=request.question,
        answer=request.answer,
        contexts=request.contexts,
        ground_truth=request.ground_truth,
        context_separator=request.context_separator,
        metrics=request.metrics,
        judge_model=request.judge_model,
        embedding_model=request.embedding_model,
    )
    status, run_id = session_score_manager.submit(request.session_id, score_request)

    # call_count is read back from the session state after submitting; if the
    # manager has no record for the session, this is treated as the first call.
    # NOTE(review): with concurrent submits for the same session this value may
    # already include other calls — confirm whether that matters to callers.
    session_status = session_score_manager.get_session(request.session_id)
    call_count = session_status.call_count if session_status else 1

    logger.info(
        "[session_async] queued job_id=%s session_id=%s run_id=%s call=%d",
        status.job_id,
        request.session_id,
        run_id,
        call_count,
    )
    return SessionScoreJobResponse(
        job_id=status.job_id,
        session_id=request.session_id,
        run_id=run_id,
        status=status.status,
        call_count=call_count,
    )
@router.get(
    "/sessions",
    response_model=dict,
    summary="列出所有 Session 聚合状态",
    # Passed explicitly so the published API description keeps its original
    # text while the docstring below is free to serve as developer docs.
    description="返回所有 session 的聚合状态,按最近完成时间倒序排列。",
)
def list_sessions() -> dict:
    """Return the aggregate status of every known session.

    The sessions are taken from ``session_score_manager.list_sessions()`` in
    the order the manager yields them (the published API description states
    this is most recently completed first) and serialised with
    ``model_dump()``.

    Returns:
        A dict with a single ``"sessions"`` key holding the list of dumped
        session records.
    """
    all_sessions = session_score_manager.list_sessions()
    logger.info("[session_score] list_sessions count=%d", len(all_sessions))
    dumped = [entry.model_dump() for entry in all_sessions]
    return {"sessions": dumped}
@router.get(
    "/sessions/{session_id}",
    response_model=SessionStatus,
    summary="查询 Session 聚合状态(指标均值 + 所有调用记录)",
    # Passed explicitly so the published API description keeps its original
    # text while the docstring below is free to serve as developer docs.
    description=(
        "查询 session 的聚合评分状态。\n"
        "返回内容\n"
        "- `run_id`运行列表中查看完整报告\n"
        "- `call_count` session 累计调用次数\n"
        "- `metric_means`所有已累积样本的各指标均值实时读取 scores.csv\n"
        "- `jobs` session 所有调用记录列表"
    ),
    responses={404: {"description": "指定 session_id 不存在。"}},
)
def get_session(session_id: str) -> SessionStatus:
    """Look up the aggregate scoring status of one session.

    Args:
        session_id: Identifier of the session to look up.

    Returns:
        The ``SessionStatus`` returned by ``session_score_manager.get_session``.
        Per the published API description it carries ``run_id``,
        ``call_count``, ``metric_means`` and ``jobs``.

    Raises:
        HTTPException: With status 404 when the manager returns ``None`` for
            ``session_id``.
    """
    found = session_score_manager.get_session(session_id)
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail=f"Session not found: {session_id}")
@router.get(
    "/session/jobs/{job_id}",
    response_model=AsyncScoreJobStatus,
    summary="查询 Session 单次调用状态",
    # Passed explicitly so the published API description keeps its original
    # text while the docstring below is free to serve as developer docs.
    description=(
        "查询 session 评分中某次调用的状态和评分结果。\n"
        "`status` `completed` `run_id` 即所属 session 的报告目录\n"
        "`scores` 包含本次调用的各指标得分"
    ),
    responses={404: {"description": "指定 job_id 不存在。"}},
)
def get_session_job(job_id: str) -> AsyncScoreJobStatus:
    """Look up the status of a single call submitted to a session.

    Args:
        job_id: Identifier returned when the call was submitted.

    Returns:
        The ``AsyncScoreJobStatus`` returned by
        ``session_score_manager.get_job``.

    Raises:
        HTTPException: With status 404 when the manager returns ``None`` for
            ``job_id``.
    """
    job_status = session_score_manager.get_job(job_id)
    if job_status is not None:
        return job_status
    raise HTTPException(
        status_code=404, detail=f"Session score job not found: {job_id}"
    )