172 lines
6.5 KiB
Python
172 lines
6.5 KiB
Python
|
|
"""Routes for session-grouped async RAGAS scoring (Dify multi-call integration).
|
|||
|
|
|
|||
|
|
Use case: Dify evaluates multiple Q&A pairs in a session. Each pair gets its own
|
|||
|
|
`POST /api/score/session_async` call with a shared `session_id`. All results are
|
|||
|
|
accumulated into one report, visible in 「运行列表」→「报告详情」.
|
|||
|
|
|
|||
|
|
Key behaviour:
|
|||
|
|
- Deterministic run_id: derived from session_id — same session always maps to the
|
|||
|
|
same report directory (outputs/score-session/session-<id>/).
|
|||
|
|
- Append semantics: each call adds a new sample row. Previous rows are preserved.
|
|||
|
|
- Advisor regeneration: optimization_advice.md is regenerated after every call
|
|||
|
|
using the full set of accumulated rows.
|
|||
|
|
- Each call returns its own `job_id` for individual status polling, plus the
|
|||
|
|
shared `run_id` and `session_id`.
|
|||
|
|
|
|||
|
|
Endpoints:
|
|||
|
|
POST /api/score/session_async Submit one call (returns job_id + run_id)
|
|||
|
|
GET /api/score/sessions List all sessions
|
|||
|
|
GET /api/score/sessions/{session_id} Session aggregate (call_count, metric_means, jobs)
|
|||
|
|
GET /api/score/session/jobs/{job_id} Status of one individual call
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import logging
|
|||
|
|
|
|||
|
|
from fastapi import APIRouter, HTTPException
|
|||
|
|
|
|||
|
|
from webapp.models import (
|
|||
|
|
AsyncScoreJobStatus,
|
|||
|
|
ScoreRequest,
|
|||
|
|
SessionScoreJobResponse,
|
|||
|
|
SessionScoreRequest,
|
|||
|
|
SessionStatus,
|
|||
|
|
)
|
|||
|
|
from webapp.services.session_score_manager import session_score_manager
|
|||
|
|
|
|||
|
|
# Module-level logger. The name is spelled out explicitly rather than derived
# from __name__, so log routing keyed on this name is independent of the
# module's import path.
logger = logging.getLogger("webapp.api.session_score_jobs")

# Every route declared in this module is registered on this router, i.e. under
# the /api/score prefix and the "score" OpenAPI tag.
router = APIRouter(prefix="/api/score", tags=["score"])
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _build_score_request(request: SessionScoreRequest) -> ScoreRequest:
    """Copy the scoring fields of *request* into a plain ``ScoreRequest``.

    ``session_id`` is deliberately left out: it is passed to the manager as a
    separate argument.
    """
    return ScoreRequest(
        question=request.question,
        answer=request.answer,
        contexts=request.contexts,
        ground_truth=request.ground_truth,
        context_separator=request.context_separator,
        metrics=request.metrics,
        judge_model=request.judge_model,
        embedding_model=request.embedding_model,
    )


@router.post(
    "/session_async",
    status_code=202,
    response_model=SessionScoreJobResponse,
    summary="提交 Session 异步评分(多样本批量聚合)",
    # Pinned explicitly so the published OpenAPI description stays the same
    # while the function docstring below is written in English.
    description=(
        "提交 Session 异步 RAGAS 评分,立即返回 job_id。\n"
        "\n"
        "相同 `session_id` 的多次调用合并到同一评估报告中,每次调用:\n"
        "1. 新增一个样本行到 `scores.csv`\n"
        "2. 重写 `summary.md`(包含所有累积样本的指标均值)\n"
        "3. 重新生成 `optimization_advice.md`(基于全量样本的 LLM 优化建议)\n"
        "\n"
        "**适合 Dify 工作流**:在循环节点中批量调用,所有轮次共用同一 `session_id`,\n"
        "最终在 RAGAS 平台「运行列表」中查看完整的批量评估报告。"
    ),
    responses={
        202: {
            "description": (
                "调用已排队,立即返回 job_id + run_id(202 Accepted)。\n\n"
                "相同 `session_id` 的多次调用合并为同一报告,每次调用新增一个样本行。\n"
                "评分完成后,`summary.md` 和 `optimization_advice.md` 增量更新。\n"
                "通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态,"
                "通过 `GET /api/score/session/jobs/{job_id}` 查询单次调用状态,"
                "在「运行列表」中查看完整报告(run_id 即 `session-<session_id>` 形式)。"
            ),
            "content": {
                "application/json": {
                    "example": {
                        "job_id": "abc123def456",
                        "session_id": "dify-session-001",
                        "run_id": "session-dify-session-001",
                        "status": "queued",
                        "call_count": 1,
                    }
                }
            },
        },
    },
)
def submit_session_async_score(request: SessionScoreRequest) -> SessionScoreJobResponse:
    """Queue one scoring call for a session and return its identifiers.

    The request's scoring fields are repackaged as a ``ScoreRequest`` and handed
    to ``session_score_manager.submit`` together with ``request.session_id``.
    The manager returns the job status and the ``run_id`` for the session.

    Args:
        request: The scoring payload plus the ``session_id`` it belongs to.

    Returns:
        A ``SessionScoreJobResponse`` carrying the new ``job_id``, the
        ``session_id``, the ``run_id``, the job's current ``status`` and the
        session's ``call_count``.
    """
    logger.info(
        "[session_async] submit session_id=%s metrics=%s has_ctx=%s has_gt=%s",
        request.session_id,
        request.metrics,
        bool(request.contexts),
        bool(request.ground_truth),
    )

    payload = _build_score_request(request)
    job, run_id = session_score_manager.submit(request.session_id, payload)

    # call_count is read back from the manager in a separate call after the
    # submission; fall back to 1 when the manager reports no session state.
    # NOTE(review): with concurrent submissions for the same session this may
    # already include other calls - confirm against the manager's semantics.
    aggregate = session_score_manager.get_session(request.session_id)
    if aggregate:
        call_count = aggregate.call_count
    else:
        call_count = 1

    logger.info(
        "[session_async] queued job_id=%s session_id=%s run_id=%s call=%d",
        job.job_id,
        request.session_id,
        run_id,
        call_count,
    )
    return SessionScoreJobResponse(
        job_id=job.job_id,
        session_id=request.session_id,
        run_id=run_id,
        status=job.status,
        call_count=call_count,
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get(
    "/sessions",
    response_model=dict,
    summary="列出所有 Session 聚合状态",
    # Pinned explicitly so the published OpenAPI description stays the same
    # while the function docstring below is written in English.
    description="返回所有 session 的聚合状态,按最近完成时间倒序排列。",
)
def list_sessions() -> dict:
    """Return the aggregate status of every session known to the manager.

    Each entry returned by ``session_score_manager.list_sessions()`` is
    serialised with ``model_dump()``; the order is whatever the manager yields
    (the published description states most recently completed first - this
    module does not sort, so that is the manager's responsibility).

    Returns:
        A dict with a single ``"sessions"`` key holding the serialised entries.
    """
    all_sessions = session_score_manager.list_sessions()
    logger.info("[session_score] list_sessions count=%d", len(all_sessions))
    serialised = [entry.model_dump() for entry in all_sessions]
    return {"sessions": serialised}
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get(
    "/sessions/{session_id}",
    response_model=SessionStatus,
    summary="查询 Session 聚合状态(指标均值 + 所有调用记录)",
    # Pinned explicitly so the published OpenAPI description stays the same
    # while the function docstring below is written in English.
    description=(
        "查询 session 的聚合评分状态。\n"
        "\n"
        "返回内容:\n"
        "- `run_id`:在「运行列表」中查看完整报告\n"
        "- `call_count`:本 session 累计调用次数\n"
        "- `metric_means`:所有已累积样本的各指标均值(实时读取 scores.csv)\n"
        "- `jobs`:本 session 所有调用记录列表"
    ),
    responses={404: {"description": "指定 session_id 不存在。"}},
)
def get_session(session_id: str) -> SessionStatus:
    """Look up the aggregate status of one session.

    Args:
        session_id: Identifier taken from the URL path.

    Returns:
        The ``SessionStatus`` the manager holds for ``session_id``.

    Raises:
        HTTPException: 404 when the manager returns ``None`` for this id.
    """
    aggregate = session_score_manager.get_session(session_id)
    if aggregate is not None:
        return aggregate
    raise HTTPException(status_code=404, detail=f"Session not found: {session_id}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get(
    "/session/jobs/{job_id}",
    response_model=AsyncScoreJobStatus,
    summary="查询 Session 单次调用状态",
    # Pinned explicitly so the published OpenAPI description stays the same
    # while the function docstring below is written in English.
    description=(
        "查询 session 评分中某次调用的状态和评分结果。\n"
        "\n"
        "`status` 为 `completed` 时,`run_id` 即所属 session 的报告目录,\n"
        "`scores` 包含本次调用的各指标得分。"
    ),
    responses={404: {"description": "指定 job_id 不存在。"}},
)
def get_session_job(job_id: str) -> AsyncScoreJobStatus:
    """Look up the status of a single scoring call within a session.

    Args:
        job_id: Identifier taken from the URL path, as returned by the submit
            endpoint.

    Returns:
        The ``AsyncScoreJobStatus`` the manager holds for ``job_id``.

    Raises:
        HTTPException: 404 when the manager returns ``None`` for this id.
    """
    job = session_score_manager.get_job(job_id)
    if job is not None:
        return job
    raise HTTPException(
        status_code=404, detail=f"Session score job not found: {job_id}"
    )
|