Files
siemens_ragas/webapp/api/session_score_jobs.py
wangwei 754a30ad59 feat(session-async): add /api/score/session_async with incremental session report aggregation
- New POST /api/score/session_async endpoint: same session_id calls append to one shared report
- New GET /api/score/sessions/{session_id}: returns call_count, metric_means, all job records
- New GET /api/score/session/jobs/{job_id}: individual call status
- SessionScoreJobManager: deterministic run_id from session_id, per-session mutex for CSV append, advisor regenerated on every call
- SessionScoreRequest (extends ScoreRequest + session_id), SessionScoreJobResponse, SessionStatus models added
- 24 new tests, all passing

chore(weighted-score): comment out 综合加权得分 display and computation

- report.js: hide 综合加权得分 card in report detail page
- score_jobs.js: hide 综合 chip in async job list
- report_builder.py: overall_ws=None (computation disabled)
- summary.py: weighted_score summary line disabled
- evaluator.py: weighted_score/sample_weight columns no longer written to scores.csv
- score.py /api/score: weighted_score always returns null
- score_job_manager.py + session_score_manager.py: weighted=None
- Updated 3 tests to match new behaviour (6 pre-existing failures unchanged)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-26 16:09:33 +08:00

172 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Routes for session-grouped async RAGAS scoring (Dify multi-call integration).
Use case: Dify evaluates multiple Q&A pairs in a session. Each pair gets its own
`POST /api/score/session_async` call with a shared `session_id`. All results are
accumulated into one report, visible in 「运行列表」→「报告详情」.
Key behaviour:
- Deterministic run_id: derived from session_id, so the same session always
  maps to the same report directory (outputs/score-session/session-<id>/).
- Append semantics: each call adds a new sample row. Previous rows are preserved.
- Advisor regeneration: optimization_advice.md is regenerated after every call
  using the full set of accumulated rows.
- Each call returns its own `job_id` for individual status polling, plus the
  shared `run_id` and `session_id`.

Endpoints:
    POST /api/score/session_async          Submit one call (returns job_id + run_id)
    GET  /api/score/sessions               List all sessions
    GET  /api/score/sessions/{session_id}  Session aggregate (call_count, metric_means, jobs)
    GET  /api/score/session/jobs/{job_id}  Status of one individual call
"""
from __future__ import annotations
import logging
from fastapi import APIRouter, HTTPException
from webapp.models import (
AsyncScoreJobStatus,
ScoreRequest,
SessionScoreJobResponse,
SessionScoreRequest,
SessionStatus,
)
from webapp.services.session_score_manager import session_score_manager
# Router that owns the session-scoring endpoints; every route registered on it
# is served under the "/api/score" prefix and grouped under the "score" tag.
router = APIRouter(prefix="/api/score", tags=["score"])
# Module-level logger. The name is spelled out explicitly rather than derived
# from __name__.
logger = logging.getLogger("webapp.api.session_score_jobs")
@router.post(
    "/session_async",
    status_code=202,
    response_model=SessionScoreJobResponse,
    summary="提交 Session 异步评分(多样本批量聚合)",
    responses={
        202: {
            # NOTE(review): the first and last fragments of this description
            # look like they lost their parentheses ("run_id202 Accepted",
            # and a closing bracket with no opener) -- confirm against the
            # rendered API docs. The text is reproduced unchanged here.
            "description": (
                "调用已排队,立即返回 job_id + run_id202 Accepted\n\n"
                "相同 `session_id` 的多次调用合并为同一报告,每次调用新增一个样本行。\n"
                "评分完成后,`summary.md` 和 `optimization_advice.md` 增量更新。\n"
                "通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态,"
                "通过 `GET /api/score/session/jobs/{job_id}` 查询单次调用状态,"
                "在「运行列表」中查看完整报告run_id 即 `session-<session_id>` 形式)。"
            ),
            "content": {
                "application/json": {
                    "example": {
                        "job_id": "abc123def456",
                        "session_id": "dify-session-001",
                        "run_id": "session-dify-session-001",
                        "status": "queued",
                        "call_count": 1,
                    }
                }
            },
        },
    },
)
def submit_session_async_score(request: SessionScoreRequest) -> SessionScoreJobResponse:
    """Queue one session-scoped RAGAS scoring call and return its job handle.

    Calls that share a `session_id` are merged into a single evaluation
    report. As described for this endpoint, each call:

    1. appends one sample row to `scores.csv`
    2. rewrites `summary.md` (metric means over all accumulated samples)
    3. regenerates `optimization_advice.md` (LLM advice based on all samples)

    Suited to Dify workflows: invoke it from a loop node with every round
    sharing the same `session_id`, then open the complete batch report from
    the platform's run list.

    Args:
        request: The scoring payload plus the `session_id` it belongs to.

    Returns:
        The new call's `job_id` and status, the session's shared `run_id`,
        and the session's call count as read right after submission.
    """
    session_id = request.session_id
    logger.info(
        "[session_async] submit session_id=%s metrics=%s has_ctx=%s has_gt=%s",
        session_id,
        request.metrics,
        bool(request.contexts),
        bool(request.ground_truth),
    )

    # The manager takes a plain ScoreRequest, so forward every scoring field
    # and leave session_id out (it is passed separately below).
    forwarded_fields = (
        "question",
        "answer",
        "contexts",
        "ground_truth",
        "context_separator",
        "metrics",
        "judge_model",
        "embedding_model",
    )
    score_request = ScoreRequest(
        **{field: getattr(request, field) for field in forwarded_fields}
    )

    job, run_id = session_score_manager.submit(session_id, score_request)
    job_id = job.job_id

    # call_count comes from the session aggregate as it stands after the
    # submit; fall back to 1 when the manager has no aggregate to report.
    aggregate = session_score_manager.get_session(session_id)
    if aggregate:
        call_count = aggregate.call_count
    else:
        call_count = 1

    logger.info(
        "[session_async] queued job_id=%s session_id=%s run_id=%s call=%d",
        job_id, session_id, run_id, call_count,
    )
    return SessionScoreJobResponse(
        job_id=job_id,
        session_id=session_id,
        run_id=run_id,
        status=job.status,
        call_count=call_count,
    )
@router.get(
    "/sessions",
    response_model=dict,
    summary="列出所有 Session 聚合状态",
)
def list_sessions() -> dict:
    """Return the aggregate status of every known session.

    The entries are emitted in the order the session manager hands them back
    (documented as most recently completed first); no sorting happens here.

    Returns:
        A dict with a single "sessions" key holding each session status
        dumped to a plain dict.
    """
    all_sessions = session_score_manager.list_sessions()
    logger.info("[session_score] list_sessions count=%d", len(all_sessions))
    serialized = [entry.model_dump() for entry in all_sessions]
    return {"sessions": serialized}
@router.get(
    "/sessions/{session_id}",
    response_model=SessionStatus,
    summary="查询 Session 聚合状态(指标均值 + 所有调用记录)",
    responses={404: {"description": "指定 session_id 不存在。"}},
)
def get_session(session_id: str) -> SessionStatus:
    """Look up the aggregate scoring status of one session.

    The returned status is described as carrying:

    - `run_id`: identifies the full report in the run list
    - `call_count`: cumulative number of calls made in this session
    - `metric_means`: per-metric means over all accumulated samples
      (read live from scores.csv)
    - `jobs`: the records of every call in this session

    Args:
        session_id: Identifier of the session to look up.

    Returns:
        The session's aggregate status as provided by the session manager.

    Raises:
        HTTPException: 404 when the manager has no such session.
    """
    session_status = session_score_manager.get_session(session_id)
    if session_status is not None:
        return session_status
    raise HTTPException(status_code=404, detail=f"Session not found: {session_id}")
@router.get(
    "/session/jobs/{job_id}",
    response_model=AsyncScoreJobStatus,
    summary="查询 Session 单次调用状态",
    responses={404: {"description": "指定 job_id 不存在。"}},
)
def get_session_job(job_id: str) -> AsyncScoreJobStatus:
    """Look up the status and scores of one individual call within a session.

    Once `status` is `completed`, `run_id` names the report directory of the
    owning session and `scores` holds this call's per-metric scores.

    Args:
        job_id: Identifier returned when the call was submitted.

    Returns:
        The job status record as provided by the session manager.

    Raises:
        HTTPException: 404 when the manager has no such job.
    """
    job_status = session_score_manager.get_job(job_id)
    if job_status is not None:
        return job_status
    not_found_detail = f"Session score job not found: {job_id}"
    raise HTTPException(status_code=404, detail=not_found_detail)