Compare commits
23 Commits
24956bbf75
...
1bcb208f92
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1bcb208f92 | ||
|
|
a03a24be4e | ||
|
|
e4d4e4968b | ||
|
|
761faf9c42 | ||
|
|
9ad6ad4ebc | ||
|
|
eee96eb158 | ||
|
|
ccf25eb1f9 | ||
|
|
199b3af611 | ||
|
|
f9e3ba0f64 | ||
|
|
36e5506e2a | ||
|
|
835614189e | ||
|
|
ce0d2291b0 | ||
|
|
480f6d66ea | ||
|
|
d371ef7d24 | ||
|
|
8617eaa5aa | ||
|
|
e0b064587f | ||
|
|
078097af00 | ||
|
|
ca586bf9bb | ||
|
|
9ad2daff73 | ||
|
|
e8af5b906c | ||
|
|
8ea2b9c7d2 | ||
|
|
074800b741 | ||
|
|
3019390592 |
@@ -30,3 +30,8 @@ PARSER_FAILURE_MODE=fail
|
|||||||
|
|
||||||
# 生成题库时使用的模型(可在 Web 控制台 LLM 配置中按场景覆盖)
|
# 生成题库时使用的模型(可在 Web 控制台 LLM 配置中按场景覆盖)
|
||||||
DATASET_GENERATOR_MODEL=qwen3.6-plus
|
DATASET_GENERATOR_MODEL=qwen3.6-plus
|
||||||
|
|
||||||
|
# ===== Dify 集成 — 实时评分 API =====
|
||||||
|
# 为 /api/score 端点设置 Bearer Token 鉴权(留空则不鉴权,适合内网部署)
|
||||||
|
# Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
|
||||||
|
SCORE_API_TOKEN=
|
||||||
|
|||||||
173
deploy.sh
Normal file
173
deploy.sh
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# deploy.sh — Siemens RAGAS 一键部署脚本(Linux)
|
||||||
|
# 用法:bash deploy.sh
|
||||||
|
# 功能:检查环境 → 安装依赖 → 初始化配置 → 启动后台服务
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
cd "$SCRIPT_DIR"
|
||||||
|
|
||||||
|
# ── 颜色输出 ──────────────────────────────────────────────────────
|
||||||
|
if [ -t 1 ]; then
|
||||||
|
GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
|
||||||
|
else
|
||||||
|
GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
|
||||||
|
fi
|
||||||
|
|
||||||
|
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||||
|
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||||
|
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }
|
||||||
|
info() { echo -e "${CYAN}[INFO]${NC} $*"; }
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo -e "${CYAN} Siemens RAGAS Console — Linux 一键部署${NC}"
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── 阶段 1:Python 版本检查 ───────────────────────────────────────
|
||||||
|
info "阶段 1/7:检查 Python 版本..."
|
||||||
|
|
||||||
|
# Locate a usable interpreter: the first candidate on PATH that reports
# Python >= 3.12 wins and is stored in PYTHON_BIN (left empty if none qualifies).
PYTHON_BIN=""
for candidate in python3.12 python3.13 python3.14 python3; do
    if command -v "$candidate" &>/dev/null; then
        # Let the interpreter compare its own version tuple. Comparing major and
        # minor separately in the shell (major >= 3 AND minor >= 12) would wrongly
        # reject any future major release whose minor is below 12.
        # The command prints "major.minor" and exits 0 only when the version is
        # acceptable, so the `if` both captures the version and tests it.
        if version=$("$candidate" -c 'import sys; print("%d.%d" % sys.version_info[:2]); sys.exit(0 if sys.version_info >= (3, 12) else 1)' 2>/dev/null); then
            PYTHON_BIN="$candidate"
            ok "Python $version ($candidate)"
            break
        fi
    fi
done
|
||||||
|
|
||||||
|
if [ -z "$PYTHON_BIN" ]; then
|
||||||
|
err "未找到 Python 3.12+。请安装后重试。"
|
||||||
|
err " Ubuntu/Debian: sudo apt install python3.12 python3.12-venv"
|
||||||
|
err " CentOS/RHEL: sudo dnf install python3.12"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── 阶段 2:虚拟环境 ──────────────────────────────────────────────
|
||||||
|
info "阶段 2/7:准备虚拟环境..."
|
||||||
|
|
||||||
|
if [ -d ".venv" ] && [ -f ".venv/bin/python" ]; then
|
||||||
|
ok ".venv 已存在,跳过创建"
|
||||||
|
else
|
||||||
|
info "创建 .venv..."
|
||||||
|
"$PYTHON_BIN" -m venv .venv
|
||||||
|
ok ".venv 创建完成"
|
||||||
|
fi
|
||||||
|
|
||||||
|
PIP=".venv/bin/pip"
|
||||||
|
PYTHON=".venv/bin/python"
|
||||||
|
|
||||||
|
# ── 阶段 3:安装依赖 ──────────────────────────────────────────────
|
||||||
|
info "阶段 3/7:安装项目依赖(可能需要几分钟)..."
|
||||||
|
|
||||||
|
"$PIP" install --upgrade pip -q
|
||||||
|
ok "pip 已升级"
|
||||||
|
|
||||||
|
"$PIP" install -e . -q
|
||||||
|
ok "项目依赖安装完成(pyproject.toml)"
|
||||||
|
|
||||||
|
"$PIP" install fastapi uvicorn httpx -q
|
||||||
|
ok "Web 服务依赖安装完成(fastapi / uvicorn / httpx)"
|
||||||
|
|
||||||
|
# ── 阶段 4:配置文件 ──────────────────────────────────────────────
|
||||||
|
info "阶段 4/7:初始化配置文件..."
|
||||||
|
|
||||||
|
if [ ! -f ".env" ]; then
|
||||||
|
cp .env.example .env
|
||||||
|
warn ".env 已从 .env.example 复制,请编辑填写实际的 API Key 等配置后再启动:"
|
||||||
|
warn " nano .env 或 vim .env"
|
||||||
|
warn " 关键字段:OPENAI_API_KEY, OPENAI_BASE_URL, ALIBABA_ACCESS_KEY_ID, ALIBABA_ACCESS_KEY_SECRET"
|
||||||
|
else
|
||||||
|
ok ".env 已存在,跳过"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── 阶段 5:目录初始化 ────────────────────────────────────────────
|
||||||
|
info "阶段 5/7:初始化目录结构..."
|
||||||
|
|
||||||
|
mkdir -p configs logs outputs datasets
|
||||||
|
ok "目录就绪:configs/ logs/ outputs/ datasets/"
|
||||||
|
|
||||||
|
# 确保其他脚本有执行权限
|
||||||
|
for script in start.sh stop.sh run_eval.sh; do
|
||||||
|
[ -f "$script" ] && chmod +x "$script"
|
||||||
|
done
|
||||||
|
ok "辅助脚本已设置执行权限"
|
||||||
|
|
||||||
|
# ── 阶段 6:Demo 数据 ─────────────────────────────────────────────
|
||||||
|
info "阶段 6/7:初始化演示数据..."
|
||||||
|
|
||||||
|
DEMO_DIR="outputs/kba-knowledge-base-offline-baseline"
|
||||||
|
if [ -d "$DEMO_DIR" ]; then
|
||||||
|
ok "演示数据已存在,跳过"
|
||||||
|
else
|
||||||
|
info "生成演示数据(scripts/seed_sample_run.py)..."
|
||||||
|
if "$PYTHON" scripts/seed_sample_run.py; then
|
||||||
|
ok "演示数据生成完成"
|
||||||
|
else
|
||||||
|
warn "演示数据生成失败,控制台报告页将为空(服务仍可正常启动)"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── 阶段 7:启动服务 ──────────────────────────────────────────────
|
||||||
|
info "阶段 7/7:启动 Web 服务..."
|
||||||
|
|
||||||
|
# 检查 .env 是否包含默认占位符
|
||||||
|
if grep -q "your-api-key" .env 2>/dev/null; then
|
||||||
|
warn ".env 中仍包含默认占位符,部分功能(评估执行)将不可用"
|
||||||
|
warn "请编辑 .env 后重新运行 start.sh"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Stop a previously deployed server BEFORE probing ports. If the old server
# were still running during the probe, its own listener would make 8800 look
# busy and every re-deploy would move the service to the other port.
if [ -f ".server.pid" ]; then
    OLD_PID=$(cat .server.pid)
    if [ -n "$OLD_PID" ] && kill -0 "$OLD_PID" 2>/dev/null; then
        warn "检测到已有服务进程 (PID=$OLD_PID),停止旧进程..."
        kill "$OLD_PID" 2>/dev/null || true
        # Wait up to ~5 seconds for the process to exit so its port is released.
        for _ in 1 2 3 4 5; do
            kill -0 "$OLD_PID" 2>/dev/null || break
            sleep 1
        done
    fi
    rm -f .server.pid
fi

# port_in_use PORT — succeeds when a TCP listener is bound to PORT.
# The listing is captured first and grep reads it from a here-string: piping
# `ss | grep -q` under `set -o pipefail` can report a busy port as free when
# grep exits early and the producer is killed by SIGPIPE.
# Best effort: if neither ss nor netstat is available the port is treated as free.
port_in_use() {
    local port="$1" listeners
    listeners="$(ss -tlnp 2>/dev/null || netstat -tlnp 2>/dev/null || true)"
    grep -q ":${port} " <<<"$listeners"
}

# Port selection: prefer 8800, fall back to 8801, otherwise give up.
PORT=8800
if port_in_use "$PORT"; then
    warn "端口 $PORT 已被占用,尝试 8801..."
    PORT=8801
    if port_in_use "$PORT"; then
        err "端口 8800 和 8801 均被占用。请手动运行:"
        err " .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
        exit 1
    fi
fi
|
||||||
|
|
||||||
|
# 后台启动
|
||||||
|
nohup "$PYTHON" webmain.py --host 0.0.0.0 --port "$PORT" >> logs/server.log 2>&1 &
|
||||||
|
SERVER_PID=$!
|
||||||
|
echo "$SERVER_PID" > .server.pid
|
||||||
|
|
||||||
|
# 等待 3 秒验证进程存活
|
||||||
|
sleep 3
|
||||||
|
if kill -0 "$SERVER_PID" 2>/dev/null; then
|
||||||
|
ok "服务已启动 (PID=$SERVER_PID)"
|
||||||
|
echo ""
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo -e "${GREEN} 部署成功!${NC}"
|
||||||
|
echo -e "${GREEN} 访问地址: http://$(hostname -I | awk '{print $1}'):${PORT}${NC}"
|
||||||
|
echo -e "${GREEN} 本机访问: http://127.0.0.1:${PORT}${NC}"
|
||||||
|
echo -e "${CYAN} 服务日志: tail -f logs/server.log${NC}"
|
||||||
|
echo -e "${CYAN} 停止服务: bash stop.sh${NC}"
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo ""
|
||||||
|
else
|
||||||
|
err "服务启动失败,请查看日志:"
|
||||||
|
err " tail -20 logs/server.log"
|
||||||
|
rm -f .server.pid
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
1537
docs/superpowers/plans/2026-06-18-metric-doc-weights.md
Normal file
1537
docs/superpowers/plans/2026-06-18-metric-doc-weights.md
Normal file
File diff suppressed because it is too large
Load Diff
974
docs/superpowers/plans/2026-06-22-dify-score-api.md
Normal file
974
docs/superpowers/plans/2026-06-22-dify-score-api.md
Normal file
@@ -0,0 +1,974 @@
|
|||||||
|
# Dify 实时评分 API Implementation Plan
|
||||||
|
|
||||||
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||||
|
|
||||||
|
**Goal:** 新增 `POST /api/score` 端点,供 Dify 外部 Tool 调用,接受单条问答记录并同步返回 RAGAS 各指标得分。
|
||||||
|
|
||||||
|
**Architecture:** 新增 `inline_scorer.py` 服务层封装 RAGAS 打分逻辑,以 `(judge_model, embedding_model)` 为 key 缓存 LLM 客户端;新增 `webapp/api/score.py` 路由;`ScoreRequest`/`ScoreResponse` 放入 `webapp/models.py`;`SCORE_API_TOKEN` 加入 `EvaluationSettings`。
|
||||||
|
|
||||||
|
**Tech Stack:** Python 3.12, FastAPI, Pydantic v2, RAGAS 0.4.3, pytest
|
||||||
|
|
||||||
|
## Global Constraints
|
||||||
|
|
||||||
|
- Python 3.12+,PEP 8,4 空格缩进,类型注解必须
|
||||||
|
- contexts 用 `context_separator`(默认 `" |||| "`)拆分为 list[str]
|
||||||
|
- ground_truth 为可选;缺失时跳过 context_recall / factual_correctness / semantic_similarity / noise_sensitivity
|
||||||
|
- SCORE_API_TOKEN 为空时不鉴权(内网部署场景)
|
||||||
|
- 所有测试用 pytest,不依赖真实 LLM
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 文件清单
|
||||||
|
|
||||||
|
| 操作 | 文件 | 职责 |
|
||||||
|
|------|------|------|
|
||||||
|
| 新建 | `webapp/services/inline_scorer.py` | LLM 客户端缓存 + 单题打分 |
|
||||||
|
| 新建 | `webapp/api/score.py` | `/api/score` 路由 |
|
||||||
|
| 新建 | `tests/webapp/test_score_api.py` | 端点测试(全 mock) |
|
||||||
|
| 修改 | `webapp/models.py` | 新增 ScoreRequest / ScoreResponse |
|
||||||
|
| 修改 | `rag_eval/settings.py` | 新增 score_api_token 字段 |
|
||||||
|
| 修改 | `webapp/server.py` | 注册 score router,更新 OPENAPI_TAGS 和 description |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 1: ScoreRequest / ScoreResponse 模型 + settings 字段
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `webapp/models.py`
|
||||||
|
- Modify: `rag_eval/settings.py`
|
||||||
|
- Test: `tests/webapp/test_score_api.py` (partial — model validation tests)
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Produces:
|
||||||
|
- `ScoreRequest` Pydantic model(见下方字段)
|
||||||
|
- `ScoreResponse` Pydantic model
|
||||||
|
- `EvaluationSettings.score_api_token: str | None`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing model-validation tests**
|
||||||
|
|
||||||
|
Create `tests/webapp/test_score_api.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Tests for POST /api/score endpoint."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError
|
||||||
|
from webapp.models import ScoreRequest, ScoreResponse
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreRequest:
|
||||||
|
def test_minimal_valid_request(self):
|
||||||
|
"""Only required fields — question, answer, contexts."""
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="What is CT?",
|
||||||
|
answer="CT is imaging.",
|
||||||
|
contexts="CT uses X-rays.",
|
||||||
|
)
|
||||||
|
assert req.question == "What is CT?"
|
||||||
|
assert req.contexts == "CT uses X-rays."
|
||||||
|
assert req.ground_truth is None
|
||||||
|
assert req.context_separator == " |||| "
|
||||||
|
assert req.metrics == ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]
|
||||||
|
|
||||||
|
def test_contexts_split_by_separator(self):
|
||||||
|
"""contexts_as_list() splits on context_separator."""
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a",
|
||||||
|
contexts="ctx1 |||| ctx2 |||| ctx3",
|
||||||
|
context_separator=" |||| ",
|
||||||
|
)
|
||||||
|
assert req.contexts_as_list() == ["ctx1", "ctx2", "ctx3"]
|
||||||
|
|
||||||
|
def test_contexts_split_custom_separator(self):
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a",
|
||||||
|
contexts="a---b---c",
|
||||||
|
context_separator="---",
|
||||||
|
)
|
||||||
|
assert req.contexts_as_list() == ["a", "b", "c"]
|
||||||
|
|
||||||
|
def test_contexts_split_single_item(self):
|
||||||
|
req = ScoreRequest(question="q", answer="a", contexts="only one")
|
||||||
|
assert req.contexts_as_list() == ["only one"]
|
||||||
|
|
||||||
|
def test_missing_question_raises(self):
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScoreRequest(answer="a", contexts="c") # type: ignore[call-arg]
|
||||||
|
|
||||||
|
def test_missing_answer_raises(self):
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScoreRequest(question="q", contexts="c") # type: ignore[call-arg]
|
||||||
|
|
||||||
|
def test_missing_contexts_raises(self):
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScoreRequest(question="q", answer="a") # type: ignore[call-arg]
|
||||||
|
|
||||||
|
def test_custom_metrics_accepted(self):
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a", contexts="c",
|
||||||
|
metrics=["faithfulness"],
|
||||||
|
)
|
||||||
|
assert req.metrics == ["faithfulness"]
|
||||||
|
|
||||||
|
def test_invalid_metric_name_raises(self):
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScoreRequest(question="q", answer="a", contexts="c", metrics=["not_a_metric"])
|
||||||
|
|
||||||
|
def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
|
||||||
|
"""Without ground_truth, GT-dependent metrics are excluded."""
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a", contexts="c",
|
||||||
|
metrics=["faithfulness", "context_recall", "factual_correctness", "semantic_similarity", "noise_sensitivity"],
|
||||||
|
)
|
||||||
|
effective = req.effective_metrics()
|
||||||
|
assert "faithfulness" in effective
|
||||||
|
assert "context_recall" not in effective
|
||||||
|
assert "factual_correctness" not in effective
|
||||||
|
assert "semantic_similarity" not in effective
|
||||||
|
assert "noise_sensitivity" not in effective
|
||||||
|
|
||||||
|
def test_effective_metrics_keeps_all_when_ground_truth_present(self):
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a", contexts="c", ground_truth="gt",
|
||||||
|
metrics=["faithfulness", "context_recall", "factual_correctness"],
|
||||||
|
)
|
||||||
|
effective = req.effective_metrics()
|
||||||
|
assert effective == ["faithfulness", "context_recall", "factual_correctness"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreResponse:
|
||||||
|
def test_score_response_structure(self):
|
||||||
|
resp = ScoreResponse(
|
||||||
|
scores={"faithfulness": 0.85, "answer_relevancy": None},
|
||||||
|
weighted_score=0.85,
|
||||||
|
latency_ms=1200,
|
||||||
|
)
|
||||||
|
assert resp.scores["faithfulness"] == 0.85
|
||||||
|
assert resp.scores["answer_relevancy"] is None
|
||||||
|
assert resp.latency_ms == 1200
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
cd <项目根目录>    # 即包含 pyproject.toml 的 siemens_ragas 仓库根目录
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
|
||||||
|
```
|
||||||
|
Expected: `ImportError: cannot import name 'ScoreRequest' from 'webapp.models'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: Add ScoreRequest and ScoreResponse to `webapp/models.py`**
|
||||||
|
|
||||||
|
Append to the end of `webapp/models.py` (after `PipelineJobResponse`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Dify 实时评分 API 模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# 需要 ground_truth 才能计算的指标集合
|
||||||
|
_GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
|
||||||
|
"context_recall",
|
||||||
|
"factual_correctness",
|
||||||
|
"semantic_similarity",
|
||||||
|
"noise_sensitivity",
|
||||||
|
})
|
||||||
|
|
||||||
|
# 所有合法指标名称
|
||||||
|
_VALID_METRICS: frozenset[str] = frozenset({
|
||||||
|
"faithfulness",
|
||||||
|
"answer_relevancy",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
"noise_sensitivity",
|
||||||
|
"factual_correctness",
|
||||||
|
"semantic_similarity",
|
||||||
|
})
|
||||||
|
|
||||||
|
_DEFAULT_SCORE_METRICS: list[str] = [
|
||||||
|
"faithfulness",
|
||||||
|
"answer_relevancy",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreRequest(BaseModel):
    """Request body for the real-time single-sample scoring endpoint.

    ``contexts`` arrives as one string and is split on ``context_separator``
    by :meth:`contexts_as_list`. ``ground_truth`` is optional; when it is
    missing or blank, :meth:`effective_metrics` drops the metrics listed in
    ``_GT_DEPENDENT_METRICS``.
    """

    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "summary": "基础评分请求",
                    "value": {
                        "question": "双源CT的时间分辨率是多少?",
                        "answer": "双源CT的单扇区时间分辨率为75ms。",
                        "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
                        "ground_truth": "双源CT单扇区时间分辨率为75ms,需旋转135度。",
                        "context_separator": " |||| ",
                        "metrics": ["faithfulness", "answer_relevancy", "context_recall", "context_precision"],
                        "judge_model": "deepseek-v4-flash",
                        "embedding_model": "text-embedding-v3",
                    },
                }
            ]
        }
    )

    question: str = Field(description="问题文本。")
    answer: str = Field(description="待评分的回答。")
    contexts: str = Field(
        description="检索上下文字符串,多段之间用 context_separator 拼接。"
    )
    ground_truth: str | None = Field(
        default=None,
        description="标准参考答案(可选)。缺失时自动跳过需要它的指标。",
    )
    context_separator: str = Field(
        default=" |||| ",
        description="contexts 字段中段落分隔符,默认为四个竖线两侧各一空格。",
    )
    metrics: list[str] = Field(
        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
        description="需要计算的 RAGAS 指标列表。",
    )
    judge_model: str | None = Field(
        default=None,
        description="Judge LLM 模型名称;为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
    )
    embedding_model: str | None = Field(
        default=None,
        description="Embedding 模型名称;为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
    )

    @field_validator("metrics")
    @classmethod
    def validate_metric_names(cls, value: list[str]) -> list[str]:
        """Reject unknown metric names and empty metric lists.

        Raises:
            ValueError: if any name is not in ``_VALID_METRICS`` or the list is
                empty (surfaced by Pydantic as a ``ValidationError``).
        """
        invalid = [m for m in value if m not in _VALID_METRICS]
        if invalid:
            raise ValueError(
                f"不支持的指标名称:{invalid}。"
                f"合法值:{sorted(_VALID_METRICS)}"
            )
        if not value:
            raise ValueError("metrics 不能为空列表。")
        return value

    def contexts_as_list(self) -> list[str]:
        """Split ``contexts`` on the separator, dropping blank fragments.

        Each fragment is stripped of surrounding whitespace. An empty
        ``context_separator`` falls back to the default ``" |||| "`` because
        ``str.split("")`` would raise.
        """
        sep = self.context_separator or " |||| "
        return [s.strip() for s in self.contexts.split(sep) if s.strip()]

    def effective_metrics(self) -> list[str]:
        """Return the requested metrics that can actually be computed.

        When ``ground_truth`` is absent, metrics in ``_GT_DEPENDENT_METRICS``
        are removed. A blank or whitespace-only ``ground_truth`` counts as
        absent: the scorer represents a missing reference as an empty string,
        so scoring against ``""`` would yield meaningless reference-based
        results instead of a skipped metric.
        """
        has_ground_truth = self.ground_truth is not None and self.ground_truth.strip() != ""
        if has_ground_truth:
            return list(self.metrics)
        return [m for m in self.metrics if m not in _GT_DEPENDENT_METRICS]
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreResponse(BaseModel):
|
||||||
|
"""Response payload for the real-time scoring endpoint."""
|
||||||
|
|
||||||
|
scores: dict[str, float | None] = Field(
|
||||||
|
description="各指标得分(NaN 或计算失败时为 null)。"
|
||||||
|
)
|
||||||
|
weighted_score: float | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="等权加权综合得分(仅对非 null 指标求均值)。",
|
||||||
|
)
|
||||||
|
latency_ms: int = Field(description="服务端打分耗时(毫秒)。")
|
||||||
|
skipped_metrics: list[str] = Field(
|
||||||
|
default_factory=list,
|
||||||
|
description="因缺少 ground_truth 而跳过的指标名称列表。",
|
||||||
|
)
|
||||||
|
error: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="打分异常时的错误信息(HTTP 200 仍返回,scores 为空)。",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Also add `field_validator` to the import line at the top of `webapp/models.py`:
|
||||||
|
```python
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Add `score_api_token` to `rag_eval/settings.py`**
|
||||||
|
|
||||||
|
Add after the `dataset_generator_model` field:
|
||||||
|
```python
|
||||||
|
score_api_token: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
alias="SCORE_API_TOKEN",
|
||||||
|
description="Bearer token for /api/score endpoint. Empty = no auth.",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Run to verify PASS**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
|
||||||
|
```
|
||||||
|
Expected: all 12 tests PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 6: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/models.py rag_eval/settings.py tests/webapp/test_score_api.py
|
||||||
|
git commit -m "feat: add ScoreRequest/ScoreResponse models and SCORE_API_TOKEN setting"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 2: InlineScorer 服务(LLM 缓存 + 打分)
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `webapp/services/inline_scorer.py`
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Consumes:
|
||||||
|
- `build_models(judge_model, embedding_model, settings) -> tuple[Any, Any]` from `rag_eval.metrics.factory`
|
||||||
|
- `MetricPipeline(metrics, metric_timeout_seconds)` from `rag_eval.metrics.pipeline`
|
||||||
|
- `NormalizedSample` from `rag_eval.shared.models`
|
||||||
|
- `compute_weighted_score(scores, metric_weights) -> float | None` from `rag_eval.metrics.weights`
|
||||||
|
- `EvaluationSettings` from `rag_eval.settings`
|
||||||
|
- Produces:
|
||||||
|
- `inline_scorer: InlineScorer` (module-level singleton)
|
||||||
|
- `InlineScorer.score(question, answer, contexts, ground_truth, metrics, judge_model, embedding_model, settings) -> dict[str, float | None]`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing test**
|
||||||
|
|
||||||
|
Add to `tests/webapp/test_score_api.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class TestInlineScorer:
|
||||||
|
def test_score_returns_dict_with_requested_metrics(self):
|
||||||
|
"""InlineScorer.score returns a dict keyed by the requested metrics."""
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
from webapp.services.inline_scorer import InlineScorer
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
mock_score = MagicMock()
|
||||||
|
mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8}
|
||||||
|
mock_score.error = ""
|
||||||
|
|
||||||
|
mock_pipeline = MagicMock()
|
||||||
|
mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
|
||||||
|
|
||||||
|
with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
|
||||||
|
with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
|
||||||
|
with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
|
||||||
|
scorer = InlineScorer()
|
||||||
|
result = scorer.score(
|
||||||
|
question="q", answer="a",
|
||||||
|
contexts=["ctx1"],
|
||||||
|
ground_truth=None,
|
||||||
|
metrics=["faithfulness", "answer_relevancy"],
|
||||||
|
judge_model="test-model",
|
||||||
|
embedding_model="test-embed",
|
||||||
|
settings=EvaluationSettings(_env_file=None),
|
||||||
|
)
|
||||||
|
assert "faithfulness" in result
|
||||||
|
assert "answer_relevancy" in result
|
||||||
|
assert result["faithfulness"] == pytest.approx(0.9)
|
||||||
|
|
||||||
|
def test_score_converts_nan_to_none(self):
|
||||||
|
"""NaN scores are converted to None in the returned dict."""
|
||||||
|
import math
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
from webapp.services.inline_scorer import InlineScorer
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
mock_score = MagicMock()
|
||||||
|
mock_score.metrics = {"faithfulness": float("nan")}
|
||||||
|
mock_score.error = ""
|
||||||
|
|
||||||
|
mock_pipeline = MagicMock()
|
||||||
|
mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
|
||||||
|
|
||||||
|
with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
|
||||||
|
with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
|
||||||
|
with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
|
||||||
|
scorer = InlineScorer()
|
||||||
|
result = scorer.score(
|
||||||
|
question="q", answer="a", contexts=["c"],
|
||||||
|
ground_truth=None,
|
||||||
|
metrics=["faithfulness"],
|
||||||
|
judge_model="m", embedding_model="e",
|
||||||
|
settings=EvaluationSettings(_env_file=None),
|
||||||
|
)
|
||||||
|
assert result["faithfulness"] is None
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
|
||||||
|
```
|
||||||
|
Expected: `ModuleNotFoundError: No module named 'webapp.services.inline_scorer'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: Create `webapp/services/inline_scorer.py`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""LLM-cached inline RAGAS scorer for the real-time /api/score endpoint.
|
||||||
|
|
||||||
|
A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by
|
||||||
|
(judge_model, embedding_model), so repeated Dify Tool calls with the same
|
||||||
|
models reuse existing AsyncOpenAI connections instead of creating new ones.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rag_eval.compat import ensure_ragas_import_compat
|
||||||
|
from rag_eval.metrics.factory import build_models
|
||||||
|
from rag_eval.metrics.pipeline import MetricPipeline
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from rag_eval.shared.models import NormalizedSample
|
||||||
|
|
||||||
|
ensure_ragas_import_compat()
|
||||||
|
|
||||||
|
from ragas.metrics.collections import ( # noqa: E402
|
||||||
|
AnswerRelevancy,
|
||||||
|
ContextPrecision,
|
||||||
|
ContextRecall,
|
||||||
|
FactualCorrectness,
|
||||||
|
Faithfulness,
|
||||||
|
NoiseSensitivity,
|
||||||
|
SemanticSimilarity,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]:
|
||||||
|
"""Instantiate only the RAGAS metric objects requested."""
|
||||||
|
registry: dict[str, Any] = {
|
||||||
|
"faithfulness": Faithfulness(llm=llm),
|
||||||
|
"answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
|
||||||
|
"context_recall": ContextRecall(llm=llm),
|
||||||
|
"context_precision": ContextPrecision(llm=llm),
|
||||||
|
"noise_sensitivity": NoiseSensitivity(llm=llm),
|
||||||
|
"factual_correctness": FactualCorrectness(llm=llm),
|
||||||
|
"semantic_similarity": SemanticSimilarity(embeddings=embeddings),
|
||||||
|
}
|
||||||
|
return {name: registry[name] for name in metrics if name in registry}
|
||||||
|
|
||||||
|
|
||||||
|
class InlineScorer:
    """Single-sample RAGAS scorer that caches LLM/embedding clients.

    Clients are cached per ``(judge_model, embedding_model)`` pair and the
    cache is guarded by a lock. All scoring coroutines run on one long-lived
    background event loop owned by this object.
    """

    def __init__(self) -> None:
        # Cache keyed by (judge_model, embedding_model) -> (llm, embeddings)
        self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
        self._lock = threading.Lock()
        # Created lazily by _get_loop() on the first score() call.
        self._loop: asyncio.AbstractEventLoop | None = None

    def _get_models(
        self,
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> tuple[Any, Any]:
        """Return cached LLM/embedding clients, building them on first use."""
        cache_key = (judge_model, embedding_model)
        with self._lock:
            if cache_key not in self._model_cache:
                llm, embeddings = build_models(judge_model, embedding_model, settings)
                self._model_cache[cache_key] = (llm, embeddings)
            return self._model_cache[cache_key]

    def _get_loop(self) -> asyncio.AbstractEventLoop:
        """Return the shared background event loop, starting it on first use.

        The loop runs forever in a daemon thread, so it does not keep the
        interpreter alive at shutdown.
        """
        with self._lock:
            if self._loop is None or self._loop.is_closed():
                loop = asyncio.new_event_loop()
                threading.Thread(
                    target=loop.run_forever,
                    name="inline-scorer-loop",
                    daemon=True,
                ).start()
                self._loop = loop
            return self._loop

    @staticmethod
    def _clean_score(value: Any) -> float | None:
        """Map a raw metric value to a JSON-safe float rounded to 4 places.

        ``None``, non-numeric values, NaN and +/-infinity all become ``None``.
        """
        if value is None:
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        if not math.isfinite(number):
            return None
        return round(number, 4)

    def score(
        self,
        question: str,
        answer: str,
        contexts: list[str],
        ground_truth: str | None,
        metrics: list[str],
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> dict[str, float | None]:
        """Score one sample synchronously and return {metric_name: score | None}.

        The call blocks until the pipeline finishes. Values that cannot be
        serialized as JSON numbers (NaN, infinity, missing) are returned as None.
        """
        llm, embeddings = self._get_models(judge_model, embedding_model, settings)
        metric_instances = _build_metric_instances(metrics, llm, embeddings)

        pipeline = MetricPipeline(
            metrics=metric_instances,
            metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
        )

        sample = NormalizedSample(
            sample_id="inline-score",
            question=question,
            answer=answer,
            contexts=contexts,
            # A missing reference is passed downstream as an empty string.
            ground_truth=ground_truth or "",
        )

        # Run on the persistent loop instead of asyncio.run(): asyncio.run()
        # creates and closes a fresh loop per call, while the cached clients
        # are meant to be reused across calls, and it also raises when invoked
        # from a thread that already has a running loop.
        # NOTE(review): assumes build_models returns async clients that bind to
        # the loop they first run on — confirm against rag_eval.metrics.factory.
        future = asyncio.run_coroutine_threadsafe(
            pipeline.score_sample(sample), self._get_loop()
        )
        metric_score = future.result()

        return {
            name: self._clean_score(value)
            for name, value in metric_score.metrics.items()
        }
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
|
||||||
|
inline_scorer = InlineScorer()
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run to verify PASS**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
|
||||||
|
```
|
||||||
|
Expected: both tests PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/services/inline_scorer.py tests/webapp/test_score_api.py
|
||||||
|
git commit -m "feat: add InlineScorer service with LLM client caching"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 3: `/api/score` 路由 + 鉴权 + 集成测试
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `webapp/api/score.py`
|
||||||
|
- Modify: `webapp/server.py`
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Consumes:
|
||||||
|
- `ScoreRequest`, `ScoreResponse` from `webapp.models`
|
||||||
|
- `inline_scorer: InlineScorer` from `webapp.services.inline_scorer`
|
||||||
|
- `EvaluationSettings` from `rag_eval.settings`
|
||||||
|
- `compute_weighted_score(scores, {}) -> float | None` from `rag_eval.metrics.weights`
|
||||||
|
- Produces: `POST /api/score` endpoint
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing endpoint tests**
|
||||||
|
|
||||||
|
Add to `tests/webapp/test_score_api.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ── Fixtures ─────────────────────────────────────────────────────────────────
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def client(monkeypatch):
|
||||||
|
"""TestClient with mocked InlineScorer."""
|
||||||
|
import webapp.api.score as score_mod
|
||||||
|
|
||||||
|
mock_scorer = MagicMock()
|
||||||
|
mock_scorer.score.return_value = {
|
||||||
|
"faithfulness": 0.85,
|
||||||
|
"answer_relevancy": 0.90,
|
||||||
|
}
|
||||||
|
monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
|
||||||
|
|
||||||
|
from webapp.server import create_app
|
||||||
|
return TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreEndpoint:
|
||||||
|
def test_post_score_returns_200(self, client):
|
||||||
|
resp = client.post("/api/score", json={
|
||||||
|
"question": "What is CT?",
|
||||||
|
"answer": "CT is imaging.",
|
||||||
|
"contexts": "CT uses X-rays.",
|
||||||
|
})
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert "scores" in data
|
||||||
|
assert "latency_ms" in data
|
||||||
|
assert data["scores"]["faithfulness"] == pytest.approx(0.85)
|
||||||
|
|
||||||
|
def test_weighted_score_computed(self, client):
|
||||||
|
resp = client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a", "contexts": "c",
|
||||||
|
})
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
# weighted_score is the mean of all non-null scores
|
||||||
|
assert data["weighted_score"] is not None
|
||||||
|
|
||||||
|
def test_missing_required_fields_returns_422(self, client):
|
||||||
|
resp = client.post("/api/score", json={"question": "q"})
|
||||||
|
assert resp.status_code == 422
|
||||||
|
|
||||||
|
def test_invalid_metric_name_returns_422(self, client):
|
||||||
|
resp = client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a", "contexts": "c",
|
||||||
|
"metrics": ["not_a_metric"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 422
|
||||||
|
|
||||||
|
def test_skipped_metrics_returned_when_no_ground_truth(self, client):
|
||||||
|
resp = client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a", "contexts": "c",
|
||||||
|
"metrics": ["faithfulness", "context_recall"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert "context_recall" in data["skipped_metrics"]
|
||||||
|
|
||||||
|
def test_contexts_split_on_separator(self, client, monkeypatch):
|
||||||
|
"""contexts string is split before passing to scorer."""
|
||||||
|
import webapp.api.score as score_mod
|
||||||
|
calls = []
|
||||||
|
def capture(*args, **kwargs):
|
||||||
|
calls.append(kwargs.get("contexts", []))
|
||||||
|
return {"faithfulness": 0.9}
|
||||||
|
monkeypatch.setattr(score_mod.inline_scorer, "score", capture)
|
||||||
|
|
||||||
|
client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a",
|
||||||
|
"contexts": "ctx1 |||| ctx2",
|
||||||
|
"context_separator": " |||| ",
|
||||||
|
})
|
||||||
|
assert calls[0] == ["ctx1", "ctx2"]
|
||||||
|
|
||||||
|
def test_bearer_token_auth_required_when_configured(self, monkeypatch):
|
||||||
|
"""When SCORE_API_TOKEN is set, requests without token get 401."""
|
||||||
|
import webapp.api.score as score_mod
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
mock_settings = EvaluationSettings(_env_file=None)
|
||||||
|
object.__setattr__(mock_settings, "score_api_token", "secret-token")
|
||||||
|
monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
|
||||||
|
|
||||||
|
mock_scorer = MagicMock()
|
||||||
|
mock_scorer.score.return_value = {"faithfulness": 0.9}
|
||||||
|
monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
|
||||||
|
|
||||||
|
from webapp.server import create_app
|
||||||
|
test_client = TestClient(create_app())
|
||||||
|
|
||||||
|
# No auth header → 401
|
||||||
|
resp = test_client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a", "contexts": "c",
|
||||||
|
})
|
||||||
|
assert resp.status_code == 401
|
||||||
|
|
||||||
|
# Correct token → 200
|
||||||
|
resp = test_client.post("/api/score",
|
||||||
|
json={"question": "q", "answer": "a", "contexts": "c"},
|
||||||
|
headers={"Authorization": "Bearer secret-token"},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
|
||||||
|
def test_wrong_bearer_token_returns_401(self, monkeypatch):
|
||||||
|
import webapp.api.score as score_mod
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
mock_settings = EvaluationSettings(_env_file=None)
|
||||||
|
object.__setattr__(mock_settings, "score_api_token", "correct-token")
|
||||||
|
monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
|
||||||
|
|
||||||
|
mock_scorer = MagicMock()
|
||||||
|
mock_scorer.score.return_value = {}
|
||||||
|
monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
|
||||||
|
|
||||||
|
from webapp.server import create_app
|
||||||
|
test_client = TestClient(create_app())
|
||||||
|
resp = test_client.post("/api/score",
|
||||||
|
json={"question": "q", "answer": "a", "contexts": "c"},
|
||||||
|
headers={"Authorization": "Bearer wrong-token"},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 401
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestScoreEndpoint -v
|
||||||
|
```
|
||||||
|
Expected: `ModuleNotFoundError: No module named 'webapp.api.score'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: Create `webapp/api/score.py`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Header, HTTPException
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from webapp.models import ScoreRequest, ScoreResponse
|
||||||
|
from webapp.services.inline_scorer import inline_scorer
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
|
||||||
|
|
||||||
|
def _get_settings() -> EvaluationSettings:
|
||||||
|
"""Return a fresh EvaluationSettings instance (overridable in tests)."""
|
||||||
|
return EvaluationSettings()
|
||||||
|
|
||||||
|
|
||||||
|
def _check_auth(authorization: str | None, token: str) -> None:
|
||||||
|
"""Raise 401 if Bearer token does not match the configured token."""
|
||||||
|
if authorization is None:
|
||||||
|
raise HTTPException(status_code=401, detail="Missing Authorization header.")
|
||||||
|
parts = authorization.split(" ", 1)
|
||||||
|
if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
|
||||||
|
raise HTTPException(status_code=401, detail="Invalid Bearer token.")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"",
|
||||||
|
response_model=ScoreResponse,
|
||||||
|
summary="单题实时评分(Dify 外部 Tool)",
|
||||||
|
responses={
|
||||||
|
200: {"description": "各指标得分和加权综合得分。"},
|
||||||
|
401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
|
||||||
|
422: {"description": "请求参数校验失败。"},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def score_sample(
|
||||||
|
request: ScoreRequest,
|
||||||
|
authorization: Annotated[str | None, Header()] = None,
|
||||||
|
) -> ScoreResponse:
|
||||||
|
"""接受单条问答记录,同步运行 RAGAS 指标打分,实时返回各指标得分。
|
||||||
|
|
||||||
|
供 Dify 外部 Tool 调用。将 `contexts` 字段按 `context_separator` 拆分后传入
|
||||||
|
RAGAS 管道;`ground_truth` 缺失时自动跳过依赖它的指标。
|
||||||
|
"""
|
||||||
|
settings = _get_settings()
|
||||||
|
|
||||||
|
# 鉴权(仅在配置了 token 时生效)
|
||||||
|
if settings.score_api_token:
|
||||||
|
_check_auth(authorization, settings.score_api_token)
|
||||||
|
|
||||||
|
judge_model = request.judge_model or settings.ragas_judge_model
|
||||||
|
embedding_model = request.embedding_model or settings.ragas_embedding_model
|
||||||
|
effective = request.effective_metrics()
|
||||||
|
requested = set(request.metrics)
|
||||||
|
skipped = sorted(requested - set(effective))
|
||||||
|
|
||||||
|
if not effective:
|
||||||
|
# All requested metrics require ground_truth which is absent.
|
||||||
|
return ScoreResponse(
|
||||||
|
scores={m: None for m in request.metrics},
|
||||||
|
weighted_score=None,
|
||||||
|
latency_ms=0,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
)
|
||||||
|
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
raw_scores = inline_scorer.score(
|
||||||
|
question=request.question,
|
||||||
|
answer=request.answer,
|
||||||
|
contexts=request.contexts_as_list(),
|
||||||
|
ground_truth=request.ground_truth,
|
||||||
|
metrics=effective,
|
||||||
|
judge_model=judge_model,
|
||||||
|
embedding_model=embedding_model,
|
||||||
|
settings=settings,
|
||||||
|
)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
return ScoreResponse(
|
||||||
|
scores={},
|
||||||
|
weighted_score=None,
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
error=f"{type(exc).__name__}: {exc}",
|
||||||
|
)
|
||||||
|
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
|
||||||
|
# Merge: skipped metrics appear as null in final scores dict.
|
||||||
|
all_scores: dict[str, float | None] = {m: None for m in request.metrics}
|
||||||
|
all_scores.update(raw_scores)
|
||||||
|
|
||||||
|
# Weighted score = equal-weight mean of non-null effective scores.
|
||||||
|
weighted = compute_weighted_score(
|
||||||
|
{k: v for k, v in raw_scores.items() if v is not None},
|
||||||
|
{},
|
||||||
|
)
|
||||||
|
|
||||||
|
return ScoreResponse(
|
||||||
|
scores=all_scores,
|
||||||
|
weighted_score=round(weighted, 4) if weighted is not None else None,
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Register router in `webapp/server.py`**
|
||||||
|
|
||||||
|
Add `score` to the import line:
|
||||||
|
```python
|
||||||
|
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
|
||||||
|
```
|
||||||
|
|
||||||
|
Add the router registration after `pipeline.router`:
|
||||||
|
```python
|
||||||
|
app.include_router(score.router)
|
||||||
|
```
|
||||||
|
|
||||||
|
Add `"score"` tag to `OPENAPI_TAGS` list (insert before `"meta"`):
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"name": "score",
|
||||||
|
"description": (
|
||||||
|
"**实时评分 API(Dify 外部 Tool)**\n\n"
|
||||||
|
"接受单条问答记录 `(question, answer, contexts, ground_truth)`,\n"
|
||||||
|
"同步运行 RAGAS 指标打分,返回各指标得分和加权综合得分。\n\n"
|
||||||
|
"适用场景:Dify Agent 在回答后即时调用,用于质量监控或自我改进。\n\n"
|
||||||
|
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
|
||||||
|
"`Authorization: Bearer <token>` 请求头。"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
```
|
||||||
|
|
||||||
|
Also update the `description` field in `FastAPI(...)` to add a bullet:
|
||||||
|
```python
|
||||||
|
"- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Run to verify PASS**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 6: Verify server boots and route appears**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -c "
|
||||||
|
from webapp.server import create_app
|
||||||
|
app = create_app()
|
||||||
|
routes = [(r.path, list(getattr(r,'methods',[]))) for r in app.routes]
|
||||||
|
score_routes = [(p,m) for p,m in routes if 'score' in p]
|
||||||
|
print('Score routes:', score_routes)
|
||||||
|
"
|
||||||
|
```
|
||||||
|
Expected output:
|
||||||
|
```
|
||||||
|
Score routes: [('/api/score', ['POST'])]
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 7: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/api/score.py webapp/server.py tests/webapp/test_score_api.py
|
||||||
|
git commit -m "feat: add POST /api/score endpoint for Dify real-time scoring"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 4: 全量回归 + `.env.example` 更新
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `.env.example`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add SCORE_API_TOKEN to `.env.example`**
|
||||||
|
|
||||||
|
Add this block after `DATASET_GENERATOR_MODEL=qwen3.6-plus`:
|
||||||
|
|
||||||
|
```
|
||||||
|
# ===== Dify 集成 — 实时评分 API =====
|
||||||
|
# 为 /api/score 端点设置 Bearer Token 鉴权(留空则不鉴权,适合内网部署)
|
||||||
|
# Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
|
||||||
|
SCORE_API_TOKEN=
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run full test suite**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/ -v --tb=short
|
||||||
|
```
|
||||||
|
|
||||||
|
Pre-existing failures to ignore:
|
||||||
|
- `test_normalize_sample_pdf_offline_smoke_row` — 缺少 CSV fixture
|
||||||
|
- `test_evaluator_and_reporting_write_run_assets` — 预存在的断言不匹配
|
||||||
|
- `test_question_generator_rejects_invalid_json` — retry 循环吞掉了 ValueError
|
||||||
|
- `test_question_generator_rejects_non_list_samples` — 同上
|
||||||
|
|
||||||
|
**零新增失败**即为通过。
|
||||||
|
|
||||||
|
- [ ] **Step 3: Final commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add .env.example
|
||||||
|
git commit -m "feat: Dify score API complete — add SCORE_API_TOKEN to .env.example
|
||||||
|
|
||||||
|
- POST /api/score: real-time RAGAS scoring for Dify external Tool
|
||||||
|
- ScoreRequest/ScoreResponse Pydantic models with full field docs
|
||||||
|
- InlineScorer with (judge_model, embedding_model) client cache
|
||||||
|
- Bearer token auth via SCORE_API_TOKEN env var (optional)
|
||||||
|
- contexts split by configurable separator (default ' |||| ')
|
||||||
|
- GT-dependent metrics auto-skipped when ground_truth absent
|
||||||
|
- Full test coverage (22 new tests)
|
||||||
|
|
||||||
|
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dify 侧配置参考
|
||||||
|
|
||||||
|
任务完成后,在 Dify 「工具」→「自定义工具」中填写如下 OpenAPI Schema:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
openapi: 3.1.0
|
||||||
|
info:
|
||||||
|
title: RAGAS 实时评分
|
||||||
|
version: 1.0.0
|
||||||
|
servers:
|
||||||
|
- url: http://<your-server>:8800
|
||||||
|
paths:
|
||||||
|
/api/score:
|
||||||
|
post:
|
||||||
|
operationId: scoreQA
|
||||||
|
summary: 对一条问答记录进行 RAGAS 评分
|
||||||
|
requestBody:
|
||||||
|
required: true
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
type: object
|
||||||
|
required: [question, answer, contexts]
|
||||||
|
properties:
|
||||||
|
question: { type: string }
|
||||||
|
answer: { type: string }
|
||||||
|
contexts: { type: string, description: "多段上下文用 ' |||| ' 拼接" }
|
||||||
|
ground_truth: { type: string }
|
||||||
|
metrics:
|
||||||
|
type: array
|
||||||
|
items: { type: string }
|
||||||
|
default: [faithfulness, answer_relevancy, context_recall, context_precision]
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: 评分结果
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
scores: { type: object }
|
||||||
|
weighted_score: { type: number }
|
||||||
|
latency_ms: { type: integer }
|
||||||
|
skipped_metrics: { type: array, items: { type: string } }
|
||||||
|
```
|
||||||
240
docs/superpowers/specs/2026-06-18-metric-doc-weights-design.md
Normal file
240
docs/superpowers/specs/2026-06-18-metric-doc-weights-design.md
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
# 指标权重 & 文档片段权重功能设计
|
||||||
|
|
||||||
|
**日期**: 2026-06-18
|
||||||
|
**状态**: 已批准,待实现
|
||||||
|
**范围**: 在「新建评估」运行评估时,支持为 RAGAS 指标和文档配置权重,计算加权综合得分并在报告中展示。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
1. **指标权重(Metric Weights)**:允许为每个 RAGAS 指标配置浮点权重(如 faithfulness: 0.35),计算每道题的加权综合得分 `weighted_score`。
|
||||||
|
2. **文档权重(Doc Weights)**:允许为特定 PDF 文档名称配置权重(如 `"322_双源CT.pdf": 2.0`),该文档的题目在汇总指标均值时按权重放大贡献。
|
||||||
|
3. **前端覆盖**:在「新建评估」页面选中场景后,展示可编辑的权重面板,运行前可临时覆盖 YAML 中的权重。
|
||||||
|
4. **完全向后兼容**:两个字段均为可选,省略时退化为等权行为,现有场景 YAML 无需修改。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 数据模型
|
||||||
|
|
||||||
|
### 2.1 场景 YAML(新增可选字段)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# 可选。缺省时所有指标权重 = 1.0
|
||||||
|
metric_weights:
|
||||||
|
faithfulness: 0.35
|
||||||
|
context_recall: 0.25
|
||||||
|
context_precision: 0.20
|
||||||
|
answer_relevancy: 0.20
|
||||||
|
|
||||||
|
# 可选。缺省时所有文档权重 = 1.0
|
||||||
|
doc_weights:
|
||||||
|
"322_双源CT成像技术.pdf": 2.0
|
||||||
|
"323_单源CT对比.pdf": 1.5
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.2 Pydantic Schema(`rag_eval/config/schema.py`)
|
||||||
|
|
||||||
|
`ScenarioModel` 新增:
|
||||||
|
```python
|
||||||
|
metric_weights: dict[str, float] = Field(default_factory=dict)
|
||||||
|
doc_weights: dict[str, float] = Field(default_factory=dict)
|
||||||
|
```
|
||||||
|
|
||||||
|
`ConfigDict(extra="ignore")` 不变,新字段不影响既有 YAML 的加载。
|
||||||
|
|
||||||
|
### 2.3 内部 Scenario dataclass(`rag_eval/shared/models.py`)
|
||||||
|
|
||||||
|
`Scenario` 新增:
|
||||||
|
```python
|
||||||
|
metric_weights: dict[str, float] = field(default_factory=dict)
|
||||||
|
doc_weights: dict[str, float] = field(default_factory=dict)
|
||||||
|
```
|
||||||
|
|
||||||
|
随 `scenario.snapshot()` 序列化,供 `run_reader` / 报告层读取。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 后端:权重计算逻辑
|
||||||
|
|
||||||
|
### 3.1 新模块 `rag_eval/metrics/weights.py`
|
||||||
|
|
||||||
|
纯函数模块,无外部依赖,独立可测:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float:
|
||||||
|
"""返回 key 对应的权重,缺失时返回 default。"""
|
||||||
|
|
||||||
|
def compute_weighted_score(
|
||||||
|
scores: dict[str, float | None],
|
||||||
|
metric_weights: dict[str, float],
|
||||||
|
) -> float | None:
|
||||||
|
"""
|
||||||
|
给定各指标得分和权重,返回加权综合得分。
|
||||||
|
- 忽略 NaN / None 值
|
||||||
|
- metric_weights 为空时退化为等权均值
|
||||||
|
- 全部 NaN 时返回 None
|
||||||
|
公式: Σ(w_i * s_i) / Σ(w_i),只对非 NaN 项求和
|
||||||
|
"""
|
||||||
|
|
||||||
|
def weighted_metric_means(
|
||||||
|
score_rows: list[dict],
|
||||||
|
metrics: list[str],
|
||||||
|
doc_weights: dict[str, float],
|
||||||
|
) -> dict[str, float | None]:
|
||||||
|
"""
|
||||||
|
对每个指标计算文档加权均值。
|
||||||
|
- sample_weight = doc_weights.get(row["doc_name"], 1.0)
|
||||||
|
- 公式: Σ(sample_weight_j * score_m_j) / Σ(sample_weight_j)
|
||||||
|
- doc_weights 为空时退化为普通算术均值
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.2 评估器(`rag_eval/execution/evaluator.py`)
|
||||||
|
|
||||||
|
`_merge_score()` 新增两列:
|
||||||
|
```python
|
||||||
|
record["weighted_score"] = compute_weighted_score(
|
||||||
|
score.metrics, self.scenario.metric_weights
|
||||||
|
)
|
||||||
|
record["sample_weight"] = self.scenario.doc_weights.get(
|
||||||
|
sample.metadata.get("doc_name", ""), 1.0
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
`scores.csv` 新增 `weighted_score`、`sample_weight` 两列。
|
||||||
|
|
||||||
|
### 3.3 报告摘要(`rag_eval/reporting/summary.py`)
|
||||||
|
|
||||||
|
`build_summary_markdown()` 改用 `weighted_metric_means()` 计算各指标均值;
|
||||||
|
新增 `weighted_score` 整体均值行:
|
||||||
|
|
||||||
|
```
|
||||||
|
## Metric Means(加权)
|
||||||
|
- faithfulness: 0.8123 (w=0.35)
|
||||||
|
- context_recall: 0.7654 (w=0.25)
|
||||||
|
- context_precision: 0.7200 (w=0.20)
|
||||||
|
- answer_relevancy: 0.7400 (w=0.20)
|
||||||
|
- **weighted_score: 0.7789**
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. yaml_patcher 扩展(`webapp/services/yaml_patcher.py`)
|
||||||
|
|
||||||
|
`apply_profiles_to_scenario()` 扩展签名,新增可选参数:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def apply_profiles_to_scenario(
|
||||||
|
scenario_path: str,
|
||||||
|
judge_profile: LLMProfile | None,
|
||||||
|
answer_profile: LLMProfile | None,
|
||||||
|
dataset_profile: LLMProfile | None,
|
||||||
|
metric_weights: dict[str, float] | None = None, # 新增
|
||||||
|
doc_weights: dict[str, float] | None = None, # 新增
|
||||||
|
_resolve_absolute: bool = False,
|
||||||
|
) -> list[str]:
|
||||||
|
```
|
||||||
|
|
||||||
|
- `metric_weights` 非 None 时写入 `data["metric_weights"]`,追加 `"metric_weights"` 到 patched 列表
|
||||||
|
- `doc_weights` 非 None 时写入 `data["doc_weights"]`,追加 `"doc_weights"` 到 patched 列表
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Webapp 模型与 API 扩展
|
||||||
|
|
||||||
|
### 5.1 `webapp/models.py`
|
||||||
|
|
||||||
|
`ProfileApplyRequest` 新增:
|
||||||
|
```python
|
||||||
|
metric_weights: dict[str, float] | None = None
|
||||||
|
doc_weights: dict[str, float] | None = None
|
||||||
|
```
|
||||||
|
|
||||||
|
`ProfileApplyResponse` 不变(`patched_fields` 已包含新字段名)。
|
||||||
|
|
||||||
|
### 5.2 `webapp/api/llm_profiles.py` — `apply_profiles()`
|
||||||
|
|
||||||
|
透传 `metric_weights` / `doc_weights` 给 `apply_profiles_to_scenario()`。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. 前端:权重配置面板
|
||||||
|
|
||||||
|
### 6.1 HTML(`index.html`)
|
||||||
|
|
||||||
|
在 `#llm-assignment-panel` 下方新增 `#weight-config-panel`(选中场景后显示):
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────┐
|
||||||
|
│ 权重配置 (可选,留空使用场景原始配置) │
|
||||||
|
├─────────────────────────────────────────────┤
|
||||||
|
│ 指标权重 │
|
||||||
|
│ faithfulness [____1.0____] │
|
||||||
|
│ context_recall [____1.0____] │
|
||||||
|
│ ...(根据选中场景的 metrics 动态生成) │
|
||||||
|
│ │
|
||||||
|
│ 文档权重(doc_weights) │
|
||||||
|
│ [doc名称_______________] [权重__] [+] [✕] │
|
||||||
|
│ [doc名称_______________] [权重__] [+] [✕] │
|
||||||
|
│ + 添加文档权重规则 │
|
||||||
|
└─────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.2 `runner.js`
|
||||||
|
|
||||||
|
- `renderScenarioItem()` 选中后调用 `Runner._renderWeightPanel(sc)` 动态生成指标行
|
||||||
|
- `_applyProfilesIfNeeded()` 同时读取权重输入,追加到 `apply` 请求 body
|
||||||
|
- `Runner._collectWeights()` 收集 metric_weights / doc_weights,全部为 1.0 时不发送(跳过)
|
||||||
|
|
||||||
|
### 6.3 CSS(`app.css`)
|
||||||
|
|
||||||
|
新增 `.weight-config-panel`、`.weight-row`、`.weight-input` 样式,与现有 `.llm-role-row` 风格一致。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. 报告展示(`webapp/services/report_builder.py`)
|
||||||
|
|
||||||
|
- `RunSummary.metric_means` 改用 `weighted_metric_means()` 计算(需从 `scenario.snapshot.yaml` 读取 `doc_weights` / `metric_weights`)
|
||||||
|
- `RunSummary` 新增 `weighted_score_mean: float | None` 字段
|
||||||
|
- 前端 `report.js` 的指标卡片区新增「综合加权得分」卡片,使用 `good/warn/bad` 配色
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. 测试计划
|
||||||
|
|
||||||
|
| 测试文件 | 覆盖内容 |
|
||||||
|
|----------|---------|
|
||||||
|
| `tests/test_weights.py` | `compute_weighted_score` / `weighted_metric_means` 纯函数,含 NaN 边界、空权重、全 NaN |
|
||||||
|
| `tests/test_dataset_build.py` | 无改动(隔离良好) |
|
||||||
|
| `tests/test_offline_eval.py` | `_merge_score` 新增 weighted_score / sample_weight 列断言 |
|
||||||
|
| `tests/webapp/test_llm_profiles_api.py` | `apply_profiles` 带 metric_weights / doc_weights 的 patching 测试 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. 改动文件清单
|
||||||
|
|
||||||
|
| 文件 | 改动类型 |
|
||||||
|
|------|---------|
|
||||||
|
| `rag_eval/config/schema.py` | 新增字段 |
|
||||||
|
| `rag_eval/shared/models.py` | 新增字段 |
|
||||||
|
| `rag_eval/config/loader.py` | 透传新字段到 Scenario |
|
||||||
|
| `rag_eval/metrics/weights.py` | **新建** |
|
||||||
|
| `rag_eval/execution/evaluator.py` | `_merge_score` 新增两列 |
|
||||||
|
| `rag_eval/reporting/summary.py` | 改用加权均值 |
|
||||||
|
| `webapp/services/yaml_patcher.py` | 新增 metric_weights / doc_weights 参数 |
|
||||||
|
| `webapp/models.py` | ProfileApplyRequest 新增字段;RunSummary 新增 weighted_score_mean |
|
||||||
|
| `webapp/api/llm_profiles.py` | 透传新参数 |
|
||||||
|
| `webapp/services/report_builder.py` | 加权均值计算 |
|
||||||
|
| `webapp/static/index.html` | 新增权重配置面板 |
|
||||||
|
| `webapp/static/js/runner.js` | 权重面板逻辑 |
|
||||||
|
| `webapp/static/css/app.css` | 新增权重面板样式 |
|
||||||
|
| `tests/test_weights.py` | **新建** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. 向后兼容保证
|
||||||
|
|
||||||
|
- `metric_weights: {}` + `doc_weights: {}` → 所有权重 = 1.0,行为与当前完全一致
|
||||||
|
- 现有场景 YAML 不含这两个字段 → Pydantic `default_factory=dict` 填充空字典
|
||||||
|
- `scores.csv` 新增两列不影响现有报告读取(`run_reader` 只读已知列)
|
||||||
138
docs/superpowers/specs/2026-06-22-dify-score-api-design.md
Normal file
138
docs/superpowers/specs/2026-06-22-dify-score-api-design.md
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
# Dify 集成 — 单题实时评分 API 设计
|
||||||
|
|
||||||
|
**日期**: 2026-06-22
|
||||||
|
**状态**: 已批准,待实现
|
||||||
|
**范围**: 在现有 FastAPI 服务中新增 `POST /api/score` 端点,供 Dify 外部 Tool 调用,实现单条问答记录的实时 RAGAS 指标评分。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
让 Dify Agent 能在回答完问题后,将 `(question, answer, contexts, ground_truth)` 发给 siemens_ragas 服务,实时获取各 RAGAS 指标得分,用于质量监控或 Agent 自我改进。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. API 规范
|
||||||
|
|
||||||
|
### `POST /api/score`
|
||||||
|
|
||||||
|
**请求体:**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"question": "双源CT的时间分辨率是多少?",
|
||||||
|
"answer": "双源CT的单扇区时间分辨率为75ms。",
|
||||||
|
"contexts": "片段1:双源CT采用两套管-探测器系统... |||| 片段2:单扇区采集旋转135度...",
|
||||||
|
"ground_truth": "双源CT单扇区时间分辨率为75ms,需旋转135度。",
|
||||||
|
"context_separator": " |||| ",
|
||||||
|
"metrics": ["faithfulness", "answer_relevancy"],
|
||||||
|
"judge_model": "deepseek-v4-flash",
|
||||||
|
"embedding_model": "text-embedding-v3"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**字段说明:**
|
||||||
|
|
||||||
|
| 字段 | 类型 | 必填 | 说明 |
|
||||||
|
|------|------|------|------|
|
||||||
|
| `question` | str | ✅ | 问题文本 |
|
||||||
|
| `answer` | str | ✅ | 待评分的回答 |
|
||||||
|
| `contexts` | str | ✅ | 检索到的上下文,多段用 `context_separator` 拼接 |
|
||||||
|
| `ground_truth` | str | ❌ | 标准答案;缺失时跳过依赖它的指标(context_recall、factual_correctness、semantic_similarity、noise_sensitivity) |
|
||||||
|
| `context_separator` | str | ❌ | 默认 `" \|\|\|\| "`(四个竖线,两侧各一空格) |
|
||||||
|
| `metrics` | list[str] | ❌ | 默认 `["faithfulness", "answer_relevancy", "context_recall", "context_precision"]` |
|
||||||
|
| `judge_model` | str | ❌ | 默认读 `.env` 中 `RAGAS_JUDGE_MODEL` |
|
||||||
|
| `embedding_model` | str | ❌ | 默认读 `.env` 中 `RAGAS_EMBEDDING_MODEL` |
|
||||||
|
|
||||||
|
**响应体(200 OK):**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"scores": {
|
||||||
|
"faithfulness": 0.8750,
|
||||||
|
"answer_relevancy": 0.9200
|
||||||
|
},
|
||||||
|
"weighted_score": 0.8975,
|
||||||
|
"latency_ms": 3420
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**错误响应:**
|
||||||
|
|
||||||
|
| 状态码 | 场景 |
|
||||||
|
|--------|------|
|
||||||
|
| 422 | 必填字段缺失、metrics 名称不合法(Pydantic 校验失败) |
|
||||||
|
| 401 | 配置了 `SCORE_API_TOKEN` 但请求未携带有效 Bearer Token |
|
||||||
|
| 422 | 请求体 JSON 格式错误(Pydantic 校验) |
|
||||||
|
| 500 | RAGAS 内部评分异常,附带 error 字段 |
|
||||||
|
|
||||||
|
**鉴权(可选):**
|
||||||
|
若 `.env` 中 `SCORE_API_TOKEN` 非空,则要求请求头携带 `Authorization: Bearer <token>`。为空则不鉴权(内网部署场景)。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 架构与文件改动
|
||||||
|
|
||||||
|
### 新文件
|
||||||
|
|
||||||
|
| 文件 | 职责 |
|
||||||
|
|------|------|
|
||||||
|
| `webapp/api/score.py` | 路由定义,请求验证,调用 InlineScorer |
|
||||||
|
| `webapp/services/inline_scorer.py` | LLM 客户端缓存 + RAGAS 评分逻辑封装 |
|
||||||
|
|
||||||
|
### 修改文件
|
||||||
|
|
||||||
|
| 文件 | 改动 |
|
||||||
|
|------|------|
|
||||||
|
| `webapp/models.py` | 新增 `ScoreRequest`、`ScoreResponse` |
|
||||||
|
| `webapp/server.py` | 注册 `score.router`,更新 `openapi_tags` |
|
||||||
|
| `rag_eval/settings.py` | 新增 `score_api_token: str \| None` 字段 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. `inline_scorer.py` 设计
|
||||||
|
|
||||||
|
```python
|
||||||
|
class InlineScorer:
|
||||||
|
"""同步执行 RAGAS 单题评分,内部缓存 LLM 客户端。"""
|
||||||
|
|
||||||
|
def score(
|
||||||
|
self,
|
||||||
|
question: str,
|
||||||
|
answer: str,
|
||||||
|
contexts: list[str],
|
||||||
|
ground_truth: str | None,
|
||||||
|
metrics: list[str],
|
||||||
|
judge_model: str,
|
||||||
|
embedding_model: str,
|
||||||
|
settings: EvaluationSettings,
|
||||||
|
) -> dict[str, float | None]:
|
||||||
|
"""返回 {metric_name: score} 字典,NaN 记为 None。"""
|
||||||
|
```
|
||||||
|
|
||||||
|
**客户端缓存策略:**
|
||||||
|
以 `(judge_model, embedding_model)` 为 key,缓存 `(llm, embeddings)` 对象,避免每次请求都重建 AsyncOpenAI 连接。缓存为模块级单例(`_scorer_cache: dict`),线程安全(加 `threading.Lock`)。
|
||||||
|
|
||||||
|
**评分执行:**
|
||||||
|
复用 `build_metric_pipeline` 构建 `MetricPipeline`,然后 `asyncio.run(pipeline.score_sample(sample))` 执行。与现有 `evaluator.py` 模式一致。
|
||||||
|
|
||||||
|
**ground_truth 为空时的指标跳过逻辑:**
|
||||||
|
`context_recall`、`factual_correctness`、`semantic_similarity`、`noise_sensitivity` 需要 ground_truth;若请求中未提供,自动从 metrics 列表中移除这些指标,并在响应中对应字段返回 `null`。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Dify 侧配置方法
|
||||||
|
|
||||||
|
1. 在 Dify 「工具」→「自定义工具」中创建新工具
|
||||||
|
2. 填写 OpenAPI Schema(与 `/api/score` 端点对齐)
|
||||||
|
3. 鉴权方式:API Key(Bearer)或无鉴权
|
||||||
|
4. 在 Agent / Workflow 节点中引用该工具,将 `question`、`answer`、`contexts` 变量映射到工具输入
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. 不在范围内
|
||||||
|
|
||||||
|
- 批量评分接口(异步 job)
|
||||||
|
- Dify Workflow 节点插件(需要 Dify 插件开发框架)
|
||||||
|
- 评分结果持久化到 scores.csv
|
||||||
|
- 与现有 report_builder 集成展示
|
||||||
173
docs/superpowers/specs/2026-06-22-linux-deploy-design.md
Normal file
173
docs/superpowers/specs/2026-06-22-linux-deploy-design.md
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
# Linux 一键部署脚本设计
|
||||||
|
|
||||||
|
**日期**: 2026-06-22
|
||||||
|
**状态**: 已批准,待实现
|
||||||
|
**范围**: 为 siemens_ragas 项目提供 Linux 环境的部署与运维脚本(无 Docker,无 systemd)。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
提供四个 Bash 脚本,覆盖 Linux 服务器上的完整生命周期:
|
||||||
|
|
||||||
|
| 脚本 | 职责 |
|
||||||
|
|------|------|
|
||||||
|
| `deploy.sh` | 一键完成环境检查、依赖安装、配置初始化、启动服务 |
|
||||||
|
| `start.sh` | 仅启动 Web 服务(已部署后复用,不重装依赖) |
|
||||||
|
| `stop.sh` | 停止后台 Web 服务 |
|
||||||
|
| `run_eval.sh` | 运行单次评估(对应 Windows 的 `run_eval.ps1`) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 约束与假设
|
||||||
|
|
||||||
|
- Linux 目标环境有 PyPI 网络访问(pip 可直接安装)
|
||||||
|
- 代码已通过 `git clone` 或文件拷贝到服务器
|
||||||
|
- 使用 `pip + venv`(不使用 uv)
|
||||||
|
- Web 服务监听 `0.0.0.0:8800`(内网可达)
|
||||||
|
- 后台运行使用 `nohup`,PID 写入 `.server.pid`,日志追加到 `logs/server.log`
|
||||||
|
- 所有脚本均放在仓库根目录,路径相对于 `$SCRIPT_DIR`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. `deploy.sh` 详细设计
|
||||||
|
|
||||||
|
### 3.1 阶段 1:Python 版本检查
|
||||||
|
|
||||||
|
```
|
||||||
|
require Python >= 3.12
|
||||||
|
```
|
||||||
|
|
||||||
|
- `python3 --version` 解析 major.minor
|
||||||
|
- 不满足则打印错误并 `exit 1`
|
||||||
|
- 满足则打印 `[OK] Python X.Y.Z`
|
||||||
|
|
||||||
|
### 3.2 阶段 2:虚拟环境
|
||||||
|
|
||||||
|
- 目标路径:`$SCRIPT_DIR/.venv`
|
||||||
|
- 已存在则跳过创建(打印 `[OK] .venv already exists`)
|
||||||
|
- 不存在则 `python3 -m venv .venv`
|
||||||
|
|
||||||
|
### 3.3 阶段 3:依赖安装
|
||||||
|
|
||||||
|
```bash
|
||||||
|
.venv/bin/pip install --upgrade pip -q
|
||||||
|
.venv/bin/pip install -e . -q # 安装 pyproject.toml 中的依赖
|
||||||
|
.venv/bin/pip install fastapi uvicorn httpx -q # Web 服务额外依赖
|
||||||
|
```
|
||||||
|
|
||||||
|
- 失败则打印错误并 `exit 1`
|
||||||
|
- `fastapi`、`uvicorn`、`httpx` 未在 `pyproject.toml` 中列出,需单独安装
|
||||||
|
|
||||||
|
### 3.4 阶段 4:配置文件
|
||||||
|
|
||||||
|
- 若 `.env` 不存在:`cp .env.example .env`,打印警告提示用户编辑后再启动
|
||||||
|
- 若 `.env` 已存在:跳过,打印 `[OK] .env found`
|
||||||
|
|
||||||
|
### 3.5 阶段 5:目录初始化
|
||||||
|
|
||||||
|
创建以下目录(`mkdir -p`,幂等):
|
||||||
|
- `configs/` — LLM Profile 持久化存储
|
||||||
|
- `logs/` — 评估日志 + 服务器日志
|
||||||
|
- `outputs/` — 评估运行产物
|
||||||
|
- `datasets/` — 原始数据集
|
||||||
|
|
||||||
|
### 3.6 阶段 6:Demo 数据
|
||||||
|
|
||||||
|
- 检查 `outputs/kba-knowledge-base-offline-baseline/` 是否存在
|
||||||
|
- 不存在则运行 `.venv/bin/python scripts/seed_sample_run.py`
|
||||||
|
- 失败时打印 `[WARN]`(非致命,报告页为空但服务可启动)
|
||||||
|
|
||||||
|
### 3.7 阶段 7:端口检测
|
||||||
|
|
||||||
|
- 默认端口 `8800`
|
||||||
|
- 用 `ss -tlnp` 或 `netstat -tlnp` 检查是否占用
|
||||||
|
- 占用则尝试 `8801`,仍占用则报错退出
|
||||||
|
|
||||||
|
### 3.8 阶段 8:启动服务
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nohup .venv/bin/python webmain.py \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port $PORT \
|
||||||
|
>> logs/server.log 2>&1 &
|
||||||
|
echo $! > .server.pid
|
||||||
|
```
|
||||||
|
|
||||||
|
- 等待 2 秒后用 `kill -0 $PID` 检测进程是否存活
|
||||||
|
- 存活则打印 URL 和 stop 方法
|
||||||
|
- 未存活则打印 `[ERROR] Server failed to start. Check logs/server.log.` 并 `exit 1`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. `start.sh` 详细设计
|
||||||
|
|
||||||
|
单独负责启动,不做任何环境初始化。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# 检查 .venv 存在
|
||||||
|
# 端口检测(同 deploy.sh 逻辑)
|
||||||
|
# 检查 .env 存在(不存在则 warn 但不阻止)
|
||||||
|
# nohup 启动 + PID 文件 + 存活验证
|
||||||
|
# 打印 URL
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. `stop.sh` 详细设计
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# 读取 .server.pid
|
||||||
|
# 若文件不存在:打印 "No server PID file found." 退出
|
||||||
|
# kill $PID
|
||||||
|
# 等待 2 秒,若进程仍存活用 kill -9
|
||||||
|
# 删除 .server.pid
|
||||||
|
# 打印 "Server stopped."
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. `run_eval.sh` 详细设计
|
||||||
|
|
||||||
|
对应 Windows 的 `run_eval.ps1`。
|
||||||
|
|
||||||
|
```
|
||||||
|
用法:
|
||||||
|
./run_eval.sh # online eval (默认)
|
||||||
|
./run_eval.sh offline # offline smoke
|
||||||
|
./run_eval.sh scenarios/xxx.yaml # 自定义场景
|
||||||
|
./run_eval.sh online DEBUG # 自定义日志级别
|
||||||
|
```
|
||||||
|
|
||||||
|
- 参数 1(Scenario):`online` / `offline` / 文件路径,默认 `online`
|
||||||
|
- 参数 2(LogLevel):`DEBUG` / `INFO` / `WARNING` / `ERROR`,默认 `INFO`
|
||||||
|
- 场景别名映射:
|
||||||
|
- `online` → `scenarios/online/siemens-pdf-question-bank-online.yaml`
|
||||||
|
- `offline` → `scenarios/offline/siemens-pdf-offline-smoke.yaml`
|
||||||
|
- 时间戳日志文件:`logs/eval_$(date +%Y-%m-%d_%H%M%S).log`
|
||||||
|
- 环境变量:`PYTHONIOENCODING=utf-8 PYTHONPATH=.`
|
||||||
|
- 调用:`.venv/bin/python main.py --scenario $SCENARIO --log-file $LOG_FILE --log-level $LOG_LEVEL`
|
||||||
|
- 非零退出码时打印错误并 `exit 1`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. 通用约定
|
||||||
|
|
||||||
|
- 所有脚本首行:`#!/usr/bin/env bash`
|
||||||
|
- `set -euo pipefail` — 错误立即退出,未定义变量报错,管道错误传播
|
||||||
|
- `SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"` — 从任意目录执行均正确
|
||||||
|
- `cd "$SCRIPT_DIR"` — 切换到仓库根目录
|
||||||
|
- 颜色输出:绿色 `[OK]`、黄色 `[WARN]`、红色 `[ERROR]`(检测 tty,非交互式终端降级为无色)
|
||||||
|
- 执行权限:脚本自身需要 `chmod +x`(在 deploy.sh 内对其他脚本自动 chmod)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. 不在范围内
|
||||||
|
|
||||||
|
- Docker / docker-compose 支持
|
||||||
|
- systemd service 配置
|
||||||
|
- Nginx 反向代理配置
|
||||||
|
- SSL/TLS 配置
|
||||||
|
- 离线/内网镜像源配置
|
||||||
@@ -62,6 +62,8 @@ def load_scenario(path: str | Path) -> Scenario:
|
|||||||
),
|
),
|
||||||
source_path=scenario_path,
|
source_path=scenario_path,
|
||||||
optimization_advisor=model.optimization_advisor,
|
optimization_advisor=model.optimization_advisor,
|
||||||
|
metric_weights=dict(model.metric_weights),
|
||||||
|
doc_weights=dict(model.doc_weights),
|
||||||
)
|
)
|
||||||
# Run cross-field checks after all relative paths have been resolved.
|
# Run cross-field checks after all relative paths have been resolved.
|
||||||
validate_scenario(scenario)
|
validate_scenario(scenario)
|
||||||
|
|||||||
@@ -55,6 +55,8 @@ class ScenarioModel(BaseModel):
|
|||||||
output_dir: str
|
output_dir: str
|
||||||
runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
|
runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
|
||||||
optimization_advisor: bool = False
|
optimization_advisor: bool = False
|
||||||
|
metric_weights: dict[str, float] = Field(default_factory=dict)
|
||||||
|
doc_weights: dict[str, float] = Field(default_factory=dict)
|
||||||
|
|
||||||
@field_validator("metrics")
|
@field_validator("metrics")
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from rag_eval.datasets.loader import load_dataset_records
|
|||||||
from rag_eval.datasets.normalizers import normalize_records
|
from rag_eval.datasets.normalizers import normalize_records
|
||||||
from rag_eval.execution.concurrency import gather_with_limit
|
from rag_eval.execution.concurrency import gather_with_limit
|
||||||
from rag_eval.metrics.pipeline import MetricPipeline
|
from rag_eval.metrics.pipeline import MetricPipeline
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score, resolve_weight
|
||||||
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
|
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
|
||||||
from rag_eval.shared.utils import utc_now_iso
|
from rag_eval.shared.utils import utc_now_iso
|
||||||
|
|
||||||
@@ -171,7 +172,7 @@ class Evaluator:
|
|||||||
return valid, invalid
|
return valid, invalid
|
||||||
|
|
||||||
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
|
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
|
||||||
"""Combine sample data, metric results, and run metadata into one output row."""
|
"""Combine sample data, metric results, run metadata, and weight columns."""
|
||||||
record = sample.to_record()
|
record = sample.to_record()
|
||||||
record["contexts"] = sample.contexts
|
record["contexts"] = sample.contexts
|
||||||
record.update(score.metrics)
|
record.update(score.metrics)
|
||||||
@@ -179,4 +180,12 @@ class Evaluator:
|
|||||||
record["judge_model"] = self.scenario.judge_model
|
record["judge_model"] = self.scenario.judge_model
|
||||||
record["embedding_model"] = self.scenario.embedding_model
|
record["embedding_model"] = self.scenario.embedding_model
|
||||||
record["run_id"] = self.scenario.scenario_name
|
record["run_id"] = self.scenario.scenario_name
|
||||||
|
# Weighted score columns — enable post-hoc weighted aggregation in reporting.
|
||||||
|
record["weighted_score"] = compute_weighted_score(
|
||||||
|
score.metrics, self.scenario.metric_weights
|
||||||
|
)
|
||||||
|
doc_name = str(sample.metadata.get("doc_name", "") or "")
|
||||||
|
record["sample_weight"] = resolve_weight(
|
||||||
|
self.scenario.doc_weights, doc_name, default=1.0
|
||||||
|
)
|
||||||
return record
|
return record
|
||||||
|
|||||||
152
rag_eval/metrics/weights.py
Normal file
152
rag_eval/metrics/weights.py
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
"""Utility functions for weighted metric aggregation.
|
||||||
|
|
||||||
|
All functions are pure (no side effects, no I/O) and operate on plain dicts/lists.
|
||||||
|
Weights do not need to be pre-normalised — normalisation is done internally.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float:
    """Look up the weight registered for *key*.

    Args:
        weights: mapping of name -> weight.
        key: name to look up.
        default: value used when *key* is not present in *weights*.

    Returns:
        The stored weight, or *default* when absent, always coerced to ``float``.
    """
    if key in weights:
        return float(weights[key])
    return float(default)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_weighted_score(
    scores: dict[str, float | None],
    metric_weights: dict[str, float],
) -> float | None:
    """Return the weighted mean of the valid metric scores.

    A score is valid when it is not None, converts to ``float`` and is finite.
    A weight only contributes when it is finite and strictly positive: a
    negative or NaN weight would otherwise push the mean outside the range of
    the scores (or turn it into NaN), so such entries are ignored.

    Args:
        scores: mapping of metric_name -> raw score (may be NaN, None or
            non-numeric).
        metric_weights: optional per-metric weights; absent keys default to
            1.0. Weights do not need to be normalised.

    Returns:
        Weighted mean as a float, or None when no score carries a usable
        positive weight.

    Raises:
        TypeError, ValueError: if a configured weight cannot be converted to
            ``float``.
    """
    total_weight = 0.0
    total_score = 0.0
    for metric, score in scores.items():
        if score is None:
            continue
        try:
            value = float(score)
        except (TypeError, ValueError):
            # Non-numeric cells (e.g. error text) are not scores.
            continue
        if not math.isfinite(value):
            continue
        # A non-numeric weight is a configuration error, so it is allowed to raise.
        weight = float(metric_weights.get(metric, 1.0))
        if not math.isfinite(weight) or weight <= 0.0:
            continue
        total_weight += weight
        total_score += weight * value
    if total_weight <= 0.0:
        return None
    return total_score / total_weight
|
||||||
|
|
||||||
|
|
||||||
|
def weighted_metric_means(
    score_rows: list[dict],
    metrics: list[str],
    doc_weights: dict[str, float],
) -> dict[str, float | None]:
    """Aggregate each metric into a doc-weighted mean over all score rows.

    Every row contributes with the weight configured for its ``doc_name``
    (1.0 when the document has no entry). A row is left out of a metric's mean
    when its value for that metric is None, non-numeric, NaN or infinite.

    Args:
        score_rows: list of score record dicts.
        metrics: ordered list of metric names to aggregate.
        doc_weights: mapping doc_name -> weight multiplier.

    Returns:
        Dict mapping metric_name -> weighted mean, or None when the
        accumulated weight for that metric is not positive.
    """
    numerators: dict[str, float] = dict.fromkeys(metrics, 0.0)
    denominators: dict[str, float] = dict.fromkeys(metrics, 0.0)

    for record in score_rows:
        document = str(record.get("doc_name", "") or "")
        row_weight = resolve_weight(doc_weights, document, default=1.0)
        for name in metrics:
            cell = record.get(name)
            if cell is None:
                continue
            try:
                number = float(cell)
            except (TypeError, ValueError):
                continue
            if not math.isfinite(number):
                continue
            numerators[name] += row_weight * number
            denominators[name] += row_weight

    means: dict[str, float | None] = {}
    for name in metrics:
        denominator = denominators[name]
        means[name] = numerators[name] / denominator if denominator > 0 else None
    return means
|
||||||
|
|
||||||
|
|
||||||
|
def compute_overall_weighted_score_mean(
    score_rows: list[dict],
    metric_weights: dict[str, float],
    doc_weights: dict[str, float],
) -> float | None:
    """Compute the doc-weighted mean of the per-sample weighted scores.

    For every row, each column that is not listed in ``_META_COLUMNS`` is
    handed to ``compute_weighted_score`` as a candidate metric. Rows for which
    that returns None are skipped; the remaining per-sample scores are averaged
    using the weight configured for the row's ``doc_name`` (default 1.0).

    Returns:
        The weighted mean, or None when the accumulated doc weight is not
        positive.
    """
    weight_acc = 0.0
    score_acc = 0.0
    for record in score_rows:
        candidate_scores: dict[str, float | None] = {
            column: cell  # type: ignore[misc]
            for column, cell in record.items()
            if column not in _META_COLUMNS
        }
        row_score = compute_weighted_score(candidate_scores, metric_weights)
        if row_score is None:
            continue
        document = str(record.get("doc_name", "") or "")
        row_weight = resolve_weight(doc_weights, document, default=1.0)
        weight_acc += row_weight
        score_acc += row_weight * row_score

    if weight_acc > 0:
        return score_acc / weight_acc
    return None
|
||||||
|
|
||||||
|
|
||||||
|
# Columns in scores.csv that are sample metadata, not metric scores.
|
||||||
|
_META_COLUMNS = frozenset(
|
||||||
|
{
|
||||||
|
"sample_id",
|
||||||
|
"question",
|
||||||
|
"contexts",
|
||||||
|
"answer",
|
||||||
|
"ground_truth",
|
||||||
|
"scenario",
|
||||||
|
"language",
|
||||||
|
"retrieval_config",
|
||||||
|
"error",
|
||||||
|
"judge_model",
|
||||||
|
"embedding_model",
|
||||||
|
"run_id",
|
||||||
|
"difficulty",
|
||||||
|
"question_type",
|
||||||
|
"doc_id",
|
||||||
|
"doc_name",
|
||||||
|
"section_path",
|
||||||
|
"page_start",
|
||||||
|
"page_end",
|
||||||
|
"source_chunk_ids",
|
||||||
|
"review_status",
|
||||||
|
"review_notes",
|
||||||
|
"weighted_score",
|
||||||
|
"sample_weight",
|
||||||
|
}
|
||||||
|
)
|
||||||
@@ -6,6 +6,10 @@ import math
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import (
|
||||||
|
compute_overall_weighted_score_mean,
|
||||||
|
weighted_metric_means,
|
||||||
|
)
|
||||||
from rag_eval.shared.models import EvaluationResult
|
from rag_eval.shared.models import EvaluationResult
|
||||||
|
|
||||||
|
|
||||||
@@ -55,24 +59,41 @@ def build_summary_markdown(result: EvaluationResult) -> str:
|
|||||||
lines.append("No valid samples were scored.")
|
lines.append("No valid samples were scored.")
|
||||||
return "\n".join(lines) + "\n"
|
return "\n".join(lines) + "\n"
|
||||||
|
|
||||||
for metric in result.scenario.metrics:
|
score_rows_list = scores.to_dict(orient="records")
|
||||||
mean_value = scores[metric].mean(numeric_only=True)
|
w_means = weighted_metric_means(
|
||||||
if isinstance(mean_value, float) and not math.isnan(mean_value):
|
score_rows_list, result.scenario.metrics, result.scenario.doc_weights
|
||||||
lines.append(f"- {metric}: `{mean_value:.4f}`")
|
)
|
||||||
else:
|
|
||||||
lines.append(f"- {metric}: `n/a`")
|
|
||||||
|
|
||||||
# Keep the summary self-sufficient by including every scored sample and its errors.
|
has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights)
|
||||||
detail_columns = ["sample_id", *result.scenario.metrics, "error"]
|
|
||||||
detail = scores[detail_columns]
|
for metric in result.scenario.metrics:
|
||||||
lines.extend(
|
mean_value = w_means.get(metric)
|
||||||
[
|
w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0
|
||||||
|
weight_note = f" (w={w:.2f})" if result.scenario.metric_weights else ""
|
||||||
|
if mean_value is not None and not math.isnan(mean_value):
|
||||||
|
lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}")
|
||||||
|
else:
|
||||||
|
lines.append(f"- {metric}: `n/a`{weight_note}")
|
||||||
|
|
||||||
|
if has_weights:
|
||||||
|
overall_ws = compute_overall_weighted_score_mean(
|
||||||
|
score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
|
||||||
|
)
|
||||||
|
weight_suffix = " (加权)"
|
||||||
|
if overall_ws is not None and not math.isnan(overall_ws):
|
||||||
|
lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
|
||||||
|
else:
|
||||||
|
lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
|
||||||
|
|
||||||
|
detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
|
||||||
|
existing_columns = [c for c in detail_columns if c in scores.columns]
|
||||||
|
detail = scores[existing_columns]
|
||||||
|
lines.extend([
|
||||||
"",
|
"",
|
||||||
"## Per-sample Scores",
|
"## Per-sample Scores",
|
||||||
"",
|
"",
|
||||||
"```text",
|
"```text",
|
||||||
_table_from_frame(detail),
|
_table_from_frame(detail),
|
||||||
"```",
|
"```",
|
||||||
]
|
])
|
||||||
)
|
|
||||||
return "\n".join(lines) + "\n"
|
return "\n".join(lines) + "\n"
|
||||||
|
|||||||
@@ -52,6 +52,11 @@ class EvaluationSettings(BaseSettings):
|
|||||||
)
|
)
|
||||||
parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
|
parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
|
||||||
dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL")
|
dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL")
|
||||||
|
score_api_token: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
alias="SCORE_API_TOKEN",
|
||||||
|
description="Bearer token for /api/score endpoint. Empty = no auth.",
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def openai_client_kwargs(self) -> dict[str, str | float]:
|
def openai_client_kwargs(self) -> dict[str, str | float]:
|
||||||
|
|||||||
@@ -77,6 +77,8 @@ class Scenario:
|
|||||||
app_adapter: AppAdapterConfig | None = None
|
app_adapter: AppAdapterConfig | None = None
|
||||||
source_path: Path | None = None
|
source_path: Path | None = None
|
||||||
optimization_advisor: bool = False
|
optimization_advisor: bool = False
|
||||||
|
metric_weights: dict[str, float] = field(default_factory=dict)
|
||||||
|
doc_weights: dict[str, float] = field(default_factory=dict)
|
||||||
|
|
||||||
def snapshot(self) -> dict[str, Any]:
|
def snapshot(self) -> dict[str, Any]:
|
||||||
"""Serialize the scenario into a reporting-friendly dictionary snapshot."""
|
"""Serialize the scenario into a reporting-friendly dictionary snapshot."""
|
||||||
|
|||||||
147
run_eval.sh
Normal file
147
run_eval.sh
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
#!/usr/bin/env bash
# run_eval.sh — Siemens RAGAS evaluation runner (Linux)
# Counterpart of the Windows run_eval.ps1
#
# Usage:
#   bash run_eval.sh                      # online evaluation (default)
#   bash run_eval.sh offline              # offline smoke test
#   bash run_eval.sh scenarios/xxx.yaml   # custom scenario file
#   bash run_eval.sh online DEBUG         # explicit log level
#   bash run_eval.sh build scenarios/siemens_build/siemens-pdf-build.yaml
#                                         # question-bank generation

set -euo pipefail

# Resolve the script's own directory so it works from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# ── Colored output (plain text when stdout is not a terminal) ─────
if [ -t 1 ]; then
  GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
else
  GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
fi

ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }
info() { echo -e "${CYAN}[INFO]${NC} $*"; }

# ── Argument parsing ──────────────────────────────────────────────
SCENARIO="${1:-online}"
LOG_LEVEL="${2:-INFO}"

# Scenario aliases
declare -A SCENARIO_MAP=(
  ["online"]="scenarios/online/siemens-pdf-question-bank-online.yaml"
  ["offline"]="scenarios/offline/siemens-pdf-offline-smoke.yaml"
)

# Detect dataset build mode; in that mode the arguments shift by one
# (config path second, log level third).
BUILD_MODE=false
BUILD_CONFIG=""
if [ "$SCENARIO" = "build" ]; then
  BUILD_MODE=true
  BUILD_CONFIG="${2:-scenarios/siemens_build/siemens-pdf-build.yaml}"
  LOG_LEVEL="${3:-INFO}"
# ${arr[key]+set} expands to "set" only when the key exists. It is safe under
# `set -u` and, unlike `[ -v "arr[$key]" ]`, does not re-evaluate the key.
elif [ -n "${SCENARIO_MAP[$SCENARIO]+set}" ]; then
  SCENARIO="${SCENARIO_MAP[$SCENARIO]}"
fi

# ── Validation ────────────────────────────────────────────────────
echo ""
echo -e "${CYAN}============================================================${NC}"
echo -e "${CYAN} Siemens RAGAS — 评估运行${NC}"
echo -e "${CYAN}============================================================${NC}"
echo ""

# Check the virtual environment
if [ ! -f ".venv/bin/python" ]; then
  err "未找到 .venv,请先执行部署:bash deploy.sh"
  exit 1
fi

PYTHON=".venv/bin/python"

if [ "$BUILD_MODE" = true ]; then
  # Build mode: the config file must exist
  if [ ! -f "$BUILD_CONFIG" ]; then
    err "题库生成配置文件不存在:$BUILD_CONFIG"
    echo ""
    echo "可用配置:"
    # `|| true`: a SIGPIPE/missing-directory failure in this listing must not
    # pre-empt the intended `exit 1` below under `set -e -o pipefail`.
    find scenarios/ -name "*.yaml" 2>/dev/null | head -20 | sed 's/^/ /' || true
    exit 1
  fi
  ok "模式 : 题库生成 (dataset build)"
  ok "配置文件 : $BUILD_CONFIG"
else
  # Evaluation mode: the scenario file must exist
  if [ ! -f "$SCENARIO" ]; then
    err "场景文件不存在:$SCENARIO"
    echo ""
    echo "用法示例:"
    echo " bash run_eval.sh # online 评估"
    echo " bash run_eval.sh offline # offline 冒烟"
    echo " bash run_eval.sh scenarios/xxx.yaml # 自定义场景"
    echo " bash run_eval.sh build [config.yaml] # 题库生成"
    exit 1
  fi
  ok "场景文件 : $SCENARIO"
fi

# Normalize and validate the log level; unknown values fall back to INFO
LOG_LEVEL_UPPER="${LOG_LEVEL^^}"
case "$LOG_LEVEL_UPPER" in
  DEBUG|INFO|WARNING|ERROR) ;;
  *)
    warn "未知日志级别 '$LOG_LEVEL',使用默认值 INFO"
    LOG_LEVEL_UPPER="INFO"
    ;;
esac
ok "日志级别 : $LOG_LEVEL_UPPER"

# Create the log directory and a timestamped log file name
mkdir -p logs
TIMESTAMP=$(date +%Y-%m-%d_%H%M%S)
LOG_FILE="logs/eval_${TIMESTAMP}.log"
ok "日志文件 : $LOG_FILE"

echo ""
echo -e "${CYAN}============================================================${NC}"
echo -e "${CYAN} 开始运行,按 Ctrl+C 中止${NC}"
echo -e "${CYAN}============================================================${NC}"
echo ""

# ── Run ───────────────────────────────────────────────────────────
export PYTHONIOENCODING="utf-8"
export PYTHONPATH="."

# Capture the exit status with `|| EXIT_CODE=$?`: under `set -e` a plain
# failing command would terminate the script right here, and the failure
# report below would never be printed.
EXIT_CODE=0
if [ "$BUILD_MODE" = true ]; then
  "$PYTHON" main.py \
    --dataset-build-config "$BUILD_CONFIG" || EXIT_CODE=$?
else
  "$PYTHON" main.py \
    --scenario "$SCENARIO" \
    --log-file "$LOG_FILE" \
    --log-level "$LOG_LEVEL_UPPER" || EXIT_CODE=$?
fi

echo ""
if [ "$EXIT_CODE" -eq 0 ]; then
  echo -e "${GREEN}============================================================${NC}"
  echo -e "${GREEN} 运行完成!${NC}"
  if [ "$BUILD_MODE" = false ]; then
    echo -e "${GREEN} 日志已保存到:$LOG_FILE${NC}"
  fi
  echo -e "${CYAN} 在 Web 控制台查看报告:bash start.sh${NC}"
  echo -e "${GREEN}============================================================${NC}"
else
  err "运行失败(exit code=$EXIT_CODE)"
  if [ "$BUILD_MODE" = false ]; then
    err "查看日志:cat $LOG_FILE"
  fi
  exit "$EXIT_CODE"
fi
echo ""
|
||||||
94
start.sh
Normal file
94
start.sh
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
#!/usr/bin/env bash
# start.sh — start the Siemens RAGAS web service in the background
# Prerequisite: deploy.sh has been run (.venv and dependencies are ready)
# Usage: bash start.sh

set -euo pipefail

# Resolve the script's own directory so it works from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# ── Colored output (plain text when stdout is not a terminal) ─────
if [ -t 1 ]; then
  GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
else
  GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
fi

ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }

# port_in_use PORT — succeed when a listening TCP socket on PORT shows up in
# the output of `ss` or `netstat` (whichever is available).
# The listing is captured first and grepped from a here-string: piping
# straight into `grep -q` under `pipefail` can report failure even on a match,
# because grep exits early and the producer may die from SIGPIPE.
port_in_use() {
  local port="$1"
  local listing
  listing="$({ ss -tlnp 2>/dev/null; netstat -tlnp 2>/dev/null; } || true)"
  grep -q ":${port} " <<<"$listing"
}

echo ""
echo -e "${CYAN}============================================================${NC}"
echo -e "${CYAN} Siemens RAGAS Console — 启动服务${NC}"
echo -e "${CYAN}============================================================${NC}"
echo ""

# Check the virtual environment
if [ ! -f ".venv/bin/python" ]; then
  err "未找到 .venv,请先执行部署:bash deploy.sh"
  exit 1
fi

PYTHON=".venv/bin/python"

# Check .env (a missing file only produces a warning)
if [ ! -f ".env" ]; then
  warn ".env 不存在,请先复制并编辑配置:"
  warn " cp .env.example .env && nano .env"
fi

if grep -q "your-api-key" .env 2>/dev/null; then
  warn ".env 中仍包含默认占位符,部分功能(评估执行)将不可用"
fi

# Refuse to start twice: honor a PID file that points at a live process
if [ -f ".server.pid" ]; then
  EXISTING_PID=$(cat .server.pid)
  if kill -0 "$EXISTING_PID" 2>/dev/null; then
    warn "服务已在运行 (PID=$EXISTING_PID),无需重复启动"
    warn "如需重启请先执行:bash stop.sh"
    exit 0
  else
    # Stale PID file left behind by a dead process; clean it up
    rm -f .server.pid
  fi
fi

# Create required directories
mkdir -p logs

# Port selection: prefer 8800, fall back to 8801 once
PORT=8800
if port_in_use "$PORT"; then
  warn "端口 $PORT 已被占用,尝试 8801..."
  PORT=8801
  if port_in_use "$PORT"; then
    err "端口 8800 和 8801 均被占用,请手动指定端口:"
    err " .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
    exit 1
  fi
fi

# Start in the background, appending stdout/stderr to the server log
nohup "$PYTHON" webmain.py --host 0.0.0.0 --port "$PORT" >> logs/server.log 2>&1 &
SERVER_PID=$!
echo "$SERVER_PID" > .server.pid

# Give the process 3 seconds, then verify it is still alive
sleep 3
if kill -0 "$SERVER_PID" 2>/dev/null; then
  ok "服务已启动 (PID=$SERVER_PID)"
  echo ""
  echo -e "${CYAN} 访问地址: http://$(hostname -I | awk '{print $1}'):${PORT}${NC}"
  echo -e "${CYAN} 本机访问: http://127.0.0.1:${PORT}${NC}"
  echo -e "${CYAN} 查看日志: tail -f logs/server.log${NC}"
  echo -e "${CYAN} 停止服务: bash stop.sh${NC}"
  echo ""
else
  err "服务启动失败,请查看日志:"
  err " tail -20 logs/server.log"
  rm -f .server.pid
  exit 1
fi
|
||||||
68
stop.sh
Normal file
68
stop.sh
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
#!/usr/bin/env bash
# stop.sh — stop the Siemens RAGAS background web service
# Usage: bash stop.sh

# NOTE(review): `-e` is not enabled here, presumably because the `kill`
# probes below are expected to fail once the process is gone — confirm.
set -uo pipefail

# Resolve the script's own directory so it works from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# ── Colored output (plain text when stdout is not a terminal) ─────
if [ -t 1 ]; then
  GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
else
  GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
fi

ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }

echo ""
echo -e "${CYAN} Siemens RAGAS Console — 停止服务${NC}"
echo ""

PID_FILE="$SCRIPT_DIR/.server.pid"

# No PID file: nothing to stop
if [ ! -f "$PID_FILE" ]; then
  warn "未找到 .server.pid,服务可能未启动或已停止"
  exit 0
fi

PID=$(cat "$PID_FILE")

# Stale PID file: the recorded process no longer exists
if ! kill -0 "$PID" 2>/dev/null; then
  warn "进程 $PID 已不存在,清理 PID 文件"
  rm -f "$PID_FILE"
  exit 0
fi

# Graceful stop (SIGTERM)
echo -e " 正在停止进程 (PID=$PID)..."
kill "$PID" 2>/dev/null || true

# Wait up to 5 seconds, polling once per second
for i in 1 2 3 4 5; do
  sleep 1
  if ! kill -0 "$PID" 2>/dev/null; then
    break
  fi
  echo -e " 等待进程退出... ($i/5)"
done

# Still alive: force termination
if kill -0 "$PID" 2>/dev/null; then
  warn "进程未响应,强制终止 (SIGKILL)..."
  kill -9 "$PID" 2>/dev/null || true
  sleep 1
fi

if kill -0 "$PID" 2>/dev/null; then
  # Keep the PID file: the service is still running, and start.sh / a later
  # stop.sh rely on the file to find it.
  err "无法停止进程 $PID,请手动执行:kill -9 $PID"
  exit 1
else
  rm -f "$PID_FILE"
  ok "服务已停止"
  echo ""
fi
|
||||||
@@ -80,6 +80,64 @@ class ScenarioAndDatasetTests(unittest.TestCase):
|
|||||||
self.assertTrue(scenario.dataset.path.name.endswith(".csv"))
|
self.assertTrue(scenario.dataset.path.name.endswith(".csv"))
|
||||||
self.assertTrue(scenario.output_dir.name == "sample-offline-baseline")
|
self.assertTrue(scenario.output_dir.name == "sample-offline-baseline")
|
||||||
|
|
||||||
|
def test_load_scenario_metric_and_doc_weights(self) -> None:
    """load_scenario passes metric_weights and doc_weights into Scenario."""
    import os
    import tempfile

    import yaml

    from rag_eval.config.loader import load_scenario

    payload = {
        "scenario_name": "w-test",
        "mode": "offline",
        "dataset": "nonexistent.csv",
        "judge_model": "m",
        "embedding_model": "e",
        "metrics": ["faithfulness"],
        "output_dir": "out",
        "metric_weights": {"faithfulness": 0.7},
        "doc_weights": {"doc.pdf": 2.0},
    }
    # delete=False so the file can be reopened by path after the handle is
    # closed; it is removed explicitly in the finally block.
    with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", encoding="utf-8", delete=False) as f:
        yaml.dump(payload, f, allow_unicode=True)
        tmp_path = f.name
    try:
        scenario = load_scenario(tmp_path)
        # unittest assertions are not stripped under `python -O` and report both values.
        self.assertEqual(scenario.metric_weights, {"faithfulness": 0.7})
        self.assertEqual(scenario.doc_weights, {"doc.pdf": 2.0})
    finally:
        os.unlink(tmp_path)
|
||||||
|
|
||||||
|
def test_load_scenario_defaults_to_empty_weights(self) -> None:
    """load_scenario defaults metric_weights and doc_weights to empty dicts."""
    import os
    import tempfile

    import yaml

    from rag_eval.config.loader import load_scenario

    # The payload has no metric_weights / doc_weights keys.
    payload = {
        "scenario_name": "no-w",
        "mode": "offline",
        "dataset": "nonexistent.csv",
        "judge_model": "m",
        "embedding_model": "e",
        "metrics": ["faithfulness"],
        "output_dir": "out",
    }
    # delete=False so the file can be reopened by path after the handle is
    # closed; it is removed explicitly in the finally block.
    with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", encoding="utf-8", delete=False) as f:
        yaml.dump(payload, f, allow_unicode=True)
        tmp_path = f.name
    try:
        scenario = load_scenario(tmp_path)
        # unittest assertions are not stripped under `python -O` and report both values.
        self.assertEqual(scenario.metric_weights, {})
        self.assertEqual(scenario.doc_weights, {})
    finally:
        os.unlink(tmp_path)
|
||||||
|
|
||||||
def test_scenario_snapshot_serializes_path_static_kwargs(self) -> None:
|
def test_scenario_snapshot_serializes_path_static_kwargs(self) -> None:
|
||||||
scenario = load_scenario("scenarios/online/sample-pdf-question-bank-online.yaml")
|
scenario = load_scenario("scenarios/online/sample-pdf-question-bank-online.yaml")
|
||||||
snapshot = scenario.snapshot()
|
snapshot = scenario.snapshot()
|
||||||
@@ -125,6 +183,117 @@ class ScenarioAndDatasetTests(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
class EvaluatorAndReportingTests(unittest.TestCase):
|
class EvaluatorAndReportingTests(unittest.TestCase):
|
||||||
|
def test_merge_score_includes_weighted_score_and_sample_weight(self):
    """_merge_score adds weighted_score and sample_weight columns."""
    from unittest.mock import MagicMock

    from rag_eval.execution.evaluator import Evaluator
    from rag_eval.shared.models import (
        DatasetConfig, MetricScore, NormalizedSample, Scenario,
    )

    scenario = Scenario(
        scenario_name="w-test", mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m", embedding_model="e",
        metrics=["faithfulness", "context_recall"],
        output_dir=Path("out"),
        metric_weights={"faithfulness": 3.0, "context_recall": 1.0},
        doc_weights={"doc.pdf": 2.0},
    )
    evaluator = Evaluator(
        scenario=scenario,
        metric_pipeline=MagicMock(),
        app_adapter=None,
    )
    sample = NormalizedSample(
        sample_id="s1", question="q", contexts=["ctx"],
        answer="a", ground_truth="gt",
        metadata={"doc_name": "doc.pdf"},
    )
    score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
    row = evaluator._merge_score(sample, score)
    # (3*1.0 + 1*0.0) / (3+1) = 0.75
    self.assertAlmostEqual(row["weighted_score"], 0.75, delta=1e-4)
    # sample_weight is the doc_weights entry for the sample's doc_name.
    self.assertEqual(row["sample_weight"], 2.0)
|
||||||
|
|
||||||
|
def test_summary_markdown_shows_weighted_score(self):
    """build_summary_markdown includes weighted_score when metric_weights set."""
    from pathlib import Path

    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import (
        EvaluationResult, NormalizedSample, DatasetConfig, Scenario,
    )

    scenario = Scenario(
        scenario_name="ws-test", mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m", embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={"faithfulness": 1.0},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1", question="q", contexts=["c"],
        answer="a", ground_truth="gt",
    )
    result = EvaluationResult(
        scenario=scenario, run_id="r1",
        started_at="2026-01-01T00:00:00", finished_at="2026-01-01T00:01:00",
        valid_samples=[sample], invalid_samples=[],
        score_rows=[{
            "sample_id": "s1", "faithfulness": 0.8,
            "weighted_score": 0.8, "sample_weight": 1.0,
            "doc_name": "", "error": "",
        }],
    )
    md = build_summary_markdown(result)
    # assertIn prints the haystack on failure and is not stripped under `python -O`.
    self.assertIn("weighted_score", md)
    self.assertIn("0.8000", md)
|
||||||
|
|
||||||
|
def test_summary_markdown_hides_weighted_score_without_weights(self):
    """build_summary_markdown preserves unweighted summaries when no weights set."""
    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import DatasetConfig, EvaluationResult, NormalizedSample, Scenario

    # Both weight mappings are empty.
    scenario = Scenario(
        scenario_name="plain-test",
        mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m",
        embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1",
        question="q",
        contexts=["c"],
        answer="a",
        ground_truth="gt",
    )
    result = EvaluationResult(
        scenario=scenario,
        run_id="r1",
        started_at="2026-01-01T00:00:00",
        finished_at="2026-01-01T00:01:00",
        valid_samples=[sample],
        invalid_samples=[],
        score_rows=[{
            "sample_id": "s1",
            "faithfulness": 0.8,
            "weighted_score": 0.8,
            "sample_weight": 1.0,
            "doc_name": "",
            "error": "",
        }],
    )

    md = build_summary_markdown(result)

    # Only the bold aggregate bullet is checked; this test does not constrain
    # a weighted_score column in the per-sample table.
    self.assertNotIn("- **weighted_score", md)
|
||||||
|
|
||||||
def test_metric_pipeline_scores_sample(self) -> None:
|
def test_metric_pipeline_scores_sample(self) -> None:
|
||||||
pipeline = MetricPipeline(
|
pipeline = MetricPipeline(
|
||||||
metrics={
|
metrics={
|
||||||
|
|||||||
89
tests/test_webapp_report_builder.py
Normal file
89
tests/test_webapp_report_builder.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
"""Regression tests for weighted webapp report aggregation."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from webapp.services.report_builder import build_report
|
||||||
|
from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot
|
||||||
|
|
||||||
|
|
||||||
|
def _write_run_artifacts(run_dir: Path) -> None:
|
||||||
|
"""Create a minimal run directory with weighted scores and a snapshot."""
|
||||||
|
run_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
(run_dir / "scores.csv").write_text(
|
||||||
|
"\n".join(
|
||||||
|
[
|
||||||
|
"sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
|
||||||
|
"s1,a.pdf,1.0,0.5,0.8333,3.0",
|
||||||
|
"s2,b.pdf,0.0,0.5,0.1667,1.0",
|
||||||
|
]
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
(run_dir / "summary.md").write_text("summary", encoding="utf-8")
|
||||||
|
(run_dir / "optimization_advice.md").write_text("advice", encoding="utf-8")
|
||||||
|
(run_dir / "scenario.snapshot.yaml").write_text(
|
||||||
|
"\n".join(
|
||||||
|
[
|
||||||
|
"metrics:",
|
||||||
|
" - faithfulness",
|
||||||
|
" - context_recall",
|
||||||
|
"metric_weights:",
|
||||||
|
" faithfulness: 2.0",
|
||||||
|
" context_recall: 1.0",
|
||||||
|
"doc_weights:",
|
||||||
|
" a.pdf: 3.0",
|
||||||
|
" b.pdf: 1.0",
|
||||||
|
]
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
    """Both weight maps stored in the snapshot come back as plain float dicts."""
    snapshot_dir = tmp_path / "run"
    _write_run_artifacts(snapshot_dir)
    expected_metric_weights = {"faithfulness": 2.0, "context_recall": 1.0}
    expected_doc_weights = {"a.pdf": 3.0, "b.pdf": 1.0}

    metric_weights, doc_weights = _read_weights_from_snapshot(snapshot_dir)

    assert metric_weights == expected_metric_weights
    assert doc_weights == expected_doc_weights
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
    """The report aggregates with weighted means and surfaces the snapshot weights."""
    artifacts_dir = tmp_path / "run"
    _write_run_artifacts(artifacts_dir)
    requested_metrics = ["faithfulness", "context_recall"]
    expected_means = {
        "faithfulness": pytest.approx(0.75, rel=1e-4),
        "context_recall": pytest.approx(0.5, rel=1e-4),
    }

    report = build_report(artifacts_dir, requested_metrics)

    # Aggregated numbers.
    assert report.metric_means == expected_means
    assert report.weighted_score_mean == pytest.approx(0.6667, rel=1e-4)
    # Weights echoed from scenario.snapshot.yaml.
    assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
    assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
    # Markdown artifacts passed through verbatim.
    assert report.summary_markdown == "summary"
    assert report.advice_markdown == "advice"
|
||||||
|
|
||||||
|
|
||||||
|
def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
    """Metric inference from scores.csv ignores the weighted helper columns."""
    scores_dir = tmp_path / "run"
    scores_dir.mkdir(parents=True, exist_ok=True)
    # No snapshot file is written: only scores.csv exists in the run directory.
    header = "sample_id,doc_name,faithfulness,weighted_score,sample_weight"
    single_row = "s1,a.pdf,0.8,0.8,2.0"
    csv_text = "\n".join([header, single_row])
    (scores_dir / "scores.csv").write_text(csv_text, encoding="utf-8")

    inferred = _infer_metrics_from_scores(scores_dir)

    assert inferred == ["faithfulness"]
|
||||||
124
tests/test_weights.py
Normal file
124
tests/test_weights.py
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
"""Unit tests for rag_eval/metrics/weights.py"""
|
||||||
|
import math
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import (
|
||||||
|
compute_overall_weighted_score_mean,
|
||||||
|
compute_weighted_score,
|
||||||
|
resolve_weight,
|
||||||
|
weighted_metric_means,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestResolveWeight:
    """resolve_weight looks a key up in a weight map, falling back to a default."""

    def test_returns_value_when_key_present(self):
        """A configured key yields its configured weight."""
        configured = {"faith": 0.5}
        assert resolve_weight(configured, "faith") == 0.5

    def test_returns_default_when_key_missing(self):
        """A missing key yields the built-in default of 1.0."""
        weight = resolve_weight({}, "faith")
        assert weight == 1.0

    def test_returns_custom_default_when_key_missing(self):
        """A missing key yields the caller-supplied default."""
        weight = resolve_weight({}, "faith", default=2.0)
        assert weight == 2.0

    def test_empty_dict_returns_default(self):
        """An empty weight map yields 1.0 for any key."""
        weight = resolve_weight({}, "anything")
        assert weight == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
class TestComputeWeightedScore:
    """compute_weighted_score combines one sample's metric scores into one number."""

    def test_equal_weights_is_simple_mean(self):
        """With no weights given, the result is the plain mean."""
        metric_scores = {"faithfulness": 0.8, "context_recall": 0.6}
        combined = compute_weighted_score(metric_scores, {})
        assert combined == pytest.approx(0.7, rel=1e-4)

    def test_explicit_weights(self):
        """Explicit weights 3:1 over scores 1.0 and 0.0 give 0.75."""
        metric_scores = {"faithfulness": 1.0, "context_recall": 0.0}
        metric_weights = {"faithfulness": 3.0, "context_recall": 1.0}
        combined = compute_weighted_score(metric_scores, metric_weights)
        assert combined == pytest.approx(0.75, rel=1e-4)

    def test_nan_values_excluded(self):
        """NaN scores are left out of the average."""
        metric_scores = {"faithfulness": float("nan"), "context_recall": 0.8}
        combined = compute_weighted_score(metric_scores, {})
        assert combined == pytest.approx(0.8, rel=1e-4)

    def test_none_values_excluded(self):
        """None scores are left out of the average."""
        metric_scores = {"faithfulness": None, "context_recall": 0.6}
        combined = compute_weighted_score(metric_scores, {})
        assert combined == pytest.approx(0.6, rel=1e-4)

    def test_all_nan_returns_none(self):
        """When every score is NaN the result is None."""
        nan = float("nan")
        metric_scores = {"faithfulness": nan, "context_recall": nan}
        assert compute_weighted_score(metric_scores, {}) is None

    def test_empty_scores_returns_none(self):
        """An empty score map yields None."""
        combined = compute_weighted_score({}, {})
        assert combined is None

    def test_missing_metric_in_weights_uses_default_1(self):
        """A metric absent from the weight map is weighted 1.0."""
        metric_scores = {"faithfulness": 0.8, "context_recall": 0.4}
        partial_weights = {"faithfulness": 2.0}
        combined = compute_weighted_score(metric_scores, partial_weights)
        # (0.8 * 2.0 + 0.4 * 1.0) / 3.0 == 2.0 / 3
        assert combined == pytest.approx(2.0 / 3, rel=1e-4)
|
||||||
|
|
||||||
|
|
||||||
|
class TestWeightedMetricMeans:
    """weighted_metric_means averages each metric over rows using document weights."""

    def _rows(self):
        """Two score rows from two different documents."""
        first = {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.5}
        second = {"doc_name": "b.pdf", "faithfulness": 0.6, "context_recall": 0.8}
        return [first, second]

    def test_equal_weights_gives_arithmetic_mean(self):
        """With no document weights each metric mean is the arithmetic mean."""
        means = weighted_metric_means(self._rows(), ["faithfulness", "context_recall"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.65, rel=1e-4)

    def test_doc_weight_amplifies_contribution(self):
        """A document weighted 3:1 pulls the mean towards its own score."""
        score_rows = self._rows()
        weights_by_doc = {"a.pdf": 3.0, "b.pdf": 1.0}
        means = weighted_metric_means(score_rows, ["faithfulness"], weights_by_doc)
        # (1.0 * 3.0 + 0.6 * 1.0) / 4.0 == 0.9
        assert means["faithfulness"] == pytest.approx(0.9, rel=1e-4)

    def test_nan_rows_skipped_per_metric(self):
        """A NaN value removes a row from that metric only."""
        score_rows = [
            {"doc_name": "a.pdf", "faithfulness": float("nan"), "context_recall": 0.5},
            {"doc_name": "b.pdf", "faithfulness": 0.8, "context_recall": 0.9},
        ]
        means = weighted_metric_means(score_rows, ["faithfulness", "context_recall"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.7, rel=1e-4)

    def test_missing_metric_column_returns_none(self):
        """A requested metric with no column in any row maps to None."""
        score_rows = [{"doc_name": "a.pdf", "faithfulness": 0.8}]
        means = weighted_metric_means(score_rows, ["faithfulness", "unknown_metric"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["unknown_metric"] is None

    def test_empty_rows_returns_none_for_all(self):
        """With no rows every requested metric maps to None."""
        means = weighted_metric_means([], ["faithfulness"], {})
        assert means["faithfulness"] is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestComputeOverallWeightedScoreMean:
    """compute_overall_weighted_score_mean averages per-row weighted scores."""

    def test_basic_weighted_mean_of_weighted_scores(self):
        """Two rows whose weighted scores are both 0.5 average to 0.5."""
        score_rows = [
            {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.0},
            {"doc_name": "b.pdf", "faithfulness": 0.5, "context_recall": 0.5},
        ]
        equal_metric_weights = {"faithfulness": 1.0, "context_recall": 1.0}
        overall = compute_overall_weighted_score_mean(score_rows, equal_metric_weights, {})
        assert overall == pytest.approx(0.5, rel=1e-4)

    def test_doc_weight_amplifies_sample(self):
        """A 9:1 document weighting over scores 1.0 and 0.0 gives 0.9."""
        score_rows = [
            {"doc_name": "important.pdf", "faithfulness": 1.0},
            {"doc_name": "other.pdf", "faithfulness": 0.0},
        ]
        weights_by_doc = {"important.pdf": 9.0, "other.pdf": 1.0}
        overall = compute_overall_weighted_score_mean(score_rows, {}, weights_by_doc)
        assert overall == pytest.approx(0.9, rel=1e-4)

    def test_all_nan_returns_none(self):
        """When the only score is NaN the overall mean is None."""
        score_rows = [{"doc_name": "a.pdf", "faithfulness": float("nan")}]
        overall = compute_overall_weighted_score_mean(score_rows, {}, {})
        assert overall is None
|
||||||
@@ -137,3 +137,104 @@ def test_apply_no_profiles_returns_empty(tmp_path):
|
|||||||
_resolve_absolute=True,
|
_resolve_absolute=True,
|
||||||
)
|
)
|
||||||
assert patched == []
|
assert patched == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_metric_weights_patches_yaml(tmp_path):
    """Applying metric_weights writes them into the scenario YAML.

    Writes a minimal scenario file, patches it with only ``metric_weights``
    set, then checks that the returned value mentions ``metric_weights`` and
    that the file now holds the faithfulness weight.
    """
    import pytest
    import yaml as yaml_lib

    from webapp.services.yaml_patcher import apply_profiles_to_scenario

    scenario_file = tmp_path / "w-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
        encoding="utf-8",
    )

    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None,
        answer_profile=None,
        dataset_profile=None,
        metric_weights={"faithfulness": 0.7, "context_recall": 0.3},
        _resolve_absolute=True,
    )

    assert "metric_weights" in patched
    # The fixture was written as UTF-8, so read it back as UTF-8 rather than
    # with the platform's locale default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    # abs-only tolerance keeps the original 1e-9 bound and gives a readable failure.
    assert data["metric_weights"]["faithfulness"] == pytest.approx(0.7, abs=1e-9)
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_doc_weights_patches_yaml(tmp_path):
    """Applying doc_weights writes them into the scenario YAML.

    Writes a minimal scenario file, patches it with only ``doc_weights`` set,
    then checks that the returned value mentions ``doc_weights`` and that the
    file now holds the per-document weight.
    """
    import yaml as yaml_lib

    from webapp.services.yaml_patcher import apply_profiles_to_scenario

    scenario_file = tmp_path / "dw-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
        encoding="utf-8",
    )

    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None,
        answer_profile=None,
        dataset_profile=None,
        doc_weights={"doc.pdf": 2.0},
        _resolve_absolute=True,
    )

    assert "doc_weights" in patched
    # The fixture was written as UTF-8, so read it back as UTF-8 rather than
    # with the platform's locale default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert abs(data["doc_weights"]["doc.pdf"] - 2.0) < 1e-9
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Connectivity test endpoint tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
|
||||||
|
def test_probe_connectivity_success(client):
    """POST /api/llm-profiles/probe reports ok=True when the completion call succeeds."""
    completion = MagicMock()
    completion.choices = [MagicMock()]
    payload = {
        "model": "test-model",
        "base_url": "http://x/v1",
        "api_key": "sk-test",
    }

    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        openai_cls.return_value.chat.completions.create.return_value = completion
        resp = client.post("/api/llm-profiles/probe", json=payload)

    assert resp.status_code == 200
    body = resp.json()
    assert body["ok"] is True
    assert body["latency_ms"] is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_probe_connectivity_failure(client):
    """POST /api/llm-profiles/probe reports ok=False when the LLM call raises."""
    payload = {
        "model": "test-model",
        "base_url": "http://x/v1",
        "api_key": "sk-test",
    }

    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        create_call = openai_cls.return_value.chat.completions.create
        create_call.side_effect = Exception("connection refused")
        resp = client.post("/api/llm-profiles/probe", json=payload)

    # The endpoint still answers 200; the failure is carried in the body.
    assert resp.status_code == 200
    body = resp.json()
    assert body["ok"] is False
    assert "connection refused" in body["message"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_test_saved_profile_success(client):
    """POST /api/llm-profiles/{id}/test reports ok=True for a saved profile."""
    new_profile = {"name": "T", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
    created = client.post("/api/llm-profiles", json=new_profile)
    pid = created.json()["profile_id"]

    completion = MagicMock()
    completion.choices = [MagicMock()]

    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        openai_cls.return_value.chat.completions.create.return_value = completion
        resp = client.post(f"/api/llm-profiles/{pid}/test")

    assert resp.status_code == 200
    assert resp.json()["ok"] is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_test_nonexistent_profile_returns_404(client):
    """POST /api/llm-profiles/{id}/test answers 404 for an unknown profile id."""
    unknown_profile_url = "/api/llm-profiles/nonexistent/test"
    resp = client.post(unknown_profile_url)
    assert resp.status_code == 404
|
||||||
|
|||||||
327
tests/webapp/test_score_api.py
Normal file
327
tests/webapp/test_score_api.py
Normal file
@@ -0,0 +1,327 @@
|
|||||||
|
"""Tests for POST /api/score endpoint."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
from webapp.models import ScoreRequest, ScoreResponse
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreRequest:
    """Validation, defaults and helper methods of the ScoreRequest model."""

    def test_minimal_valid_request(self):
        """Only required fields — question, answer, contexts."""
        default_metrics = [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ]
        req = ScoreRequest(
            question="What is CT?",
            answer="CT is imaging.",
            contexts="CT uses X-rays.",
        )
        assert req.question == "What is CT?"
        assert req.contexts == "CT uses X-rays."
        # Everything not supplied falls back to the model defaults.
        assert req.ground_truth is None
        assert req.context_separator == " |||| "
        assert req.metrics == default_metrics

    def test_contexts_split_by_separator(self):
        """contexts_as_list() splits on context_separator."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="ctx1 |||| ctx2 |||| ctx3",
            context_separator=" |||| ",
        )
        pieces = req.contexts_as_list()
        assert pieces == ["ctx1", "ctx2", "ctx3"]

    def test_contexts_split_custom_separator(self):
        """A caller-chosen separator is honoured when splitting."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="a---b---c",
            context_separator="---",
        )
        pieces = req.contexts_as_list()
        assert pieces == ["a", "b", "c"]

    def test_contexts_split_single_item(self):
        """A contexts string without the separator yields a one-element list."""
        req = ScoreRequest(question="q", answer="a", contexts="only one")
        pieces = req.contexts_as_list()
        assert pieces == ["only one"]

    def test_missing_question_raises(self):
        """Omitting question fails validation."""
        with pytest.raises(ValidationError):
            ScoreRequest(answer="a", contexts="c")  # type: ignore[call-arg]

    def test_missing_answer_raises(self):
        """Omitting answer fails validation."""
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", contexts="c")  # type: ignore[call-arg]

    def test_missing_contexts_raises(self):
        """Omitting contexts fails validation."""
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", answer="a")  # type: ignore[call-arg]

    def test_custom_metrics_accepted(self):
        """An explicit metrics list replaces the default one."""
        chosen = ["faithfulness"]
        req = ScoreRequest(question="q", answer="a", contexts="c", metrics=chosen)
        assert req.metrics == ["faithfulness"]

    def test_invalid_metric_name_raises(self):
        """An unrecognised metric name fails validation."""
        with pytest.raises(ValidationError):
            ScoreRequest(
                question="q",
                answer="a",
                contexts="c",
                metrics=["not_a_metric"],
            )

    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
        """Without ground_truth, GT-dependent metrics are excluded."""
        requested = [
            "faithfulness",
            "context_recall",
            "factual_correctness",
            "semantic_similarity",
            "noise_sensitivity",
        ]
        req = ScoreRequest(question="q", answer="a", contexts="c", metrics=requested)
        effective = req.effective_metrics()
        assert "faithfulness" in effective
        assert "context_recall" not in effective
        assert "factual_correctness" not in effective
        assert "semantic_similarity" not in effective
        assert "noise_sensitivity" not in effective

    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
        """With ground_truth supplied, every requested metric is kept in order."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            ground_truth="gt",
            metrics=["faithfulness", "context_recall", "factual_correctness"],
        )
        effective = req.effective_metrics()
        assert effective == [
            "faithfulness",
            "context_recall",
            "factual_correctness",
        ]
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreResponse:
    """Shape of the ScoreResponse model."""

    def test_score_response_structure(self):
        """Scores (including a None entry) and latency are stored as given."""
        per_metric = {"faithfulness": 0.85, "answer_relevancy": None}
        resp = ScoreResponse(scores=per_metric, weighted_score=0.85, latency_ms=1200)
        assert resp.scores["faithfulness"] == 0.85
        assert resp.scores["answer_relevancy"] is None
        assert resp.latency_ms == 1200
|
||||||
|
|
||||||
|
|
||||||
|
class TestInlineScorer:
    """InlineScorer.score with model construction and the metric pipeline mocked out."""

    @staticmethod
    def _score_with_mocked_pipeline(pipeline_metrics, **score_kwargs):
        """Call InlineScorer().score while the pipeline reports ``pipeline_metrics``.

        ``build_models``, ``MetricPipeline`` and ``_build_metric_instances`` in
        ``webapp.services.inline_scorer`` are patched, so no model is built and
        no metric runs. The mocked ``score_sample`` resolves to an object whose
        ``metrics`` attribute is ``pipeline_metrics`` and whose ``error`` is "".

        Args:
            pipeline_metrics: Mapping the mocked pipeline returns as ``metrics``.
            **score_kwargs: Forwarded to ``InlineScorer.score`` alongside a
                settings object built with ``_env_file=None``.

        Returns:
            Whatever ``InlineScorer.score`` returns.
        """
        from unittest.mock import AsyncMock, MagicMock, patch

        from webapp.services.inline_scorer import InlineScorer
        from rag_eval.settings import EvaluationSettings

        sample_score = MagicMock()
        sample_score.metrics = pipeline_metrics
        sample_score.error = ""

        pipeline = MagicMock()
        pipeline.score_sample = AsyncMock(return_value=sample_score)

        module = "webapp.services.inline_scorer"
        with patch(f"{module}.build_models", return_value=(MagicMock(), MagicMock())), \
                patch(f"{module}.MetricPipeline", return_value=pipeline), \
                patch(f"{module}._build_metric_instances", return_value={}):
            scorer = InlineScorer()
            return scorer.score(
                settings=EvaluationSettings(_env_file=None),
                **score_kwargs,
            )

    def test_score_returns_dict_with_requested_metrics(self):
        """InlineScorer.score returns a dict keyed by the requested metrics."""
        result = self._score_with_mocked_pipeline(
            {"faithfulness": 0.9, "answer_relevancy": 0.8},
            question="q",
            answer="a",
            contexts=["ctx1"],
            ground_truth=None,
            metrics=["faithfulness", "answer_relevancy"],
            judge_model="test-model",
            embedding_model="test-embed",
        )
        assert "faithfulness" in result
        assert "answer_relevancy" in result
        assert result["faithfulness"] == pytest.approx(0.9)

    def test_score_converts_nan_to_none(self):
        """NaN scores are converted to None in the returned dict."""
        result = self._score_with_mocked_pipeline(
            {"faithfulness": float("nan")},
            question="q",
            answer="a",
            contexts=["c"],
            ground_truth=None,
            metrics=["faithfulness"],
            judge_model="m",
            embedding_model="e",
        )
        assert result["faithfulness"] is None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Endpoint integration tests ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@pytest.fixture()
def client(monkeypatch):
    """TestClient whose score route uses a mocked InlineScorer.

    The module-level ``inline_scorer`` in ``webapp.api.score`` is replaced by a
    MagicMock whose ``score`` returns fixed faithfulness and answer_relevancy
    values; the replacement happens before the app is created.
    """
    from unittest.mock import MagicMock

    import webapp.api.score as score_mod

    canned_scores = {
        "faithfulness": 0.85,
        "answer_relevancy": 0.90,
    }
    fake_scorer = MagicMock()
    fake_scorer.score.return_value = canned_scores
    monkeypatch.setattr(score_mod, "inline_scorer", fake_scorer)

    from webapp.server import create_app

    app = create_app()
    return TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreEndpoint:
    """Integration tests for POST /api/score with the scorer mocked out."""

    def test_post_score_returns_200(self, client):
        """A minimal valid request answers 200 with scores and latency."""
        payload = {
            "question": "What is CT?",
            "answer": "CT is imaging.",
            "contexts": "CT uses X-rays.",
        }
        resp = client.post("/api/score", json=payload)
        assert resp.status_code == 200
        body = resp.json()
        assert "scores" in body
        assert "latency_ms" in body
        assert body["scores"]["faithfulness"] == pytest.approx(0.85)

    def test_weighted_score_computed(self, client):
        """The response carries a non-null weighted_score."""
        payload = {"question": "q", "answer": "a", "contexts": "c"}
        resp = client.post("/api/score", json=payload)
        assert resp.status_code == 200
        body = resp.json()
        assert body["weighted_score"] is not None

    def test_missing_required_fields_returns_422(self, client):
        """A body lacking answer and contexts is rejected with 422."""
        incomplete = {"question": "q"}
        resp = client.post("/api/score", json=incomplete)
        assert resp.status_code == 422

    def test_invalid_metric_name_returns_422(self, client):
        """An unrecognised metric name is rejected with 422."""
        payload = {
            "question": "q",
            "answer": "a",
            "contexts": "c",
            "metrics": ["not_a_metric"],
        }
        resp = client.post("/api/score", json=payload)
        assert resp.status_code == 422

    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
        """Without ground_truth, context_recall is listed in skipped_metrics."""
        payload = {
            "question": "q",
            "answer": "a",
            "contexts": "c",
            "metrics": ["faithfulness", "context_recall"],
        }
        resp = client.post("/api/score", json=payload)
        assert resp.status_code == 200
        body = resp.json()
        assert "context_recall" in body["skipped_metrics"]

    def test_contexts_split_on_separator(self, monkeypatch):
        """contexts string is split before passing to scorer."""
        import webapp.api.score as score_mod
        from unittest.mock import MagicMock

        seen_contexts = []

        def record_contexts(**kwargs):
            # Remember what the route handed to the scorer, then return a score.
            seen_contexts.append(kwargs.get("contexts", []))
            return {"faithfulness": 0.9}

        fake_scorer = MagicMock()
        fake_scorer.score.side_effect = record_contexts
        monkeypatch.setattr(score_mod, "inline_scorer", fake_scorer)

        from webapp.server import create_app
        from fastapi.testclient import TestClient

        tc = TestClient(create_app())
        payload = {
            "question": "q",
            "answer": "a",
            "contexts": "ctx1 |||| ctx2",
            "context_separator": " |||| ",
        }
        tc.post("/api/score", json=payload)

        assert len(seen_contexts) == 1
        assert seen_contexts[0] == ["ctx1", "ctx2"]

    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
        """When SCORE_API_TOKEN is set, requests without token get 401."""
        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        from unittest.mock import MagicMock

        # Settings object carrying a token, served through the route's settings hook.
        token_settings = EvaluationSettings(_env_file=None)
        object.__setattr__(token_settings, "score_api_token", "secret-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: token_settings)

        fake_scorer = MagicMock()
        fake_scorer.score.return_value = {"faithfulness": 0.9}
        monkeypatch.setattr(score_mod, "inline_scorer", fake_scorer)

        from webapp.server import create_app
        from fastapi.testclient import TestClient

        tc = TestClient(create_app())
        payload = {"question": "q", "answer": "a", "contexts": "c"}

        # No auth header -> 401
        anonymous = tc.post("/api/score", json=payload)
        assert anonymous.status_code == 401

        # Correct token -> 200
        authorised = tc.post(
            "/api/score",
            json=payload,
            headers={"Authorization": "Bearer secret-token"},
        )
        assert authorised.status_code == 200

    def test_wrong_bearer_token_returns_401(self, monkeypatch):
        """A bearer token that differs from the configured one gets 401."""
        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        from unittest.mock import MagicMock

        token_settings = EvaluationSettings(_env_file=None)
        object.__setattr__(token_settings, "score_api_token", "correct-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: token_settings)

        fake_scorer = MagicMock()
        fake_scorer.score.return_value = {}
        monkeypatch.setattr(score_mod, "inline_scorer", fake_scorer)

        from webapp.server import create_app
        from fastapi.testclient import TestClient

        tc = TestClient(create_app())
        rejected = tc.post(
            "/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer wrong-token"},
        )
        assert rejected.status_code == 401
|
||||||
@@ -2,13 +2,18 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
from webapp.models import (
|
from webapp.models import (
|
||||||
CreateProfileRequest,
|
CreateProfileRequest,
|
||||||
LLMProfile,
|
LLMProfile,
|
||||||
ProfileApplyRequest,
|
ProfileApplyRequest,
|
||||||
ProfileApplyResponse,
|
ProfileApplyResponse,
|
||||||
|
ProfileProbeRequest,
|
||||||
|
ProfileTestResponse,
|
||||||
)
|
)
|
||||||
from webapp.services.profile_manager import profile_manager
|
from webapp.services.profile_manager import profile_manager
|
||||||
from webapp.services.yaml_patcher import apply_profiles_to_scenario
|
from webapp.services.yaml_patcher import apply_profiles_to_scenario
|
||||||
@@ -16,6 +21,43 @@ from webapp.services.yaml_patcher import apply_profiles_to_scenario
|
|||||||
router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"])
|
router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"])
|
||||||
|
|
||||||
|
|
||||||
|
def _do_connectivity_test(
    model: str,
    base_url: str,
    api_key: str,
    timeout_seconds: int,
) -> ProfileTestResponse:
    """Send a minimal chat completion request and return the test result.

    Args:
        model: Model name passed to the chat completion call.
        base_url: Endpoint base URL; trailing slashes are stripped before use.
        api_key: API key handed to the client.
        timeout_seconds: Client timeout, converted to float seconds.

    Returns:
        A ProfileTestResponse with ``ok=True`` when the one-token completion
        call returns, or ``ok=False`` with the exception text when client
        construction or the call raises. ``latency_ms`` is the elapsed
        wall-clock time in both cases.
    """
    started = time.monotonic()
    try:
        # Build the client inside the try so a constructor failure is reported
        # as a failed probe instead of escaping the route as an exception.
        client = OpenAI(
            api_key=api_key,
            base_url=base_url.rstrip("/"),
            timeout=float(timeout_seconds),
        )
        client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "hi"}],
            max_tokens=1,
        )
    except Exception as exc:  # noqa: BLE001
        ok = False
        # Some exceptions stringify to ""; fall back to the class name so the
        # caller always gets a non-empty reason.
        message = str(exc) or type(exc).__name__
    else:
        ok = True
        message = "连接成功"
    latency_ms = int((time.monotonic() - started) * 1000)
    return ProfileTestResponse(ok=ok, message=message, latency_ms=latency_ms)
|
||||||
|
|
||||||
|
|
||||||
|
# No route-level tags: the router already tags every route "llm-profiles",
# and repeating it here would put the tag on this operation twice.
@router.post("/probe", response_model=ProfileTestResponse)
def probe_connectivity(request: ProfileProbeRequest) -> ProfileTestResponse:
    """Test LLM connectivity with inline credentials (no saved profile required).

    Args:
        request: Probe payload supplying model, base_url, api_key and
            timeout_seconds.

    Returns:
        The ProfileTestResponse produced by ``_do_connectivity_test``.
    """
    return _do_connectivity_test(
        model=request.model,
        base_url=request.base_url,
        api_key=request.api_key,
        timeout_seconds=request.timeout_seconds,
    )
|
||||||
|
|
||||||
|
|
||||||
@router.get("", response_model=dict)
|
@router.get("", response_model=dict)
|
||||||
def list_profiles() -> dict:
|
def list_profiles() -> dict:
|
||||||
"""Return all saved LLM profiles."""
|
"""Return all saved LLM profiles."""
|
||||||
@@ -59,6 +101,20 @@ def delete_profile(profile_id: str) -> dict:
|
|||||||
return {"deleted": True}
|
return {"deleted": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{profile_id}/test", response_model=ProfileTestResponse)
def test_profile(profile_id: str) -> ProfileTestResponse:
    """Run the connectivity check against a stored profile.

    Raises:
        HTTPException: 404 when ``profile_manager`` has no profile with this id.
    """
    saved = profile_manager.get(profile_id)
    if saved is not None:
        # Reuse the stored credentials and timeout for the probe.
        return _do_connectivity_test(
            model=saved.model,
            base_url=saved.base_url,
            api_key=saved.api_key,
            timeout_seconds=saved.timeout_seconds,
        )
    raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
|
||||||
|
|
||||||
|
|
||||||
@router.post("/apply", response_model=ProfileApplyResponse)
|
@router.post("/apply", response_model=ProfileApplyResponse)
|
||||||
def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
|
def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
|
||||||
"""Patch selected LLM profiles into the target scenario YAML file."""
|
"""Patch selected LLM profiles into the target scenario YAML file."""
|
||||||
@@ -89,6 +145,8 @@ def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
|
|||||||
judge_profile=role_profiles["judge"],
|
judge_profile=role_profiles["judge"],
|
||||||
answer_profile=role_profiles["answer"],
|
answer_profile=role_profiles["answer"],
|
||||||
dataset_profile=role_profiles["dataset"],
|
dataset_profile=role_profiles["dataset"],
|
||||||
|
metric_weights=request.metric_weights,
|
||||||
|
doc_weights=request.doc_weights,
|
||||||
)
|
)
|
||||||
return ProfileApplyResponse(
|
return ProfileApplyResponse(
|
||||||
scenario_path=request.scenario_path,
|
scenario_path=request.scenario_path,
|
||||||
|
|||||||
105
webapp/api/score.py
Normal file
105
webapp/api/score.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Header, HTTPException
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from webapp.models import ScoreRequest, ScoreResponse
|
||||||
|
from webapp.services.inline_scorer import inline_scorer
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
|
||||||
|
|
||||||
|
def _get_settings() -> EvaluationSettings:
    """Create the settings object consumed by the scoring route.

    A new ``EvaluationSettings`` is constructed on every call; the function is
    kept at module level so tests can substitute it.
    """
    fresh_settings = EvaluationSettings()
    return fresh_settings
|
||||||
|
|
||||||
|
|
||||||
|
def _check_auth(authorization: str | None, token: str) -> None:
|
||||||
|
"""Raise 401 if Bearer token does not match the configured token."""
|
||||||
|
if authorization is None:
|
||||||
|
raise HTTPException(status_code=401, detail="Missing Authorization header.")
|
||||||
|
parts = authorization.split(" ", 1)
|
||||||
|
if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
|
||||||
|
raise HTTPException(status_code=401, detail="Invalid Bearer token.")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分(Dify 外部 Tool)",
    responses={
        200: {"description": "各指标得分和加权综合得分。"},
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败。"},
    },
)
def score_sample(
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
) -> ScoreResponse:
    """Score one question/answer sample synchronously and report the results.

    Flow:
      1. Load settings; when ``score_api_token`` is set, enforce Bearer auth.
      2. Resolve judge/embedding model names (request value, else settings).
      3. Ask the request which metrics can actually run; the rest are reported
         in ``skipped_metrics``.
      4. Run the inline scorer. A scorer exception is reported in the ``error``
         field of a normal response with empty ``scores`` rather than raised.
      5. Return every requested metric (``None`` where no score exists) plus an
         equally weighted aggregate of the non-null scores.
    """
    settings = _get_settings()

    # A configured token switches Bearer authentication on; otherwise the endpoint is open.
    if settings.score_api_token:
        _check_auth(authorization, settings.score_api_token)

    judge_model = request.judge_model or settings.ragas_judge_model
    embedding_model = request.embedding_model or settings.ragas_embedding_model

    runnable_metrics = request.effective_metrics()
    skipped = sorted(set(request.metrics).difference(runnable_metrics))
    # Every requested metric starts out as null so skipped ones stay visible to callers.
    null_scores: dict[str, float | None] = dict.fromkeys(request.metrics)

    if not runnable_metrics:
        return ScoreResponse(
            scores=null_scores,
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=skipped,
        )

    started = time.monotonic()

    def elapsed_ms() -> int:
        """Milliseconds elapsed since scoring started."""
        return int((time.monotonic() - started) * 1000)

    try:
        raw_scores = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=runnable_metrics,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=settings,
        )
    except Exception as exc:  # noqa: BLE001
        return ScoreResponse(
            scores={},
            weighted_score=None,
            latency_ms=elapsed_ms(),
            skipped_metrics=skipped,
            error=f"{type(exc).__name__}: {exc}",
        )

    latency_ms = elapsed_ms()

    # Only metrics that produced a number take part in the aggregate; the empty
    # weight mapping is passed through to compute_weighted_score as-is.
    measured = {name: value for name, value in raw_scores.items() if value is not None}
    weighted = compute_weighted_score(measured, {})
    if weighted is not None:
        weighted = round(weighted, 4)

    return ScoreResponse(
        scores={**null_scores, **raw_scores},
        weighted_score=weighted,
        latency_ms=latency_ms,
        skipped_metrics=skipped,
    )
|
||||||
332
webapp/models.py
332
webapp/models.py
@@ -5,7 +5,7 @@ from __future__ import annotations
|
|||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||||
|
|
||||||
|
|
||||||
def _utcnow_iso() -> str:
|
def _utcnow_iso() -> str:
|
||||||
@@ -74,6 +74,18 @@ class ReportData(BaseModel):
|
|||||||
lowest_samples: list[SampleScore] = Field(default_factory=list)
|
lowest_samples: list[SampleScore] = Field(default_factory=list)
|
||||||
summary_markdown: str = ""
|
summary_markdown: str = ""
|
||||||
advice_markdown: str = "" # optimization_advice.md content (empty if not generated)
|
advice_markdown: str = "" # optimization_advice.md content (empty if not generated)
|
||||||
|
weighted_score_mean: float | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="加权综合得分均值(metric_weights × doc_weights 共同作用)。",
|
||||||
|
)
|
||||||
|
metric_weights: dict[str, float] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="该次运行使用的指标权重配置(来自 scenario.snapshot.yaml)。",
|
||||||
|
)
|
||||||
|
doc_weights: dict[str, float] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="该次运行使用的文档权重配置(来自 scenario.snapshot.yaml)。",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class RunDetail(BaseModel):
|
class RunDetail(BaseModel):
|
||||||
@@ -93,6 +105,14 @@ class ScenarioInfo(BaseModel):
|
|||||||
judge_model: str = ""
|
judge_model: str = ""
|
||||||
metrics: list[str] = Field(default_factory=list)
|
metrics: list[str] = Field(default_factory=list)
|
||||||
error: str = ""
|
error: str = ""
|
||||||
|
metric_weights: dict[str, float] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="从场景 YAML 读取的指标权重配置,供前端权重面板预填。",
|
||||||
|
)
|
||||||
|
doc_weights: dict[str, float] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="从场景 YAML 读取的文档权重配置,供前端权重面板预填。",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TaskStatus(BaseModel):
|
class TaskStatus(BaseModel):
|
||||||
@@ -150,6 +170,14 @@ class ProfileApplyRequest(BaseModel):
|
|||||||
judge_profile_id: str | None = None
|
judge_profile_id: str | None = None
|
||||||
answer_profile_id: str | None = None
|
answer_profile_id: str | None = None
|
||||||
dataset_profile_id: str | None = None
|
dataset_profile_id: str | None = None
|
||||||
|
metric_weights: dict[str, float] | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="指标权重映射,如 {\"faithfulness\": 0.35}。为 null 时不修改 YAML。",
|
||||||
|
)
|
||||||
|
doc_weights: dict[str, float] | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="文档权重映射,如 {\"doc.pdf\": 2.0}。为 null 时不修改 YAML。",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class ProfileApplyResponse(BaseModel):
|
class ProfileApplyResponse(BaseModel):
|
||||||
@@ -159,6 +187,23 @@ class ProfileApplyResponse(BaseModel):
|
|||||||
patched_fields: list[str] = Field(default_factory=list)
|
patched_fields: list[str] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class ProfileProbeRequest(BaseModel):
    """Credentials supplied inline to test LLM connectivity without a saved profile."""

    # Model name forwarded to the connectivity test.
    model: str
    # Base URL of the LLM endpoint to contact.
    base_url: str
    # API key presented to that endpoint.
    api_key: str
    # Timeout forwarded to the connectivity test; defaults to 30.
    timeout_seconds: int = Field(default=30)
|
||||||
|
|
||||||
|
|
||||||
|
class ProfileTestResponse(BaseModel):
    """Outcome of one LLM connectivity test."""

    # True when the test call succeeded, False when it raised.
    ok: bool
    # Human-readable status: a success message, or the text of the exception.
    message: str
    # Elapsed time of the test call in milliseconds, when it was measured.
    latency_ms: int | None = Field(default=None)
|
||||||
|
|
||||||
|
|
||||||
def jsonable(value: Any) -> Any:
|
def jsonable(value: Any) -> Any:
|
||||||
"""Convert NaN/inf floats into None so the payload stays valid JSON."""
|
"""Convert NaN/inf floats into None so the payload stays valid JSON."""
|
||||||
import math
|
import math
|
||||||
@@ -172,3 +217,288 @@ def jsonable(value: Any) -> Any:
|
|||||||
if isinstance(value, list):
|
if isinstance(value, list):
|
||||||
return [jsonable(item) for item in value]
|
return [jsonable(item) for item in value]
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Full pipeline (build + eval) job models
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class PipelineJobRequest(BaseModel):
    """Request body for launching an end-to-end build + evaluation pipeline job.

    Only ``docs_path`` is required; every other field has a default. The
    integer limits must be positive when given (``gt=0``).
    """

    # JSON Schema ``examples`` is a list of plain example payloads. The
    # ``{"summary": ..., "value": ...}`` wrapper belongs to OpenAPI-specific
    # examples; placed here it would be shown as the request body itself, and
    # that body fails validation because ``docs_path`` is missing at top level.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                # Siemens CT document evaluation with every parameter spelled out.
                {
                    "docs_path": "datasets/siemens-pdfs",
                    "job_name": "siemens-ct-eval-2026",
                    "generation_model": "qwen3.6-plus",
                    "answer_model": "deepseek-v4-flash",
                    "judge_model": "deepseek-v4-flash",
                    "embedding_model": "text-embedding-v3",
                    "max_questions_per_document": 10,
                    "max_source_chunks_per_question": 3,
                    "max_documents": None,
                    "max_samples": None,
                    "metrics": [
                        "faithfulness",
                        "answer_relevancy",
                        "context_recall",
                        "context_precision",
                    ],
                    "optimization_advisor": False,
                    "failure_mode": "skip",
                },
                # Quick smoke test: 2 documents, 5 questions per document.
                {
                    "docs_path": "datasets/siemens-pdfs",
                    "job_name": "smoke-test",
                    "generation_model": "qwen3.6-plus",
                    "answer_model": "deepseek-v4-flash",
                    "judge_model": "deepseek-v4-flash",
                    "embedding_model": "text-embedding-v3",
                    "max_questions_per_document": 5,
                    "max_source_chunks_per_question": 3,
                    "max_documents": 2,
                    "max_samples": 10,
                    "metrics": ["faithfulness", "answer_relevancy"],
                    "optimization_advisor": False,
                    "failure_mode": "skip",
                },
            ]
        }
    )

    # Folder holding the PDF documents (absolute, or relative to the repo root).
    docs_path: str = Field(
        description="PDF 文档所在文件夹的绝对路径或相对于仓库根目录的相对路径。"
    )
    # Display name of the job; empty by default.
    job_name: str = Field(
        default="",
        description="任务显示名称;留空时系统自动生成唯一标识。",
    )
    # LLM that drafts questions from document chunks.
    generation_model: str = Field(
        default="qwen3.6-plus",
        description="用于从文档片段生成草稿题库的 LLM 模型名称。",
    )
    # LLM that answers the questions during online evaluation.
    answer_model: str = Field(
        default="deepseek-v4-flash",
        description="在线评估时调用的答题 LLM 模型名称(siemens_pdf_qa adapter)。",
    )
    # Judge LLM used for RAGAS metric scoring.
    judge_model: str = Field(
        default="deepseek-v4-flash",
        description="RAGAS 指标评分时使用的 Judge LLM 模型名称。",
    )
    # Embedding model used by RAGAS.
    embedding_model: str = Field(
        default="text-embedding-v3",
        description="RAGAS context-recall / context-precision 使用的 Embedding 模型名称。",
    )
    # Upper bound on drafted questions per PDF.
    max_questions_per_document: int = Field(
        default=10, gt=0,
        description="每份 PDF 文档最多生成的草稿题目数量。",
    )
    # Upper bound on source chunks referenced by one question.
    max_source_chunks_per_question: int = Field(
        default=3, gt=0,
        description="每道题目最多引用的文档片段(source chunk)数量。",
    )
    # Optional cap on the number of PDFs processed.
    max_documents: int | None = Field(
        default=None, gt=0,
        description="限制处理的 PDF 文件数量上限(冒烟测试时使用)。",
    )
    # Optional cap on the number of questions evaluated.
    max_samples: int | None = Field(
        default=None, gt=0,
        description="限制评估的题目数量上限(冒烟测试时使用)。",
    )
    # RAGAS metrics to compute; a fresh list is created per instance.
    metrics: list[str] = Field(
        default_factory=lambda: [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ],
        description=(
            "需要计算的 RAGAS 指标列表。"
            "可选值:faithfulness, answer_relevancy, context_recall, "
            "context_precision, noise_sensitivity, factual_correctness, semantic_similarity。"
        ),
    )
    # Enables the optimization-advice module when True.
    optimization_advisor: bool = Field(
        default=False,
        description="为 True 时启用 RAGAS 优化建议模块,生成 optimization_advice.md。",
    )
    # Strategy applied when a PDF fails to parse ("skip" or "fail" per the description).
    failure_mode: str = Field(
        default="skip",
        description="PDF 解析失败时的处理策略:skip(跳过继续)或 fail(立即中止)。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineResult(BaseModel):
    """Where a finished pipeline run left its artifacts, plus basic counts.

    All fields are required. The first five describe the dataset-build stage,
    the remaining four the evaluation stage.
    """

    # --- dataset build stage ---
    build_artifact_dir: str = Field(
        description="题库生成阶段的产物根目录路径。",
    )
    dataset_csv: str = Field(
        description="生成的草稿题库 CSV 文件路径(评估输入)。",
    )
    source_chunks_jsonl: str = Field(
        description="文档片段索引文件路径(在线评估 adapter 使用)。",
    )
    total_questions: int = Field(
        description="成功生成的有效题目总数。",
    )
    parse_failures: int = Field(
        description="文档解析失败的 PDF 数量。",
    )

    # --- evaluation stage ---
    eval_run_id: str = Field(
        description="RAGAS 评估运行 ID。",
    )
    eval_output_dir: str = Field(
        description="RAGAS 评估产物根目录路径。",
    )
    scores_csv: str = Field(
        description="每道题目逐项评分的 CSV 文件路径。",
    )
    summary_md: str = Field(
        description="评估结果摘要 Markdown 文件路径。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineJobStatus(BaseModel):
    """Snapshot of one end-to-end pipeline job.

    ``job_id``, ``job_name`` and ``status`` are required; the remaining fields
    default to an idle phase, no logs, no result, no error and empty timestamps.
    """

    # Identity and coarse state.
    job_id: str = Field(
        description="任务唯一标识符。",
    )
    job_name: str = Field(
        description="任务显示名称。",
    )
    status: str = Field(
        description="任务状态:queued | running | completed | failed。",
    )
    phase: str = Field(
        default="idle",
        description="当前执行阶段:idle | parsing_documents | generating_questions | evaluating | done。",
    )

    # Progress output and final outcome.
    logs: list[str] = Field(
        default_factory=list,
        description="实时日志行列表。",
    )
    result: PipelineResult | None = Field(
        default=None,
        description="任务完成后填充的产物路径与统计信息。",
    )
    error: str | None = Field(
        default=None,
        description="失败时的错误信息。",
    )

    # Timestamps, stored as strings.
    created_at: str = Field(
        default="",
        description="任务创建时间(ISO 8601 UTC)。",
    )
    finished_at: str = Field(
        default="",
        description="任务结束时间(ISO 8601 UTC)。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineJobResponse(BaseModel):
    """Acknowledgement sent back as soon as a pipeline job has been queued."""

    # Identifier the caller uses to poll the job afterwards.
    job_id: str = Field(
        description="任务唯一标识符,用于后续轮询状态。",
    )
    # Display name of the job.
    job_name: str = Field(
        description="任务显示名称。",
    )
    # Initial state; defaults to "queued".
    status: str = Field(
        default="queued",
        description="初始状态,通常为 queued。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Dify 实时评分 API 模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# 需要 ground_truth 才能计算的指标集合
|
||||||
|
_GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
|
||||||
|
"context_recall",
|
||||||
|
"factual_correctness",
|
||||||
|
"semantic_similarity",
|
||||||
|
"noise_sensitivity",
|
||||||
|
})
|
||||||
|
|
||||||
|
# 所有合法指标名称
|
||||||
|
_VALID_METRICS: frozenset[str] = frozenset({
|
||||||
|
"faithfulness",
|
||||||
|
"answer_relevancy",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
"noise_sensitivity",
|
||||||
|
"factual_correctness",
|
||||||
|
"semantic_similarity",
|
||||||
|
})
|
||||||
|
|
||||||
|
_DEFAULT_SCORE_METRICS: list[str] = [
|
||||||
|
"faithfulness",
|
||||||
|
"answer_relevancy",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreRequest(BaseModel):
    """Request body for the real-time single-sample scoring endpoint.

    ``question``, ``answer`` and ``contexts`` are required. ``contexts`` is a
    single string whose passages are joined by ``context_separator``.
    ``ground_truth`` is optional; without a usable reference the metrics listed
    in ``_GT_DEPENDENT_METRICS`` are dropped by ``effective_metrics``.
    """

    # JSON Schema ``examples`` is a list of plain example payloads. The
    # ``{"summary": ..., "value": ...}`` wrapper belongs to OpenAPI-specific
    # examples; placed here it would be shown as the request body itself.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                # Basic scoring request.
                {
                    "question": "双源CT的时间分辨率是多少?",
                    "answer": "双源CT的单扇区时间分辨率为75ms。",
                    "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
                    "ground_truth": "双源CT单扇区时间分辨率为75ms,需旋转135度。",
                    "context_separator": " |||| ",
                    "metrics": [
                        "faithfulness",
                        "answer_relevancy",
                        "context_recall",
                        "context_precision",
                    ],
                    "judge_model": "deepseek-v4-flash",
                    "embedding_model": "text-embedding-v3",
                }
            ]
        }
    )

    # The question that was asked.
    question: str = Field(description="问题文本。")
    # The answer to be scored.
    answer: str = Field(description="待评分的回答。")
    # Retrieved passages, concatenated with ``context_separator``.
    contexts: str = Field(
        description="检索上下文字符串,多段之间用 context_separator 拼接。"
    )
    # Optional reference answer.
    ground_truth: str | None = Field(
        default=None,
        description="标准参考答案(可选)。缺失时自动跳过需要它的指标。",
    )
    # Delimiter between passages inside ``contexts``.
    context_separator: str = Field(
        default=" |||| ",
        description="contexts 字段中段落分隔符,默认为四个竖线两侧各一空格。",
    )
    # Metrics to compute; a fresh copy of the defaults is made per instance.
    metrics: list[str] = Field(
        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
        description="需要计算的 RAGAS 指标列表。",
    )
    # Judge model override; None means the route falls back to settings.
    judge_model: str | None = Field(
        default=None,
        description="Judge LLM 模型名称;为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
    )
    # Embedding model override; None means the route falls back to settings.
    embedding_model: str | None = Field(
        default=None,
        description="Embedding 模型名称;为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
    )

    @field_validator("metrics")
    @classmethod
    def validate_metric_names(cls, value: list[str]) -> list[str]:
        """Reject unknown metric names and an empty metric list.

        Raises:
            ValueError: if any name is not in ``_VALID_METRICS``, or if the
                list is empty.
        """
        invalid = [metric_name for metric_name in value if metric_name not in _VALID_METRICS]
        if invalid:
            raise ValueError(
                f"不支持的指标名称:{invalid}。"
                f"合法值:{sorted(_VALID_METRICS)}"
            )
        if not value:
            raise ValueError("metrics 不能为空列表。")
        return value

    def contexts_as_list(self) -> list[str]:
        """Split ``contexts`` on the separator and return the non-blank, stripped parts.

        An empty ``context_separator`` falls back to the default ``" |||| "``
        because ``str.split`` rejects an empty separator.
        """
        separator = self.context_separator or " |||| "
        return [part.strip() for part in self.contexts.split(separator) if part.strip()]

    def effective_metrics(self) -> list[str]:
        """Return the requested metrics that can run with the data supplied.

        A ``ground_truth`` that is ``None`` or contains only whitespace counts
        as absent, so reference-dependent metrics are not scored against an
        empty reference. Request order is preserved.
        """
        has_reference = self.ground_truth is not None and self.ground_truth.strip() != ""
        if has_reference:
            return list(self.metrics)
        return [metric_name for metric_name in self.metrics if metric_name not in _GT_DEPENDENT_METRICS]
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreResponse(BaseModel):
    """Payload returned by the real-time scoring endpoint.

    ``scores`` and ``latency_ms`` are required; the other fields default to
    "no aggregate", "nothing skipped" and "no error".
    """

    # Per-metric score; None where no usable number exists.
    scores: dict[str, float | None] = Field(
        description="各指标得分(NaN 或计算失败时为 null)。",
    )
    # Aggregate over the non-null metric scores.
    weighted_score: float | None = Field(
        default=None,
        description="等权加权综合得分(仅对非 null 指标求均值)。",
    )
    # Server-side scoring time in milliseconds.
    latency_ms: int = Field(
        description="服务端打分耗时(毫秒)。",
    )
    # Requested metrics that were not run.
    skipped_metrics: list[str] = Field(
        default_factory=list,
        description="因缺少 ground_truth 而跳过的指标名称列表。",
    )
    # Error text when scoring failed; the response itself is still returned normally.
    error: str | None = Field(
        default=None,
        description="打分异常时的错误信息(HTTP 200 仍返回,scores 为空)。",
    )
|
||||||
|
|||||||
@@ -13,23 +13,95 @@ from fastapi import FastAPI
|
|||||||
from fastapi.responses import FileResponse
|
from fastapi.responses import FileResponse
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
|
||||||
from webapp.api import evaluations, llm_profiles, runs, scenarios
|
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
|
||||||
|
|
||||||
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||||
|
|
||||||
|
def _openapi_tag(name: str, *description_parts: str) -> dict[str, str]:
    """Build one OpenAPI tag entry, concatenating the description fragments."""
    return {"name": name, "description": "".join(description_parts)}


# OpenAPI tag metadata — controls the grouping and descriptions in /docs.
OPENAPI_TAGS = [
    _openapi_tag(
        "pipeline",
        "**全链路评估 Pipeline API**\n\n",
        "一次调用完成「解析文档 → 生成题库 → RAGAS 评估 → 输出报告」全流程。\n\n",
        "**使用流程**\n",
        "1. `POST /api/pipeline/jobs` 提交任务,立即拿到 `job_id`。\n",
        "2. `GET /api/pipeline/jobs/{job_id}` 轮询 `status` / `phase` / `logs`。\n",
        "3. 当 `status=completed` 时,`result` 字段包含所有产物路径。\n\n",
        "**Pipeline 阶段**\n",
        "| phase | 说明 |\n",
        "|-------|------|\n",
        "| `parsing_documents` | 调用阿里云 DocMind 解析每份 PDF |\n",
        "| `generating_questions` | LLM 从文档片段生成草稿题库 |\n",
        "| `evaluating` | RAGAS 在线评测打分 |\n",
        "| `done` | 所有产物写入磁盘,任务完成 |",
    ),
    _openapi_tag(
        "evaluations",
        "**单场景评估 API**\n\n",
        "基于已有 YAML 场景文件触发评估任务,并查询任务状态与日志。",
    ),
    _openapi_tag(
        "llm-profiles",
        "**LLM 配置管理 API**\n\n",
        "增删改查已保存的 LLM 连接配置(模型名称、Base URL、API Key);",
        "支持连通性测试;可将配置一键写入场景 YAML 文件。",
    ),
    _openapi_tag(
        "runs",
        "**评估运行列表 API**\n\n查询历史评估运行记录及详细报告数据。",
    ),
    _openapi_tag(
        "scenarios",
        "**场景文件 API**\n\n扫描并列出 `scenarios/` 目录下所有可用的 YAML 场景文件。",
    ),
    _openapi_tag(
        "score",
        "**实时评分 API(Dify 外部 Tool)**\n\n",
        "接受单条问答记录 `(question, answer, contexts, ground_truth)`,\n",
        "同步运行 RAGAS 指标打分,返回各指标得分和加权综合得分。\n\n",
        "适用场景:Dify Agent 在回答后即时调用,用于质量监控或自我改进。\n\n",
        "**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 ",
        "`Authorization: Bearer <token>` 请求头。",
    ),
    _openapi_tag(
        "meta",
        "**系统 API**\n\n健康检查等基础接口。",
    ),
]
|
||||||
|
|
||||||
|
|
||||||
def create_app() -> FastAPI:
|
def create_app() -> FastAPI:
|
||||||
"""Build and configure the FastAPI application instance."""
|
"""Build and configure the FastAPI application instance."""
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="Siemens RAGAS 评估控制台",
|
title="RAGAS 评估系统",
|
||||||
description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
|
description=(
|
||||||
version="0.1.0",
|
"西门子医疗影像 RAG 评估平台 API 文档。\n\n"
|
||||||
|
"提供以下能力:\n"
|
||||||
|
"- **Pipeline API** — 一键完成「解析文档 → 生成题库 → RAGAS 评估」全链路\n"
|
||||||
|
"- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
|
||||||
|
"- **评估 API** — 基于 YAML 场景文件触发单次评估\n"
|
||||||
|
"- **LLM 配置 API** — 管理多个 LLM 连接配置,支持连通性测试\n"
|
||||||
|
"- **报告 API** — 查询历史运行记录与评估报告\n\n"
|
||||||
|
"> **快速开始**:调用 `POST /api/pipeline/jobs` 传入 PDF 文件夹路径即可启动完整评估流程。"
|
||||||
|
),
|
||||||
|
version="0.2.0",
|
||||||
|
openapi_tags=OPENAPI_TAGS,
|
||||||
)
|
)
|
||||||
|
|
||||||
app.include_router(runs.router)
|
app.include_router(runs.router)
|
||||||
app.include_router(scenarios.router)
|
app.include_router(scenarios.router)
|
||||||
app.include_router(evaluations.router)
|
app.include_router(evaluations.router)
|
||||||
app.include_router(llm_profiles.router)
|
app.include_router(llm_profiles.router)
|
||||||
|
app.include_router(pipeline.router)
|
||||||
|
app.include_router(score.router)
|
||||||
|
|
||||||
@app.get("/api/health", tags=["meta"])
|
@app.get("/api/health", tags=["meta"])
|
||||||
def health() -> dict[str, str]:
|
def health() -> dict[str, str]:
|
||||||
|
|||||||
109
webapp/services/inline_scorer.py
Normal file
109
webapp/services/inline_scorer.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
"""LLM-cached inline RAGAS scorer for the real-time /api/score endpoint.
|
||||||
|
|
||||||
|
A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by
|
||||||
|
(judge_model, embedding_model), so repeated Dify Tool calls with the same
|
||||||
|
models reuse existing AsyncOpenAI connections instead of creating new ones.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rag_eval.compat import ensure_ragas_import_compat
|
||||||
|
from rag_eval.metrics.factory import build_models
|
||||||
|
from rag_eval.metrics.pipeline import MetricPipeline
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from rag_eval.shared.models import NormalizedSample
|
||||||
|
|
||||||
|
ensure_ragas_import_compat()
|
||||||
|
|
||||||
|
from ragas.metrics.collections import ( # noqa: E402
|
||||||
|
AnswerRelevancy,
|
||||||
|
ContextPrecision,
|
||||||
|
ContextRecall,
|
||||||
|
FactualCorrectness,
|
||||||
|
Faithfulness,
|
||||||
|
NoiseSensitivity,
|
||||||
|
SemanticSimilarity,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]:
|
||||||
|
"""Instantiate only the RAGAS metric objects requested."""
|
||||||
|
registry: dict[str, Any] = {
|
||||||
|
"faithfulness": Faithfulness(llm=llm),
|
||||||
|
"answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
|
||||||
|
"context_recall": ContextRecall(llm=llm),
|
||||||
|
"context_precision": ContextPrecision(llm=llm),
|
||||||
|
"noise_sensitivity": NoiseSensitivity(llm=llm),
|
||||||
|
"factual_correctness": FactualCorrectness(llm=llm),
|
||||||
|
"semantic_similarity": SemanticSimilarity(embeddings=embeddings),
|
||||||
|
}
|
||||||
|
return {name: registry[name] for name in metrics if name in registry}
|
||||||
|
|
||||||
|
|
||||||
|
class InlineScorer:
    """Single-sample RAGAS scorer that reuses model clients between calls.

    Built ``(llm, embeddings)`` pairs are memoised per
    ``(judge_model, embedding_model)``; a lock guards the cache.
    """

    def __init__(self) -> None:
        """Create the lock and the initially empty model cache."""
        self._lock = threading.Lock()
        # (judge_model, embedding_model) -> (llm, embeddings)
        self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}

    def _get_models(
        self,
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> tuple[Any, Any]:
        """Return the ``(llm, embeddings)`` pair for the given model names.

        On a cache miss the pair is built with ``build_models`` while the lock
        is held and stored. ``settings`` is used only for that first build; it
        is not part of the cache key.
        """
        key = (judge_model, embedding_model)
        with self._lock:
            pair = self._model_cache.get(key)
            if pair is None:
                llm, embeddings = build_models(judge_model, embedding_model, settings)
                pair = self._model_cache[key] = (llm, embeddings)
            return pair

    def score(
        self,
        question: str,
        answer: str,
        contexts: list[str],
        ground_truth: str | None,
        metrics: list[str],
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> dict[str, float | None]:
        """Score one sample synchronously.

        Returns:
            ``{metric_name: score}`` with scores rounded to 4 decimals; NaN and
            infinite values are replaced by ``None``.
        """
        llm, embeddings = self._get_models(judge_model, embedding_model, settings)

        pipeline = MetricPipeline(
            metrics=_build_metric_instances(metrics, llm, embeddings),
            metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
        )

        # A missing reference is passed on as an empty string.
        reference = ground_truth or ""
        sample = NormalizedSample(
            sample_id="inline-score",
            question=question,
            answer=answer,
            contexts=contexts,
            ground_truth=reference,
        )

        # NOTE(review): asyncio.run() starts a new event loop on every call while
        # the cached clients persist across calls — confirm they tolerate reuse
        # on a different loop.
        result = asyncio.run(pipeline.score_sample(sample))

        # Non-finite values become None so the result serialises as clean JSON.
        cleaned: dict[str, float | None] = {}
        for name, value in result.metrics.items():
            cleaned[name] = round(value, 4) if math.isfinite(value) else None
        return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
|
||||||
|
inline_scorer = InlineScorer()
|
||||||
@@ -13,6 +13,11 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import (
|
||||||
|
compute_overall_weighted_score_mean,
|
||||||
|
weighted_metric_means as _weighted_metric_means,
|
||||||
|
)
|
||||||
|
from webapp.services.run_reader import _read_weights_from_snapshot
|
||||||
from webapp.services.text_utils import parse_contexts
|
from webapp.services.text_utils import parse_contexts
|
||||||
from webapp.models import (
|
from webapp.models import (
|
||||||
DistributionBin,
|
DistributionBin,
|
||||||
@@ -42,17 +47,6 @@ def _round_or_none(value: float | None) -> float | None:
|
|||||||
return round(float(value), 4)
|
return round(float(value), 4)
|
||||||
|
|
||||||
|
|
||||||
def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
|
|
||||||
"""Compute the mean of each metric column across all scored samples."""
|
|
||||||
means: dict[str, float | None] = {}
|
|
||||||
for metric in metrics:
|
|
||||||
if metric in frame.columns:
|
|
||||||
means[metric] = _round_or_none(frame[metric].mean(numeric_only=True))
|
|
||||||
else:
|
|
||||||
means[metric] = None
|
|
||||||
return means
|
|
||||||
|
|
||||||
|
|
||||||
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
|
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
|
||||||
"""Bucket one metric's scores into fixed-width [0,1] histogram bins."""
|
"""Bucket one metric's scores into fixed-width [0,1] histogram bins."""
|
||||||
bins: list[DistributionBin] = []
|
bins: list[DistributionBin] = []
|
||||||
@@ -165,6 +159,7 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
|||||||
frame = run_reader.read_scores_frame(run_dir)
|
frame = run_reader.read_scores_frame(run_dir)
|
||||||
summary_markdown = run_reader.read_summary_markdown(run_dir)
|
summary_markdown = run_reader.read_summary_markdown(run_dir)
|
||||||
advice_markdown = run_reader.read_advice_markdown(run_dir)
|
advice_markdown = run_reader.read_advice_markdown(run_dir)
|
||||||
|
metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)
|
||||||
|
|
||||||
if frame.empty or not metrics:
|
if frame.empty or not metrics:
|
||||||
return ReportData(
|
return ReportData(
|
||||||
@@ -172,6 +167,18 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
|||||||
metric_means={metric: None for metric in metrics},
|
metric_means={metric: None for metric in metrics},
|
||||||
summary_markdown=summary_markdown,
|
summary_markdown=summary_markdown,
|
||||||
advice_markdown=advice_markdown,
|
advice_markdown=advice_markdown,
|
||||||
|
metric_weights=metric_weights,
|
||||||
|
doc_weights=doc_weights,
|
||||||
|
)
|
||||||
|
|
||||||
|
score_rows_list = frame.to_dict(orient="records")
|
||||||
|
|
||||||
|
# Use weighted metric means (degrades to arithmetic mean when weights are empty).
|
||||||
|
w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
|
||||||
|
rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
|
||||||
|
|
||||||
|
overall_ws = compute_overall_weighted_score_mean(
|
||||||
|
score_rows_list, metric_weights, doc_weights
|
||||||
)
|
)
|
||||||
|
|
||||||
distributions = {
|
distributions = {
|
||||||
@@ -182,10 +189,13 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
|||||||
|
|
||||||
return ReportData(
|
return ReportData(
|
||||||
metrics=metrics,
|
metrics=metrics,
|
||||||
metric_means=_metric_means(frame, metrics),
|
metric_means=rounded_means,
|
||||||
distributions=distributions,
|
distributions=distributions,
|
||||||
groupings=_groupings(frame, metrics),
|
groupings=_groupings(frame, metrics),
|
||||||
lowest_samples=_lowest_samples(frame, metrics),
|
lowest_samples=_lowest_samples(frame, metrics),
|
||||||
summary_markdown=summary_markdown,
|
summary_markdown=summary_markdown,
|
||||||
advice_markdown=advice_markdown,
|
advice_markdown=advice_markdown,
|
||||||
|
weighted_score_mean=_round_or_none(overall_ws),
|
||||||
|
metric_weights=metric_weights,
|
||||||
|
doc_weights=doc_weights,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -64,6 +64,27 @@ def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def _read_weights_from_snapshot(run_dir: Path) -> tuple[dict[str, float], dict[str, float]]:
|
||||||
|
"""Read metric_weights and doc_weights from a scenario snapshot if present.
|
||||||
|
|
||||||
|
Returns a (metric_weights, doc_weights) tuple of plain dicts.
|
||||||
|
Both default to empty dicts when the snapshot is absent or lacks the fields.
|
||||||
|
"""
|
||||||
|
snapshot = run_dir / "scenario.snapshot.yaml"
|
||||||
|
if not snapshot.is_file():
|
||||||
|
return {}, {}
|
||||||
|
try:
|
||||||
|
payload = yaml.safe_load(snapshot.read_text(encoding="utf-8")) or {}
|
||||||
|
except (OSError, yaml.YAMLError):
|
||||||
|
return {}, {}
|
||||||
|
mw = payload.get("metric_weights") or {}
|
||||||
|
dw = payload.get("doc_weights") or {}
|
||||||
|
return (
|
||||||
|
{str(k): float(v) for k, v in mw.items() if isinstance(v, (int, float))},
|
||||||
|
{str(k): float(v) for k, v in dw.items() if isinstance(v, (int, float))},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
|
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
|
||||||
"""Find every run directory (one that contains metadata.json) under the roots."""
|
"""Find every run directory (one that contains metadata.json) under the roots."""
|
||||||
run_dirs: list[Path] = []
|
run_dirs: list[Path] = []
|
||||||
@@ -159,6 +180,8 @@ NON_METRIC_COLUMNS = {
|
|||||||
"source_chunk_ids",
|
"source_chunk_ids",
|
||||||
"review_status",
|
"review_status",
|
||||||
"review_notes",
|
"review_notes",
|
||||||
|
"weighted_score",
|
||||||
|
"sample_weight",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -37,6 +37,16 @@ def _summarize_scenario(path: Path) -> ScenarioInfo:
|
|||||||
|
|
||||||
metrics = payload.get("metrics")
|
metrics = payload.get("metrics")
|
||||||
metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []
|
metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []
|
||||||
|
raw_metric_weights = payload.get("metric_weights") or {}
|
||||||
|
raw_doc_weights = payload.get("doc_weights") or {}
|
||||||
|
metric_weights = {
|
||||||
|
str(k): float(v) for k, v in raw_metric_weights.items()
|
||||||
|
if isinstance(v, (int, float))
|
||||||
|
}
|
||||||
|
doc_weights = {
|
||||||
|
str(k): float(v) for k, v in raw_doc_weights.items()
|
||||||
|
if isinstance(v, (int, float))
|
||||||
|
}
|
||||||
|
|
||||||
return ScenarioInfo(
|
return ScenarioInfo(
|
||||||
path=relative,
|
path=relative,
|
||||||
@@ -45,6 +55,8 @@ def _summarize_scenario(path: Path) -> ScenarioInfo:
|
|||||||
dataset=str(payload.get("dataset", "")),
|
dataset=str(payload.get("dataset", "")),
|
||||||
judge_model=str(payload.get("judge_model", "")),
|
judge_model=str(payload.get("judge_model", "")),
|
||||||
metrics=metric_list,
|
metrics=metric_list,
|
||||||
|
metric_weights=metric_weights,
|
||||||
|
doc_weights=doc_weights,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -32,9 +32,11 @@ def apply_profiles_to_scenario(
|
|||||||
judge_profile: LLMProfile | None,
|
judge_profile: LLMProfile | None,
|
||||||
answer_profile: LLMProfile | None,
|
answer_profile: LLMProfile | None,
|
||||||
dataset_profile: LLMProfile | None,
|
dataset_profile: LLMProfile | None,
|
||||||
|
metric_weights: dict[str, float] | None = None,
|
||||||
|
doc_weights: dict[str, float] | None = None,
|
||||||
_resolve_absolute: bool = False,
|
_resolve_absolute: bool = False,
|
||||||
) -> list[str]:
|
) -> list[str]:
|
||||||
"""Patch the YAML file at *scenario_path* with the supplied profiles.
|
"""Patch the YAML file at *scenario_path* with the supplied profiles and weights.
|
||||||
|
|
||||||
Returns a list of dotted field names that were actually patched.
|
Returns a list of dotted field names that were actually patched.
|
||||||
Setting *_resolve_absolute=True* skips repo-root resolution (used in tests).
|
Setting *_resolve_absolute=True* skips repo-root resolution (used in tests).
|
||||||
@@ -67,6 +69,14 @@ def apply_profiles_to_scenario(
|
|||||||
generation["model"] = dataset_profile.model
|
generation["model"] = dataset_profile.model
|
||||||
patched.append("generation.model")
|
patched.append("generation.model")
|
||||||
|
|
||||||
|
if metric_weights is not None:
|
||||||
|
data["metric_weights"] = dict(metric_weights)
|
||||||
|
patched.append("metric_weights")
|
||||||
|
|
||||||
|
if doc_weights is not None:
|
||||||
|
data["doc_weights"] = dict(doc_weights)
|
||||||
|
patched.append("doc_weights")
|
||||||
|
|
||||||
resolved.write_text(
|
resolved.write_text(
|
||||||
yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False),
|
yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
|
|||||||
@@ -308,6 +308,203 @@ table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: t
|
|||||||
.llm-role-label { font-size: 13px; font-weight: 600; min-width: 180px; color: var(--ink); }
|
.llm-role-label { font-size: 13px; font-weight: 600; min-width: 180px; color: var(--ink); }
|
||||||
.llm-role-select { min-width: 240px; }
|
.llm-role-select { min-width: 240px; }
|
||||||
|
|
||||||
|
/* ---------- API 文档 iframe ---------- */
|
||||||
|
#view-apidocs { padding: 0; display: flex; flex-direction: column; flex: 1; }
|
||||||
|
.apidocs-frame {
|
||||||
|
flex: 1;
|
||||||
|
width: 100%;
|
||||||
|
height: calc(100vh - 64px);
|
||||||
|
border: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.report-actions {
|
||||||
|
display: flex; justify-content: flex-end; margin: 0 0 12px;
|
||||||
|
}
|
||||||
|
.btn-export-pdf {
|
||||||
|
font-size: 13px; display: flex; align-items: center; gap: 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------- 报告历史切换下拉 ---------- */
|
||||||
|
.report-switcher {
|
||||||
|
display: flex; align-items: center; gap: 10px;
|
||||||
|
background: var(--surface); border: 1px solid var(--line);
|
||||||
|
border-radius: var(--radius); padding: 10px 16px;
|
||||||
|
margin-bottom: 14px; box-shadow: var(--shadow);
|
||||||
|
}
|
||||||
|
.report-switcher-label {
|
||||||
|
font-size: 13px; font-weight: 600; color: var(--slate); white-space: nowrap;
|
||||||
|
}
|
||||||
|
.report-switcher-select {
|
||||||
|
flex: 1; min-width: 0;
|
||||||
|
border: 1px solid var(--line); border-radius: 6px; padding: 6px 10px;
|
||||||
|
font-size: 13px; font-family: inherit; background: var(--bg); color: var(--ink);
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
.report-switcher-select:focus { outline: none; border-color: var(--petrol); }
|
||||||
|
|
||||||
|
/* ── 权重配置面板 ─────────────────────────────────── */
|
||||||
|
.weight-config-panel { margin-top: 12px; }
|
||||||
|
.weight-section-title { font-size: 13px; font-weight: 600; color: var(--text); margin-bottom: 8px; }
|
||||||
|
.weight-rows { display: flex; flex-direction: column; gap: 6px; }
|
||||||
|
.weight-row {
|
||||||
|
display: flex; align-items: center; gap: 10px;
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
.weight-row-label { min-width: 180px; color: var(--slate); font-family: monospace; }
|
||||||
|
.weight-row-input {
|
||||||
|
width: 80px; padding: 4px 8px; border: 1px solid var(--border);
|
||||||
|
border-radius: 6px; font-size: 13px; text-align: right;
|
||||||
|
}
|
||||||
|
.weight-row-input:focus { outline: none; border-color: #6366f1; }
|
||||||
|
.doc-weight-name {
|
||||||
|
flex: 1; padding: 4px 8px; border: 1px solid var(--border);
|
||||||
|
border-radius: 6px; font-size: 13px; min-width: 0;
|
||||||
|
}
|
||||||
|
.weight-row-remove { color: var(--bad); cursor: pointer; font-size: 14px; background: none; border: none; padding: 2px 6px; }
|
||||||
|
.weight-row-remove:hover { background: #fee2e2; border-radius: 4px; }
|
||||||
|
|
||||||
|
/* weighted_score 综合加权得分卡片 */
|
||||||
|
.metric-card.weighted-score-card {
|
||||||
|
border: 2px solid #6366f1;
|
||||||
|
background: #f5f3ff;
|
||||||
|
}
|
||||||
|
.metric-card.weighted-score-card .metric-name { color: #4f46e5; font-weight: 700; }
|
||||||
|
|
||||||
|
/* ================================================================
|
||||||
|
打印样式(导出 PDF 用)
|
||||||
|
浏览器打印时隐藏 UI chrome,保留报告内容,图表 canvas 原样输出
|
||||||
|
================================================================ */
|
||||||
|
@media print {
|
||||||
|
/* ── 页面尺寸与边距 ── */
|
||||||
|
@page {
|
||||||
|
size: A4 portrait;
|
||||||
|
margin: 18mm 16mm 18mm 16mm;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 隐藏所有非报告元素 ── */
|
||||||
|
.sidebar,
|
||||||
|
.topbar,
|
||||||
|
.report-actions,
|
||||||
|
.no-print,
|
||||||
|
#dist-metric-select,
|
||||||
|
.grouping-tabs,
|
||||||
|
#view-runs,
|
||||||
|
#view-new,
|
||||||
|
#view-profiles { display: none !important; }
|
||||||
|
|
||||||
|
/* ── 全局基础 ── */
|
||||||
|
body {
|
||||||
|
font-size: 11pt;
|
||||||
|
line-height: 1.5;
|
||||||
|
color: #0f1b2d;
|
||||||
|
background: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 布局重置:main 全宽 ── */
|
||||||
|
.app { display: block; }
|
||||||
|
.main { display: block; width: 100%; }
|
||||||
|
.view { padding: 0; display: block !important; }
|
||||||
|
#view-report { display: block !important; }
|
||||||
|
|
||||||
|
/* ── 报告内容 ── */
|
||||||
|
#report-content { display: block !important; }
|
||||||
|
#report-empty { display: none !important; }
|
||||||
|
|
||||||
|
/* ── 元信息条 ── */
|
||||||
|
.report-meta {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
border-bottom: 2px solid #009999;
|
||||||
|
padding-bottom: 8pt;
|
||||||
|
margin-bottom: 14pt;
|
||||||
|
}
|
||||||
|
.report-meta-title { font-size: 14pt; font-weight: 700; }
|
||||||
|
.report-meta-info { font-size: 9pt; color: #64748b; }
|
||||||
|
|
||||||
|
/* ── Section 标签 ── */
|
||||||
|
.section-label {
|
||||||
|
font-size: 9pt;
|
||||||
|
font-weight: 700;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
color: #64748b;
|
||||||
|
text-transform: uppercase;
|
||||||
|
margin: 14pt 0 6pt;
|
||||||
|
border-bottom: 1px solid #e2e8f0;
|
||||||
|
padding-bottom: 3pt;
|
||||||
|
break-after: avoid;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── ① 指标均值卡片 ── */
|
||||||
|
.metric-cards {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(90pt, 1fr));
|
||||||
|
gap: 8pt;
|
||||||
|
margin-bottom: 12pt;
|
||||||
|
}
|
||||||
|
.metric-card {
|
||||||
|
border: 1px solid #e2e8f0;
|
||||||
|
border-radius: 6pt;
|
||||||
|
padding: 10pt 8pt;
|
||||||
|
text-align: center;
|
||||||
|
break-inside: avoid;
|
||||||
|
}
|
||||||
|
.metric-value { font-size: 20pt; font-weight: 700; }
|
||||||
|
.metric-name { font-size: 8pt; color: #64748b; margin-top: 2pt; }
|
||||||
|
|
||||||
|
/* ── ② 分布 + ③ 分组:打印时改为纵向排列 ── */
|
||||||
|
.report-row {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
.report-half {
|
||||||
|
margin-bottom: 12pt;
|
||||||
|
break-inside: avoid;
|
||||||
|
}
|
||||||
|
#dist-chart {
|
||||||
|
max-height: 160pt;
|
||||||
|
width: 100% !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 面板统一 ── */
|
||||||
|
.panel {
|
||||||
|
border: 1px solid #e2e8f0;
|
||||||
|
border-radius: 6pt;
|
||||||
|
padding: 10pt 12pt;
|
||||||
|
margin-bottom: 10pt;
|
||||||
|
break-inside: avoid;
|
||||||
|
box-shadow: none;
|
||||||
|
}
|
||||||
|
.panel h2 { font-size: 12pt; margin-bottom: 4pt; }
|
||||||
|
|
||||||
|
/* ── ④ 最低分样本:打印时全部展开,隐藏点击提示 ── */
|
||||||
|
.lowest-detail { display: block !important; hidden: false; }
|
||||||
|
.lowest-row { break-inside: avoid; }
|
||||||
|
.lowest-detail-inner { padding: 8pt 0; font-size: 10pt; }
|
||||||
|
.detail-label { font-size: 8pt; font-weight: 700; color: #64748b; margin-bottom: 2pt; }
|
||||||
|
.detail-context .ctx-item { border-bottom: 1px dashed #e2e8f0; padding: 2pt 0; font-size: 9pt; }
|
||||||
|
|
||||||
|
/* ── ⑤ 优化建议 ── */
|
||||||
|
#advice-section { display: block !important; }
|
||||||
|
.advice-panel { border: 1px solid #e2e8f0; border-radius: 6pt; padding: 10pt 12pt; }
|
||||||
|
.advice-md h2 { font-size: 12pt; margin-top: 10pt; }
|
||||||
|
.advice-md h3 { font-size: 11pt; }
|
||||||
|
.advice-md ul { margin: 4pt 0 4pt 16pt; }
|
||||||
|
.advice-md li { margin-bottom: 3pt; }
|
||||||
|
|
||||||
|
/* ── 分组表 ── */
|
||||||
|
table.group-table { width: 100%; font-size: 9pt; border-collapse: collapse; }
|
||||||
|
table.group-table th,
|
||||||
|
table.group-table td { padding: 4pt 6pt; border-bottom: 1px solid #e2e8f0; }
|
||||||
|
table.group-table th { font-weight: 700; color: #64748b; }
|
||||||
|
|
||||||
|
/* ── 颜色保留(部分浏览器打印默认去色) ── */
|
||||||
|
.good { color: #16a34a !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.warn { color: #eab308 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.bad { color: #dc2626 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.score-badge.good { background: #dcfce7 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.score-badge.warn { background: #fef9c3 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.score-badge.bad { background: #fee2e2 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
}
|
||||||
|
|
||||||
/* ---------- ⑤ 优化建议面板 ---------- */
|
/* ---------- ⑤ 优化建议面板 ---------- */
|
||||||
.advice-panel { border-left: 3px solid #7c3aed; }
|
.advice-panel { border-left: 3px solid #7c3aed; }
|
||||||
.advice-header {
|
.advice-header {
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>Siemens RAGAS 评估控制台</title>
|
<title>RAGAS 评估控制台</title>
|
||||||
<link rel="stylesheet" href="/static/css/app.css" />
|
<link rel="stylesheet" href="/static/css/app.css" />
|
||||||
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
||||||
</head>
|
</head>
|
||||||
@@ -28,6 +28,9 @@
|
|||||||
<button class="nav-item" data-view="profiles">
|
<button class="nav-item" data-view="profiles">
|
||||||
<span class="nav-ico">⚙</span><span>LLM 配置</span>
|
<span class="nav-ico">⚙</span><span>LLM 配置</span>
|
||||||
</button>
|
</button>
|
||||||
|
<button class="nav-item" data-view="apidocs">
|
||||||
|
<span class="nav-ico">⎔</span><span>API 文档</span>
|
||||||
|
</button>
|
||||||
</nav>
|
</nav>
|
||||||
<div class="sidebar-foot">
|
<div class="sidebar-foot">
|
||||||
<span class="dot" id="health-dot"></span>
|
<span class="dot" id="health-dot"></span>
|
||||||
@@ -89,6 +92,22 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- ??????????????? -->
|
||||||
|
<div class="panel weight-config-panel" id="weight-config-panel" hidden>
|
||||||
|
<h2>???? <span class="muted" style="font-size:13px;font-weight:400">???????????????</span></h2>
|
||||||
|
|
||||||
|
<div class="weight-section">
|
||||||
|
<div class="weight-section-title">???? <span class="muted" style="font-size:12px">???????????????????</span></div>
|
||||||
|
<div id="metric-weight-rows" class="weight-rows"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="weight-section" style="margin-top:16px">
|
||||||
|
<div class="weight-section-title">???? <span class="muted" style="font-size:12px">?? PDF ???????????????????????</span></div>
|
||||||
|
<div id="doc-weight-rows" class="weight-rows"></div>
|
||||||
|
<button class="btn btn-sm" id="add-doc-weight-btn" style="margin-top:8px">＋ 添加文档权重</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="panel" id="task-panel" hidden>
|
<div class="panel" id="task-panel" hidden>
|
||||||
<div class="task-head">
|
<div class="task-head">
|
||||||
<h2>评估进度</h2>
|
<h2>评估进度</h2>
|
||||||
@@ -103,12 +122,25 @@
|
|||||||
|
|
||||||
<!-- 报告详情视图 -->
|
<!-- 报告详情视图 -->
|
||||||
<section class="view" id="view-report" hidden>
|
<section class="view" id="view-report" hidden>
|
||||||
|
<!-- 历史报告切换下拉(顶部,始终可见) -->
|
||||||
|
<div class="report-switcher no-print" id="report-switcher">
|
||||||
|
<label class="report-switcher-label">切换报告</label>
|
||||||
|
<select class="select report-switcher-select" id="report-switcher-select">
|
||||||
|
<option value="">— 加载中… —</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="empty" id="report-empty">
|
<div class="empty" id="report-empty">
|
||||||
<p>请先从「运行列表」选择一次运行。</p>
|
<p>请先从「运行列表」选择一次运行。</p>
|
||||||
</div>
|
</div>
|
||||||
<div id="report-content" hidden>
|
<div id="report-content" hidden>
|
||||||
<!-- 顶部元信息条 -->
|
<!-- 顶部元信息条 -->
|
||||||
<div class="report-meta" id="report-meta"></div>
|
<div class="report-meta" id="report-meta"></div>
|
||||||
|
<div class="report-actions no-print">
|
||||||
|
<button class="btn btn-ghost btn-export-pdf" id="export-pdf-btn" onclick="Report.exportPdf()">
|
||||||
|
📄 导出 PDF
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- ① 指标均值卡片 -->
|
<!-- ① 指标均值卡片 -->
|
||||||
<div class="section-label">① 指标均值 OVERVIEW</div>
|
<div class="section-label">① 指标均值 OVERVIEW</div>
|
||||||
@@ -199,6 +231,17 @@
|
|||||||
<p class="muted">点击「新建配置」添加第一个。</p>
|
<p class="muted">点击「新建配置」添加第一个。</p>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
<!-- API 文档视图 -->
|
||||||
|
<section class="view" id="view-apidocs" hidden>
|
||||||
|
<iframe
|
||||||
|
id="apidocs-frame"
|
||||||
|
src="/docs"
|
||||||
|
class="apidocs-frame"
|
||||||
|
title="API 文档"
|
||||||
|
allowfullscreen>
|
||||||
|
</iframe>
|
||||||
|
</section>
|
||||||
</main>
|
</main>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
@@ -5,8 +5,8 @@
|
|||||||
const App = {
|
const App = {
|
||||||
currentRunId: null,
|
currentRunId: null,
|
||||||
activeView: null,
|
activeView: null,
|
||||||
views: ["runs", "new", "report", "profiles"],
|
views: ["runs", "new", "report", "profiles", "apidocs"],
|
||||||
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置" },
|
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置", apidocs: "API 文档" },
|
||||||
|
|
||||||
// 初始化:绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。
|
// 初始化:绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。
|
||||||
init() {
|
init() {
|
||||||
|
|||||||
@@ -4,11 +4,16 @@ const Report = {
|
|||||||
distChart: null,
|
distChart: null,
|
||||||
currentDetail: null,
|
currentDetail: null,
|
||||||
activeGrouping: null,
|
activeGrouping: null,
|
||||||
|
_switcherLoaded: false,
|
||||||
|
|
||||||
// 加载并渲染指定运行的完整报告。
|
// 加载并渲染指定运行的完整报告。
|
||||||
async render(runId) {
|
async render(runId) {
|
||||||
const empty = document.getElementById("report-empty");
|
const empty = document.getElementById("report-empty");
|
||||||
const content = document.getElementById("report-content");
|
const content = document.getElementById("report-content");
|
||||||
|
|
||||||
|
// 加载历史报告下拉(仅首次)
|
||||||
|
Report._loadSwitcher(runId);
|
||||||
|
|
||||||
if (!runId) {
|
if (!runId) {
|
||||||
empty.hidden = false;
|
empty.hidden = false;
|
||||||
content.hidden = true;
|
content.hidden = true;
|
||||||
@@ -28,6 +33,10 @@ const Report = {
|
|||||||
Report.renderLowest(detail.report);
|
Report.renderLowest(detail.report);
|
||||||
Report.renderAdvice(detail.summary, detail.report);
|
Report.renderAdvice(detail.summary, detail.report);
|
||||||
content.style.opacity = "1";
|
content.style.opacity = "1";
|
||||||
|
|
||||||
|
// 同步下拉选中项
|
||||||
|
const sel = document.getElementById("report-switcher-select");
|
||||||
|
if (sel) sel.value = runId;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
empty.hidden = false;
|
empty.hidden = false;
|
||||||
content.hidden = true;
|
content.hidden = true;
|
||||||
@@ -35,6 +44,55 @@ const Report = {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// 加载并填充历史报告下拉选择框
|
||||||
|
async _loadSwitcher(currentRunId) {
|
||||||
|
const sel = document.getElementById("report-switcher-select");
|
||||||
|
if (!sel) return;
|
||||||
|
|
||||||
|
// 已加载过就只更新选中值,不重复请求
|
||||||
|
if (Report._switcherLoaded) {
|
||||||
|
if (currentRunId) sel.value = currentRunId;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const data = await API.runs();
|
||||||
|
const runs = data.runs || [];
|
||||||
|
sel.innerHTML = "";
|
||||||
|
if (runs.length === 0) {
|
||||||
|
sel.innerHTML = '<option value="">(无历史运行)</option>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
runs.forEach((run) => {
|
||||||
|
const opt = document.createElement("option");
|
||||||
|
opt.value = run.run_id;
|
||||||
|
const timeStr = App.shortTime(run.finished_at);
|
||||||
|
const meanText = run.metric_means
|
||||||
|
? Object.entries(run.metric_means)
|
||||||
|
.filter(([, v]) => v !== null && v !== undefined)
|
||||||
|
.slice(0, 2)
|
||||||
|
.map(([k, v]) => `${App.shortMetric(k)}=${v.toFixed(2)}`)
|
||||||
|
.join(" ")
|
||||||
|
: "";
|
||||||
|
opt.textContent = `${run.scenario_name || run.run_id} ${timeStr}${meanText ? " [" + meanText + "]" : ""}`;
|
||||||
|
sel.appendChild(opt);
|
||||||
|
});
|
||||||
|
Report._switcherLoaded = true;
|
||||||
|
if (currentRunId) sel.value = currentRunId;
|
||||||
|
} catch (_e) {
|
||||||
|
sel.innerHTML = '<option value="">(加载失败)</option>';
|
||||||
|
}
|
||||||
|
|
||||||
|
// 绑定切换事件(只绑一次)
|
||||||
|
sel.addEventListener("change", () => {
|
||||||
|
const rid = sel.value;
|
||||||
|
if (!rid) return;
|
||||||
|
App.currentRunId = rid;
|
||||||
|
App.enableReportNav();
|
||||||
|
Report.render(rid);
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
// 顶部元信息条。
|
// 顶部元信息条。
|
||||||
renderMeta(summary) {
|
renderMeta(summary) {
|
||||||
const el = document.getElementById("report-meta");
|
const el = document.getElementById("report-meta");
|
||||||
@@ -69,6 +127,18 @@ const Report = {
|
|||||||
`;
|
`;
|
||||||
wrap.appendChild(card);
|
wrap.appendChild(card);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// 综合加权得分卡片
|
||||||
|
const wsValue = (report && report.weighted_score_mean !== undefined) ? report.weighted_score_mean : null;
|
||||||
|
const wsCard = document.createElement("div");
|
||||||
|
wsCard.className = "metric-card weighted-score-card";
|
||||||
|
const wsCls = App.scoreClass(wsValue);
|
||||||
|
const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2);
|
||||||
|
wsCard.innerHTML = `
|
||||||
|
<div class="metric-value ${wsCls}">${wsText}</div>
|
||||||
|
<div class="metric-name">综合加权得分</div>
|
||||||
|
`;
|
||||||
|
wrap.appendChild(wsCard);
|
||||||
},
|
},
|
||||||
|
|
||||||
// ② 分数分布直方图(可切换指标)。
|
// ② 分数分布直方图(可切换指标)。
|
||||||
@@ -286,4 +356,22 @@ const Report = {
|
|||||||
|
|
||||||
body.innerHTML = `<div class="advice-md">${html}</div>`;
|
body.innerHTML = `<div class="advice-md">${html}</div>`;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// 导出 PDF:展开所有低分样本 → 打印 → 还原折叠状态
|
||||||
|
exportPdf() {
|
||||||
|
// 1. 记录当前各 detail 展开状态,并全部展开
|
||||||
|
const details = document.querySelectorAll("#lowest-table .lowest-detail");
|
||||||
|
const wasHidden = Array.from(details).map((el) => el.hidden);
|
||||||
|
details.forEach((el) => { el.hidden = false; });
|
||||||
|
|
||||||
|
// 2. 打印完成后还原折叠状态
|
||||||
|
const restore = () => {
|
||||||
|
details.forEach((el, i) => { el.hidden = wasHidden[i]; });
|
||||||
|
window.removeEventListener("afterprint", restore);
|
||||||
|
};
|
||||||
|
window.addEventListener("afterprint", restore);
|
||||||
|
|
||||||
|
// 3. 触发打印(浏览器弹出打印对话框,用户选"另存为 PDF")
|
||||||
|
window.print();
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,11 +1,11 @@
|
|||||||
// runner.js — 新建评估视图:列出场景、LLM角色配置、触发评估、轮询任务状态与日志。
|
// runner.js — 新建评估视图:列出场景、LLM角色配置、权重配置、触发评估、轮询任务状态。
|
||||||
|
|
||||||
const Runner = {
|
const Runner = {
|
||||||
selectedScenario: null,
|
selectedScenario: null,
|
||||||
|
selectedScenarioInfo: null,
|
||||||
pollTimer: null,
|
pollTimer: null,
|
||||||
lastRunId: null,
|
lastRunId: null,
|
||||||
|
|
||||||
// 绑定运行按钮。
|
|
||||||
init() {
|
init() {
|
||||||
document.getElementById("run-btn").addEventListener("click", () => Runner.trigger());
|
document.getElementById("run-btn").addEventListener("click", () => Runner.trigger());
|
||||||
document.getElementById("view-report-btn").addEventListener("click", () => {
|
document.getElementById("view-report-btn").addEventListener("click", () => {
|
||||||
@@ -14,9 +14,9 @@ const Runner = {
|
|||||||
App.navigate("report", Runner.lastRunId);
|
App.navigate("report", Runner.lastRunId);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
document.getElementById("add-doc-weight-btn").addEventListener("click", () => Runner._addDocWeightRow());
|
||||||
},
|
},
|
||||||
|
|
||||||
// 加载并渲染可触发的场景列表。
|
|
||||||
async loadScenarios() {
|
async loadScenarios() {
|
||||||
const list = document.getElementById("scenario-list");
|
const list = document.getElementById("scenario-list");
|
||||||
list.innerHTML = '<p class="muted">加载中…</p>';
|
list.innerHTML = '<p class="muted">加载中…</p>';
|
||||||
@@ -32,17 +32,14 @@ const Runner = {
|
|||||||
} catch (err) {
|
} catch (err) {
|
||||||
list.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
|
list.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
|
||||||
}
|
}
|
||||||
// 同时加载 profiles 供角色选择
|
|
||||||
Runner._populateProfileSelects();
|
Runner._populateProfileSelects();
|
||||||
},
|
},
|
||||||
|
|
||||||
// 填充三个角色下拉框
|
|
||||||
async _populateProfileSelects() {
|
async _populateProfileSelects() {
|
||||||
const cached = Profiles.getAll();
|
const cached = Profiles.getAll();
|
||||||
const profiles = cached.length > 0
|
const profiles = cached.length > 0
|
||||||
? cached
|
? cached
|
||||||
: (await API.profiles().catch(() => ({ profiles: [] }))).profiles;
|
: (await API.profiles().catch(() => ({ profiles: [] }))).profiles;
|
||||||
|
|
||||||
["role-judge", "role-answer", "role-dataset"].forEach(id => {
|
["role-judge", "role-answer", "role-dataset"].forEach(id => {
|
||||||
const sel = document.getElementById(id);
|
const sel = document.getElementById(id);
|
||||||
sel.innerHTML = '<option value="">— 使用场景原始配置 —</option>';
|
sel.innerHTML = '<option value="">— 使用场景原始配置 —</option>';
|
||||||
@@ -55,17 +52,14 @@ const Runner = {
|
|||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
// 构造单个场景条目。
|
|
||||||
renderScenarioItem(sc) {
|
renderScenarioItem(sc) {
|
||||||
const item = document.createElement("div");
|
const item = document.createElement("div");
|
||||||
const invalid = !!sc.error;
|
const invalid = !!sc.error;
|
||||||
item.className = "scenario-item" + (invalid ? " invalid" : "");
|
item.className = "scenario-item" + (invalid ? " invalid" : "");
|
||||||
|
|
||||||
const modeTag = sc.mode
|
const modeTag = sc.mode
|
||||||
? `<span class="tag mode-${App.escape(sc.mode)}">${App.escape(sc.mode)}</span>`
|
? `<span class="tag mode-${App.escape(sc.mode)}">${App.escape(sc.mode)}</span>`
|
||||||
: "";
|
: "";
|
||||||
const metricCount = (sc.metrics || []).length;
|
const metricCount = (sc.metrics || []).length;
|
||||||
|
|
||||||
item.innerHTML = `
|
item.innerHTML = `
|
||||||
<div>
|
<div>
|
||||||
<div class="scenario-name">${App.escape(sc.scenario_name || sc.path)}</div>
|
<div class="scenario-name">${App.escape(sc.scenario_name || sc.path)}</div>
|
||||||
@@ -77,27 +71,94 @@ const Runner = {
|
|||||||
<span class="tag">${metricCount} 指标</span>
|
<span class="tag">${metricCount} 指标</span>
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
|
|
||||||
if (!invalid) {
|
if (!invalid) {
|
||||||
item.addEventListener("click", () => {
|
item.addEventListener("click", () => {
|
||||||
document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected"));
|
document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected"));
|
||||||
item.classList.add("selected");
|
item.classList.add("selected");
|
||||||
Runner.selectedScenario = sc.path;
|
Runner.selectedScenario = sc.path;
|
||||||
|
Runner.selectedScenarioInfo = sc;
|
||||||
document.getElementById("selected-scenario").textContent = sc.path;
|
document.getElementById("selected-scenario").textContent = sc.path;
|
||||||
document.getElementById("run-btn").disabled = false;
|
document.getElementById("run-btn").disabled = false;
|
||||||
// 显示 LLM 角色面板
|
|
||||||
document.getElementById("llm-assignment-panel").hidden = false;
|
document.getElementById("llm-assignment-panel").hidden = false;
|
||||||
|
Runner._renderWeightPanel(sc);
|
||||||
|
document.getElementById("weight-config-panel").hidden = false;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
return item;
|
return item;
|
||||||
},
|
},
|
||||||
|
|
||||||
// 触发评估:先 apply profiles(若选了),再触发任务。
|
// 根据选中场景渲染指标权重行(动态生成,按场景 metrics 列表)
|
||||||
|
_renderWeightPanel(sc) {
|
||||||
|
const metricRows = document.getElementById("metric-weight-rows");
|
||||||
|
metricRows.innerHTML = "";
|
||||||
|
const metrics = sc.metrics || [];
|
||||||
|
const existingWeights = sc.metric_weights || {};
|
||||||
|
metrics.forEach(metric => {
|
||||||
|
const row = document.createElement("div");
|
||||||
|
row.className = "weight-row";
|
||||||
|
const currentVal = existingWeights[metric] != null ? existingWeights[metric] : 1.0;
|
||||||
|
row.innerHTML = `
|
||||||
|
<span class="weight-row-label">${App.escape(metric)}</span>
|
||||||
|
<input class="weight-row-input" type="number" min="0" step="0.1"
|
||||||
|
data-metric="${App.escape(metric)}" value="${currentVal}" />
|
||||||
|
`;
|
||||||
|
metricRows.appendChild(row);
|
||||||
|
});
|
||||||
|
|
||||||
|
// 填充已有文档权重
|
||||||
|
const docRows = document.getElementById("doc-weight-rows");
|
||||||
|
docRows.innerHTML = "";
|
||||||
|
const existingDocWeights = sc.doc_weights || {};
|
||||||
|
Object.entries(existingDocWeights).forEach(([docName, w]) => {
|
||||||
|
Runner._addDocWeightRow(docName, w);
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
|
// 添加一行文档权重输入
|
||||||
|
_addDocWeightRow(docName, weight) {
|
||||||
|
const name = docName !== undefined ? docName : "";
|
||||||
|
const w = weight !== undefined ? weight : 1.0;
|
||||||
|
const container = document.getElementById("doc-weight-rows");
|
||||||
|
const row = document.createElement("div");
|
||||||
|
row.className = "weight-row";
|
||||||
|
row.innerHTML = `
|
||||||
|
<input class="doc-weight-name" type="text" placeholder="PDF 文件名(如 322_双源CT.pdf)" value="${App.escape(String(name))}" />
|
||||||
|
<input class="weight-row-input" type="number" min="0" step="0.1" value="${w}" />
|
||||||
|
<button class="weight-row-remove" title="删除">✕</button>
|
||||||
|
`;
|
||||||
|
row.querySelector(".weight-row-remove").addEventListener("click", () => row.remove());
|
||||||
|
container.appendChild(row);
|
||||||
|
},
|
||||||
|
|
||||||
|
// 收集权重面板当前值;全等权时返回 null(不发送)
|
||||||
|
_collectWeights() {
|
||||||
|
const metricWeights = {};
|
||||||
|
document.querySelectorAll("#metric-weight-rows .weight-row-input").forEach(input => {
|
||||||
|
const metric = input.dataset.metric;
|
||||||
|
const val = parseFloat(input.value);
|
||||||
|
if (metric && !isNaN(val)) metricWeights[metric] = val;
|
||||||
|
});
|
||||||
|
|
||||||
|
const docWeights = {};
|
||||||
|
document.querySelectorAll("#doc-weight-rows .weight-row").forEach(row => {
|
||||||
|
const nameInput = row.querySelector(".doc-weight-name");
|
||||||
|
const valInput = row.querySelector(".weight-row-input");
|
||||||
|
if (!nameInput || !valInput) return;
|
||||||
|
const name = nameInput.value.trim();
|
||||||
|
const val = parseFloat(valInput.value);
|
||||||
|
if (name && !isNaN(val)) docWeights[name] = val;
|
||||||
|
});
|
||||||
|
|
||||||
|
const allMetricDefault = Object.values(metricWeights).every(v => Math.abs(v - 1.0) < 1e-9);
|
||||||
|
const noDocWeights = Object.keys(docWeights).length === 0;
|
||||||
|
if (allMetricDefault && noDocWeights) return { metricWeights: null, docWeights: null };
|
||||||
|
return { metricWeights, docWeights };
|
||||||
|
},
|
||||||
|
|
||||||
async trigger() {
|
async trigger() {
|
||||||
if (!Runner.selectedScenario) return;
|
if (!Runner.selectedScenario) return;
|
||||||
const runBtn = document.getElementById("run-btn");
|
const runBtn = document.getElementById("run-btn");
|
||||||
runBtn.disabled = true;
|
runBtn.disabled = true;
|
||||||
|
|
||||||
const panel = document.getElementById("task-panel");
|
const panel = document.getElementById("task-panel");
|
||||||
const logBox = document.getElementById("task-log");
|
const logBox = document.getElementById("task-log");
|
||||||
const statusBadge = document.getElementById("task-status");
|
const statusBadge = document.getElementById("task-status");
|
||||||
@@ -106,12 +167,8 @@ const Runner = {
|
|||||||
reportBtn.hidden = true;
|
reportBtn.hidden = true;
|
||||||
logBox.textContent = "";
|
logBox.textContent = "";
|
||||||
Runner._setStatus(statusBadge, "queued");
|
Runner._setStatus(statusBadge, "queued");
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Step 1: apply LLM profiles to YAML if any selected
|
|
||||||
await Runner._applyProfilesIfNeeded(logBox);
|
await Runner._applyProfilesIfNeeded(logBox);
|
||||||
|
|
||||||
// Step 2: trigger evaluation
|
|
||||||
const resp = await API.triggerEvaluation(Runner.selectedScenario);
|
const resp = await API.triggerEvaluation(Runner.selectedScenario);
|
||||||
Runner.poll(resp.task_id);
|
Runner.poll(resp.task_id);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
@@ -121,20 +178,22 @@ const Runner = {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
// 如果用户选了 profile,就先 apply 写回 YAML
|
|
||||||
async _applyProfilesIfNeeded(logBox) {
|
async _applyProfilesIfNeeded(logBox) {
|
||||||
const judgeId = document.getElementById("role-judge").value;
|
const judgeId = document.getElementById("role-judge").value;
|
||||||
const answerId = document.getElementById("role-answer").value;
|
const answerId = document.getElementById("role-answer").value;
|
||||||
const datasetId = document.getElementById("role-dataset").value;
|
const datasetId = document.getElementById("role-dataset").value;
|
||||||
|
const { metricWeights, docWeights } = Runner._collectWeights();
|
||||||
|
|
||||||
if (!judgeId && !answerId && !datasetId) return; // 全空,跳过
|
if (!judgeId && !answerId && !datasetId && !metricWeights && !docWeights) return;
|
||||||
|
|
||||||
logBox.textContent = "正在将 LLM 配置写入场景文件…\n";
|
logBox.textContent = "正在将 LLM 配置和权重写入场景文件…\n";
|
||||||
const body = {
|
const body = {
|
||||||
scenario_path: Runner.selectedScenario,
|
scenario_path: Runner.selectedScenario,
|
||||||
judge_profile_id: judgeId || null,
|
judge_profile_id: judgeId || null,
|
||||||
answer_profile_id: answerId || null,
|
answer_profile_id: answerId || null,
|
||||||
dataset_profile_id: datasetId || null,
|
dataset_profile_id: datasetId || null,
|
||||||
|
metric_weights: metricWeights,
|
||||||
|
doc_weights: docWeights,
|
||||||
};
|
};
|
||||||
const result = await API.applyProfiles(body);
|
const result = await API.applyProfiles(body);
|
||||||
const fields = (result.patched_fields || []).join(", ");
|
const fields = (result.patched_fields || []).join(", ");
|
||||||
@@ -143,13 +202,11 @@ const Runner = {
|
|||||||
: "(未找到可更新的字段,继续运行)\n";
|
: "(未找到可更新的字段,继续运行)\n";
|
||||||
},
|
},
|
||||||
|
|
||||||
// 周期性轮询任务状态,刷新日志与徽标。
|
|
||||||
poll(taskId) {
|
poll(taskId) {
|
||||||
const logBox = document.getElementById("task-log");
|
const logBox = document.getElementById("task-log");
|
||||||
const statusBadge = document.getElementById("task-status");
|
const statusBadge = document.getElementById("task-status");
|
||||||
const reportBtn = document.getElementById("view-report-btn");
|
const reportBtn = document.getElementById("view-report-btn");
|
||||||
const runBtn = document.getElementById("run-btn");
|
const runBtn = document.getElementById("run-btn");
|
||||||
|
|
||||||
if (Runner.pollTimer) clearInterval(Runner.pollTimer);
|
if (Runner.pollTimer) clearInterval(Runner.pollTimer);
|
||||||
Runner.pollTimer = setInterval(async () => {
|
Runner.pollTimer = setInterval(async () => {
|
||||||
try {
|
try {
|
||||||
@@ -157,7 +214,6 @@ const Runner = {
|
|||||||
logBox.textContent = (status.logs || []).join("\n");
|
logBox.textContent = (status.logs || []).join("\n");
|
||||||
logBox.scrollTop = logBox.scrollHeight;
|
logBox.scrollTop = logBox.scrollHeight;
|
||||||
Runner._setStatus(statusBadge, status.status);
|
Runner._setStatus(statusBadge, status.status);
|
||||||
|
|
||||||
if (status.status === "completed" || status.status === "failed") {
|
if (status.status === "completed" || status.status === "failed") {
|
||||||
clearInterval(Runner.pollTimer);
|
clearInterval(Runner.pollTimer);
|
||||||
runBtn.disabled = false;
|
runBtn.disabled = false;
|
||||||
@@ -175,7 +231,6 @@ const Runner = {
|
|||||||
}, 1200);
|
}, 1200);
|
||||||
},
|
},
|
||||||
|
|
||||||
// 更新状态徽标的文本与配色类。
|
|
||||||
_setStatus(badge, status) {
|
_setStatus(badge, status) {
|
||||||
badge.textContent = status;
|
badge.textContent = status;
|
||||||
badge.className = "badge " + status;
|
badge.className = "badge " + status;
|
||||||
|
|||||||
Reference in New Issue
Block a user