Compare commits

...

23 Commits

Author SHA1 Message Date
wangwei
1bcb208f92 feat: Dify score API complete — add SCORE_API_TOKEN to .env.example
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-22 15:28:20 +08:00
wangwei
a03a24be4e feat: add POST /api/score endpoint for Dify real-time scoring
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-22 15:14:19 +08:00
wangwei
e4d4e4968b feat: add InlineScorer service with LLM client caching
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-22 15:03:43 +08:00
wangwei
761faf9c42 feat: add ScoreRequest/ScoreResponse models and SCORE_API_TOKEN setting
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-22 15:00:05 +08:00
wangwei
9ad6ad4ebc docs: add Dify score API implementation plan 2026-06-22 14:55:43 +08:00
wangwei
eee96eb158 docs: add Dify score API integration design spec
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-22 14:51:52 +08:00
wangwei
ccf25eb1f9 feat: add Linux deployment scripts (deploy/start/stop/run_eval)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-22 14:28:44 +08:00
wangwei
199b3af611 docs: add Linux deploy script design spec
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-22 14:18:14 +08:00
wangwei
f9e3ba0f64 feat: add weight config panel to 新建评估 and weighted_score card to report
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 17:28:15 +08:00
wangwei
36e5506e2a feat: report_builder uses weighted means; ReportData gains weighted_score_mean
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 17:16:09 +08:00
wangwei
835614189e feat: ScenarioInfo exposes metric_weights and doc_weights from YAML
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 17:05:26 +08:00
wangwei
ce0d2291b0 feat: yaml_patcher and ProfileApplyRequest support metric_weights and doc_weights
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 17:02:21 +08:00
wangwei
480f6d66ea feat: use weighted metric means and add weighted_score row to summary.md
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 16:59:56 +08:00
wangwei
d371ef7d24 feat: add weighted_score and sample_weight columns to score rows
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 16:53:45 +08:00
wangwei
8617eaa5aa feat: add metric_weights and doc_weights to Scenario schema and dataclass
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 16:50:33 +08:00
wangwei
e0b064587f feat: add metric/doc weight computation module (weights.py)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 16:47:47 +08:00
wangwei
078097af00 docs: add metric/doc weights implementation plan 2026-06-18 16:43:08 +08:00
wangwei
ca586bf9bb docs: add metric and doc weights feature design spec
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-18 16:37:18 +08:00
wangwei
9ad2daff73 feat: restore API文档 nav item (iframe /docs) without touching other 4 modules 2026-06-17 11:24:16 +08:00
wangwei
e8af5b906c chore: remove API docs iframe nav item, rename title to RAGAS 评估控制台 2026-06-17 11:18:01 +08:00
wangwei
8ea2b9c7d2 feat: add API文档 nav item with embedded Swagger UI iframe 2026-06-17 11:09:55 +08:00
wangwei
074800b741 feat: add history report switcher dropdown in report detail view 2026-06-17 10:35:56 +08:00
wangwei
3019390592 feat: add export-to-PDF via browser print with @media print CSS 2026-06-17 10:28:01 +08:00
36 changed files with 5729 additions and 65 deletions

View File

@@ -30,3 +30,8 @@ PARSER_FAILURE_MODE=fail
# 生成题库时使用的模型（可在 Web 控制台 LLM 配置中按场景覆盖）
DATASET_GENERATOR_MODEL=qwen3.6-plus
# ===== Dify 集成 — 实时评分 API =====
# 为 /api/score 端点设置 Bearer Token 鉴权(留空则不鉴权,适合内网部署)
# Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
SCORE_API_TOKEN=

173
deploy.sh Normal file
View File

@@ -0,0 +1,173 @@
#!/usr/bin/env bash
# deploy.sh — one-step deployment script for the Siemens RAGAS console (Linux).
# Usage: bash deploy.sh
# What it does: check the environment -> install dependencies -> initialise
# configuration -> start the web service in the background.
# Abort on any failing command (-e), on use of an unset variable (-u), and when
# any stage of a pipeline fails (pipefail).
set -euo pipefail
# Resolve the directory that contains this script and work from there, so every
# relative path used below (.venv, .env, logs/, ...) is relative to the script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# ── Coloured output ──
# ANSI colour codes are used only when stdout is a terminal; otherwise the
# variables are empty so redirected output contains no escape sequences.
if [ -t 1 ]; then
GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
else
GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
fi
# Logging helpers: each prints a coloured level tag followed by its arguments.
# err writes to stderr; the others write to stdout.
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }
info() { echo -e "${CYAN}[INFO]${NC} $*"; }
# Start-up banner.
echo ""
echo -e "${CYAN}============================================================${NC}"
echo -e "${CYAN} Siemens RAGAS Console — Linux 一键部署${NC}"
echo -e "${CYAN}============================================================${NC}"
echo ""
# ── Stage 1: Python version check ──
# Pick the first interpreter on PATH that is Python 3.12 or newer and remember
# its command name in PYTHON_BIN; abort with install hints when none is found.
info "阶段 1/7检查 Python 版本..."
PYTHON_BIN=""
for candidate in python3.12 python3.13 python3.14 python3; do
    command -v "$candidate" &>/dev/null || continue
    # The interpreter prints its own "major.minor" and reports through its exit
    # status whether it is >= 3.12. Comparing the version tuple inside Python
    # replaces the component-wise shell test (major >= 3 AND minor >= 12), which
    # would wrongly reject a release such as 4.0. An interpreter too old to
    # parse the f-string fails here as well and is simply skipped.
    if version=$("$candidate" -c "import sys; v = sys.version_info; print(f'{v.major}.{v.minor}'); sys.exit(0 if v >= (3, 12) else 1)" 2>/dev/null); then
        PYTHON_BIN="$candidate"
        ok "Python $version ($candidate)"
        break
    fi
done
if [ -z "$PYTHON_BIN" ]; then
    err "未找到 Python 3.12+。请安装后重试。"
    err " Ubuntu/Debian: sudo apt install python3.12 python3.12-venv"
    err " CentOS/RHEL: sudo dnf install python3.12"
    exit 1
fi
# ── 阶段 2虚拟环境 ──────────────────────────────────────────────
info "阶段 2/7准备虚拟环境..."
if [ -d ".venv" ] && [ -f ".venv/bin/python" ]; then
ok ".venv 已存在,跳过创建"
else
info "创建 .venv..."
"$PYTHON_BIN" -m venv .venv
ok ".venv 创建完成"
fi
PIP=".venv/bin/pip"
PYTHON=".venv/bin/python"
# ── 阶段 3安装依赖 ──────────────────────────────────────────────
info "阶段 3/7安装项目依赖可能需要几分钟..."
"$PIP" install --upgrade pip -q
ok "pip 已升级"
"$PIP" install -e . -q
ok "项目依赖安装完成pyproject.toml"
"$PIP" install fastapi uvicorn httpx -q
ok "Web 服务依赖安装完成fastapi / uvicorn / httpx"
# ── 阶段 4配置文件 ──────────────────────────────────────────────
info "阶段 4/7初始化配置文件..."
if [ ! -f ".env" ]; then
cp .env.example .env
warn ".env 已从 .env.example 复制,请编辑填写实际的 API Key 等配置后再启动:"
warn " nano .env 或 vim .env"
warn " 关键字段OPENAI_API_KEY, OPENAI_BASE_URL, ALIBABA_ACCESS_KEY_ID, ALIBABA_ACCESS_KEY_SECRET"
else
ok ".env 已存在,跳过"
fi
# ── 阶段 5目录初始化 ────────────────────────────────────────────
info "阶段 5/7初始化目录结构..."
mkdir -p configs logs outputs datasets
ok "目录就绪configs/ logs/ outputs/ datasets/"
# 确保其他脚本有执行权限
for script in start.sh stop.sh run_eval.sh; do
[ -f "$script" ] && chmod +x "$script"
done
ok "辅助脚本已设置执行权限"
# ── 阶段 6Demo 数据 ─────────────────────────────────────────────
info "阶段 6/7初始化演示数据..."
DEMO_DIR="outputs/kba-knowledge-base-offline-baseline"
if [ -d "$DEMO_DIR" ]; then
ok "演示数据已存在,跳过"
else
info "生成演示数据scripts/seed_sample_run.py..."
if "$PYTHON" scripts/seed_sample_run.py; then
ok "演示数据生成完成"
else
warn "演示数据生成失败,控制台报告页将为空(服务仍可正常启动)"
fi
fi
# ── 阶段 7启动服务 ──────────────────────────────────────────────
info "阶段 7/7启动 Web 服务..."
# 检查 .env 是否包含默认占位符
if grep -q "your-api-key" .env 2>/dev/null; then
warn ".env 中仍包含默认占位符,部分功能(评估执行)将不可用"
warn "请编辑 .env 后重新运行 start.sh"
fi
# Stop a previous server instance FIRST, then probe for a free port.
# Probing before the old process is stopped makes a redeploy see port 8800 as
# occupied by its own previous instance and needlessly fall back to 8801.
if [ -f ".server.pid" ]; then
    # An unreadable or empty PID file yields an empty OLD_PID, which the
    # `kill -0` liveness test below simply reports as "not running".
    OLD_PID=$(cat .server.pid 2>/dev/null || true)
    if kill -0 "$OLD_PID" 2>/dev/null; then
        warn "检测到已有服务进程 (PID=$OLD_PID)，停止旧进程..."
        kill "$OLD_PID" 2>/dev/null || true
        # Wait up to about 5 seconds for the old process to exit so that its
        # listening socket is released before the port probe runs.
        for _ in 1 2 3 4 5; do
            kill -0 "$OLD_PID" 2>/dev/null || break
            sleep 1
        done
    fi
    rm -f .server.pid
fi

# port_in_use PORT — succeeds when ss (or, failing that, netstat) lists a TCP
# listener whose local address ends in ":PORT".
port_in_use() {
    ss -tlnp 2>/dev/null | grep -q ":$1 " || netstat -tlnp 2>/dev/null | grep -q ":$1 "
}

# Port selection: prefer 8800, fall back to 8801, give up when both are taken.
PORT=8800
if port_in_use "$PORT"; then
    warn "端口 $PORT 已被占用，尝试 8801..."
    PORT=8801
    if port_in_use "$PORT"; then
        err "端口 8800 和 8801 均被占用。请手动运行："
        err " .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
        exit 1
    fi
fi
# 后台启动
nohup "$PYTHON" webmain.py --host 0.0.0.0 --port "$PORT" >> logs/server.log 2>&1 &
SERVER_PID=$!
echo "$SERVER_PID" > .server.pid
# 等待 3 秒验证进程存活
sleep 3
if kill -0 "$SERVER_PID" 2>/dev/null; then
ok "服务已启动 (PID=$SERVER_PID)"
echo ""
echo -e "${CYAN}============================================================${NC}"
echo -e "${GREEN} 部署成功!${NC}"
echo -e "${GREEN} 访问地址: http://$(hostname -I | awk '{print $1}'):${PORT}${NC}"
echo -e "${GREEN} 本机访问: http://127.0.0.1:${PORT}${NC}"
echo -e "${CYAN} 服务日志: tail -f logs/server.log${NC}"
echo -e "${CYAN} 停止服务: bash stop.sh${NC}"
echo -e "${CYAN}============================================================${NC}"
echo ""
else
err "服务启动失败,请查看日志:"
err " tail -20 logs/server.log"
rm -f .server.pid
exit 1
fi

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,974 @@
# Dify 实时评分 API Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** 新增 `POST /api/score` 端点,供 Dify 外部 Tool 调用,接受单条问答记录并同步返回 RAGAS 各指标得分。
**Architecture:** 新增 `inline_scorer.py` 服务层封装 RAGAS 打分逻辑，以 `(judge_model, embedding_model)` 为 key 缓存 LLM 客户端；新增 `webapp/api/score.py` 路由；`ScoreRequest`/`ScoreResponse` 放入 `webapp/models.py`；`SCORE_API_TOKEN` 加入 `EvaluationSettings`。
**Tech Stack:** Python 3.12, FastAPI, Pydantic v2, RAGAS 0.4.3, pytest
## Global Constraints
- Python 3.12+，遵循 PEP 8，4 空格缩进，类型注解必须
- contexts 用 `context_separator`(默认 `" |||| "`)拆分为 list[str]
- ground_truth 为可选;缺失时跳过 context_recall / factual_correctness / semantic_similarity / noise_sensitivity
- SCORE_API_TOKEN 为空时不鉴权(内网部署场景)
- 所有测试用 pytest，不依赖真实 LLM
---
## 文件清单
| 操作 | 文件 | 职责 |
|------|------|------|
| 新建 | `webapp/services/inline_scorer.py` | LLM 客户端缓存 + 单题打分 |
| 新建 | `webapp/api/score.py` | `/api/score` 路由 |
| 新建 | `tests/webapp/test_score_api.py` | 端点测试（全 mock） |
| 修改 | `webapp/models.py` | 新增 ScoreRequest / ScoreResponse |
| 修改 | `rag_eval/settings.py` | 新增 score_api_token 字段 |
| 修改 | `webapp/server.py` | 注册 score router，更新 OPENAPI_TAGS 和 description |
---
## Task 1: ScoreRequest / ScoreResponse 模型 + settings 字段
**Files:**
- Modify: `webapp/models.py`
- Modify: `rag_eval/settings.py`
- Test: `tests/webapp/test_score_api.py` (partial — model validation tests)
**Interfaces:**
- Produces:
- `ScoreRequest` Pydantic model（见下方字段）
- `ScoreResponse` Pydantic model
- `EvaluationSettings.score_api_token: str | None`
- [ ] **Step 1: Write failing model-validation tests**
Create `tests/webapp/test_score_api.py`:
```python
"""Tests for POST /api/score endpoint."""
from __future__ import annotations
import math
import pytest
from pydantic import ValidationError
from webapp.models import ScoreRequest, ScoreResponse
class TestScoreRequest:
def test_minimal_valid_request(self):
"""Only required fields — question, answer, contexts."""
req = ScoreRequest(
question="What is CT?",
answer="CT is imaging.",
contexts="CT uses X-rays.",
)
assert req.question == "What is CT?"
assert req.contexts == "CT uses X-rays."
assert req.ground_truth is None
assert req.context_separator == " |||| "
assert req.metrics == ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]
def test_contexts_split_by_separator(self):
"""contexts_as_list() splits on context_separator."""
req = ScoreRequest(
question="q", answer="a",
contexts="ctx1 |||| ctx2 |||| ctx3",
context_separator=" |||| ",
)
assert req.contexts_as_list() == ["ctx1", "ctx2", "ctx3"]
def test_contexts_split_custom_separator(self):
req = ScoreRequest(
question="q", answer="a",
contexts="a---b---c",
context_separator="---",
)
assert req.contexts_as_list() == ["a", "b", "c"]
def test_contexts_split_single_item(self):
req = ScoreRequest(question="q", answer="a", contexts="only one")
assert req.contexts_as_list() == ["only one"]
def test_missing_question_raises(self):
with pytest.raises(ValidationError):
ScoreRequest(answer="a", contexts="c") # type: ignore[call-arg]
def test_missing_answer_raises(self):
with pytest.raises(ValidationError):
ScoreRequest(question="q", contexts="c") # type: ignore[call-arg]
def test_missing_contexts_raises(self):
with pytest.raises(ValidationError):
ScoreRequest(question="q", answer="a") # type: ignore[call-arg]
def test_custom_metrics_accepted(self):
req = ScoreRequest(
question="q", answer="a", contexts="c",
metrics=["faithfulness"],
)
assert req.metrics == ["faithfulness"]
def test_invalid_metric_name_raises(self):
with pytest.raises(ValidationError):
ScoreRequest(question="q", answer="a", contexts="c", metrics=["not_a_metric"])
def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
"""Without ground_truth, GT-dependent metrics are excluded."""
req = ScoreRequest(
question="q", answer="a", contexts="c",
metrics=["faithfulness", "context_recall", "factual_correctness", "semantic_similarity", "noise_sensitivity"],
)
effective = req.effective_metrics()
assert "faithfulness" in effective
assert "context_recall" not in effective
assert "factual_correctness" not in effective
assert "semantic_similarity" not in effective
assert "noise_sensitivity" not in effective
def test_effective_metrics_keeps_all_when_ground_truth_present(self):
req = ScoreRequest(
question="q", answer="a", contexts="c", ground_truth="gt",
metrics=["faithfulness", "context_recall", "factual_correctness"],
)
effective = req.effective_metrics()
assert effective == ["faithfulness", "context_recall", "factual_correctness"]
class TestScoreResponse:
def test_score_response_structure(self):
resp = ScoreResponse(
scores={"faithfulness": 0.85, "answer_relevancy": None},
weighted_score=0.85,
latency_ms=1200,
)
assert resp.scores["faithfulness"] == 0.85
assert resp.scores["answer_relevancy"] is None
assert resp.latency_ms == 1200
```
- [ ] **Step 2: Run to verify FAIL**
```
cd C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas
python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
```
Expected: `ImportError: cannot import name 'ScoreRequest' from 'webapp.models'`
- [ ] **Step 3: Add ScoreRequest and ScoreResponse to `webapp/models.py`**
Append to the end of `webapp/models.py` (after `PipelineJobResponse`):
```python
# ---------------------------------------------------------------------------
# Dify 实时评分 API 模型
# ---------------------------------------------------------------------------
# Metrics that can only be computed when a ground_truth reference is supplied;
# ScoreRequest.effective_metrics() drops these when ground_truth is absent.
_GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
"context_recall",
"factual_correctness",
"semantic_similarity",
"noise_sensitivity",
})
# Every metric name accepted in a request; ScoreRequest's "metrics" validator
# rejects any name outside this set.
_VALID_METRICS: frozenset[str] = frozenset({
"faithfulness",
"answer_relevancy",
"context_recall",
"context_precision",
"noise_sensitivity",
"factual_correctness",
"semantic_similarity",
})
# Metrics scored when a request does not specify "metrics" (copied per request
# by the field's default_factory).
_DEFAULT_SCORE_METRICS: list[str] = [
"faithfulness",
"answer_relevancy",
"context_recall",
"context_precision",
]
class ScoreRequest(BaseModel):
    """Request body for the real-time single-sample scoring endpoint."""

    # The class docstring above is left untouched: Pydantic publishes a model's
    # docstring as the description of its JSON schema.
    #
    # NOTE(review): the entry below uses the OpenAPI ``{"summary", "value"}``
    # example-object shape. JSON-Schema ``examples`` entries are normally the raw
    # example payload itself — confirm this renders as intended in Swagger UI.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "summary": "基础评分请求",
                    "value": {
                        "question": "双源CT的时间分辨率是多少？",
                        "answer": "双源CT的单扇区时间分辨率为75ms。",
                        "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
                        "ground_truth": "双源CT单扇区时间分辨率为75ms需旋转135度。",
                        "context_separator": " |||| ",
                        "metrics": ["faithfulness", "answer_relevancy", "context_recall", "context_precision"],
                        "judge_model": "deepseek-v4-flash",
                        "embedding_model": "text-embedding-v3",
                    },
                }
            ]
        }
    )

    question: str = Field(description="问题文本。")
    answer: str = Field(description="待评分的回答。")
    # All retrieved passages travel as ONE string; contexts_as_list() splits it.
    contexts: str = Field(
        description="检索上下文字符串，多段之间用 context_separator 拼接。"
    )
    ground_truth: str | None = Field(
        default=None,
        description="标准参考答案（可选）。缺失时自动跳过需要它的指标。",
    )
    context_separator: str = Field(
        default=" |||| ",
        description="contexts 字段中段落分隔符，默认为四个竖线两侧各一空格。",
    )
    metrics: list[str] = Field(
        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
        description="需要计算的 RAGAS 指标列表。",
    )
    judge_model: str | None = Field(
        default=None,
        description="Judge LLM 模型名称；为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
    )
    embedding_model: str | None = Field(
        default=None,
        description="Embedding 模型名称；为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
    )

    @field_validator("metrics")
    @classmethod
    def validate_metric_names(cls, value: list[str]) -> list[str]:
        """Reject any metric name not in the supported registry.

        Raises:
            ValueError: if a name is not in ``_VALID_METRICS`` or the list is empty.
        """
        invalid = [m for m in value if m not in _VALID_METRICS]
        if invalid:
            raise ValueError(
                f"不支持的指标名称：{invalid}。"
                f"合法值：{sorted(_VALID_METRICS)}"
            )
        if not value:
            raise ValueError("metrics 不能为空列表。")
        return value

    def contexts_as_list(self) -> list[str]:
        """Split the contexts string into a list of non-empty fragments.

        Each fragment is stripped of surrounding whitespace. An empty
        ``context_separator`` falls back to the default ``" |||| "`` because
        ``str.split("")`` would raise.
        """
        sep = self.context_separator or " |||| "
        return [s.strip() for s in self.contexts.split(sep) if s.strip()]

    def effective_metrics(self) -> list[str]:
        """Return the requested metrics that can actually be scored.

        When ``ground_truth`` is absent, the metrics listed in
        ``_GT_DEPENDENT_METRICS`` are removed; request order is preserved.
        A ``ground_truth`` that is empty or whitespace-only counts as absent:
        callers that send ``""`` for an unset optional field would otherwise
        have reference-dependent metrics scored against an empty reference.
        """
        if self.ground_truth is not None and self.ground_truth.strip():
            return list(self.metrics)
        return [m for m in self.metrics if m not in _GT_DEPENDENT_METRICS]
class ScoreResponse(BaseModel):
    """Response payload for the real-time scoring endpoint."""

    # Per-metric result; ``None`` marks a score that could not be produced.
    scores: dict[str, float | None] = Field(
        description="各指标得分NaN 或计算失败时为 null。",
    )

    # Single aggregate figure over the non-null scores, when there is one.
    weighted_score: float | None = Field(
        description="等权加权综合得分（仅对非 null 指标求均值）。",
        default=None,
    )

    # Wall-clock time spent scoring on the server, in milliseconds.
    latency_ms: int = Field(description="服务端打分耗时（毫秒）。")

    # Requested metrics that were not run because no ground_truth was supplied.
    skipped_metrics: list[str] = Field(
        description="因缺少 ground_truth 而跳过的指标名称列表。",
        default_factory=list,
    )

    # Populated only when scoring raised; the endpoint still answers HTTP 200.
    error: str | None = Field(
        description="打分异常时的错误信息HTTP 200 仍返回scores 为空）。",
        default=None,
    )
```
Also add `field_validator` to the import line at the top of `webapp/models.py`:
```python
from pydantic import BaseModel, ConfigDict, Field, field_validator
```
- [ ] **Step 4: Add `score_api_token` to `rag_eval/settings.py`**
Add after the `dataset_generator_model` field:
```python
score_api_token: str | None = Field(
default=None,
alias="SCORE_API_TOKEN",
description="Bearer token for /api/score endpoint. Empty = no auth.",
)
```
- [ ] **Step 5: Run to verify PASS**
```
python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
```
Expected: all 12 tests PASS.
- [ ] **Step 6: Commit**
```
git add webapp/models.py rag_eval/settings.py tests/webapp/test_score_api.py
git commit -m "feat: add ScoreRequest/ScoreResponse models and SCORE_API_TOKEN setting"
```
---
## Task 2: InlineScorer 服务（LLM 缓存 + 打分）
**Files:**
- Create: `webapp/services/inline_scorer.py`
**Interfaces:**
- Consumes:
- `build_models(judge_model, embedding_model, settings) -> tuple[Any, Any]` from `rag_eval.metrics.factory`
- `MetricPipeline(metrics, metric_timeout_seconds)` from `rag_eval.metrics.pipeline`
- `NormalizedSample` from `rag_eval.shared.models`
- `compute_weighted_score(scores, metric_weights) -> float | None` from `rag_eval.metrics.weights`
- `EvaluationSettings` from `rag_eval.settings`
- Produces:
- `inline_scorer: InlineScorer` (module-level singleton)
- `InlineScorer.score(question, answer, contexts, ground_truth, metrics, judge_model, embedding_model, settings) -> dict[str, float | None]`
- [ ] **Step 1: Write failing test**
Add to `tests/webapp/test_score_api.py`:
```python
class TestInlineScorer:
def test_score_returns_dict_with_requested_metrics(self):
"""InlineScorer.score returns a dict keyed by the requested metrics."""
from unittest.mock import AsyncMock, MagicMock, patch
from webapp.services.inline_scorer import InlineScorer
from rag_eval.settings import EvaluationSettings
mock_score = MagicMock()
mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8}
mock_score.error = ""
mock_pipeline = MagicMock()
mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
scorer = InlineScorer()
result = scorer.score(
question="q", answer="a",
contexts=["ctx1"],
ground_truth=None,
metrics=["faithfulness", "answer_relevancy"],
judge_model="test-model",
embedding_model="test-embed",
settings=EvaluationSettings(_env_file=None),
)
assert "faithfulness" in result
assert "answer_relevancy" in result
assert result["faithfulness"] == pytest.approx(0.9)
def test_score_converts_nan_to_none(self):
"""NaN scores are converted to None in the returned dict."""
import math
from unittest.mock import AsyncMock, MagicMock, patch
from webapp.services.inline_scorer import InlineScorer
from rag_eval.settings import EvaluationSettings
mock_score = MagicMock()
mock_score.metrics = {"faithfulness": float("nan")}
mock_score.error = ""
mock_pipeline = MagicMock()
mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
scorer = InlineScorer()
result = scorer.score(
question="q", answer="a", contexts=["c"],
ground_truth=None,
metrics=["faithfulness"],
judge_model="m", embedding_model="e",
settings=EvaluationSettings(_env_file=None),
)
assert result["faithfulness"] is None
```
- [ ] **Step 2: Run to verify FAIL**
```
python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
```
Expected: `ModuleNotFoundError: No module named 'webapp.services.inline_scorer'`
- [ ] **Step 3: Create `webapp/services/inline_scorer.py`**
```python
"""LLM-cached inline RAGAS scorer for the real-time /api/score endpoint.
A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by
(judge_model, embedding_model), so repeated Dify Tool calls with the same
models reuse existing AsyncOpenAI connections instead of creating new ones.
"""
from __future__ import annotations
import asyncio
import math
import threading
from typing import Any
from rag_eval.compat import ensure_ragas_import_compat
from rag_eval.metrics.factory import build_models
from rag_eval.metrics.pipeline import MetricPipeline
from rag_eval.metrics.weights import compute_weighted_score
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import NormalizedSample
ensure_ragas_import_compat()
from ragas.metrics.collections import ( # noqa: E402
AnswerRelevancy,
ContextPrecision,
ContextRecall,
FactualCorrectness,
Faithfulness,
NoiseSensitivity,
SemanticSimilarity,
)
def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]:
    """Instantiate only the RAGAS metric objects requested.

    Args:
        metrics: Metric names to build, in the order they should appear.
        llm: Judge LLM client handed to the LLM-based metrics.
        embeddings: Embedding client handed to the embedding-based metrics.

    Returns:
        A dict mapping each recognised name in ``metrics`` to a freshly built
        metric object, in request order. Unknown names are skipped and a name
        listed more than once is built only once.
    """
    # Factories, not instances: a metric class is constructed only when its name
    # is actually requested. Building all seven up front would construct objects
    # that are immediately discarded on every scoring call.
    factories: dict[str, Any] = {
        "faithfulness": lambda: Faithfulness(llm=llm),
        "answer_relevancy": lambda: AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": lambda: ContextRecall(llm=llm),
        "context_precision": lambda: ContextPrecision(llm=llm),
        "noise_sensitivity": lambda: NoiseSensitivity(llm=llm),
        "factual_correctness": lambda: FactualCorrectness(llm=llm),
        "semantic_similarity": lambda: SemanticSimilarity(embeddings=embeddings),
    }
    instances: dict[str, Any] = {}
    for name in metrics:
        factory = factories.get(name)
        if factory is not None and name not in instances:
            instances[name] = factory()
    return instances
class InlineScorer:
    """Thread-safe single-sample RAGAS scorer with LLM client caching.

    Model clients are built once per ``(judge_model, embedding_model)`` pair and
    reused by later calls; a lock serialises access to the cache.
    """

    def __init__(self) -> None:
        # Cache keyed by (judge_model, embedding_model) -> (llm, embeddings).
        self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
        self._lock = threading.Lock()

    def _get_models(
        self,
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> tuple[Any, Any]:
        """Return cached LLM/embedding clients, building them on first use.

        ``settings`` is consulted only when the pair is built for the first
        time; it is not part of the cache key, so later calls with different
        settings still receive the originally built clients.
        """
        cache_key = (judge_model, embedding_model)
        # build_models runs while the lock is held, so concurrent first calls
        # for the same pair build the clients exactly once.
        with self._lock:
            if cache_key not in self._model_cache:
                llm, embeddings = build_models(judge_model, embedding_model, settings)
                self._model_cache[cache_key] = (llm, embeddings)
            return self._model_cache[cache_key]

    @staticmethod
    def _normalize_score(value: Any) -> float | None:
        """Map one raw metric value to a JSON-safe score.

        Returns ``None`` for ``None``, for values that cannot be converted to
        ``float``, and for NaN / infinity; otherwise the value rounded to four
        decimals as a plain ``float``.
        """
        try:
            number = float(value)
        except (TypeError, ValueError):
            # math.isnan() would raise TypeError on None or other non-numeric
            # values; treat those as "no score" instead of failing the request.
            return None
        if not math.isfinite(number):
            return None
        return round(number, 4)

    def score(
        self,
        question: str,
        answer: str,
        contexts: list[str],
        ground_truth: str | None,
        metrics: list[str],
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> dict[str, float | None]:
        """Score one sample synchronously and return {metric_name: score | None}.

        NaN, infinite, missing or non-numeric values are returned as ``None``
        so the result serialises cleanly to JSON.

        Args:
            question: Question text of the sample.
            answer: Answer to be scored.
            contexts: Retrieved context passages, already split into a list.
            ground_truth: Reference answer; ``None`` is passed on as ``""``.
            metrics: Names of the metrics to compute.
            judge_model: Judge LLM name (part of the client cache key).
            embedding_model: Embedding model name (part of the client cache key).
            settings: Supplies ``ragas_metric_timeout_seconds`` and is forwarded
                to ``build_models`` on a cache miss.

        Raises:
            RuntimeError: from ``asyncio.run`` when invoked on a thread that
                already has a running event loop.
        """
        llm, embeddings = self._get_models(judge_model, embedding_model, settings)
        metric_instances = _build_metric_instances(metrics, llm, embeddings)
        pipeline = MetricPipeline(
            metrics=metric_instances,
            metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
        )
        sample = NormalizedSample(
            sample_id="inline-score",
            question=question,
            answer=answer,
            contexts=contexts,
            ground_truth=ground_truth or "",
        )
        # NOTE(review): every call runs on a fresh event loop while the model
        # clients are cached across calls — confirm the cached clients tolerate
        # being reused from a different event loop.
        metric_score = asyncio.run(pipeline.score_sample(sample))
        return {
            name: self._normalize_score(value)
            for name, value in metric_score.metrics.items()
        }
# Module-level singleton shared by FastAPI routes.
inline_scorer = InlineScorer()
```
- [ ] **Step 4: Run to verify PASS**
```
python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
```
Expected: both tests PASS.
- [ ] **Step 5: Commit**
```
git add webapp/services/inline_scorer.py tests/webapp/test_score_api.py
git commit -m "feat: add InlineScorer service with LLM client caching"
```
---
## Task 3: `/api/score` 路由 + 鉴权 + 集成测试
**Files:**
- Create: `webapp/api/score.py`
- Modify: `webapp/server.py`
**Interfaces:**
- Consumes:
- `ScoreRequest`, `ScoreResponse` from `webapp.models`
- `inline_scorer: InlineScorer` from `webapp.services.inline_scorer`
- `EvaluationSettings` from `rag_eval.settings`
- `compute_weighted_score(scores, {}) -> float | None` from `rag_eval.metrics.weights`
- Produces: `POST /api/score` endpoint
- [ ] **Step 1: Write failing endpoint tests**
Add to `tests/webapp/test_score_api.py`:
```python
# ── Fixtures ─────────────────────────────────────────────────────────────────
import pytest
from fastapi.testclient import TestClient
from unittest.mock import MagicMock, patch
@pytest.fixture()
def client(monkeypatch):
"""TestClient with mocked InlineScorer."""
import webapp.api.score as score_mod
mock_scorer = MagicMock()
mock_scorer.score.return_value = {
"faithfulness": 0.85,
"answer_relevancy": 0.90,
}
monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
from webapp.server import create_app
return TestClient(create_app())
class TestScoreEndpoint:
def test_post_score_returns_200(self, client):
resp = client.post("/api/score", json={
"question": "What is CT?",
"answer": "CT is imaging.",
"contexts": "CT uses X-rays.",
})
assert resp.status_code == 200
data = resp.json()
assert "scores" in data
assert "latency_ms" in data
assert data["scores"]["faithfulness"] == pytest.approx(0.85)
def test_weighted_score_computed(self, client):
resp = client.post("/api/score", json={
"question": "q", "answer": "a", "contexts": "c",
})
assert resp.status_code == 200
data = resp.json()
# weighted_score is the mean of all non-null scores
assert data["weighted_score"] is not None
def test_missing_required_fields_returns_422(self, client):
resp = client.post("/api/score", json={"question": "q"})
assert resp.status_code == 422
def test_invalid_metric_name_returns_422(self, client):
resp = client.post("/api/score", json={
"question": "q", "answer": "a", "contexts": "c",
"metrics": ["not_a_metric"],
})
assert resp.status_code == 422
def test_skipped_metrics_returned_when_no_ground_truth(self, client):
resp = client.post("/api/score", json={
"question": "q", "answer": "a", "contexts": "c",
"metrics": ["faithfulness", "context_recall"],
})
assert resp.status_code == 200
data = resp.json()
assert "context_recall" in data["skipped_metrics"]
def test_contexts_split_on_separator(self, client, monkeypatch):
"""contexts string is split before passing to scorer."""
import webapp.api.score as score_mod
calls = []
def capture(*args, **kwargs):
calls.append(kwargs.get("contexts", []))
return {"faithfulness": 0.9}
monkeypatch.setattr(score_mod.inline_scorer, "score", capture)
client.post("/api/score", json={
"question": "q", "answer": "a",
"contexts": "ctx1 |||| ctx2",
"context_separator": " |||| ",
})
assert calls[0] == ["ctx1", "ctx2"]
def test_bearer_token_auth_required_when_configured(self, monkeypatch):
"""When SCORE_API_TOKEN is set, requests without token get 401."""
import webapp.api.score as score_mod
from rag_eval.settings import EvaluationSettings
mock_settings = EvaluationSettings(_env_file=None)
object.__setattr__(mock_settings, "score_api_token", "secret-token")
monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
mock_scorer = MagicMock()
mock_scorer.score.return_value = {"faithfulness": 0.9}
monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
from webapp.server import create_app
test_client = TestClient(create_app())
# No auth header → 401
resp = test_client.post("/api/score", json={
"question": "q", "answer": "a", "contexts": "c",
})
assert resp.status_code == 401
# Correct token → 200
resp = test_client.post("/api/score",
json={"question": "q", "answer": "a", "contexts": "c"},
headers={"Authorization": "Bearer secret-token"},
)
assert resp.status_code == 200
def test_wrong_bearer_token_returns_401(self, monkeypatch):
import webapp.api.score as score_mod
from rag_eval.settings import EvaluationSettings
mock_settings = EvaluationSettings(_env_file=None)
object.__setattr__(mock_settings, "score_api_token", "correct-token")
monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
mock_scorer = MagicMock()
mock_scorer.score.return_value = {}
monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
from webapp.server import create_app
test_client = TestClient(create_app())
resp = test_client.post("/api/score",
json={"question": "q", "answer": "a", "contexts": "c"},
headers={"Authorization": "Bearer wrong-token"},
)
assert resp.status_code == 401
```
- [ ] **Step 2: Run to verify FAIL**
```
python -m pytest tests/webapp/test_score_api.py::TestScoreEndpoint -v
```
Expected: `ModuleNotFoundError: No module named 'webapp.api.score'`
- [ ] **Step 3: Create `webapp/api/score.py`**
```python
"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
from __future__ import annotations
import time
from fastapi import APIRouter, Header, HTTPException
from typing import Annotated
from rag_eval.metrics.weights import compute_weighted_score
from rag_eval.settings import EvaluationSettings
from webapp.models import ScoreRequest, ScoreResponse
from webapp.services.inline_scorer import inline_scorer
router = APIRouter(prefix="/api/score", tags=["score"])
def _get_settings() -> EvaluationSettings:
    """Build and return a new ``EvaluationSettings`` object on every call.

    Kept as a separate module-level function so tests can replace it.
    """
    fresh_settings = EvaluationSettings()
    return fresh_settings
def _check_auth(authorization: str | None, token: str) -> None:
    """Raise 401 if Bearer token does not match the configured token.

    Args:
        authorization: Raw ``Authorization`` header value, or ``None`` when the
            header was not sent.
        token: The configured secret the presented token must equal.

    Raises:
        HTTPException: status 401 when the header is missing, does not use the
            ``Bearer`` scheme (matched case-insensitively), or carries a
            different token.
    """
    import hmac  # function-scope import keeps this block self-contained

    if authorization is None:
        raise HTTPException(status_code=401, detail="Missing Authorization header.")
    # "Bearer <token>": split on the first space and tolerate surrounding
    # whitespace around the token.
    scheme, _, presented = authorization.strip().partition(" ")
    presented = presented.strip()
    # Constant-time comparison, so response timing does not reveal how many
    # leading characters of a guessed token are correct. Both sides are encoded
    # to bytes because compare_digest rejects non-ASCII str arguments.
    token_matches = hmac.compare_digest(presented.encode("utf-8"), token.encode("utf-8"))
    if scheme.lower() != "bearer" or not token_matches:
        raise HTTPException(status_code=401, detail="Invalid Bearer token.")
@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分Dify 外部 Tool",
    responses={
        200: {"description": "各指标得分和加权综合得分。"},
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败。"},
    },
)
def score_sample(
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
) -> ScoreResponse:
    """接受单条问答记录，同步运行 RAGAS 指标打分，实时返回各指标得分。
    供 Dify 外部 Tool 调用。将 `contexts` 字段按 `context_separator` 拆分后传入
    RAGAS 管道；`ground_truth` 缺失时自动跳过依赖它的指标。
    """
    # The docstring above is kept verbatim: FastAPI publishes a route function's
    # docstring as the operation description in the generated OpenAPI document.
    #
    # Flow: optional Bearer auth -> work out which metrics can run -> score ->
    # merge results. A scoring failure is reported in the ``error`` field of an
    # otherwise normal response rather than as an HTTP error.
    cfg = _get_settings()

    # Authentication is enforced only when a token has been configured.
    expected_token = cfg.score_api_token
    if expected_token:
        _check_auth(authorization, expected_token)

    # Per-request model names win; the configured models are the fallback.
    judge_model = request.judge_model or cfg.ragas_judge_model
    embedding_model = request.embedding_model or cfg.ragas_embedding_model

    runnable = request.effective_metrics()
    skipped = sorted(set(request.metrics) - set(runnable))

    if not runnable:
        # Every requested metric needs a ground_truth that was not supplied.
        return ScoreResponse(
            scores=dict.fromkeys(request.metrics),
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=skipped,
        )

    started = time.monotonic()

    def _elapsed_ms() -> int:
        """Milliseconds elapsed since scoring started, truncated to an int."""
        return int((time.monotonic() - started) * 1000)

    try:
        measured = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=runnable,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=cfg,
        )
    except Exception as exc:  # noqa: BLE001
        return ScoreResponse(
            scores={},
            weighted_score=None,
            latency_ms=_elapsed_ms(),
            skipped_metrics=skipped,
            error=f"{type(exc).__name__}: {exc}",
        )
    elapsed = _elapsed_ms()

    # Start from "every requested metric is null", then overlay what was
    # measured, so skipped metrics still appear in the response as null.
    merged: dict[str, float | None] = dict.fromkeys(request.metrics)
    merged.update(measured)

    # Aggregate over the non-null measured scores, with no per-metric weights.
    usable = {name: value for name, value in measured.items() if value is not None}
    overall = compute_weighted_score(usable, {})
    overall_rounded = None if overall is None else round(overall, 4)

    return ScoreResponse(
        scores=merged,
        weighted_score=overall_rounded,
        latency_ms=elapsed,
        skipped_metrics=skipped,
    )
```
- [ ] **Step 4: Register router in `webapp/server.py`**
Add `score` to the import line:
```python
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
```
Add the router registration after `pipeline.router`:
```python
app.include_router(score.router)
```
Add `"score"` tag to `OPENAPI_TAGS` list (insert before `"meta"`):
```python
{
"name": "score",
"description": (
"**实时评分 APIDify 外部 Tool**\n\n"
"接受单条问答记录 `(question, answer, contexts, ground_truth)`\n"
"同步运行 RAGAS 指标打分,返回各指标得分和加权综合得分。\n\n"
"适用场景Dify Agent 在回答后即时调用,用于质量监控或自我改进。\n\n"
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
"`Authorization: Bearer <token>` 请求头。"
),
},
```
Also update the `description` field in `FastAPI(...)` to add a bullet:
```python
"- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
```
- [ ] **Step 5: Run to verify PASS**
```
python -m pytest tests/webapp/test_score_api.py -v
```
Expected: all tests PASS.
- [ ] **Step 6: Verify server boots and route appears**
```
python -c "
from webapp.server import create_app
app = create_app()
routes = [(r.path, list(getattr(r,'methods',[]))) for r in app.routes]
score_routes = [(p,m) for p,m in routes if 'score' in p]
print('Score routes:', score_routes)
"
```
Expected output:
```
Score routes: [('/api/score', ['POST'])]
```
- [ ] **Step 7: Commit**
```
git add webapp/api/score.py webapp/server.py tests/webapp/test_score_api.py
git commit -m "feat: add POST /api/score endpoint for Dify real-time scoring"
```
---
## Task 4: 全量回归 + `.env.example` 更新
**Files:**
- Modify: `.env.example`
- [ ] **Step 1: Add SCORE_API_TOKEN to `.env.example`**
Add this block after `DATASET_GENERATOR_MODEL=qwen3.6-plus`:
```
# ===== Dify 集成 — 实时评分 API =====
# 为 /api/score 端点设置 Bearer Token 鉴权(留空则不鉴权,适合内网部署)
# Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
SCORE_API_TOKEN=
```
- [ ] **Step 2: Run full test suite**
```
python -m pytest tests/ -v --tb=short
```
Pre-existing failures to ignore:
- `test_normalize_sample_pdf_offline_smoke_row` — 缺少 CSV fixture
- `test_evaluator_and_reporting_write_run_assets` — 预存在的断言不匹配
- `test_question_generator_rejects_invalid_json` — retry 循环吞掉了 ValueError
- `test_question_generator_rejects_non_list_samples` — 同上
**零新增失败**即为通过。
- [ ] **Step 3: Final commit**
```
git add .env.example
git commit -m "feat: Dify score API complete — add SCORE_API_TOKEN to .env.example
- POST /api/score: real-time RAGAS scoring for Dify external Tool
- ScoreRequest/ScoreResponse Pydantic models with full field docs
- InlineScorer with (judge_model, embedding_model) client cache
- Bearer token auth via SCORE_API_TOKEN env var (optional)
- contexts split by configurable separator (default ' |||| ')
- GT-dependent metrics auto-skipped when ground_truth absent
- Full test coverage (22 new tests)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>"
```
---
## Dify 侧配置参考
任务完成后,在 Dify 「工具」→「自定义工具」中填写如下 OpenAPI Schema:
```yaml
openapi: 3.1.0
info:
title: RAGAS 实时评分
version: 1.0.0
servers:
- url: http://<your-server>:8800
paths:
/api/score:
post:
operationId: scoreQA
summary: 对一条问答记录进行 RAGAS 评分
requestBody:
required: true
content:
application/json:
schema:
type: object
required: [question, answer, contexts]
properties:
question: { type: string }
answer: { type: string }
contexts: { type: string, description: "多段上下文用 ' |||| ' 拼接" }
ground_truth: { type: string }
metrics:
type: array
items: { type: string }
default: [faithfulness, answer_relevancy, context_recall, context_precision]
responses:
'200':
description: 评分结果
content:
application/json:
schema:
type: object
properties:
scores: { type: object }
weighted_score: { type: number }
latency_ms: { type: integer }
skipped_metrics: { type: array, items: { type: string } }
```

View File

@@ -0,0 +1,240 @@
# 指标权重 & 文档片段权重功能设计
**日期**: 2026-06-18
**状态**: 已批准,待实现
**范围**: 在「新建评估」运行评估时,支持为 RAGAS 指标和文档配置权重,计算加权综合得分并在报告中展示。
---
## 1. 目标
1. **指标权重(Metric Weights)**:允许为每个 RAGAS 指标配置浮点权重(如 `faithfulness: 0.35`),计算每道题的加权综合得分 `weighted_score`。
2. **文档权重(Doc Weights)**:允许为特定 PDF 文档名称配置权重(如 `"322_双源CT.pdf": 2.0`),该文档的题目在汇总指标均值时按权重放大贡献。
3. **前端覆盖**:在「新建评估」页面选中场景后,展示可编辑的权重面板,运行前可临时覆盖 YAML 中的权重。
4. **完全向后兼容**:两个字段均为可选,省略时退化为等权行为,现有场景 YAML 无需修改。
---
## 2. 数据模型
### 2.1 场景 YAML新增可选字段
```yaml
# 可选。缺省时所有指标权重 = 1.0
metric_weights:
faithfulness: 0.35
context_recall: 0.25
context_precision: 0.20
answer_relevancy: 0.20
# 可选。缺省时所有文档权重 = 1.0
doc_weights:
"322_双源CT成像技术.pdf": 2.0
"323_单源CT对比.pdf": 1.5
```
### 2.2 Pydantic Schema`rag_eval/config/schema.py`
`ScenarioModel` 新增:
```python
metric_weights: dict[str, float] = Field(default_factory=dict)
doc_weights: dict[str, float] = Field(default_factory=dict)
```
`ConfigDict(extra="ignore")` 不变,新字段不影响既有 YAML 的加载。
### 2.3 内部 Scenario dataclass`rag_eval/shared/models.py`
`Scenario` 新增:
```python
metric_weights: dict[str, float] = field(default_factory=dict)
doc_weights: dict[str, float] = field(default_factory=dict)
```
`scenario.snapshot()` 序列化,供 `run_reader` / 报告层读取。
---
## 3. 后端:权重计算逻辑
### 3.1 新模块 `rag_eval/metrics/weights.py`
纯函数模块,无外部依赖,独立可测:
```python
def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float:
"""返回 key 对应的权重,缺失时返回 default。"""
def compute_weighted_score(
scores: dict[str, float | None],
metric_weights: dict[str, float],
) -> float | None:
"""
给定各指标得分和权重,返回加权综合得分。
- 忽略 NaN / None 值
- metric_weights 为空时退化为等权均值
- 全部 NaN 时返回 None
公式: Σ(w_i * s_i) / Σ(w_i),只对非 NaN 项求和
"""
def weighted_metric_means(
score_rows: list[dict],
metrics: list[str],
doc_weights: dict[str, float],
) -> dict[str, float | None]:
"""
对每个指标计算文档加权均值。
- sample_weight = doc_weights.get(row["doc_name"], 1.0)
- 公式: Σ(sample_weight_j * score_m_j) / Σ(sample_weight_j)
- doc_weights 为空时退化为普通算术均值
"""
```
### 3.2 评估器(`rag_eval/execution/evaluator.py`
`_merge_score()` 新增两列:
```python
record["weighted_score"] = compute_weighted_score(
score.metrics, self.scenario.metric_weights
)
record["sample_weight"] = self.scenario.doc_weights.get(
sample.metadata.get("doc_name", ""), 1.0
)
```
`scores.csv` 新增 `weighted_score`、`sample_weight` 两列。
### 3.3 报告摘要(`rag_eval/reporting/summary.py`
`build_summary_markdown()` 改用 `weighted_metric_means()` 计算各指标均值;
新增 `weighted_score` 整体均值行:
```
## Metric Means加权
- faithfulness: 0.8123 (w=0.35)
- context_recall: 0.7654 (w=0.25)
- context_precision: 0.7200 (w=0.20)
- answer_relevancy: 0.7400 (w=0.20)
- **weighted_score: 0.7789**
```
---
## 4. yaml_patcher 扩展(`webapp/services/yaml_patcher.py`
`apply_profiles_to_scenario()` 扩展签名,新增可选参数:
```python
def apply_profiles_to_scenario(
scenario_path: str,
judge_profile: LLMProfile | None,
answer_profile: LLMProfile | None,
dataset_profile: LLMProfile | None,
metric_weights: dict[str, float] | None = None, # 新增
doc_weights: dict[str, float] | None = None, # 新增
_resolve_absolute: bool = False,
) -> list[str]:
```
- `metric_weights` 非 None 时写入 `data["metric_weights"]`,追加 `"metric_weights"` 到 patched 列表
- `doc_weights` 非 None 时写入 `data["doc_weights"]`,追加 `"doc_weights"` 到 patched 列表
---
## 5. Webapp 模型与 API 扩展
### 5.1 `webapp/models.py`
`ProfileApplyRequest` 新增:
```python
metric_weights: dict[str, float] | None = None
doc_weights: dict[str, float] | None = None
```
`ProfileApplyResponse` 不变(`patched_fields` 已包含新字段名)。
### 5.2 `webapp/api/llm_profiles.py` — `apply_profiles()`
透传 `metric_weights` / `doc_weights` 到 `apply_profiles_to_scenario()`。
---
## 6. 前端:权重配置面板
### 6.1 HTML`index.html`
`#llm-assignment-panel` 下方新增 `#weight-config-panel`(选中场景后显示):
```
┌─────────────────────────────────────────────┐
│ 权重配置 (可选,留空使用场景原始配置) │
├─────────────────────────────────────────────┤
│ 指标权重 │
│ faithfulness [____1.0____] │
│ context_recall [____1.0____] │
│ ...(根据选中场景的 metrics 动态生成) │
│ │
│ 文档权重doc_weights
│ [doc名称_______________] [权重__] [] [✕] │
│ [doc名称_______________] [权重__] [] [✕] │
添加文档权重规则 │
└─────────────────────────────────────────────┘
```
### 6.2 `runner.js`
- `renderScenarioItem()` 选中后调用 `Runner._renderWeightPanel(sc)` 动态生成指标行
- `_applyProfilesIfNeeded()` 同时读取权重输入,追加到 `apply` 请求 body
- `Runner._collectWeights()` 收集 metric_weights / doc_weights全部为 1.0 时不发送(跳过)
### 6.3 CSS`app.css`
新增 `.weight-config-panel`、`.weight-row`、`.weight-input` 样式,与现有 `.llm-role-row` 风格一致。
---
## 7. 报告展示(`webapp/services/report_builder.py`
- `RunSummary.metric_means` 改用 `weighted_metric_means()` 计算(需从 `scenario.snapshot.yaml` 读取 `doc_weights` / `metric_weights`
- `RunSummary` 新增 `weighted_score_mean: float | None` 字段
- 前端 `report.js` 的指标卡片区新增「综合加权得分」卡片,使用 `good/warn/bad` 配色
---
## 8. 测试计划
| 测试文件 | 覆盖内容 |
|----------|---------|
| `tests/test_weights.py` | `compute_weighted_score` / `weighted_metric_means` 纯函数,含 NaN 边界、空权重、全 NaN |
| `tests/test_dataset_build.py` | 无改动(隔离良好) |
| `tests/test_offline_eval.py` | `_merge_score` 新增 weighted_score / sample_weight 列断言 |
| `tests/webapp/test_llm_profiles_api.py` | `apply_profiles` 带 metric_weights / doc_weights 的 patching 测试 |
---
## 9. 改动文件清单
| 文件 | 改动类型 |
|------|---------|
| `rag_eval/config/schema.py` | 新增字段 |
| `rag_eval/shared/models.py` | 新增字段 |
| `rag_eval/config/loader.py` | 透传新字段到 Scenario |
| `rag_eval/metrics/weights.py` | **新建** |
| `rag_eval/execution/evaluator.py` | `_merge_score` 新增两列 |
| `rag_eval/reporting/summary.py` | 改用加权均值 |
| `webapp/services/yaml_patcher.py` | 新增 metric_weights / doc_weights 参数 |
| `webapp/models.py` | ProfileApplyRequest 新增字段RunSummary 新增 weighted_score_mean |
| `webapp/api/llm_profiles.py` | 透传新参数 |
| `webapp/services/report_builder.py` | 加权均值计算 |
| `webapp/static/index.html` | 新增权重配置面板 |
| `webapp/static/js/runner.js` | 权重面板逻辑 |
| `webapp/static/css/app.css` | 新增权重面板样式 |
| `tests/test_weights.py` | **新建** |
---
## 10. 向后兼容保证
- `metric_weights: {}` + `doc_weights: {}` → 所有权重 = 1.0,行为与当前完全一致
- 现有场景 YAML 不含这两个字段 → Pydantic `default_factory=dict` 填充空字典
- `scores.csv` 新增两列不影响现有报告读取(`run_reader` 只读已知列)

View File

@@ -0,0 +1,138 @@
# Dify 集成 — 单题实时评分 API 设计
**日期**: 2026-06-22
**状态**: 已批准,待实现
**范围**: 在现有 FastAPI 服务中新增 `POST /api/score` 端点,供 Dify 外部 Tool 调用,实现单条问答记录的实时 RAGAS 指标评分。
---
## 1. 目标
让 Dify Agent 能在回答完问题后,将 `(question, answer, contexts, ground_truth)` 发给 siemens_ragas 服务,实时获取各 RAGAS 指标得分,用于质量监控或 Agent 自我改进。
---
## 2. API 规范
### `POST /api/score`
**请求体:**
```json
{
"question": "双源CT的时间分辨率是多少?",
"answer": "双源CT的单扇区时间分辨率为75ms。",
"contexts": "片段1双源CT采用两套管-探测器系统... |||| 片段2单扇区采集旋转135度...",
"ground_truth": "双源CT单扇区时间分辨率为75ms需旋转135度。",
"context_separator": " |||| ",
"metrics": ["faithfulness", "answer_relevancy"],
"judge_model": "deepseek-v4-flash",
"embedding_model": "text-embedding-v3"
}
```
**字段说明:**
| 字段 | 类型 | 必填 | 说明 |
|------|------|------|------|
| `question` | str | ✅ | 问题文本 |
| `answer` | str | ✅ | 待评分的回答 |
| `contexts` | str | ✅ | 检索到的上下文,多段用 `context_separator` 拼接 |
| `ground_truth` | str | ❌ | 标准答案;缺失时跳过依赖它的指标(context_recall、factual_correctness、semantic_similarity、noise_sensitivity) |
| `context_separator` | str | ❌ | 默认 `" \|\|\|\| "`(四个竖线,两侧各一空格) |
| `metrics` | list[str] | ❌ | 默认 `["faithfulness", "answer_relevancy", "context_recall", "context_precision"]` |
| `judge_model` | str | ❌ | 默认读 `.env` 中的 `RAGAS_JUDGE_MODEL` |
| `embedding_model` | str | ❌ | 默认读 `.env` 中的 `RAGAS_EMBEDDING_MODEL` |
**响应体200 OK**
```json
{
"scores": {
"faithfulness": 0.8750,
"answer_relevancy": 0.9200
},
"weighted_score": 0.8975,
"latency_ms": 3420
}
```
**错误响应:**
| 状态码 | 场景 |
|--------|------|
| 400 | 必填字段缺失、metrics 名称不合法 |
| 401 | 配置了 `SCORE_API_TOKEN` 但请求未携带有效 Bearer Token |
| 422 | 请求体 JSON 格式错误Pydantic 校验) |
| 500 | RAGAS 内部评分异常,附带 error 字段 |
**鉴权(可选):**
若 `.env` 中 `SCORE_API_TOKEN` 非空,则要求请求头携带 `Authorization: Bearer <token>`。为空则不鉴权(内网部署场景)。
---
## 3. 架构与文件改动
### 新文件
| 文件 | 职责 |
|------|------|
| `webapp/api/score.py` | 路由定义,请求验证,调用 InlineScorer |
| `webapp/services/inline_scorer.py` | LLM 客户端缓存 + RAGAS 评分逻辑封装 |
### 修改文件
| 文件 | 改动 |
|------|------|
| `webapp/models.py` | 新增 `ScoreRequest``ScoreResponse` |
| `webapp/server.py` | 注册 `score.router`,更新 `openapi_tags` |
| `rag_eval/settings.py` | 新增 `score_api_token: str | None` 字段 |
---
## 4. `inline_scorer.py` 设计
```python
class InlineScorer:
"""同步执行 RAGAS 单题评分,内部缓存 LLM 客户端。"""
def score(
self,
question: str,
answer: str,
contexts: list[str],
ground_truth: str | None,
metrics: list[str],
judge_model: str,
embedding_model: str,
settings: EvaluationSettings,
) -> dict[str, float | None]:
"""返回 {metric_name: score} 字典NaN 记为 None。"""
```
**客户端缓存策略:**
以 `(judge_model, embedding_model)` 为 key,缓存 `(llm, embeddings)` 对象,避免每次请求都重建 AsyncOpenAI 连接。缓存为模块级单例(`_scorer_cache: dict`),线程安全(加 `threading.Lock`)。
**评分执行:**
复用 `build_metric_pipeline` 构建 `MetricPipeline`,然后 `asyncio.run(pipeline.score_sample(sample))` 执行。与现有 `evaluator.py` 模式一致。
**ground_truth 为空时的指标跳过逻辑:**
`context_recall`、`factual_correctness`、`semantic_similarity`、`noise_sensitivity` 需要 ground_truth;若请求中未提供,自动从 metrics 列表中移除这些指标,并在响应中对应字段返回 `null`。
---
## 5. Dify 侧配置方法
1. 在 Dify 「工具」→「自定义工具」中创建新工具
2. 填写 OpenAPI Schema`/api/score` 端点对齐)
3. 鉴权方式API KeyBearer或无鉴权
4. 在 Agent / Workflow 节点中引用该工具,将 `question``answer``contexts` 变量映射到工具输入
---
## 6. 不在范围内
- 批量评分接口(异步 job
- Dify Workflow 节点插件(需要 Dify 插件开发框架)
- 评分结果持久化到 scores.csv
- 与现有 report_builder 集成展示

View File

@@ -0,0 +1,173 @@
# Linux 一键部署脚本设计
**日期**: 2026-06-22
**状态**: 已批准,待实现
**范围**: 为 siemens_ragas 项目提供 Linux 环境的部署与运维脚本(无 Docker无 systemd
---
## 1. 目标
提供四个 Bash 脚本,覆盖 Linux 服务器上的完整生命周期:
| 脚本 | 职责 |
|------|------|
| `deploy.sh` | 一键完成环境检查、依赖安装、配置初始化、启动服务 |
| `start.sh` | 仅启动 Web 服务(已部署后复用,不重装依赖) |
| `stop.sh` | 停止后台 Web 服务 |
| `run_eval.sh` | 运行单次评估(对应 Windows 的 `run_eval.ps1` |
---
## 2. 约束与假设
- Linux 目标环境有 PyPI 网络访问pip 可直接安装)
- 代码已通过 `git clone` 或文件拷贝到服务器
- 使用 `pip + venv`(不使用 uv
- Web 服务监听 `0.0.0.0:8800`(内网可达)
- 后台运行使用 `nohup`PID 写入 `.server.pid`,日志追加到 `logs/server.log`
- 所有脚本均放在仓库根目录,路径相对于 `$SCRIPT_DIR`
---
## 3. `deploy.sh` 详细设计
### 3.1 阶段 1Python 版本检查
```
require Python >= 3.12
```
- `python3 --version` 解析 major.minor
- 不满足则打印错误并 `exit 1`
- 满足则打印 `[OK] Python X.Y.Z`
### 3.2 阶段 2虚拟环境
- 目标路径:`$SCRIPT_DIR/.venv`
- 已存在则跳过创建(打印 `[OK] .venv already exists`
- 不存在则 `python3 -m venv .venv`
### 3.3 阶段 3依赖安装
```bash
.venv/bin/pip install --upgrade pip -q
.venv/bin/pip install -e . -q # 安装 pyproject.toml 中的依赖
.venv/bin/pip install fastapi uvicorn httpx -q # Web 服务额外依赖
```
- 失败则打印错误并 `exit 1`
- `fastapi`、`uvicorn`、`httpx` 在 `pyproject.toml` 中未列出,需单独安装
### 3.4 阶段 4配置文件
- 若 `.env` 不存在:`cp .env.example .env`,打印警告提示用户编辑后再启动
- 若 `.env` 已存在:跳过,打印 `[OK] .env found`
### 3.5 阶段 5目录初始化
创建以下目录(`mkdir -p`,幂等):
- `configs/` — LLM Profile 持久化存储
- `logs/` — 评估日志 + 服务器日志
- `outputs/` — 评估运行产物
- `datasets/` — 原始数据集
### 3.6 阶段 6Demo 数据
- 检查 `outputs/kba-knowledge-base-offline-baseline/` 是否存在
- 不存在则运行 `.venv/bin/python scripts/seed_sample_run.py`
- 失败时打印 `[WARN]`(非致命,报告页为空但服务可启动)
### 3.7 阶段 7端口检测
- 默认端口 `8800`
- 用 `ss -tlnp` 或 `netstat -tlnp` 检查是否占用
- 占用则尝试 `8801`,仍占用则报错退出
### 3.8 阶段 8启动服务
```bash
nohup .venv/bin/python webmain.py \
--host 0.0.0.0 \
--port $PORT \
>> logs/server.log 2>&1 &
echo $! > .server.pid
```
- 等待 2 秒后用 `kill -0 $PID` 检测进程是否存活
- 存活则打印 URL 和 stop 方法
- 未存活则打印 `[ERROR] Server failed to start. Check logs/server.log.` 并 `exit 1`
---
## 4. `start.sh` 详细设计
单独负责启动,不做任何环境初始化。
```bash
#!/usr/bin/env bash
# 检查 .venv 存在
# 端口检测(同 deploy.sh 逻辑)
# 检查 .env 存在(不存在则 warn 但不阻止)
# nohup 启动 + PID 文件 + 存活验证
# 打印 URL
```
---
## 5. `stop.sh` 详细设计
```bash
#!/usr/bin/env bash
# 读取 .server.pid
# 若文件不存在:打印 "No server PID file found." 退出
# kill $PID
# 等待 2 秒,若进程仍存活用 kill -9
# 删除 .server.pid
# 打印 "Server stopped."
```
---
## 6. `run_eval.sh` 详细设计
对应 Windows 的 `run_eval.ps1`
```
用法:
./run_eval.sh # online eval (默认)
./run_eval.sh offline # offline smoke
./run_eval.sh scenarios/xxx.yaml # 自定义场景
./run_eval.sh online DEBUG # 自定义日志级别
```
- 参数 1(Scenario):`online` / `offline` / 文件路径,默认 `online`
- 参数 2(LogLevel):`DEBUG` / `INFO` / `WARNING` / `ERROR`,默认 `INFO`
- 场景别名映射:
  - `online` → `scenarios/online/siemens-pdf-question-bank-online.yaml`
  - `offline` → `scenarios/offline/siemens-pdf-offline-smoke.yaml`
- 时间戳日志文件:`logs/eval_$(date +%Y-%m-%d_%H%M%S).log`
- 环境变量:`PYTHONIOENCODING=utf-8 PYTHONPATH=.`
- 调用:`.venv/bin/python main.py --scenario $SCENARIO --log-file $LOG_FILE --log-level $LOG_LEVEL`
- 非零退出码时打印错误并 `exit 1`
---
## 7. 通用约定
- 所有脚本首行:`#!/usr/bin/env bash`
- `set -euo pipefail` — 错误立即退出,未定义变量报错,管道错误传播
- `SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"` — 从任意目录执行均正确
- `cd "$SCRIPT_DIR"` — 切换到仓库根目录
- 颜色输出:绿色 `[OK]`、黄色 `[WARN]`、红色 `[ERROR]`(检测 tty非交互式终端降级为无色
- 执行权限:脚本自身需要 `chmod +x`(在 deploy.sh 内对其他脚本自动 chmod
---
## 8. 不在范围内
- Docker / docker-compose 支持
- systemd service 配置
- Nginx 反向代理配置
- SSL/TLS 配置
- 离线/内网镜像源配置

View File

@@ -62,6 +62,8 @@ def load_scenario(path: str | Path) -> Scenario:
), ),
source_path=scenario_path, source_path=scenario_path,
optimization_advisor=model.optimization_advisor, optimization_advisor=model.optimization_advisor,
metric_weights=dict(model.metric_weights),
doc_weights=dict(model.doc_weights),
) )
# Run cross-field checks after all relative paths have been resolved. # Run cross-field checks after all relative paths have been resolved.
validate_scenario(scenario) validate_scenario(scenario)

View File

@@ -55,6 +55,8 @@ class ScenarioModel(BaseModel):
output_dir: str output_dir: str
runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel) runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
optimization_advisor: bool = False optimization_advisor: bool = False
metric_weights: dict[str, float] = Field(default_factory=dict)
doc_weights: dict[str, float] = Field(default_factory=dict)
@field_validator("metrics") @field_validator("metrics")
@classmethod @classmethod

View File

@@ -12,6 +12,7 @@ from rag_eval.datasets.loader import load_dataset_records
from rag_eval.datasets.normalizers import normalize_records from rag_eval.datasets.normalizers import normalize_records
from rag_eval.execution.concurrency import gather_with_limit from rag_eval.execution.concurrency import gather_with_limit
from rag_eval.metrics.pipeline import MetricPipeline from rag_eval.metrics.pipeline import MetricPipeline
from rag_eval.metrics.weights import compute_weighted_score, resolve_weight
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
from rag_eval.shared.utils import utc_now_iso from rag_eval.shared.utils import utc_now_iso
@@ -171,7 +172,7 @@ class Evaluator:
return valid, invalid return valid, invalid
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]: def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
"""Combine sample data, metric results, and run metadata into one output row.""" """Combine sample data, metric results, run metadata, and weight columns."""
record = sample.to_record() record = sample.to_record()
record["contexts"] = sample.contexts record["contexts"] = sample.contexts
record.update(score.metrics) record.update(score.metrics)
@@ -179,4 +180,12 @@ class Evaluator:
record["judge_model"] = self.scenario.judge_model record["judge_model"] = self.scenario.judge_model
record["embedding_model"] = self.scenario.embedding_model record["embedding_model"] = self.scenario.embedding_model
record["run_id"] = self.scenario.scenario_name record["run_id"] = self.scenario.scenario_name
# Weighted score columns — enable post-hoc weighted aggregation in reporting.
record["weighted_score"] = compute_weighted_score(
score.metrics, self.scenario.metric_weights
)
doc_name = str(sample.metadata.get("doc_name", "") or "")
record["sample_weight"] = resolve_weight(
self.scenario.doc_weights, doc_name, default=1.0
)
return record return record

152
rag_eval/metrics/weights.py Normal file
View File

@@ -0,0 +1,152 @@
"""Utility functions for weighted metric aggregation.
All functions are pure (no side effects, no I/O) and operate on plain dicts/lists.
Weights do not need to be pre-normalised — normalisation is done internally.
"""
from __future__ import annotations
import math
def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float:
    """Look up the weight stored under *key*, falling back to *default*.

    Args:
        weights: mapping of name -> weight.
        key: name to look up.
        default: value used when *key* is not present in *weights*.

    Returns:
        The selected value converted to ``float``.
    """
    chosen = weights[key] if key in weights else default
    return float(chosen)
def compute_weighted_score(
    scores: dict[str, float | None],
    metric_weights: dict[str, float],
) -> float | None:
    """Return the weighted mean of the usable metric scores.

    A score is usable when it is not None, converts to ``float`` and is finite.
    A weight is usable when it is finite and strictly positive. Metrics whose
    weight is zero, negative, NaN or infinite are left out, so a bad weight
    entry cannot yield NaN or a mean outside the range of the scores.

    Args:
        scores: mapping of metric_name -> raw score (may be NaN or None).
        metric_weights: optional per-metric weights; absent keys default to 1.0.

    Returns:
        ``sum(w_i * s_i) / sum(w_i)`` over the usable entries, or None when no
        entry is usable.
    """
    total_weight = 0.0
    total_score = 0.0
    for metric, score in scores.items():
        if score is None:
            continue
        try:
            value = float(score)
        except (TypeError, ValueError):
            # Non-numeric cell (e.g. an error string): not a score.
            continue
        if not math.isfinite(value):
            continue
        weight = resolve_weight(metric_weights, metric, default=1.0)
        if not math.isfinite(weight) or weight <= 0.0:
            # Zero weight contributes nothing anyway; negative / NaN / inf
            # weights would corrupt the mean, so they are ignored as well.
            continue
        total_weight += weight
        total_score += weight * value
    # Same "> 0" guard style as the other aggregation helpers in this module.
    if total_weight <= 0.0:
        return None
    return total_score / total_weight
def weighted_metric_means(
    score_rows: list[dict],
    metrics: list[str],
    doc_weights: dict[str, float],
) -> dict[str, float | None]:
    """Aggregate each metric into a document-weighted mean over all rows.

    Every row contributes with the weight registered for its ``doc_name``
    (1.0 when the document has no entry). A row is ignored for a metric when
    its cell is missing/None, not convertible to float, NaN or infinite.

    Args:
        score_rows: list of score record dicts (from scores.csv).
        metrics: ordered list of metric names to aggregate.
        doc_weights: mapping doc_name -> weight multiplier.

    Returns:
        Dict mapping metric_name -> weighted mean, or None for a metric whose
        accumulated weight is not positive.
    """

    def _usable(cell):
        """Return the cell as a finite float, or None when it is not usable."""
        if cell is None:
            return None
        try:
            number = float(cell)
        except (TypeError, ValueError):
            return None
        if math.isnan(number) or math.isinf(number):
            return None
        return number

    numerators: dict[str, float] = dict.fromkeys(metrics, 0.0)
    denominators: dict[str, float] = dict.fromkeys(metrics, 0.0)

    for record in score_rows:
        # A missing or empty doc_name is looked up as "".
        document = str(record.get("doc_name", "") or "")
        row_weight = resolve_weight(doc_weights, document, default=1.0)
        for name in metrics:
            number = _usable(record.get(name))
            if number is not None:
                numerators[name] += row_weight * number
                denominators[name] += row_weight

    means: dict[str, float | None] = {}
    for name in metrics:
        denominator = denominators[name]
        means[name] = numerators[name] / denominator if denominator > 0 else None
    return means
def compute_overall_weighted_score_mean(
    score_rows: list[dict],
    metric_weights: dict[str, float],
    doc_weights: dict[str, float],
) -> float | None:
    """Average the per-sample weighted scores, weighting samples by document.

    For every row the per-sample score is recomputed with
    ``compute_weighted_score`` from all columns that are not listed in
    ``_META_COLUMNS``; rows without a computable score are skipped. The
    remaining scores are averaged using the doc weight of each row's
    ``doc_name`` (1.0 when the document has no entry).

    NOTE(review): every column outside ``_META_COLUMNS`` is handed to
    ``compute_weighted_score`` as if it were a metric, so a new numeric
    non-metric column would be counted unless it is added to that set.

    Returns:
        The doc-weighted mean, or None when the accumulated weight is not
        positive.
    """
    weight_sum = 0.0
    score_sum = 0.0
    for record in score_rows:
        candidate_scores: dict[str, float | None] = {
            column: cell
            for column, cell in record.items()
            if column not in _META_COLUMNS
        }
        sample_score = compute_weighted_score(candidate_scores, metric_weights)
        if sample_score is not None:
            document = str(record.get("doc_name", "") or "")
            row_weight = resolve_weight(doc_weights, document, default=1.0)
            weight_sum += row_weight
            score_sum += row_weight * sample_score
    if weight_sum > 0:
        return score_sum / weight_sum
    return None
# Columns in scores.csv that are sample metadata, not metric scores.
# compute_overall_weighted_score_mean skips these keys when it collects a
# row's metric scores; any column NOT listed here is passed on as a metric.
# The derived columns "weighted_score" and "sample_weight" are listed as well,
# so they are never fed back in as metrics.
_META_COLUMNS = frozenset(
{
"sample_id",
"question",
"contexts",
"answer",
"ground_truth",
"scenario",
"language",
"retrieval_config",
"error",
"judge_model",
"embedding_model",
"run_id",
"difficulty",
"question_type",
"doc_id",
"doc_name",
"section_path",
"page_start",
"page_end",
"source_chunk_ids",
"review_status",
"review_notes",
"weighted_score",
"sample_weight",
}
)

View File

@@ -6,6 +6,10 @@ import math
import pandas as pd import pandas as pd
from rag_eval.metrics.weights import (
compute_overall_weighted_score_mean,
weighted_metric_means,
)
from rag_eval.shared.models import EvaluationResult from rag_eval.shared.models import EvaluationResult
@@ -55,24 +59,41 @@ def build_summary_markdown(result: EvaluationResult) -> str:
lines.append("No valid samples were scored.") lines.append("No valid samples were scored.")
return "\n".join(lines) + "\n" return "\n".join(lines) + "\n"
for metric in result.scenario.metrics: score_rows_list = scores.to_dict(orient="records")
mean_value = scores[metric].mean(numeric_only=True) w_means = weighted_metric_means(
if isinstance(mean_value, float) and not math.isnan(mean_value): score_rows_list, result.scenario.metrics, result.scenario.doc_weights
lines.append(f"- {metric}: `{mean_value:.4f}`") )
else:
lines.append(f"- {metric}: `n/a`")
# Keep the summary self-sufficient by including every scored sample and its errors. has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights)
detail_columns = ["sample_id", *result.scenario.metrics, "error"]
detail = scores[detail_columns] for metric in result.scenario.metrics:
lines.extend( mean_value = w_means.get(metric)
[ w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0
weight_note = f" (w={w:.2f})" if result.scenario.metric_weights else ""
if mean_value is not None and not math.isnan(mean_value):
lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}")
else:
lines.append(f"- {metric}: `n/a`{weight_note}")
if has_weights:
overall_ws = compute_overall_weighted_score_mean(
score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
)
weight_suffix = " (加权)"
if overall_ws is not None and not math.isnan(overall_ws):
lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
else:
lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
existing_columns = [c for c in detail_columns if c in scores.columns]
detail = scores[existing_columns]
lines.extend([
"", "",
"## Per-sample Scores", "## Per-sample Scores",
"", "",
"```text", "```text",
_table_from_frame(detail), _table_from_frame(detail),
"```", "```",
] ])
)
return "\n".join(lines) + "\n" return "\n".join(lines) + "\n"

View File

@@ -52,6 +52,11 @@ class EvaluationSettings(BaseSettings):
) )
parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE") parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL") dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL")
score_api_token: str | None = Field(
default=None,
alias="SCORE_API_TOKEN",
description="Bearer token for /api/score endpoint. Empty = no auth.",
)
@property @property
def openai_client_kwargs(self) -> dict[str, str | float]: def openai_client_kwargs(self) -> dict[str, str | float]:

View File

@@ -77,6 +77,8 @@ class Scenario:
app_adapter: AppAdapterConfig | None = None app_adapter: AppAdapterConfig | None = None
source_path: Path | None = None source_path: Path | None = None
optimization_advisor: bool = False optimization_advisor: bool = False
metric_weights: dict[str, float] = field(default_factory=dict)
doc_weights: dict[str, float] = field(default_factory=dict)
def snapshot(self) -> dict[str, Any]: def snapshot(self) -> dict[str, Any]:
"""Serialize the scenario into a reporting-friendly dictionary snapshot.""" """Serialize the scenario into a reporting-friendly dictionary snapshot."""

147
run_eval.sh Normal file
View File

@@ -0,0 +1,147 @@
#!/usr/bin/env bash
# run_eval.sh — Siemens RAGAS evaluation runner (Linux).
# Counterpart of the Windows script run_eval.ps1.
#
# Usage:
# bash run_eval.sh # online evaluation (default)
# bash run_eval.sh offline # offline smoke test
# bash run_eval.sh scenarios/xxx.yaml # custom scenario file
# bash run_eval.sh online DEBUG # choose the log level
# bash run_eval.sh build scenarios/siemens_build/siemens-pdf-build.yaml
# # question-bank (dataset) build
# Abort on errors, on unset variables, and on failures inside pipelines.
set -euo pipefail
# Resolve the directory containing this script and run from there, so the
# relative paths used below work regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# ── Coloured output ───────────────────────────────────────────────
# Use ANSI colours only when stdout is a terminal; otherwise leave them empty.
if [ -t 1 ]; then
GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
else
GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
fi
# Tagged message helpers; err writes to stderr, the others to stdout.
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }
info() { echo -e "${CYAN}[INFO]${NC} $*"; }
# ── Argument parsing ──────────────────────────────────────────────
# $1: scenario alias ("online" / "offline"), the keyword "build", or a path to
#     a scenario YAML file. Defaults to "online".
# $2: log level — or, in build mode, the dataset-build config file.
# $3: log level in build mode.
SCENARIO="${1:-online}"
LOG_LEVEL="${2:-INFO}"

# Scenario alias → scenario file.
declare -A SCENARIO_MAP=(
  ["online"]="scenarios/online/siemens-pdf-question-bank-online.yaml"
  ["offline"]="scenarios/offline/siemens-pdf-offline-smoke.yaml"
)

# Detect dataset-build mode; otherwise resolve a known alias to its file.
BUILD_MODE=false
BUILD_CONFIG=""
if [ "$SCENARIO" = "build" ]; then
  BUILD_MODE=true
  BUILD_CONFIG="${2:-scenarios/siemens_build/siemens-pdf-build.yaml}"
  LOG_LEVEL="${3:-INFO}"
# "${arr[key]+set}" expands to "set" only when the key exists. Unlike
# `[ -v "arr[$key]" ]` it does not need bash >= 4.3 and does not expand the
# user-supplied subscript a second time.
elif [ -n "${SCENARIO_MAP[$SCENARIO]+set}" ]; then
  SCENARIO="${SCENARIO_MAP[$SCENARIO]}"
fi
# ── Validation ────────────────────────────────────────────────────
# Banner.
echo ""
echo -e "${CYAN}============================================================${NC}"
echo -e "${CYAN} Siemens RAGAS — 评估运行${NC}"
echo -e "${CYAN}============================================================${NC}"
echo ""
# The virtual environment's interpreter must exist; stop otherwise.
if [ ! -f ".venv/bin/python" ]; then
err "未找到 .venv请先执行部署bash deploy.sh"
exit 1
fi
PYTHON=".venv/bin/python"
# Build mode: the dataset-build config file must exist.
if [ "$BUILD_MODE" = true ]; then
if [ ! -f "$BUILD_CONFIG" ]; then
err "题库生成配置文件不存在:$BUILD_CONFIG"
echo ""
echo "可用配置:"
# List up to 20 YAML files found under scenarios/ as a hint.
find scenarios/ -name "*.yaml" 2>/dev/null | head -20 | sed 's/^/ /'
exit 1
fi
ok "模式 : 题库生成 (dataset build)"
ok "配置文件 : $BUILD_CONFIG"
else
# Evaluation mode: the scenario file must exist; print usage otherwise.
if [ ! -f "$SCENARIO" ]; then
err "场景文件不存在:$SCENARIO"
echo ""
echo "用法示例:"
echo " bash run_eval.sh # online 评估"
echo " bash run_eval.sh offline # offline 冒烟"
echo " bash run_eval.sh scenarios/xxx.yaml # 自定义场景"
echo " bash run_eval.sh build [config.yaml] # 题库生成"
exit 1
fi
ok "场景文件 : $SCENARIO"
fi
# Log level: upper-case it and fall back to INFO (with a warning) when unknown.
LOG_LEVEL_UPPER="${LOG_LEVEL^^}"
case "$LOG_LEVEL_UPPER" in
DEBUG|INFO|WARNING|ERROR) ;;
*)
warn "未知日志级别 '$LOG_LEVEL',使用默认值 INFO"
LOG_LEVEL_UPPER="INFO"
;;
esac
ok "日志级别 : $LOG_LEVEL_UPPER"
# Ensure the log directory exists and derive a timestamped log file name.
mkdir -p logs
TIMESTAMP=$(date +%Y-%m-%d_%H%M%S)
LOG_FILE="logs/eval_${TIMESTAMP}.log"
ok "日志文件 : $LOG_FILE"
echo ""
echo -e "${CYAN}============================================================${NC}"
echo -e "${CYAN} 开始运行,按 Ctrl+C 中止${NC}"
echo -e "${CYAN}============================================================${NC}"
echo ""
# ── Run ───────────────────────────────────────────────────────────
export PYTHONIOENCODING="utf-8"
export PYTHONPATH="."

# The script runs under `set -e`, so a bare failing command would terminate it
# on the spot and the failure report below would never be printed. Capturing
# the status with `|| EXIT_CODE=$?` keeps the script alive for that report.
EXIT_CODE=0
if [ "$BUILD_MODE" = true ]; then
  "$PYTHON" main.py \
    --dataset-build-config "$BUILD_CONFIG" || EXIT_CODE=$?
else
  "$PYTHON" main.py \
    --scenario "$SCENARIO" \
    --log-file "$LOG_FILE" \
    --log-level "$LOG_LEVEL_UPPER" || EXIT_CODE=$?
fi

echo ""
if [ "$EXIT_CODE" -eq 0 ]; then
  echo -e "${GREEN}============================================================${NC}"
  echo -e "${GREEN} 运行完成!${NC}"
  # Only evaluation runs are given --log-file, so only they mention the log.
  if [ "$BUILD_MODE" = false ]; then
    echo -e "${GREEN} 日志已保存到:$LOG_FILE${NC}"
  fi
  echo -e "${CYAN} 在 Web 控制台查看报告bash start.sh${NC}"
  echo -e "${GREEN}============================================================${NC}"
else
  err "运行失败exit code=$EXIT_CODE"
  if [ "$BUILD_MODE" = false ]; then
    err "查看日志cat $LOG_FILE"
  fi
  # Propagate the Python process's exit status to the caller.
  exit "$EXIT_CODE"
fi
echo ""

94
start.sh Normal file
View File

@@ -0,0 +1,94 @@
#!/usr/bin/env bash
# start.sh — start the Siemens RAGAS web service as a background process.
# Prerequisite: deploy.sh has already been run (.venv and dependencies exist).
# Usage: bash start.sh
# Optional environment overrides:
#   RAGAS_HOST  bind address (default: 0.0.0.0)
#   RAGAS_PORT  first port to try (default: 8800); RAGAS_PORT+1 is the fallback
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# ── Colored output (disabled when stdout is not a terminal) ──────
if [ -t 1 ]; then
    GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
else
    GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
fi
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }

# Return 0 when something is already listening on TCP port $1.
# The listing is captured before grepping: piping straight into `grep -q`
# lets grep exit early, the producer can then die of SIGPIPE, and `pipefail`
# would misreport that as "port is free".
port_in_use() {
    local port="$1" listeners
    listeners="$( { ss -tlnp 2>/dev/null || true; netstat -tlnp 2>/dev/null || true; } )"
    grep -q ":${port} " <<< "$listeners"
}

echo ""
echo -e "${CYAN}============================================================${NC}"
echo -e "${CYAN} Siemens RAGAS Console — 启动服务${NC}"
echo -e "${CYAN}============================================================${NC}"
echo ""

# The virtual environment must exist before anything can be launched.
if [ ! -f ".venv/bin/python" ]; then
    err "未找到 .venv请先执行部署bash deploy.sh"
    exit 1
fi
PYTHON=".venv/bin/python"

# A missing or placeholder .env only produces warnings; the server still starts.
if [ ! -f ".env" ]; then
    warn ".env 不存在,请先复制并编辑配置:"
    warn " cp .env.example .env && nano .env"
fi
if grep -q "your-api-key" .env 2>/dev/null; then
    warn ".env 中仍包含默认占位符,部分功能(评估执行)将不可用"
fi

# Refuse to start twice. The PID must be a positive integer before it is
# handed to kill: pid 0 or a negative value would address a process group.
if [ -f ".server.pid" ]; then
    EXISTING_PID="$(cat .server.pid 2>/dev/null || true)"
    if [[ "$EXISTING_PID" =~ ^[1-9][0-9]*$ ]] && kill -0 "$EXISTING_PID" 2>/dev/null; then
        warn "服务已在运行 (PID=$EXISTING_PID),无需重复启动"
        warn "如需重启请先执行bash stop.sh"
        exit 0
    fi
    # Stale or corrupt PID file: remove it and continue.
    rm -f .server.pid
fi

# Directory for server.log
mkdir -p logs

# Port selection: preferred port first, then the next one up.
HOST="${RAGAS_HOST:-0.0.0.0}"
PORT="${RAGAS_PORT:-8800}"
if ! [[ "$PORT" =~ ^[0-9]+$ ]]; then
    err "无效的端口号:$PORT"
    exit 1
fi
# 10# forces base 10 so a value with leading zeros is not parsed as octal.
FALLBACK_PORT=$((10#$PORT + 1))
if port_in_use "$PORT"; then
    warn "端口 $PORT 已被占用,尝试 $FALLBACK_PORT..."
    if port_in_use "$FALLBACK_PORT"; then
        err "端口 $PORT 和 $FALLBACK_PORT 均被占用,请手动指定端口:"
        err " .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
        exit 1
    fi
    PORT="$FALLBACK_PORT"
fi

# Launch in the background and remember the PID for stop.sh.
nohup "$PYTHON" webmain.py --host "$HOST" --port "$PORT" >> logs/server.log 2>&1 &
SERVER_PID=$!
echo "$SERVER_PID" > .server.pid

# Give the process 3 seconds, then verify it is still alive.
sleep 3
if kill -0 "$SERVER_PID" 2>/dev/null; then
    ok "服务已启动 (PID=$SERVER_PID)"
    echo ""
    # `hostname -I` is not available everywhere; skip the LAN URL if it fails.
    HOST_IP="$(hostname -I 2>/dev/null | awk '{print $1}')" || true
    if [ -n "$HOST_IP" ]; then
        echo -e "${CYAN} 访问地址: http://${HOST_IP}:${PORT}${NC}"
    fi
    echo -e "${CYAN} 本机访问: http://127.0.0.1:${PORT}${NC}"
    echo -e "${CYAN} 查看日志: tail -f logs/server.log${NC}"
    echo -e "${CYAN} 停止服务: bash stop.sh${NC}"
    echo ""
else
    err "服务启动失败,请查看日志:"
    err " tail -20 logs/server.log"
    rm -f .server.pid
    exit 1
fi

68
stop.sh Normal file
View File

@@ -0,0 +1,68 @@
#!/usr/bin/env bash
# stop.sh — stop the background Siemens RAGAS web service.
# Usage: bash stop.sh
# Exit status: 0 when the service is stopped (or was not running),
#              1 when the process could not be terminated.
set -uo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# ── Colored output (disabled when stdout is not a terminal) ──────
if [ -t 1 ]; then
    GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
else
    GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
fi
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }

echo ""
echo -e "${CYAN} Siemens RAGAS Console — 停止服务${NC}"
echo ""

PID_FILE="$SCRIPT_DIR/.server.pid"
if [ ! -f "$PID_FILE" ]; then
    warn "未找到 .server.pid服务可能未启动或已停止"
    exit 0
fi

PID="$(cat "$PID_FILE" 2>/dev/null || true)"
# Only a positive integer may be handed to kill: pid 0 or a negative value
# addresses a whole process group (including this script), not the server.
if ! [[ "$PID" =~ ^[1-9][0-9]*$ ]] || ! kill -0 "$PID" 2>/dev/null; then
    warn "进程 $PID 已不存在,清理 PID 文件"
    rm -f "$PID_FILE"
    exit 0
fi

# Graceful stop first (SIGTERM).
echo -e " 正在停止进程 (PID=$PID)..."
kill "$PID" 2>/dev/null || true

# Wait up to 5 seconds for the process to exit.
for i in 1 2 3 4 5; do
    sleep 1
    if ! kill -0 "$PID" 2>/dev/null; then
        break
    fi
    echo -e " 等待进程退出... ($i/5)"
done

# Still alive: escalate to SIGKILL.
if kill -0 "$PID" 2>/dev/null; then
    warn "进程未响应,强制终止 (SIGKILL)..."
    kill -9 "$PID" 2>/dev/null || true
    sleep 1
fi

# Keep the PID file when the process survived, so start.sh still sees the
# service as running and a later stop.sh can retry.
if kill -0 "$PID" 2>/dev/null; then
    err "无法停止进程 $PID请手动执行kill -9 $PID"
    exit 1
fi
rm -f "$PID_FILE"
ok "服务已停止"
echo ""

View File

@@ -80,6 +80,64 @@ class ScenarioAndDatasetTests(unittest.TestCase):
self.assertTrue(scenario.dataset.path.name.endswith(".csv")) self.assertTrue(scenario.dataset.path.name.endswith(".csv"))
self.assertTrue(scenario.output_dir.name == "sample-offline-baseline") self.assertTrue(scenario.output_dir.name == "sample-offline-baseline")
def test_load_scenario_metric_and_doc_weights(self) -> None:
    """load_scenario passes metric_weights and doc_weights into Scenario."""
    import os
    import tempfile

    import yaml

    from rag_eval.config.loader import load_scenario

    payload = {
        "scenario_name": "w-test",
        "mode": "offline",
        "dataset": "nonexistent.csv",
        "judge_model": "m",
        "embedding_model": "e",
        "metrics": ["faithfulness"],
        "output_dir": "out",
        "metric_weights": {"faithfulness": 0.7},
        "doc_weights": {"doc.pdf": 2.0},
    }
    # A temporary directory is removed even when writing the YAML fails,
    # which a delete=False temp file followed by a later try/finally is not.
    with tempfile.TemporaryDirectory() as tmp_dir:
        scenario_path = os.path.join(tmp_dir, "scenario.yaml")
        with open(scenario_path, "w", encoding="utf-8") as handle:
            yaml.dump(payload, handle, allow_unicode=True)
        scenario = load_scenario(scenario_path)
    self.assertEqual(scenario.metric_weights, {"faithfulness": 0.7})
    self.assertEqual(scenario.doc_weights, {"doc.pdf": 2.0})
def test_load_scenario_defaults_to_empty_weights(self) -> None:
    """load_scenario defaults metric_weights and doc_weights to empty dicts."""
    import os
    import tempfile

    import yaml

    from rag_eval.config.loader import load_scenario

    # Deliberately omits both weight sections.
    payload = {
        "scenario_name": "no-w",
        "mode": "offline",
        "dataset": "nonexistent.csv",
        "judge_model": "m",
        "embedding_model": "e",
        "metrics": ["faithfulness"],
        "output_dir": "out",
    }
    # A temporary directory is removed even when writing the YAML fails,
    # which a delete=False temp file followed by a later try/finally is not.
    with tempfile.TemporaryDirectory() as tmp_dir:
        scenario_path = os.path.join(tmp_dir, "scenario.yaml")
        with open(scenario_path, "w", encoding="utf-8") as handle:
            yaml.dump(payload, handle, allow_unicode=True)
        scenario = load_scenario(scenario_path)
    self.assertEqual(scenario.metric_weights, {})
    self.assertEqual(scenario.doc_weights, {})
def test_scenario_snapshot_serializes_path_static_kwargs(self) -> None: def test_scenario_snapshot_serializes_path_static_kwargs(self) -> None:
scenario = load_scenario("scenarios/online/sample-pdf-question-bank-online.yaml") scenario = load_scenario("scenarios/online/sample-pdf-question-bank-online.yaml")
snapshot = scenario.snapshot() snapshot = scenario.snapshot()
@@ -125,6 +183,117 @@ class ScenarioAndDatasetTests(unittest.TestCase):
class EvaluatorAndReportingTests(unittest.TestCase): class EvaluatorAndReportingTests(unittest.TestCase):
def test_merge_score_includes_weighted_score_and_sample_weight(self):
    """_merge_score adds weighted_score and sample_weight columns."""
    from unittest.mock import MagicMock

    from rag_eval.execution.evaluator import Evaluator
    from rag_eval.shared.models import (
        DatasetConfig, MetricScore, NormalizedSample, Scenario,
    )

    scenario = Scenario(
        scenario_name="w-test", mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m", embedding_model="e",
        metrics=["faithfulness", "context_recall"],
        output_dir=Path("out"),
        metric_weights={"faithfulness": 3.0, "context_recall": 1.0},
        doc_weights={"doc.pdf": 2.0},
    )
    evaluator = Evaluator(
        scenario=scenario,
        metric_pipeline=MagicMock(),
        app_adapter=None,
    )
    sample = NormalizedSample(
        sample_id="s1", question="q", contexts=["ctx"],
        answer="a", ground_truth="gt",
        metadata={"doc_name": "doc.pdf"},
    )
    score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})

    row = evaluator._merge_score(sample, score)

    # Metric-weighted score: (3*1.0 + 1*0.0) / (3+1) = 0.75
    self.assertAlmostEqual(row["weighted_score"], 0.75, delta=1e-4)
    # sample_weight is expected to equal the doc_weights entry for doc.pdf.
    self.assertEqual(row["sample_weight"], 2.0)
def test_summary_markdown_shows_weighted_score(self):
    """build_summary_markdown includes weighted_score when metric_weights set."""
    from pathlib import Path

    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import (
        EvaluationResult, NormalizedSample, DatasetConfig, Scenario,
    )

    scenario = Scenario(
        scenario_name="ws-test", mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m", embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={"faithfulness": 1.0},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1", question="q", contexts=["c"],
        answer="a", ground_truth="gt",
    )
    result = EvaluationResult(
        scenario=scenario, run_id="r1",
        started_at="2026-01-01T00:00:00", finished_at="2026-01-01T00:01:00",
        valid_samples=[sample], invalid_samples=[],
        score_rows=[{
            "sample_id": "s1", "faithfulness": 0.8,
            "weighted_score": 0.8, "sample_weight": 1.0,
            "doc_name": "", "error": "",
        }],
    )

    md = build_summary_markdown(result)

    self.assertIn("weighted_score", md)
    # The single row's 0.8 is expected to be rendered with four decimals.
    self.assertIn("0.8000", md)
def test_summary_markdown_hides_weighted_score_without_weights(self):
    """build_summary_markdown preserves unweighted summaries when no weights set."""
    from pathlib import Path

    # Imported locally, as the sibling weighted-score test does, so this test
    # does not depend on a module-level import being present.
    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import DatasetConfig, EvaluationResult, NormalizedSample, Scenario

    scenario = Scenario(
        scenario_name="plain-test",
        mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m",
        embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1",
        question="q",
        contexts=["c"],
        answer="a",
        ground_truth="gt",
    )
    result = EvaluationResult(
        scenario=scenario,
        run_id="r1",
        started_at="2026-01-01T00:00:00",
        finished_at="2026-01-01T00:01:00",
        valid_samples=[sample],
        invalid_samples=[],
        # The row still carries the helper columns; only the summary bullet
        # must be absent when no weights are configured.
        score_rows=[{
            "sample_id": "s1",
            "faithfulness": 0.8,
            "weighted_score": 0.8,
            "sample_weight": 1.0,
            "doc_name": "",
            "error": "",
        }],
    )

    md = build_summary_markdown(result)

    self.assertNotIn("- **weighted_score", md)
def test_metric_pipeline_scores_sample(self) -> None: def test_metric_pipeline_scores_sample(self) -> None:
pipeline = MetricPipeline( pipeline = MetricPipeline(
metrics={ metrics={

View File

@@ -0,0 +1,89 @@
"""Regression tests for weighted webapp report aggregation."""
from __future__ import annotations
from pathlib import Path
import pytest
from webapp.services.report_builder import build_report
from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot
def _write_run_artifacts(run_dir: Path) -> None:
"""Create a minimal run directory with weighted scores and a snapshot."""
run_dir.mkdir(parents=True, exist_ok=True)
(run_dir / "scores.csv").write_text(
"\n".join(
[
"sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
"s1,a.pdf,1.0,0.5,0.8333,3.0",
"s2,b.pdf,0.0,0.5,0.1667,1.0",
]
),
encoding="utf-8",
)
(run_dir / "summary.md").write_text("summary", encoding="utf-8")
(run_dir / "optimization_advice.md").write_text("advice", encoding="utf-8")
(run_dir / "scenario.snapshot.yaml").write_text(
"\n".join(
[
"metrics:",
" - faithfulness",
" - context_recall",
"metric_weights:",
" faithfulness: 2.0",
" context_recall: 1.0",
"doc_weights:",
" a.pdf: 3.0",
" b.pdf: 1.0",
]
),
encoding="utf-8",
)
def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
    """The snapshot reader yields the metric and document weight maps."""
    snapshot_dir = tmp_path / "run"
    _write_run_artifacts(snapshot_dir)

    weights = _read_weights_from_snapshot(snapshot_dir)
    metric_weights, doc_weights = weights

    expected_metric_weights = {"faithfulness": 2.0, "context_recall": 1.0}
    expected_doc_weights = {"a.pdf": 3.0, "b.pdf": 1.0}
    assert metric_weights == expected_metric_weights
    assert doc_weights == expected_doc_weights
def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
    """build_report aggregates with document weights and echoes the snapshot weights."""
    artifacts_dir = tmp_path / "run"
    _write_run_artifacts(artifacts_dir)
    requested_metrics = ["faithfulness", "context_recall"]

    report = build_report(artifacts_dir, requested_metrics)

    # With doc weights 3:1, faithfulness is (3*1.0 + 1*0.0) / 4 = 0.75.
    expected_means = {
        "faithfulness": pytest.approx(0.75, rel=1e-4),
        "context_recall": pytest.approx(0.5, rel=1e-4),
    }
    assert report.metric_means == expected_means
    # Likewise (3*0.8333 + 1*0.1667) / 4 ≈ 0.6667.
    assert report.weighted_score_mean == pytest.approx(0.6667, rel=1e-4)
    assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
    assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
    assert report.summary_markdown == "summary"
    assert report.advice_markdown == "advice"
def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
    """weighted_score and sample_weight columns are not reported as metrics."""
    scores_only_dir = tmp_path / "run"
    scores_only_dir.mkdir(parents=True, exist_ok=True)
    # Only scores.csv exists here; no scenario snapshot is written.
    header = "sample_id,doc_name,faithfulness,weighted_score,sample_weight"
    data_row = "s1,a.pdf,0.8,0.8,2.0"
    (scores_only_dir / "scores.csv").write_text(
        "\n".join([header, data_row]),
        encoding="utf-8",
    )

    inferred = _infer_metrics_from_scores(scores_only_dir)

    assert inferred == ["faithfulness"]

124
tests/test_weights.py Normal file
View File

@@ -0,0 +1,124 @@
"""Unit tests for rag_eval/metrics/weights.py"""
import math
import pytest
from rag_eval.metrics.weights import (
compute_overall_weighted_score_mean,
compute_weighted_score,
resolve_weight,
weighted_metric_means,
)
class TestResolveWeight:
    """resolve_weight: explicit weight when present, otherwise the default."""

    def test_returns_value_when_key_present(self):
        configured = {"faith": 0.5}
        assert resolve_weight(configured, "faith") == 0.5

    def test_returns_default_when_key_missing(self):
        # The implicit default weight is 1.0.
        looked_up = resolve_weight({}, "faith")
        assert looked_up == 1.0

    def test_returns_custom_default_when_key_missing(self):
        looked_up = resolve_weight({}, "faith", default=2.0)
        assert looked_up == 2.0

    def test_empty_dict_returns_default(self):
        looked_up = resolve_weight({}, "anything")
        assert looked_up == 1.0
class TestComputeWeightedScore:
    """compute_weighted_score: per-sample weighted mean over usable metrics."""

    def test_equal_weights_is_simple_mean(self):
        outcome = compute_weighted_score(
            {"faithfulness": 0.8, "context_recall": 0.6}, {}
        )
        assert outcome == pytest.approx(0.7, rel=1e-4)

    def test_explicit_weights(self):
        # (3*1.0 + 1*0.0) / 4 = 0.75
        metric_weights = {"faithfulness": 3.0, "context_recall": 1.0}
        metric_scores = {"faithfulness": 1.0, "context_recall": 0.0}
        outcome = compute_weighted_score(metric_scores, metric_weights)
        assert outcome == pytest.approx(0.75, rel=1e-4)

    def test_nan_values_excluded(self):
        metric_scores = {"faithfulness": float("nan"), "context_recall": 0.8}
        outcome = compute_weighted_score(metric_scores, {})
        assert outcome == pytest.approx(0.8, rel=1e-4)

    def test_none_values_excluded(self):
        metric_scores = {"faithfulness": None, "context_recall": 0.6}
        outcome = compute_weighted_score(metric_scores, {})
        assert outcome == pytest.approx(0.6, rel=1e-4)

    def test_all_nan_returns_none(self):
        nan = float("nan")
        metric_scores = {"faithfulness": nan, "context_recall": nan}
        assert compute_weighted_score(metric_scores, {}) is None

    def test_empty_scores_returns_none(self):
        outcome = compute_weighted_score({}, {})
        assert outcome is None

    def test_missing_metric_in_weights_uses_default_1(self):
        # (2*0.8 + 1*0.4) / 3 = 2.0 / 3
        metric_weights = {"faithfulness": 2.0}
        metric_scores = {"faithfulness": 0.8, "context_recall": 0.4}
        outcome = compute_weighted_score(metric_scores, metric_weights)
        assert outcome == pytest.approx(2.0 / 3, rel=1e-4)
class TestWeightedMetricMeans:
    """weighted_metric_means: per-metric means weighted by document weight."""

    def _rows(self):
        """Two samples from two different documents."""
        first = {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.5}
        second = {"doc_name": "b.pdf", "faithfulness": 0.6, "context_recall": 0.8}
        return [first, second]

    def test_equal_weights_gives_arithmetic_mean(self):
        means = weighted_metric_means(
            self._rows(), ["faithfulness", "context_recall"], {}
        )
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.65, rel=1e-4)

    def test_doc_weight_amplifies_contribution(self):
        # (3*1.0 + 1*0.6) / 4 = 0.9
        weights_by_doc = {"a.pdf": 3.0, "b.pdf": 1.0}
        means = weighted_metric_means(self._rows(), ["faithfulness"], weights_by_doc)
        assert means["faithfulness"] == pytest.approx(0.9, rel=1e-4)

    def test_nan_rows_skipped_per_metric(self):
        # Row a.pdf has no usable faithfulness but still counts for context_recall.
        sample_rows = [
            {"doc_name": "a.pdf", "faithfulness": float("nan"), "context_recall": 0.5},
            {"doc_name": "b.pdf", "faithfulness": 0.8, "context_recall": 0.9},
        ]
        means = weighted_metric_means(sample_rows, ["faithfulness", "context_recall"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.7, rel=1e-4)

    def test_missing_metric_column_returns_none(self):
        sample_rows = [{"doc_name": "a.pdf", "faithfulness": 0.8}]
        means = weighted_metric_means(sample_rows, ["faithfulness", "unknown_metric"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["unknown_metric"] is None

    def test_empty_rows_returns_none_for_all(self):
        means = weighted_metric_means([], ["faithfulness"], {})
        assert means["faithfulness"] is None
class TestComputeOverallWeightedScoreMean:
    """compute_overall_weighted_score_mean: run-level mean of per-sample weighted scores."""

    def test_basic_weighted_mean_of_weighted_scores(self):
        # Both samples score 0.5 under equal metric weights, so the mean is 0.5.
        sample_rows = [
            {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.0},
            {"doc_name": "b.pdf", "faithfulness": 0.5, "context_recall": 0.5},
        ]
        equal_weights = {"faithfulness": 1.0, "context_recall": 1.0}
        overall = compute_overall_weighted_score_mean(sample_rows, equal_weights, {})
        assert overall == pytest.approx(0.5, rel=1e-4)

    def test_doc_weight_amplifies_sample(self):
        # (9*1.0 + 1*0.0) / 10 = 0.9
        sample_rows = [
            {"doc_name": "important.pdf", "faithfulness": 1.0},
            {"doc_name": "other.pdf", "faithfulness": 0.0},
        ]
        weights_by_doc = {"important.pdf": 9.0, "other.pdf": 1.0}
        overall = compute_overall_weighted_score_mean(sample_rows, {}, weights_by_doc)
        assert overall == pytest.approx(0.9, rel=1e-4)

    def test_all_nan_returns_none(self):
        sample_rows = [{"doc_name": "a.pdf", "faithfulness": float("nan")}]
        overall = compute_overall_weighted_score_mean(sample_rows, {}, {})
        assert overall is None

View File

@@ -137,3 +137,104 @@ def test_apply_no_profiles_returns_empty(tmp_path):
_resolve_absolute=True, _resolve_absolute=True,
) )
assert patched == [] assert patched == []
def test_apply_metric_weights_patches_yaml(tmp_path):
    """Applying metric_weights writes them into the YAML."""
    import yaml as yaml_lib

    from webapp.services.yaml_patcher import apply_profiles_to_scenario

    scenario_file = tmp_path / "w-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
        encoding="utf-8",
    )

    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None, answer_profile=None, dataset_profile=None,
        metric_weights={"faithfulness": 0.7, "context_recall": 0.3},
        _resolve_absolute=True,
    )

    assert "metric_weights" in patched
    # Read back with the same encoding the file was written with instead of
    # relying on the platform default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert abs(data["metric_weights"]["faithfulness"] - 0.7) < 1e-9
def test_apply_doc_weights_patches_yaml(tmp_path):
    """Applying doc_weights writes them into the YAML."""
    import yaml as yaml_lib

    from webapp.services.yaml_patcher import apply_profiles_to_scenario

    scenario_file = tmp_path / "dw-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
        encoding="utf-8",
    )

    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None, answer_profile=None, dataset_profile=None,
        doc_weights={"doc.pdf": 2.0},
        _resolve_absolute=True,
    )

    assert "doc_weights" in patched
    # Read back with the same encoding the file was written with instead of
    # relying on the platform default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert abs(data["doc_weights"]["doc.pdf"] - 2.0) < 1e-9
# ---------------------------------------------------------------------------
# Connectivity test endpoint tests
# ---------------------------------------------------------------------------
from unittest.mock import MagicMock, patch
def test_probe_connectivity_success(client):
    """A probe whose completion call succeeds reports ok=True with a latency."""
    completion = MagicMock()
    completion.choices = [MagicMock()]
    probe_body = {
        "model": "test-model",
        "base_url": "http://x/v1",
        "api_key": "sk-test",
    }

    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        openai_cls.return_value.chat.completions.create.return_value = completion
        resp = client.post("/api/llm-profiles/probe", json=probe_body)

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["ok"] is True
    assert payload["latency_ms"] is not None
def test_probe_connectivity_failure(client):
    """A probe whose completion call raises still answers 200, with ok=False."""
    probe_body = {
        "model": "test-model",
        "base_url": "http://x/v1",
        "api_key": "sk-test",
    }

    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        create_call = openai_cls.return_value.chat.completions.create
        create_call.side_effect = Exception("connection refused")
        resp = client.post("/api/llm-profiles/probe", json=probe_body)

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["ok"] is False
    # The exception text is surfaced to the caller.
    assert "connection refused" in payload["message"]
def test_test_saved_profile_success(client):
    """Testing a profile that was saved first returns ok=True."""
    new_profile = {"name": "T", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
    created = client.post("/api/llm-profiles", json=new_profile)
    pid = created.json()["profile_id"]

    completion = MagicMock()
    completion.choices = [MagicMock()]
    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        openai_cls.return_value.chat.completions.create.return_value = completion
        resp = client.post(f"/api/llm-profiles/{pid}/test")

    assert resp.status_code == 200
    assert resp.json()["ok"] is True
def test_test_nonexistent_profile_returns_404(client):
    """Testing a profile id that was never saved yields HTTP 404."""
    unknown_profile_url = "/api/llm-profiles/nonexistent/test"
    outcome = client.post(unknown_profile_url)
    assert outcome.status_code == 404

View File

@@ -0,0 +1,327 @@
"""Tests for POST /api/score endpoint."""
from __future__ import annotations
import pytest
from pydantic import ValidationError
from webapp.models import ScoreRequest, ScoreResponse
class TestScoreRequest:
    """Validation, defaults and helper methods of the ScoreRequest model."""

    def test_minimal_valid_request(self):
        """Only required fields — question, answer, contexts."""
        required_only = {
            "question": "What is CT?",
            "answer": "CT is imaging.",
            "contexts": "CT uses X-rays.",
        }
        req = ScoreRequest(**required_only)
        assert req.question == "What is CT?"
        assert req.contexts == "CT uses X-rays."
        # Optional fields fall back to their defaults.
        assert req.ground_truth is None
        assert req.context_separator == " |||| "
        default_metrics = [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ]
        assert req.metrics == default_metrics

    def test_contexts_split_by_separator(self):
        """contexts_as_list() splits on context_separator."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="ctx1 |||| ctx2 |||| ctx3",
            context_separator=" |||| ",
        )
        pieces = req.contexts_as_list()
        assert pieces == ["ctx1", "ctx2", "ctx3"]

    def test_contexts_split_custom_separator(self):
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="a---b---c",
            context_separator="---",
        )
        pieces = req.contexts_as_list()
        assert pieces == ["a", "b", "c"]

    def test_contexts_split_single_item(self):
        # No separator in the text: a one-element list comes back.
        req = ScoreRequest(question="q", answer="a", contexts="only one")
        pieces = req.contexts_as_list()
        assert pieces == ["only one"]

    def test_missing_question_raises(self):
        with pytest.raises(ValidationError):
            ScoreRequest(answer="a", contexts="c")  # type: ignore[call-arg]

    def test_missing_answer_raises(self):
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", contexts="c")  # type: ignore[call-arg]

    def test_missing_contexts_raises(self):
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", answer="a")  # type: ignore[call-arg]

    def test_custom_metrics_accepted(self):
        chosen = ["faithfulness"]
        req = ScoreRequest(question="q", answer="a", contexts="c", metrics=chosen)
        assert req.metrics == ["faithfulness"]

    def test_invalid_metric_name_raises(self):
        bad_request = {
            "question": "q",
            "answer": "a",
            "contexts": "c",
            "metrics": ["not_a_metric"],
        }
        with pytest.raises(ValidationError):
            ScoreRequest(**bad_request)

    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
        """Without ground_truth, GT-dependent metrics are excluded."""
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            metrics=[
                "faithfulness",
                "context_recall",
                "factual_correctness",
                "semantic_similarity",
                "noise_sensitivity",
            ],
        )
        effective = req.effective_metrics()
        assert "faithfulness" in effective
        assert "context_recall" not in effective
        assert "factual_correctness" not in effective
        assert "semantic_similarity" not in effective
        assert "noise_sensitivity" not in effective

    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
        requested = ["faithfulness", "context_recall", "factual_correctness"]
        req = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            ground_truth="gt",
            metrics=requested,
        )
        effective = req.effective_metrics()
        # Same metrics, same order.
        assert effective == [
            "faithfulness",
            "context_recall",
            "factual_correctness",
        ]
class TestScoreResponse:
    """Shape of the ScoreResponse model."""

    def test_score_response_structure(self):
        # A None score is allowed alongside numeric ones.
        per_metric = {"faithfulness": 0.85, "answer_relevancy": None}
        resp = ScoreResponse(scores=per_metric, weighted_score=0.85, latency_ms=1200)
        assert resp.scores["faithfulness"] == 0.85
        assert resp.scores["answer_relevancy"] is None
        assert resp.latency_ms == 1200
class TestInlineScorer:
    """InlineScorer.score with the model builders and metric pipeline mocked out."""

    @staticmethod
    def _score_with_mocked_pipeline(pipeline_metrics, **score_kwargs):
        """Run InlineScorer.score against a pipeline that yields ``pipeline_metrics``.

        ``build_models``, ``MetricPipeline`` and ``_build_metric_instances`` in
        ``webapp.services.inline_scorer`` are patched, so no model is contacted.
        ``score_kwargs`` are forwarded to ``score`` together with
        ``ground_truth=None`` and settings that ignore any .env file.
        Returns whatever ``score`` returns.
        """
        from unittest.mock import AsyncMock, MagicMock, patch

        from rag_eval.settings import EvaluationSettings
        from webapp.services.inline_scorer import InlineScorer

        fake_score = MagicMock()
        fake_score.metrics = pipeline_metrics
        fake_score.error = ""
        fake_pipeline = MagicMock()
        fake_pipeline.score_sample = AsyncMock(return_value=fake_score)

        module = "webapp.services.inline_scorer"
        with patch(f"{module}.build_models", return_value=(MagicMock(), MagicMock())), \
                patch(f"{module}.MetricPipeline", return_value=fake_pipeline), \
                patch(f"{module}._build_metric_instances", return_value={}):
            scorer = InlineScorer()
            return scorer.score(
                ground_truth=None,
                settings=EvaluationSettings(_env_file=None),
                **score_kwargs,
            )

    def test_score_returns_dict_with_requested_metrics(self):
        """InlineScorer.score returns a dict keyed by the requested metrics."""
        result = self._score_with_mocked_pipeline(
            {"faithfulness": 0.9, "answer_relevancy": 0.8},
            question="q", answer="a",
            contexts=["ctx1"],
            metrics=["faithfulness", "answer_relevancy"],
            judge_model="test-model",
            embedding_model="test-embed",
        )
        assert "faithfulness" in result
        assert "answer_relevancy" in result
        assert result["faithfulness"] == pytest.approx(0.9)

    def test_score_converts_nan_to_none(self):
        """NaN scores are converted to None in the returned dict."""
        result = self._score_with_mocked_pipeline(
            {"faithfulness": float("nan")},
            question="q", answer="a", contexts=["c"],
            metrics=["faithfulness"],
            judge_model="m", embedding_model="e",
        )
        assert result["faithfulness"] is None
# ── Endpoint integration tests ────────────────────────────────────────────────
@pytest.fixture()
def client(monkeypatch):
    """Yield-free fixture: a TestClient whose score module uses a stubbed scorer.

    The stub's ``score`` always returns fixed faithfulness and
    answer_relevancy values.
    """
    from unittest.mock import MagicMock

    import webapp.api.score as score_mod
    from webapp.server import create_app

    canned_scores = {
        "faithfulness": 0.85,
        "answer_relevancy": 0.90,
    }
    stub_scorer = MagicMock()
    stub_scorer.score.return_value = canned_scores
    # Swap the module-level scorer before the app is built.
    monkeypatch.setattr(score_mod, "inline_scorer", stub_scorer)
    app = create_app()
    return TestClient(app)
from fastapi.testclient import TestClient
class TestScoreEndpoint:
    """HTTP-level behaviour of POST /api/score with the scorer stubbed out."""

    def test_post_score_returns_200(self, client):
        body = {
            "question": "What is CT?",
            "answer": "CT is imaging.",
            "contexts": "CT uses X-rays.",
        }
        resp = client.post("/api/score", json=body)
        assert resp.status_code == 200
        payload = resp.json()
        assert "scores" in payload
        assert "latency_ms" in payload
        assert payload["scores"]["faithfulness"] == pytest.approx(0.85)

    def test_weighted_score_computed(self, client):
        body = {"question": "q", "answer": "a", "contexts": "c"}
        resp = client.post("/api/score", json=body)
        assert resp.status_code == 200
        payload = resp.json()
        assert payload["weighted_score"] is not None

    def test_missing_required_fields_returns_422(self, client):
        # answer and contexts are absent.
        incomplete = {"question": "q"}
        resp = client.post("/api/score", json=incomplete)
        assert resp.status_code == 422

    def test_invalid_metric_name_returns_422(self, client):
        body = {
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["not_a_metric"],
        }
        resp = client.post("/api/score", json=body)
        assert resp.status_code == 422

    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
        body = {
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["faithfulness", "context_recall"],
        }
        resp = client.post("/api/score", json=body)
        assert resp.status_code == 200
        payload = resp.json()
        assert "context_recall" in payload["skipped_metrics"]

    def test_contexts_split_on_separator(self, monkeypatch):
        """contexts string is split before passing to scorer."""
        from unittest.mock import MagicMock

        from fastapi.testclient import TestClient

        import webapp.api.score as score_mod
        from webapp.server import create_app

        seen_contexts = []

        def record_contexts(**kwargs):
            # Remember what the endpoint passed as ``contexts``.
            seen_contexts.append(kwargs.get("contexts", []))
            return {"faithfulness": 0.9}

        recording_scorer = MagicMock()
        recording_scorer.score.side_effect = record_contexts
        monkeypatch.setattr(score_mod, "inline_scorer", recording_scorer)

        tc = TestClient(create_app())
        tc.post("/api/score", json={
            "question": "q", "answer": "a",
            "contexts": "ctx1 |||| ctx2",
            "context_separator": " |||| ",
        })

        assert len(seen_contexts) == 1
        assert seen_contexts[0] == ["ctx1", "ctx2"]

    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
        """When SCORE_API_TOKEN is set, requests without token get 401."""
        from unittest.mock import MagicMock

        from fastapi.testclient import TestClient

        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        from webapp.server import create_app

        token_settings = EvaluationSettings(_env_file=None)
        # object.__setattr__ sets the field without going through the model's own setter.
        object.__setattr__(token_settings, "score_api_token", "secret-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: token_settings)

        stub_scorer = MagicMock()
        stub_scorer.score.return_value = {"faithfulness": 0.9}
        monkeypatch.setattr(score_mod, "inline_scorer", stub_scorer)

        tc = TestClient(create_app())
        body = {"question": "q", "answer": "a", "contexts": "c"}

        # No auth header -> 401
        anonymous = tc.post("/api/score", json=body)
        assert anonymous.status_code == 401

        # Correct token -> 200
        authorised = tc.post(
            "/api/score",
            json=body,
            headers={"Authorization": "Bearer secret-token"},
        )
        assert authorised.status_code == 200

    def test_wrong_bearer_token_returns_401(self, monkeypatch):
        from unittest.mock import MagicMock

        from fastapi.testclient import TestClient

        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        from webapp.server import create_app

        token_settings = EvaluationSettings(_env_file=None)
        object.__setattr__(token_settings, "score_api_token", "correct-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: token_settings)

        stub_scorer = MagicMock()
        stub_scorer.score.return_value = {}
        monkeypatch.setattr(score_mod, "inline_scorer", stub_scorer)

        tc = TestClient(create_app())
        rejected = tc.post(
            "/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer wrong-token"},
        )
        assert rejected.status_code == 401

View File

@@ -2,13 +2,18 @@
from __future__ import annotations from __future__ import annotations
import time
from fastapi import APIRouter, HTTPException from fastapi import APIRouter, HTTPException
from openai import OpenAI
from webapp.models import ( from webapp.models import (
CreateProfileRequest, CreateProfileRequest,
LLMProfile, LLMProfile,
ProfileApplyRequest, ProfileApplyRequest,
ProfileApplyResponse, ProfileApplyResponse,
ProfileProbeRequest,
ProfileTestResponse,
) )
from webapp.services.profile_manager import profile_manager from webapp.services.profile_manager import profile_manager
from webapp.services.yaml_patcher import apply_profiles_to_scenario from webapp.services.yaml_patcher import apply_profiles_to_scenario
@@ -16,6 +21,43 @@ from webapp.services.yaml_patcher import apply_profiles_to_scenario
router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"]) router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"])
def _do_connectivity_test(
    model: str,
    base_url: str,
    api_key: str,
    timeout_seconds: int,
) -> ProfileTestResponse:
    """Send a minimal chat completion request and return the test result.

    Args:
        model: Model name to put in the probe request.
        base_url: Endpoint base URL; a trailing "/" is stripped before use.
        api_key: API key handed to the client.
        timeout_seconds: Client timeout, converted to float seconds.

    Returns:
        ProfileTestResponse with ok=True on success, or ok=False and the
        stringified exception on any failure. latency_ms is the elapsed
        wall time in both cases. This function does not raise for probe
        failures.
    """
    t0 = time.monotonic()
    try:
        # The client is created inside the try block so that a failure while
        # building it is reported as a failed probe rather than propagating.
        # The context manager closes the client's HTTP resources on exit, so
        # repeated probes do not accumulate open connection pools.
        with OpenAI(
            api_key=api_key,
            base_url=base_url.rstrip("/"),
            timeout=float(timeout_seconds),
        ) as client:
            client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "hi"}],
                max_tokens=1,  # keep the probe as cheap as possible
            )
            latency_ms = int((time.monotonic() - t0) * 1000)
        return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms)
    except Exception as exc:  # noqa: BLE001
        # Best-effort probe: every error becomes a structured failure result.
        latency_ms = int((time.monotonic() - t0) * 1000)
        return ProfileTestResponse(ok=False, message=str(exc), latency_ms=latency_ms)
@router.post("/probe", response_model=ProfileTestResponse, tags=["llm-profiles"])
def probe_connectivity(request: ProfileProbeRequest) -> ProfileTestResponse:
    """Run the connectivity probe with credentials supplied in the request body.

    No saved profile is looked up; the four request fields are forwarded
    unchanged to the shared probe helper.
    """
    inline_credentials = {
        "model": request.model,
        "base_url": request.base_url,
        "api_key": request.api_key,
        "timeout_seconds": request.timeout_seconds,
    }
    return _do_connectivity_test(**inline_credentials)
@router.get("", response_model=dict) @router.get("", response_model=dict)
def list_profiles() -> dict: def list_profiles() -> dict:
"""Return all saved LLM profiles.""" """Return all saved LLM profiles."""
@@ -59,6 +101,20 @@ def delete_profile(profile_id: str) -> dict:
return {"deleted": True} return {"deleted": True}
@router.post("/{profile_id}/test", response_model=ProfileTestResponse)
def test_profile(profile_id: str) -> ProfileTestResponse:
    """Run the connectivity probe using the credentials of a saved profile.

    Raises:
        HTTPException: 404 when no profile with ``profile_id`` exists.
    """
    saved = profile_manager.get(profile_id)
    if saved is not None:
        return _do_connectivity_test(
            saved.model,
            saved.base_url,
            saved.api_key,
            saved.timeout_seconds,
        )
    raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
@router.post("/apply", response_model=ProfileApplyResponse) @router.post("/apply", response_model=ProfileApplyResponse)
def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse: def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
"""Patch selected LLM profiles into the target scenario YAML file.""" """Patch selected LLM profiles into the target scenario YAML file."""
@@ -89,6 +145,8 @@ def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
judge_profile=role_profiles["judge"], judge_profile=role_profiles["judge"],
answer_profile=role_profiles["answer"], answer_profile=role_profiles["answer"],
dataset_profile=role_profiles["dataset"], dataset_profile=role_profiles["dataset"],
metric_weights=request.metric_weights,
doc_weights=request.doc_weights,
) )
return ProfileApplyResponse( return ProfileApplyResponse(
scenario_path=request.scenario_path, scenario_path=request.scenario_path,

105
webapp/api/score.py Normal file
View File

@@ -0,0 +1,105 @@
"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
from __future__ import annotations
import time
from typing import Annotated
from fastapi import APIRouter, Header, HTTPException
from rag_eval.metrics.weights import compute_weighted_score
from rag_eval.settings import EvaluationSettings
from webapp.models import ScoreRequest, ScoreResponse
from webapp.services.inline_scorer import inline_scorer
router = APIRouter(prefix="/api/score", tags=["score"])
def _get_settings() -> EvaluationSettings:
    """Build and return a new EvaluationSettings on every call.

    Kept as a module-level function so tests can monkeypatch it.
    """
    fresh_settings = EvaluationSettings()
    return fresh_settings
def _check_auth(authorization: str | None, token: str) -> None:
    """Raise 401 unless the Authorization header carries the configured Bearer token.

    Args:
        authorization: Raw ``Authorization`` header value, or None if absent.
        token: The configured shared secret the caller must present.

    Raises:
        HTTPException: 401 when the header is missing, is not of the form
            ``Bearer <token>`` (scheme matched case-insensitively), or the
            presented token differs from ``token``.
    """
    import hmac

    if authorization is None:
        raise HTTPException(status_code=401, detail="Missing Authorization header.")

    scheme, separator, presented = authorization.partition(" ")
    well_formed = separator == " " and scheme.lower() == "bearer"
    # Compare as bytes with a constant-time routine: a plain `!=` can leak how
    # many leading characters matched, and compare_digest rejects non-ASCII str.
    token_matches = hmac.compare_digest(
        presented.encode("utf-8"), token.encode("utf-8")
    )
    if not (well_formed and token_matches):
        raise HTTPException(status_code=401, detail="Invalid Bearer token.")
@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分Dify 外部 Tool",
    responses={
        200: {"description": "各指标得分和加权综合得分。"},
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败。"},
    },
)
def score_sample(
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
) -> ScoreResponse:
    """Score a single QA sample synchronously and return per-metric results.

    Scoring failures are reported through the ``error`` field of the
    response rather than raised; only the auth check raises (401).
    """
    settings = _get_settings()

    # Bearer auth is enforced only when a shared token has been configured.
    expected_token = settings.score_api_token
    if expected_token:
        _check_auth(authorization, expected_token)

    # Per-request model names win; otherwise fall back to the settings values.
    judge_model = request.judge_model or settings.ragas_judge_model
    embedding_model = request.embedding_model or settings.ragas_embedding_model

    runnable = request.effective_metrics()
    skipped = sorted(set(request.metrics).difference(runnable))
    # Every requested metric starts out as null so skipped ones stay visible.
    null_scores: dict[str, float | None] = dict.fromkeys(request.metrics)

    if not runnable:
        return ScoreResponse(
            scores=null_scores,
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=skipped,
        )

    started = time.monotonic()

    def elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    try:
        raw_scores = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=runnable,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=settings,
        )
    except Exception as exc:  # noqa: BLE001
        return ScoreResponse(
            scores={},
            weighted_score=None,
            latency_ms=elapsed_ms(),
            skipped_metrics=skipped,
            error=f"{type(exc).__name__}: {exc}",
        )
    latency_ms = elapsed_ms()

    merged_scores: dict[str, float | None] = {**null_scores, **raw_scores}
    # Only non-null scores take part in the weighted score; weights are empty.
    present_scores = {
        key: value for key, value in raw_scores.items() if value is not None
    }
    weighted = compute_weighted_score(present_scores, {})
    rounded_weighted = None if weighted is None else round(weighted, 4)

    return ScoreResponse(
        scores=merged_scores,
        weighted_score=rounded_weighted,
        latency_ms=latency_ms,
        skipped_metrics=skipped,
    )

View File

@@ -5,7 +5,7 @@ from __future__ import annotations
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Any from typing import Any
from pydantic import BaseModel, Field from pydantic import BaseModel, ConfigDict, Field, field_validator
def _utcnow_iso() -> str: def _utcnow_iso() -> str:
@@ -74,6 +74,18 @@ class ReportData(BaseModel):
lowest_samples: list[SampleScore] = Field(default_factory=list) lowest_samples: list[SampleScore] = Field(default_factory=list)
summary_markdown: str = "" summary_markdown: str = ""
advice_markdown: str = "" # optimization_advice.md content (empty if not generated) advice_markdown: str = "" # optimization_advice.md content (empty if not generated)
weighted_score_mean: float | None = Field(
default=None,
description="加权综合得分均值metric_weights × doc_weights 共同作用)。",
)
metric_weights: dict[str, float] = Field(
default_factory=dict,
description="该次运行使用的指标权重配置(来自 scenario.snapshot.yaml",
)
doc_weights: dict[str, float] = Field(
default_factory=dict,
description="该次运行使用的文档权重配置(来自 scenario.snapshot.yaml",
)
class RunDetail(BaseModel): class RunDetail(BaseModel):
@@ -93,6 +105,14 @@ class ScenarioInfo(BaseModel):
judge_model: str = "" judge_model: str = ""
metrics: list[str] = Field(default_factory=list) metrics: list[str] = Field(default_factory=list)
error: str = "" error: str = ""
metric_weights: dict[str, float] = Field(
default_factory=dict,
description="从场景 YAML 读取的指标权重配置,供前端权重面板预填。",
)
doc_weights: dict[str, float] = Field(
default_factory=dict,
description="从场景 YAML 读取的文档权重配置,供前端权重面板预填。",
)
class TaskStatus(BaseModel): class TaskStatus(BaseModel):
@@ -150,6 +170,14 @@ class ProfileApplyRequest(BaseModel):
judge_profile_id: str | None = None judge_profile_id: str | None = None
answer_profile_id: str | None = None answer_profile_id: str | None = None
dataset_profile_id: str | None = None dataset_profile_id: str | None = None
metric_weights: dict[str, float] | None = Field(
default=None,
description="指标权重映射,如 {\"faithfulness\": 0.35}。为 null 时不修改 YAML。",
)
doc_weights: dict[str, float] | None = Field(
default=None,
description="文档权重映射,如 {\"doc.pdf\": 2.0}。为 null 时不修改 YAML。",
)
class ProfileApplyResponse(BaseModel): class ProfileApplyResponse(BaseModel):
@@ -159,6 +187,23 @@ class ProfileApplyResponse(BaseModel):
patched_fields: list[str] = Field(default_factory=list) patched_fields: list[str] = Field(default_factory=list)
class ProfileProbeRequest(BaseModel):
    """Inline credentials for testing LLM connectivity without saving a profile."""

    # Model name placed in the probe chat-completion request.
    model: str
    # Endpoint base URL; the probe helper strips a trailing "/" before use.
    base_url: str
    # API key handed to the client for the probe.
    api_key: str
    # Client timeout in seconds; defaults to 30 when omitted.
    timeout_seconds: int = 30
class ProfileTestResponse(BaseModel):
    """Result of a LLM connectivity test."""

    # True when the probe request completed without raising.
    ok: bool
    # Fixed success text, or the stringified exception when the probe failed.
    message: str
    # Elapsed probe time in milliseconds; optional, defaults to None.
    latency_ms: int | None = None
def jsonable(value: Any) -> Any: def jsonable(value: Any) -> Any:
"""Convert NaN/inf floats into None so the payload stays valid JSON.""" """Convert NaN/inf floats into None so the payload stays valid JSON."""
import math import math
@@ -172,3 +217,288 @@ def jsonable(value: Any) -> Any:
if isinstance(value, list): if isinstance(value, list):
return [jsonable(item) for item in value] return [jsonable(item) for item in value]
return value return value
# ---------------------------------------------------------------------------
# Full pipeline (build + eval) job models
# ---------------------------------------------------------------------------
class PipelineJobRequest(BaseModel):
    """Request body for launching an end-to-end build + evaluation pipeline job."""

    # Two example payloads attached to the model's JSON schema: a full-parameter
    # run and a reduced smoke-test run.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "summary": "西门子 CT 文档评估(完整参数)",
                    "value": {
                        "docs_path": "datasets/siemens-pdfs",
                        "job_name": "siemens-ct-eval-2026",
                        "generation_model": "qwen3.6-plus",
                        "answer_model": "deepseek-v4-flash",
                        "judge_model": "deepseek-v4-flash",
                        "embedding_model": "text-embedding-v3",
                        "max_questions_per_document": 10,
                        "max_source_chunks_per_question": 3,
                        "max_documents": None,
                        "max_samples": None,
                        "metrics": [
                            "faithfulness",
                            "answer_relevancy",
                            "context_recall",
                            "context_precision",
                        ],
                        "optimization_advisor": False,
                        "failure_mode": "skip",
                    },
                },
                {
                    "summary": "快速冒烟测试(仅 2 份文档、5 道题)",
                    "value": {
                        "docs_path": "datasets/siemens-pdfs",
                        "job_name": "smoke-test",
                        "generation_model": "qwen3.6-plus",
                        "answer_model": "deepseek-v4-flash",
                        "judge_model": "deepseek-v4-flash",
                        "embedding_model": "text-embedding-v3",
                        "max_questions_per_document": 5,
                        "max_source_chunks_per_question": 3,
                        "max_documents": 2,
                        "max_samples": 10,
                        "metrics": ["faithfulness", "answer_relevancy"],
                        "optimization_advisor": False,
                        "failure_mode": "skip",
                    },
                },
            ]
        }
    )

    # Only required field: folder containing the PDF documents.
    docs_path: str = Field(
        description="PDF 文档所在文件夹的绝对路径或相对于仓库根目录的相对路径。"
    )
    # Display name; defaults to the empty string.
    job_name: str = Field(
        default="",
        description="任务显示名称;留空时系统自动生成唯一标识。",
    )
    # Model names for the question-generation, answering, judging and
    # embedding roles, each with a default.
    generation_model: str = Field(
        default="qwen3.6-plus",
        description="用于从文档片段生成草稿题库的 LLM 模型名称。",
    )
    answer_model: str = Field(
        default="deepseek-v4-flash",
        description="在线评估时调用的答题 LLM 模型名称siemens_pdf_qa adapter",
    )
    judge_model: str = Field(
        default="deepseek-v4-flash",
        description="RAGAS 指标评分时使用的 Judge LLM 模型名称。",
    )
    embedding_model: str = Field(
        default="text-embedding-v3",
        description="RAGAS context-recall / context-precision 使用的 Embedding 模型名称。",
    )
    # Size limits; gt=0 means any supplied value must be strictly positive.
    max_questions_per_document: int = Field(
        default=10, gt=0,
        description="每份 PDF 文档最多生成的草稿题目数量。",
    )
    max_source_chunks_per_question: int = Field(
        default=3, gt=0,
        description="每道题目最多引用的文档片段source chunk数量。",
    )
    # Optional caps; None (the default) leaves the corresponding count unlimited
    # as far as this model is concerned.
    max_documents: int | None = Field(
        default=None, gt=0,
        description="限制处理的 PDF 文件数量上限(冒烟测试时使用)。",
    )
    max_samples: int | None = Field(
        default=None, gt=0,
        description="限制评估的题目数量上限(冒烟测试时使用)。",
    )
    # default_factory builds a fresh list per instance, so the default is never shared.
    # NOTE(review): metric names are not validated here, unlike ScoreRequest — confirm intended.
    metrics: list[str] = Field(
        default_factory=lambda: [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ],
        description=(
            "需要计算的 RAGAS 指标列表。"
            "可选值faithfulness, answer_relevancy, context_recall, "
            "context_precision, noise_sensitivity, factual_correctness, semantic_similarity。"
        ),
    )
    optimization_advisor: bool = Field(
        default=False,
        description="为 True 时启用 RAGAS 优化建议模块,生成 optimization_advice.md。",
    )
    # Plain string; the description names "skip" and "fail" but no validation
    # is applied in this model.
    failure_mode: str = Field(
        default="skip",
        description="PDF 解析失败时的处理策略skip跳过继续或 fail立即中止",
    )
class PipelineResult(BaseModel):
    """Artifact locations and statistics for a completed pipeline run."""

    # Build-stage outputs. All fields in this model are required (no defaults).
    build_artifact_dir: str = Field(description="题库生成阶段的产物根目录路径。")
    dataset_csv: str = Field(description="生成的草稿题库 CSV 文件路径(评估输入)。")
    source_chunks_jsonl: str = Field(description="文档片段索引文件路径(在线评估 adapter 使用)。")
    # Build-stage counters.
    total_questions: int = Field(description="成功生成的有效题目总数。")
    parse_failures: int = Field(description="文档解析失败的 PDF 数量。")
    # Evaluation-stage outputs.
    eval_run_id: str = Field(description="RAGAS 评估运行 ID。")
    eval_output_dir: str = Field(description="RAGAS 评估产物根目录路径。")
    scores_csv: str = Field(description="每道题目逐项评分的 CSV 文件路径。")
    summary_md: str = Field(description="评估结果摘要 Markdown 文件路径。")
class PipelineJobStatus(BaseModel):
    """State of one end-to-end pipeline job."""

    # Identity and coarse state; these three fields are required.
    job_id: str = Field(description="任务唯一标识符。")
    job_name: str = Field(description="任务显示名称。")
    # Free-form string; the allowed values are listed only in the description.
    status: str = Field(description="任务状态queued | running | completed | failed。")
    # Current stage; starts as "idle".
    phase: str = Field(default="idle", description="当前执行阶段idle | parsing_documents | generating_questions | evaluating | done。")
    logs: list[str] = Field(default_factory=list, description="实时日志行列表。")
    # Both default to None.
    result: PipelineResult | None = Field(default=None, description="任务完成后填充的产物路径与统计信息。")
    error: str | None = Field(default=None, description="失败时的错误信息。")
    # Timestamps are plain strings defaulting to "".
    created_at: str = Field(default="", description="任务创建时间ISO 8601 UTC")
    finished_at: str = Field(default="", description="任务结束时间ISO 8601 UTC")
class PipelineJobResponse(BaseModel):
    """Immediate response returned after a pipeline job is queued."""

    # Identifier the caller uses to poll the job afterwards.
    job_id: str = Field(description="任务唯一标识符,用于后续轮询状态。")
    job_name: str = Field(description="任务显示名称。")
    # Defaults to "queued".
    status: str = Field(default="queued", description="初始状态,通常为 queued。")
# ---------------------------------------------------------------------------
# Dify real-time scoring API models
# ---------------------------------------------------------------------------

# Metrics that can only be computed when a ground_truth is supplied;
# ScoreRequest.effective_metrics() drops these when it is absent.
_GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
    "context_recall",
    "factual_correctness",
    "semantic_similarity",
    "noise_sensitivity",
})

# Every metric name accepted by the ScoreRequest.metrics validator.
_VALID_METRICS: frozenset[str] = frozenset({
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
    "noise_sensitivity",
    "factual_correctness",
    "semantic_similarity",
})

# Metrics used when a ScoreRequest omits the `metrics` field (copied per request).
_DEFAULT_SCORE_METRICS: list[str] = [
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
]
class ScoreRequest(BaseModel):
    """Request body for the real-time single-sample scoring endpoint.

    Besides the raw fields, the model offers two helpers used by the route:
    ``contexts_as_list()`` splits the joined contexts string, and
    ``effective_metrics()`` removes metrics that cannot run without a
    ground truth.
    """

    # Example payload attached to the model's JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "summary": "基础评分请求",
                    "value": {
                        "question": "双源CT的时间分辨率是多少?",
                        "answer": "双源CT的单扇区时间分辨率为75ms。",
                        "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
                        "ground_truth": "双源CT单扇区时间分辨率为75ms需旋转135度。",
                        "context_separator": " |||| ",
                        "metrics": [
                            "faithfulness",
                            "answer_relevancy",
                            "context_recall",
                            "context_precision",
                        ],
                        "judge_model": "deepseek-v4-flash",
                        "embedding_model": "text-embedding-v3",
                    },
                }
            ]
        }
    )

    question: str = Field(description="问题文本。")
    answer: str = Field(description="待评分的回答。")
    contexts: str = Field(
        description="检索上下文字符串,多段之间用 context_separator 拼接。"
    )
    ground_truth: str | None = Field(
        default=None,
        description="标准参考答案(可选)。缺失时自动跳过需要它的指标。",
    )
    context_separator: str = Field(
        default=" |||| ",
        description="contexts 字段中段落分隔符,默认为四个竖线两侧各一空格。",
    )
    metrics: list[str] = Field(
        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
        description="需要计算的 RAGAS 指标列表。",
    )
    judge_model: str | None = Field(
        default=None,
        description="Judge LLM 模型名称;为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
    )
    embedding_model: str | None = Field(
        default=None,
        description="Embedding 模型名称;为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
    )

    @field_validator("metrics")
    @classmethod
    def validate_metric_names(cls, value: list[str]) -> list[str]:
        """Reject unknown metric names and empty metric lists.

        Raises:
            ValueError: If any name is outside ``_VALID_METRICS`` or the
                list is empty.
        """
        invalid = [metric_name for metric_name in value if metric_name not in _VALID_METRICS]
        if invalid:
            raise ValueError(
                f"不支持的指标名称:{invalid}"
                f"合法值:{sorted(_VALID_METRICS)}"
            )
        if not value:
            raise ValueError("metrics 不能为空列表。")
        return value

    def contexts_as_list(self) -> list[str]:
        """Split the contexts string into stripped, non-empty fragments."""
        # An empty separator would make str.split raise, so fall back to the default.
        separator = self.context_separator or " |||| "
        return [part.strip() for part in self.contexts.split(separator) if part.strip()]

    def effective_metrics(self) -> list[str]:
        """Return the requested metrics that can actually be computed.

        When no usable ground truth is present, metrics listed in
        ``_GT_DEPENDENT_METRICS`` are removed; order is preserved.
        """
        # A blank or whitespace-only ground_truth carries no reference text,
        # so it is treated the same as a missing one. Callers that send ""
        # for an unset variable then get those metrics skipped instead of
        # scored against an empty reference.
        has_ground_truth = self.ground_truth is not None and self.ground_truth.strip() != ""
        if has_ground_truth:
            return list(self.metrics)
        return [metric_name for metric_name in self.metrics if metric_name not in _GT_DEPENDENT_METRICS]
class ScoreResponse(BaseModel):
    """Response payload for the real-time scoring endpoint."""

    # Metric name -> score, with None for metrics that produced no value.
    # Required field; the route passes {} when scoring raised.
    scores: dict[str, float | None] = Field(
        description="各指标得分NaN 或计算失败时为 null"
    )
    # Aggregate over the non-null scores; None when it could not be computed.
    weighted_score: float | None = Field(
        default=None,
        description="等权加权综合得分(仅对非 null 指标求均值)。",
    )
    # Required; the route reports 0 when no metric was runnable.
    latency_ms: int = Field(description="服务端打分耗时(毫秒)。")
    # Requested metrics dropped before scoring.
    skipped_metrics: list[str] = Field(
        default_factory=list,
        description="因缺少 ground_truth 而跳过的指标名称列表。",
    )
    # Set to "<ExceptionType>: <message>" by the route when scoring raised.
    error: str | None = Field(
        default=None,
        description="打分异常时的错误信息HTTP 200 仍返回scores 为空)。",
    )

View File

@@ -13,23 +13,95 @@ from fastapi import FastAPI
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from webapp.api import evaluations, llm_profiles, runs, scenarios from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
STATIC_DIR = Path(__file__).resolve().parent / "static" STATIC_DIR = Path(__file__).resolve().parent / "static"
# OpenAPI tag metadata — controls the grouping and descriptions in /docs.
# Passed to FastAPI(openapi_tags=...) in create_app(). Each entry's "name"
# is meant to match a tag used by a router or route; descriptions are Markdown.
OPENAPI_TAGS = [
    {
        "name": "pipeline",
        "description": (
            "**全链路评估 Pipeline API**\n\n"
            "一次调用完成「解析文档 → 生成题库 → RAGAS 评估 → 输出报告」全流程。\n\n"
            "**使用流程**\n"
            "1. `POST /api/pipeline/jobs` 提交任务,立即拿到 `job_id`。\n"
            "2. `GET /api/pipeline/jobs/{job_id}` 轮询 `status` / `phase` / `logs`。\n"
            "3. 当 `status=completed` 时,`result` 字段包含所有产物路径。\n\n"
            "**Pipeline 阶段**\n"
            "| phase | 说明 |\n"
            "|-------|------|\n"
            "| `parsing_documents` | 调用阿里云 DocMind 解析每份 PDF |\n"
            "| `generating_questions` | LLM 从文档片段生成草稿题库 |\n"
            "| `evaluating` | RAGAS 在线评测打分 |\n"
            "| `done` | 所有产物写入磁盘,任务完成 |"
        ),
    },
    {
        "name": "evaluations",
        "description": (
            "**单场景评估 API**\n\n"
            "基于已有 YAML 场景文件触发评估任务,并查询任务状态与日志。"
        ),
    },
    {
        "name": "llm-profiles",
        "description": (
            "**LLM 配置管理 API**\n\n"
            "增删改查已保存的 LLM 连接配置模型名称、Base URL、API Key"
            "支持连通性测试;可将配置一键写入场景 YAML 文件。"
        ),
    },
    {
        "name": "runs",
        "description": "**评估运行列表 API**\n\n查询历史评估运行记录及详细报告数据。",
    },
    {
        "name": "scenarios",
        "description": "**场景文件 API**\n\n扫描并列出 `scenarios/` 目录下所有可用的 YAML 场景文件。",
    },
    {
        "name": "score",
        "description": (
            "**实时评分 APIDify 外部 Tool**\n\n"
            "接受单条问答记录 `(question, answer, contexts, ground_truth)`\n"
            "同步运行 RAGAS 指标打分,返回各指标得分和加权综合得分。\n\n"
            "适用场景Dify Agent 在回答后即时调用,用于质量监控或自我改进。\n\n"
            "**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
            "`Authorization: Bearer <token>` 请求头。"
        ),
    },
    {
        # Tag used by the /api/health route defined inside create_app().
        "name": "meta",
        "description": "**系统 API**\n\n健康检查等基础接口。",
    },
]
def create_app() -> FastAPI: def create_app() -> FastAPI:
"""Build and configure the FastAPI application instance.""" """Build and configure the FastAPI application instance."""
app = FastAPI( app = FastAPI(
title="Siemens RAGAS 评估控制台", title="RAGAS 评估系统",
description="RAGAS 评估子系统的可视化报告与评估触发控制台。", description=(
version="0.1.0", "西门子医疗影像 RAG 评估平台 API 文档。\n\n"
"提供以下能力:\n"
"- **Pipeline API** — 一键完成「解析文档 → 生成题库 → RAGAS 评估」全链路\n"
"- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
"- **评估 API** — 基于 YAML 场景文件触发单次评估\n"
"- **LLM 配置 API** — 管理多个 LLM 连接配置,支持连通性测试\n"
"- **报告 API** — 查询历史运行记录与评估报告\n\n"
"> **快速开始**:调用 `POST /api/pipeline/jobs` 传入 PDF 文件夹路径即可启动完整评估流程。"
),
version="0.2.0",
openapi_tags=OPENAPI_TAGS,
) )
app.include_router(runs.router) app.include_router(runs.router)
app.include_router(scenarios.router) app.include_router(scenarios.router)
app.include_router(evaluations.router) app.include_router(evaluations.router)
app.include_router(llm_profiles.router) app.include_router(llm_profiles.router)
app.include_router(pipeline.router)
app.include_router(score.router)
@app.get("/api/health", tags=["meta"]) @app.get("/api/health", tags=["meta"])
def health() -> dict[str, str]: def health() -> dict[str, str]:

View File

@@ -0,0 +1,109 @@
"""LLM-cached inline RAGAS scorer for the real-time /api/score endpoint.
A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by
(judge_model, embedding_model), so repeated Dify Tool calls with the same
models reuse existing AsyncOpenAI connections instead of creating new ones.
"""
from __future__ import annotations
import asyncio
import math
import threading
from typing import Any
from rag_eval.compat import ensure_ragas_import_compat
from rag_eval.metrics.factory import build_models
from rag_eval.metrics.pipeline import MetricPipeline
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import NormalizedSample
ensure_ragas_import_compat()
from ragas.metrics.collections import ( # noqa: E402
AnswerRelevancy,
ContextPrecision,
ContextRecall,
FactualCorrectness,
Faithfulness,
NoiseSensitivity,
SemanticSimilarity,
)
def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]:
    """Instantiate only the RAGAS metric objects requested.

    Args:
        metrics: Metric names to build; names without a registered factory
            are ignored.
        llm: Judge LLM client handed to the LLM-based metrics.
        embeddings: Embedding client handed to the embedding-based metrics.

    Returns:
        Mapping of metric name to a newly constructed metric object, in the
        order the names appear in ``metrics``.
    """
    # Factories rather than instances: a metric's constructor runs only when
    # that metric was actually requested, so an unrequested metric can neither
    # cost construction time nor fail the call.
    factories: dict[str, Any] = {
        "faithfulness": lambda: Faithfulness(llm=llm),
        "answer_relevancy": lambda: AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": lambda: ContextRecall(llm=llm),
        "context_precision": lambda: ContextPrecision(llm=llm),
        "noise_sensitivity": lambda: NoiseSensitivity(llm=llm),
        "factual_correctness": lambda: FactualCorrectness(llm=llm),
        "semantic_similarity": lambda: SemanticSimilarity(embeddings=embeddings),
    }
    return {name: factories[name]() for name in metrics if name in factories}
class InlineScorer:
    """Single-sample RAGAS scorer that caches LLM/embedding clients.

    The client cache is guarded by a lock; one (llm, embeddings) pair is kept
    per (judge_model, embedding_model) combination.
    """

    def __init__(self) -> None:
        """Initialize the scorer cache and synchronization primitives."""
        # Cache keyed by (judge_model, embedding_model) -> (llm, embeddings)
        self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
        self._lock = threading.Lock()

    def _get_models(
        self,
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> tuple[Any, Any]:
        """Return cached LLM/embedding clients, building them on first use."""
        cache_key = (judge_model, embedding_model)
        # The lock is held across build_models so two concurrent first calls
        # for the same key cannot both build clients.
        with self._lock:
            if cache_key not in self._model_cache:
                llm, embeddings = build_models(judge_model, embedding_model, settings)
                self._model_cache[cache_key] = (llm, embeddings)
            return self._model_cache[cache_key]

    @staticmethod
    def _clean_score(value: Any) -> float | None:
        """Map a raw metric value to a JSON-safe score rounded to 4 places.

        None, NaN and +/-Inf all become None.
        """
        # The None check comes first: math.isfinite(None) raises TypeError,
        # which would fail the whole request for a single missing metric.
        if value is None or not math.isfinite(value):
            return None
        return round(value, 4)

    def score(
        self,
        question: str,
        answer: str,
        contexts: list[str],
        ground_truth: str | None,
        metrics: list[str],
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> dict[str, float | None]:
        """Score one sample synchronously and return {metric_name: score | None}.

        Args:
            question: Question text of the sample.
            answer: Answer text to be scored.
            contexts: Retrieved context fragments.
            ground_truth: Reference answer; None is passed on as "".
            metrics: Names of the metrics to compute.
            judge_model: Judge LLM model name (part of the cache key).
            embedding_model: Embedding model name (part of the cache key).
            settings: Supplies ``ragas_metric_timeout_seconds`` and is
                forwarded to ``build_models``.

        Returns:
            One entry per metric reported by the pipeline; values that are
            None, NaN or infinite are returned as None.
        """
        llm, embeddings = self._get_models(judge_model, embedding_model, settings)
        metric_instances = _build_metric_instances(metrics, llm, embeddings)
        pipeline = MetricPipeline(
            metrics=metric_instances,
            metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
        )
        sample = NormalizedSample(
            sample_id="inline-score",
            question=question,
            answer=answer,
            contexts=contexts,
            ground_truth=ground_truth or "",
        )
        # asyncio.run starts a fresh event loop per call.
        # NOTE(review): this requires that no loop is already running in the
        # calling thread — confirm callers stay synchronous.
        metric_score = asyncio.run(pipeline.score_sample(sample))
        return {
            name: self._clean_score(value)
            for name, value in metric_score.metrics.items()
        }
# Module-level singleton shared by FastAPI routes; sharing one instance is what
# lets its client cache persist across requests.
inline_scorer = InlineScorer()

View File

@@ -13,6 +13,11 @@ from pathlib import Path
import pandas as pd import pandas as pd
from rag_eval.metrics.weights import (
compute_overall_weighted_score_mean,
weighted_metric_means as _weighted_metric_means,
)
from webapp.services.run_reader import _read_weights_from_snapshot
from webapp.services.text_utils import parse_contexts from webapp.services.text_utils import parse_contexts
from webapp.models import ( from webapp.models import (
DistributionBin, DistributionBin,
@@ -42,17 +47,6 @@ def _round_or_none(value: float | None) -> float | None:
return round(float(value), 4) return round(float(value), 4)
def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
"""Compute the mean of each metric column across all scored samples."""
means: dict[str, float | None] = {}
for metric in metrics:
if metric in frame.columns:
means[metric] = _round_or_none(frame[metric].mean(numeric_only=True))
else:
means[metric] = None
return means
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]: def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
"""Bucket one metric's scores into fixed-width [0,1] histogram bins.""" """Bucket one metric's scores into fixed-width [0,1] histogram bins."""
bins: list[DistributionBin] = [] bins: list[DistributionBin] = []
@@ -165,6 +159,7 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
frame = run_reader.read_scores_frame(run_dir) frame = run_reader.read_scores_frame(run_dir)
summary_markdown = run_reader.read_summary_markdown(run_dir) summary_markdown = run_reader.read_summary_markdown(run_dir)
advice_markdown = run_reader.read_advice_markdown(run_dir) advice_markdown = run_reader.read_advice_markdown(run_dir)
metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)
if frame.empty or not metrics: if frame.empty or not metrics:
return ReportData( return ReportData(
@@ -172,6 +167,18 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
metric_means={metric: None for metric in metrics}, metric_means={metric: None for metric in metrics},
summary_markdown=summary_markdown, summary_markdown=summary_markdown,
advice_markdown=advice_markdown, advice_markdown=advice_markdown,
metric_weights=metric_weights,
doc_weights=doc_weights,
)
score_rows_list = frame.to_dict(orient="records")
# Use weighted metric means (degrades to arithmetic mean when weights are empty).
w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
overall_ws = compute_overall_weighted_score_mean(
score_rows_list, metric_weights, doc_weights
) )
distributions = { distributions = {
@@ -182,10 +189,13 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
return ReportData( return ReportData(
metrics=metrics, metrics=metrics,
metric_means=_metric_means(frame, metrics), metric_means=rounded_means,
distributions=distributions, distributions=distributions,
groupings=_groupings(frame, metrics), groupings=_groupings(frame, metrics),
lowest_samples=_lowest_samples(frame, metrics), lowest_samples=_lowest_samples(frame, metrics),
summary_markdown=summary_markdown, summary_markdown=summary_markdown,
advice_markdown=advice_markdown, advice_markdown=advice_markdown,
weighted_score_mean=_round_or_none(overall_ws),
metric_weights=metric_weights,
doc_weights=doc_weights,
) )

View File

@@ -64,6 +64,27 @@ def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
return [] return []
def _read_weights_from_snapshot(run_dir: Path) -> tuple[dict[str, float], dict[str, float]]:
    """Read metric_weights and doc_weights from a scenario snapshot if present.

    Args:
        run_dir: Run directory expected to contain ``scenario.snapshot.yaml``.

    Returns:
        A (metric_weights, doc_weights) tuple of plain dicts with str keys and
        float values. Either dict is empty when the snapshot is absent,
        unreadable, not a mapping, or when the field is missing or not a
        mapping. Entries whose value is not an int or float are dropped.
    """
    snapshot = run_dir / "scenario.snapshot.yaml"
    if not snapshot.is_file():
        return {}, {}
    try:
        payload = yaml.safe_load(snapshot.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # Best effort: a damaged or badly encoded snapshot means "no weights".
        return {}, {}
    # A snapshot whose top level is a list or scalar has no fields to read;
    # calling .get on it would raise AttributeError.
    if not isinstance(payload, dict):
        return {}, {}

    def _numeric_weights(raw: object) -> dict[str, float]:
        # Same guard for the individual fields, e.g. `metric_weights: [a, b]`.
        if not isinstance(raw, dict):
            return {}
        return {str(k): float(v) for k, v in raw.items() if isinstance(v, (int, float))}

    return (
        _numeric_weights(payload.get("metric_weights")),
        _numeric_weights(payload.get("doc_weights")),
    )
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]: def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
"""Find every run directory (one that contains metadata.json) under the roots.""" """Find every run directory (one that contains metadata.json) under the roots."""
run_dirs: list[Path] = [] run_dirs: list[Path] = []
@@ -159,6 +180,8 @@ NON_METRIC_COLUMNS = {
"source_chunk_ids", "source_chunk_ids",
"review_status", "review_status",
"review_notes", "review_notes",
"weighted_score",
"sample_weight",
} }

View File

@@ -37,6 +37,16 @@ def _summarize_scenario(path: Path) -> ScenarioInfo:
metrics = payload.get("metrics") metrics = payload.get("metrics")
metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else [] metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []
raw_metric_weights = payload.get("metric_weights") or {}
raw_doc_weights = payload.get("doc_weights") or {}
metric_weights = {
str(k): float(v) for k, v in raw_metric_weights.items()
if isinstance(v, (int, float))
}
doc_weights = {
str(k): float(v) for k, v in raw_doc_weights.items()
if isinstance(v, (int, float))
}
return ScenarioInfo( return ScenarioInfo(
path=relative, path=relative,
@@ -45,6 +55,8 @@ def _summarize_scenario(path: Path) -> ScenarioInfo:
dataset=str(payload.get("dataset", "")), dataset=str(payload.get("dataset", "")),
judge_model=str(payload.get("judge_model", "")), judge_model=str(payload.get("judge_model", "")),
metrics=metric_list, metrics=metric_list,
metric_weights=metric_weights,
doc_weights=doc_weights,
) )

View File

@@ -32,9 +32,11 @@ def apply_profiles_to_scenario(
judge_profile: LLMProfile | None, judge_profile: LLMProfile | None,
answer_profile: LLMProfile | None, answer_profile: LLMProfile | None,
dataset_profile: LLMProfile | None, dataset_profile: LLMProfile | None,
metric_weights: dict[str, float] | None = None,
doc_weights: dict[str, float] | None = None,
_resolve_absolute: bool = False, _resolve_absolute: bool = False,
) -> list[str]: ) -> list[str]:
"""Patch the YAML file at *scenario_path* with the supplied profiles. """Patch the YAML file at *scenario_path* with the supplied profiles and weights.
Returns a list of dotted field names that were actually patched. Returns a list of dotted field names that were actually patched.
Setting *_resolve_absolute=True* skips repo-root resolution (used in tests). Setting *_resolve_absolute=True* skips repo-root resolution (used in tests).
@@ -67,6 +69,14 @@ def apply_profiles_to_scenario(
generation["model"] = dataset_profile.model generation["model"] = dataset_profile.model
patched.append("generation.model") patched.append("generation.model")
if metric_weights is not None:
data["metric_weights"] = dict(metric_weights)
patched.append("metric_weights")
if doc_weights is not None:
data["doc_weights"] = dict(doc_weights)
patched.append("doc_weights")
resolved.write_text( resolved.write_text(
yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False), yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False),
encoding="utf-8", encoding="utf-8",

View File

@@ -308,6 +308,203 @@ table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: t
.llm-role-label { font-size: 13px; font-weight: 600; min-width: 180px; color: var(--ink); } .llm-role-label { font-size: 13px; font-weight: 600; min-width: 180px; color: var(--ink); }
.llm-role-select { min-width: 240px; } .llm-role-select { min-width: 240px; }
/* ---------- API docs iframe ---------- */
/* The flex layout is applied only while the section is shown. An unconditional
   `display: flex` on an ID selector outranks the user-agent `[hidden]` rule and
   would keep the section rendered even when its `hidden` attribute is set. */
#view-apidocs { padding: 0; }
#view-apidocs:not([hidden]) { display: flex; flex-direction: column; flex: 1; }
.apidocs-frame {
flex: 1;
width: 100%;
height: calc(100vh - 64px);
border: none;
}
.report-actions {
display: flex; justify-content: flex-end; margin: 0 0 12px;
}
.btn-export-pdf {
font-size: 13px; display: flex; align-items: center; gap: 6px;
}
/* ---------- 报告历史切换下拉 ---------- */
.report-switcher {
display: flex; align-items: center; gap: 10px;
background: var(--surface); border: 1px solid var(--line);
border-radius: var(--radius); padding: 10px 16px;
margin-bottom: 14px; box-shadow: var(--shadow);
}
.report-switcher-label {
font-size: 13px; font-weight: 600; color: var(--slate); white-space: nowrap;
}
.report-switcher-select {
flex: 1; min-width: 0;
border: 1px solid var(--line); border-radius: 6px; padding: 6px 10px;
font-size: 13px; font-family: inherit; background: var(--bg); color: var(--ink);
cursor: pointer;
}
.report-switcher-select:focus { outline: none; border-color: var(--petrol); }
/* ── 权重配置面板 ─────────────────────────────────── */
.weight-config-panel { margin-top: 12px; }
.weight-section-title { font-size: 13px; font-weight: 600; color: var(--text); margin-bottom: 8px; }
.weight-rows { display: flex; flex-direction: column; gap: 6px; }
.weight-row {
display: flex; align-items: center; gap: 10px;
font-size: 13px;
}
.weight-row-label { min-width: 180px; color: var(--slate); font-family: monospace; }
.weight-row-input {
width: 80px; padding: 4px 8px; border: 1px solid var(--border);
border-radius: 6px; font-size: 13px; text-align: right;
}
.weight-row-input:focus { outline: none; border-color: #6366f1; }
.doc-weight-name {
flex: 1; padding: 4px 8px; border: 1px solid var(--border);
border-radius: 6px; font-size: 13px; min-width: 0;
}
.weight-row-remove { color: var(--bad); cursor: pointer; font-size: 14px; background: none; border: none; padding: 2px 6px; }
.weight-row-remove:hover { background: #fee2e2; border-radius: 4px; }
/* weighted_score 综合加权得分卡片 */
.metric-card.weighted-score-card {
border: 2px solid #6366f1;
background: #f5f3ff;
}
.metric-card.weighted-score-card .metric-name { color: #4f46e5; font-weight: 700; }
/* ================================================================
打印样式(导出 PDF 用)
浏览器打印时隐藏 UI chrome保留报告内容图表 canvas 原样输出
================================================================ */
@media print {
/* ── 页面尺寸与边距 ── */
@page {
size: A4 portrait;
margin: 18mm 16mm 18mm 16mm;
}
/* ── Hide every non-report element ── */
/* #view-apidocs is listed explicitly: the `.view { display: block !important; }`
   reset further down in this print block would otherwise force the API docs
   iframe into the printed report. */
.sidebar,
.topbar,
.report-actions,
.no-print,
#dist-metric-select,
.grouping-tabs,
#view-runs,
#view-new,
#view-profiles,
#view-apidocs { display: none !important; }
/* ── 全局基础 ── */
body {
font-size: 11pt;
line-height: 1.5;
color: #0f1b2d;
background: #fff;
}
/* ── 布局重置main 全宽 ── */
.app { display: block; }
.main { display: block; width: 100%; }
.view { padding: 0; display: block !important; }
#view-report { display: block !important; }
/* ── 报告内容 ── */
#report-content { display: block !important; }
#report-empty { display: none !important; }
/* ── 元信息条 ── */
.report-meta {
display: flex;
justify-content: space-between;
border-bottom: 2px solid #009999;
padding-bottom: 8pt;
margin-bottom: 14pt;
}
.report-meta-title { font-size: 14pt; font-weight: 700; }
.report-meta-info { font-size: 9pt; color: #64748b; }
/* ── Section 标签 ── */
.section-label {
font-size: 9pt;
font-weight: 700;
letter-spacing: 0.5px;
color: #64748b;
text-transform: uppercase;
margin: 14pt 0 6pt;
border-bottom: 1px solid #e2e8f0;
padding-bottom: 3pt;
break-after: avoid;
}
/* ── ① 指标均值卡片 ── */
.metric-cards {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(90pt, 1fr));
gap: 8pt;
margin-bottom: 12pt;
}
.metric-card {
border: 1px solid #e2e8f0;
border-radius: 6pt;
padding: 10pt 8pt;
text-align: center;
break-inside: avoid;
}
.metric-value { font-size: 20pt; font-weight: 700; }
.metric-name { font-size: 8pt; color: #64748b; margin-top: 2pt; }
/* ── ② 分布 + ③ 分组:打印时改为纵向排列 ── */
.report-row {
display: block;
}
.report-half {
margin-bottom: 12pt;
break-inside: avoid;
}
#dist-chart {
max-height: 160pt;
width: 100% !important;
}
/* ── 面板统一 ── */
.panel {
border: 1px solid #e2e8f0;
border-radius: 6pt;
padding: 10pt 12pt;
margin-bottom: 10pt;
break-inside: avoid;
box-shadow: none;
}
.panel h2 { font-size: 12pt; margin-bottom: 4pt; }
/* ── ④ Lowest-scoring samples: expand every detail row when printing ── */
/* `hidden` is an HTML attribute, not a CSS property, so the former
   `hidden: false` declaration was invalid and ignored by the browser. The
   `!important` display override is what reveals rows that carry `hidden`. */
.lowest-detail { display: block !important; }
.lowest-row { break-inside: avoid; }
.lowest-detail-inner { padding: 8pt 0; font-size: 10pt; }
.detail-label { font-size: 8pt; font-weight: 700; color: #64748b; margin-bottom: 2pt; }
.detail-context .ctx-item { border-bottom: 1px dashed #e2e8f0; padding: 2pt 0; font-size: 9pt; }
/* ── ⑤ 优化建议 ── */
#advice-section { display: block !important; }
.advice-panel { border: 1px solid #e2e8f0; border-radius: 6pt; padding: 10pt 12pt; }
.advice-md h2 { font-size: 12pt; margin-top: 10pt; }
.advice-md h3 { font-size: 11pt; }
.advice-md ul { margin: 4pt 0 4pt 16pt; }
.advice-md li { margin-bottom: 3pt; }
/* ── 分组表 ── */
table.group-table { width: 100%; font-size: 9pt; border-collapse: collapse; }
table.group-table th,
table.group-table td { padding: 4pt 6pt; border-bottom: 1px solid #e2e8f0; }
table.group-table th { font-weight: 700; color: #64748b; }
/* ── 颜色保留(部分浏览器打印默认去色) ── */
.good { color: #16a34a !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
.warn { color: #eab308 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
.bad { color: #dc2626 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
.score-badge.good { background: #dcfce7 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
.score-badge.warn { background: #fef9c3 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
.score-badge.bad { background: #fee2e2 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
}
/* ---------- ⑤ 优化建议面板 ---------- */ /* ---------- ⑤ 优化建议面板 ---------- */
.advice-panel { border-left: 3px solid #7c3aed; } .advice-panel { border-left: 3px solid #7c3aed; }
.advice-header { .advice-header {

View File

@@ -3,7 +3,7 @@
<head> <head>
<meta charset="UTF-8" /> <meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Siemens RAGAS 评估控制台</title> <title>RAGAS 评估控制台</title>
<link rel="stylesheet" href="/static/css/app.css" /> <link rel="stylesheet" href="/static/css/app.css" />
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script> <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
</head> </head>
@@ -28,6 +28,9 @@
<button class="nav-item" data-view="profiles"> <button class="nav-item" data-view="profiles">
<span class="nav-ico"></span><span>LLM 配置</span> <span class="nav-ico"></span><span>LLM 配置</span>
</button> </button>
<button class="nav-item" data-view="apidocs">
<span class="nav-ico"></span><span>API 文档</span>
</button>
</nav> </nav>
<div class="sidebar-foot"> <div class="sidebar-foot">
<span class="dot" id="health-dot"></span> <span class="dot" id="health-dot"></span>
@@ -89,6 +92,22 @@
</div> </div>
</div> </div>
<!-- ??????????????? -->
<div class="panel weight-config-panel" id="weight-config-panel" hidden>
<h2>???? <span class="muted" style="font-size:13px;font-weight:400">???????????????</span></h2>
<div class="weight-section">
<div class="weight-section-title">???? <span class="muted" style="font-size:12px">???????????????????</span></div>
<div id="metric-weight-rows" class="weight-rows"></div>
</div>
<div class="weight-section" style="margin-top:16px">
<div class="weight-section-title">???? <span class="muted" style="font-size:12px">?? PDF ???????????????????????</span></div>
<div id="doc-weight-rows" class="weight-rows"></div>
<button class="btn btn-sm" id="add-doc-weight-btn" style="margin-top:8px">? ??????</button>
</div>
</div>
<div class="panel" id="task-panel" hidden> <div class="panel" id="task-panel" hidden>
<div class="task-head"> <div class="task-head">
<h2>评估进度</h2> <h2>评估进度</h2>
@@ -103,12 +122,25 @@
<!-- 报告详情视图 --> <!-- 报告详情视图 -->
<section class="view" id="view-report" hidden> <section class="view" id="view-report" hidden>
<!-- 历史报告切换下拉(顶部,始终可见) -->
<div class="report-switcher no-print" id="report-switcher">
<label class="report-switcher-label">切换报告</label>
<select class="select report-switcher-select" id="report-switcher-select">
<option value="">— 加载中… —</option>
</select>
</div>
<div class="empty" id="report-empty"> <div class="empty" id="report-empty">
<p>请先从「运行列表」选择一次运行。</p> <p>请先从「运行列表」选择一次运行。</p>
</div> </div>
<div id="report-content" hidden> <div id="report-content" hidden>
<!-- 顶部元信息条 --> <!-- 顶部元信息条 -->
<div class="report-meta" id="report-meta"></div> <div class="report-meta" id="report-meta"></div>
<div class="report-actions no-print">
<button class="btn btn-ghost btn-export-pdf" id="export-pdf-btn" onclick="Report.exportPdf()">
📄 导出 PDF
</button>
</div>
<!-- ① 指标均值卡片 --> <!-- ① 指标均值卡片 -->
<div class="section-label">① 指标均值 OVERVIEW</div> <div class="section-label">① 指标均值 OVERVIEW</div>
@@ -199,6 +231,17 @@
<p class="muted">点击「新建配置」添加第一个。</p> <p class="muted">点击「新建配置」添加第一个。</p>
</div> </div>
</section> </section>
<!-- API 文档视图 -->
<section class="view" id="view-apidocs" hidden>
<iframe
id="apidocs-frame"
src="/docs"
class="apidocs-frame"
title="API 文档"
allowfullscreen>
</iframe>
</section>
</main> </main>
</div> </div>

View File

@@ -5,8 +5,8 @@
const App = { const App = {
currentRunId: null, currentRunId: null,
activeView: null, activeView: null,
views: ["runs", "new", "report", "profiles"], views: ["runs", "new", "report", "profiles", "apidocs"],
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置" }, titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置", apidocs: "API 文档" },
// 初始化:绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。 // 初始化:绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。
init() { init() {

View File

@@ -4,11 +4,16 @@ const Report = {
distChart: null, distChart: null,
currentDetail: null, currentDetail: null,
activeGrouping: null, activeGrouping: null,
_switcherLoaded: false,
// 加载并渲染指定运行的完整报告。 // 加载并渲染指定运行的完整报告。
async render(runId) { async render(runId) {
const empty = document.getElementById("report-empty"); const empty = document.getElementById("report-empty");
const content = document.getElementById("report-content"); const content = document.getElementById("report-content");
// 加载历史报告下拉(仅首次)
Report._loadSwitcher(runId);
if (!runId) { if (!runId) {
empty.hidden = false; empty.hidden = false;
content.hidden = true; content.hidden = true;
@@ -28,6 +33,10 @@ const Report = {
Report.renderLowest(detail.report); Report.renderLowest(detail.report);
Report.renderAdvice(detail.summary, detail.report); Report.renderAdvice(detail.summary, detail.report);
content.style.opacity = "1"; content.style.opacity = "1";
// 同步下拉选中项
const sel = document.getElementById("report-switcher-select");
if (sel) sel.value = runId;
} catch (err) { } catch (err) {
empty.hidden = false; empty.hidden = false;
content.hidden = true; content.hidden = true;
@@ -35,6 +44,55 @@ const Report = {
} }
}, },
// 加载并填充历史报告下拉选择框
async _loadSwitcher(currentRunId) {
const sel = document.getElementById("report-switcher-select");
if (!sel) return;
// 已加载过就只更新选中值,不重复请求
if (Report._switcherLoaded) {
if (currentRunId) sel.value = currentRunId;
return;
}
try {
const data = await API.runs();
const runs = data.runs || [];
sel.innerHTML = "";
if (runs.length === 0) {
sel.innerHTML = '<option value="">(无历史运行)</option>';
return;
}
runs.forEach((run) => {
const opt = document.createElement("option");
opt.value = run.run_id;
const timeStr = App.shortTime(run.finished_at);
const meanText = run.metric_means
? Object.entries(run.metric_means)
.filter(([, v]) => v !== null && v !== undefined)
.slice(0, 2)
.map(([k, v]) => `${App.shortMetric(k)}=${v.toFixed(2)}`)
.join(" ")
: "";
opt.textContent = `${run.scenario_name || run.run_id} ${timeStr}${meanText ? " [" + meanText + "]" : ""}`;
sel.appendChild(opt);
});
Report._switcherLoaded = true;
if (currentRunId) sel.value = currentRunId;
} catch (_e) {
sel.innerHTML = '<option value="">(加载失败)</option>';
}
// 绑定切换事件(只绑一次)
sel.addEventListener("change", () => {
const rid = sel.value;
if (!rid) return;
App.currentRunId = rid;
App.enableReportNav();
Report.render(rid);
});
},
// 顶部元信息条。 // 顶部元信息条。
renderMeta(summary) { renderMeta(summary) {
const el = document.getElementById("report-meta"); const el = document.getElementById("report-meta");
@@ -69,6 +127,18 @@ const Report = {
`; `;
wrap.appendChild(card); wrap.appendChild(card);
}); });
// 综合加权得分卡片
const wsValue = (report && report.weighted_score_mean !== undefined) ? report.weighted_score_mean : null;
const wsCard = document.createElement("div");
wsCard.className = "metric-card weighted-score-card";
const wsCls = App.scoreClass(wsValue);
const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2);
wsCard.innerHTML = `
<div class="metric-value ${wsCls}">${wsText}</div>
<div class="metric-name">综合加权得分</div>
`;
wrap.appendChild(wsCard);
}, },
// ② 分数分布直方图(可切换指标)。 // ② 分数分布直方图(可切换指标)。
@@ -286,4 +356,22 @@ const Report = {
body.innerHTML = `<div class="advice-md">${html}</div>`; body.innerHTML = `<div class="advice-md">${html}</div>`;
}, },
// 导出 PDF展开所有低分样本 → 打印 → 还原折叠状态
exportPdf() {
// 1. 记录当前各 detail 展开状态,并全部展开
const details = document.querySelectorAll("#lowest-table .lowest-detail");
const wasHidden = Array.from(details).map((el) => el.hidden);
details.forEach((el) => { el.hidden = false; });
// 2. 打印完成后还原折叠状态
const restore = () => {
details.forEach((el, i) => { el.hidden = wasHidden[i]; });
window.removeEventListener("afterprint", restore);
};
window.addEventListener("afterprint", restore);
// 3. 触发打印(浏览器弹出打印对话框,用户选"另存为 PDF"
window.print();
},
}; };

View File

@@ -1,11 +1,11 @@
// runner.js — 新建评估视图列出场景、LLM角色配置、触发评估、轮询任务状态与日志 // runner.js — 新建评估视图列出场景、LLM角色配置、权重配置、触发评估、轮询任务状态。
const Runner = { const Runner = {
selectedScenario: null, selectedScenario: null,
selectedScenarioInfo: null,
pollTimer: null, pollTimer: null,
lastRunId: null, lastRunId: null,
// 绑定运行按钮。
init() { init() {
document.getElementById("run-btn").addEventListener("click", () => Runner.trigger()); document.getElementById("run-btn").addEventListener("click", () => Runner.trigger());
document.getElementById("view-report-btn").addEventListener("click", () => { document.getElementById("view-report-btn").addEventListener("click", () => {
@@ -14,9 +14,9 @@ const Runner = {
App.navigate("report", Runner.lastRunId); App.navigate("report", Runner.lastRunId);
} }
}); });
document.getElementById("add-doc-weight-btn").addEventListener("click", () => Runner._addDocWeightRow());
}, },
// 加载并渲染可触发的场景列表。
async loadScenarios() { async loadScenarios() {
const list = document.getElementById("scenario-list"); const list = document.getElementById("scenario-list");
list.innerHTML = '<p class="muted">加载中…</p>'; list.innerHTML = '<p class="muted">加载中…</p>';
@@ -32,17 +32,14 @@ const Runner = {
} catch (err) { } catch (err) {
list.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`; list.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
} }
// 同时加载 profiles 供角色选择
Runner._populateProfileSelects(); Runner._populateProfileSelects();
}, },
// 填充三个角色下拉框
async _populateProfileSelects() { async _populateProfileSelects() {
const cached = Profiles.getAll(); const cached = Profiles.getAll();
const profiles = cached.length > 0 const profiles = cached.length > 0
? cached ? cached
: (await API.profiles().catch(() => ({ profiles: [] }))).profiles; : (await API.profiles().catch(() => ({ profiles: [] }))).profiles;
["role-judge", "role-answer", "role-dataset"].forEach(id => { ["role-judge", "role-answer", "role-dataset"].forEach(id => {
const sel = document.getElementById(id); const sel = document.getElementById(id);
sel.innerHTML = '<option value="">— 使用场景原始配置 —</option>'; sel.innerHTML = '<option value="">— 使用场景原始配置 —</option>';
@@ -55,17 +52,14 @@ const Runner = {
}); });
}, },
// 构造单个场景条目。
renderScenarioItem(sc) { renderScenarioItem(sc) {
const item = document.createElement("div"); const item = document.createElement("div");
const invalid = !!sc.error; const invalid = !!sc.error;
item.className = "scenario-item" + (invalid ? " invalid" : ""); item.className = "scenario-item" + (invalid ? " invalid" : "");
const modeTag = sc.mode const modeTag = sc.mode
? `<span class="tag mode-${App.escape(sc.mode)}">${App.escape(sc.mode)}</span>` ? `<span class="tag mode-${App.escape(sc.mode)}">${App.escape(sc.mode)}</span>`
: ""; : "";
const metricCount = (sc.metrics || []).length; const metricCount = (sc.metrics || []).length;
item.innerHTML = ` item.innerHTML = `
<div> <div>
<div class="scenario-name">${App.escape(sc.scenario_name || sc.path)}</div> <div class="scenario-name">${App.escape(sc.scenario_name || sc.path)}</div>
@@ -77,27 +71,94 @@ const Runner = {
<span class="tag">${metricCount} 指标</span> <span class="tag">${metricCount} 指标</span>
</div> </div>
`; `;
if (!invalid) { if (!invalid) {
item.addEventListener("click", () => { item.addEventListener("click", () => {
document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected")); document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected"));
item.classList.add("selected"); item.classList.add("selected");
Runner.selectedScenario = sc.path; Runner.selectedScenario = sc.path;
Runner.selectedScenarioInfo = sc;
document.getElementById("selected-scenario").textContent = sc.path; document.getElementById("selected-scenario").textContent = sc.path;
document.getElementById("run-btn").disabled = false; document.getElementById("run-btn").disabled = false;
// 显示 LLM 角色面板
document.getElementById("llm-assignment-panel").hidden = false; document.getElementById("llm-assignment-panel").hidden = false;
Runner._renderWeightPanel(sc);
document.getElementById("weight-config-panel").hidden = false;
}); });
} }
return item; return item;
}, },
// 触发评估:先 apply profiles若选了再触发任务。 // 根据选中场景渲染指标权重行(动态生成,按场景 metrics 列表)
_renderWeightPanel(sc) {
const metricRows = document.getElementById("metric-weight-rows");
metricRows.innerHTML = "";
const metrics = sc.metrics || [];
const existingWeights = sc.metric_weights || {};
metrics.forEach(metric => {
const row = document.createElement("div");
row.className = "weight-row";
const currentVal = existingWeights[metric] != null ? existingWeights[metric] : 1.0;
row.innerHTML = `
<span class="weight-row-label">${App.escape(metric)}</span>
<input class="weight-row-input" type="number" min="0" step="0.1"
data-metric="${App.escape(metric)}" value="${currentVal}" />
`;
metricRows.appendChild(row);
});
// 填充已有文档权重
const docRows = document.getElementById("doc-weight-rows");
docRows.innerHTML = "";
const existingDocWeights = sc.doc_weights || {};
Object.entries(existingDocWeights).forEach(([docName, w]) => {
Runner._addDocWeightRow(docName, w);
});
},
// 添加一行文档权重输入
_addDocWeightRow(docName, weight) {
const name = docName !== undefined ? docName : "";
const w = weight !== undefined ? weight : 1.0;
const container = document.getElementById("doc-weight-rows");
const row = document.createElement("div");
row.className = "weight-row";
row.innerHTML = `
<input class="doc-weight-name" type="text" placeholder="PDF 文件名(如 322_双源CT.pdf" value="${App.escape(String(name))}" />
<input class="weight-row-input" type="number" min="0" step="0.1" value="${w}" />
<button class="weight-row-remove" title="删除">✕</button>
`;
row.querySelector(".weight-row-remove").addEventListener("click", () => row.remove());
container.appendChild(row);
},
// 收集权重面板当前值;全等权时返回 null不发送
_collectWeights() {
const metricWeights = {};
document.querySelectorAll("#metric-weight-rows .weight-row-input").forEach(input => {
const metric = input.dataset.metric;
const val = parseFloat(input.value);
if (metric && !isNaN(val)) metricWeights[metric] = val;
});
const docWeights = {};
document.querySelectorAll("#doc-weight-rows .weight-row").forEach(row => {
const nameInput = row.querySelector(".doc-weight-name");
const valInput = row.querySelector(".weight-row-input");
if (!nameInput || !valInput) return;
const name = nameInput.value.trim();
const val = parseFloat(valInput.value);
if (name && !isNaN(val)) docWeights[name] = val;
});
const allMetricDefault = Object.values(metricWeights).every(v => Math.abs(v - 1.0) < 1e-9);
const noDocWeights = Object.keys(docWeights).length === 0;
if (allMetricDefault && noDocWeights) return { metricWeights: null, docWeights: null };
return { metricWeights, docWeights };
},
async trigger() { async trigger() {
if (!Runner.selectedScenario) return; if (!Runner.selectedScenario) return;
const runBtn = document.getElementById("run-btn"); const runBtn = document.getElementById("run-btn");
runBtn.disabled = true; runBtn.disabled = true;
const panel = document.getElementById("task-panel"); const panel = document.getElementById("task-panel");
const logBox = document.getElementById("task-log"); const logBox = document.getElementById("task-log");
const statusBadge = document.getElementById("task-status"); const statusBadge = document.getElementById("task-status");
@@ -106,12 +167,8 @@ const Runner = {
reportBtn.hidden = true; reportBtn.hidden = true;
logBox.textContent = ""; logBox.textContent = "";
Runner._setStatus(statusBadge, "queued"); Runner._setStatus(statusBadge, "queued");
try { try {
// Step 1: apply LLM profiles to YAML if any selected
await Runner._applyProfilesIfNeeded(logBox); await Runner._applyProfilesIfNeeded(logBox);
// Step 2: trigger evaluation
const resp = await API.triggerEvaluation(Runner.selectedScenario); const resp = await API.triggerEvaluation(Runner.selectedScenario);
Runner.poll(resp.task_id); Runner.poll(resp.task_id);
} catch (err) { } catch (err) {
@@ -121,20 +178,22 @@ const Runner = {
} }
}, },
// 如果用户选了 profile就先 apply 写回 YAML
async _applyProfilesIfNeeded(logBox) { async _applyProfilesIfNeeded(logBox) {
const judgeId = document.getElementById("role-judge").value; const judgeId = document.getElementById("role-judge").value;
const answerId = document.getElementById("role-answer").value; const answerId = document.getElementById("role-answer").value;
const datasetId = document.getElementById("role-dataset").value; const datasetId = document.getElementById("role-dataset").value;
const { metricWeights, docWeights } = Runner._collectWeights();
if (!judgeId && !answerId && !datasetId) return; // 全空,跳过 if (!judgeId && !answerId && !datasetId && !metricWeights && !docWeights) return;
logBox.textContent = "正在将 LLM 配置写入场景文件…\n"; logBox.textContent = "正在将 LLM 配置和权重写入场景文件…\n";
const body = { const body = {
scenario_path: Runner.selectedScenario, scenario_path: Runner.selectedScenario,
judge_profile_id: judgeId || null, judge_profile_id: judgeId || null,
answer_profile_id: answerId || null, answer_profile_id: answerId || null,
dataset_profile_id: datasetId || null, dataset_profile_id: datasetId || null,
metric_weights: metricWeights,
doc_weights: docWeights,
}; };
const result = await API.applyProfiles(body); const result = await API.applyProfiles(body);
const fields = (result.patched_fields || []).join(", "); const fields = (result.patched_fields || []).join(", ");
@@ -143,13 +202,11 @@ const Runner = {
: "(未找到可更新的字段,继续运行)\n"; : "(未找到可更新的字段,继续运行)\n";
}, },
// 周期性轮询任务状态,刷新日志与徽标。
poll(taskId) { poll(taskId) {
const logBox = document.getElementById("task-log"); const logBox = document.getElementById("task-log");
const statusBadge = document.getElementById("task-status"); const statusBadge = document.getElementById("task-status");
const reportBtn = document.getElementById("view-report-btn"); const reportBtn = document.getElementById("view-report-btn");
const runBtn = document.getElementById("run-btn"); const runBtn = document.getElementById("run-btn");
if (Runner.pollTimer) clearInterval(Runner.pollTimer); if (Runner.pollTimer) clearInterval(Runner.pollTimer);
Runner.pollTimer = setInterval(async () => { Runner.pollTimer = setInterval(async () => {
try { try {
@@ -157,7 +214,6 @@ const Runner = {
logBox.textContent = (status.logs || []).join("\n"); logBox.textContent = (status.logs || []).join("\n");
logBox.scrollTop = logBox.scrollHeight; logBox.scrollTop = logBox.scrollHeight;
Runner._setStatus(statusBadge, status.status); Runner._setStatus(statusBadge, status.status);
if (status.status === "completed" || status.status === "failed") { if (status.status === "completed" || status.status === "failed") {
clearInterval(Runner.pollTimer); clearInterval(Runner.pollTimer);
runBtn.disabled = false; runBtn.disabled = false;
@@ -175,7 +231,6 @@ const Runner = {
}, 1200); }, 1200);
}, },
// 更新状态徽标的文本与配色类。
_setStatus(badge, status) { _setStatus(badge, status) {
badge.textContent = status; badge.textContent = status;
badge.className = "badge " + status; badge.className = "badge " + status;