Compare commits

..

15 Commits

Author SHA1 Message Date
wangwei
24956bbf75 更新 2026-06-16 18:12:33 +08:00
wangwei
ca01e44ad2 feat(webapp): add session persistence via URL hash routing + sessionStorage
- app.js: hash-based router (#runs / #new / #profiles / #report/{runId})
  - navigate() pushes history entries for back/forward support
  - _restoreSession() reads hash on load and popstate
  - sessionStorage fallback for same-tab refreshes
  - run-card highlights selected run (.run-card.selected)
- runner.js: use App.navigate() for report redirect; persist lastRunId to sessionStorage
- index.html: report nav button starts disabled (enabled on run select/restore)
- app.css: .run-card.selected with petrol border + ring

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 17:55:07 +08:00
wangwei
1a2cc534b8 feat(webapp): add optimization advice section to report UI
- index.html: add section ⑤ advice block (hidden by default, shown when advice_markdown present)
- report.js: add renderAdvice() called in render(), simple Markdown→HTML converter
- app.js: add noise_sensitivity / factual_correctness / semantic_similarity to shortMetric map
- app.css: add .advice-panel, .advice-badge, .advice-md styles (purple left-border theme)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 17:26:37 +08:00
wangwei
91c0dab4f9 fix(advisor): fix LLM API call, wire advice_markdown to webapp, update .env.example timeouts
- llm_analyzer.py: use llm.langchain_llm.ainvoke() (correct RAGAS 0.4.3 API)
- webapp/models.py: add advice_markdown field to ReportData
- webapp/services/run_reader.py: add read_advice_markdown() reading optimization_advice.md
- webapp/services/report_builder.py: pass advice_markdown into ReportData
- .env.example: OPENAI_TIMEOUT_SECONDS 30→180, RAGAS_METRIC_TIMEOUT_SECONDS 45→300

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 17:12:32 +08:00
wangwei
f5c2dce64a feat(advisor): add optimization advisor module
- rag_eval/advisor/: new package with rules engine, LLM analyzer, writer
  - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples)
  - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback
  - writer.py: writes optimization_advice.md + log summary
  - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False)
- Scenario.optimization_advisor: new bool field (default False)
- ScenarioModel: same field added, loader.py透传
- RunArtifactPaths.advice_md: new path field
- factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings
- runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end
- siemens online YAML: optimization_advisor: true enabled
- tests: 9 rules tests + 6 writer tests, all pass
- docs: advisor section added to engine-flow.md and architecture.md

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 17:06:19 +08:00
wangwei
d68399d39b chore: update startup scripts and .env.example for LLM profile feature 2026-06-16 17:03:25 +08:00
wangwei
719c3b4ca4 test: ensure test package structure and all webapp tests pass 2026-06-16 16:27:54 +08:00
wangwei
5b60ed12ea feat: add LLM role-assignment panel to 新建评估 view 2026-06-16 16:27:00 +08:00
wangwei
dc8baf8662 feat: add LLM配置 management page (profiles view) 2026-06-16 16:25:20 +08:00
wangwei
e329f59139 feat: add yaml_patcher service to apply LLM profiles to scenario YAML 2026-06-16 16:21:19 +08:00
wangwei
b19054bd66 feat: add /api/llm-profiles CRUD router 2026-06-16 16:18:40 +08:00
wangwei
5d09deb420 feat: add ProfileManager service with JSON persistence 2026-06-16 16:14:31 +08:00
wangwei
b98af29449 feat: add LLMProfile pydantic models 2026-06-16 16:10:37 +08:00
wangwei
4173a40d93 feat(scripts): add run_eval.bat / run_eval.ps1 evaluation launcher scripts
Both scripts support:
  - Shortcut args: online (default), offline, or any custom .yaml path
  - Second arg: log level (DEBUG/INFO/WARNING/ERROR), default INFO
  - Auto-timestamped log file saved to logs\eval_<date>_<time>.log
  - Sets PYTHONIOENCODING=utf-8 and PYTHONPATH=. automatically
  - Friendly error/success banners with log file path

Usage:
  run_eval.bat                    # online eval
  run_eval.bat offline DEBUG      # offline eval with DEBUG logs
  .\run_eval.ps1 online DEBUG     # PowerShell equivalent

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 11:16:53 +08:00
wangwei
629304aa6d feat(logging): add structured evaluation logs for metric-level debugging
- pipeline.py: log each metric score/timeout/error with sample_id,
  elapsed time, and score value; log NaN list per sample; progress
  counter N/total after each sample completes
- evaluator.py: log eval start, dataset counts, adapter enrichment
  progress (per-sample OK/FAIL with elapsed), metric scoring summary,
  and per-metric NaN rate at end of run
- runner.py: _setup_logging() helper writes to stderr + optional file;
  ragas/httpx/openai noisy loggers throttled to WARNING
- main.py: add --log-file and --log-level CLI flags

Usage:
  python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 10:48:41 +08:00
49 changed files with 5448 additions and 83 deletions

View File

@@ -1,11 +1,22 @@
# ===== LLM 连接配置(RAGAS 评测 + 生成) =====
# 所有模型共用同一个 OpenAI 兼容 endpoint
# 在 Web 控制台的「LLM 配置」页面可以保存多个命名配置,
# 并在运行评估时按角色(Judge / Answer / Dataset)分别选择覆盖。
OPENAI_API_KEY=your-api-key
OPENAI_BASE_URL=http://6.86.80.4:30080/v1
OPENAI_TIMEOUT_SECONDS=180
# 默认评测模型(可在场景 YAML 或 Web 控制台 LLM 配置中覆盖)
RAGAS_JUDGE_MODEL=deepseek-v4-flash
RAGAS_EMBEDDING_MODEL=text-embedding-v3
# 评估并发控制(启用 7 个指标时建议 RAGAS_METRIC_TIMEOUT_SECONDS=300)
BATCH_SIZE=8
RAGAS_METRIC_TIMEOUT_SECONDS=300
# ===== 阿里云文档解析 =====
# ===== 阿里云文档解析dataset build 功能需要) =====
ALIBABA_ACCESS_KEY_ID=
ALIBABA_ACCESS_KEY_SECRET=
ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
@@ -14,6 +25,8 @@ ALIYUN_PARSE_TIMEOUT_SECONDS=900
ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
ALIYUN_LLM_ENHANCEMENT=true
ALIYUN_ENHANCEMENT_MODE=VLM
DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
DOCUMENT_PARSE_ARTIFACT_PREFIX=outputs/dataset-builds
PARSER_FAILURE_MODE=fail
# 生成题库时使用的模型(可在 Web 控制台 LLM 配置中按场景覆盖)
DATASET_GENERATOR_MODEL=qwen3.6-plus

64
configs/llm_profiles.json Normal file
View File

@@ -0,0 +1,64 @@
{
"profiles": [
{
"profile_id": "c8e185a64fa0",
"name": "glm-5",
"model": "glm-5",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:16:22.438297+00:00",
"updated_at": "2026-06-16T09:19:03.089865+00:00"
},
{
"profile_id": "54ddfe5aeb46",
"name": "deepseek-v4-pro",
"model": "deepseek-v4-pro",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:17:08.473904+00:00",
"updated_at": "2026-06-16T09:19:07.504082+00:00"
},
{
"profile_id": "25d035eef194",
"name": "qwen3.5-flash",
"model": "qwen3.5-flash",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:18:24.265619+00:00",
"updated_at": "2026-06-16T09:18:24.265619+00:00"
},
{
"profile_id": "ff1d0f417a5d",
"name": "deepseek-v4-flash",
"model": "deepseek-v4-flash",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:18:57.091549+00:00",
"updated_at": "2026-06-16T09:18:57.091549+00:00"
},
{
"profile_id": "5b04c49df9df",
"name": "text-embedding-v4",
"model": "text-embedding-v4",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:19:49.104004+00:00",
"updated_at": "2026-06-16T09:19:49.104004+00:00"
},
{
"profile_id": "b4f7c82859d5",
"name": "text-embedding-v3",
"model": "text-embedding-v3",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:20:18.266540+00:00",
"updated_at": "2026-06-16T09:20:18.266540+00:00"
}
]
}

View File

@@ -318,6 +318,10 @@ metrics:
- answer_relevancy
- context_recall
- context_precision
# 可选:鲁棒性 / 端到端指标(需数据集含 ground_truth,完整列表见 §9.4)
# - noise_sensitivity
# - factual_correctness
# - semantic_similarity
output_dir: runs/legal-assistant-offline-baseline
runtime:
batch_size: 4
@@ -338,7 +342,7 @@ runtime:
- `embedding_model`
- 负责向量相关指标的模型
- `metrics`
- 本次启用的指标列表
- 本次启用的指标列表(完整可选项与依赖见 §9.4
- `output_dir`
- 本次运行结果输出目录
- `runtime.batch_size`
@@ -399,6 +403,32 @@ app_adapter:
- embedding model
- 指标实例
当前支持的指标(`rag_eval/metrics/registry.py` 中的 `SUPPORTED_METRICS`):
| 指标名 | 层面 | 依赖 |
|---|---|---|
| `faithfulness` | 生成 | judge model |
| `answer_relevancy` | 生成 | judge model + embedding |
| `context_recall` | 检索 | judge model + ground_truth |
| `context_precision` | 检索 | judge model + ground_truth |
| `noise_sensitivity` | 鲁棒性 | judge model + ground_truth |
| `factual_correctness` | 端到端 | judge model + ground_truth |
| `semantic_similarity` | 端到端 | embedding + ground_truth(无 LLM 调用) |
后四项以 `ground_truth`(标准答案)为参照,数据集必须提供该字段。新增指标统一在 `registry.py` / `factory.py` / `pipeline.py` 三处对齐装配。
**Optimization Advisor§11 优化策略落地):**
评测结束后,若场景配置 `optimization_advisor: true`,则自动调用 `rag_eval/advisor/` 模块:
- 规则引擎(`rules.py`)对 7 个指标各自设阈值,识别触发项并选取 top-3 低分样本
- LLM 分析器(`llm_analyzer.py`)结合低分样本生成中文 Markdown 优化建议(复用 judge_model,失败自动降级为纯规则报告)
- 写出层(`writer.py`)输出 `optimization_advice.md` 并打日志摘要
```yaml
# 场景配置示例
optimization_advisor: true
```
### 9.5 并发控制
执行层负责并发上限,不把并发策略散落到各指标实现中。

View File

@@ -316,11 +316,21 @@ adapter 层的目标是:**把不同类型的目标应用,统一成同一套
当前支持的指标包括:
核心检索 / 生成指标(始终可用):
- `faithfulness`
- `answer_relevancy`
- `context_recall`
- `context_precision`
鲁棒性 / 端到端指标(架构设计 §10.2,需数据集含 `ground_truth`
- `noise_sensitivity` —— 鲁棒性:对检索噪声的敏感度
- `factual_correctness` —— 端到端:回答相对标准答案的事实正确性
- `semantic_similarity` —— 端到端:回答与标准答案的语义相似度(基于 embedding无 LLM 调用)
所有指标都通过同一套装配点接入:`registry.py`(校验白名单)、`factory.py`(实例化)、`pipeline.py`(`ascore` 入参分发),新增指标只需在这三处对齐即可。
所以 metric pipeline 的职责可以总结为:
**把标准样本转换成结构化评分结果。**
@@ -414,3 +424,39 @@ main.py
- 可以把每次实验的资产稳定留住
这也是它和一次性离线脚本的根本区别。
---
## 15. Optimization Advisor 链路
相关代码:
- `rag_eval/advisor/__init__.py` — 外部入口 `run_advisor()`
- `rag_eval/advisor/rules.py` — 规则引擎(纯函数,无 LLM),7 条指标诊断规则
- `rag_eval/advisor/llm_analyzer.py` — LLM 分析器(复用 judge_model llm 实例,失败自动降级)
- `rag_eval/advisor/writer.py` — 写出 `optimization_advice.md` + 日志摘要
Advisor 在 `write_run_artifacts()` 之后触发,仅当场景配置 `optimization_advisor: true` 时生效,默认关闭。
执行链路:
```text
run_advisor(result, scenario, llm)
-> rules.diagnose(score_rows, metrics) # 识别异常指标,选取 top-3 低分样本
-> llm_analyzer.analyze(diagnoses, llm) # LLM 生成中文建议(失败自动降级为纯规则报告)
-> writer.write_advice(...) # 写 optimization_advice.md + 日志摘要
```
输出产物追加在现有 run 目录:
```text
outputs/online/siemens-pdf-question-bank/<run_id>/
scenario.snapshot.yaml
scores.csv
invalid.csv
summary.md
metadata.json
optimization_advice.md <- 新增optimization_advisor: true 时生成)
```
规则引擎对 7 个指标各自设 warning / critical 双档阈值,`noise_sensitivity` 为"越低越好"(方向相反)。所有诊断均附带 top-3 低分样本,喂给 LLM 生成针对具体内容的中文建议。

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,225 @@
# 优化顾问模块设计 Spec
- 日期2026-06-16
- 状态:已确认,进入实现。
## 1. 目标
在现有 RAG 评测流程结束后,新增一个**优化顾问模块**(Optimization Advisor),根据本次评测的多项指标分数与低分样本,自动诊断指标偏低的原因并给出针对性的优化建议,输出为中文 Markdown 报告 + 日志摘要。
对应架构设计 §11优化策略将"指标到动作的映射"§11.2)从文档形式落地为代码自动执行。
---
## 2. 决策摘要
| 决策点 | 选择 |
|---|---|
| 输出形式 | `optimization_advice.md`(文件)+ 控制台/日志摘要(双输出) |
| 生成机制 | 规则引擎定位异常指标 → LLM 结合低分样本二次解读(两层) |
| 触发方式 | YAML 场景文件显式声明 `optimization_advisor: true`,默认关闭 |
| LLM 实例 | 复用 `build_models()` 已创建的 `llm` 实例,不重建 client |
| 包位置 | `rag_eval/advisor/`(独立包,对外暴露 `run_advisor()` 单一入口) |
---
## 3. 架构
### 3.1 执行链路
```
run_scenario()
→ load_scenario() # 读 YAML解析 optimization_advisor 字段
→ build_models() # 已有:创建 llm, embeddings
→ build_metric_pipeline() # 已有
→ Evaluator.evaluate() # 已有:打分 → EvaluationResult
→ write_run_artifacts() # 已有scores.csv / summary.md / ...
→ run_advisor( # 新增3 行)
result, scenario, llm, artifact_paths
)
→ rules.diagnose(score_rows) # 规则引擎:返回 Diagnosis 列表
→ llm_analyzer.analyze(diags, samples) # LLM生成中文 Markdown 建议
→ writer.write(advice, paths) # 写文件 + 打日志
```
### 3.2 新增文件
```
rag_eval/advisor/
__init__.py ← 暴露 run_advisor(),外部唯一入口
rules.py ← 纯函数规则引擎,无 LLM可单独单测
llm_analyzer.py ← 接收 llm 实例 + 诊断结构 → 中文 Markdown
writer.py ← 写 optimization_advice.md打日志摘要
```
### 3.3 修改文件(最小改动)
| 文件 | 改动 |
|---|---|
| `rag_eval/shared/models.py` | `Scenario` 加 `optimization_advisor: bool = False` 字段 |
| `rag_eval/config/schema.py` | `ScenarioModel` 加同名字段 + 透传到 `Scenario` |
| `rag_eval/config/loader.py` | 透传 `optimization_advisor` 到 `Scenario` 构造 |
| `rag_eval/reporting/artifacts.py` | `RunArtifactPaths` 加 `advice_md: Path` 字段 + `build_artifact_paths()` 加赋值 |
| `rag_eval/execution/runner.py` | `run_scenario()` 末尾:`build_models` 返回 llm 传入,条件调用 `run_advisor()` |
### 3.4 输出产物
```
outputs/online/siemens-pdf-question-bank/<run_id>/
scenario.snapshot.yaml
scores.csv
invalid.csv
summary.md
metadata.json
optimization_advice.md ← 新增optimization_advisor: true 时生成)
```
---
## 4. 规则引擎rules.py
### 4.1 数据结构
```python
@dataclass
class Diagnosis:
metric: str # 指标名
mean_score: float # 本次均值
threshold: float # 警戒阈值
severity: str # "warning" | "critical"
root_causes: list[str] # 可能原因(来自架构设计 §11.2
suggested_actions: list[str] # 对应可调阶段
low_samples: list[dict] # 分数最低的 N 条样本(含 question/answer/ground_truth
```
### 4.2 七条指标诊断规则
阈值参考 RAG 评测最佳实践,分 warning / critical 两档:
| 指标 | warning | critical | 根因方向 | 对应优化阶段§11.2 |
|---|---|---|---|---|
| `faithfulness` | < 0.7 | < 0.5 | 生成未严格基于检索片段 / 幻觉 | 生成 prompt grounding开启校验 |
| `answer_relevancy` | < 0.7 | < 0.5 | 回答偏离问题 / 格式冗余 | 查询改写生成 prompt 格式 |
| `context_recall` | < 0.7 | < 0.5 | 检索遗漏关键信息 | 多查询问题分解Step-back加大过召回 |
| `context_precision` | < 0.6 | < 0.4 | 检索引入过多噪声 / 排序差 | 后检索重排压缩相关性过滤 |
| `noise_sensitivity` | > 0.3 | > 0.5 | 回答被噪声片段干扰(越低越好) | 后检索相关性过滤、重排 |
| `factual_correctness` | < 0.6 | < 0.4 | 回答事实与标准答案偏差大 | 检索与生成综合优化 |
| `semantic_similarity` | < 0.7 | < 0.5 | 回答语义与标准答案差距大 | 生成 prompt检索质量 |
> 注:`noise_sensitivity` 越低越好0=完全不受噪声影响),其阈值方向与其余相反。
### 4.3 低分样本选取
每个触发诊断的指标取该指标分数最低的 **top-3** 样本排除 NaN附入 `Diagnosis.low_samples`字段包含 `sample_id / question / answer / ground_truth / <metric_score>`
---
## 5. LLM 分析器llm_analyzer.py
### 5.1 输入
- `diagnoses: list[Diagnosis]` 规则引擎输出仅触发阈值的指标
- `llm` 已有 RAGAS LLM 实例scenario judge_model
- `scenario_name: str` 用于报告标题
### 5.2 Prompt 设计
使用**一次 LLM 调用**把所有触发诊断的指标和低分样本一起发送
```
你是一个 RAG 系统优化专家,正在分析西门子医疗 CT 文档问答系统的评测结果。
请用中文撰写一份优化建议报告,格式为 Markdown。
## 评测诊断摘要
{for each diagnosis: 指标名、均值、阈值、可能原因、建议动作}
## 低分样本示例
{for each diagnosis: top-3 低分样本的 question / answer / ground_truth}
## 要求
1. 按指标分节(## 指标名),先解释"为什么低",再给出"具体怎么改"
2. "具体怎么改"要结合低分样本的具体内容,而不只是泛泛建议
3. 最后写一节 ## 优先优化次序,按性价比排序(参考:不增加调用次数的优先)
4. 语言简洁,面向工程师,不要废话
```
### 5.3 输出
LLM 返回的 Markdown 字符串直接写入 `optimization_advice.md`在报告头部追加运行元信息)。
### 5.4 失败降级
LLM 调用失败超时/异常降级为**纯规则报告**只输出规则引擎的诊断结构不含 LLM 解读文件照常写出错误信息写入报告末尾不阻断整个评测流程
---
## 6. 写出层writer.py
### 6.1 文件写出
`optimization_advice.md` 结构
```markdown
# 优化建议报告 — <scenario_name>
- run_id: `<run_id>`
- 生成时间: `<timestamp>`
- judge_model: `<model>`
---
<LLM 生成的 Markdown 正文>
```
### 6.2 日志摘要
`run_advisor()` 完成后向 `logger.info` 打印一条精简摘要单行适合 `run_eval.bat` 结束后一眼扫到
```
[advisor] 触发诊断 3 项: faithfulness(0.42, critical) context_recall(0.58, warning) noise_sensitivity(0.41, critical)
[advisor] 优化建议已写出: outputs/online/.../optimization_advice.md
```
---
## 7. YAML 配置
场景文件新增一个顶层字段
```yaml
optimization_advisor: true # 默认 falsetrue 时评测结束后自动生成优化建议
```
后续若需精细配置阈值覆盖top-N 低分样本数可扩展为
```yaml
optimization_advisor:
enabled: true
top_low_samples: 3 # 每个指标取几条低分样本(默认 3
# thresholds: # 可选:覆盖默认阈值
# faithfulness: 0.65
```
本轮实现仅支持 `optimization_advisor: true/false`扩展接口预留但不实现
---
## 8. 测试策略
| 测试 | 文件 | 说明 |
|---|---|---|
| 规则引擎单测 | `tests/test_advisor_rules.py` | 纯函数 LLM覆盖每条规则的 warning/critical 触发NaN 跳过low_samples 选取 |
| writer 单测 | `tests/test_advisor_writer.py` | mock Diagnosis 列表验证 md 文件写出格式和日志输出 |
| 集成可选 | 现有 `tests/test_online_eval.py` | 验证 `optimization_advisor: true` 场景下 advice_md 存在 |
LLM 分析器不写单测依赖网络由集成场景覆盖
---
## 9. 不覆盖(本轮边界)
- 不支持跨版本对比分析只分析本次 run
- 不支持批量场景聚合建议
- 不建设 Web UI 展示
- LLM 分析器 prompt 本轮不做多语言适配直接中文
- advisor 阈值本轮硬编码在 `rules.py`不从 YAML 读取

19
main.py
View File

@@ -1,6 +1,8 @@
from __future__ import annotations
import argparse
import logging
from pathlib import Path
from rag_eval.dataset_builder.runner import run_dataset_build
from rag_eval.execution.runner import run_scenario
@@ -18,18 +20,33 @@ def parse_args() -> argparse.Namespace:
"--dataset-build-config",
help="Path to a YAML dataset build config file.",
)
parser.add_argument(
"--log-file",
default=None,
help="Write evaluation logs to this file (in addition to stderr). "
"Example: logs/eval.log",
)
parser.add_argument(
"--log-level",
default="INFO",
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging verbosity level (default: INFO). Use DEBUG for per-metric detail.",
)
return parser.parse_args()
def main() -> None:
"""Dispatch the CLI call to the requested workflow."""
args = parse_args()
log_level = getattr(logging, args.log_level.upper(), logging.INFO)
log_file = Path(args.log_file) if args.log_file else None
if args.dataset_build_config:
result = run_dataset_build(args.dataset_build_config)
print(f"Completed dataset build: {result.artifact_paths.root_dir}")
return
result = run_scenario(args.scenario)
result = run_scenario(args.scenario, log_file=log_file, log_level=log_level)
print(f"Completed run: {result.scenario.output_dir}")

View File

@@ -0,0 +1,67 @@
"""Optimization advisor: rule-based diagnosis + LLM-powered recommendations."""
from __future__ import annotations
import asyncio
import logging
from typing import Any
from rag_eval.reporting.artifacts import build_artifact_paths
from rag_eval.shared.models import EvaluationResult, Scenario
from .llm_analyzer import analyze
from .rules import Diagnosis, diagnose
from .writer import write_advice
logger = logging.getLogger("rag_eval.advisor")
__all__ = ["run_advisor", "Diagnosis", "diagnose"]
def run_advisor(
    result: EvaluationResult,
    scenario: Scenario,
    llm: Any,
) -> None:
    """Produce the optimization advice report for a finished evaluation run.

    Does nothing when ``scenario.optimization_advisor`` is falsy. Any exception
    raised while diagnosing, analysing or writing is caught and logged as a
    warning so that the evaluation result itself is never affected.

    Args:
        result: Finished evaluation result; ``run_id`` and ``score_rows`` are read.
        scenario: Resolved scenario; ``scenario_name``, ``output_dir``,
            ``metrics`` and ``judge_model`` are read.
        llm: LLM object handed through unchanged to ``analyze()``.
    """
    if not scenario.optimization_advisor:
        return

    name = scenario.scenario_name
    logger.info("[advisor] starting optimization analysis scenario=%s", name)

    try:
        advice_md = build_artifact_paths(scenario.output_dir, result.run_id).advice_md
        if advice_md is None:
            logger.warning("[advisor] advice_md path not set in RunArtifactPaths — skipping")
            return

        diagnoses = diagnose(result.score_rows, scenario.metrics)
        logger.info("[advisor] rule diagnosis complete: %d metric(s) triggered", len(diagnoses))

        # The LLM is only consulted when at least one rule fired.
        llm_markdown = asyncio.run(analyze(diagnoses, llm, name)) if diagnoses else ""

        write_advice(
            diagnoses=diagnoses,
            llm_markdown=llm_markdown,
            advice_path=advice_md,
            scenario_name=name,
            run_id=result.run_id,
            judge_model=scenario.judge_model,
        )
    except Exception as exc:
        # Best-effort by design: the advisor must not break the evaluation run.
        logger.warning(
            "[advisor] advisor failed (%s: %s) — evaluation result is unaffected",
            type(exc).__name__, exc,
        )

View File

@@ -0,0 +1,100 @@
"""LLM-powered analysis of rule diagnostics and low-score samples."""
from __future__ import annotations
import logging
from typing import Any
from .rules import Diagnosis
logger = logging.getLogger("rag_eval.advisor")
# Prompt sent to the judge LLM by analyze(). It is filled with str.format(),
# so the only replacement fields are {diagnosis_summary} and {low_sample_text};
# any literal brace added to this text later would have to be doubled.
# NOTE(review): the first line hard-codes one specific domain (Siemens CT
# document QA) even though this module is otherwise scenario-agnostic —
# confirm whether it should be derived from the scenario instead.
_PROMPT_TEMPLATE = """\
你是一个 RAG 系统优化专家,正在分析西门子医疗 CT 文档问答系统的评测结果。
请用中文撰写一份优化建议报告,格式为 Markdown。
## 评测诊断摘要
{diagnosis_summary}
## 低分样本示例
{low_sample_text}
## 报告要求
1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先)
4. 语言简洁,面向工程师,不要废话,不要重复列表内容
只输出 Markdown 报告正文,不要任何前置说明。
"""
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
lines = []
for d in diagnoses:
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
lines.append(
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f}"
f"阈值={d.threshold},严重程度={d.severity}"
)
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
return "\n".join(lines)
def _build_low_sample_text(diagnoses: list[Diagnosis]) -> str:
lines = []
for d in diagnoses:
if not d.low_samples:
continue
lines.append(f"### {d.metric} 低分样本(最多 3 条)")
for i, s in enumerate(d.low_samples, 1):
score = s.get(d.metric, "N/A")
lines.append(f"\n**样本 {i}**(分数={score}")
lines.append(f"- 问题:{s.get('question', '')}")
lines.append(f"- 回答:{s.get('answer', '')[:300]}")
lines.append(f"- 标准答案:{s.get('ground_truth', '')[:200]}")
return "\n".join(lines)
async def analyze(
    diagnoses: list[Diagnosis],
    llm: Any,
    scenario_name: str,
) -> str:
    """Ask the judge LLM for a Markdown optimization report.

    The prompt is assembled from the diagnoses and their low-score samples and
    sent as a single human message via ``llm.langchain_llm.ainvoke``.

    Prompt construction happens inside the same ``try`` as the LLM call, so a
    malformed sample or template problem degrades to the rule-only fallback
    exactly like a network failure does, instead of escaping to the caller.

    Args:
        diagnoses: Diagnoses produced by ``rules.diagnose()``. An empty list
            short-circuits to "" without calling the LLM.
        llm: Object exposing a ``langchain_llm`` attribute whose async
            ``ainvoke(messages)`` returns a response with a ``content`` string.
        scenario_name: Used only in log messages.

    Returns:
        The stripped response text, or "" when there is nothing to analyse or
        any step fails (an empty string makes the writer use its fallback).
    """
    if not diagnoses:
        return ""

    try:
        prompt = _PROMPT_TEMPLATE.format(
            diagnosis_summary=_build_diagnosis_summary(diagnoses),
            low_sample_text=_build_low_sample_text(diagnoses),
        )

        logger.info("[advisor] calling LLM for optimization analysis scenario=%s", scenario_name)
        # Imported lazily so the module can be imported without langchain_core.
        from langchain_core.messages import HumanMessage

        # Talk to the underlying chat model exposed as `langchain_llm`.
        response = await llm.langchain_llm.ainvoke([HumanMessage(content=prompt)])
        text = response.content.strip()
        logger.info("[advisor] LLM analysis complete chars=%d", len(text))
        return text
    except Exception as exc:
        # Deliberate best-effort: any failure falls back to the rule report.
        logger.warning(
            "[advisor] LLM analysis failed (%s: %s) — falling back to rule report",
            type(exc).__name__, exc,
        )
        return ""

236
rag_eval/advisor/rules.py Normal file
View File

@@ -0,0 +1,236 @@
"""Rule-based diagnostic engine for RAG evaluation metric scores."""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from typing import Any
@dataclass
class MetricRule:
    """Thresholds plus canned diagnostic text for a single metric."""

    # Mean score at which a "warning" diagnosis is raised.
    warning_threshold: float
    # Mean score at which the diagnosis is escalated to "critical".
    critical_threshold: float
    # Comparison direction: True -> trigger when the mean falls below a
    # threshold; False (noise_sensitivity) -> trigger when it rises above.
    higher_is_better: bool
    # Candidate explanations copied into the resulting diagnosis.
    root_causes: list[str]
    # Remediation hints copied into the resulting diagnosis.
    suggested_actions: list[str]
# Registry of diagnostic rules, keyed by metric name. diagnose() looks every
# requested metric up here and silently skips names that have no entry.
# Six entries are higher_is_better=True (a diagnosis fires when the mean drops
# below a threshold); noise_sensitivity is the single higher_is_better=False
# entry (fires when the mean rises above a threshold).
# The root_causes / suggested_actions strings are copied verbatim into each
# Diagnosis and from there into the LLM prompt and the fallback report.
METRIC_RULES: dict[str, MetricRule] = {
"faithfulness": MetricRule(
warning_threshold=0.7,
critical_threshold=0.5,
higher_is_better=True,
root_causes=[
"生成回答包含检索片段中不支持的陈述(幻觉)",
"生成阶段未严格遵循 grounding 约束",
"校验阶段未开启或未生效",
],
suggested_actions=[
"强化生成 prompt 的 grounding 约束('只依据参考资料作答'",
"开启校验阶段validation: by_scenario",
"检查低分样本中模型是否引用了片段外的知识",
],
),
"answer_relevancy": MetricRule(
warning_threshold=0.7,
critical_threshold=0.5,
higher_is_better=True,
root_causes=[
"回答偏离问题主旨或包含大量冗余内容",
"查询改写后问题语义漂移",
"生成 prompt 格式约束不足",
],
suggested_actions=[
"优化查询改写 prompt确保改写后语义不偏移",
"在生成 prompt 中加入'简洁准确、直接回答问题'的约束",
"检查低分样本的回答是否存在格式冗余或话题偏移",
],
),
"context_recall": MetricRule(
warning_threshold=0.7,
critical_threshold=0.5,
higher_is_better=True,
root_causes=[
"检索未能召回标准答案所涉及的关键信息",
"单一查询未能覆盖问题的多个角度",
"过召回数量不足,关键片段被截断",
],
suggested_actions=[
"启用多查询扩展use_multi_query覆盖不同措辞",
"对多跳问题启用问题分解sub_questions",
"加大过召回宽度recall_top_k",
"对颗粒度细的问题尝试 Step-back 双路检索",
],
),
"context_precision": MetricRule(
warning_threshold=0.6,
critical_threshold=0.4,
higher_is_better=True,
root_causes=[
"检索引入过多与问题无关的片段",
"重排未能将相关片段排在前列",
"缺少相关性过滤,噪声片段进入上下文",
],
suggested_actions=[
"启用或优化 listwise 重排,将相关片段排在前列",
"启用上下文压缩compression过滤无关句子",
"启用相关性过滤relevance_filter丢弃明确无关片段",
"缩小 rerank_keep_k如从 8 降到 5",
],
),
"noise_sensitivity": MetricRule(
warning_threshold=0.3, # higher is worse; trigger when mean > threshold
critical_threshold=0.5,
higher_is_better=False,
root_causes=[
"回答中包含检索到的噪声片段所引入的错误陈述",
"相关性过滤未能拦截干扰性片段",
"生成阶段对噪声片段未加区分地引用",
],
suggested_actions=[
"启用相关性过滤relevance_filter拦截噪声",
"优化重排,将不相关片段排到截断点之后",
"在生成 prompt 中强调'来源冲突时并列陈述,不擅自下定论'",
],
),
"factual_correctness": MetricRule(
warning_threshold=0.6,
critical_threshold=0.4,
higher_is_better=True,
root_causes=[
"回答的事实陈述与标准答案存在偏差",
"检索未能命中标准答案所依据的关键片段",
"生成阶段对多个来源综合时产生事实错误",
],
suggested_actions=[
"重点检查低分样本,确认是检索遗漏还是生成错误",
"提升 context_recall 以确保关键信息被检索到",
"对事实型问题将 temperature 降至 0",
],
),
"semantic_similarity": MetricRule(
warning_threshold=0.7,
critical_threshold=0.5,
higher_is_better=True,
root_causes=[
"回答语义与标准答案差距较大",
"回答过于简短或过于冗长,语义偏移",
"检索到的片段质量不足,导致生成内容偏离",
],
suggested_actions=[
"检查低分样本的回答与标准答案的表述差异",
"优化生成 prompt 使回答更贴近标准表述风格",
"提升检索质量context_recall / context_precision",
],
),
}
@dataclass
class Diagnosis:
    """Outcome of one metric rule that fired during diagnosis."""

    # Name of the metric the diagnosis refers to.
    metric: str
    # Mean of the metric's scores for the run.
    mean_score: float
    # The specific threshold (warning or critical) that was crossed.
    threshold: float
    # Either "warning" or "critical".
    severity: str
    # The list-valued fields default to fresh empty lists per instance.
    root_causes: list[str] = field(default_factory=list)
    suggested_actions: list[str] = field(default_factory=list)
    low_samples: list[dict[str, Any]] = field(default_factory=list)
def _mean_ignoring_nan(values: list[float]) -> float | None:
valid = [v for v in values if not math.isnan(v)]
if not valid:
return None
return sum(valid) / len(valid)
def _select_low_samples(
rows: list[dict[str, Any]],
metric: str,
top_n: int,
higher_is_better: bool,
) -> list[dict[str, Any]]:
"""Return the top_n worst-scoring rows for a metric, excluding NaN."""
valid = [r for r in rows if metric in r and not math.isnan(float(r[metric]))]
sorted_rows = sorted(valid, key=lambda r: float(r[metric]), reverse=not higher_is_better)
worst = sorted_rows[:top_n]
keep_keys = {"sample_id", "question", "answer", "ground_truth", metric}
return [{k: v for k, v in row.items() if k in keep_keys} for row in worst]
def diagnose(
    score_rows: list[dict[str, Any]],
    metrics: list[str],
    top_low_samples: int = 3,
) -> list[Diagnosis]:
    """Check every requested metric against its rule and report the breaches.

    A metric is skipped when it has no entry in ``METRIC_RULES``, when no row
    carries a float-convertible score for it, when all of its scores are NaN,
    or when its mean is on the healthy side of the warning threshold.

    Args:
        score_rows: Per-sample score dicts.
        metrics: Metric names to inspect, in reporting order.
        top_low_samples: Number of worst samples attached to each diagnosis.

    Returns:
        One ``Diagnosis`` per breached metric, in the order of ``metrics``;
        an empty list when nothing breached.
    """
    found: list[Diagnosis] = []
    for metric in metrics:
        rule = METRIC_RULES.get(metric)
        if rule is None:
            continue  # no rule registered for this metric

        values = _numeric_scores(score_rows, metric)
        mean = _mean_ignoring_nan(values) if values else None
        if mean is None:
            continue

        verdict = _classify_mean(mean, rule)
        if verdict is None:
            continue  # mean is within the healthy range
        severity, threshold = verdict

        worst = _select_low_samples(score_rows, metric, top_low_samples, rule.higher_is_better)
        found.append(Diagnosis(
            metric=metric,
            mean_score=round(mean, 4),
            threshold=threshold,
            severity=severity,
            # Copy the lists so a diagnosis never aliases the shared rule text.
            root_causes=list(rule.root_causes),
            suggested_actions=list(rule.suggested_actions),
            low_samples=worst,
        ))
    return found


def _numeric_scores(score_rows: list[dict[str, Any]], metric: str) -> list[float]:
    """Collect the float-convertible scores of ``metric`` (NaN is kept)."""
    collected: list[float] = []
    for row in score_rows:
        raw = row.get(metric)
        if raw is None:
            continue
        try:
            collected.append(float(raw))
        except (TypeError, ValueError):
            continue
    return collected


def _classify_mean(mean: float, rule: Any) -> tuple[str, float] | None:
    """Return ``(severity, threshold)`` for a breached rule, else ``None``."""
    if rule.higher_is_better:
        def breached(limit: float) -> bool:
            return mean < limit
    else:
        # Lower is better (noise_sensitivity): a breach means exceeding the limit.
        def breached(limit: float) -> bool:
            return mean > limit

    # Critical is tested first so it takes precedence over warning.
    for severity, limit in (
        ("critical", rule.critical_threshold),
        ("warning", rule.warning_threshold),
    ):
        if breached(limit):
            return severity, limit
    return None

View File

@@ -0,0 +1,82 @@
"""Write optimization advice to markdown file and emit log summary."""
from __future__ import annotations
import logging
from pathlib import Path
from .rules import Diagnosis
logger = logging.getLogger("rag_eval.advisor")
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
"""Return a single-line log summary of triggered diagnoses."""
if not diagnoses:
return "[advisor] 所有指标正常,无需优化建议。"
parts = [f"{d.metric}({d.mean_score:.2f}, {d.severity})" for d in diagnoses]
triggered = " ".join(parts)
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered}{advice_path}"
def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
"""Build a rules-only report when LLM analysis is unavailable."""
if not diagnoses:
return ""
lines = ["## 规则诊断LLM 分析不可用)\n"]
for d in diagnoses:
lines.append(f"### {d.metric} [{d.severity}] 均值={d.mean_score:.4f}")
lines.append("\n**可能原因:**")
for cause in d.root_causes:
lines.append(f"- {cause}")
lines.append("\n**建议动作:**")
for action in d.suggested_actions:
lines.append(f"- {action}")
lines.append("")
return "\n".join(lines)
def write_advice(
    diagnoses: list[Diagnosis],
    llm_markdown: str,
    advice_path: Path,
    scenario_name: str,
    run_id: str,
    judge_model: str,
) -> None:
    """Write the advice Markdown file and log a summary.

    The file consists of a metadata header (scenario name, run id, generation
    time, judge model) followed by one of three bodies: a fixed "no anomalies"
    note when ``diagnoses`` is empty, ``llm_markdown`` when it is non-empty, or
    the rule-only fallback report otherwise.

    Args:
        diagnoses: Diagnoses to report on.
        llm_markdown: LLM-written body; an empty string selects the fallback.
        advice_path: Destination file; missing parent directories are created.
        scenario_name: Shown in the report title.
        run_id: Shown in the report header.
        judge_model: Shown in the report header.
    """
    advice_path.parent.mkdir(parents=True, exist_ok=True)

    from rag_eval.shared.utils import utc_now_iso

    header = "\n".join((
        f"# 优化建议报告 — {scenario_name}",
        "",
        f"- run_id: `{run_id}`",
        f"- 生成时间: `{utc_now_iso()}`",
        f"- judge_model: `{judge_model}`",
        "",
        "---",
        "",
    ))

    if diagnoses:
        # Prefer the LLM text; fall back to the rule-only report when it is empty.
        body = llm_markdown or _build_fallback_report(diagnoses)
    else:
        body = "## ✅ 未发现明显指标异常\n\n所有指标均在正常范围内,当前 RAG 链路表现良好。\n"

    advice_path.write_text(header + body, encoding="utf-8")

    logger.info(_format_log_summary(diagnoses, advice_path))
    logger.info("[advisor] 优化建议已写出: %s", advice_path)

View File

@@ -61,6 +61,7 @@ def load_scenario(path: str | Path) -> Scenario:
max_samples=model.runtime.max_samples,
),
source_path=scenario_path,
optimization_advisor=model.optimization_advisor,
)
# Run cross-field checks after all relative paths have been resolved.
validate_scenario(scenario)

View File

@@ -54,6 +54,7 @@ class ScenarioModel(BaseModel):
metrics: list[str]
output_dir: str
runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
optimization_advisor: bool = False
@field_validator("metrics")
@classmethod

View File

@@ -3,6 +3,8 @@
from __future__ import annotations
import asyncio
import logging
import time
from typing import Any
from rag_eval.adapters.base import AppAdapter
@@ -13,6 +15,8 @@ from rag_eval.metrics.pipeline import MetricPipeline
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
from rag_eval.shared.utils import utc_now_iso
logger = logging.getLogger("rag_eval.execution.evaluator")
class Evaluator:
"""Coordinate dataset loading, optional app execution, and metric scoring."""
@@ -31,27 +35,61 @@ class Evaluator:
def evaluate(self) -> EvaluationResult:
"""Execute the full evaluation flow and return the collected results."""
started_at = utc_now_iso()
scenario_name = self.scenario.scenario_name
mode = self.scenario.mode
logger.info("=" * 60)
logger.info("[eval] START scenario=%s mode=%s", scenario_name, mode)
logger.info("[eval] dataset=%s", self.scenario.dataset.path)
logger.info("[eval] metrics=%s", list(self.scenario.metrics))
logger.info("[eval] judge=%s embed=%s", self.scenario.judge_model, self.scenario.embedding_model)
raw_records = load_dataset_records(self.scenario.dataset.path)
logger.info("[eval] raw_records=%d", len(raw_records))
samples, invalid_samples = normalize_records(
raw_records,
mode=self.scenario.mode,
max_samples=self.scenario.runtime.max_samples,
)
logger.info("[eval] normalized: valid=%d invalid=%d", len(samples), len(invalid_samples))
if self.scenario.mode == "online":
# Online mode enriches each sample by calling the target application first.
logger.info("[eval] online mode: calling app adapter for %d samples ...", len(samples))
t0 = time.monotonic()
samples, online_invalids = asyncio.run(self._enrich_online_samples(samples))
elapsed = time.monotonic() - t0
invalid_samples.extend(online_invalids)
logger.info(
"[eval] adapter done: enriched=%d adapter_invalids=%d elapsed=%.1fs",
len(samples), len(online_invalids), elapsed,
)
logger.info("[eval] scoring %d samples with metric pipeline ...", len(samples))
t0 = time.monotonic()
metric_scores = asyncio.run(
self.metric_pipeline.score_samples(
samples,
max_concurrency=self.scenario.runtime.metric_limit(),
)
)
elapsed = time.monotonic() - t0
logger.info("[eval] metric scoring done elapsed=%.1fs", elapsed)
finished_at = utc_now_iso()
score_rows = [self._merge_score(sample, score) for sample, score in zip(samples, metric_scores)]
# Summary of missing (NaN) scores per metric, for a quick health check in the log.
import math
for metric_name in self.scenario.metrics:
    nan_count = 0
    for row in score_rows:
        value = row.get(metric_name)
        # A missing key, None or "" means "no score". Do NOT use `value or nan`
        # here: a legitimate score of 0.0 is falsy and would be miscounted as NaN.
        if value is None or value == "" or math.isnan(float(value)):
            nan_count += 1
    logger.info("[eval] %-22s NaN=%d/%d (%.0f%%)",
                metric_name, nan_count, len(score_rows),
                100 * nan_count / len(score_rows) if score_rows else 0)
run_id = finished_at.replace(":", "-")
logger.info("[eval] DONE run_id=%s total_valid=%d total_invalid=%d",
run_id, len(samples), len(invalid_samples))
logger.info("=" * 60)
return EvaluationResult(
scenario=self.scenario,
run_id=run_id,
@@ -72,13 +110,27 @@ class Evaluator:
valid: list[NormalizedSample] = []
invalid: list[InvalidSample] = []
total = len(samples)
async def enrich_with_capture(sample: NormalizedSample) -> NormalizedSample | InvalidSample:
async def enrich_with_capture(idx: int, sample: NormalizedSample) -> NormalizedSample | InvalidSample:
"""Convert adapter exceptions into invalid samples instead of aborting the run."""
sid = sample.sample_id[:12]
logger.debug("[adapter] [%d/%d] calling adapter sample=%s question=%r",
idx + 1, total, sid, (sample.question or "")[:60])
t0 = time.monotonic()
try:
return await self.app_adapter.enrich_sample(sample)
result = await self.app_adapter.enrich_sample(sample)
elapsed = time.monotonic() - t0
ans_len = len(result.answer or "")
ctx_count = len(result.contexts or [])
logger.info("[adapter] [%d/%d] OK sample=%-12s ans_len=%d ctx_count=%d elapsed=%.1fs",
idx + 1, total, sid, ans_len, ctx_count, elapsed)
return result
except Exception as exc:
elapsed = time.monotonic() - t0
error_type = type(exc).__name__
logger.warning("[adapter] [%d/%d] FAIL sample=%-12s %s: %s (elapsed=%.1fs)",
idx + 1, total, sid, error_type, exc, elapsed)
return InvalidSample(
sample_id=sample.sample_id,
error=f"adapter failed [{error_type}]: {exc}",
@@ -86,8 +138,8 @@ class Evaluator:
)
factories = [
(lambda sample=sample: enrich_with_capture(sample))
for sample in samples
(lambda _idx=i, _sample=sample: enrich_with_capture(_idx, _sample))
for i, sample in enumerate(samples)
]
results = await gather_with_limit(factories, self.scenario.runtime.app_limit())
@@ -102,6 +154,8 @@ class Evaluator:
if not sample.contexts:
errors.append("adapter returned empty contexts")
if errors:
logger.warning("[adapter] incomplete payload sample=%s errors=%s",
sample.sample_id[:12], errors)
invalid.append(
InvalidSample(
sample_id=sample.sample_id,
@@ -111,6 +165,9 @@ class Evaluator:
)
continue
valid.append(sample)
logger.info("[adapter] enrichment summary: valid=%d invalid=%d of total=%d",
len(valid), len(invalid), total)
return valid, invalid
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:

View File

@@ -2,16 +2,42 @@
from __future__ import annotations
import logging
import sys
from pathlib import Path
from rag_eval.adapters.http import HttpAppAdapter
from rag_eval.adapters.python import PythonFunctionAdapter
from rag_eval.advisor import run_advisor
from rag_eval.config.loader import load_scenario
from rag_eval.metrics.factory import build_metric_pipeline
from rag_eval.metrics.factory import build_models, build_metric_pipeline
from rag_eval.reporting.writers import write_run_artifacts
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import Scenario
from .evaluator import Evaluator
logger = logging.getLogger("rag_eval.execution.runner")
def _setup_logging(log_file: Path | None = None, level: int = logging.INFO) -> None:
"""Configure root logger: always write to stderr, optionally also to a file."""
fmt = "%(asctime)s %(levelname)-8s %(name)s %(message)s"
datefmt = "%H:%M:%S"
handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
if log_file is not None:
log_file.parent.mkdir(parents=True, exist_ok=True)
fh = logging.FileHandler(log_file, encoding="utf-8")
fh.setFormatter(logging.Formatter(fmt, datefmt=datefmt))
handlers.append(fh)
logging.basicConfig(level=level, format=fmt, datefmt=datefmt, handlers=handlers, force=True)
# Also show ragas internal logs at WARNING so we can see LLM errors
logging.getLogger("ragas").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("openai").setLevel(logging.WARNING)
def build_adapter(scenario: Scenario):
"""Instantiate the adapter required by the resolved scenario, if any."""
@@ -27,16 +53,32 @@ def build_adapter(scenario: Scenario):
def run_scenario(
scenario_path: str,
settings: EvaluationSettings | None = None,
log_file: Path | None = None,
log_level: int = logging.INFO,
):
"""Run one scenario end to end and persist its reporting artifacts."""
_setup_logging(log_file=log_file, level=log_level)
logger.info("[runner] run_scenario path=%s", scenario_path)
settings = settings or EvaluationSettings()
if not settings.openai_api_key:
raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")
scenario = load_scenario(scenario_path)
logger.info("[runner] scenario loaded: name=%s mode=%s max_samples=%s",
scenario.scenario_name, scenario.mode, scenario.runtime.max_samples)
# Build models once; reuse llm in both MetricPipeline and advisor.
llm, embeddings = build_models(scenario.judge_model, scenario.embedding_model, settings)
adapter = build_adapter(scenario)
pipeline = build_metric_pipeline(scenario, settings)
pipeline = build_metric_pipeline(scenario, settings, llm=llm, embeddings=embeddings)
evaluator = Evaluator(scenario=scenario, metric_pipeline=pipeline, app_adapter=adapter)
result = evaluator.evaluate()
write_run_artifacts(result)
logger.info("[runner] artifacts written for run_id=%s", result.run_id)
# Optimization advisor — runs only if scenario.optimization_advisor is True.
run_advisor(result, scenario, llm)
return result

View File

@@ -18,7 +18,10 @@ from ragas.metrics.collections import (
AnswerRelevancy,
ContextPrecision,
ContextRecall,
FactualCorrectness,
Faithfulness,
NoiseSensitivity,
SemanticSimilarity,
)
from .pipeline import MetricPipeline
@@ -39,19 +42,34 @@ def build_models(
def build_metric_pipeline(
scenario: Scenario,
settings: EvaluationSettings,
llm: Any | None = None,
embeddings: Any | None = None,
) -> MetricPipeline:
"""Build a metric pipeline containing only the metrics requested by the scenario."""
llm, embeddings = build_models(
scenario.judge_model,
scenario.embedding_model,
settings,
)
"""Build a metric pipeline containing only the metrics requested by the scenario.
If llm and embeddings are provided (pre-built by the caller), they are reused.
Otherwise, new instances are created from scenario + settings.
"""
if llm is None or embeddings is None:
llm, embeddings = build_models(
scenario.judge_model,
scenario.embedding_model,
settings,
)
# Build the full registry once, then slice it by configured metric names.
registry: dict[str, Any] = {
"faithfulness": Faithfulness(llm=llm),
"answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
"context_recall": ContextRecall(llm=llm),
"context_precision": ContextPrecision(llm=llm),
# Robustness / end-to-end metrics (架构设计 §10.2).
# NoiseSensitivity mode='relevant': sensitivity to noise from relevant contexts.
"noise_sensitivity": NoiseSensitivity(llm=llm),
# FactualCorrectness mode='f1': balances claim precision and recall vs. ground truth.
"factual_correctness": FactualCorrectness(llm=llm),
# SemanticSimilarity: embedding cosine between answer and ground truth (no LLM call).
"semantic_similarity": SemanticSimilarity(embeddings=embeddings),
}
return MetricPipeline(
metrics={name: registry[name] for name in scenario.metrics},

View File

@@ -3,12 +3,16 @@
from __future__ import annotations
import asyncio
import logging
import math
import time
from dataclasses import dataclass
from typing import Any
from rag_eval.shared.models import MetricScore, NormalizedSample
logger = logging.getLogger("rag_eval.metrics.pipeline")
@dataclass(slots=True)
class MetricPipeline:
@@ -22,12 +26,43 @@ class MetricPipeline:
results = {name: math.nan for name in self.metrics}
errors: list[str] = []
sid = sample.sample_id[:12]
ans_len = len(sample.answer or "")
ctx_count = len(sample.contexts or [])
logger.debug(
"[score] sample=%s ans_len=%d ctx_count=%d question=%r",
sid, ans_len, ctx_count,
(sample.question or "")[:80],
)
for name, metric in self.metrics.items():
t0 = time.monotonic()
try:
result = await self._run_metric(name, metric, sample)
results[name] = float(result.value)
score_val = float(result.value)
results[name] = score_val
elapsed = time.monotonic() - t0
logger.info(
"[metric OK ] sample=%-12s %-20s score=%.4f elapsed=%.1fs",
sid, name, score_val, elapsed,
)
except asyncio.TimeoutError:
elapsed = time.monotonic() - t0
msg = f"timeout after {self.metric_timeout_seconds}s"
errors.append(f"{name}: {msg}")
logger.warning(
"[metric TMO] sample=%-12s %-20s TIMEOUT after %.1fs",
sid, name, elapsed,
)
except Exception as exc:
elapsed = time.monotonic() - t0
exc_type = type(exc).__name__
errors.append(f"{name}: {exc}")
logger.warning(
"[metric ERR] sample=%-12s %-20s %s: %s (elapsed=%.1fs)",
sid, name, exc_type, exc, elapsed,
)
return MetricScore(metrics=results, error=" | ".join(errors))
async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
@@ -59,6 +94,23 @@ class MetricPipeline:
reference=sample.ground_truth,
retrieved_contexts=sample.contexts,
)
elif name == "noise_sensitivity":
coroutine = metric.ascore(
user_input=sample.question,
response=sample.answer,
reference=sample.ground_truth,
retrieved_contexts=sample.contexts,
)
elif name == "factual_correctness":
coroutine = metric.ascore(
response=sample.answer,
reference=sample.ground_truth,
)
elif name == "semantic_similarity":
coroutine = metric.ascore(
reference=sample.ground_truth,
response=sample.answer,
)
else:
raise ValueError(f"Unsupported metric: {name}")
@@ -72,11 +124,22 @@ class MetricPipeline:
max_concurrency: int,
) -> list[MetricScore]:
"""Score all samples while respecting the configured concurrency limit."""
total = len(samples)
logger.info("[pipeline] scoring %d samples concurrency=%d timeout=%ss",
total, max_concurrency, self.metric_timeout_seconds)
semaphore = asyncio.Semaphore(max(1, max_concurrency))
completed = 0
async def guarded(sample: NormalizedSample) -> MetricScore:
async def guarded(idx: int, sample: NormalizedSample) -> MetricScore:
"""Throttle a single sample-scoring coroutine with the shared semaphore."""
nonlocal completed
async with semaphore:
return await self.score_sample(sample)
result = await self.score_sample(sample)
completed += 1
nan_metrics = [k for k, v in result.metrics.items() if math.isnan(v)]
status = f"NaN={nan_metrics}" if nan_metrics else "all OK"
logger.info("[pipeline] progress %d/%d sample=%-12s %s",
completed, total, sample.sample_id[:12], status)
return result
return await asyncio.gather(*(guarded(sample) for sample in samples))
return await asyncio.gather(*(guarded(i, s) for i, s in enumerate(samples)))

View File

@@ -1,8 +1,13 @@
"""Supported metric names recognized by scenario validation and pipeline setup."""
SUPPORTED_METRICS = {
    # Core retrieval / generation metrics (always available).
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
    # Robustness and end-to-end metrics (see architecture design doc, section 10.2).
    "noise_sensitivity",  # Robustness: sensitivity to retrieval noise.
    "factual_correctness",  # End-to-end: factual correctness of the answer vs. the reference answer.
    "semantic_similarity",  # End-to-end: answer/reference semantic similarity (embedding-based, no LLM call).
}

View File

@@ -17,4 +17,5 @@ def build_artifact_paths(output_dir: Path, run_id: str) -> RunArtifactPaths:
invalid_csv=run_dir / "invalid.csv",
summary_md=run_dir / "summary.md",
metadata_json=run_dir / "metadata.json",
advice_md=run_dir / "optimization_advice.md",
)

View File

@@ -76,6 +76,7 @@ class Scenario:
runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
app_adapter: AppAdapterConfig | None = None
source_path: Path | None = None
optimization_advisor: bool = False
def snapshot(self) -> dict[str, Any]:
"""Serialize the scenario into a reporting-friendly dictionary snapshot."""
@@ -159,3 +160,4 @@ class RunArtifactPaths:
invalid_csv: Path
summary_md: Path
metadata_json: Path
advice_md: Path | None = None

107
run_eval.bat Normal file
View File

@@ -0,0 +1,107 @@
@echo off
:: Delayed expansion is deliberately OFF: nothing here uses !var!, and with it
:: enabled every literal "!" (e.g. "Evaluation complete!") is silently dropped.
setlocal enableextensions disabledelayedexpansion
:: ============================================================
::  run_eval.bat - Run a RAGAS evaluation scenario with logs
::
::  Usage:
::    run_eval.bat                          (uses default online scenario)
::    run_eval.bat offline                  (runs offline smoke scenario)
::    run_eval.bat path\to\scenario.yaml    (any custom scenario)
::    run_eval.bat offline DEBUG            (second arg = log level)
:: ============================================================
cd /d "%~dp0"

echo.
echo ============================================================
echo   Siemens RAGAS - Evaluation Runner
echo ============================================================
echo.

:: ----------------------------------------------------------------
:: 1. Resolve scenario path (arg1)
:: ----------------------------------------------------------------
set "SCENARIO=%~1"
if "%SCENARIO%"=="" set "SCENARIO=online"
if /i "%SCENARIO%"=="online" set "SCENARIO=scenarios\online\siemens-pdf-question-bank-online.yaml"
if /i "%SCENARIO%"=="offline" set "SCENARIO=scenarios\offline\siemens-pdf-offline-smoke.yaml"

:: Use goto instead of a parenthesised block: an unescaped ")" inside an
:: "if ( ... )" body (as in "(default)" or a path like "Program Files (x86)")
:: closes the block early and the rest of the body would run unconditionally.
if exist "%SCENARIO%" goto :scenario_ok
echo [ERROR] Scenario file not found: %SCENARIO%
echo.
echo Usage examples:
echo   run_eval.bat                       - online eval (default)
echo   run_eval.bat offline               - offline smoke
echo   run_eval.bat path\to\file.yaml     - custom scenario
goto :error

:scenario_ok
echo [OK] Scenario : %SCENARIO%

:: ----------------------------------------------------------------
:: 2. Resolve log level (arg2, default INFO)
:: ----------------------------------------------------------------
set "LOG_LEVEL=%~2"
if "%LOG_LEVEL%"=="" set "LOG_LEVEL=INFO"
echo [OK] Log level: %LOG_LEVEL%

:: ----------------------------------------------------------------
:: 3. Create logs dir and build timestamped log filename
:: ----------------------------------------------------------------
if not exist "logs" mkdir logs

:: %DATE% / %TIME% formats depend on the Windows regional settings, so ask
:: PowerShell for a fixed-format, locale-independent timestamp instead.
set "STAMP="
for /f "usebackq delims=" %%t in (`powershell -NoProfile -Command "Get-Date -Format yyyy-MM-dd_HHmmss"`) do set "STAMP=%%t"
:: Fallback keeps the run going even if PowerShell is unavailable.
if not defined STAMP set "STAMP=notime_%RANDOM%"
set "LOG_FILE=logs\eval_%STAMP%.log"
echo [OK] Log file : %LOG_FILE%

echo.
echo ============================================================
echo   Starting evaluation...
echo   (Logs also written to %LOG_FILE%)
echo   Press Ctrl+C to abort
echo ============================================================
echo.

:: ----------------------------------------------------------------
:: 4. Run evaluation with UTF-8 and logging
:: ----------------------------------------------------------------
set PYTHONIOENCODING=utf-8
set PYTHONPATH=.
python main.py ^
    --scenario "%SCENARIO%" ^
    --log-file "%LOG_FILE%" ^
    --log-level %LOG_LEVEL%
if errorlevel 1 goto :eval_failed

echo.
echo ============================================================
echo   Evaluation complete!
echo   Log saved to: %LOG_FILE%
echo   Open the web console to view results: start.bat
echo ============================================================
echo.
pause
exit /b 0

:eval_failed
echo.
echo [ERROR] Evaluation failed. Check log: %LOG_FILE%
:: falls through to the shared error epilogue

:error
echo.
echo ============================================================
echo   Evaluation failed. See error above or check log file.
echo ============================================================
pause
exit /b 1

96
run_eval.ps1 Normal file
View File

@@ -0,0 +1,96 @@
# run_eval.ps1 - Siemens RAGAS Evaluation Runner
#
# Resolves a scenario file and a log level, creates a timestamped log file
# name under logs\, then runs main.py with --scenario / --log-file / --log-level.
#
# Usage:
#   .\run_eval.ps1                          # online eval (default)
#   .\run_eval.ps1 offline                  # offline smoke
#   .\run_eval.ps1 path\to\scenario.yaml    # custom scenario
#   .\run_eval.ps1 online DEBUG             # second arg = log level (DEBUG/INFO/WARNING)
# Or: powershell -ExecutionPolicy Bypass -File run_eval.ps1 [scenario] [log-level]
param(
    [string]$Scenario = "online",
    [string]$LogLevel = "INFO"
)

$ErrorActionPreference = "Stop"
Set-Location $PSScriptRoot

# Horizontal rule shared by every banner below.
$rule = "============================================================"

Write-Host ""
Write-Host $rule -ForegroundColor Cyan
Write-Host " Siemens RAGAS - Evaluation Runner" -ForegroundColor Cyan
Write-Host $rule -ForegroundColor Cyan
Write-Host ""

# ----------------------------------------------------------------
# 1. Resolve scenario path
# ----------------------------------------------------------------
# "online" / "offline" are shortcuts (matched case-insensitively) for the
# bundled scenario files; anything else is used as a path as-is.
$shortcut = $Scenario.ToLower()
if ($shortcut -eq "online") {
    $Scenario = "scenarios\online\siemens-pdf-question-bank-online.yaml"
}
elseif ($shortcut -eq "offline") {
    $Scenario = "scenarios\offline\siemens-pdf-offline-smoke.yaml"
}

$scenarioFound = Test-Path $Scenario
if (-not $scenarioFound) {
    Write-Host "[ERROR] Scenario file not found: $Scenario" -ForegroundColor Red
    Write-Host ""
    Write-Host "Usage examples:"
    Write-Host " .\run_eval.ps1 - online eval (default)"
    Write-Host " .\run_eval.ps1 offline - offline smoke"
    Write-Host " .\run_eval.ps1 path\to\file.yaml - custom scenario"
    Read-Host "Press Enter to exit"
    exit 1
}
Write-Host "[OK] Scenario : $Scenario" -ForegroundColor Green

# ----------------------------------------------------------------
# 2. Validate log level
# ----------------------------------------------------------------
$validLevels = @("DEBUG", "INFO", "WARNING", "ERROR")
if ($LogLevel.ToUpper() -notin $validLevels) {
    Write-Host "[WARN] Unknown log level '$LogLevel', defaulting to INFO" -ForegroundColor Yellow
    $LogLevel = "INFO"
}
Write-Host "[OK] Log level: $LogLevel" -ForegroundColor Green

# ----------------------------------------------------------------
# 3. Create logs dir with timestamped filename
# ----------------------------------------------------------------
if (-not (Test-Path "logs")) {
    $null = New-Item -ItemType Directory -Path "logs"
}
$timestamp = Get-Date -Format "yyyy-MM-dd_HHmmss"
$logFile = "logs\eval_$timestamp.log"
Write-Host "[OK] Log file : $logFile" -ForegroundColor Green

Write-Host ""
Write-Host $rule -ForegroundColor Cyan
Write-Host " Starting evaluation..." -ForegroundColor Cyan
Write-Host " Logs also written to: $logFile" -ForegroundColor Cyan
Write-Host " Press Ctrl+C to abort" -ForegroundColor Yellow
Write-Host $rule -ForegroundColor Cyan
Write-Host ""

# ----------------------------------------------------------------
# 4. Run evaluation
# ----------------------------------------------------------------
$env:PYTHONIOENCODING = "utf-8"
$env:PYTHONPATH = "."

# Build the argument list once and splat it onto the native command.
$pythonArgs = @(
    "main.py",
    "--scenario", $Scenario,
    "--log-file", $logFile,
    "--log-level", $LogLevel.ToUpper()
)
& python @pythonArgs

# A non-zero exit code from python means the evaluation failed.
if ($LASTEXITCODE -ne 0) {
    Write-Host ""
    Write-Host "[ERROR] Evaluation failed. Check log: $logFile" -ForegroundColor Red
    Read-Host "Press Enter to exit"
    exit 1
}

Write-Host ""
Write-Host $rule -ForegroundColor Green
Write-Host " Evaluation complete!" -ForegroundColor Green
Write-Host " Log saved to: $logFile" -ForegroundColor Green
Write-Host " Open the web console to view results: start.bat" -ForegroundColor Cyan
Write-Host $rule -ForegroundColor Green
Write-Host ""
Read-Host "Press Enter to exit"

View File

@@ -9,6 +9,10 @@ metrics:
- answer_relevancy
- context_recall
- context_precision
# 可选:鲁棒性 / 端到端指标(数据集已含 ground_truth,取消注释即可启用)
# - noise_sensitivity # 鲁棒性:对检索噪声的敏感度
# - factual_correctness # 端到端:事实正确性(相对标准答案)
# - semantic_similarity # 端到端:语义相似度(embedding,无 LLM 调用)
output_dir: ../../outputs/siemens-pdf-offline-smoke
runtime:
batch_size: 4

View File

@@ -1,13 +1,13 @@
scenario_name: sample-pdf-question-bank-online
mode: online
dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv
judge_model: deepseek-v4-pro
judge_model: qwen3.5-flash
embedding_model: text-embedding-v3
metrics:
- faithfulness
- answer_relevancy
- context_recall
- context_precision
- faithfulness
- answer_relevancy
- context_recall
- context_precision
output_dir: ../../outputs/online/sample-pdf-question-bank
runtime:
batch_size: 2
@@ -19,4 +19,4 @@ app_adapter:
callable: apps.pdf_question_bank.adapter:run
static_kwargs:
source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl
model: deepseek-v4-flash
model: glm-5

View File

@@ -3,20 +3,24 @@ mode: online
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
judge_model: deepseek-v4-flash
embedding_model: text-embedding-v3
optimization_advisor: true
metrics:
- faithfulness
- answer_relevancy
- context_recall
- context_precision
- faithfulness
- answer_relevancy
- context_recall
- context_precision
- noise_sensitivity
- factual_correctness
- semantic_similarity
output_dir: ../../outputs/online/siemens-pdf-question-bank
runtime:
batch_size: 4
app_concurrency: 4
metric_concurrency: 4
max_samples: 50
batch_size: 3
app_concurrency: 3
metric_concurrency: 3
max_samples: 10
app_adapter:
type: python
callable: apps.siemens_pdf_qa.adapter:run
static_kwargs:
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
model: deepseek-v4-flash
model: glm-5

59
scripts/smoke_advisor.py Normal file
View File

@@ -0,0 +1,59 @@
"""Offline smoke-check for the advisor module wiring (no network required)."""
import math
import sys
import tempfile
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from rag_eval.advisor.rules import diagnose
from rag_eval.advisor.writer import write_advice, _format_log_summary
# Simulate score_rows with low faithfulness and high noise_sensitivity.
def _fake_row(i: int) -> dict:
    """Return one synthetic score row; scores vary slightly with the index ``i``."""
    return {
        "sample_id": f"s{i}",
        "question": f"问题{i}西门子CT扫描的Flash技术原理是什么",
        "answer": f"答案{i}Flash技术采用双源CT扫描",
        "ground_truth": f"标准答案{i}Flash扫描利用双源CT和大螺距实现超低辐射剂量扫描",
        "faithfulness": 0.3 + i * 0.05,
        "noise_sensitivity": 0.4 + i * 0.02,
        "context_recall": 0.75,
        "semantic_similarity": 0.65,
    }


rows = [_fake_row(i) for i in range(5)]

# Step 1: the rules engine must flag both deliberately-bad metrics.
diags = diagnose(
    rows,
    metrics=["faithfulness", "noise_sensitivity", "context_recall", "semantic_similarity"],
)
print(f"Diagnosed {len(diags)} metric(s):")
for d in diags:
    print(f" {d.metric}: mean={d.mean_score}, severity={d.severity}, low_samples={len(d.low_samples)}")

assert len(diags) >= 2, f"Expected at least 2 diagnoses, got {len(diags)}"
metrics_hit = {d.metric for d in diags}
for required in ("faithfulness", "noise_sensitivity"):
    assert required in metrics_hit, f"{required} should be triggered"

# Step 2: write the report with an empty LLM body and inspect the file on disk.
with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "optimization_advice.md"
    write_advice(
        diagnoses=diags,
        llm_markdown="",  # fallback mode (no LLM)
        advice_path=path,
        scenario_name="smoke-test-siemens",
        run_id="2026-06-16T00-00-00",
        judge_model="deepseek-v4-flash",
    )
    content = path.read_text(encoding="utf-8")
    assert "smoke-test-siemens" in content, "scenario name missing from report"
    for required in ("faithfulness", "noise_sensitivity"):
        assert required in content, f"{required} missing from report"
    print(f"\nAdvice file ({len(content)} chars) — assertions OK")

# Step 3: verify log summary format.
summary = _format_log_summary(diags, Path("optimization_advice.md"))
has_faithfulness = "faithfulness" in summary
print(f"\nLog summary length: {len(summary)} chars, faithfulness present: {has_faithfulness}")
assert "触发诊断" in summary
assert "faithfulness" in summary

print("\nSmoke check PASSED")

View File

@@ -56,7 +56,17 @@ if errorlevel 1 (
)
:: ----------------------------------------------------------------
:: 4. Seed demo data if no runs exist yet
:: 4. Ensure configs/ directory exists for LLM profile storage
:: ----------------------------------------------------------------
if not exist "configs" (
mkdir configs
echo [OK] Created configs/ directory for LLM profile storage.
) else (
echo [OK] configs/ directory ready.
)
:: ----------------------------------------------------------------
:: 5. Seed demo data if no runs exist yet
:: ----------------------------------------------------------------
if not exist "outputs\kba-knowledge-base-offline-baseline" (
echo [INFO] No run data found. Generating demo data...
@@ -71,7 +81,7 @@ if not exist "outputs\kba-knowledge-base-offline-baseline" (
)
:: ----------------------------------------------------------------
:: 5. Pick an available port
:: 6. Pick an available port
:: ----------------------------------------------------------------
set PORT=8800
netstat -ano 2>nul | findstr ":8800" | findstr "LISTENING" >nul 2>&1

View File

@@ -58,7 +58,17 @@ if ($LASTEXITCODE -ne 0) {
}
# ----------------------------------------------------------------
# 4. Seed demo data if missing
# 4. Ensure configs/ directory exists for LLM profile storage
# ----------------------------------------------------------------
if (-not (Test-Path "configs")) {
New-Item -ItemType Directory "configs" | Out-Null
Write-Host "[OK] Created configs/ directory for LLM profile storage." -ForegroundColor Green
} else {
Write-Host "[OK] configs/ directory ready." -ForegroundColor Green
}
# ----------------------------------------------------------------
# 5. Seed demo data if missing
# ----------------------------------------------------------------
if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
Write-Host "[INFO] No run data found. Generating demo data..." -ForegroundColor Yellow
@@ -73,7 +83,7 @@ if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
}
# ----------------------------------------------------------------
# 5. Pick an available port
# 6. Pick an available port
# ----------------------------------------------------------------
$PORT = 8800
$inUse = netstat -ano 2>$null | Select-String ":$PORT\s" | Select-String "LISTENING"

0
tests/__init__.py Normal file
View File

View File

@@ -0,0 +1,72 @@
import math
import unittest
from rag_eval.advisor.rules import Diagnosis, diagnose, METRIC_RULES
class TestDiagnosis(unittest.TestCase):
    """Unit tests for ``diagnose`` and the ``METRIC_RULES`` table."""

    def _make_rows(self, metric: str, scores: list[float]) -> list[dict]:
        """Build one score row per value in ``scores``, stored under ``metric``."""
        rows = []
        for idx, value in enumerate(scores):
            rows.append({
                metric: value,
                "question": f"q{idx}",
                "answer": f"a{idx}",
                "ground_truth": f"gt{idx}",
                "sample_id": f"s{idx}",
            })
        return rows

    def test_no_diagnosis_when_all_scores_above_threshold(self):
        healthy = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
        self.assertEqual(diagnose(healthy, metrics=["faithfulness"]), [])

    def test_warning_when_mean_below_warning_threshold(self):
        borderline = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
        found = diagnose(borderline, metrics=["faithfulness"])
        self.assertEqual(len(found), 1)
        only = found[0]
        self.assertEqual(only.metric, "faithfulness")
        self.assertEqual(only.severity, "warning")
        self.assertAlmostEqual(only.mean_score, 0.65, places=2)

    def test_critical_when_mean_below_critical_threshold(self):
        poor = self._make_rows("faithfulness", [0.3, 0.4, 0.45])
        found = diagnose(poor, metrics=["faithfulness"])
        self.assertEqual(found[0].severity, "critical")

    def test_low_samples_selected_are_bottom_three(self):
        mixed = self._make_rows("faithfulness", [0.1, 0.2, 0.3, 0.8, 0.9])
        found = diagnose(mixed, metrics=["faithfulness"])
        worst = found[0].low_samples
        self.assertEqual(len(worst), 3)
        picked = sorted(sample["faithfulness"] for sample in worst)
        self.assertEqual(picked, [0.1, 0.2, 0.3])

    def test_nan_scores_excluded_from_mean_and_low_samples(self):
        with_gap = self._make_rows("faithfulness", [0.3, float("nan"), 0.4])
        found = diagnose(with_gap, metrics=["faithfulness"])
        self.assertEqual(len(found), 1)
        for sample in found[0].low_samples:
            self.assertFalse(math.isnan(sample["faithfulness"]))

    def test_noise_sensitivity_direction_inverted(self):
        # noise_sensitivity: higher is worse; threshold > 0.3 is warning
        noisy = self._make_rows("noise_sensitivity", [0.4, 0.45, 0.5])
        found = diagnose(noisy, metrics=["noise_sensitivity"])
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].metric, "noise_sensitivity")

    def test_noise_sensitivity_no_diagnosis_when_low(self):
        quiet = self._make_rows("noise_sensitivity", [0.1, 0.15, 0.2])
        self.assertEqual(diagnose(quiet, metrics=["noise_sensitivity"]), [])

    def test_skips_metric_not_in_rows(self):
        # The single row carries no "context_recall" key at all.
        lone_row = {
            "faithfulness": 0.3,
            "question": "q",
            "answer": "a",
            "ground_truth": "gt",
            "sample_id": "s1",
        }
        found = diagnose([lone_row], metrics=["faithfulness", "context_recall"])
        reported = [item.metric for item in found]
        self.assertIn("faithfulness", reported)
        self.assertNotIn("context_recall", reported)

    def test_all_seven_metrics_have_rules(self):
        expected = {
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
            "noise_sensitivity",
            "factual_correctness",
            "semantic_similarity",
        }
        self.assertEqual(set(METRIC_RULES.keys()), expected)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,113 @@
import shutil
import unittest
from pathlib import Path
from rag_eval.advisor.rules import Diagnosis
from rag_eval.advisor.writer import write_advice, _format_log_summary
class TestWriteAdvice(unittest.TestCase):
    """Tests for ``write_advice`` and ``_format_log_summary``."""

    def setUp(self):
        """Create a fresh scratch directory for the advice file of each test."""
        # Imported locally so the module's import block stays untouched.
        import tempfile

        # A unique directory under the system temp location keeps the test
        # independent of the current working directory and leaves no files in
        # the source tree (the old "tests/.tmp/..." path was CWD-relative).
        self.tmp = Path(tempfile.mkdtemp(prefix="test_advisor_writer_"))
        self.advice_path = self.tmp / "optimization_advice.md"

    def tearDown(self):
        """Remove the scratch directory; errors are ignored on purpose."""
        shutil.rmtree(self.tmp, ignore_errors=True)

    def _make_diagnosis(self, metric="faithfulness", severity="warning"):
        """Return a ``Diagnosis`` with fixed sample data for ``metric`` / ``severity``."""
        return Diagnosis(
            metric=metric,
            mean_score=0.55,
            threshold=0.7,
            severity=severity,
            root_causes=["原因1", "原因2"],
            suggested_actions=["建议1", "建议2"],
            low_samples=[
                {"sample_id": "s1", "question": "问题1", "answer": "答案1",
                 "ground_truth": "标准1", metric: 0.4},
            ],
        )

    def test_write_creates_file(self):
        diag = self._make_diagnosis()
        write_advice(
            diagnoses=[diag],
            llm_markdown="## faithfulness\n\nLLM 建议内容",
            advice_path=self.advice_path,
            scenario_name="test-scenario",
            run_id="2026-01-01T00-00-00",
            judge_model="deepseek-v4-flash",
        )
        self.assertTrue(self.advice_path.exists())

    def test_write_contains_scenario_name_and_run_id(self):
        diag = self._make_diagnosis()
        write_advice(
            diagnoses=[diag],
            llm_markdown="## faithfulness\n\nLLM 建议",
            advice_path=self.advice_path,
            scenario_name="siemens-test",
            run_id="2026-01-01T00-00-00",
            judge_model="deepseek-v4-flash",
        )
        content = self.advice_path.read_text(encoding="utf-8")
        self.assertIn("siemens-test", content)
        self.assertIn("2026-01-01T00-00-00", content)

    def test_write_contains_llm_markdown(self):
        diag = self._make_diagnosis()
        write_advice(
            diagnoses=[diag],
            llm_markdown="## faithfulness\n\n具体建议文本",
            advice_path=self.advice_path,
            scenario_name="test",
            run_id="rid",
            judge_model="model",
        )
        content = self.advice_path.read_text(encoding="utf-8")
        self.assertIn("具体建议文本", content)

    def test_write_fallback_when_no_llm_markdown(self):
        """When llm_markdown is empty, writer emits rule-only report."""
        diag = self._make_diagnosis()
        write_advice(
            diagnoses=[diag],
            llm_markdown="",
            advice_path=self.advice_path,
            scenario_name="test",
            run_id="rid",
            judge_model="model",
        )
        content = self.advice_path.read_text(encoding="utf-8")
        self.assertIn("faithfulness", content)
        self.assertIn("原因1", content)

    def test_log_summary_format(self):
        diags = [
            self._make_diagnosis("faithfulness", "critical"),
            self._make_diagnosis("context_recall", "warning"),
        ]
        summary = _format_log_summary(diags, self.advice_path)
        self.assertIn("faithfulness", summary)
        self.assertIn("critical", summary)
        self.assertIn("context_recall", summary)
        self.assertIn("warning", summary)

    def test_write_empty_diagnoses_still_creates_file(self):
        write_advice(
            diagnoses=[],
            llm_markdown="",
            advice_path=self.advice_path,
            scenario_name="test",
            run_id="rid",
            judge_model="model",
        )
        self.assertTrue(self.advice_path.exists())
        content = self.advice_path.read_text(encoding="utf-8")
        self.assertIn("未发现明显指标异常", content)
# Run the unittest runner when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0
tests/webapp/__init__.py Normal file
View File

View File

@@ -0,0 +1,139 @@
"""Integration tests for /api/llm-profiles endpoints."""
import pytest
from fastapi.testclient import TestClient
@pytest.fixture()
def client(tmp_path, monkeypatch):
    """TestClient with a fresh ProfileManager backed by a temp file."""
    store = tmp_path / "profiles.json"
    import webapp.services.profile_manager as pm_mod
    from webapp.services.profile_manager import ProfileManager
    fresh_mgr = ProfileManager(store_path=store)
    # Swap the module-level singleton on the service module ...
    monkeypatch.setattr(pm_mod, "profile_manager", fresh_mgr)
    import webapp.api.llm_profiles as api_mod
    # ... and on the API module, which holds its own `profile_manager` name.
    monkeypatch.setattr(api_mod, "profile_manager", fresh_mgr)
    # The app is built only after both patches are in place.
    from webapp.server import create_app
    return TestClient(create_app())
def test_list_empty(client):
resp = client.get("/api/llm-profiles")
assert resp.status_code == 200
assert resp.json()["profiles"] == []
def test_create_and_list(client):
body = {"name": "Test", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
resp = client.post("/api/llm-profiles", json=body)
assert resp.status_code == 201
data = resp.json()
assert data["name"] == "Test"
assert data["profile_id"] != ""
resp2 = client.get("/api/llm-profiles")
assert len(resp2.json()["profiles"]) == 1
def test_update_profile(client):
body = {"name": "Old", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
pid = client.post("/api/llm-profiles", json=body).json()["profile_id"]
upd = {"name": "New", "model": "m2", "base_url": "http://x/v1", "api_key": "k", "timeout_seconds": 60}
resp = client.put(f"/api/llm-profiles/{pid}", json=upd)
assert resp.status_code == 200
assert resp.json()["name"] == "New"
assert resp.json()["timeout_seconds"] == 60
def test_delete_profile(client):
body = {"name": "Del", "model": "m", "base_url": "http://x/v1", "api_key": "k"}
pid = client.post("/api/llm-profiles", json=body).json()["profile_id"]
resp = client.delete(f"/api/llm-profiles/{pid}")
assert resp.status_code == 200
assert resp.json()["deleted"] is True
assert len(client.get("/api/llm-profiles").json()["profiles"]) == 0
def test_update_nonexistent(client):
resp = client.put("/api/llm-profiles/nope",
json={"name": "X", "model": "m", "base_url": "http://x/v1", "api_key": "k"})
assert resp.status_code == 404
def test_delete_nonexistent(client):
resp = client.delete("/api/llm-profiles/nope")
assert resp.status_code == 404
# ---------------------------------------------------------------------------
# YAML patcher tests
# ---------------------------------------------------------------------------
import yaml as yaml_lib
from webapp.services.yaml_patcher import apply_profiles_to_scenario
from webapp.models import LLMProfile
def test_apply_judge_profile(tmp_path):
    """Applying a judge profile patches judge_model in the YAML."""
    scenario_file = tmp_path / "test-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: old-model\nembedding_model: emb\n"
        "dataset: data.csv\nmetrics:\n- faithfulness\noutput_dir: outputs/test\n",
        encoding="utf-8",
    )
    judge_p = LLMProfile(
        profile_id="x", name="J", model="new-model",
        base_url="http://x/v1", api_key="k", created_at="t", updated_at="t",
    )
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=judge_p,
        answer_profile=None,
        dataset_profile=None,
        _resolve_absolute=True,
    )
    assert "judge_model" in patched
    # Read back with the same explicit encoding the file is written with,
    # instead of relying on the platform's locale default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert data["judge_model"] == "new-model"
def test_apply_answer_profile(tmp_path):
    """Applying an answer profile patches app_adapter.static_kwargs.model."""
    scenario_file = tmp_path / "online.yaml"
    scenario_file.write_text(
        "scenario_name: online\nmode: online\njudge_model: j\nembedding_model: emb\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n"
        "app_adapter:\n  type: python\n  callable: apps.foo:run\n"
        "  static_kwargs:\n    model: old\n    source_chunks_path: chunks.jsonl\n",
        encoding="utf-8",
    )
    answer_p = LLMProfile(
        profile_id="y", name="A", model="new-answer-model",
        base_url="http://x/v1", api_key="k", created_at="t", updated_at="t",
    )
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None,
        answer_profile=answer_p,
        dataset_profile=None,
        _resolve_absolute=True,
    )
    assert "app_adapter.static_kwargs.model" in patched
    # Read back with the same explicit encoding the file is written with,
    # instead of relying on the platform's locale default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert data["app_adapter"]["static_kwargs"]["model"] == "new-answer-model"
def test_apply_no_profiles_returns_empty(tmp_path):
    """When no profiles are given, no fields are patched."""
    scenario_file = tmp_path / "noop.yaml"
    scenario_file.write_text("scenario_name: noop\njudge_model: m\n", encoding="utf-8")
    no_profiles = {
        "judge_profile": None,
        "answer_profile": None,
        "dataset_profile": None,
    }
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        _resolve_absolute=True,
        **no_profiles,
    )
    assert patched == []

View File

@@ -0,0 +1,100 @@
import pytest
from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse
def test_llm_profile_defaults():
    """Fields left out at construction fall back to their declared defaults."""
    required_only = {
        "profile_id": "abc",
        "name": "Test",
        "model": "gpt-4",
        "base_url": "http://localhost/v1",
        "api_key": "sk-test",
    }
    profile = LLMProfile(**required_only)
    assert profile.timeout_seconds == 30
    assert profile.created_at != ""
    assert profile.updated_at != ""
def test_profile_apply_request_fields():
    """Role ids given to ProfileApplyRequest are stored as passed, None included."""
    fields = {
        "scenario_path": "scenarios/offline/sample.yaml",
        "judge_profile_id": "id1",
        "answer_profile_id": "id2",
        "dataset_profile_id": None,
    }
    request = ProfileApplyRequest(**fields)
    assert request.judge_profile_id == "id1"
    assert request.dataset_profile_id is None
def test_profile_apply_response():
    """patched_fields passed to ProfileApplyResponse are exposed on the model."""
    response = ProfileApplyResponse(
        scenario_path="scenarios/offline/sample.yaml",
        patched_fields=["judge_model"],
    )
    assert "judge_model" in response.patched_fields
# ---------------------------------------------------------------------------
# ProfileManager service tests
# ---------------------------------------------------------------------------
import json
from webapp.services.profile_manager import ProfileManager
def _make_manager(tmp_path):
    """Return a ProfileManager whose store file lives under *tmp_path*."""
    return ProfileManager(store_path=tmp_path / "profiles.json")
def test_create_profile(tmp_path):
    """create() returns a profile carrying a non-empty id and the given name."""
    manager = _make_manager(tmp_path)
    created = manager.create(
        name="Local",
        model="deepseek-v4-flash",
        base_url="http://localhost/v1",
        api_key="sk-x",
    )
    assert created.profile_id != ""
    assert created.name == "Local"
def test_list_profiles(tmp_path):
    """list_all() returns every profile that was created."""
    manager = _make_manager(tmp_path)
    seeds = [
        ("A", "m1", "http://a/v1", "k1"),
        ("B", "m2", "http://b/v1", "k2"),
    ]
    for name, model, base_url, api_key in seeds:
        manager.create(name=name, model=model, base_url=base_url, api_key=api_key)
    assert len(manager.list_all()) == 2
def test_get_profile(tmp_path):
    """get() finds a profile by the id that create() assigned."""
    manager = _make_manager(tmp_path)
    profile_id = manager.create(
        name="X", model="m", base_url="http://x/v1", api_key="k"
    ).profile_id
    found = manager.get(profile_id)
    assert found is not None
    assert found.name == "X"
def test_update_profile(tmp_path):
    """update() returns a profile reflecting the new name, model and timeout."""
    manager = _make_manager(tmp_path)
    original = manager.create(name="Old", model="m", base_url="http://x/v1", api_key="k")
    changes = {
        "name": "New",
        "model": "m2",
        "base_url": "http://x/v1",
        "api_key": "k",
        "timeout_seconds": 60,
    }
    result = manager.update(original.profile_id, **changes)
    assert result is not None
    assert result.name == "New"
    assert result.model == "m2"
    assert result.timeout_seconds == 60
def test_delete_profile(tmp_path):
    """delete() reports True and the profile can no longer be fetched."""
    manager = _make_manager(tmp_path)
    doomed = manager.create(name="Del", model="m", base_url="http://x/v1", api_key="k")
    was_deleted = manager.delete(doomed.profile_id)
    assert was_deleted is True
    assert manager.get(doomed.profile_id) is None
def test_persistence(tmp_path):
    """A second manager opened on the same store file sees earlier profiles."""
    store = tmp_path / "profiles.json"
    writer = ProfileManager(store_path=store)
    saved = writer.create(name="Persist", model="m", base_url="http://x/v1", api_key="k")
    reader = ProfileManager(store_path=store)
    reloaded = reader.get(saved.profile_id)
    assert reloaded is not None
def test_get_nonexistent(tmp_path):
    """get() returns None for an id that was never created."""
    missing = _make_manager(tmp_path).get("does-not-exist")
    assert missing is None
def test_delete_nonexistent(tmp_path):
    """delete() returns False for an id that was never created."""
    outcome = _make_manager(tmp_path).delete("does-not-exist")
    assert outcome is False

View File

@@ -0,0 +1,96 @@
"""CRUD routes for LLM profiles plus the scenario-patching apply endpoint."""
from __future__ import annotations
from fastapi import APIRouter, HTTPException
from webapp.models import (
CreateProfileRequest,
LLMProfile,
ProfileApplyRequest,
ProfileApplyResponse,
)
from webapp.services.profile_manager import profile_manager
from webapp.services.yaml_patcher import apply_profiles_to_scenario
router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"])
@router.get("", response_model=dict)
def list_profiles() -> dict:
    """Return every saved LLM profile under the ``profiles`` key."""
    serialized = []
    for profile in profile_manager.list_all():
        serialized.append(profile.model_dump())
    return {"profiles": serialized}
@router.post("", status_code=201, response_model=LLMProfile)
def create_profile(request: CreateProfileRequest) -> LLMProfile:
    """Create a new LLM profile from the request body and return it."""
    fields = {
        "name": request.name,
        "model": request.model,
        "base_url": request.base_url,
        "api_key": request.api_key,
        "timeout_seconds": request.timeout_seconds,
    }
    return profile_manager.create(**fields)
@router.put("/{profile_id}", response_model=LLMProfile)
def update_profile(profile_id: str, request: CreateProfileRequest) -> LLMProfile:
    """Overwrite the editable fields of an existing profile.

    Raises:
        HTTPException: 404 when no profile has the given id.
    """
    fields = {
        "name": request.name,
        "model": request.model,
        "base_url": request.base_url,
        "api_key": request.api_key,
        "timeout_seconds": request.timeout_seconds,
    }
    result = profile_manager.update(profile_id=profile_id, **fields)
    if result is not None:
        return result
    raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
@router.delete("/{profile_id}", response_model=dict)
def delete_profile(profile_id: str) -> dict:
    """Delete one LLM profile.

    Raises:
        HTTPException: 404 when no profile has the given id.
    """
    if profile_manager.delete(profile_id):
        return {"deleted": True}
    raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
@router.post("/apply", response_model=ProfileApplyResponse)
def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
    """Patch selected LLM profiles into the target scenario YAML file.

    Each role (judge / answer / dataset) is optional; a role without a profile
    id is passed to the patcher as ``None``.

    Raises:
        HTTPException: 400 when a requested profile id does not exist;
            404 when the scenario file cannot be found.
    """
    requested: dict[str, str | None] = {
        "judge": request.judge_profile_id,
        "answer": request.answer_profile_id,
        "dataset": request.dataset_profile_id,
    }
    role_profiles: dict[str, LLMProfile | None] = {
        role: (profile_manager.get(pid) if pid else None)
        for role, pid in requested.items()
    }
    # A role is "missing" only when an id was supplied but nothing matched it.
    missing = [
        role for role, pid in requested.items()
        if pid and role_profiles[role] is None
    ]
    if missing:
        raise HTTPException(
            status_code=400,
            detail=f"Profile(s) not found for roles: {', '.join(missing)}",
        )
    try:
        patched = apply_profiles_to_scenario(
            scenario_path=request.scenario_path,
            judge_profile=role_profiles["judge"],
            answer_profile=role_profiles["answer"],
            dataset_profile=role_profiles["dataset"],
        )
    except FileNotFoundError as exc:
        # Without this the missing file surfaces as an opaque HTTP 500.
        raise HTTPException(
            status_code=404,
            detail=f"Scenario file not found: {request.scenario_path}",
        ) from exc
    return ProfileApplyResponse(
        scenario_path=request.scenario_path,
        patched_fields=patched,
    )

View File

@@ -2,11 +2,16 @@
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from pydantic import BaseModel, Field
def _utcnow_iso() -> str:
return datetime.now(timezone.utc).isoformat()
class RunSummary(BaseModel):
"""Compact description of a single evaluation run for list views."""
@@ -68,6 +73,7 @@ class ReportData(BaseModel):
groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
lowest_samples: list[SampleScore] = Field(default_factory=list)
summary_markdown: str = ""
advice_markdown: str = "" # optimization_advice.md content (empty if not generated)
class RunDetail(BaseModel):
@@ -114,6 +120,45 @@ class TriggerEvaluationResponse(BaseModel):
task_id: str
class LLMProfile(BaseModel):
    """A named LLM connection configuration that can be reused across tasks."""

    # Identifier used to look the profile up.
    profile_id: str
    # Label for the profile.
    name: str
    # Model name.
    model: str
    # Base URL of the LLM endpoint.
    base_url: str
    # API key, held as a plain string.
    # NOTE(review): model_dump() includes this value verbatim — confirm it is
    # acceptable wherever profiles are serialized.
    api_key: str
    # Timeout in seconds; 30 when not given.
    timeout_seconds: int = 30
    # Both timestamps default to the current UTC time in ISO-8601 form.
    created_at: str = Field(default_factory=_utcnow_iso)
    updated_at: str = Field(default_factory=_utcnow_iso)
class CreateProfileRequest(BaseModel):
    """Request body for creating or updating an LLM profile."""

    # Same fields as LLMProfile minus profile_id and the two timestamps.
    name: str
    model: str
    base_url: str
    api_key: str
    # Timeout in seconds; 30 when the body omits it.
    timeout_seconds: int = 30
class ProfileApplyRequest(BaseModel):
    """Request body to patch LLM profile selections into a scenario YAML."""

    # Path of the scenario YAML file to patch.
    scenario_path: str
    # One optional profile id per role; None means no profile for that role.
    judge_profile_id: str | None = None
    answer_profile_id: str | None = None
    dataset_profile_id: str | None = None
class ProfileApplyResponse(BaseModel):
    """Response after patching a scenario YAML with profile settings."""

    # Path of the scenario file the request targeted.
    scenario_path: str
    # Names of the fields that were patched; empty list by default.
    patched_fields: list[str] = Field(default_factory=list)
def jsonable(value: Any) -> Any:
"""Convert NaN/inf floats into None so the payload stays valid JSON."""
import math

View File

@@ -13,7 +13,7 @@ from fastapi import FastAPI
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from webapp.api import evaluations, runs, scenarios
from webapp.api import evaluations, llm_profiles, runs, scenarios
STATIC_DIR = Path(__file__).resolve().parent / "static"
@@ -29,6 +29,7 @@ def create_app() -> FastAPI:
app.include_router(runs.router)
app.include_router(scenarios.router)
app.include_router(evaluations.router)
app.include_router(llm_profiles.router)
@app.get("/api/health", tags=["meta"])
def health() -> dict[str, str]:

View File

@@ -0,0 +1,137 @@
"""In-memory + JSON-file LLM profile manager.
Profiles are kept in a dict keyed by profile_id and written to a JSON file
on every mutation, so they survive server restarts. The pattern mirrors
TaskManager but without threading concerns beyond a simple lock (profiles
are only mutated by API calls in FastAPI request handlers).
"""
from __future__ import annotations
import json
import threading
import uuid
from datetime import datetime, timezone
from pathlib import Path
from webapp.models import LLMProfile
_DEFAULT_STORE = Path(__file__).resolve().parents[2] / "configs" / "llm_profiles.json"
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
class ProfileManager:
    """Manages LLM profiles with in-memory cache and JSON file persistence.

    All reads and writes of the in-memory dict happen under ``self._lock``.
    Every mutation rewrites the whole store file before the lock is released.
    """

    def __init__(self, store_path: Path = _DEFAULT_STORE) -> None:
        """Create a manager bound to *store_path* and load any existing profiles."""
        self._store_path = store_path
        self._lock = threading.Lock()
        self._profiles: dict[str, LLMProfile] = {}
        self._load()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def list_all(self) -> list[LLMProfile]:
        """Return all profiles sorted by creation time."""
        with self._lock:
            return sorted(self._profiles.values(), key=lambda p: p.created_at)

    def get(self, profile_id: str) -> LLMProfile | None:
        """Return one profile by id, or None if not found."""
        with self._lock:
            return self._profiles.get(profile_id)

    def create(
        self,
        name: str,
        model: str,
        base_url: str,
        api_key: str,
        timeout_seconds: int = 30,
    ) -> LLMProfile:
        """Create and persist a new profile, returning it.

        The id is the first 12 hex characters of a random UUID; ``created_at``
        and ``updated_at`` are set to the same timestamp.
        """
        now = _now_iso()
        profile = LLMProfile(
            profile_id=uuid.uuid4().hex[:12],
            name=name,
            model=model,
            base_url=base_url,
            api_key=api_key,
            timeout_seconds=timeout_seconds,
            created_at=now,
            updated_at=now,
        )
        with self._lock:
            self._profiles[profile.profile_id] = profile
            self._persist()
        return profile

    def update(
        self,
        profile_id: str,
        name: str,
        model: str,
        base_url: str,
        api_key: str,
        timeout_seconds: int = 30,
    ) -> LLMProfile | None:
        """Replace the editable fields of a profile; returns None if not found.

        ``profile_id`` and ``created_at`` are kept; ``updated_at`` is refreshed.
        """
        with self._lock:
            existing = self._profiles.get(profile_id)
            if existing is None:
                return None
            updated = existing.model_copy(update={
                "name": name,
                "model": model,
                "base_url": base_url,
                "api_key": api_key,
                "timeout_seconds": timeout_seconds,
                "updated_at": _now_iso(),
            })
            self._profiles[profile_id] = updated
            self._persist()
        return updated

    def delete(self, profile_id: str) -> bool:
        """Remove a profile; returns True if deleted, False if not found."""
        with self._lock:
            if profile_id not in self._profiles:
                return False
            del self._profiles[profile_id]
            self._persist()
        return True

    # ------------------------------------------------------------------ #
    # Persistence helpers
    # ------------------------------------------------------------------ #

    def _load(self) -> None:
        """Load profiles from the JSON store file.

        A missing file leaves the manager empty. An unreadable or malformed
        file is logged and also leaves the manager empty. Entries are
        validated one by one, so a single bad entry is skipped (and logged)
        without discarding the valid entries that follow it.
        """
        if not self._store_path.exists():
            return
        import logging  # local import: only this best-effort loader logs

        log = logging.getLogger(__name__)
        try:
            data = json.loads(self._store_path.read_text(encoding="utf-8"))
            raw_profiles = data.get("profiles", [])
            if not isinstance(raw_profiles, list):
                raise TypeError("'profiles' must be a list")
        except Exception as exc:  # noqa: BLE001
            # Best effort: a corrupt store must not prevent start-up.
            log.warning("Ignoring unreadable LLM profile store %s: %s", self._store_path, exc)
            return
        for raw in raw_profiles:
            try:
                profile = LLMProfile.model_validate(raw)
            except Exception as exc:  # noqa: BLE001
                log.warning("Skipping invalid LLM profile entry in %s: %s", self._store_path, exc)
                continue
            self._profiles[profile.profile_id] = profile

    def _persist(self) -> None:
        """Write current profiles to the JSON store file (called under lock).

        The payload is written to a sibling temp file and then moved over the
        store, so an interrupted write cannot leave a truncated store behind.
        """
        self._store_path.parent.mkdir(parents=True, exist_ok=True)
        payload = {"profiles": [p.model_dump() for p in self._profiles.values()]}
        tmp_path = self._store_path.with_name(self._store_path.name + ".tmp")
        tmp_path.write_text(
            json.dumps(payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        tmp_path.replace(self._store_path)
# Module-level singleton shared by FastAPI routes.
# Constructed at import time with the default store path, so importing this
# module reads the store file if it exists.
profile_manager = ProfileManager()

View File

@@ -164,12 +164,14 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
"""Build the full aggregated report payload for one run directory."""
frame = run_reader.read_scores_frame(run_dir)
summary_markdown = run_reader.read_summary_markdown(run_dir)
advice_markdown = run_reader.read_advice_markdown(run_dir)
if frame.empty or not metrics:
return ReportData(
metrics=metrics,
metric_means={metric: None for metric in metrics},
summary_markdown=summary_markdown,
advice_markdown=advice_markdown,
)
distributions = {
@@ -185,4 +187,5 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
groupings=_groupings(frame, metrics),
lowest_samples=_lowest_samples(frame, metrics),
summary_markdown=summary_markdown,
advice_markdown=advice_markdown,
)

View File

@@ -220,3 +220,14 @@ def read_summary_markdown(run_dir: Path) -> str:
return summary_path.read_text(encoding="utf-8")
except OSError:
return ""
def read_advice_markdown(run_dir: Path) -> str:
    """Return the text of ``optimization_advice.md`` in *run_dir*.

    Yields an empty string when the file is absent, is not a regular file,
    or cannot be read because of an OS-level error.
    """
    candidate = run_dir / "optimization_advice.md"
    if candidate.is_file():
        try:
            return candidate.read_text(encoding="utf-8")
        except OSError:
            pass
    return ""

View File

@@ -0,0 +1,74 @@
"""Patch LLM profile settings into scenario YAML files in-place.
Only the fields that correspond to a provided (non-None) profile are touched.
All other fields and structure are preserved as much as PyYAML allows
(comments are lost on round-trip, which is an accepted trade-off).
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import yaml
from webapp.models import LLMProfile
def _repo_root() -> Path:
return Path(__file__).resolve().parents[2]
def _resolve_scenario_path(path_str: str) -> Path:
"""Resolve a scenario path; absolute paths are used as-is."""
candidate = Path(path_str)
if candidate.is_absolute():
return candidate
return (_repo_root() / candidate).resolve()
def apply_profiles_to_scenario(
    scenario_path: str,
    judge_profile: LLMProfile | None,
    answer_profile: LLMProfile | None,
    dataset_profile: LLMProfile | None,
    _resolve_absolute: bool = False,
) -> list[str]:
    """Patch the YAML file at *scenario_path* with the supplied profiles.

    Only each profile's ``model`` is written:

    * judge   -> ``judge_model``
    * answer  -> ``app_adapter.static_kwargs.model`` (only if ``app_adapter`` is a mapping)
    * dataset -> ``generation.model`` (only if ``generation`` is a mapping)

    The file is rewritten only when at least one field was patched, so a
    no-op call leaves the file (and its comments) untouched.

    Setting *_resolve_absolute=True* skips repo-root resolution (used in tests).

    NOTE(review): *scenario_path* may be caller-supplied and absolute paths are
    accepted as-is — consider restricting writes to the scenarios directory.

    Returns:
        A list of dotted field names that were actually patched.

    Raises:
        FileNotFoundError: the resolved scenario file does not exist.
        ValueError: a profile was supplied but the YAML document is not a mapping.
    """
    if _resolve_absolute:
        resolved = Path(scenario_path)
    else:
        resolved = _resolve_scenario_path(scenario_path)

    if not resolved.exists():
        raise FileNotFoundError(f"Scenario file not found: {resolved}")

    data = yaml.safe_load(resolved.read_text(encoding="utf-8")) or {}
    patched: list[str] = []

    if judge_profile is None and answer_profile is None and dataset_profile is None:
        return patched

    if not isinstance(data, dict):
        raise ValueError(f"Scenario file must contain a YAML mapping: {resolved}")

    if judge_profile is not None:
        data["judge_model"] = judge_profile.model
        patched.append("judge_model")

    if answer_profile is not None:
        adapter = data.get("app_adapter")
        if isinstance(adapter, dict):
            static_kwargs = adapter.setdefault("static_kwargs", {})
            static_kwargs["model"] = answer_profile.model
            patched.append("app_adapter.static_kwargs.model")

    if dataset_profile is not None:
        generation = data.get("generation")
        if isinstance(generation, dict):
            generation["model"] = dataset_profile.model
            patched.append("generation.model")

    # Skip the round-trip when nothing changed: dumping drops comments.
    if patched:
        resolved.write_text(
            yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False),
            encoding="utf-8",
        )
    return patched

View File

@@ -265,3 +265,69 @@ table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: t
.sidebar { width: 64px; }
.brand-sub, .nav-item span:not(.nav-ico), .sidebar-foot span:last-child { display: none; }
}
/* ---------- LLM profile management page ---------- */
/* Responsive card grid: as many 300px-minimum columns as fit. */
.profile-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(300px, 1fr)); gap: 16px; }
.profile-card {
  background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
  padding: 16px; box-shadow: var(--shadow);
}
.profile-card-head { display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px; }
.profile-card-name { font-size: 15px; font-weight: 600; }
.profile-card-actions { display: flex; gap: 6px; }
.profile-card-field { font-size: 12px; color: var(--slate); margin-top: 4px; }
.field-label { font-weight: 600; color: var(--ink); }
/* Create / edit form */
.profile-form { display: flex; flex-direction: column; gap: 12px; margin-top: 14px; max-width: 560px; }
.form-row { display: flex; flex-direction: column; gap: 4px; }
.form-label { font-size: 13px; font-weight: 600; }
/* Required-field marker */
.req { color: var(--bad); }
.form-input {
  border: 1px solid var(--line); border-radius: 6px; padding: 8px 10px;
  font-size: 13px; font-family: inherit; width: 100%;
}
.form-input:focus { outline: none; border-color: var(--petrol); }
.form-input-sm { max-width: 120px; }
.form-actions { display: flex; gap: 10px; align-items: center; margin-top: 4px; }
.form-error { font-size: 12px; color: var(--bad); }
/* Compact and destructive button variants */
.btn-sm { padding: 4px 10px; font-size: 12px; }
.btn-danger { color: var(--bad); border-color: var(--bad); }
.btn-danger:hover { background: #fee2e2; }
/* Selected-state run card: petrol border plus a translucent ring. */
.run-card.selected {
  border-color: var(--petrol);
  box-shadow: 0 0 0 2px rgba(0,153,153,0.25), var(--shadow);
}
/* ---------- LLM role assignment panel ---------- */
.llm-assignment-panel { border-left: 3px solid var(--petrol); }
.llm-role-rows { display: flex; flex-direction: column; gap: 10px; }
/* One row per role: fixed-width label followed by its select. */
.llm-role-row { display: flex; align-items: center; gap: 14px; }
.llm-role-label { font-size: 13px; font-weight: 600; min-width: 180px; color: var(--ink); }
.llm-role-select { min-width: 240px; }
/* ---------- (5) Optimization advice panel ---------- */
.advice-panel { border-left: 3px solid #7c3aed; }
.advice-header {
  display: flex; align-items: center; gap: 10px;
  margin-bottom: 14px;
}
.advice-badge {
  background: #7c3aed; color: #fff;
  font-size: 11px; font-weight: 700; letter-spacing: 0.5px;
  padding: 3px 8px; border-radius: 4px; text-transform: uppercase;
}
.advice-model { font-size: 12px; color: var(--slate); }
.advice-body { line-height: 1.7; color: var(--ink); }
/* Typography for HTML placed inside an .advice-md container. */
.advice-md h1 { font-size: 16px; font-weight: 700; margin: 16px 0 8px; color: var(--ink); }
.advice-md h2 {
  font-size: 14px; font-weight: 700; margin: 20px 0 8px;
  padding-bottom: 4px; border-bottom: 1px solid var(--line); color: var(--ink-soft);
}
.advice-md h3 { font-size: 13px; font-weight: 600; margin: 12px 0 6px; color: var(--slate); }
.advice-md hr { border: none; border-top: 1px solid var(--line); margin: 14px 0; }
.advice-md ul { padding-left: 20px; margin: 6px 0; }
.advice-md li { margin: 3px 0; font-size: 13px; }
.advice-md strong { color: var(--ink); font-weight: 600; }

View File

@@ -22,9 +22,12 @@
<button class="nav-item" data-view="new">
<span class="nav-ico"></span><span>新建评估</span>
</button>
<button class="nav-item" data-view="report" data-requires-run="1">
<button class="nav-item" data-view="report" data-requires-run="1" disabled>
<span class="nav-ico"></span><span>报告详情</span>
</button>
<button class="nav-item" data-view="profiles">
<span class="nav-ico"></span><span>LLM 配置</span>
</button>
</nav>
<div class="sidebar-foot">
<span class="dot" id="health-dot"></span>
@@ -59,6 +62,33 @@
<span class="selected-scenario muted" id="selected-scenario">未选择场景</span>
</div>
</div>
<!-- LLM role assignment panel (hidden by default; intended to show once a scenario is selected) -->
<div class="panel llm-assignment-panel" id="llm-assignment-panel" hidden>
  <h2>LLM 角色配置 <span class="muted" style="font-size:13px;font-weight:400">(可选)</span></h2>
  <p class="muted" style="margin-bottom:14px">为不同任务角色选择已保存的 LLM 配置,留空则使用场景文件中的原始配置。</p>
  <div class="llm-role-rows">
    <!-- Each label is tied to its select via for/id so clicking the label
         focuses the control and assistive technology announces the pair. -->
    <div class="llm-role-row">
      <label class="llm-role-label" for="role-judge">评测打分 Judge LLM</label>
      <select class="select llm-role-select" id="role-judge">
        <option value="">— 使用场景原始配置 —</option>
      </select>
    </div>
    <div class="llm-role-row">
      <label class="llm-role-label" for="role-answer">生成答案 Answer LLM</label>
      <select class="select llm-role-select" id="role-answer">
        <option value="">— 使用场景原始配置 —</option>
      </select>
    </div>
    <div class="llm-role-row">
      <label class="llm-role-label" for="role-dataset">生成题库 Dataset LLM</label>
      <select class="select llm-role-select" id="role-dataset">
        <option value="">— 使用场景原始配置 —</option>
      </select>
    </div>
  </div>
</div>
<div class="panel" id="task-panel" hidden>
<div class="task-head">
<h2>评估进度</h2>
@@ -105,6 +135,68 @@
<!-- ④ 最低分样本逐条复核 -->
<div class="section-label">④ 最低分样本(点击展开逐条复核)</div>
<div class="lowest-table" id="lowest-table"></div>
<!-- (5) Optimization advice — hidden by default; presumably revealed by the
     report script when the run has advice content (TODO confirm in report.js) -->
<div id="advice-section" hidden>
  <div class="section-label">⑤ 优化建议 OPTIMIZATION ADVICE</div>
  <div class="panel advice-panel">
    <div class="advice-header">
      <span class="advice-badge">AI 诊断报告</span>
      <span class="advice-model" id="advice-model-label"></span>
    </div>
    <div class="advice-body" id="advice-body"></div>
  </div>
</div>
</div>
</section>
<!-- LLM profiles view -->
<section class="view" id="view-profiles" hidden>
  <div class="panel">
    <div class="panel-head">
      <h2>LLM 配置管理</h2>
      <button class="btn btn-primary" id="add-profile-btn"> 新建配置</button>
    </div>
    <p class="muted">保存常用 LLM 连接参数,在运行评估时按角色选择。</p>
  </div>
  <!-- Create / edit form (hidden by default). Each label is tied to its input
       via for/id so clicking the label focuses the field and assistive
       technology announces the pair. -->
  <div class="panel" id="profile-form-panel" hidden>
    <h2 id="profile-form-title">新建 LLM 配置</h2>
    <div class="profile-form">
      <input type="hidden" id="edit-profile-id" />
      <div class="form-row">
        <label class="form-label" for="pf-name">配置名称 <span class="req">*</span></label>
        <input class="form-input" id="pf-name" placeholder="例DeepSeek Flash内网" />
      </div>
      <div class="form-row">
        <label class="form-label" for="pf-model">模型名称 <span class="req">*</span></label>
        <input class="form-input" id="pf-model" placeholder="例deepseek-v4-flash" />
      </div>
      <div class="form-row">
        <label class="form-label" for="pf-base-url">Base URL <span class="req">*</span></label>
        <input class="form-input" id="pf-base-url" placeholder="例http://6.86.80.4:30080/v1" />
      </div>
      <div class="form-row">
        <label class="form-label" for="pf-api-key">API Key <span class="req">*</span></label>
        <input class="form-input" id="pf-api-key" type="password" placeholder="sk-…" />
      </div>
      <div class="form-row">
        <label class="form-label" for="pf-timeout">超时(秒)</label>
        <input class="form-input form-input-sm" id="pf-timeout" type="number" value="30" min="5" max="300" />
      </div>
      <div class="form-actions">
        <button class="btn btn-primary" id="save-profile-btn">保存</button>
        <button class="btn" id="cancel-profile-btn">取消</button>
        <span class="form-error muted" id="profile-form-error"></span>
      </div>
    </div>
  </div>
  <!-- Card grid container, followed by the empty-state placeholder -->
  <div id="profile-cards" class="profile-grid"></div>
  <div class="empty" id="profiles-empty" hidden>
    <p>尚未添加任何 LLM 配置。</p>
    <p class="muted">点击「新建配置」添加第一个。</p>
  </div>
</section>
</main>
@@ -112,6 +204,7 @@
<script src="/static/js/api.js"></script>
<script src="/static/js/report.js"></script>
<script src="/static/js/profiles.js"></script>
<script src="/static/js/runner.js"></script>
<script src="/static/js/app.js"></script>
</body>

View File

@@ -43,4 +43,26 @@ const API = {
return API.post("/api/evaluations", { scenario_path: scenarioPath });
},
// GET /api/evaluations/{taskId}; the id is URL-encoded before being placed in the path.
taskStatus(taskId) { return API.get(`/api/evaluations/${encodeURIComponent(taskId)}`); },
// LLM Profile API
// GET /api/llm-profiles
profiles() { return API.get("/api/llm-profiles"); },
// POST /api/llm-profiles with `body` as the request payload
createProfile(body) { return API.post("/api/llm-profiles", body); },
updateProfile(id, body) {
return fetch(`/api/llm-profiles/${encodeURIComponent(id)}`, {
method: "PUT",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(body),
}).then(async r => {
if (!r.ok) { const d = await API._extractError(r); throw new Error(d); }
return r.json();
});
},
deleteProfile(id) {
return fetch(`/api/llm-profiles/${encodeURIComponent(id)}`, { method: "DELETE" })
.then(async r => {
if (!r.ok) { const d = await API._extractError(r); throw new Error(d); }
return r.json();
});
},
// POST /api/llm-profiles/apply with `body` as the request payload
applyProfiles(body) { return API.post("/api/llm-profiles/apply", body); },
};

View File

@@ -1,28 +1,59 @@
// app.js — 视图路由、运行列表渲染、健康检查。整个控制台的入口编排。
// 会话保持URL hash 路由(#runs / #new / #profiles / #report/{runId}
// + sessionStorage 兜底F5 刷新 / 浏览器前进后退均可恢复。
const App = {
currentRunId: null,
views: ["runs", "new", "report"],
titles: { runs: "运行列表", new: "新建评估", report: "报告详情" },
activeView: null,
views: ["runs", "new", "report", "profiles"],
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置" },
// 初始化:绑定导航、加载首屏、启动健康检查。
// 初始化:绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。
init() {
document.querySelectorAll(".nav-item").forEach((btn) => {
btn.addEventListener("click", () => App.switchView(btn.dataset.view));
btn.addEventListener("click", () => App.navigate(btn.dataset.view));
});
document.getElementById("refresh-btn").addEventListener("click", () => App.refreshCurrent());
Runner.init();
App.switchView("runs");
Profiles.init();
// 恢复上次会话(优先 URL hash其次 sessionStorage
App._restoreSession();
App.checkHealth();
setInterval(App.checkHealth, 15000);
// 浏览器前进 / 后退按钮
window.addEventListener("popstate", () => App._restoreSession());
},
// 切换主视图,并同步导航高亮与标题。
switchView(view) {
if (view === "report" && !App.currentRunId) {
// 没有选中的运行时,报告页显示占位。
// ----------------------------------------------------------------
// 路由 —— 有历史记录的主动导航(更新 URL hash
// ----------------------------------------------------------------
navigate(view, runId) {
if (runId !== undefined) App.currentRunId = runId;
const hash = App._buildHash(view, App.currentRunId);
if (location.hash !== `#${hash}`) {
history.pushState({ view, runId: App.currentRunId }, "", `#${hash}`);
}
App._doSwitch(view);
},
// Internal switch that does not create a history entry (e.g. re-showing the same view).
switchView(view) {
  App._doSwitch(view);
},
// 刷新当前视图数据
refreshCurrent() {
App._doSwitch(App.activeView || "runs");
},
// ----------------------------------------------------------------
// 内部:实际切换 DOM + 触发数据加载
// ----------------------------------------------------------------
// Performs the actual view switch: toggles view visibility, updates the
// heading, records the active view, persists the location and triggers the
// data load for the selected view.
// NOTE(review): part of this body is elided by the diff hunk header below.
_doSwitch(view) {
// Show only the element whose id is `view-${view}`; hide the other known views.
App.views.forEach((name) => {
const el = document.getElementById(`view-${name}`);
if (el) el.hidden = name !== view;
@@ -33,17 +64,53 @@ const App = {
// Fall back to the raw view name when no title is registered for it.
document.getElementById("view-title").textContent = App.titles[view] || view;
App.activeView = view;
// NOTE(review): these three loader calls are repeated further down; this
// looks like interleaved old/new diff lines — confirm which set is current.
if (view === "runs") App.loadRuns();
if (view === "new") Runner.loadScenarios();
if (view === "report") Report.render(App.currentRunId);
// Persist to sessionStorage (backup for the URL-sharing case).
sessionStorage.setItem("rag_view", view);
// The run id is only written when one is set; it is never cleared here.
if (App.currentRunId) sessionStorage.setItem("rag_run_id", App.currentRunId);
if (view === "runs") App.loadRuns();
if (view === "new") Runner.loadScenarios();
if (view === "report") Report.render(App.currentRunId);
if (view === "profiles") Profiles.load();
},
// 刷新当前视图的数据。
refreshCurrent() {
App.switchView(App.activeView || "runs");
// ----------------------------------------------------------------
// Hash 工具
// ----------------------------------------------------------------
_buildHash(view, runId) {
if (view === "report" && runId) {
return `report/${encodeURIComponent(runId)}`;
}
return view || "runs";
},
// 加载并渲染运行列表。
_parseHash() {
const raw = location.hash.replace(/^#\/?/, "");
if (!raw) return { view: null, runId: null };
if (raw.startsWith("report/")) {
const runId = decodeURIComponent(raw.slice("report/".length));
return { view: "report", runId };
}
const view = App.views.includes(raw) ? raw : null;
return { view, runId: null };
},
// 会话恢复:hash → sessionStorage → 默认 runs
_restoreSession() {
const { view: hView, runId: hRunId } = App._parseHash();
const view = hView || sessionStorage.getItem("rag_view") || "runs";
const runId = hRunId || sessionStorage.getItem("rag_run_id") || null;
if (runId) {
App.currentRunId = runId;
App.enableReportNav();
}
App._doSwitch(view);
},
// ----------------------------------------------------------------
// 运行列表
// ----------------------------------------------------------------
// Loads and renders the run list (per the file's own comment for this method).
// NOTE(review): everything between the element lookups and the closing brace
// is elided by the diff hunk header below, so the loading logic is not visible here.
async loadRuns() {
// Container for the run cards and the placeholder element for the empty state.
const container = document.getElementById("runs-container");
const empty = document.getElementById("runs-empty");
@@ -64,14 +131,16 @@ const App = {
}
},
// 构造一张运行卡片。
// Builds the card element for one run and returns it.
// NOTE(review): parts of this body are elided by the diff hunk headers below,
// and some adjacent statements look like interleaved old/new diff lines.
renderRunCard(run) {
const card = document.createElement("div");
card.className = "run-card";
// Mark the card as selected when it belongs to the currently selected run.
card.className = "run-card" + (run.run_id === App.currentRunId ? " selected" : "");
// Clicking a card selects its run and opens the report view.
card.addEventListener("click", () => {
App.currentRunId = run.run_id;
// Update the selection highlight: clear it on every card, then set it on this one.
document.querySelectorAll(".run-card").forEach((c) => c.classList.remove("selected"));
card.classList.add("selected");
App.enableReportNav();
// NOTE(review): switchView followed by navigate switches to the report twice;
// presumably only the navigate call is current — confirm.
App.switchView("report");
App.navigate("report", run.run_id);
});
// One chip per metric name listed on the run.
const chips = (run.metrics || [])
@@ -79,7 +148,7 @@ const App = {
// A metric without a recorded mean is shown as "n/a".
const val = run.metric_means ? run.metric_means[m] : null;
const cls = App.scoreClass(val);
const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
// NOTE(review): two consecutive returns — the second (with the title
// attribute carrying the full metric name) is unreachable as written.
return `<span class="metric-chip">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
return `<span class="metric-chip" title="${App.escape(m)}">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
})
.join("");
@@ -96,13 +165,14 @@ const App = {
return card;
},
// 启用报告导航项(选中运行后)。
// ----------------------------------------------------------------
// 工具方法
// ----------------------------------------------------------------
enableReportNav() {
const btn = document.querySelector('.nav-item[data-view="report"]');
if (btn) btn.disabled = false;
},
// 根据分值返回 good/warn/bad/na 配色类。
// Maps a score to a colour class name: good / warn / bad / na (per the file's
// own comment for this method).
// NOTE(review): the middle of the body is elided by the diff hunk header below;
// only the "na", "good" and "bad" returns are visible here.
scoreClass(value) {
// A missing score gets the "na" class.
if (value === null || value === undefined) return "na";
if (value >= 0.8) return "good";
@@ -110,40 +180,39 @@ const App = {
return "bad";
},
// 指标名缩写,节省卡片横向空间。
shortMetric(name) {
const map = {
faithfulness: "faith.",
answer_relevancy: "ans.rel.",
context_recall: "ctx.recall",
context_precision: "ctx.prec.",
faithfulness: "faith.",
answer_relevancy: "ans.rel.",
context_recall: "ctx.recall",
context_precision: "ctx.prec.",
noise_sensitivity: "noise.sens.",
factual_correctness: "fact.corr.",
semantic_similarity: "sem.sim.",
};
return map[name] || name;
},
// 截取时间戳到分钟,便于阅读。
shortTime(iso) {
if (!iso) return "—";
return String(iso).replace("T", " ").slice(0, 16);
},
// 简单 HTML 转义,防止注入。
escape(text) {
const div = document.createElement("div");
div.textContent = text == null ? "" : String(text);
return div.innerHTML;
},
// 健康检查,更新左下角状态点。
async checkHealth() {
const dot = document.getElementById("health-dot");
const dot = document.getElementById("health-dot");
const label = document.getElementById("health-text");
try {
await API.health();
dot.className = "dot ok";
dot.className = "dot ok";
label.textContent = "服务正常";
} catch (_e) {
dot.className = "dot bad";
dot.className = "dot bad";
label.textContent = "服务离线";
}
},

View File

@@ -0,0 +1,118 @@
// profiles.js — LLM 配置管理页面逻辑
const Profiles = {
  // Profiles returned by the most recent successful load().
  _data: [],

  // Binds the add / save / cancel buttons of the profile form.
  init() {
    document.getElementById("add-profile-btn").addEventListener("click", () => Profiles.showForm());
    document.getElementById("save-profile-btn").addEventListener("click", () => Profiles.save());
    document.getElementById("cancel-profile-btn").addEventListener("click", () => Profiles.hideForm());
  },

  // Fetches the profile list and renders one card per profile. Shows the
  // empty-state element when the list is empty and an inline error message
  // when the request fails.
  async load() {
    const grid = document.getElementById("profile-cards");
    const empty = document.getElementById("profiles-empty");
    grid.innerHTML = '<p class="muted">加载中…</p>';
    try {
      const data = await API.profiles();
      Profiles._data = data.profiles || [];
      grid.innerHTML = "";
      if (Profiles._data.length === 0) {
        empty.hidden = false;
      } else {
        empty.hidden = true;
        Profiles._data.forEach((p) => grid.appendChild(Profiles.renderCard(p)));
      }
    } catch (err) {
      grid.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
    }
  },

  // Builds and returns the card element for one profile, with its edit and
  // delete buttons wired up.
  renderCard(p) {
    const card = document.createElement("div");
    card.className = "profile-card";
    card.dataset.id = p.profile_id;
    // Every interpolated field is escaped, including timeout_seconds: the
    // value comes from the API response and is written through innerHTML.
    card.innerHTML = `
      <div class="profile-card-head">
        <div class="profile-card-name">${App.escape(p.name)}</div>
        <div class="profile-card-actions">
          <button class="btn btn-sm" data-action="edit">编辑</button>
          <button class="btn btn-sm btn-danger" data-action="delete">删除</button>
        </div>
      </div>
      <div class="profile-card-field"><span class="field-label">模型</span> <code>${App.escape(p.model)}</code></div>
      <div class="profile-card-field"><span class="field-label">Base URL</span> <code>${App.escape(p.base_url)}</code></div>
      <div class="profile-card-field"><span class="field-label">超时</span> ${App.escape(p.timeout_seconds)}s</div>
    `;
    card.querySelector("[data-action=edit]").addEventListener("click", () => Profiles.showForm(p));
    card.querySelector("[data-action=delete]").addEventListener("click", () => Profiles.remove(p.profile_id, p.name));
    return card;
  },

  // Opens the form panel. With a profile the fields are pre-filled for
  // editing; without one the form is blank and the timeout defaults to 30.
  showForm(profile = null) {
    const panel = document.getElementById("profile-form-panel");
    const title = document.getElementById("profile-form-title");
    panel.hidden = false;
    title.textContent = profile ? "编辑 LLM 配置" : "新建 LLM 配置";
    // An empty hidden id means "create"; save() checks it to pick the API call.
    document.getElementById("edit-profile-id").value = profile ? profile.profile_id : "";
    document.getElementById("pf-name").value = profile ? profile.name : "";
    document.getElementById("pf-model").value = profile ? profile.model : "";
    document.getElementById("pf-base-url").value = profile ? profile.base_url : "";
    document.getElementById("pf-api-key").value = profile ? profile.api_key : "";
    document.getElementById("pf-timeout").value = profile ? profile.timeout_seconds : 30;
    document.getElementById("profile-form-error").textContent = "";
    panel.scrollIntoView({ behavior: "smooth", block: "start" });
  },

  // Hides the form panel.
  hideForm() {
    document.getElementById("profile-form-panel").hidden = true;
  },

  // Validates the form and creates or updates a profile, then reloads the
  // list. Validation and request errors are shown in the form's error element.
  async save() {
    const id = document.getElementById("edit-profile-id").value;
    const body = {
      name: document.getElementById("pf-name").value.trim(),
      model: document.getElementById("pf-model").value.trim(),
      base_url: document.getElementById("pf-base-url").value.trim(),
      api_key: document.getElementById("pf-api-key").value.trim(),
      // An unparsable or zero timeout falls back to 30.
      timeout_seconds: parseInt(document.getElementById("pf-timeout").value, 10) || 30,
    };
    const errEl = document.getElementById("profile-form-error");
    if (!body.name || !body.model || !body.base_url || !body.api_key) {
      errEl.textContent = "请填写所有必填字段名称、模型、Base URL、API Key";
      return;
    }
    try {
      if (id) {
        await API.updateProfile(id, body);
      } else {
        await API.createProfile(body);
      }
      Profiles.hideForm();
      await Profiles.load();
    } catch (err) {
      errEl.textContent = `保存失败:${err.message}`;
    }
  },

  // Deletes a profile after user confirmation and reloads the list.
  async remove(profileId, name) {
    if (!confirm(`确认删除配置「${name}」?`)) return;
    try {
      await API.deleteProfile(profileId);
      await Profiles.load();
    } catch (err) {
      alert(`删除失败:${err.message}`);
    }
  },

  // Returns the profiles cached by the last load() (used by runner.js).
  getAll() {
    return Profiles._data;
  },
};

View File

@@ -26,6 +26,7 @@ const Report = {
Report.renderDistribution(detail.report);
Report.renderGroupings(detail.report);
Report.renderLowest(detail.report);
Report.renderAdvice(detail.summary, detail.report);
content.style.opacity = "1";
} catch (err) {
empty.hidden = false;
@@ -186,8 +187,7 @@ const Report = {
},
// ④ 最低分样本逐条复核表(点击展开)。
renderLowest(report) {
const wrap = document.getElementById("lowest-table");
renderLowest(report) { const wrap = document.getElementById("lowest-table");
const samples = report.lowest_samples || [];
wrap.innerHTML = "";
if (samples.length === 0) {
@@ -255,4 +255,35 @@ const Report = {
</div>
`;
},
// ⑤ 优化建议(仅 optimization_advice.md 存在时渲染)。
renderAdvice(summary, report) {
const section = document.getElementById("advice-section");
const body = document.getElementById("advice-body");
const modelLabel = document.getElementById("advice-model-label");
const md = report.advice_markdown || "";
if (!md.trim()) {
section.hidden = true;
return;
}
section.hidden = false;
modelLabel.textContent = summary.judge_model ? `judge: ${summary.judge_model}` : "";
// 简单 Markdown → HTML 转换(标题、列表、分隔线、粗体)
const escaped = md
.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
const html = escaped
.replace(/^#{3}\s+(.+)$/gm, "<h3>$1</h3>")
.replace(/^#{2}\s+(.+)$/gm, "<h2>$1</h2>")
.replace(/^#{1}\s+(.+)$/gm, "<h1>$1</h1>")
.replace(/^---+$/gm, "<hr>")
.replace(/\*\*(.+?)\*\*/g, "<strong>$1</strong>")
.replace(/^- (.+)$/gm, "<li>$1</li>")
.replace(/(<li>[^]*?<\/li>\n?)+/g, (m) => `<ul>${m}</ul>`)
.replace(/\n\n+/g, "\n<br>\n");
body.innerHTML = `<div class="advice-md">${html}</div>`;
},
};

View File

@@ -1,17 +1,17 @@
// runner.js — 新建评估视图:列出场景、触发评估、轮询任务状态与日志。
// runner.js — 新建评估视图:列出场景、LLM角色配置、触发评估、轮询任务状态与日志。
const Runner = {
selectedScenario: null,
pollTimer: null,
lastRunId: null,
// 绑定运行按钮。
init() {
document.getElementById("run-btn").addEventListener("click", () => Runner.trigger());
document.getElementById("view-report-btn").addEventListener("click", () => {
if (Runner.lastRunId) {
App.currentRunId = Runner.lastRunId;
App.enableReportNav();
App.switchView("report");
App.navigate("report", Runner.lastRunId);
}
});
},
@@ -32,6 +32,27 @@ const Runner = {
} catch (err) {
list.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
}
// 同时加载 profiles 供角色选择
Runner._populateProfileSelects();
},
// 填充三个角色下拉框
async _populateProfileSelects() {
const cached = Profiles.getAll();
const profiles = cached.length > 0
? cached
: (await API.profiles().catch(() => ({ profiles: [] }))).profiles;
["role-judge", "role-answer", "role-dataset"].forEach(id => {
const sel = document.getElementById(id);
sel.innerHTML = '<option value="">— 使用场景原始配置 —</option>';
profiles.forEach(p => {
const opt = document.createElement("option");
opt.value = p.profile_id;
opt.textContent = `${p.name} (${p.model})`;
sel.appendChild(opt);
});
});
},
// 构造单个场景条目。
@@ -64,12 +85,14 @@ const Runner = {
Runner.selectedScenario = sc.path;
document.getElementById("selected-scenario").textContent = sc.path;
document.getElementById("run-btn").disabled = false;
// 显示 LLM 角色面板
document.getElementById("llm-assignment-panel").hidden = false;
});
}
return item;
},
// 触发评估并开始轮询
// 触发评估:先 apply profiles若选了再触发任务
// Triggers an evaluation for the selected scenario: applies any selected LLM
// profiles first, then starts the task and begins polling it.
// NOTE(review): part of this body is elided by the diff hunk header below
// (statusBadge and logBox are defined in the elided part).
async trigger() {
// Nothing to do until a scenario has been selected.
if (!Runner.selectedScenario) return;
const runBtn = document.getElementById("run-btn");
@@ -85,15 +108,41 @@ const Runner = {
Runner._setStatus(statusBadge, "queued");
try {
// Step 1: apply LLM profiles to YAML if any selected
await Runner._applyProfilesIfNeeded(logBox);
// Step 2: trigger evaluation
const resp = await API.triggerEvaluation(Runner.selectedScenario);
Runner.poll(resp.task_id);
} catch (err) {
Runner._setStatus(statusBadge, "failed");
// NOTE(review): the next two assignments look like interleaved old/new diff
// lines; executed together they write the failure message twice — confirm.
logBox.textContent = `触发失败:${err.message}`;
logBox.textContent = (logBox.textContent ? logBox.textContent + "\n" : "") + `触发失败:${err.message}`;
// Re-enable the run button after a failure.
runBtn.disabled = false;
}
},
},
// 如果用户选了 profile就先 apply 写回 YAML
async _applyProfilesIfNeeded(logBox) {
const judgeId = document.getElementById("role-judge").value;
const answerId = document.getElementById("role-answer").value;
const datasetId = document.getElementById("role-dataset").value;
if (!judgeId && !answerId && !datasetId) return; // 全空,跳过
logBox.textContent = "正在将 LLM 配置写入场景文件…\n";
const body = {
scenario_path: Runner.selectedScenario,
judge_profile_id: judgeId || null,
answer_profile_id: answerId || null,
dataset_profile_id: datasetId || null,
};
const result = await API.applyProfiles(body);
const fields = (result.patched_fields || []).join(", ");
logBox.textContent += fields
? `✓ 已更新字段:${fields}\n`
: "(未找到可更新的字段,继续运行)\n";
},
// 周期性轮询任务状态,刷新日志与徽标。
poll(taskId) {
const logBox = document.getElementById("task-log");
@@ -114,6 +163,7 @@ const Runner = {
runBtn.disabled = false;
if (status.status === "completed" && status.run_id) {
Runner.lastRunId = status.run_id;
sessionStorage.setItem("rag_run_id", status.run_id);
reportBtn.hidden = false;
}
}