Compare commits
15 Commits
1ff4a3943a
...
24956bbf75
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
24956bbf75 | ||
|
|
ca01e44ad2 | ||
|
|
1a2cc534b8 | ||
|
|
91c0dab4f9 | ||
|
|
f5c2dce64a | ||
|
|
d68399d39b | ||
|
|
719c3b4ca4 | ||
|
|
5b60ed12ea | ||
|
|
dc8baf8662 | ||
|
|
e329f59139 | ||
|
|
b19054bd66 | ||
|
|
5d09deb420 | ||
|
|
b98af29449 | ||
|
|
4173a40d93 | ||
|
|
629304aa6d |
17
.env.example
17
.env.example
@@ -1,11 +1,22 @@
|
||||
# ===== LLM 连接配置(RAGAS 评测 + 生成) =====
|
||||
# 所有模型共用同一个 OpenAI 兼容 endpoint
|
||||
# 在 Web 控制台的「LLM 配置」页面可以保存多个命名配置,
|
||||
# 并在运行评估时按角色(Judge / Answer / Dataset)分别选择覆盖。
|
||||
|
||||
OPENAI_API_KEY=your-api-key
|
||||
OPENAI_BASE_URL=http://6.86.80.4:30080/v1
|
||||
OPENAI_TIMEOUT_SECONDS=180
|
||||
|
||||
# 默认评测模型(可在场景 YAML 或 Web 控制台 LLM 配置中覆盖)
|
||||
RAGAS_JUDGE_MODEL=deepseek-v4-flash
|
||||
RAGAS_EMBEDDING_MODEL=text-embedding-v3
|
||||
|
||||
# 评估并发控制(启用 7 个指标时建议 RAGAS_METRIC_TIMEOUT_SECONDS=300)
|
||||
BATCH_SIZE=8
|
||||
RAGAS_METRIC_TIMEOUT_SECONDS=300
|
||||
|
||||
|
||||
# ===== 阿里云文档解析 =====
|
||||
# ===== 阿里云文档解析(dataset build 功能需要) =====
|
||||
ALIBABA_ACCESS_KEY_ID=
|
||||
ALIBABA_ACCESS_KEY_SECRET=
|
||||
ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
|
||||
@@ -14,6 +25,8 @@ ALIYUN_PARSE_TIMEOUT_SECONDS=900
|
||||
ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
|
||||
ALIYUN_LLM_ENHANCEMENT=true
|
||||
ALIYUN_ENHANCEMENT_MODE=VLM
|
||||
DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
|
||||
DOCUMENT_PARSE_ARTIFACT_PREFIX=outputs/dataset-builds
|
||||
PARSER_FAILURE_MODE=fail
|
||||
|
||||
# 生成题库时使用的模型(可在 Web 控制台 LLM 配置中按场景覆盖)
|
||||
DATASET_GENERATOR_MODEL=qwen3.6-plus
|
||||
|
||||
64
configs/llm_profiles.json
Normal file
64
configs/llm_profiles.json
Normal file
@@ -0,0 +1,64 @@
|
||||
{
|
||||
"profiles": [
|
||||
{
|
||||
"profile_id": "c8e185a64fa0",
|
||||
"name": "glm-5",
|
||||
"model": "glm-5",
|
||||
"base_url": "http://6.86.80.4:30080/v1",
|
||||
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||
"timeout_seconds": 600,
|
||||
"created_at": "2026-06-16T09:16:22.438297+00:00",
|
||||
"updated_at": "2026-06-16T09:19:03.089865+00:00"
|
||||
},
|
||||
{
|
||||
"profile_id": "54ddfe5aeb46",
|
||||
"name": "deepseek-v4-pro",
|
||||
"model": "deepseek-v4-pro",
|
||||
"base_url": "http://6.86.80.4:30080/v1",
|
||||
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||
"timeout_seconds": 600,
|
||||
"created_at": "2026-06-16T09:17:08.473904+00:00",
|
||||
"updated_at": "2026-06-16T09:19:07.504082+00:00"
|
||||
},
|
||||
{
|
||||
"profile_id": "25d035eef194",
|
||||
"name": "qwen3.5-flash",
|
||||
"model": "qwen3.5-flash",
|
||||
"base_url": "http://6.86.80.4:30080/v1",
|
||||
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||
"timeout_seconds": 600,
|
||||
"created_at": "2026-06-16T09:18:24.265619+00:00",
|
||||
"updated_at": "2026-06-16T09:18:24.265619+00:00"
|
||||
},
|
||||
{
|
||||
"profile_id": "ff1d0f417a5d",
|
||||
"name": "deepseek-v4-flash",
|
||||
"model": "deepseek-v4-flash",
|
||||
"base_url": "http://6.86.80.4:30080/v1",
|
||||
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||
"timeout_seconds": 600,
|
||||
"created_at": "2026-06-16T09:18:57.091549+00:00",
|
||||
"updated_at": "2026-06-16T09:18:57.091549+00:00"
|
||||
},
|
||||
{
|
||||
"profile_id": "5b04c49df9df",
|
||||
"name": "text-embedding-v4",
|
||||
"model": "text-embedding-v4",
|
||||
"base_url": "http://6.86.80.4:30080/v1",
|
||||
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||
"timeout_seconds": 600,
|
||||
"created_at": "2026-06-16T09:19:49.104004+00:00",
|
||||
"updated_at": "2026-06-16T09:19:49.104004+00:00"
|
||||
},
|
||||
{
|
||||
"profile_id": "b4f7c82859d5",
|
||||
"name": "text-embedding-v3",
|
||||
"model": "text-embedding-v3",
|
||||
"base_url": "http://6.86.80.4:30080/v1",
|
||||
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||
"timeout_seconds": 600,
|
||||
"created_at": "2026-06-16T09:20:18.266540+00:00",
|
||||
"updated_at": "2026-06-16T09:20:18.266540+00:00"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -318,6 +318,10 @@ metrics:
|
||||
- answer_relevancy
|
||||
- context_recall
|
||||
- context_precision
|
||||
# 可选:鲁棒性 / 端到端指标(需数据集含 ground_truth),完整列表见 §9.4
|
||||
# - noise_sensitivity
|
||||
# - factual_correctness
|
||||
# - semantic_similarity
|
||||
output_dir: runs/legal-assistant-offline-baseline
|
||||
runtime:
|
||||
batch_size: 4
|
||||
@@ -338,7 +342,7 @@ runtime:
|
||||
- `embedding_model`
|
||||
- 负责向量相关指标的模型
|
||||
- `metrics`
|
||||
- 本次启用的指标列表
|
||||
- 本次启用的指标列表(完整可选项与依赖见 §9.4)
|
||||
- `output_dir`
|
||||
- 本次运行结果输出目录
|
||||
- `runtime.batch_size`
|
||||
@@ -399,6 +403,32 @@ app_adapter:
|
||||
- embedding model
|
||||
- 指标实例
|
||||
|
||||
当前支持的指标(`rag_eval/metrics/registry.py` 中的 `SUPPORTED_METRICS`):
|
||||
|
||||
| 指标名 | 层面 | 依赖 |
|
||||
|---|---|---|
|
||||
| `faithfulness` | 生成 | judge model |
|
||||
| `answer_relevancy` | 生成 | judge model + embedding |
|
||||
| `context_recall` | 检索 | judge model + ground_truth |
|
||||
| `context_precision` | 检索 | judge model + ground_truth |
|
||||
| `noise_sensitivity` | 鲁棒性 | judge model + ground_truth |
|
||||
| `factual_correctness` | 端到端 | judge model + ground_truth |
|
||||
| `semantic_similarity` | 端到端 | embedding + ground_truth(无 LLM 调用) |
|
||||
|
||||
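后五项(`context_recall`、`context_precision`、`noise_sensitivity`、`factual_correctness`、`semantic_similarity`)以 `ground_truth`(标准答案)为参照,数据集必须提供该字段。新增指标统一在 `registry.py` / `factory.py` / `pipeline.py` 三处对齐装配。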
|
||||
|
||||
**Optimization Advisor(§11 优化策略落地):**
|
||||
|
||||
评测结束后,若场景配置 `optimization_advisor: true`,则自动调用 `rag_eval/advisor/` 模块:
|
||||
- 规则引擎(`rules.py`)对 7 个指标各自设阈值,识别触发项并选取 top-3 低分样本
|
||||
- LLM 分析器(`llm_analyzer.py`)结合低分样本生成中文 Markdown 优化建议(复用 judge_model,失败自动降级为纯规则报告)
|
||||
- 写出层(`writer.py`)输出 `optimization_advice.md` 并打日志摘要
|
||||
|
||||
```yaml
|
||||
# 场景配置示例
|
||||
optimization_advisor: true
|
||||
```
|
||||
|
||||
### 9.5 并发控制
|
||||
|
||||
执行层负责并发上限,不把并发策略散落到各指标实现中。
|
||||
|
||||
@@ -316,11 +316,21 @@ adapter 层的目标是:**把不同类型的目标应用,统一成同一套
|
||||
|
||||
当前支持的指标包括:
|
||||
|
||||
核心检索 / 生成指标(始终可用):
|
||||
|
||||
- `faithfulness`
|
||||
- `answer_relevancy`
|
||||
- `context_recall`
|
||||
- `context_precision`
|
||||
|
||||
鲁棒性 / 端到端指标(架构设计 §10.2,需数据集含 `ground_truth`):
|
||||
|
||||
- `noise_sensitivity` —— 鲁棒性:对检索噪声的敏感度
|
||||
- `factual_correctness` —— 端到端:回答相对标准答案的事实正确性
|
||||
- `semantic_similarity` —— 端到端:回答与标准答案的语义相似度(基于 embedding,无 LLM 调用)
|
||||
|
||||
所有指标都通过同一套装配点接入:`registry.py`(校验白名单)、`factory.py`(实例化)、`pipeline.py`(`ascore` 入参分发),新增指标只需在这三处对齐即可。
|
||||
|
||||
所以 metric pipeline 的职责可以总结为:
|
||||
|
||||
**把标准样本转换成结构化评分结果。**
|
||||
@@ -414,3 +424,39 @@ main.py
|
||||
- 可以把每次实验的资产稳定留住
|
||||
|
||||
这也是它和一次性离线脚本的根本区别。
|
||||
|
||||
---
|
||||
|
||||
## 15. Optimization Advisor 链路
|
||||
|
||||
相关代码:
|
||||
|
||||
- `rag_eval/advisor/__init__.py` — 外部入口 `run_advisor()`
|
||||
- `rag_eval/advisor/rules.py` — 规则引擎(纯函数,无 LLM),7 条指标诊断规则
|
||||
- `rag_eval/advisor/llm_analyzer.py` — LLM 分析器(复用 judge_model llm 实例,失败自动降级)
|
||||
- `rag_eval/advisor/writer.py` — 写出 `optimization_advice.md` + 日志摘要
|
||||
|
||||
Advisor 在 `write_run_artifacts()` 之后触发,仅当场景配置 `optimization_advisor: true` 时生效,默认关闭。
|
||||
|
||||
执行链路:
|
||||
|
||||
```text
|
||||
run_advisor(result, scenario, llm)
|
||||
-> rules.diagnose(score_rows, metrics) # 识别异常指标,选取 top-3 低分样本
|
||||
-> llm_analyzer.analyze(diagnoses, llm) # LLM 生成中文建议(失败自动降级为纯规则报告)
|
||||
-> writer.write_advice(...) # 写 optimization_advice.md + 日志摘要
|
||||
```
|
||||
|
||||
输出产物追加在现有 run 目录:
|
||||
|
||||
```text
|
||||
outputs/online/siemens-pdf-question-bank/<run_id>/
|
||||
scenario.snapshot.yaml
|
||||
scores.csv
|
||||
invalid.csv
|
||||
summary.md
|
||||
metadata.json
|
||||
optimization_advice.md <- 新增(optimization_advisor: true 时生成)
|
||||
```
|
||||
|
||||
规则引擎对 7 个指标各自设 warning / critical 双档阈值,`noise_sensitivity` 为"越低越好"(方向相反)。所有诊断均附带 top-3 低分样本,喂给 LLM 生成针对具体内容的中文建议。
|
||||
|
||||
1387
docs/superpowers/plans/2026-06-16-llm-profile-manager.md
Normal file
1387
docs/superpowers/plans/2026-06-16-llm-profile-manager.md
Normal file
File diff suppressed because it is too large
Load Diff
1378
docs/superpowers/plans/2026-06-16-optimization-advisor.md
Normal file
1378
docs/superpowers/plans/2026-06-16-optimization-advisor.md
Normal file
File diff suppressed because it is too large
Load Diff
225
docs/superpowers/specs/2026-06-16-optimization-advisor-design.md
Normal file
225
docs/superpowers/specs/2026-06-16-optimization-advisor-design.md
Normal file
@@ -0,0 +1,225 @@
|
||||
# 优化顾问模块设计 Spec
|
||||
|
||||
- 日期:2026-06-16
|
||||
- 状态:已确认,进入实现。
|
||||
|
||||
## 1. 目标
|
||||
|
||||
在现有 RAG 评测流程结束后,新增一个**优化顾问模块**(Optimization Advisor),根据本次评测的多项指标分数与低分样本,自动诊断指标偏低的原因并给出针对性的优化建议,输出为中文 Markdown 报告 + 日志摘要。
|
||||
|
||||
对应架构设计 §11(优化策略):将"指标到动作的映射"(§11.2)从文档形式落地为代码自动执行。
|
||||
|
||||
---
|
||||
|
||||
## 2. 决策摘要
|
||||
|
||||
| 决策点 | 选择 |
|
||||
|---|---|
|
||||
| 输出形式 | `optimization_advice.md`(文件)+ 控制台/日志摘要(双输出) |
|
||||
| 生成机制 | 规则引擎定位异常指标 → LLM 结合低分样本二次解读(两层) |
|
||||
| 触发方式 | YAML 场景文件显式声明 `optimization_advisor: true`,默认关闭 |
|
||||
| LLM 实例 | 复用 `build_models()` 已创建的 `llm` 实例,不重建 client |
|
||||
| 包位置 | `rag_eval/advisor/`(独立包,对外暴露 `run_advisor()` 单一入口) |
|
||||
|
||||
---
|
||||
|
||||
## 3. 架构
|
||||
|
||||
### 3.1 执行链路
|
||||
|
||||
```
|
||||
run_scenario()
|
||||
→ load_scenario() # 读 YAML,解析 optimization_advisor 字段
|
||||
→ build_models() # 已有:创建 llm, embeddings
|
||||
→ build_metric_pipeline() # 已有
|
||||
→ Evaluator.evaluate() # 已有:打分 → EvaluationResult
|
||||
→ write_run_artifacts() # 已有:scores.csv / summary.md / ...
|
||||
→ run_advisor( # 新增(3 行)
|
||||
result, scenario, llm
|
||||
)
|
||||
→ rules.diagnose(score_rows, metrics) # 规则引擎:返回 Diagnosis 列表
|
||||
→ llm_analyzer.analyze(diagnoses, llm, scenario_name) # LLM:生成中文 Markdown 建议
|
||||
→ writer.write_advice(...) # 写文件 + 打日志
|
||||
```
|
||||
|
||||
### 3.2 新增文件
|
||||
|
||||
```
|
||||
rag_eval/advisor/
|
||||
__init__.py ← 暴露 run_advisor(),外部唯一入口
|
||||
rules.py ← 纯函数规则引擎,无 LLM,可单独单测
|
||||
llm_analyzer.py ← 接收 llm 实例 + 诊断结构 → 中文 Markdown
|
||||
writer.py ← 写 optimization_advice.md,打日志摘要
|
||||
```
|
||||
|
||||
### 3.3 修改文件(最小改动)
|
||||
|
||||
| 文件 | 改动 |
|
||||
|---|---|
|
||||
| `rag_eval/shared/models.py` | `Scenario` 加 `optimization_advisor: bool = False` 字段 |
|
||||
| `rag_eval/config/schema.py` | `ScenarioModel` 加同名字段 + 透传到 `Scenario` |
|
||||
| `rag_eval/config/loader.py` | 透传 `optimization_advisor` 到 `Scenario` 构造 |
|
||||
| `rag_eval/reporting/artifacts.py` | `RunArtifactPaths` 加 `advice_md: Path` 字段 + `build_artifact_paths()` 加赋值 |
|
||||
| `rag_eval/execution/runner.py` | `run_scenario()` 末尾:`build_models` 返回 llm 传入,条件调用 `run_advisor()` |
|
||||
|
||||
### 3.4 输出产物
|
||||
|
||||
```
|
||||
outputs/online/siemens-pdf-question-bank/<run_id>/
|
||||
scenario.snapshot.yaml
|
||||
scores.csv
|
||||
invalid.csv
|
||||
summary.md
|
||||
metadata.json
|
||||
optimization_advice.md ← 新增(optimization_advisor: true 时生成)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. 规则引擎(rules.py)
|
||||
|
||||
### 4.1 数据结构
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class Diagnosis:
|
||||
metric: str # 指标名
|
||||
mean_score: float # 本次均值
|
||||
threshold: float # 警戒阈值
|
||||
severity: str # "warning" | "critical"
|
||||
root_causes: list[str] # 可能原因(来自架构设计 §11.2)
|
||||
suggested_actions: list[str] # 对应可调阶段
|
||||
low_samples: list[dict] # 分数最低的 N 条样本(含 question/answer/ground_truth)
|
||||
```
|
||||
|
||||
### 4.2 七条指标诊断规则
|
||||
|
||||
阈值参考 RAG 评测最佳实践,分 warning / critical 两档:
|
||||
|
||||
| 指标 | warning | critical | 根因方向 | 对应优化阶段(§11.2) |
|
||||
|---|---|---|---|---|
|
||||
| `faithfulness` | < 0.7 | < 0.5 | 生成未严格基于检索片段 / 幻觉 | 生成 prompt grounding、开启校验 |
|
||||
| `answer_relevancy` | < 0.7 | < 0.5 | 回答偏离问题 / 格式冗余 | 查询改写、生成 prompt 格式 |
|
||||
| `context_recall` | < 0.7 | < 0.5 | 检索遗漏关键信息 | 多查询、问题分解、Step-back、加大过召回 |
|
||||
| `context_precision` | < 0.6 | < 0.4 | 检索引入过多噪声 / 排序差 | 后检索重排、压缩、相关性过滤 |
|
||||
| `noise_sensitivity` | > 0.3 | > 0.5 | 回答被噪声片段干扰(越低越好) | 后检索相关性过滤、重排 |
|
||||
| `factual_correctness` | < 0.6 | < 0.4 | 回答事实与标准答案偏差大 | 检索与生成综合优化 |
|
||||
| `semantic_similarity` | < 0.7 | < 0.5 | 回答语义与标准答案差距大 | 生成 prompt、检索质量 |
|
||||
|
||||
> 注:`noise_sensitivity` 越低越好(0=完全不受噪声影响),其阈值方向与其余相反。
|
||||
|
||||
### 4.3 低分样本选取
|
||||
|
||||
每个触发诊断的指标,取该指标分数最低的 **top-3** 样本(排除 NaN)附入 `Diagnosis.low_samples`,字段包含 `sample_id / question / answer / ground_truth / <metric_score>`。
|
||||
|
||||
---
|
||||
|
||||
## 5. LLM 分析器(llm_analyzer.py)
|
||||
|
||||
### 5.1 输入
|
||||
|
||||
- `diagnoses: list[Diagnosis]` — 规则引擎输出(仅触发阈值的指标)
|
||||
- `llm` — 已有 RAGAS LLM 实例(scenario 的 judge_model)
|
||||
- `scenario_name: str` — 用于报告标题
|
||||
|
||||
### 5.2 Prompt 设计
|
||||
|
||||
使用**一次 LLM 调用**,把所有触发诊断的指标和低分样本一起发送:
|
||||
|
||||
```
|
||||
你是一个 RAG 系统优化专家,正在分析西门子医疗 CT 文档问答系统的评测结果。
|
||||
请用中文撰写一份优化建议报告,格式为 Markdown。
|
||||
|
||||
## 评测诊断摘要
|
||||
{for each diagnosis: 指标名、均值、阈值、可能原因、建议动作}
|
||||
|
||||
## 低分样本示例
|
||||
{for each diagnosis: top-3 低分样本的 question / answer / ground_truth}
|
||||
|
||||
## 要求
|
||||
1. 按指标分节(## 指标名),先解释"为什么低",再给出"具体怎么改"
|
||||
2. "具体怎么改"要结合低分样本的具体内容,而不只是泛泛建议
|
||||
3. 最后写一节 ## 优先优化次序,按性价比排序(参考:不增加调用次数的优先)
|
||||
4. 语言简洁,面向工程师,不要废话
|
||||
```
|
||||
|
||||
### 5.3 输出
|
||||
|
||||
LLM 返回的 Markdown 字符串,直接写入 `optimization_advice.md`(在报告头部追加运行元信息)。
|
||||
|
||||
### 5.4 失败降级
|
||||
|
||||
LLM 调用失败(超时/异常)时:降级为**纯规则报告**(只输出规则引擎的诊断结构,不含 LLM 解读),文件照常写出,错误信息写入报告末尾,不阻断整个评测流程。
|
||||
|
||||
---
|
||||
|
||||
## 6. 写出层(writer.py)
|
||||
|
||||
### 6.1 文件写出
|
||||
|
||||
`optimization_advice.md` 结构:
|
||||
|
||||
```markdown
|
||||
# 优化建议报告 — <scenario_name>
|
||||
|
||||
- run_id: `<run_id>`
|
||||
- 生成时间: `<timestamp>`
|
||||
- judge_model: `<model>`
|
||||
|
||||
---
|
||||
|
||||
<LLM 生成的 Markdown 正文>
|
||||
```
|
||||
|
||||
### 6.2 日志摘要
|
||||
|
||||
`run_advisor()` 完成后向 `logger.info` 打印一条精简摘要(单行,适合 `run_eval.bat` 结束后一眼扫到):
|
||||
|
||||
```
|
||||
[advisor] 触发诊断 3 项: faithfulness(0.42, critical) context_recall(0.58, warning) noise_sensitivity(0.41, critical)
|
||||
[advisor] 优化建议已写出: outputs/online/.../optimization_advice.md
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. YAML 配置
|
||||
|
||||
场景文件新增一个顶层字段:
|
||||
|
||||
```yaml
|
||||
optimization_advisor: true # 默认 false;true 时评测结束后自动生成优化建议
|
||||
```
|
||||
|
||||
后续若需精细配置(阈值覆盖、top-N 低分样本数),可扩展为:
|
||||
|
||||
```yaml
|
||||
optimization_advisor:
|
||||
enabled: true
|
||||
top_low_samples: 3 # 每个指标取几条低分样本(默认 3)
|
||||
# thresholds: # 可选:覆盖默认阈值
|
||||
# faithfulness: 0.65
|
||||
```
|
||||
|
||||
本轮实现仅支持 `optimization_advisor: true/false`,扩展接口预留但不实现。
|
||||
|
||||
---
|
||||
|
||||
## 8. 测试策略
|
||||
|
||||
| 测试 | 文件 | 说明 |
|
||||
|---|---|---|
|
||||
| 规则引擎单测 | `tests/test_advisor_rules.py` | 纯函数,无 LLM,覆盖每条规则的 warning/critical 触发、NaN 跳过、low_samples 选取 |
|
||||
| writer 单测 | `tests/test_advisor_writer.py` | mock Diagnosis 列表,验证 md 文件写出格式和日志输出 |
|
||||
| 集成(可选) | 现有 `tests/test_online_eval.py` | 验证 `optimization_advisor: true` 场景下 advice_md 存在 |
|
||||
|
||||
LLM 分析器不写单测(依赖网络),由集成场景覆盖。
|
||||
|
||||
---
|
||||
|
||||
## 9. 不覆盖(本轮边界)
|
||||
|
||||
- 不支持跨版本对比分析(只分析本次 run)
|
||||
- 不支持批量场景聚合建议
|
||||
- 不建设 Web UI 展示
|
||||
- LLM 分析器 prompt 本轮不做多语言适配(直接中文)
|
||||
- advisor 阈值本轮硬编码在 `rules.py`,不从 YAML 读取
|
||||
19
main.py
19
main.py
@@ -1,6 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from rag_eval.dataset_builder.runner import run_dataset_build
|
||||
from rag_eval.execution.runner import run_scenario
|
||||
@@ -18,18 +20,33 @@ def parse_args() -> argparse.Namespace:
|
||||
"--dataset-build-config",
|
||||
help="Path to a YAML dataset build config file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log-file",
|
||||
default=None,
|
||||
help="Write evaluation logs to this file (in addition to stderr). "
|
||||
"Example: logs/eval.log",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log-level",
|
||||
default="INFO",
|
||||
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||
help="Logging verbosity level (default: INFO). Use DEBUG for per-metric detail.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> None:
    """Dispatch the CLI call to the requested workflow.

    When ``--dataset-build-config`` is supplied, only the dataset build runs
    and the function returns. Otherwise the scenario is evaluated once, with
    the logging options taken from ``--log-file`` and ``--log-level``.
    """
    args = parse_args()
    # argparse already restricts --log-level to known names; the INFO fallback
    # only guards against an unexpected value.
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    log_file = Path(args.log_file) if args.log_file else None

    if args.dataset_build_config:
        result = run_dataset_build(args.dataset_build_config)
        print(f"Completed dataset build: {result.artifact_paths.root_dir}")
        return

    # Run the scenario exactly once, with the logging options applied.
    result = run_scenario(args.scenario, log_file=log_file, log_level=log_level)
    print(f"Completed run: {result.scenario.output_dir}")
|
||||
|
||||
|
||||
|
||||
67
rag_eval/advisor/__init__.py
Normal file
67
rag_eval/advisor/__init__.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Optimization advisor: rule-based diagnosis + LLM-powered recommendations."""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from rag_eval.reporting.artifacts import build_artifact_paths
|
||||
from rag_eval.shared.models import EvaluationResult, Scenario
|
||||
|
||||
from .llm_analyzer import analyze
|
||||
from .rules import Diagnosis, diagnose
|
||||
from .writer import write_advice
|
||||
|
||||
logger = logging.getLogger("rag_eval.advisor")
|
||||
|
||||
__all__ = ["run_advisor", "Diagnosis", "diagnose"]
|
||||
|
||||
|
||||
def run_advisor(
    result: EvaluationResult,
    scenario: Scenario,
    llm: Any,
) -> None:
    """Produce the optimization advice report for a finished evaluation run.

    Does nothing when ``scenario.optimization_advisor`` is falsy. Any exception
    raised while diagnosing, calling the LLM or writing the report is caught
    and logged as a warning, so it does not propagate to the caller.

    Args:
        result: Evaluation result; its ``run_id`` and ``score_rows`` are read.
        scenario: Resolved scenario; supplies the advisor flag, name, metrics,
            judge model and output directory.
        llm: LLM object forwarded unchanged to ``analyze``.
    """
    enabled = scenario.optimization_advisor
    if not enabled:
        return

    name = scenario.scenario_name
    logger.info("[advisor] starting optimization analysis scenario=%s", name)

    try:
        paths = build_artifact_paths(scenario.output_dir, result.run_id)
        advice_path = paths.advice_md
        if advice_path is None:
            logger.warning("[advisor] advice_md path not set in RunArtifactPaths — skipping")
            return

        findings = diagnose(result.score_rows, scenario.metrics)
        logger.info("[advisor] rule diagnosis complete: %d metric(s) triggered", len(findings))

        # The LLM is only consulted when at least one rule fired.
        report_body = asyncio.run(analyze(findings, llm, name)) if findings else ""

        write_advice(
            diagnoses=findings,
            llm_markdown=report_body,
            advice_path=advice_path,
            scenario_name=name,
            run_id=result.run_id,
            judge_model=scenario.judge_model,
        )
    except Exception as exc:
        logger.warning(
            "[advisor] advisor failed (%s: %s) — evaluation result is unaffected",
            type(exc).__name__, exc,
        )
|
||||
100
rag_eval/advisor/llm_analyzer.py
Normal file
100
rag_eval/advisor/llm_analyzer.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""LLM-powered analysis of rule diagnostics and low-score samples."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from .rules import Diagnosis
|
||||
|
||||
logger = logging.getLogger("rag_eval.advisor")
|
||||
|
||||
_PROMPT_TEMPLATE = """\
|
||||
你是一个 RAG 系统优化专家,正在分析西门子医疗 CT 文档问答系统的评测结果。
|
||||
请用中文撰写一份优化建议报告,格式为 Markdown。
|
||||
|
||||
## 评测诊断摘要
|
||||
|
||||
{diagnosis_summary}
|
||||
|
||||
## 低分样本示例
|
||||
|
||||
{low_sample_text}
|
||||
|
||||
## 报告要求
|
||||
|
||||
1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
|
||||
2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
|
||||
3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先)
|
||||
4. 语言简洁,面向工程师,不要废话,不要重复列表内容
|
||||
|
||||
只输出 Markdown 报告正文,不要任何前置说明。
|
||||
"""
|
||||
|
||||
|
||||
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
|
||||
lines = []
|
||||
for d in diagnoses:
|
||||
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
|
||||
lines.append(
|
||||
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f},"
|
||||
f"阈值={d.threshold},严重程度={d.severity}"
|
||||
)
|
||||
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
|
||||
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _build_low_sample_text(diagnoses: list[Diagnosis]) -> str:
|
||||
lines = []
|
||||
for d in diagnoses:
|
||||
if not d.low_samples:
|
||||
continue
|
||||
lines.append(f"### {d.metric} 低分样本(最多 3 条)")
|
||||
for i, s in enumerate(d.low_samples, 1):
|
||||
score = s.get(d.metric, "N/A")
|
||||
lines.append(f"\n**样本 {i}**(分数={score})")
|
||||
lines.append(f"- 问题:{s.get('question', '')}")
|
||||
lines.append(f"- 回答:{s.get('answer', '')[:300]}")
|
||||
lines.append(f"- 标准答案:{s.get('ground_truth', '')[:200]}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
async def analyze(
    diagnoses: list[Diagnosis],
    llm: Any,
    scenario_name: str,
) -> str:
    """Ask the LLM for a Chinese Markdown optimization report.

    The prompt is built from the diagnosis summary and the low-score samples,
    then sent as a single human message through ``llm.langchain_llm.ainvoke``.

    Args:
        diagnoses: Diagnoses to analyse; an empty value short-circuits to "".
        llm: Object exposing a ``langchain_llm`` attribute with an async
            ``ainvoke`` method.
        scenario_name: Used only in the log message.

    Returns:
        The stripped response content, or "" if anything in the call fails
        (the failure is logged as a warning rather than raised).
    """
    if not diagnoses:
        return ""

    prompt = _PROMPT_TEMPLATE.format(
        diagnosis_summary=_build_diagnosis_summary(diagnoses),
        low_sample_text=_build_low_sample_text(diagnoses),
    )

    try:
        logger.info("[advisor] calling LLM for optimization analysis scenario=%s", scenario_name)
        # Imported lazily, inside the try, so a missing package also ends in the fallback.
        from langchain_core.messages import HumanMessage

        reply = await llm.langchain_llm.ainvoke([HumanMessage(content=prompt)])
        report = reply.content.strip()
        logger.info("[advisor] LLM analysis complete chars=%d", len(report))
    except Exception as exc:
        logger.warning(
            "[advisor] LLM analysis failed (%s: %s) — falling back to rule report",
            type(exc).__name__, exc,
        )
        return ""
    return report
|
||||
236
rag_eval/advisor/rules.py
Normal file
236
rag_eval/advisor/rules.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""Rule-based diagnostic engine for RAG evaluation metric scores."""
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
class MetricRule:
    """Thresholds and canned diagnostic text for a single metric."""

    # Boundary for the mean score at which a "warning" is reported.
    warning_threshold: float
    # Boundary for the mean score at which a "critical" is reported.
    critical_threshold: float
    # True when larger scores are healthier; False for noise_sensitivity,
    # where a mean above the threshold triggers the diagnosis.
    higher_is_better: bool
    # Candidate explanations copied into the resulting diagnosis.
    root_causes: list[str]
    # Remediation hints copied into the resulting diagnosis.
    suggested_actions: list[str]
|
||||
|
||||
|
||||
METRIC_RULES: dict[str, MetricRule] = {
|
||||
"faithfulness": MetricRule(
|
||||
warning_threshold=0.7,
|
||||
critical_threshold=0.5,
|
||||
higher_is_better=True,
|
||||
root_causes=[
|
||||
"生成回答包含检索片段中不支持的陈述(幻觉)",
|
||||
"生成阶段未严格遵循 grounding 约束",
|
||||
"校验阶段未开启或未生效",
|
||||
],
|
||||
suggested_actions=[
|
||||
"强化生成 prompt 的 grounding 约束('只依据参考资料作答')",
|
||||
"开启校验阶段(validation: by_scenario)",
|
||||
"检查低分样本中模型是否引用了片段外的知识",
|
||||
],
|
||||
),
|
||||
"answer_relevancy": MetricRule(
|
||||
warning_threshold=0.7,
|
||||
critical_threshold=0.5,
|
||||
higher_is_better=True,
|
||||
root_causes=[
|
||||
"回答偏离问题主旨或包含大量冗余内容",
|
||||
"查询改写后问题语义漂移",
|
||||
"生成 prompt 格式约束不足",
|
||||
],
|
||||
suggested_actions=[
|
||||
"优化查询改写 prompt,确保改写后语义不偏移",
|
||||
"在生成 prompt 中加入'简洁准确、直接回答问题'的约束",
|
||||
"检查低分样本的回答是否存在格式冗余或话题偏移",
|
||||
],
|
||||
),
|
||||
"context_recall": MetricRule(
|
||||
warning_threshold=0.7,
|
||||
critical_threshold=0.5,
|
||||
higher_is_better=True,
|
||||
root_causes=[
|
||||
"检索未能召回标准答案所涉及的关键信息",
|
||||
"单一查询未能覆盖问题的多个角度",
|
||||
"过召回数量不足,关键片段被截断",
|
||||
],
|
||||
suggested_actions=[
|
||||
"启用多查询扩展(use_multi_query)覆盖不同措辞",
|
||||
"对多跳问题启用问题分解(sub_questions)",
|
||||
"加大过召回宽度(recall_top_k)",
|
||||
"对颗粒度细的问题尝试 Step-back 双路检索",
|
||||
],
|
||||
),
|
||||
"context_precision": MetricRule(
|
||||
warning_threshold=0.6,
|
||||
critical_threshold=0.4,
|
||||
higher_is_better=True,
|
||||
root_causes=[
|
||||
"检索引入过多与问题无关的片段",
|
||||
"重排未能将相关片段排在前列",
|
||||
"缺少相关性过滤,噪声片段进入上下文",
|
||||
],
|
||||
suggested_actions=[
|
||||
"启用或优化 listwise 重排,将相关片段排在前列",
|
||||
"启用上下文压缩(compression)过滤无关句子",
|
||||
"启用相关性过滤(relevance_filter)丢弃明确无关片段",
|
||||
"缩小 rerank_keep_k(如从 8 降到 5)",
|
||||
],
|
||||
),
|
||||
"noise_sensitivity": MetricRule(
|
||||
warning_threshold=0.3, # higher is worse; trigger when mean > threshold
|
||||
critical_threshold=0.5,
|
||||
higher_is_better=False,
|
||||
root_causes=[
|
||||
"回答中包含检索到的噪声片段所引入的错误陈述",
|
||||
"相关性过滤未能拦截干扰性片段",
|
||||
"生成阶段对噪声片段未加区分地引用",
|
||||
],
|
||||
suggested_actions=[
|
||||
"启用相关性过滤(relevance_filter)拦截噪声",
|
||||
"优化重排,将不相关片段排到截断点之后",
|
||||
"在生成 prompt 中强调'来源冲突时并列陈述,不擅自下定论'",
|
||||
],
|
||||
),
|
||||
"factual_correctness": MetricRule(
|
||||
warning_threshold=0.6,
|
||||
critical_threshold=0.4,
|
||||
higher_is_better=True,
|
||||
root_causes=[
|
||||
"回答的事实陈述与标准答案存在偏差",
|
||||
"检索未能命中标准答案所依据的关键片段",
|
||||
"生成阶段对多个来源综合时产生事实错误",
|
||||
],
|
||||
suggested_actions=[
|
||||
"重点检查低分样本,确认是检索遗漏还是生成错误",
|
||||
"提升 context_recall 以确保关键信息被检索到",
|
||||
"对事实型问题将 temperature 降至 0",
|
||||
],
|
||||
),
|
||||
"semantic_similarity": MetricRule(
|
||||
warning_threshold=0.7,
|
||||
critical_threshold=0.5,
|
||||
higher_is_better=True,
|
||||
root_causes=[
|
||||
"回答语义与标准答案差距较大",
|
||||
"回答过于简短或过于冗长,语义偏移",
|
||||
"检索到的片段质量不足,导致生成内容偏离",
|
||||
],
|
||||
suggested_actions=[
|
||||
"检查低分样本的回答与标准答案的表述差异",
|
||||
"优化生成 prompt 使回答更贴近标准表述风格",
|
||||
"提升检索质量(context_recall / context_precision)",
|
||||
],
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
@dataclass
class Diagnosis:
    """Outcome for one metric whose mean score crossed a threshold."""

    # Name of the metric that triggered.
    metric: str
    # Mean score of the metric for this run.
    mean_score: float
    # The specific threshold (warning or critical) that was crossed.
    threshold: float
    # Either "warning" or "critical".
    severity: str
    # The list fields default to fresh empty lists per instance.
    root_causes: list[str] = field(default_factory=list)
    suggested_actions: list[str] = field(default_factory=list)
    # Worst-scoring sample rows attached for context.
    low_samples: list[dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
|
||||
def _mean_ignoring_nan(values: list[float]) -> float | None:
|
||||
valid = [v for v in values if not math.isnan(v)]
|
||||
if not valid:
|
||||
return None
|
||||
return sum(valid) / len(valid)
|
||||
|
||||
|
||||
def _select_low_samples(
|
||||
rows: list[dict[str, Any]],
|
||||
metric: str,
|
||||
top_n: int,
|
||||
higher_is_better: bool,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Return the top_n worst-scoring rows for a metric, excluding NaN."""
|
||||
valid = [r for r in rows if metric in r and not math.isnan(float(r[metric]))]
|
||||
sorted_rows = sorted(valid, key=lambda r: float(r[metric]), reverse=not higher_is_better)
|
||||
worst = sorted_rows[:top_n]
|
||||
keep_keys = {"sample_id", "question", "answer", "ground_truth", metric}
|
||||
return [{k: v for k, v in row.items() if k in keep_keys} for row in worst]
|
||||
|
||||
|
||||
def diagnose(
    score_rows: list[dict[str, Any]],
    metrics: list[str],
    top_low_samples: int = 3,
) -> list[Diagnosis]:
    """Build a Diagnosis for every requested metric whose mean crosses a threshold.

    Metrics without an entry in ``METRIC_RULES`` are skipped, as are metrics
    with no usable score. Per-row values that are ``None`` or cannot be
    converted to ``float`` are ignored, and NaN values are excluded from the
    mean.

    Args:
        score_rows: Per-sample score dicts.
        metrics: Metric names to check, in the order results should appear.
        top_low_samples: Number of worst-scoring rows attached per diagnosis.

    Returns:
        One Diagnosis per triggered metric; an empty list when none triggers.
    """

    def _collect(metric):
        # Gather every score for the metric that parses as a float.
        collected = []
        for row in score_rows:
            raw = row.get(metric)
            if raw is None:
                continue
            try:
                collected.append(float(raw))
            except (TypeError, ValueError):
                continue
        return collected

    def _classify(mean, rule):
        # Direction-aware comparison; critical is tested before warning.
        if rule.higher_is_better:
            breached = lambda limit: mean < limit
        else:
            breached = lambda limit: mean > limit
        levels = (
            ("critical", rule.critical_threshold),
            ("warning", rule.warning_threshold),
        )
        for severity, limit in levels:
            if breached(limit):
                return severity, limit
        return None

    findings: list[Diagnosis] = []
    for metric in metrics:
        rule = METRIC_RULES.get(metric)
        if rule is None:
            continue  # no rule for this metric

        mean = _mean_ignoring_nan(_collect(metric))
        if mean is None:
            continue  # nothing usable to average

        verdict = _classify(mean, rule)
        if verdict is None:
            continue  # within the healthy range

        severity, threshold = verdict
        worst_rows = _select_low_samples(
            score_rows, metric, top_low_samples, rule.higher_is_better
        )
        findings.append(
            Diagnosis(
                metric=metric,
                mean_score=round(mean, 4),
                threshold=threshold,
                severity=severity,
                # Copy so a diagnosis never shares list objects with the rule table.
                root_causes=list(rule.root_causes),
                suggested_actions=list(rule.suggested_actions),
                low_samples=worst_rows,
            )
        )

    return findings
|
||||
82
rag_eval/advisor/writer.py
Normal file
82
rag_eval/advisor/writer.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Write optimization advice to markdown file and emit log summary."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from .rules import Diagnosis
|
||||
|
||||
logger = logging.getLogger("rag_eval.advisor")
|
||||
|
||||
|
||||
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
|
||||
"""Return a single-line log summary of triggered diagnoses."""
|
||||
if not diagnoses:
|
||||
return "[advisor] 所有指标正常,无需优化建议。"
|
||||
parts = [f"{d.metric}({d.mean_score:.2f}, {d.severity})" for d in diagnoses]
|
||||
triggered = " ".join(parts)
|
||||
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered} → {advice_path}"
|
||||
|
||||
|
||||
def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
|
||||
"""Build a rules-only report when LLM analysis is unavailable."""
|
||||
if not diagnoses:
|
||||
return ""
|
||||
lines = ["## 规则诊断(LLM 分析不可用)\n"]
|
||||
for d in diagnoses:
|
||||
lines.append(f"### {d.metric} [{d.severity}] 均值={d.mean_score:.4f}")
|
||||
lines.append("\n**可能原因:**")
|
||||
for cause in d.root_causes:
|
||||
lines.append(f"- {cause}")
|
||||
lines.append("\n**建议动作:**")
|
||||
for action in d.suggested_actions:
|
||||
lines.append(f"- {action}")
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def write_advice(
    diagnoses: list[Diagnosis],
    llm_markdown: str,
    advice_path: Path,
    scenario_name: str,
    run_id: str,
    judge_model: str,
) -> None:
    """Write the advice Markdown file and log a summary.

    The file starts with a header (scenario name, run id, generation time and
    judge model) followed by one of three bodies: a fixed "no anomalies"
    notice when there are no diagnoses, the LLM text when it is non-empty, or
    the rules-only fallback report otherwise.

    Args:
        diagnoses: Diagnoses to report on.
        llm_markdown: LLM-generated body; an empty string selects the fallback.
        advice_path: Destination file; parent directories are created.
        scenario_name: Shown in the report title.
        run_id: Shown in the report header.
        judge_model: Shown in the report header.
    """
    advice_path.parent.mkdir(parents=True, exist_ok=True)

    from rag_eval.shared.utils import utc_now_iso

    header = (
        f"# 优化建议报告 — {scenario_name}\n"
        "\n"
        f"- run_id: `{run_id}`\n"
        f"- 生成时间: `{utc_now_iso()}`\n"
        f"- judge_model: `{judge_model}`\n"
        "\n"
        "---\n"
    )

    if diagnoses and llm_markdown:
        body = llm_markdown
    elif diagnoses:
        # Diagnoses exist but no LLM text is available: rules-only report.
        body = _build_fallback_report(diagnoses)
    else:
        body = "## ✅ 未发现明显指标异常\n\n所有指标均在正常范围内,当前 RAG 链路表现良好。\n"

    advice_path.write_text(header + body, encoding="utf-8")

    logger.info(_format_log_summary(diagnoses, advice_path))
    logger.info("[advisor] 优化建议已写出: %s", advice_path)
|
||||
@@ -61,6 +61,7 @@ def load_scenario(path: str | Path) -> Scenario:
|
||||
max_samples=model.runtime.max_samples,
|
||||
),
|
||||
source_path=scenario_path,
|
||||
optimization_advisor=model.optimization_advisor,
|
||||
)
|
||||
# Run cross-field checks after all relative paths have been resolved.
|
||||
validate_scenario(scenario)
|
||||
|
||||
@@ -54,6 +54,7 @@ class ScenarioModel(BaseModel):
|
||||
metrics: list[str]
|
||||
output_dir: str
|
||||
runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
|
||||
optimization_advisor: bool = False
|
||||
|
||||
@field_validator("metrics")
|
||||
@classmethod
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from rag_eval.adapters.base import AppAdapter
|
||||
@@ -13,6 +15,8 @@ from rag_eval.metrics.pipeline import MetricPipeline
|
||||
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
|
||||
from rag_eval.shared.utils import utc_now_iso
|
||||
|
||||
logger = logging.getLogger("rag_eval.execution.evaluator")
|
||||
|
||||
|
||||
class Evaluator:
|
||||
"""Coordinate dataset loading, optional app execution, and metric scoring."""
|
||||
@@ -31,27 +35,61 @@ class Evaluator:
|
||||
def evaluate(self) -> EvaluationResult:
|
||||
"""Execute the full evaluation flow and return the collected results."""
|
||||
started_at = utc_now_iso()
|
||||
scenario_name = self.scenario.scenario_name
|
||||
mode = self.scenario.mode
|
||||
logger.info("=" * 60)
|
||||
logger.info("[eval] START scenario=%s mode=%s", scenario_name, mode)
|
||||
logger.info("[eval] dataset=%s", self.scenario.dataset.path)
|
||||
logger.info("[eval] metrics=%s", list(self.scenario.metrics))
|
||||
logger.info("[eval] judge=%s embed=%s", self.scenario.judge_model, self.scenario.embedding_model)
|
||||
|
||||
raw_records = load_dataset_records(self.scenario.dataset.path)
|
||||
logger.info("[eval] raw_records=%d", len(raw_records))
|
||||
|
||||
samples, invalid_samples = normalize_records(
|
||||
raw_records,
|
||||
mode=self.scenario.mode,
|
||||
max_samples=self.scenario.runtime.max_samples,
|
||||
)
|
||||
logger.info("[eval] normalized: valid=%d invalid=%d", len(samples), len(invalid_samples))
|
||||
|
||||
if self.scenario.mode == "online":
|
||||
# Online mode enriches each sample by calling the target application first.
|
||||
logger.info("[eval] online mode: calling app adapter for %d samples ...", len(samples))
|
||||
t0 = time.monotonic()
|
||||
samples, online_invalids = asyncio.run(self._enrich_online_samples(samples))
|
||||
elapsed = time.monotonic() - t0
|
||||
invalid_samples.extend(online_invalids)
|
||||
logger.info(
|
||||
"[eval] adapter done: enriched=%d adapter_invalids=%d elapsed=%.1fs",
|
||||
len(samples), len(online_invalids), elapsed,
|
||||
)
|
||||
|
||||
logger.info("[eval] scoring %d samples with metric pipeline ...", len(samples))
|
||||
t0 = time.monotonic()
|
||||
metric_scores = asyncio.run(
|
||||
self.metric_pipeline.score_samples(
|
||||
samples,
|
||||
max_concurrency=self.scenario.runtime.metric_limit(),
|
||||
)
|
||||
)
|
||||
elapsed = time.monotonic() - t0
|
||||
logger.info("[eval] metric scoring done elapsed=%.1fs", elapsed)
|
||||
|
||||
finished_at = utc_now_iso()
|
||||
score_rows = [self._merge_score(sample, score) for sample, score in zip(samples, metric_scores)]
|
||||
|
||||
# Summary of NaN rates per metric
|
||||
import math
|
||||
for metric_name in self.scenario.metrics:
|
||||
nan_count = sum(1 for row in score_rows if math.isnan(float(row.get(metric_name, float("nan")) or float("nan"))))
|
||||
logger.info("[eval] %-22s NaN=%d/%d (%.0f%%)",
|
||||
metric_name, nan_count, len(score_rows),
|
||||
100 * nan_count / len(score_rows) if score_rows else 0)
|
||||
|
||||
run_id = finished_at.replace(":", "-")
|
||||
logger.info("[eval] DONE run_id=%s total_valid=%d total_invalid=%d",
|
||||
run_id, len(samples), len(invalid_samples))
|
||||
logger.info("=" * 60)
|
||||
return EvaluationResult(
|
||||
scenario=self.scenario,
|
||||
run_id=run_id,
|
||||
@@ -72,13 +110,27 @@ class Evaluator:
|
||||
|
||||
valid: list[NormalizedSample] = []
|
||||
invalid: list[InvalidSample] = []
|
||||
total = len(samples)
|
||||
|
||||
async def enrich_with_capture(sample: NormalizedSample) -> NormalizedSample | InvalidSample:
|
||||
async def enrich_with_capture(idx: int, sample: NormalizedSample) -> NormalizedSample | InvalidSample:
|
||||
"""Convert adapter exceptions into invalid samples instead of aborting the run."""
|
||||
sid = sample.sample_id[:12]
|
||||
logger.debug("[adapter] [%d/%d] calling adapter sample=%s question=%r",
|
||||
idx + 1, total, sid, (sample.question or "")[:60])
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
return await self.app_adapter.enrich_sample(sample)
|
||||
result = await self.app_adapter.enrich_sample(sample)
|
||||
elapsed = time.monotonic() - t0
|
||||
ans_len = len(result.answer or "")
|
||||
ctx_count = len(result.contexts or [])
|
||||
logger.info("[adapter] [%d/%d] OK sample=%-12s ans_len=%d ctx_count=%d elapsed=%.1fs",
|
||||
idx + 1, total, sid, ans_len, ctx_count, elapsed)
|
||||
return result
|
||||
except Exception as exc:
|
||||
elapsed = time.monotonic() - t0
|
||||
error_type = type(exc).__name__
|
||||
logger.warning("[adapter] [%d/%d] FAIL sample=%-12s %s: %s (elapsed=%.1fs)",
|
||||
idx + 1, total, sid, error_type, exc, elapsed)
|
||||
return InvalidSample(
|
||||
sample_id=sample.sample_id,
|
||||
error=f"adapter failed [{error_type}]: {exc}",
|
||||
@@ -86,8 +138,8 @@ class Evaluator:
|
||||
)
|
||||
|
||||
factories = [
|
||||
(lambda sample=sample: enrich_with_capture(sample))
|
||||
for sample in samples
|
||||
(lambda _idx=i, _sample=sample: enrich_with_capture(_idx, _sample))
|
||||
for i, sample in enumerate(samples)
|
||||
]
|
||||
results = await gather_with_limit(factories, self.scenario.runtime.app_limit())
|
||||
|
||||
@@ -102,6 +154,8 @@ class Evaluator:
|
||||
if not sample.contexts:
|
||||
errors.append("adapter returned empty contexts")
|
||||
if errors:
|
||||
logger.warning("[adapter] incomplete payload sample=%s errors=%s",
|
||||
sample.sample_id[:12], errors)
|
||||
invalid.append(
|
||||
InvalidSample(
|
||||
sample_id=sample.sample_id,
|
||||
@@ -111,6 +165,9 @@ class Evaluator:
|
||||
)
|
||||
continue
|
||||
valid.append(sample)
|
||||
|
||||
logger.info("[adapter] enrichment summary: valid=%d invalid=%d of total=%d",
|
||||
len(valid), len(invalid), total)
|
||||
return valid, invalid
|
||||
|
||||
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
|
||||
|
||||
@@ -2,16 +2,42 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from rag_eval.adapters.http import HttpAppAdapter
|
||||
from rag_eval.adapters.python import PythonFunctionAdapter
|
||||
from rag_eval.advisor import run_advisor
|
||||
from rag_eval.config.loader import load_scenario
|
||||
from rag_eval.metrics.factory import build_metric_pipeline
|
||||
from rag_eval.metrics.factory import build_models, build_metric_pipeline
|
||||
from rag_eval.reporting.writers import write_run_artifacts
|
||||
from rag_eval.settings import EvaluationSettings
|
||||
from rag_eval.shared.models import Scenario
|
||||
|
||||
from .evaluator import Evaluator
|
||||
|
||||
logger = logging.getLogger("rag_eval.execution.runner")
|
||||
|
||||
|
||||
def _setup_logging(log_file: Path | None = None, level: int = logging.INFO) -> None:
|
||||
"""Configure root logger: always write to stderr, optionally also to a file."""
|
||||
fmt = "%(asctime)s %(levelname)-8s %(name)s %(message)s"
|
||||
datefmt = "%H:%M:%S"
|
||||
|
||||
handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
|
||||
if log_file is not None:
|
||||
log_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
fh = logging.FileHandler(log_file, encoding="utf-8")
|
||||
fh.setFormatter(logging.Formatter(fmt, datefmt=datefmt))
|
||||
handlers.append(fh)
|
||||
|
||||
logging.basicConfig(level=level, format=fmt, datefmt=datefmt, handlers=handlers, force=True)
|
||||
# Also show ragas internal logs at WARNING so we can see LLM errors
|
||||
logging.getLogger("ragas").setLevel(logging.WARNING)
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
logging.getLogger("openai").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def build_adapter(scenario: Scenario):
|
||||
"""Instantiate the adapter required by the resolved scenario, if any."""
|
||||
@@ -27,16 +53,32 @@ def build_adapter(scenario: Scenario):
|
||||
def run_scenario(
|
||||
scenario_path: str,
|
||||
settings: EvaluationSettings | None = None,
|
||||
log_file: Path | None = None,
|
||||
log_level: int = logging.INFO,
|
||||
):
|
||||
"""Run one scenario end to end and persist its reporting artifacts."""
|
||||
_setup_logging(log_file=log_file, level=log_level)
|
||||
logger.info("[runner] run_scenario path=%s", scenario_path)
|
||||
|
||||
settings = settings or EvaluationSettings()
|
||||
if not settings.openai_api_key:
|
||||
raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")
|
||||
|
||||
scenario = load_scenario(scenario_path)
|
||||
logger.info("[runner] scenario loaded: name=%s mode=%s max_samples=%s",
|
||||
scenario.scenario_name, scenario.mode, scenario.runtime.max_samples)
|
||||
|
||||
# Build models once; reuse llm in both MetricPipeline and advisor.
|
||||
llm, embeddings = build_models(scenario.judge_model, scenario.embedding_model, settings)
|
||||
|
||||
adapter = build_adapter(scenario)
|
||||
pipeline = build_metric_pipeline(scenario, settings)
|
||||
pipeline = build_metric_pipeline(scenario, settings, llm=llm, embeddings=embeddings)
|
||||
evaluator = Evaluator(scenario=scenario, metric_pipeline=pipeline, app_adapter=adapter)
|
||||
result = evaluator.evaluate()
|
||||
write_run_artifacts(result)
|
||||
logger.info("[runner] artifacts written for run_id=%s", result.run_id)
|
||||
|
||||
# Optimization advisor — runs only if scenario.optimization_advisor is True.
|
||||
run_advisor(result, scenario, llm)
|
||||
|
||||
return result
|
||||
|
||||
@@ -18,7 +18,10 @@ from ragas.metrics.collections import (
|
||||
AnswerRelevancy,
|
||||
ContextPrecision,
|
||||
ContextRecall,
|
||||
FactualCorrectness,
|
||||
Faithfulness,
|
||||
NoiseSensitivity,
|
||||
SemanticSimilarity,
|
||||
)
|
||||
|
||||
from .pipeline import MetricPipeline
|
||||
@@ -39,19 +42,34 @@ def build_models(
|
||||
def build_metric_pipeline(
|
||||
scenario: Scenario,
|
||||
settings: EvaluationSettings,
|
||||
llm: Any | None = None,
|
||||
embeddings: Any | None = None,
|
||||
) -> MetricPipeline:
|
||||
"""Build a metric pipeline containing only the metrics requested by the scenario."""
|
||||
llm, embeddings = build_models(
|
||||
scenario.judge_model,
|
||||
scenario.embedding_model,
|
||||
settings,
|
||||
)
|
||||
"""Build a metric pipeline containing only the metrics requested by the scenario.
|
||||
|
||||
If llm and embeddings are provided (pre-built by the caller), they are reused.
|
||||
Otherwise, new instances are created from scenario + settings.
|
||||
"""
|
||||
if llm is None or embeddings is None:
|
||||
llm, embeddings = build_models(
|
||||
scenario.judge_model,
|
||||
scenario.embedding_model,
|
||||
settings,
|
||||
)
|
||||
|
||||
# Build the full registry once, then slice it by configured metric names.
|
||||
registry: dict[str, Any] = {
|
||||
"faithfulness": Faithfulness(llm=llm),
|
||||
"answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
|
||||
"context_recall": ContextRecall(llm=llm),
|
||||
"context_precision": ContextPrecision(llm=llm),
|
||||
# Robustness / end-to-end metrics (架构设计 §10.2).
|
||||
# NoiseSensitivity mode='relevant': sensitivity to noise from relevant contexts.
|
||||
"noise_sensitivity": NoiseSensitivity(llm=llm),
|
||||
# FactualCorrectness mode='f1': balances claim precision and recall vs. ground truth.
|
||||
"factual_correctness": FactualCorrectness(llm=llm),
|
||||
# SemanticSimilarity: embedding cosine between answer and ground truth (no LLM call).
|
||||
"semantic_similarity": SemanticSimilarity(embeddings=embeddings),
|
||||
}
|
||||
return MetricPipeline(
|
||||
metrics={name: registry[name] for name in scenario.metrics},
|
||||
|
||||
@@ -3,12 +3,16 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from rag_eval.shared.models import MetricScore, NormalizedSample
|
||||
|
||||
logger = logging.getLogger("rag_eval.metrics.pipeline")
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class MetricPipeline:
|
||||
@@ -22,12 +26,43 @@ class MetricPipeline:
|
||||
results = {name: math.nan for name in self.metrics}
|
||||
errors: list[str] = []
|
||||
|
||||
sid = sample.sample_id[:12]
|
||||
ans_len = len(sample.answer or "")
|
||||
ctx_count = len(sample.contexts or [])
|
||||
logger.debug(
|
||||
"[score] sample=%s ans_len=%d ctx_count=%d question=%r",
|
||||
sid, ans_len, ctx_count,
|
||||
(sample.question or "")[:80],
|
||||
)
|
||||
|
||||
for name, metric in self.metrics.items():
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
result = await self._run_metric(name, metric, sample)
|
||||
results[name] = float(result.value)
|
||||
score_val = float(result.value)
|
||||
results[name] = score_val
|
||||
elapsed = time.monotonic() - t0
|
||||
logger.info(
|
||||
"[metric OK ] sample=%-12s %-20s score=%.4f elapsed=%.1fs",
|
||||
sid, name, score_val, elapsed,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
elapsed = time.monotonic() - t0
|
||||
msg = f"timeout after {self.metric_timeout_seconds}s"
|
||||
errors.append(f"{name}: {msg}")
|
||||
logger.warning(
|
||||
"[metric TMO] sample=%-12s %-20s TIMEOUT after %.1fs",
|
||||
sid, name, elapsed,
|
||||
)
|
||||
except Exception as exc:
|
||||
elapsed = time.monotonic() - t0
|
||||
exc_type = type(exc).__name__
|
||||
errors.append(f"{name}: {exc}")
|
||||
logger.warning(
|
||||
"[metric ERR] sample=%-12s %-20s %s: %s (elapsed=%.1fs)",
|
||||
sid, name, exc_type, exc, elapsed,
|
||||
)
|
||||
|
||||
return MetricScore(metrics=results, error=" | ".join(errors))
|
||||
|
||||
async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
|
||||
@@ -59,6 +94,23 @@ class MetricPipeline:
|
||||
reference=sample.ground_truth,
|
||||
retrieved_contexts=sample.contexts,
|
||||
)
|
||||
elif name == "noise_sensitivity":
|
||||
coroutine = metric.ascore(
|
||||
user_input=sample.question,
|
||||
response=sample.answer,
|
||||
reference=sample.ground_truth,
|
||||
retrieved_contexts=sample.contexts,
|
||||
)
|
||||
elif name == "factual_correctness":
|
||||
coroutine = metric.ascore(
|
||||
response=sample.answer,
|
||||
reference=sample.ground_truth,
|
||||
)
|
||||
elif name == "semantic_similarity":
|
||||
coroutine = metric.ascore(
|
||||
reference=sample.ground_truth,
|
||||
response=sample.answer,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported metric: {name}")
|
||||
|
||||
@@ -72,11 +124,22 @@ class MetricPipeline:
|
||||
max_concurrency: int,
|
||||
) -> list[MetricScore]:
|
||||
"""Score all samples while respecting the configured concurrency limit."""
|
||||
total = len(samples)
|
||||
logger.info("[pipeline] scoring %d samples concurrency=%d timeout=%ss",
|
||||
total, max_concurrency, self.metric_timeout_seconds)
|
||||
semaphore = asyncio.Semaphore(max(1, max_concurrency))
|
||||
completed = 0
|
||||
|
||||
async def guarded(sample: NormalizedSample) -> MetricScore:
|
||||
async def guarded(idx: int, sample: NormalizedSample) -> MetricScore:
|
||||
"""Throttle a single sample-scoring coroutine with the shared semaphore."""
|
||||
nonlocal completed
|
||||
async with semaphore:
|
||||
return await self.score_sample(sample)
|
||||
result = await self.score_sample(sample)
|
||||
completed += 1
|
||||
nan_metrics = [k for k, v in result.metrics.items() if math.isnan(v)]
|
||||
status = f"NaN={nan_metrics}" if nan_metrics else "all OK"
|
||||
logger.info("[pipeline] progress %d/%d sample=%-12s %s",
|
||||
completed, total, sample.sample_id[:12], status)
|
||||
return result
|
||||
|
||||
return await asyncio.gather(*(guarded(sample) for sample in samples))
|
||||
return await asyncio.gather(*(guarded(i, s) for i, s in enumerate(samples)))
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
"""Supported metric names recognized by scenario validation and pipeline setup."""
|
||||
|
||||
SUPPORTED_METRICS = {
    # Core retrieval / generation metrics (always available).
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
    # Robustness and end-to-end metrics (see architecture design document §10.2).
    "noise_sensitivity",  # Robustness: sensitivity to retrieval noise
    "factual_correctness",  # End-to-end: factual correctness of the answer relative to the ground truth
    "semantic_similarity",  # End-to-end: semantic similarity between answer and ground truth (embedding-based, no LLM call)
}
|
||||
|
||||
@@ -17,4 +17,5 @@ def build_artifact_paths(output_dir: Path, run_id: str) -> RunArtifactPaths:
|
||||
invalid_csv=run_dir / "invalid.csv",
|
||||
summary_md=run_dir / "summary.md",
|
||||
metadata_json=run_dir / "metadata.json",
|
||||
advice_md=run_dir / "optimization_advice.md",
|
||||
)
|
||||
|
||||
@@ -76,6 +76,7 @@ class Scenario:
|
||||
runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
|
||||
app_adapter: AppAdapterConfig | None = None
|
||||
source_path: Path | None = None
|
||||
optimization_advisor: bool = False
|
||||
|
||||
def snapshot(self) -> dict[str, Any]:
|
||||
"""Serialize the scenario into a reporting-friendly dictionary snapshot."""
|
||||
@@ -159,3 +160,4 @@ class RunArtifactPaths:
|
||||
invalid_csv: Path
|
||||
summary_md: Path
|
||||
metadata_json: Path
|
||||
advice_md: Path | None = None
|
||||
|
||||
107
run_eval.bat
Normal file
107
run_eval.bat
Normal file
@@ -0,0 +1,107 @@
|
||||
@echo off
|
||||
setlocal enabledelayedexpansion
|
||||
|
||||
:: ============================================================
|
||||
:: run_eval.bat - Run a RAGAS evaluation scenario with logs
|
||||
::
|
||||
:: Usage:
|
||||
:: run_eval.bat (uses default online scenario)
|
||||
:: run_eval.bat offline (runs offline smoke scenario)
|
||||
:: run_eval.bat path\to\scenario.yaml (any custom scenario)
|
||||
:: run_eval.bat offline DEBUG (second arg = log level)
|
||||
:: ============================================================
|
||||
|
||||
cd /d "%~dp0"
|
||||
|
||||
echo.
|
||||
echo ============================================================
|
||||
echo Siemens RAGAS - Evaluation Runner
|
||||
echo ============================================================
|
||||
echo.
|
||||
|
||||
:: ----------------------------------------------------------------
|
||||
:: 1. Resolve scenario path (arg1)
|
||||
:: ----------------------------------------------------------------
|
||||
set "SCENARIO=%~1"
|
||||
if "%SCENARIO%"=="" set "SCENARIO=online"
|
||||
|
||||
if /i "%SCENARIO%"=="online" (
|
||||
set "SCENARIO=scenarios\online\siemens-pdf-question-bank-online.yaml"
|
||||
)
|
||||
if /i "%SCENARIO%"=="offline" (
|
||||
set "SCENARIO=scenarios\offline\siemens-pdf-offline-smoke.yaml"
|
||||
)
|
||||
|
||||
if not exist "%SCENARIO%" (
|
||||
echo [ERROR] Scenario file not found: %SCENARIO%
|
||||
echo.
|
||||
echo Usage examples:
|
||||
echo run_eval.bat - online eval (default)
|
||||
echo run_eval.bat offline - offline smoke
|
||||
echo run_eval.bat path\to\file.yaml - custom scenario
|
||||
goto :error
|
||||
)
|
||||
echo [OK] Scenario : %SCENARIO%
|
||||
|
||||
:: ----------------------------------------------------------------
|
||||
:: 2. Resolve log level (arg2, default INFO)
|
||||
:: ----------------------------------------------------------------
|
||||
set "LOG_LEVEL=%~2"
|
||||
if "%LOG_LEVEL%"=="" set "LOG_LEVEL=INFO"
|
||||
echo [OK] Log level: %LOG_LEVEL%
|
||||
|
||||
:: ----------------------------------------------------------------
|
||||
:: 3. Create logs dir and build timestamped log filename
|
||||
:: ----------------------------------------------------------------
|
||||
if not exist "logs" mkdir logs

:: %DATE% and %TIME% are rendered according to the user's regional settings
:: (e.g. "Tue 06/16/2026", "16.06.2026", "14:05:09,12"), so splitting them on
:: fixed delimiters produces log names containing spaces, weekday names or
:: misplaced fields. Ask PowerShell for a fixed-format timestamp instead; the
:: format matches the one used by run_eval.ps1.
set "STAMP="
for /f %%t in ('powershell -NoProfile -Command "Get-Date -Format yyyy-MM-dd_HHmmss"') do set "STAMP=%%t"

:: Fall back to a pseudo-random suffix if PowerShell could not be started,
:: so the log file name is never left empty.
if not defined STAMP set "STAMP=%RANDOM%"

set "LOG_FILE=logs\eval_%STAMP%.log"
|
||||
echo [OK] Log file : %LOG_FILE%
|
||||
|
||||
echo.
|
||||
echo ============================================================
|
||||
echo Starting evaluation...
|
||||
echo (Logs also written to %LOG_FILE%)
|
||||
echo Press Ctrl+C to abort
|
||||
echo ============================================================
|
||||
echo.
|
||||
|
||||
:: ----------------------------------------------------------------
|
||||
:: 4. Run evaluation with UTF-8 and logging
|
||||
:: ----------------------------------------------------------------
|
||||
set PYTHONIOENCODING=utf-8
|
||||
set PYTHONPATH=.
|
||||
|
||||
python main.py ^
|
||||
--scenario "%SCENARIO%" ^
|
||||
--log-file "%LOG_FILE%" ^
|
||||
--log-level %LOG_LEVEL%
|
||||
|
||||
if errorlevel 1 (
|
||||
echo.
|
||||
echo [ERROR] Evaluation failed. Check log: %LOG_FILE%
|
||||
goto :error
|
||||
)
|
||||
|
||||
echo.
|
||||
echo ============================================================
|
||||
echo Evaluation complete!
|
||||
echo Log saved to: %LOG_FILE%
|
||||
echo Open the web console to view results: start.bat
|
||||
echo ============================================================
|
||||
echo.
|
||||
pause
|
||||
exit /b 0
|
||||
|
||||
:error
|
||||
echo.
|
||||
echo ============================================================
|
||||
echo Evaluation failed. See error above or check log file.
|
||||
echo ============================================================
|
||||
pause
|
||||
exit /b 1
|
||||
96
run_eval.ps1
Normal file
96
run_eval.ps1
Normal file
@@ -0,0 +1,96 @@
|
||||
# run_eval.ps1 - Siemens RAGAS Evaluation Runner
|
||||
# Usage:
|
||||
# .\run_eval.ps1 # online eval (default)
|
||||
# .\run_eval.ps1 offline # offline smoke
|
||||
# .\run_eval.ps1 path\to\scenario.yaml # custom scenario
|
||||
# .\run_eval.ps1 online DEBUG # second arg = log level (DEBUG/INFO/WARNING)
|
||||
# Or: powershell -ExecutionPolicy Bypass -File run_eval.ps1 [scenario] [log-level]
|
||||
|
||||
param(
|
||||
[string]$Scenario = "online",
|
||||
[string]$LogLevel = "INFO"
|
||||
)
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
Set-Location $PSScriptRoot
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "============================================================" -ForegroundColor Cyan
|
||||
Write-Host " Siemens RAGAS - Evaluation Runner" -ForegroundColor Cyan
|
||||
Write-Host "============================================================" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 1. Resolve scenario path
|
||||
# ----------------------------------------------------------------
|
||||
$scenarioMap = @{
|
||||
"online" = "scenarios\online\siemens-pdf-question-bank-online.yaml"
|
||||
"offline" = "scenarios\offline\siemens-pdf-offline-smoke.yaml"
|
||||
}
|
||||
if ($scenarioMap.ContainsKey($Scenario.ToLower())) {
|
||||
$Scenario = $scenarioMap[$Scenario.ToLower()]
|
||||
}
|
||||
if (-not (Test-Path $Scenario)) {
|
||||
Write-Host "[ERROR] Scenario file not found: $Scenario" -ForegroundColor Red
|
||||
Write-Host ""
|
||||
Write-Host "Usage examples:"
|
||||
Write-Host " .\run_eval.ps1 - online eval (default)"
|
||||
Write-Host " .\run_eval.ps1 offline - offline smoke"
|
||||
Write-Host " .\run_eval.ps1 path\to\file.yaml - custom scenario"
|
||||
Read-Host "Press Enter to exit"
|
||||
exit 1
|
||||
}
|
||||
Write-Host "[OK] Scenario : $Scenario" -ForegroundColor Green
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 2. Validate log level
|
||||
# ----------------------------------------------------------------
|
||||
$validLevels = @("DEBUG", "INFO", "WARNING", "ERROR")
|
||||
if ($validLevels -notcontains $LogLevel.ToUpper()) {
|
||||
Write-Host "[WARN] Unknown log level '$LogLevel', defaulting to INFO" -ForegroundColor Yellow
|
||||
$LogLevel = "INFO"
|
||||
}
|
||||
Write-Host "[OK] Log level: $LogLevel" -ForegroundColor Green
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 3. Create logs dir with timestamped filename
|
||||
# ----------------------------------------------------------------
|
||||
if (-not (Test-Path "logs")) { New-Item -ItemType Directory "logs" | Out-Null }
|
||||
$timestamp = Get-Date -Format "yyyy-MM-dd_HHmmss"
|
||||
$logFile = "logs\eval_$timestamp.log"
|
||||
Write-Host "[OK] Log file : $logFile" -ForegroundColor Green
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "============================================================" -ForegroundColor Cyan
|
||||
Write-Host " Starting evaluation..." -ForegroundColor Cyan
|
||||
Write-Host " Logs also written to: $logFile" -ForegroundColor Cyan
|
||||
Write-Host " Press Ctrl+C to abort" -ForegroundColor Yellow
|
||||
Write-Host "============================================================" -ForegroundColor Cyan
|
||||
Write-Host ""
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 4. Run evaluation
|
||||
# ----------------------------------------------------------------
|
||||
$env:PYTHONIOENCODING = "utf-8"
|
||||
$env:PYTHONPATH = "."
|
||||
|
||||
& python main.py `
|
||||
--scenario $Scenario `
|
||||
--log-file $logFile `
|
||||
--log-level $LogLevel.ToUpper()
|
||||
|
||||
if ($LASTEXITCODE -ne 0) {
|
||||
Write-Host ""
|
||||
Write-Host "[ERROR] Evaluation failed. Check log: $logFile" -ForegroundColor Red
|
||||
Read-Host "Press Enter to exit"
|
||||
exit 1
|
||||
}
|
||||
|
||||
Write-Host ""
|
||||
Write-Host "============================================================" -ForegroundColor Green
|
||||
Write-Host " Evaluation complete!" -ForegroundColor Green
|
||||
Write-Host " Log saved to: $logFile" -ForegroundColor Green
|
||||
Write-Host " Open the web console to view results: start.bat" -ForegroundColor Cyan
|
||||
Write-Host "============================================================" -ForegroundColor Green
|
||||
Write-Host ""
|
||||
Read-Host "Press Enter to exit"
|
||||
@@ -9,6 +9,10 @@ metrics:
|
||||
- answer_relevancy
|
||||
- context_recall
|
||||
- context_precision
|
||||
# 可选:鲁棒性 / 端到端指标(数据集已含 ground_truth,取消注释即可启用)
|
||||
# - noise_sensitivity # 鲁棒性:对检索噪声的敏感度
|
||||
# - factual_correctness # 端到端:事实正确性(相对标准答案)
|
||||
# - semantic_similarity # 端到端:语义相似度(embedding,无 LLM 调用)
|
||||
output_dir: ../../outputs/siemens-pdf-offline-smoke
|
||||
runtime:
|
||||
batch_size: 4
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
scenario_name: sample-pdf-question-bank-online
|
||||
mode: online
|
||||
dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv
|
||||
judge_model: deepseek-v4-pro
|
||||
judge_model: qwen3.5-flash
|
||||
embedding_model: text-embedding-v3
|
||||
metrics:
|
||||
- faithfulness
|
||||
- answer_relevancy
|
||||
- context_recall
|
||||
- context_precision
|
||||
- faithfulness
|
||||
- answer_relevancy
|
||||
- context_recall
|
||||
- context_precision
|
||||
output_dir: ../../outputs/online/sample-pdf-question-bank
|
||||
runtime:
|
||||
batch_size: 2
|
||||
@@ -19,4 +19,4 @@ app_adapter:
|
||||
callable: apps.pdf_question_bank.adapter:run
|
||||
static_kwargs:
|
||||
source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl
|
||||
model: deepseek-v4-flash
|
||||
model: glm-5
|
||||
|
||||
@@ -3,20 +3,24 @@ mode: online
|
||||
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
|
||||
judge_model: deepseek-v4-flash
|
||||
embedding_model: text-embedding-v3
|
||||
optimization_advisor: true
|
||||
metrics:
|
||||
- faithfulness
|
||||
- answer_relevancy
|
||||
- context_recall
|
||||
- context_precision
|
||||
- faithfulness
|
||||
- answer_relevancy
|
||||
- context_recall
|
||||
- context_precision
|
||||
- noise_sensitivity
|
||||
- factual_correctness
|
||||
- semantic_similarity
|
||||
output_dir: ../../outputs/online/siemens-pdf-question-bank
|
||||
runtime:
|
||||
batch_size: 4
|
||||
app_concurrency: 4
|
||||
metric_concurrency: 4
|
||||
max_samples: 50
|
||||
batch_size: 3
|
||||
app_concurrency: 3
|
||||
metric_concurrency: 3
|
||||
max_samples: 10
|
||||
app_adapter:
|
||||
type: python
|
||||
callable: apps.siemens_pdf_qa.adapter:run
|
||||
static_kwargs:
|
||||
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
|
||||
model: deepseek-v4-flash
|
||||
model: glm-5
|
||||
|
||||
59
scripts/smoke_advisor.py
Normal file
59
scripts/smoke_advisor.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Offline smoke-check for the advisor module wiring (no network required)."""
|
||||
import math
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from rag_eval.advisor.rules import diagnose
|
||||
from rag_eval.advisor.writer import write_advice, _format_log_summary
|
||||
|
||||
# Simulate score_rows with low faithfulness and high noise_sensitivity
|
||||
rows = [
|
||||
{
|
||||
"sample_id": f"s{i}",
|
||||
"question": f"问题{i}:西门子CT扫描的Flash技术原理是什么?",
|
||||
"answer": f"答案{i}:Flash技术采用双源CT扫描",
|
||||
"ground_truth": f"标准答案{i}:Flash扫描利用双源CT和大螺距实现超低辐射剂量扫描",
|
||||
"faithfulness": 0.3 + i * 0.05,
|
||||
"noise_sensitivity": 0.4 + i * 0.02,
|
||||
"context_recall": 0.75,
|
||||
"semantic_similarity": 0.65,
|
||||
}
|
||||
for i in range(5)
|
||||
]
|
||||
|
||||
diags = diagnose(rows, metrics=["faithfulness", "noise_sensitivity", "context_recall", "semantic_similarity"])
|
||||
print(f"Diagnosed {len(diags)} metric(s):")
|
||||
for d in diags:
|
||||
print(f" {d.metric}: mean={d.mean_score}, severity={d.severity}, low_samples={len(d.low_samples)}")
|
||||
|
||||
assert len(diags) >= 2, f"Expected at least 2 diagnoses, got {len(diags)}"
|
||||
metrics_hit = {d.metric for d in diags}
|
||||
assert "faithfulness" in metrics_hit, "faithfulness should be triggered"
|
||||
assert "noise_sensitivity" in metrics_hit, "noise_sensitivity should be triggered"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = Path(tmp) / "optimization_advice.md"
|
||||
write_advice(
|
||||
diagnoses=diags,
|
||||
llm_markdown="", # fallback mode (no LLM)
|
||||
advice_path=path,
|
||||
scenario_name="smoke-test-siemens",
|
||||
run_id="2026-06-16T00-00-00",
|
||||
judge_model="deepseek-v4-flash",
|
||||
)
|
||||
content = path.read_text(encoding="utf-8")
|
||||
assert "smoke-test-siemens" in content, "scenario name missing from report"
|
||||
assert "faithfulness" in content, "faithfulness missing from report"
|
||||
assert "noise_sensitivity" in content, "noise_sensitivity missing from report"
|
||||
print(f"\nAdvice file ({len(content)} chars) — assertions OK")
|
||||
|
||||
# Verify log summary format
|
||||
summary = _format_log_summary(diags, Path("optimization_advice.md"))
|
||||
print(f"\nLog summary length: {len(summary)} chars, faithfulness present: {'faithfulness' in summary}")
|
||||
assert "触发诊断" in summary
|
||||
assert "faithfulness" in summary
|
||||
|
||||
print("\nSmoke check PASSED")
|
||||
14
start.bat
14
start.bat
@@ -56,7 +56,17 @@ if errorlevel 1 (
|
||||
)
|
||||
|
||||
:: ----------------------------------------------------------------
|
||||
:: 4. Seed demo data if no runs exist yet
|
||||
:: 4. Ensure configs/ directory exists for LLM profile storage
|
||||
:: ----------------------------------------------------------------
|
||||
if not exist "configs" (
|
||||
mkdir configs
|
||||
echo [OK] Created configs/ directory for LLM profile storage.
|
||||
) else (
|
||||
echo [OK] configs/ directory ready.
|
||||
)
|
||||
|
||||
:: ----------------------------------------------------------------
|
||||
:: 5. Seed demo data if no runs exist yet
|
||||
:: ----------------------------------------------------------------
|
||||
if not exist "outputs\kba-knowledge-base-offline-baseline" (
|
||||
echo [INFO] No run data found. Generating demo data...
|
||||
@@ -71,7 +81,7 @@ if not exist "outputs\kba-knowledge-base-offline-baseline" (
|
||||
)
|
||||
|
||||
:: ----------------------------------------------------------------
|
||||
:: 5. Pick an available port
|
||||
:: 6. Pick an available port
|
||||
:: ----------------------------------------------------------------
|
||||
set PORT=8800
|
||||
netstat -ano 2>nul | findstr ":8800" | findstr "LISTENING" >nul 2>&1
|
||||
|
||||
14
start.ps1
14
start.ps1
@@ -58,7 +58,17 @@ if ($LASTEXITCODE -ne 0) {
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 4. Seed demo data if missing
|
||||
# 4. Ensure configs/ directory exists for LLM profile storage
|
||||
# ----------------------------------------------------------------
|
||||
if (-not (Test-Path "configs")) {
|
||||
New-Item -ItemType Directory "configs" | Out-Null
|
||||
Write-Host "[OK] Created configs/ directory for LLM profile storage." -ForegroundColor Green
|
||||
} else {
|
||||
Write-Host "[OK] configs/ directory ready." -ForegroundColor Green
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 5. Seed demo data if missing
|
||||
# ----------------------------------------------------------------
|
||||
if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
|
||||
Write-Host "[INFO] No run data found. Generating demo data..." -ForegroundColor Yellow
|
||||
@@ -73,7 +83,7 @@ if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
|
||||
}
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# 5. Pick an available port
|
||||
# 6. Pick an available port
|
||||
# ----------------------------------------------------------------
|
||||
$PORT = 8800
|
||||
$inUse = netstat -ano 2>$null | Select-String ":$PORT\s" | Select-String "LISTENING"
|
||||
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
72
tests/test_advisor_rules.py
Normal file
72
tests/test_advisor_rules.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import math
|
||||
import unittest
|
||||
from rag_eval.advisor.rules import Diagnosis, diagnose, METRIC_RULES
|
||||
|
||||
|
||||
class TestDiagnosis(unittest.TestCase):
    """Checks ``diagnose`` against synthetic score rows for individual metrics."""

    def _make_rows(self, metric: str, scores: list[float]) -> list[dict]:
        """Build one synthetic sample row per entry in ``scores``."""
        rows = []
        for idx, score in enumerate(scores):
            rows.append({
                metric: score,
                "question": f"q{idx}",
                "answer": f"a{idx}",
                "ground_truth": f"gt{idx}",
                "sample_id": f"s{idx}",
            })
        return rows

    def _diagnose_single(self, metric: str, scores: list[float]):
        """Run ``diagnose`` for one metric over rows built from ``scores``."""
        return diagnose(self._make_rows(metric, scores), metrics=[metric])

    def test_no_diagnosis_when_all_scores_above_threshold(self):
        outcome = self._diagnose_single("faithfulness", [0.8, 0.9, 0.85])
        self.assertEqual(outcome, [])

    def test_warning_when_mean_below_warning_threshold(self):
        outcome = self._diagnose_single("faithfulness", [0.65, 0.62, 0.68])
        self.assertEqual(len(outcome), 1)
        first = outcome[0]
        self.assertEqual(first.metric, "faithfulness")
        self.assertEqual(first.severity, "warning")
        self.assertAlmostEqual(first.mean_score, 0.65, places=2)

    def test_critical_when_mean_below_critical_threshold(self):
        outcome = self._diagnose_single("faithfulness", [0.3, 0.4, 0.45])
        self.assertEqual(outcome[0].severity, "critical")

    def test_low_samples_selected_are_bottom_three(self):
        outcome = self._diagnose_single("faithfulness", [0.1, 0.2, 0.3, 0.8, 0.9])
        picked = outcome[0].low_samples
        self.assertEqual(len(picked), 3)
        picked_scores = sorted(sample["faithfulness"] for sample in picked)
        self.assertEqual(picked_scores, [0.1, 0.2, 0.3])

    def test_nan_scores_excluded_from_mean_and_low_samples(self):
        outcome = self._diagnose_single("faithfulness", [0.3, float("nan"), 0.4])
        self.assertEqual(len(outcome), 1)
        for sample in outcome[0].low_samples:
            self.assertFalse(math.isnan(sample["faithfulness"]))

    def test_noise_sensitivity_direction_inverted(self):
        # noise_sensitivity is a higher-is-worse metric; these scores are
        # expected to be high enough to trigger a diagnosis.
        outcome = self._diagnose_single("noise_sensitivity", [0.4, 0.45, 0.5])
        self.assertEqual(len(outcome), 1)
        self.assertEqual(outcome[0].metric, "noise_sensitivity")

    def test_noise_sensitivity_no_diagnosis_when_low(self):
        outcome = self._diagnose_single("noise_sensitivity", [0.1, 0.15, 0.2])
        self.assertEqual(outcome, [])

    def test_skips_metric_not_in_rows(self):
        single_row = {
            "faithfulness": 0.3,
            "question": "q",
            "answer": "a",
            "ground_truth": "gt",
            "sample_id": "s1",
        }
        outcome = diagnose([single_row], metrics=["faithfulness", "context_recall"])
        reported = [diag.metric for diag in outcome]
        self.assertIn("faithfulness", reported)
        self.assertNotIn("context_recall", reported)

    def test_all_seven_metrics_have_rules(self):
        required = {
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
            "noise_sensitivity",
            "factual_correctness",
            "semantic_similarity",
        }
        self.assertEqual(set(METRIC_RULES.keys()), required)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
113
tests/test_advisor_writer.py
Normal file
113
tests/test_advisor_writer.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import shutil
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from rag_eval.advisor.rules import Diagnosis
|
||||
from rag_eval.advisor.writer import write_advice, _format_log_summary
|
||||
|
||||
|
||||
class TestWriteAdvice(unittest.TestCase):
    """Tests for ``write_advice`` (report file output) and ``_format_log_summary``."""

    def setUp(self):
        # Use a unique OS temp directory instead of a fixed path relative to
        # the current working directory: the tests then pass from any cwd,
        # never write into the repository tree, and cannot collide when run
        # in parallel. Imported locally because the module header does not
        # import tempfile.
        import tempfile

        self.tmp = Path(tempfile.mkdtemp(prefix="test_advisor_writer_"))
        self.advice_path = self.tmp / "optimization_advice.md"

    def tearDown(self):
        shutil.rmtree(self.tmp, ignore_errors=True)

    def _make_diagnosis(self, metric="faithfulness", severity="warning"):
        """Return a ``Diagnosis`` with fixed scores and a single low sample."""
        return Diagnosis(
            metric=metric,
            mean_score=0.55,
            threshold=0.7,
            severity=severity,
            root_causes=["原因1", "原因2"],
            suggested_actions=["建议1", "建议2"],
            low_samples=[
                {"sample_id": "s1", "question": "问题1", "answer": "答案1",
                 "ground_truth": "标准1", metric: 0.4},
            ],
        )

    def _write(self, diagnoses, llm_markdown, scenario_name="test",
               run_id="rid", judge_model="model"):
        """Call ``write_advice`` targeting this test's advice file."""
        write_advice(
            diagnoses=diagnoses,
            llm_markdown=llm_markdown,
            advice_path=self.advice_path,
            scenario_name=scenario_name,
            run_id=run_id,
            judge_model=judge_model,
        )

    def _read(self) -> str:
        """Return the advice file content decoded as UTF-8."""
        return self.advice_path.read_text(encoding="utf-8")

    def test_write_creates_file(self):
        self._write(
            [self._make_diagnosis()],
            "## faithfulness\n\nLLM 建议内容",
            scenario_name="test-scenario",
            run_id="2026-01-01T00-00-00",
            judge_model="deepseek-v4-flash",
        )
        self.assertTrue(self.advice_path.exists())

    def test_write_contains_scenario_name_and_run_id(self):
        self._write(
            [self._make_diagnosis()],
            "## faithfulness\n\nLLM 建议",
            scenario_name="siemens-test",
            run_id="2026-01-01T00-00-00",
            judge_model="deepseek-v4-flash",
        )
        content = self._read()
        self.assertIn("siemens-test", content)
        self.assertIn("2026-01-01T00-00-00", content)

    def test_write_contains_llm_markdown(self):
        self._write([self._make_diagnosis()], "## faithfulness\n\n具体建议文本")
        self.assertIn("具体建议文本", self._read())

    def test_write_fallback_when_no_llm_markdown(self):
        """When llm_markdown is empty, writer emits rule-only report."""
        self._write([self._make_diagnosis()], "")
        content = self._read()
        self.assertIn("faithfulness", content)
        self.assertIn("原因1", content)

    def test_log_summary_format(self):
        diags = [
            self._make_diagnosis("faithfulness", "critical"),
            self._make_diagnosis("context_recall", "warning"),
        ]
        summary = _format_log_summary(diags, self.advice_path)
        for expected in ("faithfulness", "critical", "context_recall", "warning"):
            self.assertIn(expected, summary)

    def test_write_empty_diagnoses_still_creates_file(self):
        self._write([], "")
        self.assertTrue(self.advice_path.exists())
        self.assertIn("未发现明显指标异常", self._read())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
0
tests/webapp/__init__.py
Normal file
0
tests/webapp/__init__.py
Normal file
139
tests/webapp/test_llm_profiles_api.py
Normal file
139
tests/webapp/test_llm_profiles_api.py
Normal file
@@ -0,0 +1,139 @@
|
||||
"""Integration tests for /api/llm-profiles endpoints."""
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
|
||||
@pytest.fixture()
def client(tmp_path, monkeypatch):
    """Yield-free fixture: a TestClient whose routes use a temp-file ProfileManager.

    The module-level ``profile_manager`` is replaced both in the service module
    and in the API module, so no route touches the real profile store.
    """
    import webapp.services.profile_manager as service_module
    from webapp.services.profile_manager import ProfileManager

    isolated = ProfileManager(store_path=tmp_path / "profiles.json")
    monkeypatch.setattr(service_module, "profile_manager", isolated)

    import webapp.api.llm_profiles as routes_module
    monkeypatch.setattr(routes_module, "profile_manager", isolated)

    from webapp.server import create_app

    app = create_app()
    return TestClient(app)
|
||||
|
||||
|
||||
def test_list_empty(client):
    """A fresh store returns an empty profile list."""
    listing = client.get("/api/llm-profiles")
    assert listing.status_code == 200
    assert listing.json()["profiles"] == []
|
||||
|
||||
|
||||
def test_create_and_list(client):
    """POST creates a profile, which then appears in the listing."""
    payload = dict(name="Test", model="m1", base_url="http://x/v1", api_key="k")
    created = client.post("/api/llm-profiles", json=payload)
    assert created.status_code == 201
    record = created.json()
    assert record["name"] == "Test"
    assert record["profile_id"] != ""

    listing = client.get("/api/llm-profiles")
    assert len(listing.json()["profiles"]) == 1
|
||||
|
||||
|
||||
def test_update_profile(client):
    """PUT replaces the fields of an existing profile."""
    initial = dict(name="Old", model="m1", base_url="http://x/v1", api_key="k")
    profile_id = client.post("/api/llm-profiles", json=initial).json()["profile_id"]

    changes = dict(name="New", model="m2", base_url="http://x/v1", api_key="k",
                   timeout_seconds=60)
    response = client.put(f"/api/llm-profiles/{profile_id}", json=changes)
    assert response.status_code == 200
    assert response.json()["name"] == "New"
    assert response.json()["timeout_seconds"] == 60
|
||||
|
||||
|
||||
def test_delete_profile(client):
    """DELETE removes the profile and the listing becomes empty again."""
    payload = dict(name="Del", model="m", base_url="http://x/v1", api_key="k")
    profile_id = client.post("/api/llm-profiles", json=payload).json()["profile_id"]

    response = client.delete(f"/api/llm-profiles/{profile_id}")
    assert response.status_code == 200
    assert response.json()["deleted"] is True

    remaining = client.get("/api/llm-profiles").json()["profiles"]
    assert len(remaining) == 0
|
||||
|
||||
|
||||
def test_update_nonexistent(client):
    """PUT on an unknown profile id answers 404."""
    payload = dict(name="X", model="m", base_url="http://x/v1", api_key="k")
    response = client.put("/api/llm-profiles/nope", json=payload)
    assert response.status_code == 404
|
||||
|
||||
|
||||
def test_delete_nonexistent(client):
    """DELETE on an unknown profile id answers 404."""
    response = client.delete("/api/llm-profiles/nope")
    assert response.status_code == 404
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# YAML patcher tests
|
||||
# ---------------------------------------------------------------------------
|
||||
import yaml as yaml_lib
|
||||
from webapp.services.yaml_patcher import apply_profiles_to_scenario
|
||||
from webapp.models import LLMProfile
|
||||
|
||||
|
||||
def test_apply_judge_profile(tmp_path):
    """Applying a judge profile patches judge_model in the YAML."""
    scenario_file = tmp_path / "test-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: old-model\nembedding_model: emb\n"
        "dataset: data.csv\nmetrics:\n- faithfulness\noutput_dir: outputs/test\n",
        encoding="utf-8",
    )
    judge_p = LLMProfile(
        profile_id="x", name="J", model="new-model",
        base_url="http://x/v1", api_key="k", created_at="t", updated_at="t",
    )
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=judge_p,
        answer_profile=None,
        dataset_profile=None,
        _resolve_absolute=True,
    )
    assert "judge_model" in patched
    # Read back with the same explicit encoding the fixture was written with,
    # so the check does not depend on the platform's default locale encoding.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert data["judge_model"] == "new-model"
|
||||
|
||||
|
||||
def test_apply_answer_profile(tmp_path):
    """Applying an answer profile patches app_adapter.static_kwargs.model."""
    scenario_file = tmp_path / "online.yaml"
    scenario_file.write_text(
        "scenario_name: online\nmode: online\njudge_model: j\nembedding_model: emb\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n"
        "app_adapter:\n  type: python\n  callable: apps.foo:run\n"
        "  static_kwargs:\n    model: old\n    source_chunks_path: chunks.jsonl\n",
        encoding="utf-8",
    )
    answer_p = LLMProfile(
        profile_id="y", name="A", model="new-answer-model",
        base_url="http://x/v1", api_key="k", created_at="t", updated_at="t",
    )
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None,
        answer_profile=answer_p,
        dataset_profile=None,
        _resolve_absolute=True,
    )
    assert "app_adapter.static_kwargs.model" in patched
    # Read back with the same explicit encoding the fixture was written with,
    # so the check does not depend on the platform's default locale encoding.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert data["app_adapter"]["static_kwargs"]["model"] == "new-answer-model"
|
||||
|
||||
|
||||
def test_apply_no_profiles_returns_empty(tmp_path):
    """With every role left as None the patcher reports no patched fields."""
    target = tmp_path / "noop.yaml"
    target.write_text("scenario_name: noop\njudge_model: m\n", encoding="utf-8")

    no_profiles = dict(judge_profile=None, answer_profile=None, dataset_profile=None)
    result = apply_profiles_to_scenario(
        scenario_path=str(target),
        _resolve_absolute=True,
        **no_profiles,
    )
    assert result == []
|
||||
100
tests/webapp/test_profile_manager.py
Normal file
100
tests/webapp/test_profile_manager.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import pytest
|
||||
from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse
|
||||
|
||||
def test_llm_profile_defaults():
    """Omitted fields fall back to a 30s timeout and non-empty timestamps."""
    required_only = {
        "profile_id": "abc",
        "name": "Test",
        "model": "gpt-4",
        "base_url": "http://localhost/v1",
        "api_key": "sk-test",
    }
    profile = LLMProfile(**required_only)
    assert profile.timeout_seconds == 30
    assert profile.created_at != ""
    assert profile.updated_at != ""
|
||||
|
||||
def test_profile_apply_request_fields():
    """Role ids are stored as given, including an explicit None."""
    request = ProfileApplyRequest(
        scenario_path="scenarios/offline/sample.yaml",
        judge_profile_id="id1",
        answer_profile_id="id2",
        dataset_profile_id=None,
    )
    assert request.judge_profile_id == "id1"
    assert request.dataset_profile_id is None
|
||||
|
||||
def test_profile_apply_response():
    """The response model keeps the list of patched field names."""
    response = ProfileApplyResponse(
        scenario_path="scenarios/offline/sample.yaml",
        patched_fields=["judge_model"],
    )
    assert "judge_model" in response.patched_fields
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ProfileManager service tests
|
||||
# ---------------------------------------------------------------------------
|
||||
import json
|
||||
from webapp.services.profile_manager import ProfileManager
|
||||
|
||||
|
||||
def _make_manager(tmp_path):
    """Return a ProfileManager persisting to ``profiles.json`` under ``tmp_path``."""
    return ProfileManager(store_path=tmp_path / "profiles.json")
|
||||
|
||||
|
||||
def test_create_profile(tmp_path):
    """create() returns a profile with a generated id and the given name."""
    manager = _make_manager(tmp_path)
    created = manager.create(
        name="Local",
        model="deepseek-v4-flash",
        base_url="http://localhost/v1",
        api_key="sk-x",
    )
    assert created.profile_id != ""
    assert created.name == "Local"
|
||||
|
||||
|
||||
def test_list_profiles(tmp_path):
    """list_all() returns every created profile."""
    manager = _make_manager(tmp_path)
    for name, model, url, key in (
        ("A", "m1", "http://a/v1", "k1"),
        ("B", "m2", "http://b/v1", "k2"),
    ):
        manager.create(name=name, model=model, base_url=url, api_key=key)
    assert len(manager.list_all()) == 2
|
||||
|
||||
|
||||
def test_get_profile(tmp_path):
    """get() finds a profile by the id create() handed out."""
    manager = _make_manager(tmp_path)
    stored = manager.create(name="X", model="m", base_url="http://x/v1", api_key="k")
    found = manager.get(stored.profile_id)
    assert found is not None
    assert found.name == "X"
|
||||
|
||||
|
||||
def test_update_profile(tmp_path):
    """update() returns the profile carrying the new field values."""
    manager = _make_manager(tmp_path)
    original = manager.create(name="Old", model="m", base_url="http://x/v1", api_key="k")
    result = manager.update(
        original.profile_id,
        name="New",
        model="m2",
        base_url="http://x/v1",
        api_key="k",
        timeout_seconds=60,
    )
    assert result is not None
    assert result.name == "New"
    assert result.model == "m2"
    assert result.timeout_seconds == 60
|
||||
|
||||
|
||||
def test_delete_profile(tmp_path):
    """delete() reports True and the profile can no longer be fetched."""
    manager = _make_manager(tmp_path)
    doomed = manager.create(name="Del", model="m", base_url="http://x/v1", api_key="k")
    assert manager.delete(doomed.profile_id) is True
    assert manager.get(doomed.profile_id) is None
|
||||
|
||||
|
||||
def test_persistence(tmp_path):
    """A second manager on the same store file sees previously created profiles."""
    store_file = tmp_path / "profiles.json"
    writer = ProfileManager(store_path=store_file)
    saved = writer.create(name="Persist", model="m", base_url="http://x/v1", api_key="k")

    reader = ProfileManager(store_path=store_file)
    assert reader.get(saved.profile_id) is not None
|
||||
|
||||
|
||||
def test_get_nonexistent(tmp_path):
    """get() yields None for an id that was never created."""
    manager = _make_manager(tmp_path)
    lookup = manager.get("does-not-exist")
    assert lookup is None
|
||||
|
||||
|
||||
def test_delete_nonexistent(tmp_path):
    """delete() reports False for an id that was never created."""
    manager = _make_manager(tmp_path)
    outcome = manager.delete("does-not-exist")
    assert outcome is False
|
||||
96
webapp/api/llm_profiles.py
Normal file
96
webapp/api/llm_profiles.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""CRUD routes for LLM profiles plus the scenario-patching apply endpoint."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
from webapp.models import (
|
||||
CreateProfileRequest,
|
||||
LLMProfile,
|
||||
ProfileApplyRequest,
|
||||
ProfileApplyResponse,
|
||||
)
|
||||
from webapp.services.profile_manager import profile_manager
|
||||
from webapp.services.yaml_patcher import apply_profiles_to_scenario
|
||||
|
||||
router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"])
|
||||
|
||||
|
||||
@router.get("", response_model=dict)
def list_profiles() -> dict:
    """List every stored LLM profile under the ``profiles`` key."""
    serialized = []
    for profile in profile_manager.list_all():
        serialized.append(profile.model_dump())
    return {"profiles": serialized}
|
||||
|
||||
|
||||
@router.post("", status_code=201, response_model=LLMProfile)
def create_profile(request: CreateProfileRequest) -> LLMProfile:
    """Store a new LLM profile built from the request body and return it."""
    fields = {
        "name": request.name,
        "model": request.model,
        "base_url": request.base_url,
        "api_key": request.api_key,
        "timeout_seconds": request.timeout_seconds,
    }
    return profile_manager.create(**fields)
|
||||
|
||||
|
||||
@router.put("/{profile_id}", response_model=LLMProfile)
def update_profile(profile_id: str, request: CreateProfileRequest) -> LLMProfile:
    """Overwrite the profile identified by ``profile_id``; 404 when it is unknown."""
    fields = {
        "name": request.name,
        "model": request.model,
        "base_url": request.base_url,
        "api_key": request.api_key,
        "timeout_seconds": request.timeout_seconds,
    }
    result = profile_manager.update(profile_id=profile_id, **fields)
    if result is not None:
        return result
    raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
|
||||
|
||||
|
||||
@router.delete("/{profile_id}", response_model=dict)
def delete_profile(profile_id: str) -> dict:
    """Remove the profile identified by ``profile_id``; 404 when it is unknown."""
    if profile_manager.delete(profile_id):
        return {"deleted": True}
    raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
|
||||
|
||||
|
||||
@router.post("/apply", response_model=ProfileApplyResponse)
def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
    """Resolve the requested role profiles and patch them into the scenario YAML.

    A role whose id is empty or None is left untouched. If any supplied id does
    not resolve to a stored profile, the request is rejected with HTTP 400
    naming the affected roles, and nothing is patched.
    """
    requested = (
        ("judge", request.judge_profile_id),
        ("answer", request.answer_profile_id),
        ("dataset", request.dataset_profile_id),
    )

    resolved: dict[str, LLMProfile | None] = {}
    unresolved: list[str] = []
    for role, pid in requested:
        profile = profile_manager.get(pid) if pid else None
        resolved[role] = profile
        if pid and profile is None:
            unresolved.append(role)

    if unresolved:
        raise HTTPException(
            status_code=400,
            detail=f"Profile(s) not found for roles: {', '.join(unresolved)}",
        )

    patched_fields = apply_profiles_to_scenario(
        scenario_path=request.scenario_path,
        judge_profile=resolved["judge"],
        answer_profile=resolved["answer"],
        dataset_profile=resolved["dataset"],
    )
    return ProfileApplyResponse(
        scenario_path=request.scenario_path,
        patched_fields=patched_fields,
    )
|
||||
@@ -2,11 +2,16 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
def _utcnow_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
class RunSummary(BaseModel):
|
||||
"""Compact description of a single evaluation run for list views."""
|
||||
|
||||
@@ -68,6 +73,7 @@ class ReportData(BaseModel):
|
||||
groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
|
||||
lowest_samples: list[SampleScore] = Field(default_factory=list)
|
||||
summary_markdown: str = ""
|
||||
advice_markdown: str = "" # optimization_advice.md content (empty if not generated)
|
||||
|
||||
|
||||
class RunDetail(BaseModel):
|
||||
@@ -114,6 +120,45 @@ class TriggerEvaluationResponse(BaseModel):
|
||||
task_id: str
|
||||
|
||||
|
||||
class LLMProfile(BaseModel):
    """Stored, named LLM connection settings identified by ``profile_id``."""

    profile_id: str
    name: str
    model: str
    base_url: str
    api_key: str
    timeout_seconds: int = Field(default=30)
    # Both timestamps default to the creation moment as ISO-8601 UTC strings.
    created_at: str = Field(default_factory=_utcnow_iso)
    updated_at: str = Field(default_factory=_utcnow_iso)
|
||||
|
||||
|
||||
class CreateProfileRequest(BaseModel):
    """Payload accepted by both the create and the update profile routes."""

    name: str
    model: str
    base_url: str
    api_key: str
    timeout_seconds: int = Field(default=30)
|
||||
|
||||
|
||||
class ProfileApplyRequest(BaseModel):
    """Payload selecting which stored profiles to patch into a scenario YAML."""

    scenario_path: str
    # Each role id is optional; None means that role is not selected.
    judge_profile_id: str | None = Field(default=None)
    answer_profile_id: str | None = Field(default=None)
    dataset_profile_id: str | None = Field(default=None)
|
||||
|
||||
|
||||
class ProfileApplyResponse(BaseModel):
    """Result of a profile-apply call: the scenario path and the patched field names."""

    scenario_path: str
    patched_fields: list[str] = Field(default_factory=lambda: [])
|
||||
|
||||
|
||||
def jsonable(value: Any) -> Any:
|
||||
"""Convert NaN/inf floats into None so the payload stays valid JSON."""
|
||||
import math
|
||||
|
||||
@@ -13,7 +13,7 @@ from fastapi import FastAPI
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from webapp.api import evaluations, runs, scenarios
|
||||
from webapp.api import evaluations, llm_profiles, runs, scenarios
|
||||
|
||||
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||
|
||||
@@ -29,6 +29,7 @@ def create_app() -> FastAPI:
|
||||
app.include_router(runs.router)
|
||||
app.include_router(scenarios.router)
|
||||
app.include_router(evaluations.router)
|
||||
app.include_router(llm_profiles.router)
|
||||
|
||||
@app.get("/api/health", tags=["meta"])
|
||||
def health() -> dict[str, str]:
|
||||
|
||||
137
webapp/services/profile_manager.py
Normal file
137
webapp/services/profile_manager.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""In-memory + JSON-file LLM profile manager.
|
||||
|
||||
Profiles are kept in a dict keyed by profile_id and written to a JSON file
|
||||
on every mutation, so they survive server restarts. The pattern mirrors
|
||||
TaskManager but without threading concerns beyond a simple lock (profiles
|
||||
are only mutated by API calls in FastAPI request handlers).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import threading
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from webapp.models import LLMProfile
|
||||
|
||||
|
||||
_DEFAULT_STORE = Path(__file__).resolve().parents[2] / "configs" / "llm_profiles.json"
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
class ProfileManager:
    """Manages LLM profiles with an in-memory cache and JSON file persistence.

    Profiles live in a dict keyed by ``profile_id``. Every mutation rewrites
    the JSON store file while holding ``self._lock``. Note that profiles,
    including their ``api_key`` values, are written to the store as plain text.
    """

    def __init__(self, store_path: Path = _DEFAULT_STORE) -> None:
        """Create a manager backed by ``store_path`` and load any existing profiles."""
        self._store_path = store_path
        self._lock = threading.Lock()
        self._profiles: dict[str, LLMProfile] = {}
        self._load()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def list_all(self) -> list[LLMProfile]:
        """Return all profiles sorted by creation time."""
        with self._lock:
            return sorted(self._profiles.values(), key=lambda p: p.created_at)

    def get(self, profile_id: str) -> LLMProfile | None:
        """Return one profile by id, or None if not found."""
        with self._lock:
            return self._profiles.get(profile_id)

    def create(
        self,
        name: str,
        model: str,
        base_url: str,
        api_key: str,
        timeout_seconds: int = 30,
    ) -> LLMProfile:
        """Create and persist a new profile, returning it.

        The id is the first 12 hex characters of a random UUID; ``created_at``
        and ``updated_at`` are both set to the same current UTC timestamp.
        """
        now = _now_iso()
        profile = LLMProfile(
            profile_id=uuid.uuid4().hex[:12],
            name=name,
            model=model,
            base_url=base_url,
            api_key=api_key,
            timeout_seconds=timeout_seconds,
            created_at=now,
            updated_at=now,
        )
        with self._lock:
            self._profiles[profile.profile_id] = profile
            self._persist()
        return profile

    def update(
        self,
        profile_id: str,
        name: str,
        model: str,
        base_url: str,
        api_key: str,
        timeout_seconds: int = 30,
    ) -> LLMProfile | None:
        """Replace the editable fields of an existing profile.

        ``profile_id`` and ``created_at`` are kept; ``updated_at`` is refreshed.
        Returns the updated profile, or None if ``profile_id`` is unknown.
        """
        with self._lock:
            existing = self._profiles.get(profile_id)
            if existing is None:
                return None
            updated = existing.model_copy(update={
                "name": name,
                "model": model,
                "base_url": base_url,
                "api_key": api_key,
                "timeout_seconds": timeout_seconds,
                "updated_at": _now_iso(),
            })
            self._profiles[profile_id] = updated
            self._persist()
        return updated

    def delete(self, profile_id: str) -> bool:
        """Remove a profile; returns True if deleted, False if not found."""
        with self._lock:
            if profile_id not in self._profiles:
                return False
            del self._profiles[profile_id]
            self._persist()
        return True

    # ------------------------------------------------------------------ #
    # Persistence helpers
    # ------------------------------------------------------------------ #

    def _load(self) -> None:
        """Load profiles from the JSON store file.

        A missing, unreadable or malformed store leaves the manager empty.
        Entries are validated one at a time so that a single invalid entry is
        skipped instead of discarding every entry after it. Problems are
        logged as warnings rather than silently ignored.
        """
        import logging  # local import: only needed on the load path

        if not self._store_path.exists():
            return
        log = logging.getLogger(__name__)

        try:
            data = json.loads(self._store_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSON syntax errors and decoding errors.
            log.warning(
                "Could not read LLM profile store %s (%s); starting empty",
                self._store_path, type(exc).__name__,
            )
            return

        entries = data.get("profiles", []) if isinstance(data, dict) else None
        if not isinstance(entries, list):
            log.warning(
                "LLM profile store %s has an unexpected layout; starting empty",
                self._store_path,
            )
            return

        for index, raw in enumerate(entries):
            try:
                profile = LLMProfile.model_validate(raw)
            except Exception as exc:  # noqa: BLE001
                # Log only the exception type: validation messages can echo
                # the rejected input, which may contain an API key.
                log.warning(
                    "Skipping invalid LLM profile entry #%d in %s (%s)",
                    index, self._store_path, type(exc).__name__,
                )
                continue
            self._profiles[profile.profile_id] = profile

    def _persist(self) -> None:
        """Write current profiles to the JSON store file (called under lock).

        The payload is written to a sibling temporary file and then renamed
        over the store, so an interrupted write cannot leave a truncated
        store behind.
        """
        self._store_path.parent.mkdir(parents=True, exist_ok=True)
        payload = {"profiles": [p.model_dump() for p in self._profiles.values()]}
        serialized = json.dumps(payload, ensure_ascii=False, indent=2)
        tmp_path = self._store_path.with_name(self._store_path.name + ".tmp")
        tmp_path.write_text(serialized, encoding="utf-8")
        tmp_path.replace(self._store_path)
|
||||
|
||||
|
||||
# Module-level singleton shared by FastAPI routes.
|
||||
profile_manager = ProfileManager()
|
||||
@@ -164,12 +164,14 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
||||
"""Build the full aggregated report payload for one run directory."""
|
||||
frame = run_reader.read_scores_frame(run_dir)
|
||||
summary_markdown = run_reader.read_summary_markdown(run_dir)
|
||||
advice_markdown = run_reader.read_advice_markdown(run_dir)
|
||||
|
||||
if frame.empty or not metrics:
|
||||
return ReportData(
|
||||
metrics=metrics,
|
||||
metric_means={metric: None for metric in metrics},
|
||||
summary_markdown=summary_markdown,
|
||||
advice_markdown=advice_markdown,
|
||||
)
|
||||
|
||||
distributions = {
|
||||
@@ -185,4 +187,5 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
||||
groupings=_groupings(frame, metrics),
|
||||
lowest_samples=_lowest_samples(frame, metrics),
|
||||
summary_markdown=summary_markdown,
|
||||
advice_markdown=advice_markdown,
|
||||
)
|
||||
|
||||
@@ -220,3 +220,14 @@ def read_summary_markdown(run_dir: Path) -> str:
|
||||
return summary_path.read_text(encoding="utf-8")
|
||||
except OSError:
|
||||
return ""
|
||||
|
||||
|
||||
def read_advice_markdown(run_dir: Path) -> str:
    """Return the text of ``optimization_advice.md`` inside *run_dir*.

    Returns an empty string when the file does not exist, cannot be read, or
    is not valid UTF-8, so an unusable advice file degrades to "no advice"
    instead of raising.
    """
    advice_path = run_dir / "optimization_advice.md"
    if not advice_path.is_file():
        return ""
    try:
        return advice_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # listed explicitly to keep a mis-encoded file from escaping.
        return ""
|
||||
|
||||
74
webapp/services/yaml_patcher.py
Normal file
74
webapp/services/yaml_patcher.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""Patch LLM profile settings into scenario YAML files in-place.
|
||||
|
||||
Only the fields that correspond to a provided (non-None) profile are touched.
|
||||
All other fields and structure are preserved as much as PyYAML allows
|
||||
(comments are lost on round-trip, which is an accepted trade-off).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
from webapp.models import LLMProfile
|
||||
|
||||
|
||||
def _repo_root() -> Path:
    """Return the third ancestor directory of this module's resolved path."""
    module_path = Path(__file__).resolve()
    ancestors = module_path.parents
    return ancestors[2]
|
||||
|
||||
|
||||
def _resolve_scenario_path(path_str: str) -> Path:
    """Convert a scenario path string into a ``Path``.

    An absolute input is returned untouched (it is not resolved); a relative
    input is joined onto the repository root and then resolved.
    """
    # NOTE(review): nothing here checks that the result stays inside the
    # repository - confirm callers validate user-supplied paths.
    path = Path(path_str)
    if not path.is_absolute():
        return (_repo_root() / path).resolve()
    return path
|
||||
|
||||
|
||||
def apply_profiles_to_scenario(
    scenario_path: str,
    judge_profile: LLMProfile | None,
    answer_profile: LLMProfile | None,
    dataset_profile: LLMProfile | None,
    _resolve_absolute: bool = False,
) -> list[str]:
    """Patch the YAML file at *scenario_path* with the supplied profiles.

    Only the ``model`` of each non-None profile is written:

    * ``judge_profile``   -> ``judge_model`` (always set)
    * ``answer_profile``  -> ``app_adapter.static_kwargs.model`` (only when
      ``app_adapter`` is a mapping)
    * ``dataset_profile`` -> ``generation.model`` (only when ``generation``
      is a mapping)

    The file is rewritten only if at least one field was patched, so a call
    that changes nothing no longer strips the file's comments.

    Setting *_resolve_absolute=True* skips repo-root resolution (used in tests).

    Returns:
        The dotted names of the fields that were actually patched.

    Raises:
        FileNotFoundError: the scenario file does not exist.
        TypeError: the YAML document's top level is not a mapping.
    """
    if _resolve_absolute:
        resolved = Path(scenario_path)
    else:
        resolved = _resolve_scenario_path(scenario_path)

    if not resolved.exists():
        raise FileNotFoundError(f"Scenario file not found: {resolved}")

    # An empty document loads as None; treat it as an empty mapping.
    data: dict[str, Any] = yaml.safe_load(resolved.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise TypeError(
            f"Scenario file must contain a YAML mapping at the top level: {resolved}"
        )

    patched: list[str] = []

    if judge_profile is not None:
        data["judge_model"] = judge_profile.model
        patched.append("judge_model")

    if answer_profile is not None:
        adapter = data.get("app_adapter")
        if isinstance(adapter, dict):
            static_kwargs = adapter.get("static_kwargs")
            if static_kwargs is None:
                # Key missing, or written as a bare `static_kwargs:` (None).
                static_kwargs = {}
                adapter["static_kwargs"] = static_kwargs
            static_kwargs["model"] = answer_profile.model
            patched.append("app_adapter.static_kwargs.model")

    if dataset_profile is not None:
        generation = data.get("generation")
        if isinstance(generation, dict):
            generation["model"] = dataset_profile.model
            patched.append("generation.model")

    if patched:
        resolved.write_text(
            yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False),
            encoding="utf-8",
        )
    return patched
|
||||
@@ -265,3 +265,69 @@ table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: t
|
||||
.sidebar { width: 64px; }
|
||||
.brand-sub, .nav-item span:not(.nav-ico), .sidebar-foot span:last-child { display: none; }
|
||||
}
|
||||
|
||||
/* ---------- LLM 配置管理页 ---------- */
|
||||
.profile-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(300px, 1fr)); gap: 16px; }
|
||||
.profile-card {
|
||||
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
|
||||
padding: 16px; box-shadow: var(--shadow);
|
||||
}
|
||||
.profile-card-head { display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px; }
|
||||
.profile-card-name { font-size: 15px; font-weight: 600; }
|
||||
.profile-card-actions { display: flex; gap: 6px; }
|
||||
.profile-card-field { font-size: 12px; color: var(--slate); margin-top: 4px; }
|
||||
.field-label { font-weight: 600; color: var(--ink); }
|
||||
|
||||
/* Form */
|
||||
.profile-form { display: flex; flex-direction: column; gap: 12px; margin-top: 14px; max-width: 560px; }
|
||||
.form-row { display: flex; flex-direction: column; gap: 4px; }
|
||||
.form-label { font-size: 13px; font-weight: 600; }
|
||||
.req { color: var(--bad); }
|
||||
.form-input {
|
||||
border: 1px solid var(--line); border-radius: 6px; padding: 8px 10px;
|
||||
font-size: 13px; font-family: inherit; width: 100%;
|
||||
}
|
||||
.form-input:focus { outline: none; border-color: var(--petrol); }
|
||||
.form-input-sm { max-width: 120px; }
|
||||
.form-actions { display: flex; gap: 10px; align-items: center; margin-top: 4px; }
|
||||
.form-error { font-size: 12px; color: var(--bad); }
|
||||
.btn-sm { padding: 4px 10px; font-size: 12px; }
|
||||
.btn-danger { color: var(--bad); border-color: var(--bad); }
|
||||
.btn-danger:hover { background: #fee2e2; }
|
||||
|
||||
/* 选中态 run 卡片 */
|
||||
.run-card.selected {
|
||||
border-color: var(--petrol);
|
||||
box-shadow: 0 0 0 2px rgba(0,153,153,0.25), var(--shadow);
|
||||
}
|
||||
|
||||
/* ---------- LLM 角色配置面板 ---------- */
|
||||
.llm-assignment-panel { border-left: 3px solid var(--petrol); }
|
||||
.llm-role-rows { display: flex; flex-direction: column; gap: 10px; }
|
||||
.llm-role-row { display: flex; align-items: center; gap: 14px; }
|
||||
.llm-role-label { font-size: 13px; font-weight: 600; min-width: 180px; color: var(--ink); }
|
||||
.llm-role-select { min-width: 240px; }
|
||||
|
||||
/* ---------- ⑤ 优化建议面板 ---------- */
|
||||
.advice-panel { border-left: 3px solid #7c3aed; }
|
||||
.advice-header {
|
||||
display: flex; align-items: center; gap: 10px;
|
||||
margin-bottom: 14px;
|
||||
}
|
||||
.advice-badge {
|
||||
background: #7c3aed; color: #fff;
|
||||
font-size: 11px; font-weight: 700; letter-spacing: 0.5px;
|
||||
padding: 3px 8px; border-radius: 4px; text-transform: uppercase;
|
||||
}
|
||||
.advice-model { font-size: 12px; color: var(--slate); }
|
||||
.advice-body { line-height: 1.7; color: var(--ink); }
|
||||
.advice-md h1 { font-size: 16px; font-weight: 700; margin: 16px 0 8px; color: var(--ink); }
|
||||
.advice-md h2 {
|
||||
font-size: 14px; font-weight: 700; margin: 20px 0 8px;
|
||||
padding-bottom: 4px; border-bottom: 1px solid var(--line); color: var(--ink-soft);
|
||||
}
|
||||
.advice-md h3 { font-size: 13px; font-weight: 600; margin: 12px 0 6px; color: var(--slate); }
|
||||
.advice-md hr { border: none; border-top: 1px solid var(--line); margin: 14px 0; }
|
||||
.advice-md ul { padding-left: 20px; margin: 6px 0; }
|
||||
.advice-md li { margin: 3px 0; font-size: 13px; }
|
||||
.advice-md strong { color: var(--ink); font-weight: 600; }
|
||||
|
||||
@@ -22,9 +22,12 @@
|
||||
<button class="nav-item" data-view="new">
|
||||
<span class="nav-ico">+</span><span>新建评估</span>
|
||||
</button>
|
||||
<button class="nav-item" data-view="report" data-requires-run="1">
|
||||
<button class="nav-item" data-view="report" data-requires-run="1" disabled>
|
||||
<span class="nav-ico">▤</span><span>报告详情</span>
|
||||
</button>
|
||||
<button class="nav-item" data-view="profiles">
|
||||
<span class="nav-ico">⚙</span><span>LLM 配置</span>
|
||||
</button>
|
||||
</nav>
|
||||
<div class="sidebar-foot">
|
||||
<span class="dot" id="health-dot"></span>
|
||||
@@ -59,6 +62,33 @@
|
||||
<span class="selected-scenario muted" id="selected-scenario">未选择场景</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- LLM 角色配置面板(选中场景后显示) -->
|
||||
<div class="panel llm-assignment-panel" id="llm-assignment-panel" hidden>
|
||||
<h2>LLM 角色配置 <span class="muted" style="font-size:13px;font-weight:400">(可选)</span></h2>
|
||||
<p class="muted" style="margin-bottom:14px">为不同任务角色选择已保存的 LLM 配置,留空则使用场景文件中的原始配置。</p>
|
||||
<div class="llm-role-rows">
|
||||
<div class="llm-role-row">
|
||||
<label class="llm-role-label">评测打分 Judge LLM</label>
|
||||
<select class="select llm-role-select" id="role-judge">
|
||||
<option value="">— 使用场景原始配置 —</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="llm-role-row">
|
||||
<label class="llm-role-label">生成答案 Answer LLM</label>
|
||||
<select class="select llm-role-select" id="role-answer">
|
||||
<option value="">— 使用场景原始配置 —</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="llm-role-row">
|
||||
<label class="llm-role-label">生成题库 Dataset LLM</label>
|
||||
<select class="select llm-role-select" id="role-dataset">
|
||||
<option value="">— 使用场景原始配置 —</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="panel" id="task-panel" hidden>
|
||||
<div class="task-head">
|
||||
<h2>评估进度</h2>
|
||||
@@ -105,6 +135,68 @@
|
||||
<!-- ④ 最低分样本逐条复核 -->
|
||||
<div class="section-label">④ 最低分样本(点击展开逐条复核)</div>
|
||||
<div class="lowest-table" id="lowest-table"></div>
|
||||
|
||||
<!-- ⑤ 优化建议(optimization_advisor: true 时显示) -->
|
||||
<div id="advice-section" hidden>
|
||||
<div class="section-label">⑤ 优化建议 OPTIMIZATION ADVICE</div>
|
||||
<div class="panel advice-panel">
|
||||
<div class="advice-header">
|
||||
<span class="advice-badge">AI 诊断报告</span>
|
||||
<span class="advice-model" id="advice-model-label"></span>
|
||||
</div>
|
||||
<div class="advice-body" id="advice-body"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- LLM 配置视图 -->
|
||||
<section class="view" id="view-profiles" hidden>
|
||||
<div class="panel">
|
||||
<div class="panel-head">
|
||||
<h2>LLM 配置管理</h2>
|
||||
<button class="btn btn-primary" id="add-profile-btn">+ 新建配置</button>
|
||||
</div>
|
||||
<p class="muted">保存常用 LLM 连接参数,在运行评估时按角色选择。</p>
|
||||
</div>
|
||||
|
||||
<!-- 新建 / 编辑表单(默认隐藏) -->
|
||||
<div class="panel" id="profile-form-panel" hidden>
|
||||
<h2 id="profile-form-title">新建 LLM 配置</h2>
|
||||
<div class="profile-form">
|
||||
<input type="hidden" id="edit-profile-id" />
|
||||
<div class="form-row">
|
||||
<label class="form-label">配置名称 <span class="req">*</span></label>
|
||||
<input class="form-input" id="pf-name" placeholder="例:DeepSeek Flash(内网)" />
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<label class="form-label">模型名称 <span class="req">*</span></label>
|
||||
<input class="form-input" id="pf-model" placeholder="例:deepseek-v4-flash" />
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<label class="form-label">Base URL <span class="req">*</span></label>
|
||||
<input class="form-input" id="pf-base-url" placeholder="例:http://6.86.80.4:30080/v1" />
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<label class="form-label">API Key <span class="req">*</span></label>
|
||||
<input class="form-input" id="pf-api-key" type="password" placeholder="sk-…" />
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<label class="form-label">超时(秒)</label>
|
||||
<input class="form-input form-input-sm" id="pf-timeout" type="number" value="30" min="5" max="300" />
|
||||
</div>
|
||||
<div class="form-actions">
|
||||
<button class="btn btn-primary" id="save-profile-btn">保存</button>
|
||||
<button class="btn" id="cancel-profile-btn">取消</button>
|
||||
<span class="form-error muted" id="profile-form-error"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="profile-cards" class="profile-grid"></div>
|
||||
<div class="empty" id="profiles-empty" hidden>
|
||||
<p>尚未添加任何 LLM 配置。</p>
|
||||
<p class="muted">点击「新建配置」添加第一个。</p>
|
||||
</div>
|
||||
</section>
|
||||
</main>
|
||||
@@ -112,6 +204,7 @@
|
||||
|
||||
<script src="/static/js/api.js"></script>
|
||||
<script src="/static/js/report.js"></script>
|
||||
<script src="/static/js/profiles.js"></script>
|
||||
<script src="/static/js/runner.js"></script>
|
||||
<script src="/static/js/app.js"></script>
|
||||
</body>
|
||||
|
||||
@@ -43,4 +43,26 @@ const API = {
|
||||
return API.post("/api/evaluations", { scenario_path: scenarioPath });
|
||||
},
|
||||
taskStatus(taskId) { return API.get(`/api/evaluations/${encodeURIComponent(taskId)}`); },
|
||||
|
||||
// LLM Profile API
|
||||
profiles() { return API.get("/api/llm-profiles"); },
|
||||
createProfile(body) { return API.post("/api/llm-profiles", body); },
|
||||
updateProfile(id, body) {
|
||||
return fetch(`/api/llm-profiles/${encodeURIComponent(id)}`, {
|
||||
method: "PUT",
|
||||
headers: { "Content-Type": "application/json" },
|
||||
body: JSON.stringify(body),
|
||||
}).then(async r => {
|
||||
if (!r.ok) { const d = await API._extractError(r); throw new Error(d); }
|
||||
return r.json();
|
||||
});
|
||||
},
|
||||
deleteProfile(id) {
|
||||
return fetch(`/api/llm-profiles/${encodeURIComponent(id)}`, { method: "DELETE" })
|
||||
.then(async r => {
|
||||
if (!r.ok) { const d = await API._extractError(r); throw new Error(d); }
|
||||
return r.json();
|
||||
});
|
||||
},
|
||||
applyProfiles(body) { return API.post("/api/llm-profiles/apply", body); },
|
||||
};
|
||||
|
||||
@@ -1,28 +1,59 @@
|
||||
// app.js — 视图路由、运行列表渲染、健康检查。整个控制台的入口编排。
|
||||
// 会话保持:URL hash 路由(#runs / #new / #profiles / #report/{runId})
|
||||
// + sessionStorage 兜底,F5 刷新 / 浏览器前进后退均可恢复。
|
||||
|
||||
const App = {
|
||||
currentRunId: null,
|
||||
views: ["runs", "new", "report"],
|
||||
titles: { runs: "运行列表", new: "新建评估", report: "报告详情" },
|
||||
activeView: null,
|
||||
views: ["runs", "new", "report", "profiles"],
|
||||
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置" },
|
||||
|
||||
// 初始化:绑定导航、加载首屏、启动健康检查。
|
||||
// 初始化:绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。
|
||||
init() {
|
||||
document.querySelectorAll(".nav-item").forEach((btn) => {
|
||||
btn.addEventListener("click", () => App.switchView(btn.dataset.view));
|
||||
btn.addEventListener("click", () => App.navigate(btn.dataset.view));
|
||||
});
|
||||
document.getElementById("refresh-btn").addEventListener("click", () => App.refreshCurrent());
|
||||
|
||||
Runner.init();
|
||||
App.switchView("runs");
|
||||
Profiles.init();
|
||||
|
||||
// 恢复上次会话(优先 URL hash,其次 sessionStorage)
|
||||
App._restoreSession();
|
||||
|
||||
App.checkHealth();
|
||||
setInterval(App.checkHealth, 15000);
|
||||
|
||||
// 浏览器前进 / 后退按钮
|
||||
window.addEventListener("popstate", () => App._restoreSession());
|
||||
},
|
||||
|
||||
// 切换主视图,并同步导航高亮与标题。
|
||||
switchView(view) {
|
||||
if (view === "report" && !App.currentRunId) {
|
||||
// 没有选中的运行时,报告页显示占位。
|
||||
// ----------------------------------------------------------------
|
||||
// 路由 —— 有历史记录的主动导航(更新 URL hash)
|
||||
// ----------------------------------------------------------------
|
||||
navigate(view, runId) {
|
||||
if (runId !== undefined) App.currentRunId = runId;
|
||||
const hash = App._buildHash(view, App.currentRunId);
|
||||
if (location.hash !== `#${hash}`) {
|
||||
history.pushState({ view, runId: App.currentRunId }, "", `#${hash}`);
|
||||
}
|
||||
App._doSwitch(view);
|
||||
},
|
||||
|
||||
// Internal entry point: show a view without adding a browser-history entry
// (for example when re-rendering the view that is already active).
switchView(view) {
  App._doSwitch(view);
},
|
||||
|
||||
// Reload the data of the active view; falls back to the run list when no
// view has been activated yet.
refreshCurrent() {
  App._doSwitch(App.activeView || "runs");
},
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// 内部:实际切换 DOM + 触发数据加载
|
||||
// ----------------------------------------------------------------
|
||||
_doSwitch(view) {
|
||||
App.views.forEach((name) => {
|
||||
const el = document.getElementById(`view-${name}`);
|
||||
if (el) el.hidden = name !== view;
|
||||
@@ -33,17 +64,53 @@ const App = {
|
||||
document.getElementById("view-title").textContent = App.titles[view] || view;
|
||||
App.activeView = view;
|
||||
|
||||
if (view === "runs") App.loadRuns();
|
||||
if (view === "new") Runner.loadScenarios();
|
||||
if (view === "report") Report.render(App.currentRunId);
|
||||
// 持久化到 sessionStorage(URL 共享场景的备份)
|
||||
sessionStorage.setItem("rag_view", view);
|
||||
if (App.currentRunId) sessionStorage.setItem("rag_run_id", App.currentRunId);
|
||||
|
||||
if (view === "runs") App.loadRuns();
|
||||
if (view === "new") Runner.loadScenarios();
|
||||
if (view === "report") Report.render(App.currentRunId);
|
||||
if (view === "profiles") Profiles.load();
|
||||
},
|
||||
|
||||
// 刷新当前视图的数据。
|
||||
refreshCurrent() {
|
||||
App.switchView(App.activeView || "runs");
|
||||
// ----------------------------------------------------------------
|
||||
// Hash 工具
|
||||
// ----------------------------------------------------------------
|
||||
_buildHash(view, runId) {
|
||||
if (view === "report" && runId) {
|
||||
return `report/${encodeURIComponent(runId)}`;
|
||||
}
|
||||
return view || "runs";
|
||||
},
|
||||
|
||||
// 加载并渲染运行列表。
|
||||
_parseHash() {
|
||||
const raw = location.hash.replace(/^#\/?/, "");
|
||||
if (!raw) return { view: null, runId: null };
|
||||
if (raw.startsWith("report/")) {
|
||||
const runId = decodeURIComponent(raw.slice("report/".length));
|
||||
return { view: "report", runId };
|
||||
}
|
||||
const view = App.views.includes(raw) ? raw : null;
|
||||
return { view, runId: null };
|
||||
},
|
||||
|
||||
// 会话恢复:hash → sessionStorage → 默认 runs
|
||||
_restoreSession() {
|
||||
const { view: hView, runId: hRunId } = App._parseHash();
|
||||
const view = hView || sessionStorage.getItem("rag_view") || "runs";
|
||||
const runId = hRunId || sessionStorage.getItem("rag_run_id") || null;
|
||||
|
||||
if (runId) {
|
||||
App.currentRunId = runId;
|
||||
App.enableReportNav();
|
||||
}
|
||||
App._doSwitch(view);
|
||||
},
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// 运行列表
|
||||
// ----------------------------------------------------------------
|
||||
async loadRuns() {
|
||||
const container = document.getElementById("runs-container");
|
||||
const empty = document.getElementById("runs-empty");
|
||||
@@ -64,14 +131,16 @@ const App = {
|
||||
}
|
||||
},
|
||||
|
||||
// 构造一张运行卡片。
|
||||
renderRunCard(run) {
|
||||
const card = document.createElement("div");
|
||||
card.className = "run-card";
|
||||
card.className = "run-card" + (run.run_id === App.currentRunId ? " selected" : "");
|
||||
|
||||
card.addEventListener("click", () => {
|
||||
App.currentRunId = run.run_id;
|
||||
// 更新选中高亮
|
||||
document.querySelectorAll(".run-card").forEach((c) => c.classList.remove("selected"));
|
||||
card.classList.add("selected");
|
||||
App.enableReportNav();
|
||||
App.switchView("report");
|
||||
App.navigate("report", run.run_id);
|
||||
});
|
||||
|
||||
const chips = (run.metrics || [])
|
||||
@@ -79,7 +148,7 @@ const App = {
|
||||
const val = run.metric_means ? run.metric_means[m] : null;
|
||||
const cls = App.scoreClass(val);
|
||||
const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
|
||||
return `<span class="metric-chip">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
|
||||
return `<span class="metric-chip" title="${App.escape(m)}">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
|
||||
})
|
||||
.join("");
|
||||
|
||||
@@ -96,13 +165,14 @@ const App = {
|
||||
return card;
|
||||
},
|
||||
|
||||
// 启用报告导航项(选中运行后)。
|
||||
// ----------------------------------------------------------------
|
||||
// 工具方法
|
||||
// ----------------------------------------------------------------
|
||||
// Enable the "report" navigation button, if it is present in the DOM.
enableReportNav() {
  const btn = document.querySelector('.nav-item[data-view="report"]');
  if (btn) btn.disabled = false;
},
|
||||
|
||||
// 根据分值返回 good/warn/bad/na 配色类。
|
||||
scoreClass(value) {
|
||||
if (value === null || value === undefined) return "na";
|
||||
if (value >= 0.8) return "good";
|
||||
@@ -110,40 +180,39 @@ const App = {
|
||||
return "bad";
|
||||
},
|
||||
|
||||
// 指标名缩写,节省卡片横向空间。
|
||||
shortMetric(name) {
|
||||
const map = {
|
||||
faithfulness: "faith.",
|
||||
answer_relevancy: "ans.rel.",
|
||||
context_recall: "ctx.recall",
|
||||
context_precision: "ctx.prec.",
|
||||
faithfulness: "faith.",
|
||||
answer_relevancy: "ans.rel.",
|
||||
context_recall: "ctx.recall",
|
||||
context_precision: "ctx.prec.",
|
||||
noise_sensitivity: "noise.sens.",
|
||||
factual_correctness: "fact.corr.",
|
||||
semantic_similarity: "sem.sim.",
|
||||
};
|
||||
return map[name] || name;
|
||||
},
|
||||
|
||||
// 截取时间戳到分钟,便于阅读。
|
||||
shortTime(iso) {
|
||||
if (!iso) return "—";
|
||||
return String(iso).replace("T", " ").slice(0, 16);
|
||||
},
|
||||
|
||||
// 简单 HTML 转义,防止注入。
|
||||
escape(text) {
|
||||
const div = document.createElement("div");
|
||||
div.textContent = text == null ? "" : String(text);
|
||||
return div.innerHTML;
|
||||
},
|
||||
|
||||
// 健康检查,更新左下角状态点。
|
||||
async checkHealth() {
|
||||
const dot = document.getElementById("health-dot");
|
||||
const dot = document.getElementById("health-dot");
|
||||
const label = document.getElementById("health-text");
|
||||
try {
|
||||
await API.health();
|
||||
dot.className = "dot ok";
|
||||
dot.className = "dot ok";
|
||||
label.textContent = "服务正常";
|
||||
} catch (_e) {
|
||||
dot.className = "dot bad";
|
||||
dot.className = "dot bad";
|
||||
label.textContent = "服务离线";
|
||||
}
|
||||
},
|
||||
|
||||
118
webapp/static/js/profiles.js
Normal file
118
webapp/static/js/profiles.js
Normal file
@@ -0,0 +1,118 @@
|
||||
// profiles.js — LLM 配置管理页面逻辑
|
||||
|
||||
// Controller for the LLM profile management view: lists saved profiles as
// cards and drives the create / edit / delete form.
const Profiles = {
  // Profiles returned by the most recent successful load().
  _data: [],

  // Bind the add / save / cancel buttons.
  init() {
    document.getElementById("add-profile-btn").addEventListener("click", () => Profiles.showForm());
    document.getElementById("save-profile-btn").addEventListener("click", () => Profiles.save());
    document.getElementById("cancel-profile-btn").addEventListener("click", () => Profiles.hideForm());
  },

  // Fetch the profile list and render one card per profile.
  async load() {
    const grid = document.getElementById("profile-cards");
    const empty = document.getElementById("profiles-empty");
    // Hide the empty-state first so a failed reload never shows
    // "no profiles yet" next to the error message.
    empty.hidden = true;
    grid.innerHTML = '<p class="muted">加载中…</p>';
    try {
      const data = await API.profiles();
      Profiles._data = data.profiles || [];
      grid.innerHTML = "";
      if (Profiles._data.length === 0) {
        empty.hidden = false;
      } else {
        Profiles._data.forEach((p) => grid.appendChild(Profiles.renderCard(p)));
      }
    } catch (err) {
      grid.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
    }
  },

  // Build the card element for one profile. Every server-provided value is
  // escaped before it is placed into innerHTML, including the timeout.
  renderCard(p) {
    const card = document.createElement("div");
    card.className = "profile-card";
    card.dataset.id = p.profile_id;
    card.innerHTML = `
      <div class="profile-card-head">
        <div class="profile-card-name">${App.escape(p.name)}</div>
        <div class="profile-card-actions">
          <button class="btn btn-sm" data-action="edit">编辑</button>
          <button class="btn btn-sm btn-danger" data-action="delete">删除</button>
        </div>
      </div>
      <div class="profile-card-field"><span class="field-label">模型</span> <code>${App.escape(p.model)}</code></div>
      <div class="profile-card-field"><span class="field-label">Base URL</span> <code>${App.escape(p.base_url)}</code></div>
      <div class="profile-card-field"><span class="field-label">超时</span> ${App.escape(p.timeout_seconds)}s</div>
    `;
    card.querySelector("[data-action=edit]").addEventListener("click", () => Profiles.showForm(p));
    card.querySelector("[data-action=delete]").addEventListener("click", () => Profiles.remove(p.profile_id, p.name));
    return card;
  },

  // Show the form: empty for a new profile, pre-filled when editing.
  showForm(profile = null) {
    const panel = document.getElementById("profile-form-panel");
    const title = document.getElementById("profile-form-title");
    panel.hidden = false;
    title.textContent = profile ? "编辑 LLM 配置" : "新建 LLM 配置";
    document.getElementById("edit-profile-id").value = profile ? profile.profile_id : "";
    document.getElementById("pf-name").value = profile ? profile.name : "";
    document.getElementById("pf-model").value = profile ? profile.model : "";
    document.getElementById("pf-base-url").value = profile ? profile.base_url : "";
    document.getElementById("pf-api-key").value = profile ? profile.api_key : "";
    document.getElementById("pf-timeout").value = profile ? profile.timeout_seconds : 30;
    document.getElementById("profile-form-error").textContent = "";
    panel.scrollIntoView({ behavior: "smooth", block: "start" });
  },

  // Hide the create / edit form.
  hideForm() {
    document.getElementById("profile-form-panel").hidden = true;
  },

  // Validate the form, then create or update the profile. The save button is
  // disabled while the request is in flight so a double click cannot create
  // the same profile twice.
  async save() {
    const id = document.getElementById("edit-profile-id").value;
    const body = {
      name: document.getElementById("pf-name").value.trim(),
      model: document.getElementById("pf-model").value.trim(),
      base_url: document.getElementById("pf-base-url").value.trim(),
      api_key: document.getElementById("pf-api-key").value.trim(),
      // Non-numeric or empty input falls back to 30 seconds.
      timeout_seconds: Number.parseInt(document.getElementById("pf-timeout").value, 10) || 30,
    };
    const errEl = document.getElementById("profile-form-error");
    if (!body.name || !body.model || !body.base_url || !body.api_key) {
      errEl.textContent = "请填写所有必填字段(名称、模型、Base URL、API Key)";
      return;
    }
    const saveBtn = document.getElementById("save-profile-btn");
    saveBtn.disabled = true;
    try {
      if (id) {
        await API.updateProfile(id, body);
      } else {
        await API.createProfile(body);
      }
      Profiles.hideForm();
      await Profiles.load();
    } catch (err) {
      errEl.textContent = `保存失败:${err.message}`;
    } finally {
      saveBtn.disabled = false;
    }
  },

  // Delete a profile after user confirmation, then reload the list.
  async remove(profileId, name) {
    if (!confirm(`确认删除配置「${name}」?`)) return;
    try {
      await API.deleteProfile(profileId);
      await Profiles.load();
    } catch (err) {
      alert(`删除失败:${err.message}`);
    }
  },

  // Profiles from the last successful load (read by runner.js).
  getAll() {
    return Profiles._data;
  },
};
|
||||
@@ -26,6 +26,7 @@ const Report = {
|
||||
Report.renderDistribution(detail.report);
|
||||
Report.renderGroupings(detail.report);
|
||||
Report.renderLowest(detail.report);
|
||||
Report.renderAdvice(detail.summary, detail.report);
|
||||
content.style.opacity = "1";
|
||||
} catch (err) {
|
||||
empty.hidden = false;
|
||||
@@ -186,8 +187,7 @@ const Report = {
|
||||
},
|
||||
|
||||
// ④ 最低分样本逐条复核表(点击展开)。
|
||||
renderLowest(report) {
|
||||
const wrap = document.getElementById("lowest-table");
|
||||
renderLowest(report) { const wrap = document.getElementById("lowest-table");
|
||||
const samples = report.lowest_samples || [];
|
||||
wrap.innerHTML = "";
|
||||
if (samples.length === 0) {
|
||||
@@ -255,4 +255,35 @@ const Report = {
|
||||
</div>
|
||||
`;
|
||||
},
|
||||
|
||||
// ⑤ 优化建议(仅 optimization_advice.md 存在时渲染)。
|
||||
renderAdvice(summary, report) {
|
||||
const section = document.getElementById("advice-section");
|
||||
const body = document.getElementById("advice-body");
|
||||
const modelLabel = document.getElementById("advice-model-label");
|
||||
|
||||
const md = report.advice_markdown || "";
|
||||
if (!md.trim()) {
|
||||
section.hidden = true;
|
||||
return;
|
||||
}
|
||||
|
||||
section.hidden = false;
|
||||
modelLabel.textContent = summary.judge_model ? `judge: ${summary.judge_model}` : "";
|
||||
|
||||
// 简单 Markdown → HTML 转换(标题、列表、分隔线、粗体)
|
||||
const escaped = md
|
||||
.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">");
|
||||
const html = escaped
|
||||
.replace(/^#{3}\s+(.+)$/gm, "<h3>$1</h3>")
|
||||
.replace(/^#{2}\s+(.+)$/gm, "<h2>$1</h2>")
|
||||
.replace(/^#{1}\s+(.+)$/gm, "<h1>$1</h1>")
|
||||
.replace(/^---+$/gm, "<hr>")
|
||||
.replace(/\*\*(.+?)\*\*/g, "<strong>$1</strong>")
|
||||
.replace(/^- (.+)$/gm, "<li>$1</li>")
|
||||
.replace(/(<li>[^]*?<\/li>\n?)+/g, (m) => `<ul>${m}</ul>`)
|
||||
.replace(/\n\n+/g, "\n<br>\n");
|
||||
|
||||
body.innerHTML = `<div class="advice-md">${html}</div>`;
|
||||
},
|
||||
};
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
// runner.js — 新建评估视图:列出场景、触发评估、轮询任务状态与日志。
|
||||
// runner.js — 新建评估视图:列出场景、LLM角色配置、触发评估、轮询任务状态与日志。
|
||||
|
||||
const Runner = {
|
||||
selectedScenario: null,
|
||||
pollTimer: null,
|
||||
lastRunId: null,
|
||||
|
||||
// 绑定运行按钮。
|
||||
init() {
|
||||
document.getElementById("run-btn").addEventListener("click", () => Runner.trigger());
|
||||
document.getElementById("view-report-btn").addEventListener("click", () => {
|
||||
if (Runner.lastRunId) {
|
||||
App.currentRunId = Runner.lastRunId;
|
||||
App.enableReportNav();
|
||||
App.switchView("report");
|
||||
App.navigate("report", Runner.lastRunId);
|
||||
}
|
||||
});
|
||||
},
|
||||
@@ -32,6 +32,27 @@ const Runner = {
|
||||
} catch (err) {
|
||||
list.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
|
||||
}
|
||||
// 同时加载 profiles 供角色选择
|
||||
Runner._populateProfileSelects();
|
||||
},
|
||||
|
||||
// 填充三个角色下拉框
|
||||
async _populateProfileSelects() {
|
||||
const cached = Profiles.getAll();
|
||||
const profiles = cached.length > 0
|
||||
? cached
|
||||
: (await API.profiles().catch(() => ({ profiles: [] }))).profiles;
|
||||
|
||||
["role-judge", "role-answer", "role-dataset"].forEach(id => {
|
||||
const sel = document.getElementById(id);
|
||||
sel.innerHTML = '<option value="">— 使用场景原始配置 —</option>';
|
||||
profiles.forEach(p => {
|
||||
const opt = document.createElement("option");
|
||||
opt.value = p.profile_id;
|
||||
opt.textContent = `${p.name} (${p.model})`;
|
||||
sel.appendChild(opt);
|
||||
});
|
||||
});
|
||||
},
|
||||
|
||||
// 构造单个场景条目。
|
||||
@@ -64,12 +85,14 @@ const Runner = {
|
||||
Runner.selectedScenario = sc.path;
|
||||
document.getElementById("selected-scenario").textContent = sc.path;
|
||||
document.getElementById("run-btn").disabled = false;
|
||||
// 显示 LLM 角色面板
|
||||
document.getElementById("llm-assignment-panel").hidden = false;
|
||||
});
|
||||
}
|
||||
return item;
|
||||
},
|
||||
|
||||
// 触发评估并开始轮询。
|
||||
// 触发评估:先 apply profiles(若选了),再触发任务。
|
||||
async trigger() {
|
||||
if (!Runner.selectedScenario) return;
|
||||
const runBtn = document.getElementById("run-btn");
|
||||
@@ -85,15 +108,41 @@ const Runner = {
|
||||
Runner._setStatus(statusBadge, "queued");
|
||||
|
||||
try {
|
||||
// Step 1: apply LLM profiles to YAML if any selected
|
||||
await Runner._applyProfilesIfNeeded(logBox);
|
||||
|
||||
// Step 2: trigger evaluation
|
||||
const resp = await API.triggerEvaluation(Runner.selectedScenario);
|
||||
Runner.poll(resp.task_id);
|
||||
} catch (err) {
|
||||
Runner._setStatus(statusBadge, "failed");
|
||||
logBox.textContent = `触发失败:${err.message}`;
|
||||
logBox.textContent = (logBox.textContent ? logBox.textContent + "\n" : "") + `触发失败:${err.message}`;
|
||||
runBtn.disabled = false;
|
||||
}
|
||||
},
|
||||
|
||||
// 如果用户选了 profile,就先 apply 写回 YAML
|
||||
async _applyProfilesIfNeeded(logBox) {
|
||||
const judgeId = document.getElementById("role-judge").value;
|
||||
const answerId = document.getElementById("role-answer").value;
|
||||
const datasetId = document.getElementById("role-dataset").value;
|
||||
|
||||
if (!judgeId && !answerId && !datasetId) return; // 全空,跳过
|
||||
|
||||
logBox.textContent = "正在将 LLM 配置写入场景文件…\n";
|
||||
const body = {
|
||||
scenario_path: Runner.selectedScenario,
|
||||
judge_profile_id: judgeId || null,
|
||||
answer_profile_id: answerId || null,
|
||||
dataset_profile_id: datasetId || null,
|
||||
};
|
||||
const result = await API.applyProfiles(body);
|
||||
const fields = (result.patched_fields || []).join(", ");
|
||||
logBox.textContent += fields
|
||||
? `✓ 已更新字段:${fields}\n`
|
||||
: "(未找到可更新的字段,继续运行)\n";
|
||||
},
|
||||
|
||||
// 周期性轮询任务状态,刷新日志与徽标。
|
||||
poll(taskId) {
|
||||
const logBox = document.getElementById("task-log");
|
||||
@@ -114,6 +163,7 @@ const Runner = {
|
||||
runBtn.disabled = false;
|
||||
if (status.status === "completed" && status.run_id) {
|
||||
Runner.lastRunId = status.run_id;
|
||||
sessionStorage.setItem("rag_run_id", status.run_id);
|
||||
reportBtn.hidden = false;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user