Compare commits
62 Commits
1ff4a3943a
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9828b1d44c | ||
|
|
1df4010acc | ||
|
|
754a30ad59 | ||
|
|
e1751447df | ||
|
|
4fd515d2d9 | ||
|
|
abcd61ec8f | ||
|
|
363e8b0f27 | ||
|
|
b870ed8730 | ||
|
|
791738bb07 | ||
|
|
630b70cc2a | ||
|
|
a781ba1e4a | ||
|
|
2ad2c1ea9d | ||
|
|
f8e308b7dc | ||
|
|
fb420656ec | ||
|
|
05419db1f9 | ||
|
|
1dc7ab9727 | ||
|
|
7cc3aff95a | ||
|
|
ad2651ce27 | ||
|
|
fb42116616 | ||
|
|
a629bd516c | ||
|
|
ac410e7a5d | ||
|
|
1304fec1c4 | ||
|
|
5ced129ff7 | ||
|
|
ebf1fc7be8 | ||
|
|
1bcb208f92 | ||
|
|
a03a24be4e | ||
|
|
e4d4e4968b | ||
|
|
761faf9c42 | ||
|
|
9ad6ad4ebc | ||
|
|
eee96eb158 | ||
|
|
ccf25eb1f9 | ||
|
|
199b3af611 | ||
|
|
f9e3ba0f64 | ||
|
|
36e5506e2a | ||
|
|
835614189e | ||
|
|
ce0d2291b0 | ||
|
|
480f6d66ea | ||
|
|
d371ef7d24 | ||
|
|
8617eaa5aa | ||
|
|
e0b064587f | ||
|
|
078097af00 | ||
|
|
ca586bf9bb | ||
|
|
9ad2daff73 | ||
|
|
e8af5b906c | ||
|
|
8ea2b9c7d2 | ||
|
|
074800b741 | ||
|
|
3019390592 | ||
|
|
24956bbf75 | ||
|
|
ca01e44ad2 | ||
|
|
1a2cc534b8 | ||
|
|
91c0dab4f9 | ||
|
|
f5c2dce64a | ||
|
|
d68399d39b | ||
|
|
719c3b4ca4 | ||
|
|
5b60ed12ea | ||
|
|
dc8baf8662 | ||
|
|
e329f59139 | ||
|
|
b19054bd66 | ||
|
|
5d09deb420 | ||
|
|
b98af29449 | ||
|
|
4173a40d93 | ||
|
|
629304aa6d |
30
.env.example
30
.env.example
@@ -1,11 +1,26 @@
|
|||||||
|
# ===== LLM 连接配置(RAGAS 评测 + 生成) =====
|
||||||
|
# 所有模型共用同一个 OpenAI 兼容 endpoint
|
||||||
|
# 在 Web 控制台的「LLM 配置」页面可以保存多个命名配置,
|
||||||
|
# 并在运行评估时按角色(Judge / Answer / Dataset)分别选择覆盖。
|
||||||
|
|
||||||
OPENAI_API_KEY=your-api-key
|
OPENAI_API_KEY=your-api-key
|
||||||
OPENAI_BASE_URL=http://6.86.80.4:30080/v1
|
OPENAI_BASE_URL=http://6.86.80.4:30080/v1
|
||||||
RAGAS_JUDGE_MODEL=deepseek-v4-flash
|
OPENAI_TIMEOUT_SECONDS=180
|
||||||
RAGAS_EMBEDDING_MODEL=text-embedding-v3
|
|
||||||
|
# 默认评测模型(可在场景 YAML 或 Web 控制台 LLM 配置中覆盖)
|
||||||
|
# RAGAS_JUDGE_MODEL 需支持 OpenAI 兼容 chat.completions + 结构化 JSON 输出
|
||||||
|
# RAGAS_LLM_MAX_TOKENS 控制 Judge 评分链路的 completion budget;faithfulness 等
|
||||||
|
# 结构化指标在 GPT-5 系列上通常需要 4096 或更高,避免 IncompleteOutputException
|
||||||
|
RAGAS_JUDGE_MODEL=gpt-5
|
||||||
|
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
|
||||||
|
RAGAS_LLM_MAX_TOKENS=4096
|
||||||
|
|
||||||
|
# 评估并发控制(启用 7 个指标时建议 RAGAS_METRIC_TIMEOUT_SECONDS=300)
|
||||||
BATCH_SIZE=8
|
BATCH_SIZE=8
|
||||||
|
RAGAS_METRIC_TIMEOUT_SECONDS=300
|
||||||
|
|
||||||
|
|
||||||
# ===== 阿里云文档解析 =====
|
# ===== 阿里云文档解析(dataset build 功能需要) =====
|
||||||
ALIBABA_ACCESS_KEY_ID=
|
ALIBABA_ACCESS_KEY_ID=
|
||||||
ALIBABA_ACCESS_KEY_SECRET=
|
ALIBABA_ACCESS_KEY_SECRET=
|
||||||
ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
|
ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
|
||||||
@@ -14,6 +29,13 @@ ALIYUN_PARSE_TIMEOUT_SECONDS=900
|
|||||||
ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
|
ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
|
||||||
ALIYUN_LLM_ENHANCEMENT=true
|
ALIYUN_LLM_ENHANCEMENT=true
|
||||||
ALIYUN_ENHANCEMENT_MODE=VLM
|
ALIYUN_ENHANCEMENT_MODE=VLM
|
||||||
DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
|
DOCUMENT_PARSE_ARTIFACT_PREFIX=outputs/dataset-builds
|
||||||
PARSER_FAILURE_MODE=fail
|
PARSER_FAILURE_MODE=fail
|
||||||
|
|
||||||
|
# 生成题库时使用的模型(可在 Web 控制台 LLM 配置中按场景覆盖)
|
||||||
DATASET_GENERATOR_MODEL=qwen3.6-plus
|
DATASET_GENERATOR_MODEL=qwen3.6-plus
|
||||||
|
|
||||||
|
# ===== Dify 集成 — 实时评分 API =====
|
||||||
|
# 为 /api/score 端点设置 Bearer Token 鉴权(留空则不鉴权,适合内网部署)
|
||||||
|
# Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
|
||||||
|
SCORE_API_TOKEN=
|
||||||
|
|||||||
26
.gitattributes
vendored
Normal file
26
.gitattributes
vendored
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# 默认:文本文件使用 LF(Linux/macOS 风格)
|
||||||
|
* text=auto eol=lf
|
||||||
|
|
||||||
|
# Shell 脚本强制 LF,无论在哪个平台 checkout
|
||||||
|
*.sh text eol=lf
|
||||||
|
|
||||||
|
# Python 和 YAML 也用 LF
|
||||||
|
*.py text eol=lf
|
||||||
|
*.yaml text eol=lf
|
||||||
|
*.yml text eol=lf
|
||||||
|
*.md text eol=lf
|
||||||
|
*.json text eol=lf
|
||||||
|
*.toml text eol=lf
|
||||||
|
*.txt text eol=lf
|
||||||
|
*.env text eol=lf
|
||||||
|
*.env.example text eol=lf
|
||||||
|
|
||||||
|
# Windows 脚本保留 CRLF
|
||||||
|
*.ps1 text eol=crlf
|
||||||
|
*.bat text eol=crlf
|
||||||
|
|
||||||
|
# 二进制文件不转换
|
||||||
|
*.pdf binary
|
||||||
|
*.png binary
|
||||||
|
*.jpg binary
|
||||||
|
*.csv binary
|
||||||
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
7
.idea/misc.xml
generated
Normal file
7
.idea/misc.xml
generated
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="KubernetesApiProvider"><![CDATA[{}]]></component>
|
||||||
|
<component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="17" project-jdk-type="JavaSDK">
|
||||||
|
<output url="file://$PROJECT_DIR$/out" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/siemens_ragas.iml" filepath="$PROJECT_DIR$/.idea/siemens_ragas.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
9
.idea/siemens_ragas.iml
generated
Normal file
9
.idea/siemens_ragas.iml
generated
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="JAVA_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||||
|
<exclude-output />
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
<h2>优化建议怎么生成?</h2>
|
||||||
|
<p class="subtitle">这决定了模块的核心机制与可维护性</p>
|
||||||
|
|
||||||
|
<div class="options">
|
||||||
|
<div class="option" data-choice="a" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">A</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>纯规则引擎</h3>
|
||||||
|
<p>每个指标设阈值(如 faithfulness &lt; 0.6),触发时给出预设建议文本。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>零 LLM 调用,零额外成本</li>
|
||||||
|
<li>结果可预测、可审计</li>
|
||||||
|
<li>响应极快</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>建议固定,无法结合具体样本</li>
|
||||||
|
<li>不能解释"为什么这批数据这个指标低"</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="b" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">B</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>LLM 分析(全自动)</h3>
|
||||||
|
<p>把评测结果(各指标均值 + 低分样本)一起交给 LLM,生成上下文感知的中文分析报告。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>能结合具体低分样本给出针对性建议</li>
|
||||||
|
<li>可用中文解释西门子场景下的问题</li>
|
||||||
|
<li>建议质量高、内容丰富</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>每次评测多 1 次 LLM 调用</li>
|
||||||
|
<li>依赖 judge_model 的质量</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="c" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">C</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>规则定位 + LLM 解读(推荐)</h3>
|
||||||
|
<p>规则引擎先识别哪些指标异常、触发哪条优化方向;再把"规则诊断 + 低分样本"一起给 LLM 做二次解读,生成中文建议。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>规则保证诊断稳定,不依赖 LLM 自由发挥</li>
|
||||||
|
<li>LLM 在有结构的输入下输出更准确</li>
|
||||||
|
<li>两层可独立测试</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>实现略复杂(两个子模块)</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1,77 @@
|
|||||||
|
<h2>优化顾问模块 — 实现方案对比</h2>
|
||||||
|
<p class="subtitle">三个方案的核心区别在于 LLM 调用边界和代码入侵程度</p>
|
||||||
|
|
||||||
|
<div class="options">
|
||||||
|
<div class="option" data-choice="a" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">A</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>独立后处理器(轻量集成)</h3>
|
||||||
|
<p>新增 <code>rag_eval/advisor/</code> 包,<code>run_scenario()</code> 末尾调用一行 <code>maybe_run_advisor(result, scenario)</code>。</p>
|
||||||
|
<p><strong>文件结构:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li><code>rag_eval/advisor/__init__.py</code></li>
|
||||||
|
<li><code>rag_eval/advisor/rules.py</code> — 规则引擎,输入 score_rows,输出诊断列表</li>
|
||||||
|
<li><code>rag_eval/advisor/llm_analyzer.py</code> — 把规则诊断 + 低分样本交给 judge_model</li>
|
||||||
|
<li><code>rag_eval/advisor/writer.py</code> — 写 optimization_advice.md,打日志摘要</li>
|
||||||
|
</ul>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>改动最小,runner.py 只加 3 行</li>
|
||||||
|
<li>advisor 完全独立,可单独测试</li>
|
||||||
|
<li>与现有分层架构完全吻合</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>无法拿到 per-metric 的原始 NaN 率(需从 score_rows 重新算)</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="b" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">B</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>嵌入 reporting 层(复用写出基础设施)</h3>
|
||||||
|
<p>把 advisor 作为 <code>rag_eval/reporting/</code> 的一部分,<code>write_run_artifacts()</code> 内部判断是否写 advice。</p>
|
||||||
|
<p><strong>文件结构:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li><code>rag_eval/reporting/advisor.py</code> — 规则 + LLM + 写出三合一</li>
|
||||||
|
<li><code>write_run_artifacts()</code> 里追加 <code>if scenario.optimization_advisor: write_advice(...)</code></li>
|
||||||
|
</ul>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>artifacts 路径管理统一,advice 自然进 run 目录</li>
|
||||||
|
<li>文件更少</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>reporting 层本是"无副作用写文件",混入 LLM 调用破坏这一约定</li>
|
||||||
|
<li>advisor 逻辑和写出逻辑耦合,难以单独测试规则引擎</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="c" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">C</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>方案 A 变体:advisor 有独立 settings(推荐)</h3>
|
||||||
|
<p>与方案 A 相同的文件结构,但 LLM 调用使用 <strong>scenario 已有的 judge_model</strong>,不新增任何模型配置——advisor 复用 <code>build_models()</code> 已构建好的 llm 实例。</p>
|
||||||
|
<ul>
|
||||||
|
<li><code>rag_eval/advisor/rules.py</code> — 纯函数,7 条指标诊断规则</li>
|
||||||
|
<li><code>rag_eval/advisor/llm_analyzer.py</code> — 接收已有 llm 实例,不重新建 client</li>
|
||||||
|
<li><code>rag_eval/advisor/writer.py</code> — 写 md + 日志</li>
|
||||||
|
<li><code>rag_eval/advisor/__init__.py</code> — 暴露 <code>run_advisor()</code></li>
|
||||||
|
</ul>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>不重复创建 LLM client(节省资源)</li>
|
||||||
|
<li>advisor 阈值可通过 YAML 的 optimization_advisor 块扩展配置</li>
|
||||||
|
<li>独立包边界清晰,易于单测</li>
|
||||||
|
<li>runner.py 改动最小</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>需把 llm 实例从 runner 传入 advisor(多传一个参数)</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1,53 @@
|
|||||||
|
<h2>优化顾问模块 — 整体架构与数据流</h2>
|
||||||
|
<p class="subtitle">新增 rag_eval/advisor/ 包,插入 run_scenario() 末尾,复用已有 llm 实例</p>
|
||||||
|
|
||||||
|
<div class="mockup">
|
||||||
|
<div class="mockup-header">执行链路(变更前 → 变更后)</div>
|
||||||
|
<div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:2">
|
||||||
|
<span style="color:#94a3b8">run_scenario()</span><br>
|
||||||
|
→ load_scenario() <span style="color:#94a3b8"># 读 YAML,解析 Scenario + optimization_advisor 字段</span><br>
|
||||||
|
→ build_models() <span style="color:#94a3b8"># 已有:创建 llm, embeddings</span><br>
|
||||||
|
→ build_metric_pipeline() <span style="color:#94a3b8"># 已有</span><br>
|
||||||
|
→ Evaluator.evaluate() <span style="color:#94a3b8"># 已有:打分 → EvaluationResult</span><br>
|
||||||
|
→ write_run_artifacts() <span style="color:#94a3b8"># 已有:scores.csv / summary.md / ...</span><br>
|
||||||
|
<span style="color:#4ade80;font-weight:bold">→ run_advisor(result, scenario, llm) # 新增 3 行</span><br>
|
||||||
|
<span style="color:#4ade80"> → rules.diagnose(score_rows) # 规则引擎:识别异常指标 + 方向</span><br>
|
||||||
|
<span style="color:#4ade80"> → llm_analyzer.analyze(diag, samples) # LLM:结合低分样本生成中文建议</span><br>
|
||||||
|
<span style="color:#4ade80"> → writer.write(advice, paths) # 写 optimization_advice.md + 日志</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h3>新增文件一览</h3>
|
||||||
|
<div class="mockup">
|
||||||
|
<div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:1.9">
|
||||||
|
rag_eval/advisor/<br>
|
||||||
|
__init__.py <span style="color:#94a3b8">← 暴露 run_advisor(),是外部唯一入口</span><br>
|
||||||
|
rules.py <span style="color:#94a3b8">← 纯函数,无 LLM,可单独单测</span><br>
|
||||||
|
llm_analyzer.py <span style="color:#94a3b8">← 接收 llm 实例 + 诊断结构 → 中文 Markdown</span><br>
|
||||||
|
writer.py <span style="color:#94a3b8">← 写 optimization_advice.md,打日志摘要</span><br>
|
||||||
|
<br>
|
||||||
|
rag_eval/shared/models.py <span style="color:#fbbf24">← 修改:Scenario 加 optimization_advisor 字段</span><br>
|
||||||
|
rag_eval/config/schema.py <span style="color:#fbbf24">← 修改:ScenarioModel 加字段</span><br>
|
||||||
|
rag_eval/execution/runner.py <span style="color:#fbbf24">← 修改:末尾加 3 行调用</span><br>
|
||||||
|
rag_eval/reporting/artifacts.py <span style="color:#fbbf24">← 修改:RunArtifactPaths 加 advice_md 路径</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h3>输出产物</h3>
|
||||||
|
<div class="mockup">
|
||||||
|
<div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:1.9">
|
||||||
|
outputs/online/siemens-pdf-question-bank/&lt;run_id&gt;/<br>
|
||||||
|
scenario.snapshot.yaml<br>
|
||||||
|
scores.csv<br>
|
||||||
|
invalid.csv<br>
|
||||||
|
summary.md<br>
|
||||||
|
metadata.json<br>
|
||||||
|
<span style="color:#4ade80;font-weight:bold">optimization_advice.md ← 新增</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p style="margin-top:1rem;color:#94a3b8;font-size:13px">整体看起来 OK 吗?这是新模块与现有链路的接入方式。</p>
|
||||||
@@ -0,0 +1,68 @@
|
|||||||
|
<h2>优化顾问在什么情况下运行?</h2>
|
||||||
|
<p class="subtitle">这决定了模块与现有评测流程的集成方式</p>
|
||||||
|
|
||||||
|
<div class="options">
|
||||||
|
<div class="option" data-choice="a" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">A</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>每次评测自动运行</h3>
|
||||||
|
<p>run_scenario() 结束后自动调用,无需任何额外配置。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>零感知,开箱即用</li>
|
||||||
|
<li>每次跑完都有建议报告</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>每次都多一次 LLM 调用,不管是否需要</li>
|
||||||
|
<li>无法关闭</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="b" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">B</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>YAML 场景中显式开启(推荐)</h3>
|
||||||
|
<p>在 scenario YAML 里加一行 <code>optimization_advisor: true</code>,默认关闭。</p>
|
||||||
|
<div class="mockup">
|
||||||
|
<div class="mockup-header">siemens-pdf-question-bank-online.yaml</div>
|
||||||
|
<div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:1.8">
|
||||||
|
metrics:<br>
|
||||||
|
- faithfulness<br>
|
||||||
|
- noise_sensitivity<br>
|
||||||
|
...<br>
|
||||||
|
<span style="color:#4ade80;font-weight:bold">optimization_advisor: true # 新增</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>显式可见,按需开启</li>
|
||||||
|
<li>与现有 YAML 驱动风格一致</li>
|
||||||
|
<li>可为不同场景独立配置</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>需要手动在 YAML 里加一行</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="c" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">C</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>阈值触发(任一指标低于警戒线时自动激活)</h3>
|
||||||
|
<p>规则引擎先算,若发现有指标低于阈值则自动启动 LLM 分析;一切正常则跳过。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>"有问题才报警",符合直觉</li>
|
||||||
|
<li>高分场景无额外成本</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>阈值需要维护,不同场景可能不同</li>
|
||||||
|
<li>正常分数时无建议,但用户可能仍想看优化空间</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
<div style="display:flex;align-items:center;justify-content:center;min-height:60vh">
|
||||||
|
<p class="subtitle">Writing spec & moving to implementation...</p>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
<div style="display:flex;align-items:center;justify-content:center;min-height:60vh">
|
||||||
|
<p class="subtitle">Continuing in terminal — 正在设计方案...</p>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"reason":"idle timeout","timestamp":1781598635371}
|
||||||
1
.superpowers/brainstorm/1625-1781595805/state/server.pid
Normal file
1
.superpowers/brainstorm/1625-1781595805/state/server.pid
Normal file
@@ -0,0 +1 @@
|
|||||||
|
1625
|
||||||
64
configs/llm_profiles.json
Normal file
64
configs/llm_profiles.json
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
{
|
||||||
|
"profiles": [
|
||||||
|
{
|
||||||
|
"profile_id": "c8e185a64fa0",
|
||||||
|
"name": "glm-5",
|
||||||
|
"model": "glm-5",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:16:22.438297+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:19:03.089865+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "54ddfe5aeb46",
|
||||||
|
"name": "deepseek-v4-pro",
|
||||||
|
"model": "deepseek-v4-pro",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:17:08.473904+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:19:07.504082+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "25d035eef194",
|
||||||
|
"name": "qwen3.5-flash",
|
||||||
|
"model": "qwen3.5-flash",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:18:24.265619+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:18:24.265619+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "ff1d0f417a5d",
|
||||||
|
"name": "deepseek-v4-flash",
|
||||||
|
"model": "deepseek-v4-flash",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:18:57.091549+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:18:57.091549+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "5b04c49df9df",
|
||||||
|
"name": "text-embedding-v4",
|
||||||
|
"model": "text-embedding-v4",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:19:49.104004+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:19:49.104004+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "b4f7c82859d5",
|
||||||
|
"name": "text-embedding-v3",
|
||||||
|
"model": "text-embedding-v3",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:20:18.266540+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:20:18.266540+00:00"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
173
deploy.sh
Normal file
173
deploy.sh
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# deploy.sh — Siemens RAGAS 一键部署脚本(Linux)
|
||||||
|
# 用法:bash deploy.sh
|
||||||
|
# 功能:检查环境 → 安装依赖 → 初始化配置 → 启动后台服务
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
cd "$SCRIPT_DIR"
|
||||||
|
|
||||||
|
# ── 颜色输出 ──────────────────────────────────────────────────────
|
||||||
|
if [ -t 1 ]; then
|
||||||
|
GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
|
||||||
|
else
|
||||||
|
GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
|
||||||
|
fi
|
||||||
|
|
||||||
|
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||||
|
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||||
|
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }
|
||||||
|
info() { echo -e "${CYAN}[INFO]${NC} $*"; }
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo -e "${CYAN} Siemens RAGAS Console — Linux 一键部署${NC}"
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── 阶段 1:Python 版本检查 ───────────────────────────────────────
|
||||||
|
info "阶段 1/7:检查 Python 版本..."
|
||||||
|
|
||||||
|
PYTHON_BIN=""
|
||||||
|
for candidate in python3.12 python3.13 python3.14 python3; do
|
||||||
|
if command -v "$candidate" &>/dev/null; then
|
||||||
|
version=$("$candidate" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || true)
|
||||||
|
major=$(echo "$version" | cut -d. -f1)
|
||||||
|
minor=$(echo "$version" | cut -d. -f2)
|
||||||
|
if [ "${major:-0}" -ge 3 ] && [ "${minor:-0}" -ge 12 ]; then
|
||||||
|
PYTHON_BIN="$candidate"
|
||||||
|
ok "Python $version ($candidate)"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ -z "$PYTHON_BIN" ]; then
|
||||||
|
err "未找到 Python 3.12+。请安装后重试。"
|
||||||
|
err " Ubuntu/Debian: sudo apt install python3.12 python3.12-venv"
|
||||||
|
err " CentOS/RHEL: sudo dnf install python3.12"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── 阶段 2:虚拟环境 ──────────────────────────────────────────────
|
||||||
|
info "阶段 2/7:准备虚拟环境..."
|
||||||
|
|
||||||
|
if [ -d ".venv" ] && [ -f ".venv/bin/python" ]; then
|
||||||
|
ok ".venv 已存在,跳过创建"
|
||||||
|
else
|
||||||
|
info "创建 .venv..."
|
||||||
|
"$PYTHON_BIN" -m venv .venv
|
||||||
|
ok ".venv 创建完成"
|
||||||
|
fi
|
||||||
|
|
||||||
|
PIP=".venv/bin/pip"
|
||||||
|
PYTHON=".venv/bin/python"
|
||||||
|
|
||||||
|
# ── 阶段 3:安装依赖 ──────────────────────────────────────────────
|
||||||
|
info "阶段 3/7:安装项目依赖(可能需要几分钟)..."
|
||||||
|
|
||||||
|
"$PIP" install --upgrade pip -q
|
||||||
|
ok "pip 已升级"
|
||||||
|
|
||||||
|
"$PIP" install -e . -q
|
||||||
|
ok "项目依赖安装完成(pyproject.toml)"
|
||||||
|
|
||||||
|
"$PIP" install fastapi uvicorn httpx -q
|
||||||
|
ok "Web 服务依赖安装完成(fastapi / uvicorn / httpx)"
|
||||||
|
|
||||||
|
# ── 阶段 4:配置文件 ──────────────────────────────────────────────
|
||||||
|
info "阶段 4/7:初始化配置文件..."
|
||||||
|
|
||||||
|
if [ ! -f ".env" ]; then
|
||||||
|
cp .env.example .env
|
||||||
|
warn ".env 已从 .env.example 复制,请编辑填写实际的 API Key 等配置后再启动:"
|
||||||
|
warn " nano .env 或 vim .env"
|
||||||
|
warn " 关键字段:OPENAI_API_KEY, OPENAI_BASE_URL, ALIBABA_ACCESS_KEY_ID, ALIBABA_ACCESS_KEY_SECRET"
|
||||||
|
else
|
||||||
|
ok ".env 已存在,跳过"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── 阶段 5:目录初始化 ────────────────────────────────────────────
|
||||||
|
info "阶段 5/7:初始化目录结构..."
|
||||||
|
|
||||||
|
mkdir -p configs logs outputs datasets
|
||||||
|
ok "目录就绪:configs/ logs/ outputs/ datasets/"
|
||||||
|
|
||||||
|
# 确保其他脚本有执行权限
|
||||||
|
for script in start.sh stop.sh run_eval.sh; do
|
||||||
|
[ -f "$script" ] && chmod +x "$script"
|
||||||
|
done
|
||||||
|
ok "辅助脚本已设置执行权限"
|
||||||
|
|
||||||
|
# ── 阶段 6:Demo 数据 ─────────────────────────────────────────────
|
||||||
|
info "阶段 6/7:初始化演示数据..."
|
||||||
|
|
||||||
|
DEMO_DIR="outputs/kba-knowledge-base-offline-baseline"
|
||||||
|
if [ -d "$DEMO_DIR" ]; then
|
||||||
|
ok "演示数据已存在,跳过"
|
||||||
|
else
|
||||||
|
info "生成演示数据(scripts/seed_sample_run.py)..."
|
||||||
|
if "$PYTHON" scripts/seed_sample_run.py; then
|
||||||
|
ok "演示数据生成完成"
|
||||||
|
else
|
||||||
|
warn "演示数据生成失败,控制台报告页将为空(服务仍可正常启动)"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── 阶段 7:启动服务 ──────────────────────────────────────────────
|
||||||
|
info "阶段 7/7:启动 Web 服务..."
|
||||||
|
|
||||||
|
# 检查 .env 是否包含默认占位符
|
||||||
|
if grep -q "your-api-key" .env 2>/dev/null; then
|
||||||
|
warn ".env 中仍包含默认占位符,部分功能(评估执行)将不可用"
|
||||||
|
warn "请编辑 .env 后重新运行 start.sh"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 端口检测
|
||||||
|
PORT=8800
|
||||||
|
if ss -tlnp 2>/dev/null | grep -q ":$PORT " || netstat -tlnp 2>/dev/null | grep -q ":$PORT "; then
|
||||||
|
warn "端口 $PORT 已被占用,尝试 8801..."
|
||||||
|
PORT=8801
|
||||||
|
if ss -tlnp 2>/dev/null | grep -q ":$PORT " || netstat -tlnp 2>/dev/null | grep -q ":$PORT "; then
|
||||||
|
err "端口 8800 和 8801 均被占用。请手动运行:"
|
||||||
|
err " .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 清理残留 PID
|
||||||
|
if [ -f ".server.pid" ]; then
|
||||||
|
OLD_PID=$(cat .server.pid)
|
||||||
|
if kill -0 "$OLD_PID" 2>/dev/null; then
|
||||||
|
warn "检测到已有服务进程 (PID=$OLD_PID),停止旧进程..."
|
||||||
|
kill "$OLD_PID" 2>/dev/null || true
|
||||||
|
sleep 1
|
||||||
|
fi
|
||||||
|
rm -f .server.pid
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 后台启动
|
||||||
|
nohup "$PYTHON" webmain.py --host 0.0.0.0 --port "$PORT" >> logs/server.log 2>&1 &
|
||||||
|
SERVER_PID=$!
|
||||||
|
echo "$SERVER_PID" > .server.pid
|
||||||
|
|
||||||
|
# 等待 3 秒验证进程存活
|
||||||
|
sleep 3
|
||||||
|
if kill -0 "$SERVER_PID" 2>/dev/null; then
|
||||||
|
ok "服务已启动 (PID=$SERVER_PID)"
|
||||||
|
echo ""
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo -e "${GREEN} 部署成功!${NC}"
|
||||||
|
echo -e "${GREEN} 访问地址: http://$(hostname -I | awk '{print $1}'):${PORT}${NC}"
|
||||||
|
echo -e "${GREEN} 本机访问: http://127.0.0.1:${PORT}${NC}"
|
||||||
|
echo -e "${CYAN} 服务日志: tail -f logs/server.log${NC}"
|
||||||
|
echo -e "${CYAN} 停止服务: bash stop.sh${NC}"
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo ""
|
||||||
|
else
|
||||||
|
err "服务启动失败,请查看日志:"
|
||||||
|
err " tail -20 logs/server.log"
|
||||||
|
rm -f .server.pid
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
@@ -318,6 +318,10 @@ metrics:
|
|||||||
- answer_relevancy
|
- answer_relevancy
|
||||||
- context_recall
|
- context_recall
|
||||||
- context_precision
|
- context_precision
|
||||||
|
# 可选:鲁棒性 / 端到端指标(需数据集含 ground_truth),完整列表见 §9.4
|
||||||
|
# - noise_sensitivity
|
||||||
|
# - factual_correctness
|
||||||
|
# - semantic_similarity
|
||||||
output_dir: runs/legal-assistant-offline-baseline
|
output_dir: runs/legal-assistant-offline-baseline
|
||||||
runtime:
|
runtime:
|
||||||
batch_size: 4
|
batch_size: 4
|
||||||
@@ -338,7 +342,7 @@ runtime:
|
|||||||
- `embedding_model`
|
- `embedding_model`
|
||||||
- 负责向量相关指标的模型
|
- 负责向量相关指标的模型
|
||||||
- `metrics`
|
- `metrics`
|
||||||
- 本次启用的指标列表
|
- 本次启用的指标列表(完整可选项与依赖见 §9.4)
|
||||||
- `output_dir`
|
- `output_dir`
|
||||||
- 本次运行结果输出目录
|
- 本次运行结果输出目录
|
||||||
- `runtime.batch_size`
|
- `runtime.batch_size`
|
||||||
@@ -399,6 +403,32 @@ app_adapter:
|
|||||||
- embedding model
|
- embedding model
|
||||||
- 指标实例
|
- 指标实例
|
||||||
|
|
||||||
|
当前支持的指标(`rag_eval/metrics/registry.py` 中的 `SUPPORTED_METRICS`):
|
||||||
|
|
||||||
|
| 指标名 | 层面 | 依赖 |
|
||||||
|
|---|---|---|
|
||||||
|
| `faithfulness` | 生成 | judge model |
|
||||||
|
| `answer_relevancy` | 生成 | judge model + embedding |
|
||||||
|
| `context_recall` | 检索 | judge model + ground_truth |
|
||||||
|
| `context_precision` | 检索 | judge model + ground_truth |
|
||||||
|
| `noise_sensitivity` | 鲁棒性 | judge model + ground_truth |
|
||||||
|
| `factual_correctness` | 端到端 | judge model + ground_truth |
|
||||||
|
| `semantic_similarity` | 端到端 | embedding + ground_truth(无 LLM 调用) |
|
||||||
|
|
||||||
|
后五项以 `ground_truth`(标准答案)为参照,数据集必须提供该字段。新增指标统一在 `registry.py` / `factory.py` / `pipeline.py` 三处对齐装配。
|
||||||
|
|
||||||
|
**Optimization Advisor(§11 优化策略落地):**
|
||||||
|
|
||||||
|
评测结束后,若场景配置 `optimization_advisor: true`,则自动调用 `rag_eval/advisor/` 模块:
|
||||||
|
- 规则引擎(`rules.py`)对 7 个指标各自设阈值,识别触发项并选取 top-3 低分样本
|
||||||
|
- LLM 分析器(`llm_analyzer.py`)结合低分样本生成中文 Markdown 优化建议(复用 judge_model,失败自动降级为纯规则报告)
|
||||||
|
- 写出层(`writer.py`)输出 `optimization_advice.md` 并打日志摘要
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# 场景配置示例
|
||||||
|
optimization_advisor: true
|
||||||
|
```
|
||||||
|
|
||||||
### 9.5 并发控制
|
### 9.5 并发控制
|
||||||
|
|
||||||
执行层负责并发上限,不把并发策略散落到各指标实现中。
|
执行层负责并发上限,不把并发策略散落到各指标实现中。
|
||||||
|
|||||||
@@ -316,11 +316,21 @@ adapter 层的目标是:**把不同类型的目标应用,统一成同一套
|
|||||||
|
|
||||||
当前支持的指标包括:
|
当前支持的指标包括:
|
||||||
|
|
||||||
|
核心检索 / 生成指标(始终可用):
|
||||||
|
|
||||||
- `faithfulness`
|
- `faithfulness`
|
||||||
- `answer_relevancy`
|
- `answer_relevancy`
|
||||||
- `context_recall`
|
- `context_recall`
|
||||||
- `context_precision`
|
- `context_precision`
|
||||||
|
|
||||||
|
鲁棒性 / 端到端指标(架构设计 §10.2,需数据集含 `ground_truth`):
|
||||||
|
|
||||||
|
- `noise_sensitivity` —— 鲁棒性:对检索噪声的敏感度
|
||||||
|
- `factual_correctness` —— 端到端:回答相对标准答案的事实正确性
|
||||||
|
- `semantic_similarity` —— 端到端:回答与标准答案的语义相似度(基于 embedding,无 LLM 调用)
|
||||||
|
|
||||||
|
所有指标都通过同一套装配点接入:`registry.py`(校验白名单)、`factory.py`(实例化)、`pipeline.py`(`ascore` 入参分发),新增指标只需在这三处对齐即可。
|
||||||
|
|
||||||
所以 metric pipeline 的职责可以总结为:
|
所以 metric pipeline 的职责可以总结为:
|
||||||
|
|
||||||
**把标准样本转换成结构化评分结果。**
|
**把标准样本转换成结构化评分结果。**
|
||||||
@@ -414,3 +424,39 @@ main.py
|
|||||||
- 可以把每次实验的资产稳定留住
|
- 可以把每次实验的资产稳定留住
|
||||||
|
|
||||||
这也是它和一次性离线脚本的根本区别。
|
这也是它和一次性离线脚本的根本区别。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 15. Optimization Advisor 链路
|
||||||
|
|
||||||
|
相关代码:
|
||||||
|
|
||||||
|
- `rag_eval/advisor/__init__.py` — 外部入口 `run_advisor()`
|
||||||
|
- `rag_eval/advisor/rules.py` — 规则引擎(纯函数,无 LLM),7 条指标诊断规则
|
||||||
|
- `rag_eval/advisor/llm_analyzer.py` — LLM 分析器(复用 judge_model llm 实例,失败自动降级)
|
||||||
|
- `rag_eval/advisor/writer.py` — 写出 `optimization_advice.md` + 日志摘要
|
||||||
|
|
||||||
|
Advisor 在 `write_run_artifacts()` 之后触发,仅当场景配置 `optimization_advisor: true` 时生效,默认关闭。
|
||||||
|
|
||||||
|
执行链路:
|
||||||
|
|
||||||
|
```text
|
||||||
|
run_advisor(result, scenario, llm)
|
||||||
|
-> rules.diagnose(score_rows, metrics) # 识别异常指标,选取 top-3 低分样本
|
||||||
|
-> llm_analyzer.analyze(diagnoses, llm) # LLM 生成中文建议(失败自动降级为纯规则报告)
|
||||||
|
-> writer.write_advice(...) # 写 optimization_advice.md + 日志摘要
|
||||||
|
```
|
||||||
|
|
||||||
|
输出产物追加在现有 run 目录:
|
||||||
|
|
||||||
|
```text
|
||||||
|
outputs/online/siemens-pdf-question-bank/<run_id>/
|
||||||
|
scenario.snapshot.yaml
|
||||||
|
scores.csv
|
||||||
|
invalid.csv
|
||||||
|
summary.md
|
||||||
|
metadata.json
|
||||||
|
optimization_advice.md <- 新增(optimization_advisor: true 时生成)
|
||||||
|
```
|
||||||
|
|
||||||
|
规则引擎对 7 个指标各自设 warning / critical 双档阈值,`noise_sensitivity` 为"越低越好"(方向相反)。所有诊断均附带 top-3 低分样本,喂给 LLM 生成针对具体内容的中文建议。
|
||||||
|
|||||||
1387
docs/superpowers/plans/2026-06-16-llm-profile-manager.md
Normal file
1387
docs/superpowers/plans/2026-06-16-llm-profile-manager.md
Normal file
File diff suppressed because it is too large
Load Diff
1378
docs/superpowers/plans/2026-06-16-optimization-advisor.md
Normal file
1378
docs/superpowers/plans/2026-06-16-optimization-advisor.md
Normal file
File diff suppressed because it is too large
Load Diff
1537
docs/superpowers/plans/2026-06-18-metric-doc-weights.md
Normal file
1537
docs/superpowers/plans/2026-06-18-metric-doc-weights.md
Normal file
File diff suppressed because it is too large
Load Diff
974
docs/superpowers/plans/2026-06-22-dify-score-api.md
Normal file
974
docs/superpowers/plans/2026-06-22-dify-score-api.md
Normal file
@@ -0,0 +1,974 @@
|
|||||||
|
# Dify 实时评分 API Implementation Plan
|
||||||
|
|
||||||
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||||
|
|
||||||
|
**Goal:** 新增 `POST /api/score` 端点,供 Dify 外部 Tool 调用,接受单条问答记录并同步返回 RAGAS 各指标得分。
|
||||||
|
|
||||||
|
**Architecture:** 新增 `inline_scorer.py` 服务层封装 RAGAS 打分逻辑,以 `(judge_model, embedding_model)` 为 key 缓存 LLM 客户端;新增 `webapp/api/score.py` 路由;`ScoreRequest`/`ScoreResponse` 放入 `webapp/models.py`;`SCORE_API_TOKEN` 加入 `EvaluationSettings`。
|
||||||
|
|
||||||
|
**Tech Stack:** Python 3.12, FastAPI, Pydantic v2, RAGAS 0.4.3, pytest
|
||||||
|
|
||||||
|
## Global Constraints
|
||||||
|
|
||||||
|
- Python 3.12+,PEP 8,4 空格缩进,类型注解必须
|
||||||
|
- contexts 用 `context_separator`(默认 `" |||| "`)拆分为 list[str]
|
||||||
|
- ground_truth 为可选;缺失时跳过 context_recall / factual_correctness / semantic_similarity / noise_sensitivity
|
||||||
|
- SCORE_API_TOKEN 为空时不鉴权(内网部署场景)
|
||||||
|
- 所有测试用 pytest,不依赖真实 LLM
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 文件清单
|
||||||
|
|
||||||
|
| 操作 | 文件 | 职责 |
|
||||||
|
|------|------|------|
|
||||||
|
| 新建 | `webapp/services/inline_scorer.py` | LLM 客户端缓存 + 单题打分 |
|
||||||
|
| 新建 | `webapp/api/score.py` | `/api/score` 路由 |
|
||||||
|
| 新建 | `tests/webapp/test_score_api.py` | 端点测试(全 mock) |
|
||||||
|
| 修改 | `webapp/models.py` | 新增 ScoreRequest / ScoreResponse |
|
||||||
|
| 修改 | `rag_eval/settings.py` | 新增 score_api_token 字段 |
|
||||||
|
| 修改 | `webapp/server.py` | 注册 score router,更新 OPENAPI_TAGS 和 description |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 1: ScoreRequest / ScoreResponse 模型 + settings 字段
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `webapp/models.py`
|
||||||
|
- Modify: `rag_eval/settings.py`
|
||||||
|
- Test: `tests/webapp/test_score_api.py` (partial — model validation tests)
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Produces:
|
||||||
|
- `ScoreRequest` Pydantic model(见下方字段)
|
||||||
|
- `ScoreResponse` Pydantic model
|
||||||
|
- `EvaluationSettings.score_api_token: str | None`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing model-validation tests**
|
||||||
|
|
||||||
|
Create `tests/webapp/test_score_api.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Tests for POST /api/score endpoint."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError
|
||||||
|
from webapp.models import ScoreRequest, ScoreResponse
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreRequest:
|
||||||
|
def test_minimal_valid_request(self):
|
||||||
|
"""Only required fields — question, answer, contexts."""
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="What is CT?",
|
||||||
|
answer="CT is imaging.",
|
||||||
|
contexts="CT uses X-rays.",
|
||||||
|
)
|
||||||
|
assert req.question == "What is CT?"
|
||||||
|
assert req.contexts == "CT uses X-rays."
|
||||||
|
assert req.ground_truth is None
|
||||||
|
assert req.context_separator == " |||| "
|
||||||
|
assert req.metrics == ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]
|
||||||
|
|
||||||
|
def test_contexts_split_by_separator(self):
|
||||||
|
"""contexts_as_list() splits on context_separator."""
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a",
|
||||||
|
contexts="ctx1 |||| ctx2 |||| ctx3",
|
||||||
|
context_separator=" |||| ",
|
||||||
|
)
|
||||||
|
assert req.contexts_as_list() == ["ctx1", "ctx2", "ctx3"]
|
||||||
|
|
||||||
|
def test_contexts_split_custom_separator(self):
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a",
|
||||||
|
contexts="a---b---c",
|
||||||
|
context_separator="---",
|
||||||
|
)
|
||||||
|
assert req.contexts_as_list() == ["a", "b", "c"]
|
||||||
|
|
||||||
|
def test_contexts_split_single_item(self):
|
||||||
|
req = ScoreRequest(question="q", answer="a", contexts="only one")
|
||||||
|
assert req.contexts_as_list() == ["only one"]
|
||||||
|
|
||||||
|
def test_missing_question_raises(self):
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScoreRequest(answer="a", contexts="c") # type: ignore[call-arg]
|
||||||
|
|
||||||
|
def test_missing_answer_raises(self):
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScoreRequest(question="q", contexts="c") # type: ignore[call-arg]
|
||||||
|
|
||||||
|
def test_missing_contexts_raises(self):
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScoreRequest(question="q", answer="a") # type: ignore[call-arg]
|
||||||
|
|
||||||
|
def test_custom_metrics_accepted(self):
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a", contexts="c",
|
||||||
|
metrics=["faithfulness"],
|
||||||
|
)
|
||||||
|
assert req.metrics == ["faithfulness"]
|
||||||
|
|
||||||
|
def test_invalid_metric_name_raises(self):
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScoreRequest(question="q", answer="a", contexts="c", metrics=["not_a_metric"])
|
||||||
|
|
||||||
|
def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
|
||||||
|
"""Without ground_truth, GT-dependent metrics are excluded."""
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a", contexts="c",
|
||||||
|
metrics=["faithfulness", "context_recall", "factual_correctness", "semantic_similarity", "noise_sensitivity"],
|
||||||
|
)
|
||||||
|
effective = req.effective_metrics()
|
||||||
|
assert "faithfulness" in effective
|
||||||
|
assert "context_recall" not in effective
|
||||||
|
assert "factual_correctness" not in effective
|
||||||
|
assert "semantic_similarity" not in effective
|
||||||
|
assert "noise_sensitivity" not in effective
|
||||||
|
|
||||||
|
def test_effective_metrics_keeps_all_when_ground_truth_present(self):
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a", contexts="c", ground_truth="gt",
|
||||||
|
metrics=["faithfulness", "context_recall", "factual_correctness"],
|
||||||
|
)
|
||||||
|
effective = req.effective_metrics()
|
||||||
|
assert effective == ["faithfulness", "context_recall", "factual_correctness"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreResponse:
|
||||||
|
def test_score_response_structure(self):
|
||||||
|
resp = ScoreResponse(
|
||||||
|
scores={"faithfulness": 0.85, "answer_relevancy": None},
|
||||||
|
weighted_score=0.85,
|
||||||
|
latency_ms=1200,
|
||||||
|
)
|
||||||
|
assert resp.scores["faithfulness"] == 0.85
|
||||||
|
assert resp.scores["answer_relevancy"] is None
|
||||||
|
assert resp.latency_ms == 1200
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
cd C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
|
||||||
|
```
|
||||||
|
Expected: `ImportError: cannot import name 'ScoreRequest' from 'webapp.models'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: Add ScoreRequest and ScoreResponse to `webapp/models.py`**
|
||||||
|
|
||||||
|
Append to the end of `webapp/models.py` (after `PipelineJobResponse`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Dify 实时评分 API 模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# 需要 ground_truth 才能计算的指标集合
|
||||||
|
_GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
|
||||||
|
"context_recall",
|
||||||
|
"factual_correctness",
|
||||||
|
"semantic_similarity",
|
||||||
|
"noise_sensitivity",
|
||||||
|
})
|
||||||
|
|
||||||
|
# 所有合法指标名称
|
||||||
|
_VALID_METRICS: frozenset[str] = frozenset({
|
||||||
|
"faithfulness",
|
||||||
|
"answer_relevancy",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
"noise_sensitivity",
|
||||||
|
"factual_correctness",
|
||||||
|
"semantic_similarity",
|
||||||
|
})
|
||||||
|
|
||||||
|
_DEFAULT_SCORE_METRICS: list[str] = [
|
||||||
|
"faithfulness",
|
||||||
|
"answer_relevancy",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreRequest(BaseModel):
|
||||||
|
"""Request body for the real-time single-sample scoring endpoint."""
|
||||||
|
|
||||||
|
model_config = ConfigDict(
|
||||||
|
json_schema_extra={
|
||||||
|
"examples": [
|
||||||
|
{
|
||||||
|
"summary": "基础评分请求",
|
||||||
|
"value": {
|
||||||
|
"question": "双源CT的时间分辨率是多少?",
|
||||||
|
"answer": "双源CT的单扇区时间分辨率为75ms。",
|
||||||
|
"contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
|
||||||
|
"ground_truth": "双源CT单扇区时间分辨率为75ms,需旋转135度。",
|
||||||
|
"context_separator": " |||| ",
|
||||||
|
"metrics": ["faithfulness", "answer_relevancy", "context_recall", "context_precision"],
|
||||||
|
"judge_model": "deepseek-v4-flash",
|
||||||
|
"embedding_model": "text-embedding-v3",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
question: str = Field(description="问题文本。")
|
||||||
|
answer: str = Field(description="待评分的回答。")
|
||||||
|
contexts: str = Field(
|
||||||
|
description="检索上下文字符串,多段之间用 context_separator 拼接。"
|
||||||
|
)
|
||||||
|
ground_truth: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="标准参考答案(可选)。缺失时自动跳过需要它的指标。",
|
||||||
|
)
|
||||||
|
context_separator: str = Field(
|
||||||
|
default=" |||| ",
|
||||||
|
description="contexts 字段中段落分隔符,默认为四个竖线两侧各一空格。",
|
||||||
|
)
|
||||||
|
metrics: list[str] = Field(
|
||||||
|
default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
|
||||||
|
description="需要计算的 RAGAS 指标列表。",
|
||||||
|
)
|
||||||
|
judge_model: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Judge LLM 模型名称;为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
|
||||||
|
)
|
||||||
|
embedding_model: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Embedding 模型名称;为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
|
||||||
|
)
|
||||||
|
|
||||||
|
@field_validator("metrics")
|
||||||
|
@classmethod
|
||||||
|
def validate_metric_names(cls, value: list[str]) -> list[str]:
|
||||||
|
"""Reject any metric name not in the supported registry."""
|
||||||
|
invalid = [m for m in value if m not in _VALID_METRICS]
|
||||||
|
if invalid:
|
||||||
|
raise ValueError(
|
||||||
|
f"不支持的指标名称:{invalid}。"
|
||||||
|
f"合法值:{sorted(_VALID_METRICS)}"
|
||||||
|
)
|
||||||
|
if not value:
|
||||||
|
raise ValueError("metrics 不能为空列表。")
|
||||||
|
return value
|
||||||
|
|
||||||
|
def contexts_as_list(self) -> list[str]:
|
||||||
|
"""Split the contexts string into a list of non-empty fragments."""
|
||||||
|
sep = self.context_separator or " |||| "
|
||||||
|
return [s.strip() for s in self.contexts.split(sep) if s.strip()]
|
||||||
|
|
||||||
|
def effective_metrics(self) -> list[str]:
|
||||||
|
"""Return metrics filtered to exclude GT-dependent ones when ground_truth is absent."""
|
||||||
|
if self.ground_truth is not None:
|
||||||
|
return list(self.metrics)
|
||||||
|
return [m for m in self.metrics if m not in _GT_DEPENDENT_METRICS]
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreResponse(BaseModel):
|
||||||
|
"""Response payload for the real-time scoring endpoint."""
|
||||||
|
|
||||||
|
scores: dict[str, float | None] = Field(
|
||||||
|
description="各指标得分(NaN 或计算失败时为 null)。"
|
||||||
|
)
|
||||||
|
weighted_score: float | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="等权加权综合得分(仅对非 null 指标求均值)。",
|
||||||
|
)
|
||||||
|
latency_ms: int = Field(description="服务端打分耗时(毫秒)。")
|
||||||
|
skipped_metrics: list[str] = Field(
|
||||||
|
default_factory=list,
|
||||||
|
description="因缺少 ground_truth 而跳过的指标名称列表。",
|
||||||
|
)
|
||||||
|
error: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="打分异常时的错误信息(HTTP 200 仍返回,scores 为空)。",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Also add `field_validator` to the import line at the top of `webapp/models.py`:
|
||||||
|
```python
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Add `score_api_token` to `rag_eval/settings.py`**
|
||||||
|
|
||||||
|
Add after the `dataset_generator_model` field:
|
||||||
|
```python
|
||||||
|
score_api_token: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
alias="SCORE_API_TOKEN",
|
||||||
|
description="Bearer token for /api/score endpoint. Empty = no auth.",
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Run to verify PASS**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
|
||||||
|
```
|
||||||
|
Expected: all 12 tests PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 6: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/models.py rag_eval/settings.py tests/webapp/test_score_api.py
|
||||||
|
git commit -m "feat: add ScoreRequest/ScoreResponse models and SCORE_API_TOKEN setting"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 2: InlineScorer 服务(LLM 缓存 + 打分)
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `webapp/services/inline_scorer.py`
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Consumes:
|
||||||
|
- `build_models(judge_model, embedding_model, settings) -> tuple[Any, Any]` from `rag_eval.metrics.factory`
|
||||||
|
- `MetricPipeline(metrics, metric_timeout_seconds)` from `rag_eval.metrics.pipeline`
|
||||||
|
- `NormalizedSample` from `rag_eval.shared.models`
|
||||||
|
- `compute_weighted_score(scores, metric_weights) -> float | None` from `rag_eval.metrics.weights`
|
||||||
|
- `EvaluationSettings` from `rag_eval.settings`
|
||||||
|
- Produces:
|
||||||
|
- `inline_scorer: InlineScorer` (module-level singleton)
|
||||||
|
- `InlineScorer.score(question, answer, contexts, ground_truth, metrics, judge_model, embedding_model, settings) -> dict[str, float | None]`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing test**
|
||||||
|
|
||||||
|
Add to `tests/webapp/test_score_api.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class TestInlineScorer:
|
||||||
|
def test_score_returns_dict_with_requested_metrics(self):
|
||||||
|
"""InlineScorer.score returns a dict keyed by the requested metrics."""
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
from webapp.services.inline_scorer import InlineScorer
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
mock_score = MagicMock()
|
||||||
|
mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8}
|
||||||
|
mock_score.error = ""
|
||||||
|
|
||||||
|
mock_pipeline = MagicMock()
|
||||||
|
mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
|
||||||
|
|
||||||
|
with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
|
||||||
|
with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
|
||||||
|
with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
|
||||||
|
scorer = InlineScorer()
|
||||||
|
result = scorer.score(
|
||||||
|
question="q", answer="a",
|
||||||
|
contexts=["ctx1"],
|
||||||
|
ground_truth=None,
|
||||||
|
metrics=["faithfulness", "answer_relevancy"],
|
||||||
|
judge_model="test-model",
|
||||||
|
embedding_model="test-embed",
|
||||||
|
settings=EvaluationSettings(_env_file=None),
|
||||||
|
)
|
||||||
|
assert "faithfulness" in result
|
||||||
|
assert "answer_relevancy" in result
|
||||||
|
assert result["faithfulness"] == pytest.approx(0.9)
|
||||||
|
|
||||||
|
def test_score_converts_nan_to_none(self):
|
||||||
|
"""NaN scores are converted to None in the returned dict."""
|
||||||
|
import math
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
from webapp.services.inline_scorer import InlineScorer
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
mock_score = MagicMock()
|
||||||
|
mock_score.metrics = {"faithfulness": float("nan")}
|
||||||
|
mock_score.error = ""
|
||||||
|
|
||||||
|
mock_pipeline = MagicMock()
|
||||||
|
mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
|
||||||
|
|
||||||
|
with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
|
||||||
|
with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
|
||||||
|
with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
|
||||||
|
scorer = InlineScorer()
|
||||||
|
result = scorer.score(
|
||||||
|
question="q", answer="a", contexts=["c"],
|
||||||
|
ground_truth=None,
|
||||||
|
metrics=["faithfulness"],
|
||||||
|
judge_model="m", embedding_model="e",
|
||||||
|
settings=EvaluationSettings(_env_file=None),
|
||||||
|
)
|
||||||
|
assert result["faithfulness"] is None
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
|
||||||
|
```
|
||||||
|
Expected: `ModuleNotFoundError: No module named 'webapp.services.inline_scorer'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: Create `webapp/services/inline_scorer.py`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""LLM-cached inline RAGAS scorer for the real-time /api/score endpoint.
|
||||||
|
|
||||||
|
A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by
|
||||||
|
(judge_model, embedding_model), so repeated Dify Tool calls with the same
|
||||||
|
models reuse existing AsyncOpenAI connections instead of creating new ones.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rag_eval.compat import ensure_ragas_import_compat
|
||||||
|
from rag_eval.metrics.factory import build_models
|
||||||
|
from rag_eval.metrics.pipeline import MetricPipeline
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from rag_eval.shared.models import NormalizedSample
|
||||||
|
|
||||||
|
ensure_ragas_import_compat()
|
||||||
|
|
||||||
|
from ragas.metrics.collections import ( # noqa: E402
|
||||||
|
AnswerRelevancy,
|
||||||
|
ContextPrecision,
|
||||||
|
ContextRecall,
|
||||||
|
FactualCorrectness,
|
||||||
|
Faithfulness,
|
||||||
|
NoiseSensitivity,
|
||||||
|
SemanticSimilarity,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]:
|
||||||
|
"""Instantiate only the RAGAS metric objects requested."""
|
||||||
|
registry: dict[str, Any] = {
|
||||||
|
"faithfulness": Faithfulness(llm=llm),
|
||||||
|
"answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
|
||||||
|
"context_recall": ContextRecall(llm=llm),
|
||||||
|
"context_precision": ContextPrecision(llm=llm),
|
||||||
|
"noise_sensitivity": NoiseSensitivity(llm=llm),
|
||||||
|
"factual_correctness": FactualCorrectness(llm=llm),
|
||||||
|
"semantic_similarity": SemanticSimilarity(embeddings=embeddings),
|
||||||
|
}
|
||||||
|
return {name: registry[name] for name in metrics if name in registry}
|
||||||
|
|
||||||
|
|
||||||
|
class InlineScorer:
|
||||||
|
"""Thread-safe single-sample RAGAS scorer with LLM client caching."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
# Cache keyed by (judge_model, embedding_model) -> (llm, embeddings)
|
||||||
|
self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
|
||||||
|
self._lock = threading.Lock()
|
||||||
|
|
||||||
|
def _get_models(
|
||||||
|
self,
|
||||||
|
judge_model: str,
|
||||||
|
embedding_model: str,
|
||||||
|
settings: EvaluationSettings,
|
||||||
|
) -> tuple[Any, Any]:
|
||||||
|
"""Return cached LLM/embedding clients, building them on first use."""
|
||||||
|
cache_key = (judge_model, embedding_model)
|
||||||
|
with self._lock:
|
||||||
|
if cache_key not in self._model_cache:
|
||||||
|
llm, embeddings = build_models(judge_model, embedding_model, settings)
|
||||||
|
self._model_cache[cache_key] = (llm, embeddings)
|
||||||
|
return self._model_cache[cache_key]
|
||||||
|
|
||||||
|
def score(
|
||||||
|
self,
|
||||||
|
question: str,
|
||||||
|
answer: str,
|
||||||
|
contexts: list[str],
|
||||||
|
ground_truth: str | None,
|
||||||
|
metrics: list[str],
|
||||||
|
judge_model: str,
|
||||||
|
embedding_model: str,
|
||||||
|
settings: EvaluationSettings,
|
||||||
|
) -> dict[str, float | None]:
|
||||||
|
"""Score one sample synchronously and return {metric_name: score | None}.
|
||||||
|
|
||||||
|
NaN and infinite values from RAGAS are converted to None for clean JSON serialization; finite scores are rounded to 4 decimal places.
|
||||||
|
"""
|
||||||
|
llm, embeddings = self._get_models(judge_model, embedding_model, settings)
|
||||||
|
metric_instances = _build_metric_instances(metrics, llm, embeddings)
|
||||||
|
|
||||||
|
pipeline = MetricPipeline(
|
||||||
|
metrics=metric_instances,
|
||||||
|
metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
|
||||||
|
)
|
||||||
|
|
||||||
|
sample = NormalizedSample(
|
||||||
|
sample_id="inline-score",
|
||||||
|
question=question,
|
||||||
|
answer=answer,
|
||||||
|
contexts=contexts,
|
||||||
|
ground_truth=ground_truth or "",
|
||||||
|
)
|
||||||
|
|
||||||
|
metric_score = asyncio.run(pipeline.score_sample(sample))
|
||||||
|
|
||||||
|
# Convert NaN / ±Inf → None (and round finite scores) for clean JSON output
|
||||||
|
return {
|
||||||
|
name: (None if math.isnan(v) or math.isinf(v) else round(v, 4))
|
||||||
|
for name, v in metric_score.metrics.items()
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
|
||||||
|
inline_scorer = InlineScorer()
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Run to verify PASS**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
|
||||||
|
```
|
||||||
|
Expected: both tests PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/services/inline_scorer.py tests/webapp/test_score_api.py
|
||||||
|
git commit -m "feat: add InlineScorer service with LLM client caching"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 3: `/api/score` 路由 + 鉴权 + 集成测试
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `webapp/api/score.py`
|
||||||
|
- Modify: `webapp/server.py`
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Consumes:
|
||||||
|
- `ScoreRequest`, `ScoreResponse` from `webapp.models`
|
||||||
|
- `inline_scorer: InlineScorer` from `webapp.services.inline_scorer`
|
||||||
|
- `EvaluationSettings` from `rag_eval.settings`
|
||||||
|
- `compute_weighted_score(scores, {}) -> float | None` from `rag_eval.metrics.weights`
|
||||||
|
- Produces: `POST /api/score` endpoint
|
||||||
|
|
||||||
|
- [ ] **Step 1: Write failing endpoint tests**
|
||||||
|
|
||||||
|
Add to `tests/webapp/test_score_api.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ── Fixtures ─────────────────────────────────────────────────────────────────
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def client(monkeypatch):
|
||||||
|
"""TestClient with mocked InlineScorer."""
|
||||||
|
import webapp.api.score as score_mod
|
||||||
|
|
||||||
|
mock_scorer = MagicMock()
|
||||||
|
mock_scorer.score.return_value = {
|
||||||
|
"faithfulness": 0.85,
|
||||||
|
"answer_relevancy": 0.90,
|
||||||
|
}
|
||||||
|
monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
|
||||||
|
|
||||||
|
from webapp.server import create_app
|
||||||
|
return TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreEndpoint:
|
||||||
|
def test_post_score_returns_200(self, client):
|
||||||
|
resp = client.post("/api/score", json={
|
||||||
|
"question": "What is CT?",
|
||||||
|
"answer": "CT is imaging.",
|
||||||
|
"contexts": "CT uses X-rays.",
|
||||||
|
})
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert "scores" in data
|
||||||
|
assert "latency_ms" in data
|
||||||
|
assert data["scores"]["faithfulness"] == pytest.approx(0.85)
|
||||||
|
|
||||||
|
def test_weighted_score_computed(self, client):
|
||||||
|
resp = client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a", "contexts": "c",
|
||||||
|
})
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
# weighted_score is the mean of all non-null scores
|
||||||
|
assert data["weighted_score"] is not None
|
||||||
|
|
||||||
|
def test_missing_required_fields_returns_422(self, client):
|
||||||
|
resp = client.post("/api/score", json={"question": "q"})
|
||||||
|
assert resp.status_code == 422
|
||||||
|
|
||||||
|
def test_invalid_metric_name_returns_422(self, client):
|
||||||
|
resp = client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a", "contexts": "c",
|
||||||
|
"metrics": ["not_a_metric"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 422
|
||||||
|
|
||||||
|
def test_skipped_metrics_returned_when_no_ground_truth(self, client):
|
||||||
|
resp = client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a", "contexts": "c",
|
||||||
|
"metrics": ["faithfulness", "context_recall"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert "context_recall" in data["skipped_metrics"]
|
||||||
|
|
||||||
|
def test_contexts_split_on_separator(self, client, monkeypatch):
|
||||||
|
"""contexts string is split before passing to scorer."""
|
||||||
|
import webapp.api.score as score_mod
|
||||||
|
calls = []
|
||||||
|
def capture(*args, **kwargs):
|
||||||
|
calls.append(kwargs.get("contexts", []))
|
||||||
|
return {"faithfulness": 0.9}
|
||||||
|
monkeypatch.setattr(score_mod.inline_scorer, "score", capture)
|
||||||
|
|
||||||
|
client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a",
|
||||||
|
"contexts": "ctx1 |||| ctx2",
|
||||||
|
"context_separator": " |||| ",
|
||||||
|
})
|
||||||
|
assert calls[0] == ["ctx1", "ctx2"]
|
||||||
|
|
||||||
|
def test_bearer_token_auth_required_when_configured(self, monkeypatch):
|
||||||
|
"""When SCORE_API_TOKEN is set, requests without token get 401."""
|
||||||
|
import webapp.api.score as score_mod
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
mock_settings = EvaluationSettings(_env_file=None)
|
||||||
|
object.__setattr__(mock_settings, "score_api_token", "secret-token")
|
||||||
|
monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
|
||||||
|
|
||||||
|
mock_scorer = MagicMock()
|
||||||
|
mock_scorer.score.return_value = {"faithfulness": 0.9}
|
||||||
|
monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
|
||||||
|
|
||||||
|
from webapp.server import create_app
|
||||||
|
test_client = TestClient(create_app())
|
||||||
|
|
||||||
|
# No auth header → 401
|
||||||
|
resp = test_client.post("/api/score", json={
|
||||||
|
"question": "q", "answer": "a", "contexts": "c",
|
||||||
|
})
|
||||||
|
assert resp.status_code == 401
|
||||||
|
|
||||||
|
# Correct token → 200
|
||||||
|
resp = test_client.post("/api/score",
|
||||||
|
json={"question": "q", "answer": "a", "contexts": "c"},
|
||||||
|
headers={"Authorization": "Bearer secret-token"},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
|
||||||
|
def test_wrong_bearer_token_returns_401(self, monkeypatch):
|
||||||
|
import webapp.api.score as score_mod
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
mock_settings = EvaluationSettings(_env_file=None)
|
||||||
|
object.__setattr__(mock_settings, "score_api_token", "correct-token")
|
||||||
|
monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
|
||||||
|
|
||||||
|
mock_scorer = MagicMock()
|
||||||
|
mock_scorer.score.return_value = {}
|
||||||
|
monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
|
||||||
|
|
||||||
|
from webapp.server import create_app
|
||||||
|
test_client = TestClient(create_app())
|
||||||
|
resp = test_client.post("/api/score",
|
||||||
|
json={"question": "q", "answer": "a", "contexts": "c"},
|
||||||
|
headers={"Authorization": "Bearer wrong-token"},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 401
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py::TestScoreEndpoint -v
|
||||||
|
```
|
||||||
|
Expected: `ModuleNotFoundError: No module named 'webapp.api.score'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: Create `webapp/api/score.py`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Header, HTTPException
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from webapp.models import ScoreRequest, ScoreResponse
|
||||||
|
from webapp.services.inline_scorer import inline_scorer
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
|
||||||
|
|
||||||
|
def _get_settings() -> EvaluationSettings:
|
||||||
|
"""Return a fresh EvaluationSettings instance (overridable in tests)."""
|
||||||
|
return EvaluationSettings()
|
||||||
|
|
||||||
|
|
||||||
|
def _check_auth(authorization: str | None, token: str) -> None:
|
||||||
|
"""Raise 401 if Bearer token does not match the configured token."""
|
||||||
|
if authorization is None:
|
||||||
|
raise HTTPException(status_code=401, detail="Missing Authorization header.")
|
||||||
|
parts = authorization.split(" ", 1)
|
||||||
|
if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
|
||||||
|
raise HTTPException(status_code=401, detail="Invalid Bearer token.")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"",
|
||||||
|
response_model=ScoreResponse,
|
||||||
|
summary="单题实时评分(Dify 外部 Tool)",
|
||||||
|
responses={
|
||||||
|
200: {"description": "各指标得分和加权综合得分。"},
|
||||||
|
401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
|
||||||
|
422: {"description": "请求参数校验失败。"},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def score_sample(
|
||||||
|
request: ScoreRequest,
|
||||||
|
authorization: Annotated[str | None, Header()] = None,
|
||||||
|
) -> ScoreResponse:
|
||||||
|
"""接受单条问答记录,同步运行 RAGAS 指标打分,实时返回各指标得分。
|
||||||
|
|
||||||
|
供 Dify 外部 Tool 调用。将 `contexts` 字段按 `context_separator` 拆分后传入
|
||||||
|
RAGAS 管道;`ground_truth` 缺失时自动跳过依赖它的指标。
|
||||||
|
"""
|
||||||
|
settings = _get_settings()
|
||||||
|
|
||||||
|
# 鉴权(仅在配置了 token 时生效)
|
||||||
|
if settings.score_api_token:
|
||||||
|
_check_auth(authorization, settings.score_api_token)
|
||||||
|
|
||||||
|
judge_model = request.judge_model or settings.ragas_judge_model
|
||||||
|
embedding_model = request.embedding_model or settings.ragas_embedding_model
|
||||||
|
effective = request.effective_metrics()
|
||||||
|
requested = set(request.metrics)
|
||||||
|
skipped = sorted(requested - set(effective))
|
||||||
|
|
||||||
|
if not effective:
|
||||||
|
# All requested metrics require ground_truth which is absent.
|
||||||
|
return ScoreResponse(
|
||||||
|
scores={m: None for m in request.metrics},
|
||||||
|
weighted_score=None,
|
||||||
|
latency_ms=0,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
)
|
||||||
|
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
raw_scores = inline_scorer.score(
|
||||||
|
question=request.question,
|
||||||
|
answer=request.answer,
|
||||||
|
contexts=request.contexts_as_list(),
|
||||||
|
ground_truth=request.ground_truth,
|
||||||
|
metrics=effective,
|
||||||
|
judge_model=judge_model,
|
||||||
|
embedding_model=embedding_model,
|
||||||
|
settings=settings,
|
||||||
|
)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
return ScoreResponse(
|
||||||
|
scores={},
|
||||||
|
weighted_score=None,
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
error=f"{type(exc).__name__}: {exc}",
|
||||||
|
)
|
||||||
|
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
|
||||||
|
# Merge: skipped metrics appear as null in final scores dict.
|
||||||
|
all_scores: dict[str, float | None] = {m: None for m in request.metrics}
|
||||||
|
all_scores.update(raw_scores)
|
||||||
|
|
||||||
|
# Weighted score = equal-weight mean of non-null effective scores.
|
||||||
|
weighted = compute_weighted_score(
|
||||||
|
{k: v for k, v in raw_scores.items() if v is not None},
|
||||||
|
{},
|
||||||
|
)
|
||||||
|
|
||||||
|
return ScoreResponse(
|
||||||
|
scores=all_scores,
|
||||||
|
weighted_score=round(weighted, 4) if weighted is not None else None,
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Register router in `webapp/server.py`**
|
||||||
|
|
||||||
|
Add `score` to the import line:
|
||||||
|
```python
|
||||||
|
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
|
||||||
|
```
|
||||||
|
|
||||||
|
Add the router registration after `pipeline.router`:
|
||||||
|
```python
|
||||||
|
app.include_router(score.router)
|
||||||
|
```
|
||||||
|
|
||||||
|
Add `"score"` tag to `OPENAPI_TAGS` list (insert before `"meta"`):
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"name": "score",
|
||||||
|
"description": (
|
||||||
|
"**实时评分 API(Dify 外部 Tool)**\n\n"
|
||||||
|
"接受单条问答记录 `(question, answer, contexts, ground_truth)`,\n"
|
||||||
|
"同步运行 RAGAS 指标打分,返回各指标得分和加权综合得分。\n\n"
|
||||||
|
"适用场景:Dify Agent 在回答后即时调用,用于质量监控或自我改进。\n\n"
|
||||||
|
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
|
||||||
|
"`Authorization: Bearer <token>` 请求头。"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
```
|
||||||
|
|
||||||
|
Also update the `description` field in `FastAPI(...)` to add a bullet:
|
||||||
|
```python
|
||||||
|
"- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Run to verify PASS**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_api.py -v
|
||||||
|
```
|
||||||
|
Expected: all tests PASS.
|
||||||
|
|
||||||
|
- [ ] **Step 6: Verify server boots and route appears**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -c "
|
||||||
|
from webapp.server import create_app
|
||||||
|
app = create_app()
|
||||||
|
routes = [(r.path, list(getattr(r,'methods',[]))) for r in app.routes]
|
||||||
|
score_routes = [(p,m) for p,m in routes if 'score' in p]
|
||||||
|
print('Score routes:', score_routes)
|
||||||
|
"
|
||||||
|
```
|
||||||
|
Expected output:
|
||||||
|
```
|
||||||
|
Score routes: [('/api/score', ['POST'])]
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 7: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/api/score.py webapp/server.py tests/webapp/test_score_api.py
|
||||||
|
git commit -m "feat: add POST /api/score endpoint for Dify real-time scoring"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 4: 全量回归 + `.env.example` 更新
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `.env.example`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add SCORE_API_TOKEN to `.env.example`**
|
||||||
|
|
||||||
|
Add this block after `DATASET_GENERATOR_MODEL=qwen3.6-plus`:
|
||||||
|
|
||||||
|
```
|
||||||
|
# ===== Dify 集成 — 实时评分 API =====
|
||||||
|
# 为 /api/score 端点设置 Bearer Token 鉴权(留空则不鉴权,适合内网部署)
|
||||||
|
# Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
|
||||||
|
SCORE_API_TOKEN=
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run full test suite**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/ -v --tb=short
|
||||||
|
```
|
||||||
|
|
||||||
|
Pre-existing failures to ignore:
|
||||||
|
- `test_normalize_sample_pdf_offline_smoke_row` — 缺少 CSV fixture
|
||||||
|
- `test_evaluator_and_reporting_write_run_assets` — 预存在的断言不匹配
|
||||||
|
- `test_question_generator_rejects_invalid_json` — retry 循环吞掉了 ValueError
|
||||||
|
- `test_question_generator_rejects_non_list_samples` — 同上
|
||||||
|
|
||||||
|
**零新增失败**即为通过。
|
||||||
|
|
||||||
|
- [ ] **Step 3: Final commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add .env.example
|
||||||
|
git commit -m "feat: Dify score API complete — add SCORE_API_TOKEN to .env.example
|
||||||
|
|
||||||
|
- POST /api/score: real-time RAGAS scoring for Dify external Tool
|
||||||
|
- ScoreRequest/ScoreResponse Pydantic models with full field docs
|
||||||
|
- InlineScorer with (judge_model, embedding_model) client cache
|
||||||
|
- Bearer token auth via SCORE_API_TOKEN env var (optional)
|
||||||
|
- contexts split by configurable separator (default ' |||| ')
|
||||||
|
- GT-dependent metrics auto-skipped when ground_truth absent
|
||||||
|
- Full test coverage (22 new tests)
|
||||||
|
|
||||||
|
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dify 侧配置参考
|
||||||
|
|
||||||
|
任务完成后,在 Dify 「工具」→「自定义工具」中填写如下 OpenAPI Schema:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
openapi: 3.1.0
|
||||||
|
info:
|
||||||
|
title: RAGAS 实时评分
|
||||||
|
version: 1.0.0
|
||||||
|
servers:
|
||||||
|
- url: http://<your-server>:8800
|
||||||
|
paths:
|
||||||
|
/api/score:
|
||||||
|
post:
|
||||||
|
operationId: scoreQA
|
||||||
|
summary: 对一条问答记录进行 RAGAS 评分
|
||||||
|
requestBody:
|
||||||
|
required: true
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
type: object
|
||||||
|
required: [question, answer, contexts]
|
||||||
|
properties:
|
||||||
|
question: { type: string }
|
||||||
|
answer: { type: string }
|
||||||
|
contexts: { type: string, description: "多段上下文用 ' |||| ' 拼接" }
|
||||||
|
ground_truth: { type: string }
|
||||||
|
metrics:
|
||||||
|
type: array
|
||||||
|
items: { type: string }
|
||||||
|
default: [faithfulness, answer_relevancy, context_recall, context_precision]
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: 评分结果
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
scores: { type: object }
|
||||||
|
weighted_score: { type: number }
|
||||||
|
latency_ms: { type: integer }
|
||||||
|
skipped_metrics: { type: array, items: { type: string } }
|
||||||
|
```
|
||||||
808
docs/superpowers/plans/2026-06-24-async-score-jobs.md
Normal file
808
docs/superpowers/plans/2026-06-24-async-score-jobs.md
Normal file
@@ -0,0 +1,808 @@
|
|||||||
|
# 异步评分记录(Async Score Jobs)Implementation Plan
|
||||||
|
|
||||||
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||||
|
|
||||||
|
**Goal:** 新增 `POST /api/score/async` 异步端点,结果持久化至 `outputs/score-jobs/`,并在前端新增「评分记录」页面展示。
|
||||||
|
|
||||||
|
**Architecture:** 新建 `ScoreJobManager`(复用 `pipeline_task_manager` 线程池模式)在后台执行 `InlineScorer.score()`,写入 JSON 文件;新增三个 REST 端点;前端新增导航页加载并轮询记录。
|
||||||
|
|
||||||
|
**Tech Stack:** Python 3.12, FastAPI, Pydantic v2, threading, Vanilla JS, pytest
|
||||||
|
|
||||||
|
## Global Constraints
|
||||||
|
|
||||||
|
- Python 3.12+,PEP 8,4 空格缩进,类型注解必须
|
||||||
|
- 存储路径:`outputs/score-jobs/<job_id>.json`
|
||||||
|
- 复用现有 `ScoreRequest`(含 `effective_metrics()` 和 `contexts_as_list()` 方法)
|
||||||
|
- 复用现有 `InlineScorer.score()` 和 `compute_weighted_score()`
|
||||||
|
- 所有测试用 pytest,不依赖真实 LLM
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 文件清单
|
||||||
|
|
||||||
|
| 操作 | 文件 | 职责 |
|
||||||
|
|------|------|------|
|
||||||
|
| 新建 | `webapp/services/score_job_manager.py` | ScoreJobManager:线程池 + JSON 持久化 |
|
||||||
|
| 新建 | `webapp/api/score_jobs.py` | 3 个端点路由 |
|
||||||
|
| 新建 | `webapp/static/js/score_jobs.js` | 前端列表 + 轮询逻辑 |
|
||||||
|
| 新建 | `tests/webapp/test_score_jobs_api.py` | API 集成测试 |
|
||||||
|
| 修改 | `webapp/models.py` | 新增 `AsyncScoreJobStatus`、`AsyncScoreJobResponse` |
|
||||||
|
| 修改 | `webapp/server.py` | 注册 score_jobs router,更新 OPENAPI_TAGS 和 description |
|
||||||
|
| 修改 | `webapp/static/index.html` | 新增导航项 + `#view-scorejobs` section |
|
||||||
|
| 修改 | `webapp/static/js/api.js` | 新增 `scoreJobsAsync()`、`getScoreJob()`、`listScoreJobs()` |
|
||||||
|
| 修改 | `webapp/static/js/app.js` | 注册 `scorejobs` 视图、加载调用 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 1: Pydantic 模型 + ScoreJobManager
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `webapp/models.py`
|
||||||
|
- Create: `webapp/services/score_job_manager.py`
|
||||||
|
- Create: `tests/webapp/test_score_jobs_api.py` (partial)
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Produces:
|
||||||
|
- `AsyncScoreJobStatus` Pydantic model
|
||||||
|
- `AsyncScoreJobResponse` Pydantic model
|
||||||
|
- `score_job_manager: ScoreJobManager` singleton
|
||||||
|
- `ScoreJobManager.submit(request: ScoreRequest) -> AsyncScoreJobStatus`
|
||||||
|
- `ScoreJobManager.get(job_id: str) -> AsyncScoreJobStatus | None`
|
||||||
|
- `ScoreJobManager.list_jobs() -> list[AsyncScoreJobStatus]`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add models to `webapp/models.py`**
|
||||||
|
|
||||||
|
Append at the end of the file, after `ScoreResponse`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 异步评分记录模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AsyncScoreJobResponse(BaseModel):
|
||||||
|
"""Immediate response after submitting an async score job."""
|
||||||
|
|
||||||
|
job_id: str = Field(description="任务唯一标识符,用于后续查询结果。")
|
||||||
|
status: str = Field(default="queued", description="初始状态:queued。")
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncScoreJobStatus(BaseModel):
|
||||||
|
"""Full state of one async score job, persisted to disk."""
|
||||||
|
|
||||||
|
job_id: str = Field(description="任务唯一标识符。")
|
||||||
|
status: str = Field(description="queued | running | completed | failed")
|
||||||
|
created_at: str = Field(default="", description="创建时间(ISO 8601 UTC)。")
|
||||||
|
finished_at: str = Field(default="", description="完成时间(ISO 8601 UTC)。")
|
||||||
|
request_summary: dict = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="请求参数快照(question 前80字、metrics、judge_model 等)。",
|
||||||
|
)
|
||||||
|
scores: dict[str, float | None] = Field(default_factory=dict, description="各指标得分。")
|
||||||
|
weighted_score: float | None = Field(default=None, description="加权综合得分。")
|
||||||
|
latency_ms: int = Field(default=0, description="评分耗时毫秒。")
|
||||||
|
skipped_metrics: list[str] = Field(default_factory=list)
|
||||||
|
error: str | None = Field(default=None)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Write failing tests**
|
||||||
|
|
||||||
|
Create `tests/webapp/test_score_jobs_api.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Tests for async score jobs API."""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def client(tmp_path, monkeypatch):
|
||||||
|
import webapp.services.score_job_manager as mgr_mod
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
fresh_mgr = ScoreJobManager(jobs_dir=tmp_path / "score-jobs")
|
||||||
|
monkeypatch.setattr(mgr_mod, "score_job_manager", fresh_mgr)
|
||||||
|
import webapp.api.score_jobs as api_mod
|
||||||
|
monkeypatch.setattr(api_mod, "score_job_manager", fresh_mgr)
|
||||||
|
from webapp.server import create_app
|
||||||
|
return TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreJobManager:
|
||||||
|
def test_submit_returns_job_status_with_queued(self, tmp_path):
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
from webapp.models import ScoreRequest
|
||||||
|
mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs")
|
||||||
|
req = ScoreRequest(question="q", answer="a", metrics=["answer_relevancy"])
|
||||||
|
with patch.object(mgr, "_run") as mock_exec:
|
||||||
|
mock_exec.return_value = None
|
||||||
|
status = mgr.submit(req)
|
||||||
|
assert status.status in ("queued", "running", "completed")
|
||||||
|
assert len(status.job_id) > 0
|
||||||
|
|
||||||
|
def test_get_returns_none_for_unknown_id(self, tmp_path):
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs")
|
||||||
|
assert mgr.get("nonexistent") is None
|
||||||
|
|
||||||
|
def test_list_returns_empty_initially(self, tmp_path):
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs")
|
||||||
|
assert mgr.list_jobs() == []
|
||||||
|
|
||||||
|
def test_completed_job_persisted_to_disk(self, tmp_path):
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
from webapp.models import ScoreRequest
|
||||||
|
mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs", max_workers=1)
|
||||||
|
req = ScoreRequest(question="q?", answer="a.", metrics=["answer_relevancy"])
|
||||||
|
mock_scorer = MagicMock()
|
||||||
|
mock_scorer.score.return_value = {"answer_relevancy": 0.85}
|
||||||
|
with patch("webapp.services.score_job_manager.inline_scorer", mock_scorer):
|
||||||
|
with patch("webapp.services.score_job_manager.EvaluationSettings"):
|
||||||
|
status = mgr.submit(req)
|
||||||
|
for _ in range(20):
|
||||||
|
s = mgr.get(status.job_id)
|
||||||
|
if s and s.status in ("completed", "failed"):
|
||||||
|
break
|
||||||
|
time.sleep(0.2)
|
||||||
|
s = mgr.get(status.job_id)
|
||||||
|
assert s is not None
|
||||||
|
json_path = tmp_path / "jobs" / f"{status.job_id}.json"
|
||||||
|
assert json_path.exists()
|
||||||
|
data = json.loads(json_path.read_text(encoding="utf-8"))
|
||||||
|
assert data["job_id"] == status.job_id
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 3: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
cd C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py::TestScoreJobManager -v
|
||||||
|
```
|
||||||
|
Expected: `ModuleNotFoundError: No module named 'webapp.services.score_job_manager'`
|
||||||
|
|
||||||
|
- [ ] **Step 4: Create `webapp/services/score_job_manager.py`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Background task manager for async RAGAS single-sample scoring.
|
||||||
|
|
||||||
|
Each job runs InlineScorer.score() in a thread pool and persists the
|
||||||
|
result as a JSON file under outputs/score-jobs/<job_id>.json so results
|
||||||
|
survive server restarts and can be listed by the frontend.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from webapp.models import AsyncScoreJobStatus, ScoreRequest
|
||||||
|
from webapp.services.inline_scorer import inline_scorer
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_DEFAULT_JOBS_DIR = _REPO_ROOT / "outputs" / "score-jobs"
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreJobManager:
|
||||||
|
"""Thread-pool manager for async RAGAS scoring jobs with JSON persistence."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
jobs_dir: Path = _DEFAULT_JOBS_DIR,
|
||||||
|
max_workers: int = 4,
|
||||||
|
) -> None:
|
||||||
|
self._jobs_dir = Path(jobs_dir)
|
||||||
|
self._jobs_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._executor = ThreadPoolExecutor(max_workers=max_workers)
|
||||||
|
# In-memory index: job_id -> AsyncScoreJobStatus (authoritative while running)
|
||||||
|
self._cache: dict[str, AsyncScoreJobStatus] = {}
|
||||||
|
self._lock = threading.Lock()
|
||||||
|
self._load_existing()
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Public API
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def submit(self, request: ScoreRequest) -> AsyncScoreJobStatus:
|
||||||
|
"""Queue one scoring job and return its initial status immediately."""
|
||||||
|
job_id = uuid.uuid4().hex[:12]
|
||||||
|
status = AsyncScoreJobStatus(
|
||||||
|
job_id=job_id,
|
||||||
|
status="queued",
|
||||||
|
created_at=_now_iso(),
|
||||||
|
request_summary={
|
||||||
|
"question": request.question[:80],
|
||||||
|
"answer": (request.answer or "")[:80],
|
||||||
|
"metrics": list(request.metrics),
|
||||||
|
"judge_model": request.judge_model or "",
|
||||||
|
"embedding_model": request.embedding_model or "",
|
||||||
|
"has_contexts": bool(request.contexts),
|
||||||
|
"has_ground_truth": bool(request.ground_truth),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
with self._lock:
|
||||||
|
self._cache[job_id] = status
|
||||||
|
self._persist(status)
|
||||||
|
self._executor.submit(self._run, job_id, request)
|
||||||
|
return status
|
||||||
|
|
||||||
|
def get(self, job_id: str) -> AsyncScoreJobStatus | None:
|
||||||
|
"""Return the current status for one job, or None if unknown."""
|
||||||
|
with self._lock:
|
||||||
|
return self._cache.get(job_id)
|
||||||
|
|
||||||
|
def list_jobs(self) -> list[AsyncScoreJobStatus]:
|
||||||
|
"""Return all known jobs sorted newest first."""
|
||||||
|
with self._lock:
|
||||||
|
jobs = list(self._cache.values())
|
||||||
|
jobs.sort(key=lambda j: j.created_at, reverse=True)
|
||||||
|
return jobs
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Internal
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def _run(self, job_id: str, request: ScoreRequest) -> None:
|
||||||
|
"""Execute scoring in the thread pool and persist the result."""
|
||||||
|
self._update(job_id, status="running")
|
||||||
|
settings = EvaluationSettings()
|
||||||
|
judge_model = request.judge_model or settings.ragas_judge_model
|
||||||
|
embedding_model = request.embedding_model or settings.ragas_embedding_model
|
||||||
|
effective = request.effective_metrics()
|
||||||
|
requested = set(request.metrics)
|
||||||
|
skipped = sorted(requested - set(effective))
|
||||||
|
|
||||||
|
import time as _time
|
||||||
|
t0 = _time.monotonic()
|
||||||
|
try:
|
||||||
|
if not effective:
|
||||||
|
scores: dict[str, float | None] = {m: None for m in request.metrics}
|
||||||
|
weighted = None
|
||||||
|
else:
|
||||||
|
raw = inline_scorer.score(
|
||||||
|
question=request.question,
|
||||||
|
answer=request.answer,
|
||||||
|
contexts=request.contexts_as_list(),
|
||||||
|
ground_truth=request.ground_truth,
|
||||||
|
metrics=effective,
|
||||||
|
judge_model=judge_model,
|
||||||
|
embedding_model=embedding_model,
|
||||||
|
settings=settings,
|
||||||
|
)
|
||||||
|
scores = {m: None for m in request.metrics}
|
||||||
|
scores.update(raw)
|
||||||
|
weighted_raw = compute_weighted_score(
|
||||||
|
{k: v for k, v in raw.items() if v is not None}, {}
|
||||||
|
)
|
||||||
|
weighted = round(weighted_raw, 4) if weighted_raw is not None else None
|
||||||
|
|
||||||
|
latency_ms = int((_time.monotonic() - t0) * 1000)
|
||||||
|
self._update(
|
||||||
|
job_id,
|
||||||
|
status="completed",
|
||||||
|
finished_at=_now_iso(),
|
||||||
|
scores=scores,
|
||||||
|
weighted_score=weighted,
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
latency_ms = int((_time.monotonic() - t0) * 1000)
|
||||||
|
self._update(
|
||||||
|
job_id,
|
||||||
|
status="failed",
|
||||||
|
finished_at=_now_iso(),
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
error=f"{type(exc).__name__}: {exc}",
|
||||||
|
)
|
||||||
|
|
||||||
|
def _update(self, job_id: str, **kwargs: Any) -> None:
|
||||||
|
"""Merge kwargs into the job status and persist."""
|
||||||
|
with self._lock:
|
||||||
|
existing = self._cache.get(job_id)
|
||||||
|
if existing is None:
|
||||||
|
return
|
||||||
|
updated = existing.model_copy(update=kwargs)
|
||||||
|
self._cache[job_id] = updated
|
||||||
|
self._persist(updated)
|
||||||
|
|
||||||
|
def _persist(self, status: AsyncScoreJobStatus) -> None:
|
||||||
|
"""Write one job's status to its JSON file."""
|
||||||
|
path = self._jobs_dir / f"{status.job_id}.json"
|
||||||
|
path.write_text(
|
||||||
|
json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
def _load_existing(self) -> None:
|
||||||
|
"""Load completed jobs from disk into memory on startup."""
|
||||||
|
for path in sorted(self._jobs_dir.glob("*.json")):
|
||||||
|
try:
|
||||||
|
data = json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
status = AsyncScoreJobStatus.model_validate(data)
|
||||||
|
self._cache[status.job_id] = status
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass # Corrupt file — skip
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
|
||||||
|
score_job_manager = ScoreJobManager()
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Run to verify tests PASS**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py::TestScoreJobManager -v
|
||||||
|
```
|
||||||
|
Expected: 4 tests PASS
|
||||||
|
|
||||||
|
- [ ] **Step 6: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/models.py webapp/services/score_job_manager.py tests/webapp/test_score_jobs_api.py
|
||||||
|
git commit -m "feat: add AsyncScoreJobStatus model and ScoreJobManager with JSON persistence"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 2: API 端点
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `webapp/api/score_jobs.py`
|
||||||
|
- Modify: `webapp/server.py`
|
||||||
|
- Modify: `tests/webapp/test_score_jobs_api.py`
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Consumes: `score_job_manager: ScoreJobManager`, `AsyncScoreJobResponse`, `AsyncScoreJobStatus`, `ScoreRequest`
|
||||||
|
- Produces: `POST /api/score/async`, `GET /api/score/jobs`, `GET /api/score/jobs/{job_id}`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add API tests to `tests/webapp/test_score_jobs_api.py`**
|
||||||
|
|
||||||
|
Append this class:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class TestScoreJobsEndpoint:
|
||||||
|
def test_submit_async_returns_202(self, client):
|
||||||
|
with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/async", json={
|
||||||
|
"question": "q?", "answer": "a.",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 202
|
||||||
|
data = resp.json()
|
||||||
|
assert "job_id" in data
|
||||||
|
assert data["status"] == "queued"
|
||||||
|
|
||||||
|
def test_get_unknown_job_returns_404(self, client):
|
||||||
|
resp = client.get("/api/score/jobs/nonexistent")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
def test_list_jobs_returns_empty_initially(self, client):
|
||||||
|
resp = client.get("/api/score/jobs")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["jobs"] == []
|
||||||
|
|
||||||
|
def test_submitted_job_appears_in_list(self, client):
|
||||||
|
with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/async", json={
|
||||||
|
"question": "q?", "answer": "a.",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
job_id = resp.json()["job_id"]
|
||||||
|
list_resp = client.get("/api/score/jobs")
|
||||||
|
ids = [j["job_id"] for j in list_resp.json()["jobs"]]
|
||||||
|
assert job_id in ids
|
||||||
|
|
||||||
|
def test_get_job_by_id(self, client):
|
||||||
|
with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/async", json={
|
||||||
|
"question": "q?", "answer": "a.",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
job_id = resp.json()["job_id"]
|
||||||
|
get_resp = client.get(f"/api/score/jobs/{job_id}")
|
||||||
|
assert get_resp.status_code == 200
|
||||||
|
assert get_resp.json()["job_id"] == job_id
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py::TestScoreJobsEndpoint -v
|
||||||
|
```
|
||||||
|
Expected: FAIL — `ModuleNotFoundError: No module named 'webapp.api.score_jobs'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: Create `webapp/api/score_jobs.py`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Routes for async RAGAS scoring jobs (Dify fire-and-forget integration)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from webapp.models import AsyncScoreJobResponse, AsyncScoreJobStatus, ScoreRequest
|
||||||
|
from webapp.services.score_job_manager import score_job_manager
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
logger = logging.getLogger("webapp.api.score_jobs")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/async",
|
||||||
|
status_code=202,
|
||||||
|
response_model=AsyncScoreJobResponse,
|
||||||
|
summary="提交异步评分任务(Dify 推荐方式)",
|
||||||
|
responses={
|
||||||
|
202: {
|
||||||
|
"description": "任务已排队,立即返回 job_id。通过 GET /api/score/jobs/{job_id} 查询结果。",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"example": {"job_id": "abc123def456", "status": "queued"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def submit_async_score(request: ScoreRequest) -> AsyncScoreJobResponse:
|
||||||
|
"""提交异步 RAGAS 评分任务,立即返回 job_id(202 Accepted)。
|
||||||
|
|
||||||
|
评分在后台线程中执行,结果持久化至 `outputs/score-jobs/<job_id>.json`。
|
||||||
|
在 RAGAS 平台「评分记录」页面可查看所有历史评分记录。
|
||||||
|
|
||||||
|
**Dify 工作流推荐使用此接口**:不等待评分完成,工作流立即继续,
|
||||||
|
避免 HTTP 节点超时。评分结果通过平台界面查看。
|
||||||
|
"""
|
||||||
|
logger.info(
|
||||||
|
"[score_async] submit metrics=%s has_ctx=%s has_gt=%s",
|
||||||
|
request.metrics, bool(request.contexts), bool(request.ground_truth),
|
||||||
|
)
|
||||||
|
status = score_job_manager.submit(request)
|
||||||
|
logger.info("[score_async] queued job_id=%s", status.job_id)
|
||||||
|
return AsyncScoreJobResponse(job_id=status.job_id, status=status.status)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/jobs",
|
||||||
|
response_model=dict,
|
||||||
|
summary="列出所有评分记录",
|
||||||
|
)
|
||||||
|
def list_score_jobs() -> dict:
|
||||||
|
"""返回所有异步评分记录,按创建时间倒序排列。"""
|
||||||
|
jobs = score_job_manager.list_jobs()
|
||||||
|
logger.info("[score_jobs] list count=%d", len(jobs))
|
||||||
|
return {"jobs": [j.model_dump() for j in jobs]}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/jobs/{job_id}",
|
||||||
|
response_model=AsyncScoreJobStatus,
|
||||||
|
summary="查询评分记录详情",
|
||||||
|
responses={404: {"description": "指定 job_id 的评分记录不存在。"}},
|
||||||
|
)
|
||||||
|
def get_score_job(job_id: str) -> AsyncScoreJobStatus:
|
||||||
|
"""返回一个异步评分任务的当前状态和结果。"""
|
||||||
|
status = score_job_manager.get(job_id)
|
||||||
|
if status is None:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Score job not found: {job_id}")
|
||||||
|
return status
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Register router in `webapp/server.py`**
|
||||||
|
|
||||||
|
Add import:
|
||||||
|
```python
|
||||||
|
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score, score_jobs
|
||||||
|
```
|
||||||
|
|
||||||
|
Add after `app.include_router(score.router)`:
|
||||||
|
```python
|
||||||
|
app.include_router(score_jobs.router)
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace the existing `"score"` entry in `OPENAPI_TAGS` (located before `"meta"`) with:
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"name": "score",
|
||||||
|
"description": (
|
||||||
|
"**实时评分 API(同步)** — `POST /api/score`\n\n"
|
||||||
|
"**异步评分 API(Dify 推荐)** — `POST /api/score/async`\n\n"
|
||||||
|
"异步方式立即返回 job_id(202),评分在后台执行,结果在「评分记录」页查看。\n\n"
|
||||||
|
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
|
||||||
|
"`Authorization: Bearer <token>` 请求头。"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
```
|
||||||
|
|
||||||
|
> Note: this replaces the existing `"score"` entry in `OPENAPI_TAGS`.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Verify no route conflict**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -c "
|
||||||
|
from webapp.server import create_app
|
||||||
|
app = create_app()
|
||||||
|
score_routes = [(r.path, list(getattr(r,'methods',[]))) for r in app.routes if 'score' in r.path]
|
||||||
|
print(score_routes)
|
||||||
|
"
|
||||||
|
```
|
||||||
|
Expected: shows `/api/score`, `/api/score/async`, `/api/score/jobs`, `/api/score/jobs/{job_id}`
|
||||||
|
|
||||||
|
- [ ] **Step 6: Run API tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py -v --tb=short
|
||||||
|
```
|
||||||
|
Expected: all 9 tests PASS
|
||||||
|
|
||||||
|
- [ ] **Step 7: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/api/score_jobs.py webapp/server.py tests/webapp/test_score_jobs_api.py
|
||||||
|
git commit -m "feat: add POST /api/score/async and GET /api/score/jobs endpoints"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 3: 前端「评分记录」页
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `webapp/static/index.html`
|
||||||
|
- Modify: `webapp/static/js/api.js`
|
||||||
|
- Modify: `webapp/static/js/app.js`
|
||||||
|
- Create: `webapp/static/js/score_jobs.js`
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Consumes: `GET /api/score/jobs`, `GET /api/score/jobs/{job_id}`
|
||||||
|
- Produces: `#view-scorejobs` section, `ScoreJobs` JS object
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add API methods to `webapp/static/js/api.js`**
|
||||||
|
|
||||||
|
Add before the closing `};`:
|
||||||
|
```javascript
|
||||||
|
// 异步评分记录 API
|
||||||
|
scoreJobsAsync(body) { return API.post("/api/score/async", body); },
|
||||||
|
getScoreJob(jobId) { return API.get(`/api/score/jobs/${encodeURIComponent(jobId)}`); },
|
||||||
|
listScoreJobs() { return API.get("/api/score/jobs"); },
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Add nav item and section to `webapp/static/index.html`**
|
||||||
|
|
||||||
|
In the `<nav class="nav">` block, add after the `profiles` nav-item and before the `apidocs` nav-item:
|
||||||
|
```html
|
||||||
|
<button class="nav-item" data-view="scorejobs">
|
||||||
|
<span class="nav-ico">📋</span><span>评分记录</span>
|
||||||
|
</button>
|
||||||
|
```
|
||||||
|
|
||||||
|
Add a new section before the `<!-- API 文档视图 -->` comment:
|
||||||
|
```html
|
||||||
|
<!-- 评分记录视图 -->
|
||||||
|
<section class="view" id="view-scorejobs" hidden>
|
||||||
|
<div class="panel">
|
||||||
|
<div class="panel-head">
|
||||||
|
<h2>评分记录</h2>
|
||||||
|
<span class="muted" style="font-size:13px">来自 Dify 异步评分任务(POST /api/score/async)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="scorejobs-container"></div>
|
||||||
|
<div class="empty" id="scorejobs-empty" hidden>
|
||||||
|
<p>暂无评分记录。</p>
|
||||||
|
<p class="muted">在 Dify 工作流中调用 <code>POST /api/score/async</code> 后,记录将在此显示。</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 3: Create `webapp/static/js/score_jobs.js`**
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// score_jobs.js — 评分记录页面逻辑(异步 RAGAS 评分结果列表)
|
||||||
|
|
||||||
|
const ScoreJobs = {
|
||||||
|
_pollTimers: {}, // job_id -> setInterval handle
|
||||||
|
|
||||||
|
async load() {
|
||||||
|
const container = document.getElementById("scorejobs-container");
|
||||||
|
const empty = document.getElementById("scorejobs-empty");
|
||||||
|
container.innerHTML = '<p class="muted">加载中…</p>';
|
||||||
|
try {
|
||||||
|
const data = await API.listScoreJobs();
|
||||||
|
const jobs = data.jobs || [];
|
||||||
|
container.innerHTML = "";
|
||||||
|
if (jobs.length === 0) {
|
||||||
|
empty.hidden = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
empty.hidden = true;
|
||||||
|
jobs.forEach(job => container.appendChild(ScoreJobs.renderRow(job)));
|
||||||
|
// Auto-poll any queued/running jobs
|
||||||
|
jobs.forEach(job => {
|
||||||
|
if (job.status === "queued" || job.status === "running") {
|
||||||
|
ScoreJobs._startPoll(job.job_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
|
container.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
renderRow(job) {
|
||||||
|
const row = document.createElement("div");
|
||||||
|
row.className = "panel score-job-row";
|
||||||
|
row.id = `score-job-${job.job_id}`;
|
||||||
|
row.innerHTML = ScoreJobs._rowHtml(job);
|
||||||
|
return row;
|
||||||
|
},
|
||||||
|
|
||||||
|
_rowHtml(job) {
|
||||||
|
const time = App.shortTime(job.created_at);
|
||||||
|
const question = App.escape((job.request_summary?.question || "—").slice(0, 50));
|
||||||
|
const metrics = (job.request_summary?.metrics || []).join(", ");
|
||||||
|
const statusBadge = `<span class="badge ${job.status}">${job.status}</span>`;
|
||||||
|
|
||||||
|
let scoreHtml = "";
|
||||||
|
if (job.status === "completed") {
|
||||||
|
scoreHtml = Object.entries(job.scores || {})
|
||||||
|
.map(([k, v]) => {
|
||||||
|
const cls = App.scoreClass(v);
|
||||||
|
const text = v === null || v === undefined ? "n/a" : Number(v).toFixed(3);
|
||||||
|
return `<span class="metric-chip" title="${App.escape(k)}">${App.escape(App.shortMetric(k))} <b class="${cls}">${text}</b></span>`;
|
||||||
|
})
|
||||||
|
.join(" ");
|
||||||
|
if (job.weighted_score !== null && job.weighted_score !== undefined) {
|
||||||
|
const cls = App.scoreClass(job.weighted_score);
|
||||||
|
scoreHtml += ` <span class="metric-chip">综合 <b class="${cls}">${Number(job.weighted_score).toFixed(3)}</b></span>`;
|
||||||
|
}
|
||||||
|
} else if (job.status === "failed") {
|
||||||
|
scoreHtml = `<span class="muted" style="color:var(--bad)">${App.escape(job.error || "未知错误")}</span>`;
|
||||||
|
} else {
|
||||||
|
scoreHtml = `<span class="muted">评分中…</span>`;
|
||||||
|
}
|
||||||
|
|
||||||
|
return `
|
||||||
|
<div class="run-card-head">
|
||||||
|
<div class="run-card-title">${question}</div>
|
||||||
|
<div>${statusBadge}</div>
|
||||||
|
</div>
|
||||||
|
<div class="run-card-meta">
|
||||||
|
<div>指标:${App.escape(metrics)} · ${time} · ${job.latency_ms}ms</div>
|
||||||
|
</div>
|
||||||
|
<div class="run-card-metrics">${scoreHtml}</div>
|
||||||
|
`;
|
||||||
|
},
|
||||||
|
|
||||||
|
_startPoll(jobId) {
|
||||||
|
if (ScoreJobs._pollTimers[jobId]) return;
|
||||||
|
ScoreJobs._pollTimers[jobId] = setInterval(async () => {
|
||||||
|
try {
|
||||||
|
const job = await API.getScoreJob(jobId);
|
||||||
|
const el = document.getElementById(`score-job-${jobId}`);
|
||||||
|
if (el) el.innerHTML = ScoreJobs._rowHtml(job);
|
||||||
|
if (job.status === "completed" || job.status === "failed") {
|
||||||
|
clearInterval(ScoreJobs._pollTimers[jobId]);
|
||||||
|
delete ScoreJobs._pollTimers[jobId];
|
||||||
|
}
|
||||||
|
} catch (_e) {
|
||||||
|
clearInterval(ScoreJobs._pollTimers[jobId]);
|
||||||
|
delete ScoreJobs._pollTimers[jobId];
|
||||||
|
}
|
||||||
|
}, 5000);
|
||||||
|
},
|
||||||
|
|
||||||
|
stopAllPolls() {
|
||||||
|
Object.values(ScoreJobs._pollTimers).forEach(t => clearInterval(t));
|
||||||
|
ScoreJobs._pollTimers = {};
|
||||||
|
},
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Update `webapp/static/js/app.js`**
|
||||||
|
|
||||||
|
Add `"scorejobs"` to the `views` array and `titles` object:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
views: ["runs", "new", "report", "profiles", "scorejobs", "apidocs"],
|
||||||
|
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置", scorejobs: "评分记录", apidocs: "API 文档" },
|
||||||
|
```
|
||||||
|
|
||||||
|
Add in `_doSwitch` after `if (view === "profiles") Profiles.load();`:
|
||||||
|
```javascript
|
||||||
|
if (view === "scorejobs") ScoreJobs.load();
|
||||||
|
```
|
||||||
|
|
||||||
|
Add `ScoreJobs.stopAllPolls();` when switching away, in `_doSwitch` before view switching logic:
|
||||||
|
```javascript
|
||||||
|
// Stop score job pollers when leaving the scorejobs view
|
||||||
|
if (App.activeView === "scorejobs" && view !== "scorejobs") ScoreJobs.stopAllPolls();
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Add script tag to `webapp/static/index.html`**
|
||||||
|
|
||||||
|
Add before `<script src="/static/js/app.js"></script>`:
|
||||||
|
```html
|
||||||
|
<script src="/static/js/score_jobs.js"></script>
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 6: Verify server boots**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -c "from webapp.server import create_app; create_app(); print('OK')"
|
||||||
|
```
|
||||||
|
Expected: `OK`
|
||||||
|
|
||||||
|
Also verify HTML has all new elements:
|
||||||
|
```
|
||||||
|
python -c "
|
||||||
|
c = open('webapp/static/index.html', encoding='utf-8').read()
|
||||||
|
assert 'view-scorejobs' in c
|
||||||
|
assert 'scorejobs-container' in c
|
||||||
|
assert '评分记录' in c
|
||||||
|
print('HTML OK')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 7: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/static/index.html webapp/static/js/api.js webapp/static/js/app.js webapp/static/js/score_jobs.js
|
||||||
|
git commit -m "feat: add 评分记录 page with async score job list and auto-polling"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 4: 全量回归测试 + Dify 说明注释
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `webapp/static/js/score_jobs.js` (minor: add Dify curl comment at top)
|
||||||
|
|
||||||
|
- [ ] **Step 1: Run full test suite**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/ -v --tb=short -q 2>&1 | tail -15
|
||||||
|
```
|
||||||
|
|
||||||
|
Pre-existing failures to ignore:
|
||||||
|
- `test_normalize_sample_pdf_offline_smoke_row`
|
||||||
|
- `test_evaluator_and_reporting_write_run_assets`
|
||||||
|
- `test_question_generator_rejects_invalid_json`
|
||||||
|
- `test_question_generator_rejects_non_list_samples`
|
||||||
|
|
||||||
|
Any other failure is a regression — fix before proceeding.
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run targeted tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py tests/webapp/test_score_api.py tests/test_pipeline.py -v --tb=short
|
||||||
|
```
|
||||||
|
Expected: all PASS
|
||||||
|
|
||||||
|
- [ ] **Step 3: Final commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add .
|
||||||
|
git commit -m "feat: async score jobs complete — POST /api/score/async + 评分记录 page
|
||||||
|
|
||||||
|
- ScoreJobManager: thread pool + JSON persistence (outputs/score-jobs/)
|
||||||
|
- POST /api/score/async: 202 immediate response with job_id
|
||||||
|
- GET /api/score/jobs + GET /api/score/jobs/{id}: query endpoints
|
||||||
|
- Frontend: 评分记录 nav page with 5s auto-polling for pending jobs
|
||||||
|
- Dify integration: change /api/score → /api/score/async, remove response parsing
|
||||||
|
|
||||||
|
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>"
|
||||||
|
```
|
||||||
225
docs/superpowers/specs/2026-06-16-optimization-advisor-design.md
Normal file
225
docs/superpowers/specs/2026-06-16-optimization-advisor-design.md
Normal file
@@ -0,0 +1,225 @@
|
|||||||
|
# 优化顾问模块设计 Spec
|
||||||
|
|
||||||
|
- 日期:2026-06-16
|
||||||
|
- 状态:已确认,进入实现。
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
在现有 RAG 评测流程结束后,新增一个**优化顾问模块**(Optimization Advisor),根据本次评测的多项指标分数与低分样本,自动诊断指标偏低的原因并给出针对性的优化建议,输出为中文 Markdown 报告 + 日志摘要。
|
||||||
|
|
||||||
|
对应架构设计 §11(优化策略):将"指标到动作的映射"(§11.2)从文档形式落地为代码自动执行。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 决策摘要
|
||||||
|
|
||||||
|
| 决策点 | 选择 |
|
||||||
|
|---|---|
|
||||||
|
| 输出形式 | `optimization_advice.md`(文件)+ 控制台/日志摘要(双输出) |
|
||||||
|
| 生成机制 | 规则引擎定位异常指标 → LLM 结合低分样本二次解读(两层) |
|
||||||
|
| 触发方式 | YAML 场景文件显式声明 `optimization_advisor: true`,默认关闭 |
|
||||||
|
| LLM 实例 | 复用 `build_models()` 已创建的 `llm` 实例,不重建 client |
|
||||||
|
| 包位置 | `rag_eval/advisor/`(独立包,对外暴露 `run_advisor()` 单一入口) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 架构
|
||||||
|
|
||||||
|
### 3.1 执行链路
|
||||||
|
|
||||||
|
```
|
||||||
|
run_scenario()
|
||||||
|
→ load_scenario() # 读 YAML,解析 optimization_advisor 字段
|
||||||
|
→ build_models() # 已有:创建 llm, embeddings
|
||||||
|
→ build_metric_pipeline() # 已有
|
||||||
|
→ Evaluator.evaluate() # 已有:打分 → EvaluationResult
|
||||||
|
→ write_run_artifacts() # 已有:scores.csv / summary.md / ...
|
||||||
|
→ run_advisor( # 新增(3 行)
|
||||||
|
result, scenario, llm, artifact_paths
|
||||||
|
)
|
||||||
|
→ rules.diagnose(score_rows) # 规则引擎:返回 Diagnosis 列表
|
||||||
|
→ llm_analyzer.analyze(diags, samples) # LLM:生成中文 Markdown 建议
|
||||||
|
→ writer.write(advice, paths) # 写文件 + 打日志
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.2 新增文件
|
||||||
|
|
||||||
|
```
|
||||||
|
rag_eval/advisor/
|
||||||
|
__init__.py ← 暴露 run_advisor(),外部唯一入口
|
||||||
|
rules.py ← 纯函数规则引擎,无 LLM,可单独单测
|
||||||
|
llm_analyzer.py ← 接收 llm 实例 + 诊断结构 → 中文 Markdown
|
||||||
|
writer.py ← 写 optimization_advice.md,打日志摘要
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.3 修改文件(最小改动)
|
||||||
|
|
||||||
|
| 文件 | 改动 |
|
||||||
|
|---|---|
|
||||||
|
| `rag_eval/shared/models.py` | `Scenario` 加 `optimization_advisor: bool = False` 字段 |
|
||||||
|
| `rag_eval/config/schema.py` | `ScenarioModel` 加同名字段 + 透传到 `Scenario` |
|
||||||
|
| `rag_eval/config/loader.py` | 透传 `optimization_advisor` 到 `Scenario` 构造 |
|
||||||
|
| `rag_eval/reporting/artifacts.py` | `RunArtifactPaths` 加 `advice_md: Path` 字段 + `build_artifact_paths()` 加赋值 |
|
||||||
|
| `rag_eval/execution/runner.py` | `run_scenario()` 末尾:当 `scenario.optimization_advisor` 为 true 时,把 `build_models()` 返回的 `llm` 传入并调用 `run_advisor()` |
|
||||||
|
|
||||||
|
### 3.4 输出产物
|
||||||
|
|
||||||
|
```
|
||||||
|
outputs/online/siemens-pdf-question-bank/<run_id>/
|
||||||
|
scenario.snapshot.yaml
|
||||||
|
scores.csv
|
||||||
|
invalid.csv
|
||||||
|
summary.md
|
||||||
|
metadata.json
|
||||||
|
optimization_advice.md ← 新增(optimization_advisor: true 时生成)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. 规则引擎(rules.py)
|
||||||
|
|
||||||
|
### 4.1 数据结构
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class Diagnosis:
|
||||||
|
metric: str # 指标名
|
||||||
|
mean_score: float # 本次均值
|
||||||
|
threshold: float # 警戒阈值
|
||||||
|
severity: str # "warning" | "critical"
|
||||||
|
root_causes: list[str] # 可能原因(来自架构设计 §11.2)
|
||||||
|
suggested_actions: list[str] # 对应可调阶段
|
||||||
|
low_samples: list[dict] # 分数最低的 N 条样本(含 question/answer/ground_truth)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.2 七条指标诊断规则
|
||||||
|
|
||||||
|
阈值参考 RAG 评测最佳实践,分 warning / critical 两档:
|
||||||
|
|
||||||
|
| 指标 | warning | critical | 根因方向 | 对应优化阶段(§11.2) |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| `faithfulness` | < 0.7 | < 0.5 | 生成未严格基于检索片段 / 幻觉 | 生成 prompt grounding、开启校验 |
|
||||||
|
| `answer_relevancy` | < 0.7 | < 0.5 | 回答偏离问题 / 格式冗余 | 查询改写、生成 prompt 格式 |
|
||||||
|
| `context_recall` | < 0.7 | < 0.5 | 检索遗漏关键信息 | 多查询、问题分解、Step-back、加大过召回 |
|
||||||
|
| `context_precision` | < 0.6 | < 0.4 | 检索引入过多噪声 / 排序差 | 后检索重排、压缩、相关性过滤 |
|
||||||
|
| `noise_sensitivity` | > 0.3 | > 0.5 | 回答被噪声片段干扰(越低越好) | 后检索相关性过滤、重排 |
|
||||||
|
| `factual_correctness` | < 0.6 | < 0.4 | 回答事实与标准答案偏差大 | 检索与生成综合优化 |
|
||||||
|
| `semantic_similarity` | < 0.7 | < 0.5 | 回答语义与标准答案差距大 | 生成 prompt、检索质量 |
|
||||||
|
|
||||||
|
> 注:`noise_sensitivity` 越低越好(0=完全不受噪声影响),其阈值方向与其余相反。
|
||||||
|
|
||||||
|
### 4.3 低分样本选取
|
||||||
|
|
||||||
|
每个触发诊断的指标,取该指标表现最差的 **top-3** 样本(排除 NaN;`noise_sensitivity` 越低越好,因此取分数最高的 3 条,其余指标取分数最低的 3 条)附入 `Diagnosis.low_samples`,字段包含 `sample_id / question / answer / ground_truth / <metric_score>`。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. LLM 分析器(llm_analyzer.py)
|
||||||
|
|
||||||
|
### 5.1 输入
|
||||||
|
|
||||||
|
- `diagnoses: list[Diagnosis]` — 规则引擎输出(仅触发阈值的指标)
|
||||||
|
- `llm` — 已有 RAGAS LLM 实例(scenario 的 judge_model)
|
||||||
|
- `scenario_name: str` — 用于报告标题
|
||||||
|
|
||||||
|
### 5.2 Prompt 设计
|
||||||
|
|
||||||
|
使用**一次 LLM 调用**,把所有触发诊断的指标和低分样本一起发送:
|
||||||
|
|
||||||
|
```
|
||||||
|
你是一个 RAG 系统优化专家,正在分析西门子医疗 CT 文档问答系统的评测结果。
|
||||||
|
请用中文撰写一份优化建议报告,格式为 Markdown。
|
||||||
|
|
||||||
|
## 评测诊断摘要
|
||||||
|
{for each diagnosis: 指标名、均值、阈值、可能原因、建议动作}
|
||||||
|
|
||||||
|
## 低分样本示例
|
||||||
|
{for each diagnosis: top-3 低分样本的 question / answer / ground_truth}
|
||||||
|
|
||||||
|
## 要求
|
||||||
|
1. 按指标分节(## 指标名),先解释"为什么低",再给出"具体怎么改"
|
||||||
|
2. "具体怎么改"要结合低分样本的具体内容,而不只是泛泛建议
|
||||||
|
3. 最后写一节 ## 优先优化次序,按性价比排序(参考:不增加调用次数的优先)
|
||||||
|
4. 语言简洁,面向工程师,不要废话
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5.3 输出
|
||||||
|
|
||||||
|
LLM 返回的 Markdown 字符串作为报告正文写入 `optimization_advice.md`,并在正文之前添加运行元信息头(run_id、生成时间、judge_model)。
|
||||||
|
|
||||||
|
### 5.4 失败降级
|
||||||
|
|
||||||
|
LLM 调用失败(超时/异常)时:降级为**纯规则报告**(只输出规则引擎的诊断结构,不含 LLM 解读),文件照常写出,错误信息写入报告末尾,不阻断整个评测流程。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. 写出层(writer.py)
|
||||||
|
|
||||||
|
### 6.1 文件写出
|
||||||
|
|
||||||
|
`optimization_advice.md` 结构:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# 优化建议报告 — <scenario_name>
|
||||||
|
|
||||||
|
- run_id: `<run_id>`
|
||||||
|
- 生成时间: `<timestamp>`
|
||||||
|
- judge_model: `<model>`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
<LLM 生成的 Markdown 正文>
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.2 日志摘要
|
||||||
|
|
||||||
|
`run_advisor()` 完成后通过 `logger.info` 输出两条单行摘要(诊断结果 + 报告路径,适合 `run_eval.bat` 结束后一眼扫到):
|
||||||
|
|
||||||
|
```
|
||||||
|
[advisor] 触发诊断 3 项: faithfulness(0.42, critical) context_recall(0.58, warning) noise_sensitivity(0.41, warning)
|
||||||
|
[advisor] 优化建议已写出: outputs/online/.../optimization_advice.md
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. YAML 配置
|
||||||
|
|
||||||
|
场景文件新增一个顶层字段:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
optimization_advisor: true # 默认 false;true 时评测结束后自动生成优化建议
|
||||||
|
```
|
||||||
|
|
||||||
|
后续若需精细配置(阈值覆盖、top-N 低分样本数),可扩展为:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
optimization_advisor:
|
||||||
|
enabled: true
|
||||||
|
top_low_samples: 3 # 每个指标取几条低分样本(默认 3)
|
||||||
|
# thresholds: # 可选:覆盖默认阈值
|
||||||
|
# faithfulness: 0.65
|
||||||
|
```
|
||||||
|
|
||||||
|
本轮实现仅支持 `optimization_advisor: true/false`,扩展接口预留但不实现。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. 测试策略
|
||||||
|
|
||||||
|
| 测试 | 文件 | 说明 |
|
||||||
|
|---|---|---|
|
||||||
|
| 规则引擎单测 | `tests/test_advisor_rules.py` | 纯函数,无 LLM,覆盖每条规则的 warning/critical 触发、NaN 跳过、low_samples 选取 |
|
||||||
|
| writer 单测 | `tests/test_advisor_writer.py` | mock Diagnosis 列表,验证 md 文件写出格式和日志输出 |
|
||||||
|
| 集成(可选) | 现有 `tests/test_online_eval.py` | 验证 `optimization_advisor: true` 场景下 advice_md 存在 |
|
||||||
|
|
||||||
|
LLM 分析器不写单测(依赖网络),由集成场景覆盖。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. 不覆盖(本轮边界)
|
||||||
|
|
||||||
|
- 不支持跨版本对比分析(只分析本次 run)
|
||||||
|
- 不支持批量场景聚合建议
|
||||||
|
- 不建设 Web UI 展示
|
||||||
|
- LLM 分析器 prompt 本轮不做多语言适配(直接中文)
|
||||||
|
- advisor 阈值本轮硬编码在 `rules.py`,不从 YAML 读取
|
||||||
240
docs/superpowers/specs/2026-06-18-metric-doc-weights-design.md
Normal file
240
docs/superpowers/specs/2026-06-18-metric-doc-weights-design.md
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
# 指标权重 & 文档片段权重功能设计
|
||||||
|
|
||||||
|
**日期**: 2026-06-18
|
||||||
|
**状态**: 已批准,待实现
|
||||||
|
**范围**: 在「新建评估」运行评估时,支持为 RAGAS 指标和文档配置权重,计算加权综合得分并在报告中展示。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
1. **指标权重(Metric Weights)**:允许为每个 RAGAS 指标配置浮点权重(如 faithfulness: 0.35),计算每道题的加权综合得分 `weighted_score`。
|
||||||
|
2. **文档权重(Doc Weights)**:允许为特定 PDF 文档名称配置权重(如 `"322_双源CT.pdf": 2.0`),该文档的题目在汇总指标均值时按权重放大贡献。
|
||||||
|
3. **前端覆盖**:在「新建评估」页面选中场景后,展示可编辑的权重面板,运行前可临时覆盖 YAML 中的权重。
|
||||||
|
4. **完全向后兼容**:两个字段均为可选,省略时退化为等权行为,现有场景 YAML 无需修改。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 数据模型
|
||||||
|
|
||||||
|
### 2.1 场景 YAML(新增可选字段)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# 可选。缺省时所有指标权重 = 1.0
|
||||||
|
metric_weights:
|
||||||
|
faithfulness: 0.35
|
||||||
|
context_recall: 0.25
|
||||||
|
context_precision: 0.20
|
||||||
|
answer_relevancy: 0.20
|
||||||
|
|
||||||
|
# 可选。缺省时所有文档权重 = 1.0
|
||||||
|
doc_weights:
|
||||||
|
"322_双源CT成像技术.pdf": 2.0
|
||||||
|
"323_单源CT对比.pdf": 1.5
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.2 Pydantic Schema(`rag_eval/config/schema.py`)
|
||||||
|
|
||||||
|
`ScenarioModel` 新增:
|
||||||
|
```python
|
||||||
|
metric_weights: dict[str, float] = Field(default_factory=dict)
|
||||||
|
doc_weights: dict[str, float] = Field(default_factory=dict)
|
||||||
|
```
|
||||||
|
|
||||||
|
`ConfigDict(extra="ignore")` 不变,新字段不影响既有 YAML 的加载。
|
||||||
|
|
||||||
|
### 2.3 内部 Scenario dataclass(`rag_eval/shared/models.py`)
|
||||||
|
|
||||||
|
`Scenario` 新增:
|
||||||
|
```python
|
||||||
|
metric_weights: dict[str, float] = field(default_factory=dict)
|
||||||
|
doc_weights: dict[str, float] = field(default_factory=dict)
|
||||||
|
```
|
||||||
|
|
||||||
|
随 `scenario.snapshot()` 序列化,供 `run_reader` / 报告层读取。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 后端:权重计算逻辑
|
||||||
|
|
||||||
|
### 3.1 新模块 `rag_eval/metrics/weights.py`
|
||||||
|
|
||||||
|
纯函数模块,无外部依赖,独立可测:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float:
|
||||||
|
"""返回 key 对应的权重,缺失时返回 default。"""
|
||||||
|
|
||||||
|
def compute_weighted_score(
|
||||||
|
scores: dict[str, float | None],
|
||||||
|
metric_weights: dict[str, float],
|
||||||
|
) -> float | None:
|
||||||
|
"""
|
||||||
|
给定各指标得分和权重,返回加权综合得分。
|
||||||
|
- 忽略 NaN / None 值
|
||||||
|
- metric_weights 为空时退化为等权均值
|
||||||
|
- 全部 NaN 时返回 None
|
||||||
|
公式: Σ(w_i * s_i) / Σ(w_i),只对非 NaN 项求和
|
||||||
|
"""
|
||||||
|
|
||||||
|
def weighted_metric_means(
|
||||||
|
score_rows: list[dict],
|
||||||
|
metrics: list[str],
|
||||||
|
doc_weights: dict[str, float],
|
||||||
|
) -> dict[str, float | None]:
|
||||||
|
"""
|
||||||
|
对每个指标计算文档加权均值。
|
||||||
|
- sample_weight = doc_weights.get(row["doc_name"], 1.0)
|
||||||
|
- 公式: Σ(sample_weight_j * score_m_j) / Σ(sample_weight_j)
|
||||||
|
- doc_weights 为空时退化为普通算术均值
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.2 评估器(`rag_eval/execution/evaluator.py`)
|
||||||
|
|
||||||
|
`_merge_score()` 新增两列:
|
||||||
|
```python
|
||||||
|
record["weighted_score"] = compute_weighted_score(
|
||||||
|
score.metrics, self.scenario.metric_weights
|
||||||
|
)
|
||||||
|
record["sample_weight"] = self.scenario.doc_weights.get(
|
||||||
|
sample.metadata.get("doc_name", ""), 1.0
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
`scores.csv` 新增 `weighted_score`、`sample_weight` 两列。
|
||||||
|
|
||||||
|
### 3.3 报告摘要(`rag_eval/reporting/summary.py`)
|
||||||
|
|
||||||
|
`build_summary_markdown()` 改用 `weighted_metric_means()` 计算各指标均值;
|
||||||
|
新增 `weighted_score` 整体均值行:
|
||||||
|
|
||||||
|
```
|
||||||
|
## Metric Means(加权)
|
||||||
|
- faithfulness: 0.8123 (w=0.35)
|
||||||
|
- context_recall: 0.7654 (w=0.25)
|
||||||
|
- context_precision: 0.7200 (w=0.20)
|
||||||
|
- answer_relevancy: 0.7400 (w=0.20)
|
||||||
|
- **weighted_score: 0.7677**
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. yaml_patcher 扩展(`webapp/services/yaml_patcher.py`)
|
||||||
|
|
||||||
|
`apply_profiles_to_scenario()` 扩展签名,新增可选参数:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def apply_profiles_to_scenario(
|
||||||
|
scenario_path: str,
|
||||||
|
judge_profile: LLMProfile | None,
|
||||||
|
answer_profile: LLMProfile | None,
|
||||||
|
dataset_profile: LLMProfile | None,
|
||||||
|
metric_weights: dict[str, float] | None = None, # 新增
|
||||||
|
doc_weights: dict[str, float] | None = None, # 新增
|
||||||
|
_resolve_absolute: bool = False,
|
||||||
|
) -> list[str]:
|
||||||
|
```
|
||||||
|
|
||||||
|
- `metric_weights` 非 None 时写入 `data["metric_weights"]`,追加 `"metric_weights"` 到 patched 列表
|
||||||
|
- `doc_weights` 非 None 时写入 `data["doc_weights"]`,追加 `"doc_weights"` 到 patched 列表
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Webapp 模型与 API 扩展
|
||||||
|
|
||||||
|
### 5.1 `webapp/models.py`
|
||||||
|
|
||||||
|
`ProfileApplyRequest` 新增:
|
||||||
|
```python
|
||||||
|
metric_weights: dict[str, float] | None = None
|
||||||
|
doc_weights: dict[str, float] | None = None
|
||||||
|
```
|
||||||
|
|
||||||
|
`ProfileApplyResponse` 不变(`patched_fields` 已包含新字段名)。
|
||||||
|
|
||||||
|
### 5.2 `webapp/api/llm_profiles.py` — `apply_profiles()`
|
||||||
|
|
||||||
|
透传 `metric_weights` / `doc_weights` 给 `apply_profiles_to_scenario()`。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. 前端:权重配置面板
|
||||||
|
|
||||||
|
### 6.1 HTML(`index.html`)
|
||||||
|
|
||||||
|
在 `#llm-assignment-panel` 下方新增 `#weight-config-panel`(选中场景后显示):
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────┐
|
||||||
|
│ 权重配置 (可选,留空使用场景原始配置) │
|
||||||
|
├─────────────────────────────────────────────┤
|
||||||
|
│ 指标权重 │
|
||||||
|
│ faithfulness [____1.0____] │
|
||||||
|
│ context_recall [____1.0____] │
|
||||||
|
│ ...(根据选中场景的 metrics 动态生成) │
|
||||||
|
│ │
|
||||||
|
│ 文档权重(doc_weights) │
|
||||||
|
│ [doc名称_______________] [权重__] [+] [✕] │
|
||||||
|
│ [doc名称_______________] [权重__] [+] [✕] │
|
||||||
|
│ + 添加文档权重规则 │
|
||||||
|
└─────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.2 `runner.js`
|
||||||
|
|
||||||
|
- `renderScenarioItem()` 选中后调用 `Runner._renderWeightPanel(sc)` 动态生成指标行
|
||||||
|
- `_applyProfilesIfNeeded()` 同时读取权重输入,追加到 `apply` 请求 body
|
||||||
|
- `Runner._collectWeights()` 收集 metric_weights / doc_weights,全部为 1.0 时不发送(跳过)
|
||||||
|
|
||||||
|
### 6.3 CSS(`app.css`)
|
||||||
|
|
||||||
|
新增 `.weight-config-panel`、`.weight-row`、`.weight-input` 样式,与现有 `.llm-role-row` 风格一致。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. 报告展示(`webapp/services/report_builder.py`)
|
||||||
|
|
||||||
|
- `RunSummary.metric_means` 改用 `weighted_metric_means()` 计算(需从 `scenario.snapshot.yaml` 读取 `doc_weights` / `metric_weights`)
|
||||||
|
- `RunSummary` 新增 `weighted_score_mean: float | None` 字段
|
||||||
|
- 前端 `report.js` 的指标卡片区新增「综合加权得分」卡片,使用 `good/warn/bad` 配色
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. 测试计划
|
||||||
|
|
||||||
|
| 测试文件 | 覆盖内容 |
|
||||||
|
|----------|---------|
|
||||||
|
| `tests/test_weights.py` | `compute_weighted_score` / `weighted_metric_means` 纯函数,含 NaN 边界、空权重、全 NaN |
|
||||||
|
| `tests/test_dataset_build.py` | 无改动(隔离良好) |
|
||||||
|
| `tests/test_offline_eval.py` | `_merge_score` 新增 weighted_score / sample_weight 列断言 |
|
||||||
|
| `tests/webapp/test_llm_profiles_api.py` | `apply_profiles` 带 metric_weights / doc_weights 的 patching 测试 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. 改动文件清单
|
||||||
|
|
||||||
|
| 文件 | 改动类型 |
|
||||||
|
|------|---------|
|
||||||
|
| `rag_eval/config/schema.py` | 新增字段 |
|
||||||
|
| `rag_eval/shared/models.py` | 新增字段 |
|
||||||
|
| `rag_eval/config/loader.py` | 透传新字段到 Scenario |
|
||||||
|
| `rag_eval/metrics/weights.py` | **新建** |
|
||||||
|
| `rag_eval/execution/evaluator.py` | `_merge_score` 新增两列 |
|
||||||
|
| `rag_eval/reporting/summary.py` | 改用加权均值 |
|
||||||
|
| `webapp/services/yaml_patcher.py` | 新增 metric_weights / doc_weights 参数 |
|
||||||
|
| `webapp/models.py` | ProfileApplyRequest 新增字段;RunSummary 新增 weighted_score_mean |
|
||||||
|
| `webapp/api/llm_profiles.py` | 透传新参数 |
|
||||||
|
| `webapp/services/report_builder.py` | 加权均值计算 |
|
||||||
|
| `webapp/static/index.html` | 新增权重配置面板 |
|
||||||
|
| `webapp/static/js/runner.js` | 权重面板逻辑 |
|
||||||
|
| `webapp/static/css/app.css` | 新增权重面板样式 |
|
||||||
|
| `tests/test_weights.py` | **新建** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. 向后兼容保证
|
||||||
|
|
||||||
|
- `metric_weights: {}` + `doc_weights: {}` → 所有权重 = 1.0,行为与当前完全一致
|
||||||
|
- 现有场景 YAML 不含这两个字段 → Pydantic `default_factory=dict` 填充空字典
|
||||||
|
- `scores.csv` 新增两列不影响现有报告读取(`run_reader` 只读已知列)
|
||||||
138
docs/superpowers/specs/2026-06-22-dify-score-api-design.md
Normal file
138
docs/superpowers/specs/2026-06-22-dify-score-api-design.md
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
# Dify 集成 — 单题实时评分 API 设计
|
||||||
|
|
||||||
|
**日期**: 2026-06-22
|
||||||
|
**状态**: 已批准,待实现
|
||||||
|
**范围**: 在现有 FastAPI 服务中新增 `POST /api/score` 端点,供 Dify 外部 Tool 调用,实现单条问答记录的实时 RAGAS 指标评分。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
让 Dify Agent 能在回答完问题后,将 `(question, answer, contexts, ground_truth)` 发给 siemens_ragas 服务,实时获取各 RAGAS 指标得分,用于质量监控或 Agent 自我改进。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. API 规范
|
||||||
|
|
||||||
|
### `POST /api/score`
|
||||||
|
|
||||||
|
**请求体:**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"question": "双源CT的时间分辨率是多少?",
|
||||||
|
"answer": "双源CT的单扇区时间分辨率为75ms。",
|
||||||
|
"contexts": "片段1:双源CT采用两套管-探测器系统... |||| 片段2:单扇区采集旋转135度...",
|
||||||
|
"ground_truth": "双源CT单扇区时间分辨率为75ms,需旋转135度。",
|
||||||
|
"context_separator": " |||| ",
|
||||||
|
"metrics": ["faithfulness", "answer_relevancy"],
|
||||||
|
"judge_model": "deepseek-v4-flash",
|
||||||
|
"embedding_model": "text-embedding-v3"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**字段说明:**
|
||||||
|
|
||||||
|
| 字段 | 类型 | 必填 | 说明 |
|
||||||
|
|------|------|------|------|
|
||||||
|
| `question` | str | ✅ | 问题文本 |
|
||||||
|
| `answer` | str | ✅ | 待评分的回答 |
|
||||||
|
| `contexts` | str | ✅ | 检索到的上下文,多段用 `context_separator` 拼接 |
|
||||||
|
| `ground_truth` | str | ❌ | 标准答案;缺失时跳过依赖它的指标(context_recall、factual_correctness、semantic_similarity、noise_sensitivity),响应中对应字段返回 `null` |
|
||||||
|
| `context_separator` | str | ❌ | 默认 `" \|\|\|\| "`(四个竖线,两侧各一空格) |
|
||||||
|
| `metrics` | list[str] | ❌ | 默认 `["faithfulness", "answer_relevancy", "context_recall", "context_precision"]` |
|
||||||
|
| `judge_model` | str | ❌ | 默认读 `.env` 中 `RAGAS_JUDGE_MODEL` |
|
||||||
|
| `embedding_model` | str | ❌ | 默认读 `.env` 中 `RAGAS_EMBEDDING_MODEL` |
|
||||||
|
|
||||||
|
**响应体(200 OK):**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"scores": {
|
||||||
|
"faithfulness": 0.8750,
|
||||||
|
"answer_relevancy": 0.9200
|
||||||
|
},
|
||||||
|
"weighted_score": 0.8975,
|
||||||
|
"latency_ms": 3420
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**错误响应:**
|
||||||
|
|
||||||
|
| 状态码 | 场景 |
|
||||||
|
|--------|------|
|
||||||
|
| 400 | metrics 名称不合法(必填字段缺失由 Pydantic 校验拦截,返回 422) |
|
||||||
|
| 401 | 配置了 `SCORE_API_TOKEN` 但请求未携带有效 Bearer Token |
|
||||||
|
| 422 | 请求体 JSON 格式错误(Pydantic 校验) |
|
||||||
|
| 500 | RAGAS 内部评分异常,附带 error 字段 |
|
||||||
|
|
||||||
|
**鉴权(可选):**
|
||||||
|
若 `.env` 中 `SCORE_API_TOKEN` 非空,则要求请求头携带 `Authorization: Bearer <token>`。为空则不鉴权(内网部署场景)。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 架构与文件改动
|
||||||
|
|
||||||
|
### 新文件
|
||||||
|
|
||||||
|
| 文件 | 职责 |
|
||||||
|
|------|------|
|
||||||
|
| `webapp/api/score.py` | 路由定义,请求验证,调用 InlineScorer |
|
||||||
|
| `webapp/services/inline_scorer.py` | LLM 客户端缓存 + RAGAS 评分逻辑封装 |
|
||||||
|
|
||||||
|
### 修改文件
|
||||||
|
|
||||||
|
| 文件 | 改动 |
|
||||||
|
|------|------|
|
||||||
|
| `webapp/models.py` | 新增 `ScoreRequest`、`ScoreResponse` |
|
||||||
|
| `webapp/server.py` | 注册 `score.router`,更新 `openapi_tags` |
|
||||||
|
| `rag_eval/settings.py` | 新增 `score_api_token: str | None` 字段 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. `inline_scorer.py` 设计
|
||||||
|
|
||||||
|
```python
|
||||||
|
class InlineScorer:
|
||||||
|
"""同步执行 RAGAS 单题评分,内部缓存 LLM 客户端。"""
|
||||||
|
|
||||||
|
def score(
|
||||||
|
self,
|
||||||
|
question: str,
|
||||||
|
answer: str,
|
||||||
|
contexts: list[str],
|
||||||
|
ground_truth: str | None,
|
||||||
|
metrics: list[str],
|
||||||
|
judge_model: str,
|
||||||
|
embedding_model: str,
|
||||||
|
settings: EvaluationSettings,
|
||||||
|
) -> dict[str, float | None]:
|
||||||
|
"""返回 {metric_name: score} 字典,NaN 记为 None。"""
|
||||||
|
```
|
||||||
|
|
||||||
|
**客户端缓存策略:**
|
||||||
|
以 `(judge_model, embedding_model)` 为 key,缓存 `(llm, embeddings)` 对象,避免每次请求都重建 AsyncOpenAI 连接。缓存为模块级单例(`_scorer_cache: dict`),线程安全(加 `threading.Lock`)。
|
||||||
|
|
||||||
|
**评分执行:**
|
||||||
|
复用 `build_metric_pipeline` 构建 `MetricPipeline`,然后 `asyncio.run(pipeline.score_sample(sample))` 执行。与现有 `evaluator.py` 模式一致。注意:`asyncio.run` 不能在已运行的事件循环内调用,因此 `/api/score` 路由需声明为同步 `def`(由 FastAPI 线程池执行),不能写成 `async def`。
|
||||||
|
|
||||||
|
**ground_truth 为空时的指标跳过逻辑:**
|
||||||
|
`context_recall`、`factual_correctness`、`semantic_similarity`、`noise_sensitivity` 需要 ground_truth;若请求中未提供,自动从 metrics 列表中移除这些指标,并在响应中对应字段返回 `null`。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Dify 侧配置方法
|
||||||
|
|
||||||
|
1. 在 Dify 「工具」→「自定义工具」中创建新工具
|
||||||
|
2. 填写 OpenAPI Schema(与 `/api/score` 端点对齐)
|
||||||
|
3. 鉴权方式:API Key(Bearer)或无鉴权
|
||||||
|
4. 在 Agent / Workflow 节点中引用该工具,将 `question`、`answer`、`contexts` 变量映射到工具输入
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. 不在范围内
|
||||||
|
|
||||||
|
- 批量评分接口(异步 job)
|
||||||
|
- Dify Workflow 节点插件(需要 Dify 插件开发框架)
|
||||||
|
- 评分结果持久化到 scores.csv
|
||||||
|
- 与现有 report_builder 集成展示
|
||||||
173
docs/superpowers/specs/2026-06-22-linux-deploy-design.md
Normal file
173
docs/superpowers/specs/2026-06-22-linux-deploy-design.md
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
# Linux 一键部署脚本设计
|
||||||
|
|
||||||
|
**日期**: 2026-06-22
|
||||||
|
**状态**: 已批准,待实现
|
||||||
|
**范围**: 为 siemens_ragas 项目提供 Linux 环境的部署与运维脚本(无 Docker,无 systemd)。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
提供四个 Bash 脚本,覆盖 Linux 服务器上的完整生命周期:
|
||||||
|
|
||||||
|
| 脚本 | 职责 |
|
||||||
|
|------|------|
|
||||||
|
| `deploy.sh` | 一键完成环境检查、依赖安装、配置初始化、启动服务 |
|
||||||
|
| `start.sh` | 仅启动 Web 服务(已部署后复用,不重装依赖) |
|
||||||
|
| `stop.sh` | 停止后台 Web 服务 |
|
||||||
|
| `run_eval.sh` | 运行单次评估(对应 Windows 的 `run_eval.ps1`) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 约束与假设
|
||||||
|
|
||||||
|
- Linux 目标环境有 PyPI 网络访问(pip 可直接安装)
|
||||||
|
- 代码已通过 `git clone` 或文件拷贝到服务器
|
||||||
|
- 使用 `pip + venv`(不使用 uv)
|
||||||
|
- Web 服务监听 `0.0.0.0:8800`(内网可达)
|
||||||
|
- 后台运行使用 `nohup`,PID 写入 `.server.pid`,日志追加到 `logs/server.log`
|
||||||
|
- 所有脚本均放在仓库根目录,路径相对于 `$SCRIPT_DIR`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. `deploy.sh` 详细设计
|
||||||
|
|
||||||
|
### 3.1 阶段 1:Python 版本检查
|
||||||
|
|
||||||
|
```
|
||||||
|
require Python >= 3.12
|
||||||
|
```
|
||||||
|
|
||||||
|
- `python3 --version` 解析 major.minor
|
||||||
|
- 不满足则打印错误并 `exit 1`
|
||||||
|
- 满足则打印 `[OK] Python X.Y.Z`
|
||||||
|
|
||||||
|
### 3.2 阶段 2:虚拟环境
|
||||||
|
|
||||||
|
- 目标路径:`$SCRIPT_DIR/.venv`
|
||||||
|
- 已存在则跳过创建(打印 `[OK] .venv already exists`)
|
||||||
|
- 不存在则 `python3 -m venv .venv`
|
||||||
|
|
||||||
|
### 3.3 阶段 3:依赖安装
|
||||||
|
|
||||||
|
```bash
|
||||||
|
.venv/bin/pip install --upgrade pip -q
|
||||||
|
.venv/bin/pip install -e . -q # 安装 pyproject.toml 中的依赖
|
||||||
|
.venv/bin/pip install fastapi uvicorn httpx -q # Web 服务额外依赖
|
||||||
|
```
|
||||||
|
|
||||||
|
- 失败则打印错误并 `exit 1`
|
||||||
|
- `fastapi`、`uvicorn`、`httpx` 未在 `pyproject.toml` 的依赖中列出,因此需单独安装
|
||||||
|
|
||||||
|
### 3.4 阶段 4:配置文件
|
||||||
|
|
||||||
|
- 若 `.env` 不存在:`cp .env.example .env`,打印警告提示用户编辑后再启动
|
||||||
|
- 若 `.env` 已存在:跳过,打印 `[OK] .env found`
|
||||||
|
|
||||||
|
### 3.5 阶段 5:目录初始化
|
||||||
|
|
||||||
|
创建以下目录(`mkdir -p`,幂等):
|
||||||
|
- `configs/` — LLM Profile 持久化存储
|
||||||
|
- `logs/` — 评估日志 + 服务器日志
|
||||||
|
- `outputs/` — 评估运行产物
|
||||||
|
- `datasets/` — 原始数据集
|
||||||
|
|
||||||
|
### 3.6 阶段 6:Demo 数据
|
||||||
|
|
||||||
|
- 检查 `outputs/kba-knowledge-base-offline-baseline/` 是否存在
|
||||||
|
- 不存在则运行 `.venv/bin/python scripts/seed_sample_run.py`
|
||||||
|
- 失败时打印 `[WARN]`(非致命,报告页为空但服务可启动)
|
||||||
|
|
||||||
|
### 3.7 阶段 7:端口检测
|
||||||
|
|
||||||
|
- 默认端口 `8800`
|
||||||
|
- 用 `ss -tlnp` 或 `netstat -tlnp` 检查是否占用
|
||||||
|
- 占用则尝试 `8801`,仍占用则报错退出
|
||||||
|
|
||||||
|
### 3.8 阶段 8:启动服务
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nohup .venv/bin/python webmain.py \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port $PORT \
|
||||||
|
>> logs/server.log 2>&1 &
|
||||||
|
echo $! > .server.pid
|
||||||
|
```
|
||||||
|
|
||||||
|
- 等待 2 秒后用 `kill -0 $PID` 检测进程是否存活
|
||||||
|
- 存活则打印 URL 和 stop 方法
|
||||||
|
- 未存活则打印 `[ERROR] Server failed to start. Check logs/server.log.` 并 `exit 1`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. `start.sh` 详细设计
|
||||||
|
|
||||||
|
单独负责启动,不做任何环境初始化。
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# 检查 .venv 存在
|
||||||
|
# 端口检测(同 deploy.sh 逻辑)
|
||||||
|
# 检查 .env 存在(不存在则 warn 但不阻止)
|
||||||
|
# nohup 启动 + PID 文件 + 存活验证
|
||||||
|
# 打印 URL
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. `stop.sh` 详细设计
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# 读取 .server.pid
|
||||||
|
# 若文件不存在:打印 "No server PID file found." 退出
|
||||||
|
# kill $PID
|
||||||
|
# 等待 2 秒,若进程仍存活用 kill -9
|
||||||
|
# 删除 .server.pid
|
||||||
|
# 打印 "Server stopped."
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. `run_eval.sh` 详细设计
|
||||||
|
|
||||||
|
对应 Windows 的 `run_eval.ps1`。
|
||||||
|
|
||||||
|
```
|
||||||
|
用法:
|
||||||
|
./run_eval.sh # online eval (默认)
|
||||||
|
./run_eval.sh offline # offline smoke
|
||||||
|
./run_eval.sh scenarios/xxx.yaml # 自定义场景
|
||||||
|
./run_eval.sh online DEBUG # 自定义日志级别
|
||||||
|
```
|
||||||
|
|
||||||
|
- 参数 1(Scenario):`online` / `offline` / 文件路径,默认 `online`
|
||||||
|
- 参数 2(LogLevel):`DEBUG` / `INFO` / `WARNING` / `ERROR`,默认 `INFO`
|
||||||
|
- 场景别名映射:
|
||||||
|
- `online` → `scenarios/online/siemens-pdf-question-bank-online.yaml`
|
||||||
|
- `offline` → `scenarios/offline/siemens-pdf-offline-smoke.yaml`
|
||||||
|
- 时间戳日志文件:`logs/eval_$(date +%Y-%m-%d_%H%M%S).log`
|
||||||
|
- 环境变量:`PYTHONIOENCODING=utf-8 PYTHONPATH=.`
|
||||||
|
- 调用:`.venv/bin/python main.py --scenario $SCENARIO --log-file $LOG_FILE --log-level $LOG_LEVEL`
|
||||||
|
- 非零退出码时打印错误并 `exit 1`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. 通用约定
|
||||||
|
|
||||||
|
- 所有脚本首行:`#!/usr/bin/env bash`
|
||||||
|
- `set -euo pipefail` — 错误立即退出,未定义变量报错,管道错误传播
|
||||||
|
- `SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"` — 从任意目录执行均正确
|
||||||
|
- `cd "$SCRIPT_DIR"` — 切换到仓库根目录
|
||||||
|
- 颜色输出:绿色 `[OK]`、黄色 `[WARN]`、红色 `[ERROR]`(检测 tty,非交互式终端降级为无色)
|
||||||
|
- 执行权限:脚本自身需要 `chmod +x`(在 deploy.sh 内对其他脚本自动 chmod)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. 不在范围内
|
||||||
|
|
||||||
|
- Docker / docker-compose 支持
|
||||||
|
- systemd service 配置
|
||||||
|
- Nginx 反向代理配置
|
||||||
|
- SSL/TLS 配置
|
||||||
|
- 离线/内网镜像源配置
|
||||||
116
docs/superpowers/specs/2026-06-24-async-score-jobs-design.md
Normal file
116
docs/superpowers/specs/2026-06-24-async-score-jobs-design.md
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
# 异步评分记录功能设计
|
||||||
|
|
||||||
|
**日期**: 2026-06-24
|
||||||
|
**状态**: 已批准,待实现
|
||||||
|
**范围**: 新增 `POST /api/score/async` 异步评分端点,评分结果持久化到磁盘,前端新增「评分记录」页面展示。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
- Dify 工作流调用 `/api/score/async` 立即返回 `job_id`(202),不等待评分完成
|
||||||
|
- 后台异步执行 RAGAS 评分,结果写入 `outputs/score-jobs/<job_id>.json`
|
||||||
|
- RAGAS 平台新增「评分记录」导航页,列表展示所有评分记录及状态
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 架构
|
||||||
|
|
||||||
|
```
|
||||||
|
Dify → POST /api/score/async → 202 {job_id, status:"queued"}
|
||||||
|
↓
|
||||||
|
ScoreJobManager (线程池)
|
||||||
|
↓
|
||||||
|
InlineScorer.score()
|
||||||
|
↓
|
||||||
|
outputs/score-jobs/<job_id>.json
|
||||||
|
↓
|
||||||
|
GET /api/score/jobs ← 前端「评分记录」页轮询
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 存储格式
|
||||||
|
|
||||||
|
`outputs/score-jobs/<job_id>.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"job_id": "abc123def456",
|
||||||
|
"status": "completed",
|
||||||
|
"created_at": "2026-06-24T09:00:00+00:00",
|
||||||
|
"finished_at": "2026-06-24T09:00:15+00:00",
|
||||||
|
"request": {
|
||||||
|
"question": "双源CT的时间分辨率是多少?",
|
||||||
|
"answer": "双源CT的单扇区时间分辨率为75ms。",
|
||||||
|
"contexts": null,
|
||||||
|
"ground_truth": null,
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
"judge_model": "gpt-5",
|
||||||
|
"embedding_model": "text-embedding-3-small"
|
||||||
|
},
|
||||||
|
"scores": {"answer_relevancy": 0.9075},
|
||||||
|
"weighted_score": 0.9075,
|
||||||
|
"latency_ms": 12500,
|
||||||
|
"skipped_metrics": [],
|
||||||
|
"error": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. API 端点
|
||||||
|
|
||||||
|
### `POST /api/score/async`
|
||||||
|
|
||||||
|
请求体与 `POST /api/score` 完全相同(`ScoreRequest`)。
|
||||||
|
|
||||||
|
```json
|
||||||
|
// 立即返回 202
|
||||||
|
{"job_id": "abc123def456", "status": "queued"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GET /api/score/jobs`
|
||||||
|
|
||||||
|
返回所有评分记录,按创建时间倒序:
|
||||||
|
```json
|
||||||
|
{"jobs": [{...AsyncScoreJobStatus...}]}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GET /api/score/jobs/{job_id}`
|
||||||
|
|
||||||
|
返回单条评分记录详情。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. 新增文件
|
||||||
|
|
||||||
|
| 文件 | 职责 |
|
||||||
|
|------|------|
|
||||||
|
| `webapp/services/score_job_manager.py` | ScoreJobManager:线程池 + JSON 持久化 |
|
||||||
|
| `webapp/api/score_jobs.py` | 3 个端点路由 |
|
||||||
|
| `webapp/static/js/score_jobs.js` | 前端列表逻辑 + 轮询 |
|
||||||
|
|
||||||
|
## 6. 修改文件
|
||||||
|
|
||||||
|
| 文件 | 改动 |
|
||||||
|
|------|------|
|
||||||
|
| `webapp/models.py` | 新增 `AsyncScoreJobStatus`、`AsyncScoreJobResponse` |
|
||||||
|
| `webapp/server.py` | 注册 score_jobs router,更新 OPENAPI_TAGS |
|
||||||
|
| `webapp/static/index.html` | 新增导航项 + section |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. 前端「评分记录」页
|
||||||
|
|
||||||
|
列表列:时间 / 问题摘要(前40字)/ 指标 / 得分 / 状态
|
||||||
|
|
||||||
|
- 进入页面自动刷新
|
||||||
|
- `queued/running` 记录每 5 秒轮询 `GET /api/score/jobs/{job_id}` 更新状态
|
||||||
|
- 得分按 scoreClass(good/warn/bad)着色
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Dify 改造
|
||||||
|
|
||||||
|
只改 HTTP 节点 URL:`/api/score` → `/api/score/async`,删除解析响应的代码节点。
|
||||||
1
logs/online_eval.log
Normal file
1
logs/online_eval.log
Normal file
@@ -0,0 +1 @@
|
|||||||
|
Completed run: C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\outputs\online\siemens-pdf-question-bank
|
||||||
24
logs/server_2026-06-23.log
Normal file
24
logs/server_2026-06-23.log
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
2026-06-23 13:55:00 INFO webapp.server Starting RAGAS Console host=127.0.0.1 port=8800 log_level=info log_file=C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\logs\server_2026-06-23.log
|
||||||
|
2026-06-23 13:55:14 INFO uvicorn.error Started server process [83868]
|
||||||
|
2026-06-23 13:55:14 INFO uvicorn.error Waiting for application startup.
|
||||||
|
2026-06-23 13:55:14 INFO uvicorn.error Application startup complete.
|
||||||
|
2026-06-23 13:55:14 INFO uvicorn.error Uvicorn running on http://127.0.0.1:8800 (Press CTRL+C to quit)
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:53487 - "GET / HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:53487 - "GET /static/css/app.css HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:50321 - "GET /static/js/api.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:51325 - "GET /static/js/profiles.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:59869 - "GET /static/js/report.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:48 INFO uvicorn.access 127.0.0.1:50980 - "GET /static/js/runner.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:48 INFO uvicorn.access 127.0.0.1:63223 - "GET /static/js/app.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:48 INFO webapp.access GET /docs → 200 (0ms)
|
||||||
|
2026-06-23 13:59:48 INFO uvicorn.access 127.0.0.1:63223 - "GET /docs HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:48 INFO webapp.access GET /api/health → 200 (0ms)
|
||||||
|
2026-06-23 13:59:48 INFO uvicorn.access 127.0.0.1:50321 - "GET /api/health HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:49 INFO webapp.api.runs [get_runs] found 19 runs
|
||||||
|
2026-06-23 13:59:49 INFO webapp.access GET /api/runs → 200 (1094ms)
|
||||||
|
2026-06-23 13:59:49 INFO uvicorn.access 127.0.0.1:63223 - "GET /api/runs HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:49 INFO webapp.access GET /openapi.json → 200 (94ms)
|
||||||
|
2026-06-23 13:59:49 INFO uvicorn.access 127.0.0.1:63223 - "GET /openapi.json HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:50 INFO webapp.api.llm_profiles [list_profiles] count=6
|
||||||
|
2026-06-23 13:59:50 INFO webapp.access GET /api/llm-profiles → 200 (0ms)
|
||||||
|
2026-06-23 13:59:50 INFO uvicorn.access 127.0.0.1:63223 - "GET /api/llm-profiles HTTP/1.1" 200
|
||||||
35
logs/siemens_build.log
Normal file
35
logs/siemens_build.log
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
[info] generating questions for: 315_1_Flash????????.pdf
|
||||||
|
[info] 315_1_Flash????????.pdf: 6 questions generated (total so far: 6)
|
||||||
|
[info] generating questions for: 316_2_Flash??????_??.pdf
|
||||||
|
[info] 316_2_Flash??????_??.pdf: 10 questions generated (total so far: 16)
|
||||||
|
[info] generating questions for: 317_3_Flash??????_??.pdf
|
||||||
|
[info] 317_3_Flash??????_??.pdf: 9 questions generated (total so far: 25)
|
||||||
|
[info] generating questions for: 318_4_Flash??????_???.pdf
|
||||||
|
[info] 318_4_Flash??????_???.pdf: 9 questions generated (total so far: 34)
|
||||||
|
[info] generating questions for: 319_5_Flash??????_?????.pdf
|
||||||
|
[info] 319_5_Flash??????_?????.pdf: 10 questions generated (total so far: 44)
|
||||||
|
[info] generating questions for: 320_6_Flash??????_??.pdf
|
||||||
|
[info] 320_6_Flash??????_??.pdf: 8 questions generated (total so far: 52)
|
||||||
|
[info] generating questions for: 321_??CT???????????--??.pdf
|
||||||
|
[info] 321_??CT???????????--??.pdf: 5 questions generated (total so far: 57)
|
||||||
|
[info] generating questions for: 322_??CT???????????--??????????.pdf
|
||||||
|
[info] 322_??CT???????????--??????????.pdf: 8 questions generated (total so far: 65)
|
||||||
|
[info] generating questions for: 323_??CT???????????--?????????.pdf
|
||||||
|
[info] 323_??CT???????????--?????????.pdf: 5 questions generated (total so far: 70)
|
||||||
|
[info] generating questions for: 324_??CT???????????--????????.pdf
|
||||||
|
[info] 324_??CT???????????--????????.pdf: 8 questions generated (total so far: 78)
|
||||||
|
[info] generating questions for: 325_??CT???????????--???????.pdf
|
||||||
|
[info] 325_??CT???????????--???????.pdf: 8 questions generated (total so far: 86)
|
||||||
|
[info] generating questions for: 326_??CT???????????--4D????.pdf
|
||||||
|
[info] 326_??CT???????????--4D????.pdf: 7 questions generated (total so far: 93)
|
||||||
|
[info] generating questions for: 327_??CT???????????--??????.pdf
|
||||||
|
[info] 327_??CT???????????--??????.pdf: 8 questions generated (total so far: 101)
|
||||||
|
[info] generating questions for: 749_????01_???????????.pdf
|
||||||
|
[info] 749_????01_???????????.pdf: 8 questions generated (total so far: 109)
|
||||||
|
[info] generating questions for: 804_????02-????????CT?????X-Map??.pdf
|
||||||
|
[info] 804_????02-????????CT?????X-Map??.pdf: 8 questions generated (total so far: 117)
|
||||||
|
[info] generating questions for: 805_????03_????????????????.pdf
|
||||||
|
[info] 805_????03_????????????????.pdf: 6 questions generated (total so far: 123)
|
||||||
|
[info] generating questions for: 807_???CT???????_SJ-L10.2??1-5.pdf
|
||||||
|
[info] 807_???CT???????_SJ-L10.2??1-5.pdf: 9 questions generated (total so far: 132)
|
||||||
|
Completed dataset build: C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\outputs\dataset-builds\siemens-pdf-question-bank\2026-06-15T09-28-35.302231+00-00
|
||||||
19
main.py
19
main.py
@@ -1,6 +1,8 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from rag_eval.dataset_builder.runner import run_dataset_build
|
from rag_eval.dataset_builder.runner import run_dataset_build
|
||||||
from rag_eval.execution.runner import run_scenario
|
from rag_eval.execution.runner import run_scenario
|
||||||
@@ -18,18 +20,33 @@ def parse_args() -> argparse.Namespace:
|
|||||||
"--dataset-build-config",
|
"--dataset-build-config",
|
||||||
help="Path to a YAML dataset build config file.",
|
help="Path to a YAML dataset build config file.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--log-file",
|
||||||
|
default=None,
|
||||||
|
help="Write evaluation logs to this file (in addition to stderr). "
|
||||||
|
"Example: logs/eval.log",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--log-level",
|
||||||
|
default="INFO",
|
||||||
|
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||||
|
help="Logging verbosity level (default: INFO). Use DEBUG for per-metric detail.",
|
||||||
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
"""Dispatch the CLI call to the requested workflow."""
|
"""Dispatch the CLI call to the requested workflow."""
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
log_level = getattr(logging, args.log_level.upper(), logging.INFO)
|
||||||
|
log_file = Path(args.log_file) if args.log_file else None
|
||||||
|
|
||||||
if args.dataset_build_config:
|
if args.dataset_build_config:
|
||||||
result = run_dataset_build(args.dataset_build_config)
|
result = run_dataset_build(args.dataset_build_config)
|
||||||
print(f"Completed dataset build: {result.artifact_paths.root_dir}")
|
print(f"Completed dataset build: {result.artifact_paths.root_dir}")
|
||||||
return
|
return
|
||||||
|
|
||||||
result = run_scenario(args.scenario)
|
result = run_scenario(args.scenario, log_file=log_file, log_level=log_level)
|
||||||
print(f"Completed run: {result.scenario.output_dir}")
|
print(f"Completed run: {result.scenario.output_dir}")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -17,3 +17,8 @@ dependencies = [
|
|||||||
"pydantic-settings>=2.14.1",
|
"pydantic-settings>=2.14.1",
|
||||||
"ragas==0.4.3",
|
"ragas==0.4.3",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
# 只打包源码目录,排除运行时产生的数据目录
|
||||||
|
include = ["rag_eval*", "apps*", "webapp*"]
|
||||||
|
exclude = ["logs*", "outputs*", "datasets*", "configs*", "scenarios*", "scripts*", "tests*"]
|
||||||
|
|||||||
67
rag_eval/advisor/__init__.py
Normal file
67
rag_eval/advisor/__init__.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
"""Optimization advisor: rule-based diagnosis + LLM-powered recommendations."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rag_eval.reporting.artifacts import build_artifact_paths
|
||||||
|
from rag_eval.shared.models import EvaluationResult, Scenario
|
||||||
|
|
||||||
|
from .llm_analyzer import analyze
|
||||||
|
from .rules import Diagnosis, diagnose
|
||||||
|
from .writer import write_advice
|
||||||
|
|
||||||
|
logger = logging.getLogger("rag_eval.advisor")
|
||||||
|
|
||||||
|
__all__ = ["run_advisor", "Diagnosis", "diagnose"]
|
||||||
|
|
||||||
|
|
||||||
|
def run_advisor(
    result: EvaluationResult,
    scenario: Scenario,
    llm: Any,
) -> None:
    """Produce the optimization-advice artifact for a finished evaluation run.

    The function is a no-op when ``scenario.optimization_advisor`` is falsy.
    Any exception raised while diagnosing, calling the LLM or writing the
    report is caught and logged as a warning, so this function itself does
    not propagate errors to the caller.

    Args:
        result: Evaluation result; ``run_id`` and ``score_rows`` are read.
        scenario: Scenario; ``optimization_advisor``, ``scenario_name``,
            ``output_dir``, ``metrics`` and ``judge_model`` are read.
        llm: LLM object forwarded unchanged to ``analyze``.
    """
    if not scenario.optimization_advisor:
        return

    logger.info("[advisor] starting optimization analysis scenario=%s", scenario.scenario_name)

    try:
        paths = build_artifact_paths(scenario.output_dir, result.run_id)
        if paths.advice_md is None:
            logger.warning("[advisor] advice_md path not set in RunArtifactPaths — skipping")
            return

        findings = diagnose(result.score_rows, scenario.metrics)
        logger.info("[advisor] rule diagnosis complete: %d metric(s) triggered", len(findings))

        # The LLM is only consulted when at least one rule fired.
        report_md = (
            asyncio.run(analyze(findings, llm, scenario.scenario_name))
            if findings
            else ""
        )

        write_advice(
            diagnoses=findings,
            llm_markdown=report_md,
            advice_path=paths.advice_md,
            scenario_name=scenario.scenario_name,
            run_id=result.run_id,
            judge_model=scenario.judge_model,
        )
    except Exception as exc:
        # Advice is best-effort: report the failure and leave the run result alone.
        logger.warning(
            "[advisor] advisor failed (%s: %s) — evaluation result is unaffected",
            type(exc).__name__, exc,
        )
|
||||||
109
rag_eval/advisor/llm_analyzer.py
Normal file
109
rag_eval/advisor/llm_analyzer.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
"""LLM-powered analysis of rule diagnostics and low-score samples."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .rules import Diagnosis
|
||||||
|
|
||||||
|
logger = logging.getLogger("rag_eval.advisor")
|
||||||
|
|
||||||
|
_PROMPT_TEMPLATE = """\
|
||||||
|
你是一个 RAG 系统优化专家,正在分析西门子医疗 CT 文档问答系统的评测结果。
|
||||||
|
请用中文撰写一份优化建议报告,格式为 Markdown。
|
||||||
|
|
||||||
|
## 评测诊断摘要
|
||||||
|
|
||||||
|
{diagnosis_summary}
|
||||||
|
|
||||||
|
## 低分样本示例
|
||||||
|
|
||||||
|
{low_sample_text}
|
||||||
|
|
||||||
|
## 报告要求
|
||||||
|
|
||||||
|
1. 按指标分节(## 指标名 [严重程度]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
|
||||||
|
2. 严重程度说明:critical=严重(<阈值50%),warning=警告(<阈值70%),low=待优化(低于0.85,有提升空间)
|
||||||
|
3. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
|
||||||
|
4. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先),critical 和 warning 项优先于 low 项
|
||||||
|
5. 语言简洁,面向工程师,不要废话,不要重复列表内容
|
||||||
|
|
||||||
|
只输出 Markdown 报告正文,不要任何前置说明。
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
_SEVERITY_LABEL_ZH: dict[str, str] = {
|
||||||
|
"critical": "严重",
|
||||||
|
"warning": "警告",
|
||||||
|
"low": "待优化",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
|
||||||
|
lines = []
|
||||||
|
for d in diagnoses:
|
||||||
|
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
|
||||||
|
label = _SEVERITY_LABEL_ZH.get(d.severity, d.severity)
|
||||||
|
lines.append(
|
||||||
|
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f},"
|
||||||
|
f"阈值={d.threshold},严重程度={label}"
|
||||||
|
)
|
||||||
|
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
|
||||||
|
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_low_sample_text(diagnoses: list[Diagnosis]) -> str:
|
||||||
|
lines = []
|
||||||
|
for d in diagnoses:
|
||||||
|
if not d.low_samples:
|
||||||
|
continue
|
||||||
|
lines.append(f"### {d.metric} 低分样本(最多 3 条)")
|
||||||
|
for i, s in enumerate(d.low_samples, 1):
|
||||||
|
score = s.get(d.metric, "N/A")
|
||||||
|
lines.append(f"\n**样本 {i}**(分数={score})")
|
||||||
|
lines.append(f"- 问题:{s.get('question', '')}")
|
||||||
|
lines.append(f"- 回答:{s.get('answer', '')[:300]}")
|
||||||
|
lines.append(f"- 标准答案:{s.get('ground_truth', '')[:200]}")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
async def analyze(
    diagnoses: list[Diagnosis],
    llm: Any,
    scenario_name: str,
) -> str:
    """Ask the judge LLM for a Chinese Markdown optimization report.

    Args:
        diagnoses: Diagnosis objects to summarise. An empty list returns ""
            without building a prompt or calling the LLM.
        llm: Object exposing a ``langchain_llm`` attribute whose
            ``ainvoke(messages)`` coroutine returns a message with a string
            ``content`` (presumably a RAGAS LangchainLLMWrapper — confirm
            against the caller).
        scenario_name: Used only in log messages.

    Returns:
        The stripped LLM response, or "" when there is nothing to analyse or
        any step fails. Per the original contract, "" is meant to trigger the
        writer's fallback report.
    """
    if not diagnoses:
        return ""

    try:
        # Prompt assembly is inside the try so that a malformed sample degrades
        # to the "" fallback instead of escaping to the caller.
        prompt = _PROMPT_TEMPLATE.format(
            diagnosis_summary=_build_diagnosis_summary(diagnoses),
            low_sample_text=_build_low_sample_text(diagnoses),
        )

        logger.info("[advisor] calling LLM for optimization analysis scenario=%s", scenario_name)
        # Imported lazily: a missing langchain_core also falls back to "".
        from langchain_core.messages import HumanMessage

        # Talk to the wrapped chat model directly rather than through the wrapper.
        response = await llm.langchain_llm.ainvoke([HumanMessage(content=prompt)])
        text = response.content.strip()
        logger.info("[advisor] LLM analysis complete chars=%d", len(text))
        return text
    except Exception as exc:
        logger.warning(
            "[advisor] LLM analysis failed (%s: %s) — falling back to rule report",
            type(exc).__name__, exc,
        )
        return ""
|
||||||
243
rag_eval/advisor/rules.py
Normal file
243
rag_eval/advisor/rules.py
Normal file
@@ -0,0 +1,243 @@
|
|||||||
|
"""Rule-based diagnostic engine for RAG evaluation metric scores."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MetricRule:
    """Per-metric thresholds plus the canned diagnostic text attached to a finding."""

    # Boundary of the "warning" tier.
    warning_threshold: float
    # Boundary of the "critical" tier.
    critical_threshold: float
    # True when larger scores are better; False for noise_sensitivity.
    higher_is_better: bool
    # Candidate explanations copied into each diagnosis for this metric.
    root_causes: list[str]
    # Remediation hints copied into each diagnosis for this metric.
    suggested_actions: list[str]
    # Means under this value (but above the warning tier) yield a "low"
    # advisory. Consulted only for higher_is_better metrics.
    advisory_threshold: float = 0.85
|
||||||
|
|
||||||
|
|
||||||
|
# Diagnostic rule table keyed by metric name. Metrics absent from this table
# are ignored by the rule engine.
METRIC_RULES: dict[str, MetricRule] = {
    # --- generation quality -------------------------------------------------
    "faithfulness": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "生成回答包含检索片段中不支持的陈述(幻觉)",
            "生成阶段未严格遵循 grounding 约束",
            "校验阶段未开启或未生效",
        ],
        suggested_actions=[
            "强化生成 prompt 的 grounding 约束('只依据参考资料作答')",
            "开启校验阶段(validation: by_scenario)",
            "检查低分样本中模型是否引用了片段外的知识",
        ],
    ),
    "answer_relevancy": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "回答偏离问题主旨或包含大量冗余内容",
            "查询改写后问题语义漂移",
            "生成 prompt 格式约束不足",
        ],
        suggested_actions=[
            "优化查询改写 prompt,确保改写后语义不偏移",
            "在生成 prompt 中加入'简洁准确、直接回答问题'的约束",
            "检查低分样本的回答是否存在格式冗余或话题偏移",
        ],
    ),
    # --- retrieval quality --------------------------------------------------
    "context_recall": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "检索未能召回标准答案所涉及的关键信息",
            "单一查询未能覆盖问题的多个角度",
            "过召回数量不足,关键片段被截断",
        ],
        suggested_actions=[
            "启用多查询扩展(use_multi_query)覆盖不同措辞",
            "对多跳问题启用问题分解(sub_questions)",
            "加大过召回宽度(recall_top_k)",
            "对颗粒度细的问题尝试 Step-back 双路检索",
        ],
    ),
    "context_precision": MetricRule(
        warning_threshold=0.6,
        critical_threshold=0.4,
        higher_is_better=True,
        root_causes=[
            "检索引入过多与问题无关的片段",
            "重排未能将相关片段排在前列",
            "缺少相关性过滤,噪声片段进入上下文",
        ],
        suggested_actions=[
            "启用或优化 listwise 重排,将相关片段排在前列",
            "启用上下文压缩(compression)过滤无关句子",
            "启用相关性过滤(relevance_filter)丢弃明确无关片段",
            "缩小 rerank_keep_k(如从 8 降到 5)",
        ],
    ),
    # Lower is better here: the thresholds are upper bounds on the mean.
    "noise_sensitivity": MetricRule(
        warning_threshold=0.3,
        critical_threshold=0.5,
        higher_is_better=False,
        root_causes=[
            "回答中包含检索到的噪声片段所引入的错误陈述",
            "相关性过滤未能拦截干扰性片段",
            "生成阶段对噪声片段未加区分地引用",
        ],
        suggested_actions=[
            "启用相关性过滤(relevance_filter)拦截噪声",
            "优化重排,将不相关片段排到截断点之后",
            "在生成 prompt 中强调'来源冲突时并列陈述,不擅自下定论'",
        ],
    ),
    # --- comparison against the reference answer ----------------------------
    "factual_correctness": MetricRule(
        warning_threshold=0.6,
        critical_threshold=0.4,
        higher_is_better=True,
        root_causes=[
            "回答的事实陈述与标准答案存在偏差",
            "检索未能命中标准答案所依据的关键片段",
            "生成阶段对多个来源综合时产生事实错误",
        ],
        suggested_actions=[
            "重点检查低分样本,确认是检索遗漏还是生成错误",
            "提升 context_recall 以确保关键信息被检索到",
            "对事实型问题将 temperature 降至 0",
        ],
    ),
    "semantic_similarity": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "回答语义与标准答案差距较大",
            "回答过于简短或过于冗长,语义偏移",
            "检索到的片段质量不足,导致生成内容偏离",
        ],
        suggested_actions=[
            "检查低分样本的回答与标准答案的表述差异",
            "优化生成 prompt 使回答更贴近标准表述风格",
            "提升检索质量(context_recall / context_precision)",
        ],
    ),
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Diagnosis:
    """One metric whose mean score crossed a diagnostic threshold."""

    # Name of the metric the finding refers to.
    metric: str
    # Mean score of that metric.
    mean_score: float
    # The threshold value that was crossed.
    threshold: float
    # Severity tier: "critical", "warning" or "low".
    severity: str
    # Candidate explanations for the poor score.
    root_causes: list[str] = field(default_factory=list)
    # Remediation hints for the poor score.
    suggested_actions: list[str] = field(default_factory=list)
    # Trimmed score rows illustrating the problem.
    low_samples: list[dict[str, Any]] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def _mean_ignoring_nan(values: list[float]) -> float | None:
|
||||||
|
valid = [v for v in values if not math.isnan(v)]
|
||||||
|
if not valid:
|
||||||
|
return None
|
||||||
|
return sum(valid) / len(valid)
|
||||||
|
|
||||||
|
|
||||||
|
def _select_low_samples(
|
||||||
|
rows: list[dict[str, Any]],
|
||||||
|
metric: str,
|
||||||
|
top_n: int,
|
||||||
|
higher_is_better: bool,
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
"""Return the top_n worst-scoring rows for a metric, excluding NaN."""
|
||||||
|
valid = [r for r in rows if metric in r and not math.isnan(float(r[metric]))]
|
||||||
|
sorted_rows = sorted(valid, key=lambda r: float(r[metric]), reverse=not higher_is_better)
|
||||||
|
worst = sorted_rows[:top_n]
|
||||||
|
keep_keys = {"sample_id", "question", "answer", "ground_truth", metric}
|
||||||
|
return [{k: v for k, v in row.items() if k in keep_keys} for row in worst]
|
||||||
|
|
||||||
|
|
||||||
|
def _numeric_values(score_rows: list[dict[str, Any]], metric: str) -> list[float]:
    """Collect the metric values that can be converted to float (NaN included)."""
    collected: list[float] = []
    for row in score_rows:
        raw = row.get(metric)
        if raw is None:
            continue
        try:
            collected.append(float(raw))
        except (TypeError, ValueError):
            continue
    return collected


def _classify_severity(rule: MetricRule, mean: float) -> tuple[str, float] | None:
    """Map a mean score to ``(severity, crossed_threshold)``, or None when no tier fires."""
    if rule.higher_is_better:
        # Three tiers, checked from worst to mildest.
        tiers = (
            ("critical", rule.critical_threshold),
            ("warning", rule.warning_threshold),
            ("low", rule.advisory_threshold),
        )
        for severity, threshold in tiers:
            if mean < threshold:
                return severity, threshold
        return None

    # Lower-is-better metrics only have the critical and warning tiers.
    for severity, threshold in (
        ("critical", rule.critical_threshold),
        ("warning", rule.warning_threshold),
    ):
        if mean > threshold:
            return severity, threshold
    return None


def diagnose(
    score_rows: list[dict[str, Any]],
    metrics: list[str],
    top_low_samples: int = 3,
) -> list[Diagnosis]:
    """Produce one Diagnosis for every requested metric whose mean crosses a tier.

    Args:
        score_rows: Per-sample score dicts keyed by metric name.
        metrics: Metric names to inspect; names without an entry in
            METRIC_RULES are ignored.
        top_low_samples: Number of worst-scoring rows attached to each finding.

    Returns:
        Findings in the order of ``metrics``. Metrics with no usable values,
        or whose mean crosses no tier, contribute nothing.
    """
    findings: list[Diagnosis] = []

    for metric in metrics:
        rule = METRIC_RULES.get(metric)
        if rule is None:
            continue

        values = _numeric_values(score_rows, metric)
        if not values:
            continue

        mean = _mean_ignoring_nan(values)
        if mean is None:
            continue

        verdict = _classify_severity(rule, mean)
        if verdict is None:
            continue
        severity, threshold = verdict

        findings.append(Diagnosis(
            metric=metric,
            mean_score=round(mean, 4),
            threshold=threshold,
            severity=severity,
            # Copy the lists so findings never alias the shared rule table.
            root_causes=list(rule.root_causes),
            suggested_actions=list(rule.suggested_actions),
            low_samples=_select_low_samples(
                score_rows, metric, top_low_samples, rule.higher_is_better
            ),
        ))

    return findings
|
||||||
93
rag_eval/advisor/writer.py
Normal file
93
rag_eval/advisor/writer.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
"""Write optimization advice to markdown file and emit log summary."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .rules import Diagnosis
|
||||||
|
|
||||||
|
logger = logging.getLogger("rag_eval.advisor")
|
||||||
|
|
||||||
|
# Chinese display labels for each severity tier.
|
||||||
|
_SEVERITY_LABEL: dict[str, str] = {
|
||||||
|
"critical": "严重",
|
||||||
|
"warning": "警告",
|
||||||
|
"low": "待优化",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
|
||||||
|
"""Return a single-line log summary of triggered diagnoses."""
|
||||||
|
if not diagnoses:
|
||||||
|
return "[advisor] 所有指标正常,无需优化建议。"
|
||||||
|
parts = [
|
||||||
|
f"{d.metric}({d.mean_score:.2f},{_SEVERITY_LABEL.get(d.severity, d.severity)})"
|
||||||
|
for d in diagnoses
|
||||||
|
]
|
||||||
|
triggered = " ".join(parts)
|
||||||
|
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered} → {advice_path}"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
|
||||||
|
"""Build a rules-only report when LLM analysis is unavailable."""
|
||||||
|
if not diagnoses:
|
||||||
|
return ""
|
||||||
|
lines = ["## 规则诊断(LLM 分析不可用)\n"]
|
||||||
|
for d in diagnoses:
|
||||||
|
label = _SEVERITY_LABEL.get(d.severity, d.severity)
|
||||||
|
lines.append(f"### {d.metric} [{label}] 均值={d.mean_score:.4f}")
|
||||||
|
lines.append("\n**可能原因:**")
|
||||||
|
for cause in d.root_causes:
|
||||||
|
lines.append(f"- {cause}")
|
||||||
|
lines.append("\n**建议动作:**")
|
||||||
|
for action in d.suggested_actions:
|
||||||
|
lines.append(f"- {action}")
|
||||||
|
lines.append("")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def write_advice(
    diagnoses: list[Diagnosis],
    llm_markdown: str,
    advice_path: Path,
    scenario_name: str,
    run_id: str,
    judge_model: str,
) -> None:
    """Write optimization_advice.md and emit a log summary line.

    The report body is chosen in this order: a fixed "no anomalies" notice when
    there are no diagnoses, the LLM-generated Markdown when it has content,
    otherwise the rules-only fallback report.

    Args:
        diagnoses: List of Diagnosis from rules.diagnose().
        llm_markdown: LLM-generated Markdown body. An empty, whitespace-only or
            None value triggers the fallback report.
        advice_path: Full path to write the .md file; parent directories are
            created when missing.
        scenario_name: Human-readable scenario identifier for the report header.
        run_id: Run identifier string.
        judge_model: Model used for LLM analysis (shown in header).
    """
    advice_path.parent.mkdir(parents=True, exist_ok=True)

    # Imported at call time, as in the original.
    # NOTE(review): presumably returns an ISO-8601 UTC timestamp string — confirm.
    from rag_eval.shared.utils import utc_now_iso

    header_lines = [
        f"# 优化建议报告 — {scenario_name}",
        "",
        f"- run_id: `{run_id}`",
        f"- 生成时间: `{utc_now_iso()}`",
        f"- judge_model: `{judge_model}`",
        "",
        "---",
        "",
    ]

    # A blank-but-non-empty LLM reply carries no advice; treat it like "".
    has_llm_body = bool(llm_markdown and llm_markdown.strip())

    if not diagnoses:
        body = "## ✅ 未发现明显指标异常\n\n所有指标均在正常范围内,当前 RAG 链路表现良好。\n"
    elif has_llm_body:
        body = llm_markdown
    else:
        body = _build_fallback_report(diagnoses)

    # The trailing "" in header_lines makes the joined header end with a newline,
    # so the body starts on its own line right after the horizontal rule.
    content = "\n".join(header_lines) + body
    advice_path.write_text(content, encoding="utf-8")

    summary = _format_log_summary(diagnoses, advice_path)
    logger.info(summary)
    logger.info("[advisor] 优化建议已写出: %s", advice_path)
|
||||||
@@ -61,6 +61,9 @@ def load_scenario(path: str | Path) -> Scenario:
|
|||||||
max_samples=model.runtime.max_samples,
|
max_samples=model.runtime.max_samples,
|
||||||
),
|
),
|
||||||
source_path=scenario_path,
|
source_path=scenario_path,
|
||||||
|
optimization_advisor=model.optimization_advisor,
|
||||||
|
metric_weights=dict(model.metric_weights),
|
||||||
|
doc_weights=dict(model.doc_weights),
|
||||||
)
|
)
|
||||||
# Run cross-field checks after all relative paths have been resolved.
|
# Run cross-field checks after all relative paths have been resolved.
|
||||||
validate_scenario(scenario)
|
validate_scenario(scenario)
|
||||||
|
|||||||
@@ -54,6 +54,9 @@ class ScenarioModel(BaseModel):
|
|||||||
metrics: list[str]
|
metrics: list[str]
|
||||||
output_dir: str
|
output_dir: str
|
||||||
runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
|
runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
|
||||||
|
optimization_advisor: bool = False
|
||||||
|
metric_weights: dict[str, float] = Field(default_factory=dict)
|
||||||
|
doc_weights: dict[str, float] = Field(default_factory=dict)
|
||||||
|
|
||||||
@field_validator("metrics")
|
@field_validator("metrics")
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -3,6 +3,8 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from rag_eval.adapters.base import AppAdapter
|
from rag_eval.adapters.base import AppAdapter
|
||||||
@@ -10,9 +12,12 @@ from rag_eval.datasets.loader import load_dataset_records
|
|||||||
from rag_eval.datasets.normalizers import normalize_records
|
from rag_eval.datasets.normalizers import normalize_records
|
||||||
from rag_eval.execution.concurrency import gather_with_limit
|
from rag_eval.execution.concurrency import gather_with_limit
|
||||||
from rag_eval.metrics.pipeline import MetricPipeline
|
from rag_eval.metrics.pipeline import MetricPipeline
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score, resolve_weight
|
||||||
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
|
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
|
||||||
from rag_eval.shared.utils import utc_now_iso
|
from rag_eval.shared.utils import utc_now_iso
|
||||||
|
|
||||||
|
logger = logging.getLogger("rag_eval.execution.evaluator")
|
||||||
|
|
||||||
|
|
||||||
class Evaluator:
|
class Evaluator:
|
||||||
"""Coordinate dataset loading, optional app execution, and metric scoring."""
|
"""Coordinate dataset loading, optional app execution, and metric scoring."""
|
||||||
@@ -31,27 +36,61 @@ class Evaluator:
|
|||||||
def evaluate(self) -> EvaluationResult:
|
def evaluate(self) -> EvaluationResult:
|
||||||
"""Execute the full evaluation flow and return the collected results."""
|
"""Execute the full evaluation flow and return the collected results."""
|
||||||
started_at = utc_now_iso()
|
started_at = utc_now_iso()
|
||||||
|
scenario_name = self.scenario.scenario_name
|
||||||
|
mode = self.scenario.mode
|
||||||
|
logger.info("=" * 60)
|
||||||
|
logger.info("[eval] START scenario=%s mode=%s", scenario_name, mode)
|
||||||
|
logger.info("[eval] dataset=%s", self.scenario.dataset.path)
|
||||||
|
logger.info("[eval] metrics=%s", list(self.scenario.metrics))
|
||||||
|
logger.info("[eval] judge=%s embed=%s", self.scenario.judge_model, self.scenario.embedding_model)
|
||||||
|
|
||||||
raw_records = load_dataset_records(self.scenario.dataset.path)
|
raw_records = load_dataset_records(self.scenario.dataset.path)
|
||||||
|
logger.info("[eval] raw_records=%d", len(raw_records))
|
||||||
|
|
||||||
samples, invalid_samples = normalize_records(
|
samples, invalid_samples = normalize_records(
|
||||||
raw_records,
|
raw_records,
|
||||||
mode=self.scenario.mode,
|
mode=self.scenario.mode,
|
||||||
max_samples=self.scenario.runtime.max_samples,
|
max_samples=self.scenario.runtime.max_samples,
|
||||||
)
|
)
|
||||||
|
logger.info("[eval] normalized: valid=%d invalid=%d", len(samples), len(invalid_samples))
|
||||||
|
|
||||||
if self.scenario.mode == "online":
|
if self.scenario.mode == "online":
|
||||||
# Online mode enriches each sample by calling the target application first.
|
logger.info("[eval] online mode: calling app adapter for %d samples ...", len(samples))
|
||||||
|
t0 = time.monotonic()
|
||||||
samples, online_invalids = asyncio.run(self._enrich_online_samples(samples))
|
samples, online_invalids = asyncio.run(self._enrich_online_samples(samples))
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
invalid_samples.extend(online_invalids)
|
invalid_samples.extend(online_invalids)
|
||||||
|
logger.info(
|
||||||
|
"[eval] adapter done: enriched=%d adapter_invalids=%d elapsed=%.1fs",
|
||||||
|
len(samples), len(online_invalids), elapsed,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info("[eval] scoring %d samples with metric pipeline ...", len(samples))
|
||||||
|
t0 = time.monotonic()
|
||||||
metric_scores = asyncio.run(
|
metric_scores = asyncio.run(
|
||||||
self.metric_pipeline.score_samples(
|
self.metric_pipeline.score_samples(
|
||||||
samples,
|
samples,
|
||||||
max_concurrency=self.scenario.runtime.metric_limit(),
|
max_concurrency=self.scenario.runtime.metric_limit(),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
|
logger.info("[eval] metric scoring done elapsed=%.1fs", elapsed)
|
||||||
|
|
||||||
finished_at = utc_now_iso()
|
finished_at = utc_now_iso()
|
||||||
score_rows = [self._merge_score(sample, score) for sample, score in zip(samples, metric_scores)]
|
score_rows = [self._merge_score(sample, score) for sample, score in zip(samples, metric_scores)]
|
||||||
|
|
||||||
|
# Summary of NaN rates per metric
|
||||||
|
import math
|
||||||
|
for metric_name in self.scenario.metrics:
|
||||||
|
nan_count = sum(1 for row in score_rows if math.isnan(float(row.get(metric_name, float("nan")) or float("nan"))))
|
||||||
|
logger.info("[eval] %-22s NaN=%d/%d (%.0f%%)",
|
||||||
|
metric_name, nan_count, len(score_rows),
|
||||||
|
100 * nan_count / len(score_rows) if score_rows else 0)
|
||||||
|
|
||||||
run_id = finished_at.replace(":", "-")
|
run_id = finished_at.replace(":", "-")
|
||||||
|
logger.info("[eval] DONE run_id=%s total_valid=%d total_invalid=%d",
|
||||||
|
run_id, len(samples), len(invalid_samples))
|
||||||
|
logger.info("=" * 60)
|
||||||
return EvaluationResult(
|
return EvaluationResult(
|
||||||
scenario=self.scenario,
|
scenario=self.scenario,
|
||||||
run_id=run_id,
|
run_id=run_id,
|
||||||
@@ -72,13 +111,27 @@ class Evaluator:
|
|||||||
|
|
||||||
valid: list[NormalizedSample] = []
|
valid: list[NormalizedSample] = []
|
||||||
invalid: list[InvalidSample] = []
|
invalid: list[InvalidSample] = []
|
||||||
|
total = len(samples)
|
||||||
|
|
||||||
async def enrich_with_capture(sample: NormalizedSample) -> NormalizedSample | InvalidSample:
|
async def enrich_with_capture(idx: int, sample: NormalizedSample) -> NormalizedSample | InvalidSample:
|
||||||
"""Convert adapter exceptions into invalid samples instead of aborting the run."""
|
"""Convert adapter exceptions into invalid samples instead of aborting the run."""
|
||||||
|
sid = sample.sample_id[:12]
|
||||||
|
logger.debug("[adapter] [%d/%d] calling adapter sample=%s question=%r",
|
||||||
|
idx + 1, total, sid, (sample.question or "")[:60])
|
||||||
|
t0 = time.monotonic()
|
||||||
try:
|
try:
|
||||||
return await self.app_adapter.enrich_sample(sample)
|
result = await self.app_adapter.enrich_sample(sample)
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
|
ans_len = len(result.answer or "")
|
||||||
|
ctx_count = len(result.contexts or [])
|
||||||
|
logger.info("[adapter] [%d/%d] OK sample=%-12s ans_len=%d ctx_count=%d elapsed=%.1fs",
|
||||||
|
idx + 1, total, sid, ans_len, ctx_count, elapsed)
|
||||||
|
return result
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
error_type = type(exc).__name__
|
error_type = type(exc).__name__
|
||||||
|
logger.warning("[adapter] [%d/%d] FAIL sample=%-12s %s: %s (elapsed=%.1fs)",
|
||||||
|
idx + 1, total, sid, error_type, exc, elapsed)
|
||||||
return InvalidSample(
|
return InvalidSample(
|
||||||
sample_id=sample.sample_id,
|
sample_id=sample.sample_id,
|
||||||
error=f"adapter failed [{error_type}]: {exc}",
|
error=f"adapter failed [{error_type}]: {exc}",
|
||||||
@@ -86,8 +139,8 @@ class Evaluator:
|
|||||||
)
|
)
|
||||||
|
|
||||||
factories = [
|
factories = [
|
||||||
(lambda sample=sample: enrich_with_capture(sample))
|
(lambda _idx=i, _sample=sample: enrich_with_capture(_idx, _sample))
|
||||||
for sample in samples
|
for i, sample in enumerate(samples)
|
||||||
]
|
]
|
||||||
results = await gather_with_limit(factories, self.scenario.runtime.app_limit())
|
results = await gather_with_limit(factories, self.scenario.runtime.app_limit())
|
||||||
|
|
||||||
@@ -102,6 +155,8 @@ class Evaluator:
|
|||||||
if not sample.contexts:
|
if not sample.contexts:
|
||||||
errors.append("adapter returned empty contexts")
|
errors.append("adapter returned empty contexts")
|
||||||
if errors:
|
if errors:
|
||||||
|
logger.warning("[adapter] incomplete payload sample=%s errors=%s",
|
||||||
|
sample.sample_id[:12], errors)
|
||||||
invalid.append(
|
invalid.append(
|
||||||
InvalidSample(
|
InvalidSample(
|
||||||
sample_id=sample.sample_id,
|
sample_id=sample.sample_id,
|
||||||
@@ -111,10 +166,13 @@ class Evaluator:
|
|||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
valid.append(sample)
|
valid.append(sample)
|
||||||
|
|
||||||
|
logger.info("[adapter] enrichment summary: valid=%d invalid=%d of total=%d",
|
||||||
|
len(valid), len(invalid), total)
|
||||||
return valid, invalid
|
return valid, invalid
|
||||||
|
|
||||||
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
|
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
|
||||||
"""Combine sample data, metric results, and run metadata into one output row."""
|
"""Combine sample data, metric results, run metadata, and weight columns."""
|
||||||
record = sample.to_record()
|
record = sample.to_record()
|
||||||
record["contexts"] = sample.contexts
|
record["contexts"] = sample.contexts
|
||||||
record.update(score.metrics)
|
record.update(score.metrics)
|
||||||
@@ -122,4 +180,12 @@ class Evaluator:
|
|||||||
record["judge_model"] = self.scenario.judge_model
|
record["judge_model"] = self.scenario.judge_model
|
||||||
record["embedding_model"] = self.scenario.embedding_model
|
record["embedding_model"] = self.scenario.embedding_model
|
||||||
record["run_id"] = self.scenario.scenario_name
|
record["run_id"] = self.scenario.scenario_name
|
||||||
|
# 综合加权得分列(已暂时禁用)
|
||||||
|
# record["weighted_score"] = compute_weighted_score(
|
||||||
|
# score.metrics, self.scenario.metric_weights
|
||||||
|
# )
|
||||||
|
# doc_name = str(sample.metadata.get("doc_name", "") or "")
|
||||||
|
# record["sample_weight"] = resolve_weight(
|
||||||
|
# self.scenario.doc_weights, doc_name, default=1.0
|
||||||
|
# )
|
||||||
return record
|
return record
|
||||||
|
|||||||
@@ -2,16 +2,42 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from rag_eval.adapters.http import HttpAppAdapter
|
from rag_eval.adapters.http import HttpAppAdapter
|
||||||
from rag_eval.adapters.python import PythonFunctionAdapter
|
from rag_eval.adapters.python import PythonFunctionAdapter
|
||||||
|
from rag_eval.advisor import run_advisor
|
||||||
from rag_eval.config.loader import load_scenario
|
from rag_eval.config.loader import load_scenario
|
||||||
from rag_eval.metrics.factory import build_metric_pipeline
|
from rag_eval.metrics.factory import build_models, build_metric_pipeline
|
||||||
from rag_eval.reporting.writers import write_run_artifacts
|
from rag_eval.reporting.writers import write_run_artifacts
|
||||||
from rag_eval.settings import EvaluationSettings
|
from rag_eval.settings import EvaluationSettings
|
||||||
from rag_eval.shared.models import Scenario
|
from rag_eval.shared.models import Scenario
|
||||||
|
|
||||||
from .evaluator import Evaluator
|
from .evaluator import Evaluator
|
||||||
|
|
||||||
|
logger = logging.getLogger("rag_eval.execution.runner")
|
||||||
|
|
||||||
|
|
||||||
|
def _setup_logging(log_file: Path | None = None, level: int = logging.INFO) -> None:
|
||||||
|
"""Configure root logger: always write to stderr, optionally also to a file."""
|
||||||
|
fmt = "%(asctime)s %(levelname)-8s %(name)s %(message)s"
|
||||||
|
datefmt = "%H:%M:%S"
|
||||||
|
|
||||||
|
handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
|
||||||
|
if log_file is not None:
|
||||||
|
log_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
fh = logging.FileHandler(log_file, encoding="utf-8")
|
||||||
|
fh.setFormatter(logging.Formatter(fmt, datefmt=datefmt))
|
||||||
|
handlers.append(fh)
|
||||||
|
|
||||||
|
logging.basicConfig(level=level, format=fmt, datefmt=datefmt, handlers=handlers, force=True)
|
||||||
|
# Also show ragas internal logs at WARNING so we can see LLM errors
|
||||||
|
logging.getLogger("ragas").setLevel(logging.WARNING)
|
||||||
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||||
|
logging.getLogger("openai").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
||||||
def build_adapter(scenario: Scenario):
|
def build_adapter(scenario: Scenario):
|
||||||
"""Instantiate the adapter required by the resolved scenario, if any."""
|
"""Instantiate the adapter required by the resolved scenario, if any."""
|
||||||
@@ -27,16 +53,32 @@ def build_adapter(scenario: Scenario):
|
|||||||
def run_scenario(
|
def run_scenario(
|
||||||
scenario_path: str,
|
scenario_path: str,
|
||||||
settings: EvaluationSettings | None = None,
|
settings: EvaluationSettings | None = None,
|
||||||
|
log_file: Path | None = None,
|
||||||
|
log_level: int = logging.INFO,
|
||||||
):
|
):
|
||||||
"""Run one scenario end to end and persist its reporting artifacts."""
|
"""Run one scenario end to end and persist its reporting artifacts."""
|
||||||
|
_setup_logging(log_file=log_file, level=log_level)
|
||||||
|
logger.info("[runner] run_scenario path=%s", scenario_path)
|
||||||
|
|
||||||
settings = settings or EvaluationSettings()
|
settings = settings or EvaluationSettings()
|
||||||
if not settings.openai_api_key:
|
if not settings.openai_api_key:
|
||||||
raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")
|
raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")
|
||||||
|
|
||||||
scenario = load_scenario(scenario_path)
|
scenario = load_scenario(scenario_path)
|
||||||
|
logger.info("[runner] scenario loaded: name=%s mode=%s max_samples=%s",
|
||||||
|
scenario.scenario_name, scenario.mode, scenario.runtime.max_samples)
|
||||||
|
|
||||||
|
# Build models once; reuse llm in both MetricPipeline and advisor.
|
||||||
|
llm, embeddings = build_models(scenario.judge_model, scenario.embedding_model, settings)
|
||||||
|
|
||||||
adapter = build_adapter(scenario)
|
adapter = build_adapter(scenario)
|
||||||
pipeline = build_metric_pipeline(scenario, settings)
|
pipeline = build_metric_pipeline(scenario, settings, llm=llm, embeddings=embeddings)
|
||||||
evaluator = Evaluator(scenario=scenario, metric_pipeline=pipeline, app_adapter=adapter)
|
evaluator = Evaluator(scenario=scenario, metric_pipeline=pipeline, app_adapter=adapter)
|
||||||
result = evaluator.evaluate()
|
result = evaluator.evaluate()
|
||||||
write_run_artifacts(result)
|
write_run_artifacts(result)
|
||||||
|
logger.info("[runner] artifacts written for run_id=%s", result.run_id)
|
||||||
|
|
||||||
|
# Optimization advisor — runs only if scenario.optimization_advisor is True.
|
||||||
|
run_advisor(result, scenario, llm)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -18,20 +18,64 @@ from ragas.metrics.collections import (
|
|||||||
AnswerRelevancy,
|
AnswerRelevancy,
|
||||||
ContextPrecision,
|
ContextPrecision,
|
||||||
ContextRecall,
|
ContextRecall,
|
||||||
|
FactualCorrectness,
|
||||||
Faithfulness,
|
Faithfulness,
|
||||||
|
NoiseSensitivity,
|
||||||
|
SemanticSimilarity,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .pipeline import MetricPipeline
|
from .pipeline import MetricPipeline
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_openai_client_kwargs(
|
||||||
|
judge_model: str,
|
||||||
|
settings: EvaluationSettings,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Return AsyncOpenAI kwargs, preferring a matching LLM Profile over .env settings.
|
||||||
|
|
||||||
|
Lookup order:
|
||||||
|
1. LLM Profile whose model name equals judge_model (exact match)
|
||||||
|
2. Fall back to EvaluationSettings (.env)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Lazy import to avoid circular dependency (webapp -> rag_eval is one-way).
|
||||||
|
from webapp.services.profile_manager import profile_manager
|
||||||
|
profiles = profile_manager.list_all()
|
||||||
|
for profile in profiles:
|
||||||
|
if profile.model == judge_model:
|
||||||
|
kwargs: dict[str, Any] = {
|
||||||
|
"api_key": profile.api_key or "sk-placeholder",
|
||||||
|
"timeout": float(profile.timeout_seconds or 30),
|
||||||
|
}
|
||||||
|
if profile.base_url and profile.base_url.strip():
|
||||||
|
kwargs["base_url"] = profile.base_url.strip()
|
||||||
|
return kwargs
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
# If profile lookup fails for any reason, fall through to .env settings.
|
||||||
|
pass
|
||||||
|
|
||||||
|
return settings.openai_client_kwargs
|
||||||
|
|
||||||
|
|
||||||
def build_models(
|
def build_models(
|
||||||
judge_model: str,
|
judge_model: str,
|
||||||
embedding_model: str,
|
embedding_model: str,
|
||||||
settings: EvaluationSettings,
|
settings: EvaluationSettings,
|
||||||
) -> tuple[Any, Any]:
|
) -> tuple[Any, Any]:
|
||||||
"""Create the LLM and embedding clients required by the selected RAGAS metrics."""
|
"""Create the LLM and embedding clients required by the selected RAGAS metrics.
|
||||||
client = AsyncOpenAI(**settings.openai_client_kwargs)
|
|
||||||
llm = llm_factory(judge_model, client=client)
|
Dynamically resolves connection settings from the stored LLM Profiles first
|
||||||
|
(matched by model name), falling back to .env settings when no profile matches.
|
||||||
|
"""
|
||||||
|
client_kwargs = _resolve_openai_client_kwargs(judge_model, settings)
|
||||||
|
client = AsyncOpenAI(**client_kwargs)
|
||||||
|
# RAGAS structured-output judge calls can be truncated by the upstream default
|
||||||
|
# 1024 completion budget, especially for faithfulness and GPT-5 family models.
|
||||||
|
llm = llm_factory(
|
||||||
|
judge_model,
|
||||||
|
client=client,
|
||||||
|
max_tokens=max(1, int(settings.ragas_llm_max_tokens)),
|
||||||
|
)
|
||||||
embeddings = embedding_factory(provider="openai", model=embedding_model, client=client)
|
embeddings = embedding_factory(provider="openai", model=embedding_model, client=client)
|
||||||
return llm, embeddings
|
return llm, embeddings
|
||||||
|
|
||||||
@@ -39,19 +83,34 @@ def build_models(
|
|||||||
def build_metric_pipeline(
|
def build_metric_pipeline(
|
||||||
scenario: Scenario,
|
scenario: Scenario,
|
||||||
settings: EvaluationSettings,
|
settings: EvaluationSettings,
|
||||||
|
llm: Any | None = None,
|
||||||
|
embeddings: Any | None = None,
|
||||||
) -> MetricPipeline:
|
) -> MetricPipeline:
|
||||||
"""Build a metric pipeline containing only the metrics requested by the scenario."""
|
"""Build a metric pipeline containing only the metrics requested by the scenario.
|
||||||
llm, embeddings = build_models(
|
|
||||||
scenario.judge_model,
|
If llm and embeddings are provided (pre-built by the caller), they are reused.
|
||||||
scenario.embedding_model,
|
Otherwise, new instances are created from scenario + settings.
|
||||||
settings,
|
"""
|
||||||
)
|
if llm is None or embeddings is None:
|
||||||
|
llm, embeddings = build_models(
|
||||||
|
scenario.judge_model,
|
||||||
|
scenario.embedding_model,
|
||||||
|
settings,
|
||||||
|
)
|
||||||
|
|
||||||
# Build the full registry once, then slice it by configured metric names.
|
# Build the full registry once, then slice it by configured metric names.
|
||||||
registry: dict[str, Any] = {
|
registry: dict[str, Any] = {
|
||||||
"faithfulness": Faithfulness(llm=llm),
|
"faithfulness": Faithfulness(llm=llm),
|
||||||
"answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
|
"answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
|
||||||
"context_recall": ContextRecall(llm=llm),
|
"context_recall": ContextRecall(llm=llm),
|
||||||
"context_precision": ContextPrecision(llm=llm),
|
"context_precision": ContextPrecision(llm=llm),
|
||||||
|
# Robustness / end-to-end metrics (架构设计 §10.2).
|
||||||
|
# NoiseSensitivity mode='relevant': sensitivity to noise from relevant contexts.
|
||||||
|
"noise_sensitivity": NoiseSensitivity(llm=llm),
|
||||||
|
# FactualCorrectness mode='f1': balances claim precision and recall vs. ground truth.
|
||||||
|
"factual_correctness": FactualCorrectness(llm=llm),
|
||||||
|
# SemanticSimilarity: embedding cosine between answer and ground truth (no LLM call).
|
||||||
|
"semantic_similarity": SemanticSimilarity(embeddings=embeddings),
|
||||||
}
|
}
|
||||||
return MetricPipeline(
|
return MetricPipeline(
|
||||||
metrics={name: registry[name] for name in scenario.metrics},
|
metrics={name: registry[name] for name in scenario.metrics},
|
||||||
|
|||||||
@@ -3,12 +3,16 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import logging
|
||||||
import math
|
import math
|
||||||
|
import time
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from rag_eval.shared.models import MetricScore, NormalizedSample
|
from rag_eval.shared.models import MetricScore, NormalizedSample
|
||||||
|
|
||||||
|
logger = logging.getLogger("rag_eval.metrics.pipeline")
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
@dataclass(slots=True)
|
||||||
class MetricPipeline:
|
class MetricPipeline:
|
||||||
@@ -22,12 +26,43 @@ class MetricPipeline:
|
|||||||
results = {name: math.nan for name in self.metrics}
|
results = {name: math.nan for name in self.metrics}
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
|
|
||||||
|
sid = sample.sample_id[:12]
|
||||||
|
ans_len = len(sample.answer or "")
|
||||||
|
ctx_count = len(sample.contexts or [])
|
||||||
|
logger.debug(
|
||||||
|
"[score] sample=%s ans_len=%d ctx_count=%d question=%r",
|
||||||
|
sid, ans_len, ctx_count,
|
||||||
|
(sample.question or "")[:80],
|
||||||
|
)
|
||||||
|
|
||||||
for name, metric in self.metrics.items():
|
for name, metric in self.metrics.items():
|
||||||
|
t0 = time.monotonic()
|
||||||
try:
|
try:
|
||||||
result = await self._run_metric(name, metric, sample)
|
result = await self._run_metric(name, metric, sample)
|
||||||
results[name] = float(result.value)
|
score_val = float(result.value)
|
||||||
|
results[name] = score_val
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
|
logger.info(
|
||||||
|
"[metric OK ] sample=%-12s %-20s score=%.4f elapsed=%.1fs",
|
||||||
|
sid, name, score_val, elapsed,
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
|
msg = f"timeout after {self.metric_timeout_seconds}s"
|
||||||
|
errors.append(f"{name}: {msg}")
|
||||||
|
logger.warning(
|
||||||
|
"[metric TMO] sample=%-12s %-20s TIMEOUT after %.1fs",
|
||||||
|
sid, name, elapsed,
|
||||||
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
elapsed = time.monotonic() - t0
|
||||||
|
exc_type = type(exc).__name__
|
||||||
errors.append(f"{name}: {exc}")
|
errors.append(f"{name}: {exc}")
|
||||||
|
logger.warning(
|
||||||
|
"[metric ERR] sample=%-12s %-20s %s: %s (elapsed=%.1fs)",
|
||||||
|
sid, name, exc_type, exc, elapsed,
|
||||||
|
)
|
||||||
|
|
||||||
return MetricScore(metrics=results, error=" | ".join(errors))
|
return MetricScore(metrics=results, error=" | ".join(errors))
|
||||||
|
|
||||||
async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
|
async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
|
||||||
@@ -59,6 +94,23 @@ class MetricPipeline:
|
|||||||
reference=sample.ground_truth,
|
reference=sample.ground_truth,
|
||||||
retrieved_contexts=sample.contexts,
|
retrieved_contexts=sample.contexts,
|
||||||
)
|
)
|
||||||
|
elif name == "noise_sensitivity":
|
||||||
|
coroutine = metric.ascore(
|
||||||
|
user_input=sample.question,
|
||||||
|
response=sample.answer,
|
||||||
|
reference=sample.ground_truth,
|
||||||
|
retrieved_contexts=sample.contexts,
|
||||||
|
)
|
||||||
|
elif name == "factual_correctness":
|
||||||
|
coroutine = metric.ascore(
|
||||||
|
response=sample.answer,
|
||||||
|
reference=sample.ground_truth,
|
||||||
|
)
|
||||||
|
elif name == "semantic_similarity":
|
||||||
|
coroutine = metric.ascore(
|
||||||
|
reference=sample.ground_truth,
|
||||||
|
response=sample.answer,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported metric: {name}")
|
raise ValueError(f"Unsupported metric: {name}")
|
||||||
|
|
||||||
@@ -72,11 +124,22 @@ class MetricPipeline:
|
|||||||
max_concurrency: int,
|
max_concurrency: int,
|
||||||
) -> list[MetricScore]:
|
) -> list[MetricScore]:
|
||||||
"""Score all samples while respecting the configured concurrency limit."""
|
"""Score all samples while respecting the configured concurrency limit."""
|
||||||
|
total = len(samples)
|
||||||
|
logger.info("[pipeline] scoring %d samples concurrency=%d timeout=%ss",
|
||||||
|
total, max_concurrency, self.metric_timeout_seconds)
|
||||||
semaphore = asyncio.Semaphore(max(1, max_concurrency))
|
semaphore = asyncio.Semaphore(max(1, max_concurrency))
|
||||||
|
completed = 0
|
||||||
|
|
||||||
async def guarded(sample: NormalizedSample) -> MetricScore:
|
async def guarded(idx: int, sample: NormalizedSample) -> MetricScore:
|
||||||
"""Throttle a single sample-scoring coroutine with the shared semaphore."""
|
"""Throttle a single sample-scoring coroutine with the shared semaphore."""
|
||||||
|
nonlocal completed
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
return await self.score_sample(sample)
|
result = await self.score_sample(sample)
|
||||||
|
completed += 1
|
||||||
|
nan_metrics = [k for k, v in result.metrics.items() if math.isnan(v)]
|
||||||
|
status = f"NaN={nan_metrics}" if nan_metrics else "all OK"
|
||||||
|
logger.info("[pipeline] progress %d/%d sample=%-12s %s",
|
||||||
|
completed, total, sample.sample_id[:12], status)
|
||||||
|
return result
|
||||||
|
|
||||||
return await asyncio.gather(*(guarded(sample) for sample in samples))
|
return await asyncio.gather(*(guarded(i, s) for i, s in enumerate(samples)))
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
"""Supported metric names recognized by scenario validation and pipeline setup."""
|
"""Supported metric names recognized by scenario validation and pipeline setup."""
|
||||||
|
|
||||||
SUPPORTED_METRICS = {
|
SUPPORTED_METRICS = {
|
||||||
|
# Core retrieval / generation metrics (always available).
|
||||||
"faithfulness",
|
"faithfulness",
|
||||||
"answer_relevancy",
|
"answer_relevancy",
|
||||||
"context_recall",
|
"context_recall",
|
||||||
"context_precision",
|
"context_precision",
|
||||||
|
# Robustness and end-to-end metrics (see 架构设计 §10.2).
|
||||||
|
"noise_sensitivity", # 鲁棒性:对检索噪声的敏感度
|
||||||
|
"factual_correctness", # 端到端:回答相对标准答案的事实正确性
|
||||||
|
"semantic_similarity", # 端到端:回答与标准答案的语义相似度(embedding,无 LLM 调用)
|
||||||
}
|
}
|
||||||
|
|||||||
152
rag_eval/metrics/weights.py
Normal file
152
rag_eval/metrics/weights.py
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
"""Utility functions for weighted metric aggregation.
|
||||||
|
|
||||||
|
All functions are pure (no side effects, no I/O) and operate on plain dicts/lists.
|
||||||
|
Weights do not need to be pre-normalised — normalisation is done internally.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_weight(weights: dict[str, float] | None, key: str, default: float = 1.0) -> float:
    """Return the weight configured for *key*, falling back to *default*.

    Args:
        weights: mapping of name -> weight. ``None`` or an empty mapping means
            "no weights configured".
        key: name to look up (a metric name or a doc name in this module).
        default: value returned when no usable weight is configured.

    Returns:
        The weight as a ``float``.

    Raises:
        TypeError, ValueError: when a configured weight is present but cannot
            be converted to ``float`` (e.g. a non-numeric string).
    """
    if not weights:
        return float(default)
    configured = weights.get(key)
    # A key that is present with an explicit null value (e.g. ``metric:`` with
    # no value in YAML) is treated like a missing key instead of crashing in
    # ``float(None)``.
    if configured is None:
        return float(default)
    return float(configured)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_weighted_score(
    scores: dict[str, float | None],
    metric_weights: dict[str, float],
) -> float | None:
    """Compute the weighted mean over the usable metric scores of one sample.

    A score is usable when it is not ``None``, converts to ``float`` and is
    finite; everything else is ignored.

    Args:
        scores: mapping of metric_name -> raw score (may be NaN or None).
        metric_weights: per-metric weights; metrics without an entry weigh 1.0.

    Returns:
        The weighted mean, or ``None`` when the accumulated weight is zero
        (which includes the case of no usable score at all).
    """
    weight_acc = 0.0
    score_acc = 0.0
    for name, raw in scores.items():
        if raw is None:
            continue
        try:
            numeric = float(raw)
        except (TypeError, ValueError):
            # Not a number (e.g. free text) -> not a score.
            continue
        if not math.isfinite(numeric):
            continue
        w = resolve_weight(metric_weights, name, default=1.0)
        weight_acc += w
        score_acc += w * numeric
    return None if weight_acc == 0.0 else score_acc / weight_acc
|
||||||
|
|
||||||
|
|
||||||
|
def weighted_metric_means(
    score_rows: list[dict],
    metrics: list[str],
    doc_weights: dict[str, float],
) -> dict[str, float | None]:
    """Aggregate each metric into a doc-weighted mean over all score rows.

    A row contributes to a metric with the weight configured for the row's
    ``doc_name``. Cells that are ``None``, non-numeric, NaN or infinite are
    left out of that metric's mean only.

    Args:
        score_rows: list of score record dicts (from scores.csv).
        metrics: ordered list of metric names to aggregate.
        doc_weights: mapping doc_name -> weight multiplier; docs without an
            entry weigh 1.0.

    Returns:
        Dict mapping metric_name -> weighted mean, or ``None`` for a metric
        whose accumulated weight is not positive.
    """
    numerators: dict[str, float] = dict.fromkeys(metrics, 0.0)
    denominators: dict[str, float] = dict.fromkeys(metrics, 0.0)

    for record in score_rows:
        doc_key = str(record.get("doc_name", "") or "")
        row_weight = resolve_weight(doc_weights, doc_key, default=1.0)
        for name in metrics:
            cell = record.get(name)
            if cell is None:
                continue
            try:
                number = float(cell)
            except (TypeError, ValueError):
                continue
            if not math.isfinite(number):
                continue
            numerators[name] += row_weight * number
            denominators[name] += row_weight

    means: dict[str, float | None] = {}
    for name in metrics:
        denom = denominators[name]
        means[name] = numerators[name] / denom if denom > 0 else None
    return means
|
||||||
|
|
||||||
|
|
||||||
|
def compute_overall_weighted_score_mean(
    score_rows: list[dict],
    metric_weights: dict[str, float],
    doc_weights: dict[str, float],
) -> float | None:
    """Compute the doc-weighted mean of the per-sample weighted scores.

    For every row, all columns not listed in ``_META_COLUMNS`` are passed to
    ``compute_weighted_score`` as that sample's metric scores. Rows for which
    it returns ``None`` are skipped; the remaining per-sample scores are
    averaged using the weight configured for each row's ``doc_name``.

    Returns:
        The overall weighted mean, or ``None`` when the accumulated doc weight
        is not positive.
    """
    weight_acc = 0.0
    score_acc = 0.0
    for record in score_rows:
        per_metric: dict[str, float | None] = {
            column: cell for column, cell in record.items() if column not in _META_COLUMNS
        }
        sample_score = compute_weighted_score(per_metric, metric_weights)
        if sample_score is None:
            continue
        doc_key = str(record.get("doc_name", "") or "")
        row_weight = resolve_weight(doc_weights, doc_key, default=1.0)
        weight_acc += row_weight
        score_acc += row_weight * sample_score

    if weight_acc > 0:
        return score_acc / weight_acc
    return None
|
||||||
|
|
||||||
|
|
||||||
|
# Columns in scores.csv that are sample metadata, not metric scores.
|
||||||
|
_META_COLUMNS = frozenset(
|
||||||
|
{
|
||||||
|
"sample_id",
|
||||||
|
"question",
|
||||||
|
"contexts",
|
||||||
|
"answer",
|
||||||
|
"ground_truth",
|
||||||
|
"scenario",
|
||||||
|
"language",
|
||||||
|
"retrieval_config",
|
||||||
|
"error",
|
||||||
|
"judge_model",
|
||||||
|
"embedding_model",
|
||||||
|
"run_id",
|
||||||
|
"difficulty",
|
||||||
|
"question_type",
|
||||||
|
"doc_id",
|
||||||
|
"doc_name",
|
||||||
|
"section_path",
|
||||||
|
"page_start",
|
||||||
|
"page_end",
|
||||||
|
"source_chunk_ids",
|
||||||
|
"review_status",
|
||||||
|
"review_notes",
|
||||||
|
"weighted_score",
|
||||||
|
"sample_weight",
|
||||||
|
}
|
||||||
|
)
|
||||||
@@ -17,4 +17,5 @@ def build_artifact_paths(output_dir: Path, run_id: str) -> RunArtifactPaths:
|
|||||||
invalid_csv=run_dir / "invalid.csv",
|
invalid_csv=run_dir / "invalid.csv",
|
||||||
summary_md=run_dir / "summary.md",
|
summary_md=run_dir / "summary.md",
|
||||||
metadata_json=run_dir / "metadata.json",
|
metadata_json=run_dir / "metadata.json",
|
||||||
|
advice_md=run_dir / "optimization_advice.md",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -6,6 +6,10 @@ import math
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import (
|
||||||
|
compute_overall_weighted_score_mean,
|
||||||
|
weighted_metric_means,
|
||||||
|
)
|
||||||
from rag_eval.shared.models import EvaluationResult
|
from rag_eval.shared.models import EvaluationResult
|
||||||
|
|
||||||
|
|
||||||
@@ -55,24 +59,42 @@ def build_summary_markdown(result: EvaluationResult) -> str:
|
|||||||
lines.append("No valid samples were scored.")
|
lines.append("No valid samples were scored.")
|
||||||
return "\n".join(lines) + "\n"
|
return "\n".join(lines) + "\n"
|
||||||
|
|
||||||
for metric in result.scenario.metrics:
|
score_rows_list = scores.to_dict(orient="records")
|
||||||
mean_value = scores[metric].mean(numeric_only=True)
|
w_means = weighted_metric_means(
|
||||||
if isinstance(mean_value, float) and not math.isnan(mean_value):
|
score_rows_list, result.scenario.metrics, result.scenario.doc_weights
|
||||||
lines.append(f"- {metric}: `{mean_value:.4f}`")
|
|
||||||
else:
|
|
||||||
lines.append(f"- {metric}: `n/a`")
|
|
||||||
|
|
||||||
# Keep the summary self-sufficient by including every scored sample and its errors.
|
|
||||||
detail_columns = ["sample_id", *result.scenario.metrics, "error"]
|
|
||||||
detail = scores[detail_columns]
|
|
||||||
lines.extend(
|
|
||||||
[
|
|
||||||
"",
|
|
||||||
"## Per-sample Scores",
|
|
||||||
"",
|
|
||||||
"```text",
|
|
||||||
_table_from_frame(detail),
|
|
||||||
"```",
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights)
|
||||||
|
|
||||||
|
for metric in result.scenario.metrics:
|
||||||
|
mean_value = w_means.get(metric)
|
||||||
|
w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0
|
||||||
|
weight_note = f" (w={w:.2f})" if result.scenario.metric_weights else ""
|
||||||
|
if mean_value is not None and not math.isnan(mean_value):
|
||||||
|
lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}")
|
||||||
|
else:
|
||||||
|
lines.append(f"- {metric}: `n/a`{weight_note}")
|
||||||
|
|
||||||
|
# 综合加权得分(已暂时禁用)
|
||||||
|
# if has_weights:
|
||||||
|
# overall_ws = compute_overall_weighted_score_mean(
|
||||||
|
# score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
|
||||||
|
# )
|
||||||
|
# weight_suffix = " (加权)"
|
||||||
|
# if overall_ws is not None and not math.isnan(overall_ws):
|
||||||
|
# lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
|
||||||
|
# else:
|
||||||
|
# lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
|
||||||
|
|
||||||
|
detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
|
||||||
|
existing_columns = [c for c in detail_columns if c in scores.columns]
|
||||||
|
detail = scores[existing_columns]
|
||||||
|
lines.extend([
|
||||||
|
"",
|
||||||
|
"## Per-sample Scores",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
_table_from_frame(detail),
|
||||||
|
"```",
|
||||||
|
])
|
||||||
return "\n".join(lines) + "\n"
|
return "\n".join(lines) + "\n"
|
||||||
|
|||||||
@@ -21,11 +21,16 @@ class EvaluationSettings(BaseSettings):
|
|||||||
|
|
||||||
openai_api_key: str | None = Field(default=None, alias="OPENAI_API_KEY")
|
openai_api_key: str | None = Field(default=None, alias="OPENAI_API_KEY")
|
||||||
openai_base_url: str = Field(default="http://6.86.80.4:30080/v1", alias="OPENAI_BASE_URL")
|
openai_base_url: str = Field(default="http://6.86.80.4:30080/v1", alias="OPENAI_BASE_URL")
|
||||||
ragas_judge_model: str = Field(default="deepseek-v4-flash", alias="RAGAS_JUDGE_MODEL")
|
ragas_judge_model: str = Field(default="gpt-5", alias="RAGAS_JUDGE_MODEL")
|
||||||
ragas_embedding_model: str = Field(
|
ragas_embedding_model: str = Field(
|
||||||
default="text-embedding-v3",
|
default="text-embedding-3-small",
|
||||||
alias="RAGAS_EMBEDDING_MODEL",
|
alias="RAGAS_EMBEDDING_MODEL",
|
||||||
)
|
)
|
||||||
|
ragas_llm_max_tokens: int = Field(
|
||||||
|
default=4096,
|
||||||
|
alias="RAGAS_LLM_MAX_TOKENS",
|
||||||
|
gt=0,
|
||||||
|
)
|
||||||
openai_timeout_seconds: float = Field(default=30.0, alias="OPENAI_TIMEOUT_SECONDS")
|
openai_timeout_seconds: float = Field(default=30.0, alias="OPENAI_TIMEOUT_SECONDS")
|
||||||
ragas_metric_timeout_seconds: float = Field(default=45.0, alias="RAGAS_METRIC_TIMEOUT_SECONDS")
|
ragas_metric_timeout_seconds: float = Field(default=45.0, alias="RAGAS_METRIC_TIMEOUT_SECONDS")
|
||||||
batch_size: int = Field(default=8, alias="BATCH_SIZE")
|
batch_size: int = Field(default=8, alias="BATCH_SIZE")
|
||||||
@@ -52,6 +57,11 @@ class EvaluationSettings(BaseSettings):
|
|||||||
)
|
)
|
||||||
parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
|
parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
|
||||||
dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL")
|
dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL")
|
||||||
|
score_api_token: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
alias="SCORE_API_TOKEN",
|
||||||
|
description="Bearer token for /api/score endpoint. Empty = no auth.",
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def openai_client_kwargs(self) -> dict[str, str | float]:
|
def openai_client_kwargs(self) -> dict[str, str | float]:
|
||||||
|
|||||||
@@ -76,6 +76,9 @@ class Scenario:
|
|||||||
runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
|
runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
|
||||||
app_adapter: AppAdapterConfig | None = None
|
app_adapter: AppAdapterConfig | None = None
|
||||||
source_path: Path | None = None
|
source_path: Path | None = None
|
||||||
|
optimization_advisor: bool = False
|
||||||
|
metric_weights: dict[str, float] = field(default_factory=dict)
|
||||||
|
doc_weights: dict[str, float] = field(default_factory=dict)
|
||||||
|
|
||||||
def snapshot(self) -> dict[str, Any]:
|
def snapshot(self) -> dict[str, Any]:
|
||||||
"""Serialize the scenario into a reporting-friendly dictionary snapshot."""
|
"""Serialize the scenario into a reporting-friendly dictionary snapshot."""
|
||||||
@@ -159,3 +162,4 @@ class RunArtifactPaths:
|
|||||||
invalid_csv: Path
|
invalid_csv: Path
|
||||||
summary_md: Path
|
summary_md: Path
|
||||||
metadata_json: Path
|
metadata_json: Path
|
||||||
|
advice_md: Path | None = None
|
||||||
|
|||||||
53
rag_eval/shared/profile_store.py
Normal file
53
rag_eval/shared/profile_store.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
"""Lightweight read-only accessor for configs/llm_profiles.json.
|
||||||
|
|
||||||
|
Kept in ``rag_eval`` (not ``webapp``) so the runner can look up per-model
|
||||||
|
credentials without depending on the webapp layer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_PROFILES_PATH = Path(__file__).resolve().parents[2] / "configs" / "llm_profiles.json"
|
||||||
|
|
||||||
|
|
||||||
|
def find_by_model(model_name: str) -> dict[str, Any] | None:
    """Look up the first stored profile whose ``model`` equals *model_name*.

    The profiles file is read on every call. A missing file yields ``None``
    silently; any error while reading or scanning it is logged as a warning
    and also yields ``None``, so callers can fall back to their
    environment-variable defaults.
    """
    if _PROFILES_PATH.exists():
        try:
            payload = json.loads(_PROFILES_PATH.read_text(encoding="utf-8"))
            for entry in payload.get("profiles", []):
                if entry.get("model") != model_name:
                    continue
                return entry
        except Exception as exc:  # noqa: BLE001
            logger.warning("[profile_store] failed to read %s: %s", _PROFILES_PATH, exc)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def profile_to_client_kwargs(
    profile: dict[str, Any],
    fallback_api_key: str | None,
    fallback_timeout: float,
) -> dict[str, Any]:
    """Build ``openai.AsyncOpenAI`` keyword arguments from a profile dict.

    Truthy ``api_key`` / ``timeout_seconds`` values in the profile win over the
    fallbacks; the API key ends up as ``""`` when neither source provides one.
    ``base_url`` is included only when the profile holds a non-blank value,
    and it is stripped of surrounding whitespace.
    """
    api_key = profile.get("api_key") or fallback_api_key or ""
    timeout = float(profile.get("timeout_seconds") or fallback_timeout)
    client_kwargs: dict[str, Any] = {"api_key": api_key, "timeout": timeout}

    endpoint = (profile.get("base_url") or "").strip()
    if endpoint:
        client_kwargs["base_url"] = endpoint
    return client_kwargs
|
||||||
107
run_eval.bat
Normal file
107
run_eval.bat
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
@echo off
|
||||||
|
setlocal enabledelayedexpansion
|
||||||
|
|
||||||
|
:: ============================================================
|
||||||
|
:: run_eval.bat - Run a RAGAS evaluation scenario with logs
|
||||||
|
::
|
||||||
|
:: Usage:
|
||||||
|
:: run_eval.bat (uses default online scenario)
|
||||||
|
:: run_eval.bat offline (runs offline smoke scenario)
|
||||||
|
:: run_eval.bat path\to\scenario.yaml (any custom scenario)
|
||||||
|
:: run_eval.bat offline DEBUG (second arg = log level)
|
||||||
|
:: ============================================================
|
||||||
|
|
||||||
|
cd /d "%~dp0"
|
||||||
|
|
||||||
|
echo.
|
||||||
|
echo ============================================================
|
||||||
|
echo Siemens RAGAS - Evaluation Runner
|
||||||
|
echo ============================================================
|
||||||
|
echo.
|
||||||
|
|
||||||
|
:: ----------------------------------------------------------------
|
||||||
|
:: 1. Resolve scenario path (arg1)
|
||||||
|
:: ----------------------------------------------------------------
|
||||||
|
set "SCENARIO=%~1"
|
||||||
|
if "%SCENARIO%"=="" set "SCENARIO=online"
|
||||||
|
|
||||||
|
if /i "%SCENARIO%"=="online" (
|
||||||
|
set "SCENARIO=scenarios\online\siemens-pdf-question-bank-online.yaml"
|
||||||
|
)
|
||||||
|
if /i "%SCENARIO%"=="offline" (
|
||||||
|
set "SCENARIO=scenarios\offline\siemens-pdf-offline-smoke.yaml"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not exist "%SCENARIO%" (
|
||||||
|
echo [ERROR] Scenario file not found: %SCENARIO%
|
||||||
|
echo.
|
||||||
|
echo Usage examples:
|
||||||
|
echo run_eval.bat - online eval (default)
|
||||||
|
echo run_eval.bat offline - offline smoke
|
||||||
|
echo run_eval.bat path\to\file.yaml - custom scenario
|
||||||
|
goto :error
|
||||||
|
)
|
||||||
|
echo [OK] Scenario : %SCENARIO%
|
||||||
|
|
||||||
|
:: ----------------------------------------------------------------
:: 2. Resolve log level (arg2, default INFO)
::    Validated against DEBUG/INFO/WARNING/ERROR and normalised to
::    upper case; anything else falls back to INFO with a warning.
:: ----------------------------------------------------------------
set "LOG_LEVEL=%~2"
if "%LOG_LEVEL%"=="" set "LOG_LEVEL=INFO"
set "LOG_LEVEL_OK="
:: %LOG_LEVEL% below is expanded once, before the loop runs, so every
:: iteration compares against the value the user supplied.
for %%L in (DEBUG INFO WARNING ERROR) do (
    if /i "%LOG_LEVEL%"=="%%L" (
        set "LOG_LEVEL=%%L"
        set "LOG_LEVEL_OK=1"
    )
)
if not defined LOG_LEVEL_OK (
    echo [WARN] Unknown log level '%LOG_LEVEL%', defaulting to INFO
    set "LOG_LEVEL=INFO"
)
echo [OK] Log level: %LOG_LEVEL%
|
||||||
|
|
||||||
|
:: ----------------------------------------------------------------
:: 3. Create logs dir and build timestamped log filename
:: ----------------------------------------------------------------
if not exist "logs" mkdir logs
:: %DATE% and %TIME% are rendered according to the user's regional
:: settings (field order, separators, optional weekday prefix), so slicing
:: them yields malformed or space-containing file names on many locales.
:: Ask PowerShell for a fixed-format timestamp instead.
set "STAMP="
for /f "delims=" %%t in ('powershell -NoProfile -Command "Get-Date -Format yyyy-MM-dd_HHmmss"') do set "STAMP=%%t"
:: Fallback so a log file name can still be formed if PowerShell is unavailable.
if not defined STAMP set "STAMP=%RANDOM%"
set "LOG_FILE=logs\eval_%STAMP%.log"
echo [OK] Log file : %LOG_FILE%
|
||||||
|
|
||||||
|
echo.
|
||||||
|
echo ============================================================
|
||||||
|
echo Starting evaluation...
|
||||||
|
echo (Logs also written to %LOG_FILE%)
|
||||||
|
echo Press Ctrl+C to abort
|
||||||
|
echo ============================================================
|
||||||
|
echo.
|
||||||
|
|
||||||
|
:: ----------------------------------------------------------------
|
||||||
|
:: 4. Run evaluation with UTF-8 and logging
|
||||||
|
:: ----------------------------------------------------------------
|
||||||
|
set PYTHONIOENCODING=utf-8
|
||||||
|
set PYTHONPATH=.
|
||||||
|
|
||||||
|
python main.py ^
|
||||||
|
--scenario "%SCENARIO%" ^
|
||||||
|
--log-file "%LOG_FILE%" ^
|
||||||
|
--log-level %LOG_LEVEL%
|
||||||
|
|
||||||
|
if errorlevel 1 (
|
||||||
|
echo.
|
||||||
|
echo [ERROR] Evaluation failed. Check log: %LOG_FILE%
|
||||||
|
goto :error
|
||||||
|
)
|
||||||
|
|
||||||
|
echo.
|
||||||
|
echo ============================================================
|
||||||
|
echo Evaluation complete!
|
||||||
|
echo Log saved to: %LOG_FILE%
|
||||||
|
echo Open the web console to view results: start.bat
|
||||||
|
echo ============================================================
|
||||||
|
echo.
|
||||||
|
pause
|
||||||
|
exit /b 0
|
||||||
|
|
||||||
|
:error
|
||||||
|
echo.
|
||||||
|
echo ============================================================
|
||||||
|
echo Evaluation failed. See error above or check log file.
|
||||||
|
echo ============================================================
|
||||||
|
pause
|
||||||
|
exit /b 1
|
||||||
96
run_eval.ps1
Normal file
96
run_eval.ps1
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
# run_eval.ps1 - Siemens RAGAS Evaluation Runner
|
||||||
|
# Usage:
|
||||||
|
# .\run_eval.ps1 # online eval (default)
|
||||||
|
# .\run_eval.ps1 offline # offline smoke
|
||||||
|
# .\run_eval.ps1 path\to\scenario.yaml # custom scenario
|
||||||
|
# .\run_eval.ps1 online DEBUG # second arg = log level (DEBUG/INFO/WARNING)
|
||||||
|
# Or: powershell -ExecutionPolicy Bypass -File run_eval.ps1 [scenario] [log-level]
|
||||||
|
|
||||||
|
param(
|
||||||
|
[string]$Scenario = "online",
|
||||||
|
[string]$LogLevel = "INFO"
|
||||||
|
)
|
||||||
|
|
||||||
|
$ErrorActionPreference = "Stop"
|
||||||
|
Set-Location $PSScriptRoot
|
||||||
|
|
||||||
|
Write-Host ""
|
||||||
|
Write-Host "============================================================" -ForegroundColor Cyan
|
||||||
|
Write-Host " Siemens RAGAS - Evaluation Runner" -ForegroundColor Cyan
|
||||||
|
Write-Host "============================================================" -ForegroundColor Cyan
|
||||||
|
Write-Host ""
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# 1. Resolve scenario path
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
$scenarioMap = @{
|
||||||
|
"online" = "scenarios\online\siemens-pdf-question-bank-online.yaml"
|
||||||
|
"offline" = "scenarios\offline\siemens-pdf-offline-smoke.yaml"
|
||||||
|
}
|
||||||
|
if ($scenarioMap.ContainsKey($Scenario.ToLower())) {
|
||||||
|
$Scenario = $scenarioMap[$Scenario.ToLower()]
|
||||||
|
}
|
||||||
|
if (-not (Test-Path $Scenario)) {
|
||||||
|
Write-Host "[ERROR] Scenario file not found: $Scenario" -ForegroundColor Red
|
||||||
|
Write-Host ""
|
||||||
|
Write-Host "Usage examples:"
|
||||||
|
Write-Host " .\run_eval.ps1 - online eval (default)"
|
||||||
|
Write-Host " .\run_eval.ps1 offline - offline smoke"
|
||||||
|
Write-Host " .\run_eval.ps1 path\to\file.yaml - custom scenario"
|
||||||
|
Read-Host "Press Enter to exit"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
Write-Host "[OK] Scenario : $Scenario" -ForegroundColor Green
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# 2. Validate log level
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
$validLevels = @("DEBUG", "INFO", "WARNING", "ERROR")
|
||||||
|
if ($validLevels -notcontains $LogLevel.ToUpper()) {
|
||||||
|
Write-Host "[WARN] Unknown log level '$LogLevel', defaulting to INFO" -ForegroundColor Yellow
|
||||||
|
$LogLevel = "INFO"
|
||||||
|
}
|
||||||
|
Write-Host "[OK] Log level: $LogLevel" -ForegroundColor Green
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# 3. Create logs dir with timestamped filename
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
if (-not (Test-Path "logs")) { New-Item -ItemType Directory "logs" | Out-Null }
|
||||||
|
$timestamp = Get-Date -Format "yyyy-MM-dd_HHmmss"
|
||||||
|
$logFile = "logs\eval_$timestamp.log"
|
||||||
|
Write-Host "[OK] Log file : $logFile" -ForegroundColor Green
|
||||||
|
|
||||||
|
Write-Host ""
|
||||||
|
Write-Host "============================================================" -ForegroundColor Cyan
|
||||||
|
Write-Host " Starting evaluation..." -ForegroundColor Cyan
|
||||||
|
Write-Host " Logs also written to: $logFile" -ForegroundColor Cyan
|
||||||
|
Write-Host " Press Ctrl+C to abort" -ForegroundColor Yellow
|
||||||
|
Write-Host "============================================================" -ForegroundColor Cyan
|
||||||
|
Write-Host ""
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# 4. Run evaluation
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
$env:PYTHONIOENCODING = "utf-8"
|
||||||
|
$env:PYTHONPATH = "."
|
||||||
|
|
||||||
|
& python main.py `
|
||||||
|
--scenario $Scenario `
|
||||||
|
--log-file $logFile `
|
||||||
|
--log-level $LogLevel.ToUpper()
|
||||||
|
|
||||||
|
if ($LASTEXITCODE -ne 0) {
|
||||||
|
Write-Host ""
|
||||||
|
Write-Host "[ERROR] Evaluation failed. Check log: $logFile" -ForegroundColor Red
|
||||||
|
Read-Host "Press Enter to exit"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
Write-Host ""
|
||||||
|
Write-Host "============================================================" -ForegroundColor Green
|
||||||
|
Write-Host " Evaluation complete!" -ForegroundColor Green
|
||||||
|
Write-Host " Log saved to: $logFile" -ForegroundColor Green
|
||||||
|
Write-Host " Open the web console to view results: start.bat" -ForegroundColor Cyan
|
||||||
|
Write-Host "============================================================" -ForegroundColor Green
|
||||||
|
Write-Host ""
|
||||||
|
Read-Host "Press Enter to exit"
|
||||||
147
run_eval.sh
Normal file
147
run_eval.sh
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# run_eval.sh — Siemens RAGAS 评估运行脚本(Linux)
|
||||||
|
# 对应 Windows 的 run_eval.ps1
|
||||||
|
#
|
||||||
|
# 用法:
|
||||||
|
# bash run_eval.sh # online 评估(默认)
|
||||||
|
# bash run_eval.sh offline # offline 冒烟测试
|
||||||
|
# bash run_eval.sh scenarios/xxx.yaml # 自定义场景
|
||||||
|
# bash run_eval.sh online DEBUG # 指定日志级别
|
||||||
|
# bash run_eval.sh build scenarios/siemens_build/siemens-pdf-build.yaml
|
||||||
|
# # 题库生成
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
cd "$SCRIPT_DIR"
|
||||||
|
|
||||||
|
# ── 颜色输出 ──────────────────────────────────────────────────────
|
||||||
|
if [ -t 1 ]; then
|
||||||
|
GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
|
||||||
|
else
|
||||||
|
GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
|
||||||
|
fi
|
||||||
|
|
||||||
|
ok() { echo -e "${GREEN}[OK]${NC} $*"; }
|
||||||
|
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||||
|
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }
|
||||||
|
info() { echo -e "${CYAN}[INFO]${NC} $*"; }
|
||||||
|
|
||||||
|
# ── 参数解析 ──────────────────────────────────────────────────────
|
||||||
|
SCENARIO="${1:-online}"
|
||||||
|
LOG_LEVEL="${2:-INFO}"
|
||||||
|
|
||||||
|
# 场景别名映射
|
||||||
|
declare -A SCENARIO_MAP=(
|
||||||
|
["online"]="scenarios/online/siemens-pdf-question-bank-online.yaml"
|
||||||
|
["offline"]="scenarios/offline/siemens-pdf-offline-smoke.yaml"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 检测是否是 dataset build 模式
|
||||||
|
BUILD_MODE=false
|
||||||
|
BUILD_CONFIG=""
|
||||||
|
if [ "$SCENARIO" = "build" ]; then
|
||||||
|
BUILD_MODE=true
|
||||||
|
BUILD_CONFIG="${2:-scenarios/siemens_build/siemens-pdf-build.yaml}"
|
||||||
|
LOG_LEVEL="${3:-INFO}"
|
||||||
|
elif [ -v "SCENARIO_MAP[$SCENARIO]" ]; then
|
||||||
|
SCENARIO="${SCENARIO_MAP[$SCENARIO]}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ── 验证 ──────────────────────────────────────────────────────────
|
||||||
|
echo ""
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo -e "${CYAN} Siemens RAGAS — 评估运行${NC}"
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# 检查虚拟环境
|
||||||
|
if [ ! -f ".venv/bin/python" ]; then
|
||||||
|
err "未找到 .venv,请先执行部署:bash deploy.sh"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
PYTHON=".venv/bin/python"
|
||||||
|
|
||||||
|
# Build 模式校验
|
||||||
|
if [ "$BUILD_MODE" = true ]; then
|
||||||
|
if [ ! -f "$BUILD_CONFIG" ]; then
|
||||||
|
err "题库生成配置文件不存在:$BUILD_CONFIG"
|
||||||
|
echo ""
|
||||||
|
echo "可用配置:"
|
||||||
|
find scenarios/ -name "*.yaml" 2>/dev/null | head -20 | sed 's/^/ /'
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
ok "模式 : 题库生成 (dataset build)"
|
||||||
|
ok "配置文件 : $BUILD_CONFIG"
|
||||||
|
else
|
||||||
|
# 场景文件校验
|
||||||
|
if [ ! -f "$SCENARIO" ]; then
|
||||||
|
err "场景文件不存在:$SCENARIO"
|
||||||
|
echo ""
|
||||||
|
echo "用法示例:"
|
||||||
|
echo " bash run_eval.sh # online 评估"
|
||||||
|
echo " bash run_eval.sh offline # offline 冒烟"
|
||||||
|
echo " bash run_eval.sh scenarios/xxx.yaml # 自定义场景"
|
||||||
|
echo " bash run_eval.sh build [config.yaml] # 题库生成"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
ok "场景文件 : $SCENARIO"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 日志级别校验
|
||||||
|
LOG_LEVEL_UPPER="${LOG_LEVEL^^}"
|
||||||
|
case "$LOG_LEVEL_UPPER" in
|
||||||
|
DEBUG|INFO|WARNING|ERROR) ;;
|
||||||
|
*)
|
||||||
|
warn "未知日志级别 '$LOG_LEVEL',使用默认值 INFO"
|
||||||
|
LOG_LEVEL_UPPER="INFO"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
ok "日志级别 : $LOG_LEVEL_UPPER"
|
||||||
|
|
||||||
|
# 创建日志目录
|
||||||
|
mkdir -p logs
|
||||||
|
TIMESTAMP=$(date +%Y-%m-%d_%H%M%S)
|
||||||
|
LOG_FILE="logs/eval_${TIMESTAMP}.log"
|
||||||
|
ok "日志文件 : $LOG_FILE"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo -e "${CYAN} 开始运行,按 Ctrl+C 中止${NC}"
|
||||||
|
echo -e "${CYAN}============================================================${NC}"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# ── Run ───────────────────────────────────────────────────────────
export PYTHONIOENCODING="utf-8"
export PYTHONPATH="."

# `set -e` is in effect, so a bare failing command would terminate the script
# right here and the failure report below would never be printed. Capture the
# exit status through `||`, which is exempt from errexit.
EXIT_CODE=0
if [ "$BUILD_MODE" = true ]; then
    "$PYTHON" main.py \
        --dataset-build-config "$BUILD_CONFIG" || EXIT_CODE=$?
else
    "$PYTHON" main.py \
        --scenario "$SCENARIO" \
        --log-file "$LOG_FILE" \
        --log-level "$LOG_LEVEL_UPPER" || EXIT_CODE=$?
fi

echo ""
if [ "$EXIT_CODE" -eq 0 ]; then
    echo -e "${GREEN}============================================================${NC}"
    echo -e "${GREEN} 运行完成!${NC}"
    # The log file is only passed to main.py in evaluation mode.
    if [ "$BUILD_MODE" = false ]; then
        echo -e "${GREEN} 日志已保存到:$LOG_FILE${NC}"
    fi
    echo -e "${CYAN} 在 Web 控制台查看报告:bash start.sh${NC}"
    echo -e "${GREEN}============================================================${NC}"
else
    err "运行失败(exit code=$EXIT_CODE)"
    if [ "$BUILD_MODE" = false ]; then
        err "查看日志:cat $LOG_FILE"
    fi
    # Propagate the Python process's status to the caller.
    exit "$EXIT_CODE"
fi
echo ""
|
||||||
@@ -9,6 +9,10 @@ metrics:
|
|||||||
- answer_relevancy
|
- answer_relevancy
|
||||||
- context_recall
|
- context_recall
|
||||||
- context_precision
|
- context_precision
|
||||||
|
# 可选:鲁棒性 / 端到端指标(数据集已含 ground_truth,取消注释即可启用)
|
||||||
|
# - noise_sensitivity # 鲁棒性:对检索噪声的敏感度
|
||||||
|
# - factual_correctness # 端到端:事实正确性(相对标准答案)
|
||||||
|
# - semantic_similarity # 端到端:语义相似度(embedding,无 LLM 调用)
|
||||||
output_dir: ../../outputs/siemens-pdf-offline-smoke
|
output_dir: ../../outputs/siemens-pdf-offline-smoke
|
||||||
runtime:
|
runtime:
|
||||||
batch_size: 4
|
batch_size: 4
|
||||||
|
|||||||
@@ -1,13 +1,13 @@
|
|||||||
scenario_name: sample-pdf-question-bank-online
|
scenario_name: sample-pdf-question-bank-online
|
||||||
mode: online
|
mode: online
|
||||||
dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv
|
dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv
|
||||||
judge_model: deepseek-v4-pro
|
judge_model: qwen3.5-flash
|
||||||
embedding_model: text-embedding-v3
|
embedding_model: text-embedding-v3
|
||||||
metrics:
|
metrics:
|
||||||
- faithfulness
|
- faithfulness
|
||||||
- answer_relevancy
|
- answer_relevancy
|
||||||
- context_recall
|
- context_recall
|
||||||
- context_precision
|
- context_precision
|
||||||
output_dir: ../../outputs/online/sample-pdf-question-bank
|
output_dir: ../../outputs/online/sample-pdf-question-bank
|
||||||
runtime:
|
runtime:
|
||||||
batch_size: 2
|
batch_size: 2
|
||||||
@@ -19,4 +19,4 @@ app_adapter:
|
|||||||
callable: apps.pdf_question_bank.adapter:run
|
callable: apps.pdf_question_bank.adapter:run
|
||||||
static_kwargs:
|
static_kwargs:
|
||||||
source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl
|
source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl
|
||||||
model: deepseek-v4-flash
|
model: glm-5
|
||||||
|
|||||||
@@ -3,20 +3,24 @@ mode: online
|
|||||||
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
|
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
|
||||||
judge_model: deepseek-v4-flash
|
judge_model: deepseek-v4-flash
|
||||||
embedding_model: text-embedding-v3
|
embedding_model: text-embedding-v3
|
||||||
|
optimization_advisor: true
|
||||||
metrics:
|
metrics:
|
||||||
- faithfulness
|
- faithfulness
|
||||||
- answer_relevancy
|
- answer_relevancy
|
||||||
- context_recall
|
- context_recall
|
||||||
- context_precision
|
- context_precision
|
||||||
|
- noise_sensitivity
|
||||||
|
- factual_correctness
|
||||||
|
- semantic_similarity
|
||||||
output_dir: ../../outputs/online/siemens-pdf-question-bank
|
output_dir: ../../outputs/online/siemens-pdf-question-bank
|
||||||
runtime:
|
runtime:
|
||||||
batch_size: 4
|
batch_size: 3
|
||||||
app_concurrency: 4
|
app_concurrency: 3
|
||||||
metric_concurrency: 4
|
metric_concurrency: 3
|
||||||
max_samples: 50
|
max_samples: 10
|
||||||
app_adapter:
|
app_adapter:
|
||||||
type: python
|
type: python
|
||||||
callable: apps.siemens_pdf_qa.adapter:run
|
callable: apps.siemens_pdf_qa.adapter:run
|
||||||
static_kwargs:
|
static_kwargs:
|
||||||
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
|
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
|
||||||
model: deepseek-v4-flash
|
model: glm-5
|
||||||
|
|||||||
59
scripts/smoke_advisor.py
Normal file
59
scripts/smoke_advisor.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
"""Offline smoke-check for the advisor module wiring (no network required)."""
|
||||||
|
import math
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from rag_eval.advisor.rules import diagnose
|
||||||
|
from rag_eval.advisor.writer import write_advice, _format_log_summary
|
||||||
|
|
||||||
|
# Build five synthetic score rows. faithfulness and noise_sensitivity vary per
# sample; the original script comment describes them as "low faithfulness and
# high noise_sensitivity". context_recall and semantic_similarity are constant.
rows = []
for i in range(5):
    rows.append(
        {
            "sample_id": f"s{i}",
            "question": f"问题{i}:西门子CT扫描的Flash技术原理是什么?",
            "answer": f"答案{i}:Flash技术采用双源CT扫描",
            "ground_truth": f"标准答案{i}:Flash扫描利用双源CT和大螺距实现超低辐射剂量扫描",
            "faithfulness": 0.3 + i * 0.05,
            "noise_sensitivity": 0.4 + i * 0.02,
            "context_recall": 0.75,
            "semantic_similarity": 0.65,
        }
    )

# Run the rule-based diagnosis over all four metrics present in the rows.
requested_metrics = ["faithfulness", "noise_sensitivity", "context_recall", "semantic_similarity"]
diags = diagnose(rows, metrics=requested_metrics)
print(f"Diagnosed {len(diags)} metric(s):")
for d in diags:
    print(f" {d.metric}: mean={d.mean_score}, severity={d.severity}, low_samples={len(d.low_samples)}")

# The two degraded metrics must both show up in the diagnosis list.
assert len(diags) >= 2, f"Expected at least 2 diagnoses, got {len(diags)}"
metrics_hit = set(d.metric for d in diags)
assert "faithfulness" in metrics_hit, "faithfulness should be triggered"
assert "noise_sensitivity" in metrics_hit, "noise_sensitivity should be triggered"

# Write the report into a throw-away directory and check its contents.
with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "optimization_advice.md"
    write_advice(
        diagnoses=diags,
        llm_markdown="",  # empty markdown: exercise the no-LLM fallback path
        advice_path=path,
        scenario_name="smoke-test-siemens",
        run_id="2026-06-16T00-00-00",
        judge_model="deepseek-v4-flash",
    )
    content = path.read_text(encoding="utf-8")
    for needle, failure in (
        ("smoke-test-siemens", "scenario name missing from report"),
        ("faithfulness", "faithfulness missing from report"),
        ("noise_sensitivity", "noise_sensitivity missing from report"),
    ):
        assert needle in content, failure
    print(f"\nAdvice file ({len(content)} chars) — assertions OK")

# Check the one-shot log summary text produced for the same diagnoses.
summary = _format_log_summary(diags, Path("optimization_advice.md"))
print(f"\nLog summary length: {len(summary)} chars, faithfulness present: {'faithfulness' in summary}")
assert "触发诊断" in summary
assert "faithfulness" in summary

print("\nSmoke check PASSED")
|
||||||
1101
siemens-ragas-project-overview.html
Normal file
1101
siemens-ragas-project-overview.html
Normal file
File diff suppressed because it is too large
Load Diff
14
start.bat
14
start.bat
@@ -56,7 +56,17 @@ if errorlevel 1 (
|
|||||||
)
|
)
|
||||||
|
|
||||||
:: ----------------------------------------------------------------
|
:: ----------------------------------------------------------------
|
||||||
:: 4. Seed demo data if no runs exist yet
|
:: 4. Ensure configs/ directory exists for LLM profile storage
|
||||||
|
:: ----------------------------------------------------------------
|
||||||
|
if not exist "configs" (
|
||||||
|
mkdir configs
|
||||||
|
echo [OK] Created configs/ directory for LLM profile storage.
|
||||||
|
) else (
|
||||||
|
echo [OK] configs/ directory ready.
|
||||||
|
)
|
||||||
|
|
||||||
|
:: ----------------------------------------------------------------
|
||||||
|
:: 5. Seed demo data if no runs exist yet
|
||||||
:: ----------------------------------------------------------------
|
:: ----------------------------------------------------------------
|
||||||
if not exist "outputs\kba-knowledge-base-offline-baseline" (
|
if not exist "outputs\kba-knowledge-base-offline-baseline" (
|
||||||
echo [INFO] No run data found. Generating demo data...
|
echo [INFO] No run data found. Generating demo data...
|
||||||
@@ -71,7 +81,7 @@ if not exist "outputs\kba-knowledge-base-offline-baseline" (
|
|||||||
)
|
)
|
||||||
|
|
||||||
:: ----------------------------------------------------------------
|
:: ----------------------------------------------------------------
|
||||||
:: 5. Pick an available port
|
:: 6. Pick an available port
|
||||||
:: ----------------------------------------------------------------
|
:: ----------------------------------------------------------------
|
||||||
set PORT=8800
|
set PORT=8800
|
||||||
netstat -ano 2>nul | findstr ":8800" | findstr "LISTENING" >nul 2>&1
|
netstat -ano 2>nul | findstr ":8800" | findstr "LISTENING" >nul 2>&1
|
||||||
|
|||||||
14
start.ps1
14
start.ps1
@@ -58,7 +58,17 @@ if ($LASTEXITCODE -ne 0) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
# ----------------------------------------------------------------
|
||||||
# 4. Seed demo data if missing
|
# 4. Ensure configs/ directory exists for LLM profile storage
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
if (-not (Test-Path "configs")) {
|
||||||
|
New-Item -ItemType Directory "configs" | Out-Null
|
||||||
|
Write-Host "[OK] Created configs/ directory for LLM profile storage." -ForegroundColor Green
|
||||||
|
} else {
|
||||||
|
Write-Host "[OK] configs/ directory ready." -ForegroundColor Green
|
||||||
|
}
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# 5. Seed demo data if missing
|
||||||
# ----------------------------------------------------------------
|
# ----------------------------------------------------------------
|
||||||
if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
|
if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
|
||||||
Write-Host "[INFO] No run data found. Generating demo data..." -ForegroundColor Yellow
|
Write-Host "[INFO] No run data found. Generating demo data..." -ForegroundColor Yellow
|
||||||
@@ -73,7 +83,7 @@ if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
# ----------------------------------------------------------------
|
||||||
# 5. Pick an available port
|
# 6. Pick an available port
|
||||||
# ----------------------------------------------------------------
|
# ----------------------------------------------------------------
|
||||||
$PORT = 8800
|
$PORT = 8800
|
||||||
$inUse = netstat -ano 2>$null | Select-String ":$PORT\s" | Select-String "LISTENING"
|
$inUse = netstat -ano 2>$null | Select-String ":$PORT\s" | Select-String "LISTENING"
|
||||||
|
|||||||
94
start.sh
Normal file
94
start.sh
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
#!/usr/bin/env bash
# start.sh — start the Siemens RAGAS web service in the background.
# Prerequisite: deploy.sh has already been run (.venv and dependencies exist).
# Usage: bash start.sh

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# ── Colored output (escape codes only when stdout is a terminal) ──
if [ -t 1 ]; then
  GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
else
  GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
fi

ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }

# Return 0 when a TCP listener is reported on port $1 by `ss` or, failing
# that, by `netstat`.
# The listing is captured first and matched afterwards. With `pipefail`
# enabled, `ss ... | grep -q` can return non-zero even on a match: grep exits
# at the first hit, the producer may be killed by SIGPIPE, and the pipeline
# then reports the producer's failure, which would make a busy port look free.
port_in_use() {
  local port="$1" listing
  listing="$(ss -tlnp 2>/dev/null || true)"
  if grep -q ":${port} " <<<"$listing"; then
    return 0
  fi
  listing="$(netstat -tlnp 2>/dev/null || true)"
  grep -q ":${port} " <<<"$listing"
}

echo ""
echo -e "${CYAN}============================================================${NC}"
echo -e "${CYAN} Siemens RAGAS Console — 启动服务${NC}"
echo -e "${CYAN}============================================================${NC}"
echo ""

# The virtual environment must exist; deploy.sh is expected to create it.
if [ ! -f ".venv/bin/python" ]; then
  err "未找到 .venv,请先执行部署:bash deploy.sh"
  exit 1
fi

PYTHON=".venv/bin/python"

# A missing .env only produces a warning; startup continues.
if [ ! -f ".env" ]; then
  warn ".env 不存在,请先复制并编辑配置:"
  warn " cp .env.example .env && nano .env"
fi

# Warn when the placeholder API key from .env.example is still present.
if grep -q "your-api-key" .env 2>/dev/null; then
  warn ".env 中仍包含默认占位符,部分功能(评估执行)将不可用"
fi

# Refuse to start a second instance while the recorded PID is still alive.
if [ -f ".server.pid" ]; then
  EXISTING_PID=$(cat .server.pid)
  if kill -0 "$EXISTING_PID" 2>/dev/null; then
    warn "服务已在运行 (PID=$EXISTING_PID),无需重复启动"
    warn "如需重启请先执行:bash stop.sh"
    exit 0
  else
    # Stale PID file left behind by a dead process: remove it.
    rm -f .server.pid
  fi
fi

# Directory that receives the server log.
mkdir -p logs

# Port selection: prefer 8800, fall back to 8801, otherwise give up.
PORT=8800
if port_in_use "$PORT"; then
  warn "端口 $PORT 已被占用,尝试 8801..."
  PORT=8801
  if port_in_use "$PORT"; then
    err "端口 8800 和 8801 均被占用,请手动指定端口:"
    err " .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
    exit 1
  fi
fi

# Launch detached from the terminal; stdout and stderr are appended to the log.
nohup "$PYTHON" webmain.py --host 0.0.0.0 --port "$PORT" >> logs/server.log 2>&1 &
SERVER_PID=$!
echo "$SERVER_PID" > .server.pid

# Give the process three seconds, then verify that it is still alive.
sleep 3
if kill -0 "$SERVER_PID" 2>/dev/null; then
  ok "服务已启动 (PID=$SERVER_PID)"
  echo ""
  echo -e "${CYAN} 访问地址: http://$(hostname -I | awk '{print $1}'):${PORT}${NC}"
  echo -e "${CYAN} 本机访问: http://127.0.0.1:${PORT}${NC}"
  echo -e "${CYAN} 查看日志: tail -f logs/server.log${NC}"
  echo -e "${CYAN} 停止服务: bash stop.sh${NC}"
  echo ""
else
  err "服务启动失败,请查看日志:"
  err " tail -20 logs/server.log"
  rm -f .server.pid
  exit 1
fi
|
||||||
68
stop.sh
Normal file
68
stop.sh
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
#!/usr/bin/env bash
# stop.sh — stop the Siemens RAGAS background web service.
# Usage: bash stop.sh

set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# ── Colored output (escape codes only when stdout is a terminal) ──
if [ -t 1 ]; then
  GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
else
  GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
fi

ok() { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "${RED}[ERROR]${NC} $*" >&2; }

echo ""
echo -e "${CYAN} Siemens RAGAS Console — 停止服务${NC}"
echo ""

PID_FILE="$SCRIPT_DIR/.server.pid"

# No PID file: nothing to stop.
if [ ! -f "$PID_FILE" ]; then
  warn "未找到 .server.pid,服务可能未启动或已停止"
  exit 0
fi

PID=$(cat "$PID_FILE")

# Only a positive integer is accepted as a PID. In particular `kill 0` would
# signal every process in this script's own process group, so a corrupted
# PID file must never reach the kill calls below.
if ! [[ "$PID" =~ ^[1-9][0-9]*$ ]]; then
  warn "PID 文件内容无效,清理 PID 文件"
  rm -f "$PID_FILE"
  exit 0
fi

# Recorded process is already gone: just remove the stale PID file.
if ! kill -0 "$PID" 2>/dev/null; then
  warn "进程 $PID 已不存在,清理 PID 文件"
  rm -f "$PID_FILE"
  exit 0
fi

# Graceful stop first (SIGTERM).
echo -e " 正在停止进程 (PID=$PID)..."
kill "$PID" 2>/dev/null || true

# Wait up to five seconds for the process to exit.
for i in 1 2 3 4 5; do
  sleep 1
  if ! kill -0 "$PID" 2>/dev/null; then
    break
  fi
  echo -e " 等待进程退出... ($i/5)"
done

# Still alive after the grace period: force-kill (SIGKILL).
if kill -0 "$PID" 2>/dev/null; then
  warn "进程未响应,强制终止 (SIGKILL)..."
  kill -9 "$PID" 2>/dev/null || true
  sleep 1
fi

# Remove the PID file only once the process is confirmed gone. If it could
# not be stopped, the file is kept so that start.sh still sees the running
# instance and this script can be retried.
if kill -0 "$PID" 2>/dev/null; then
  err "无法停止进程 $PID,请手动执行:kill -9 $PID"
  exit 1
else
  rm -f "$PID_FILE"
  ok "服务已停止"
  echo ""
fi
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
100
tests/test_advisor_rules.py
Normal file
100
tests/test_advisor_rules.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
import math
|
||||||
|
import unittest
|
||||||
|
from rag_eval.advisor.rules import Diagnosis, diagnose, METRIC_RULES
|
||||||
|
|
||||||
|
|
||||||
|
class TestDiagnosis(unittest.TestCase):
    """Tests for ``diagnose``: when a diagnosis is produced, its severity, and its low samples."""

    def _make_rows(self, metric: str, scores: list[float]) -> list[dict]:
        """Return one score row per entry of ``scores``, with ``metric`` set to that score."""
        built = []
        for idx, value in enumerate(scores):
            built.append({
                metric: value,
                "question": f"q{idx}",
                "answer": f"a{idx}",
                "ground_truth": f"gt{idx}",
                "sample_id": f"s{idx}",
            })
        return built

    def test_no_diagnosis_when_all_scores_above_threshold(self):
        # Intended as the boundary case: the scores average to (about) 0.85 and
        # the test expects that a mean at the threshold does not trigger.
        samples = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
        diagnoses = diagnose(samples, metrics=["faithfulness"])
        self.assertEqual(diagnoses, [])

    def test_no_diagnosis_when_mean_above_advisory_threshold(self):
        samples = self._make_rows("answer_relevancy", [0.9, 0.92, 0.88])
        diagnoses = diagnose(samples, metrics=["answer_relevancy"])
        self.assertEqual(diagnoses, [])

    def test_low_severity_when_mean_below_advisory_threshold(self):
        # Mean of 0.80 is expected to be reported as "low" against a 0.85 threshold.
        samples = self._make_rows("faithfulness", [0.78, 0.80, 0.82])
        diagnoses = diagnose(samples, metrics=["faithfulness"])
        self.assertEqual(len(diagnoses), 1)
        first = diagnoses[0]
        self.assertEqual(first.severity, "low")
        self.assertAlmostEqual(first.threshold, 0.85, places=2)

    def test_low_severity_answer_relevancy_at_0_84(self):
        samples = self._make_rows("answer_relevancy", [0.84] * 3)
        diagnoses = diagnose(samples, metrics=["answer_relevancy"])
        self.assertEqual(len(diagnoses), 1)
        self.assertEqual(diagnoses[0].severity, "low")

    def test_low_severity_has_root_causes_and_actions(self):
        samples = self._make_rows("context_precision", [0.75, 0.76, 0.77])
        diagnoses = diagnose(samples, metrics=["context_precision"])
        self.assertEqual(len(diagnoses), 1)
        first = diagnoses[0]
        self.assertEqual(first.severity, "low")
        # Even a "low" diagnosis must carry explanatory content.
        self.assertGreater(len(first.root_causes), 0)
        self.assertGreater(len(first.suggested_actions), 0)

    def test_warning_when_mean_below_warning_threshold(self):
        samples = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
        diagnoses = diagnose(samples, metrics=["faithfulness"])
        self.assertEqual(len(diagnoses), 1)
        first = diagnoses[0]
        self.assertEqual(first.metric, "faithfulness")
        self.assertEqual(first.severity, "warning")
        self.assertAlmostEqual(first.mean_score, 0.65, places=2)

    def test_critical_when_mean_below_critical_threshold(self):
        samples = self._make_rows("faithfulness", [0.3, 0.4, 0.45])
        diagnoses = diagnose(samples, metrics=["faithfulness"])
        self.assertEqual(diagnoses[0].severity, "critical")

    def test_low_samples_selected_are_bottom_three(self):
        samples = self._make_rows("faithfulness", [0.1, 0.2, 0.3, 0.8, 0.9])
        diagnoses = diagnose(samples, metrics=["faithfulness"])
        worst = diagnoses[0].low_samples
        self.assertEqual(len(worst), 3)
        picked_scores = sorted(row["faithfulness"] for row in worst)
        self.assertEqual(picked_scores, [0.1, 0.2, 0.3])

    def test_nan_scores_excluded_from_mean_and_low_samples(self):
        samples = self._make_rows("faithfulness", [0.3, float("nan"), 0.4])
        diagnoses = diagnose(samples, metrics=["faithfulness"])
        self.assertEqual(len(diagnoses), 1)
        # No NaN-scored row may appear among the selected low samples.
        for row in diagnoses[0].low_samples:
            self.assertFalse(math.isnan(row["faithfulness"]))

    def test_noise_sensitivity_direction_inverted(self):
        # For noise_sensitivity a higher value is the bad direction, so these
        # comparatively high scores are expected to produce a diagnosis.
        samples = self._make_rows("noise_sensitivity", [0.4, 0.45, 0.5])
        diagnoses = diagnose(samples, metrics=["noise_sensitivity"])
        self.assertEqual(len(diagnoses), 1)
        self.assertEqual(diagnoses[0].metric, "noise_sensitivity")

    def test_noise_sensitivity_no_diagnosis_when_low(self):
        samples = self._make_rows("noise_sensitivity", [0.1, 0.15, 0.2])
        diagnoses = diagnose(samples, metrics=["noise_sensitivity"])
        self.assertEqual(diagnoses, [])

    def test_skips_metric_not_in_rows(self):
        # context_recall is requested but absent from the row: it must be skipped.
        only_row = {
            "faithfulness": 0.3,
            "question": "q",
            "answer": "a",
            "ground_truth": "gt",
            "sample_id": "s1",
        }
        diagnoses = diagnose([only_row], metrics=["faithfulness", "context_recall"])
        reported = [item.metric for item in diagnoses]
        self.assertIn("faithfulness", reported)
        self.assertNotIn("context_recall", reported)

    def test_all_seven_metrics_have_rules(self):
        expected = {
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
            "noise_sensitivity",
            "factual_correctness",
            "semantic_similarity",
        }
        self.assertEqual(set(METRIC_RULES), expected)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
113
tests/test_advisor_writer.py
Normal file
113
tests/test_advisor_writer.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
import shutil
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from rag_eval.advisor.rules import Diagnosis
|
||||||
|
from rag_eval.advisor.writer import write_advice, _format_log_summary
|
||||||
|
|
||||||
|
|
||||||
|
class TestWriteAdvice(unittest.TestCase):
    """Tests for ``write_advice`` report output and the ``_format_log_summary`` text."""

    def setUp(self):
        # Start every test from an empty scratch directory (path is relative to the CWD).
        scratch = Path("tests/.tmp/test_advisor_writer")
        shutil.rmtree(scratch, ignore_errors=True)
        scratch.mkdir(parents=True, exist_ok=True)
        self.tmp = scratch
        self.advice_path = scratch / "optimization_advice.md"

    def tearDown(self):
        shutil.rmtree(self.tmp, ignore_errors=True)

    def _make_diagnosis(self, metric="faithfulness", severity="warning"):
        """Return a fixed ``Diagnosis`` for ``metric`` with a single low-scoring sample."""
        sample = {
            "sample_id": "s1",
            "question": "问题1",
            "answer": "答案1",
            "ground_truth": "标准1",
            metric: 0.4,
        }
        return Diagnosis(
            metric=metric,
            mean_score=0.55,
            threshold=0.7,
            severity=severity,
            root_causes=["原因1", "原因2"],
            suggested_actions=["建议1", "建议2"],
            low_samples=[sample],
        )

    def _write(self, diagnoses, llm_markdown, scenario_name, run_id, judge_model):
        """Call ``write_advice`` targeting this test's scratch report path."""
        write_advice(
            diagnoses=diagnoses,
            llm_markdown=llm_markdown,
            advice_path=self.advice_path,
            scenario_name=scenario_name,
            run_id=run_id,
            judge_model=judge_model,
        )

    def _report(self):
        """Return the written report as text."""
        return self.advice_path.read_text(encoding="utf-8")

    def test_write_creates_file(self):
        self._write(
            [self._make_diagnosis()],
            "## faithfulness\n\nLLM 建议内容",
            "test-scenario",
            "2026-01-01T00-00-00",
            "deepseek-v4-flash",
        )
        self.assertTrue(self.advice_path.exists())

    def test_write_contains_scenario_name_and_run_id(self):
        self._write(
            [self._make_diagnosis()],
            "## faithfulness\n\nLLM 建议",
            "siemens-test",
            "2026-01-01T00-00-00",
            "deepseek-v4-flash",
        )
        text = self._report()
        self.assertIn("siemens-test", text)
        self.assertIn("2026-01-01T00-00-00", text)

    def test_write_contains_llm_markdown(self):
        self._write(
            [self._make_diagnosis()],
            "## faithfulness\n\n具体建议文本",
            "test",
            "rid",
            "model",
        )
        self.assertIn("具体建议文本", self._report())

    def test_write_fallback_when_no_llm_markdown(self):
        """With empty ``llm_markdown`` the report still lists the metric and its root causes."""
        self._write([self._make_diagnosis()], "", "test", "rid", "model")
        text = self._report()
        self.assertIn("faithfulness", text)
        self.assertIn("原因1", text)

    def test_log_summary_format(self):
        critical = self._make_diagnosis("faithfulness", "critical")
        warning = self._make_diagnosis("context_recall", "warning")
        summary = _format_log_summary([critical, warning], self.advice_path)
        self.assertIn("faithfulness", summary)
        self.assertIn("严重", summary)  # label expected for severity "critical"
        self.assertIn("context_recall", summary)
        self.assertIn("警告", summary)  # label expected for severity "warning"

    def test_write_empty_diagnoses_still_creates_file(self):
        self._write([], "", "test", "rid", "model")
        self.assertTrue(self.advice_path.exists())
        self.assertIn("未发现明显指标异常", self._report())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
68
tests/test_metric_presenter.py
Normal file
68
tests/test_metric_presenter.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||||
|
|
||||||
|
|
||||||
|
def _run_node(script: str) -> str:
    """Run ``script`` with ``node -e`` from the repository root and return stripped stdout.

    ``check=True`` makes a non-zero Node exit raise ``subprocess.CalledProcessError``.
    """
    node_command = ["node", "-e", script]
    run_options = {
        "cwd": REPO_ROOT,
        "capture_output": True,
        "text": True,
        "encoding": "utf-8",
        "check": True,
    }
    finished = subprocess.run(node_command, **run_options)
    return finished.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def test_metric_presenter_applies_thresholds_and_noise_direction() -> None:
    """Load metric_presenter.js in a Node VM sandbox and check its classification output.

    The script calls ``scoreClass``, ``describeMetric`` and ``binColor`` on
    ``window.MetricPresenter`` and prints the results as one JSON object; the
    assertions look for the expected key/value fragments in that text.
    """
    metric_js = (REPO_ROOT / "webapp" / "static" / "js" / "metric_presenter.js").as_posix()
    script = f"""
const fs = require("fs");
const vm = require("vm");
const code = fs.readFileSync("{metric_js}", "utf8");
const sandbox = {{ window: {{}}, console }};
vm.runInNewContext(code, sandbox);
const p = sandbox.window.MetricPresenter;
const result = {{
faith085: p.scoreClass("faithfulness", 0.85),
faith070: p.scoreClass("faithfulness", 0.70),
faith064: p.scoreClass("faithfulness", 0.64),
noise010: p.scoreClass("noise_sensitivity", 0.10),
noise030: p.scoreClass("noise_sensitivity", 0.30),
noise050: p.scoreClass("noise_sensitivity", 0.50),
desc: p.describeMetric("faithfulness"),
noiseDesc: p.describeMetric("noise_sensitivity"),
noiseBin: p.binColor("noise_sensitivity", 0.0),
faithBin: p.binColor("faithfulness", 0.8)
}};
console.log(JSON.stringify(result));
"""
    output = _run_node(script)
    expected_fragments = (
        '"faith085":"good"',
        '"faith070":"warn"',
        '"faith064":"bad"',
        '"noise010":"good"',
        '"noise030":"warn"',
        '"noise050":"bad"',
        '"desc":"',
        '"noiseDesc":"',
        '"noiseBin":"#16a34a"',
        '"faithBin":"#16a34a"',
    )
    for fragment in expected_fragments:
        assert fragment in output
|
||||||
|
|
||||||
|
|
||||||
|
def test_report_and_index_load_metric_presenter_helper() -> None:
    """Check that index.html, report.js and app.js each reference the shared MetricPresenter helper."""
    static_dir = REPO_ROOT / "webapp" / "static"
    js_dir = static_dir / "js"

    index_html = (static_dir / "index.html").read_text(encoding="utf-8")
    report_js = (js_dir / "report.js").read_text(encoding="utf-8")
    app_js = (js_dir / "app.js").read_text(encoding="utf-8")

    # Same three checks, in the same order: (text that must appear, file contents).
    for needle, haystack in (
        ("js/metric_presenter.js", index_html),
        ("MetricPresenter.describeMetric", report_js),
        ("MetricPresenter.scoreClass", app_js),
    ):
        assert needle in haystack
|
||||||
@@ -80,6 +80,64 @@ class ScenarioAndDatasetTests(unittest.TestCase):
|
|||||||
self.assertTrue(scenario.dataset.path.name.endswith(".csv"))
|
self.assertTrue(scenario.dataset.path.name.endswith(".csv"))
|
||||||
self.assertTrue(scenario.output_dir.name == "sample-offline-baseline")
|
self.assertTrue(scenario.output_dir.name == "sample-offline-baseline")
|
||||||
|
|
||||||
|
def test_load_scenario_metric_and_doc_weights(self) -> None:
    """load_scenario passes metric_weights and doc_weights into Scenario.

    A minimal scenario YAML is written to a temporary file, loaded, and the
    two weight mappings are compared with what was written.
    """
    import os
    import tempfile

    import yaml

    from rag_eval.config.loader import load_scenario

    payload = {
        "scenario_name": "w-test",
        "mode": "offline",
        "dataset": "nonexistent.csv",
        "judge_model": "m",
        "embedding_model": "e",
        "metrics": ["faithfulness"],
        "output_dir": "out",
        "metric_weights": {"faithfulness": 0.7},
        "doc_weights": {"doc.pdf": 2.0},
    }
    # delete=False: the file is reopened by path after this handle is closed
    # and is removed explicitly in the finally block below.
    with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", encoding="utf-8", delete=False) as f:
        yaml.dump(payload, f, allow_unicode=True)
        tmp_path = f.name
    try:
        scenario = load_scenario(tmp_path)
        # TestCase assertions instead of bare `assert`: they are not stripped
        # under `python -O` and report both operands on failure.
        self.assertEqual(scenario.metric_weights, {"faithfulness": 0.7})
        self.assertEqual(scenario.doc_weights, {"doc.pdf": 2.0})
    finally:
        os.unlink(tmp_path)
|
||||||
|
|
||||||
|
def test_load_scenario_defaults_to_empty_weights(self) -> None:
    """load_scenario defaults metric_weights and doc_weights to empty dicts.

    The scenario YAML written here omits both weight keys.
    """
    import os
    import tempfile

    import yaml

    from rag_eval.config.loader import load_scenario

    payload = {
        "scenario_name": "no-w",
        "mode": "offline",
        "dataset": "nonexistent.csv",
        "judge_model": "m",
        "embedding_model": "e",
        "metrics": ["faithfulness"],
        "output_dir": "out",
    }
    # delete=False: the file is reopened by path after this handle is closed
    # and is removed explicitly in the finally block below.
    with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", encoding="utf-8", delete=False) as f:
        yaml.dump(payload, f, allow_unicode=True)
        tmp_path = f.name
    try:
        scenario = load_scenario(tmp_path)
        # TestCase assertions instead of bare `assert`: they are not stripped
        # under `python -O` and report both operands on failure.
        self.assertEqual(scenario.metric_weights, {})
        self.assertEqual(scenario.doc_weights, {})
    finally:
        os.unlink(tmp_path)
|
||||||
|
|
||||||
def test_scenario_snapshot_serializes_path_static_kwargs(self) -> None:
|
def test_scenario_snapshot_serializes_path_static_kwargs(self) -> None:
|
||||||
scenario = load_scenario("scenarios/online/sample-pdf-question-bank-online.yaml")
|
scenario = load_scenario("scenarios/online/sample-pdf-question-bank-online.yaml")
|
||||||
snapshot = scenario.snapshot()
|
snapshot = scenario.snapshot()
|
||||||
@@ -125,6 +183,119 @@ class ScenarioAndDatasetTests(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
class EvaluatorAndReportingTests(unittest.TestCase):
|
class EvaluatorAndReportingTests(unittest.TestCase):
|
||||||
|
def test_merge_score_includes_weighted_score_and_sample_weight(self):
    """_merge_score no longer adds weighted_score/sample_weight (feature disabled).

    NOTE: the method name predates the change. Despite "includes" in the
    name, this test asserts that both keys are ABSENT from the merged row
    while the raw per-metric scores are still copied through.
    """
    from unittest.mock import MagicMock
    from rag_eval.execution.evaluator import Evaluator
    from rag_eval.shared.models import (
        MetricScore, NormalizedSample, Scenario, DatasetConfig,
    )

    # Weights are configured on purpose: even with them set, no weighted
    # fields are expected in the output row.
    scenario = Scenario(
        scenario_name="w-test", mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m", embedding_model="e",
        metrics=["faithfulness", "context_recall"],
        output_dir=Path("out"),
        metric_weights={"faithfulness": 3.0, "context_recall": 1.0},
        doc_weights={"doc.pdf": 2.0},
    )
    evaluator = Evaluator(
        scenario=scenario,
        metric_pipeline=MagicMock(),
        app_adapter=None,
    )
    sample = NormalizedSample(
        sample_id="s1", question="q", contexts=["ctx"],
        answer="a", ground_truth="gt",
        metadata={"doc_name": "doc.pdf"},
    )
    score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
    row = evaluator._merge_score(sample, score)
    # The combined weighted score is temporarily disabled, so weighted_score
    # and sample_weight are no longer written to the row.
    self.assertNotIn("weighted_score", row)
    self.assertNotIn("sample_weight", row)
    self.assertEqual(row["faithfulness"], 1.0)
    self.assertEqual(row["context_recall"], 0.0)
|
||||||
|
|
||||||
|
def test_summary_markdown_shows_weighted_score(self):
    """build_summary_markdown includes weighted_score when metric_weights set.

    The result is built by hand with a single score row that already carries
    ``weighted_score`` and ``sample_weight``; the rendered markdown must
    mention the field name and the value formatted as ``0.8000``.
    """
    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import (
        EvaluationResult, NormalizedSample, DatasetConfig, Scenario,
    )
    from pathlib import Path

    scenario = Scenario(
        scenario_name="ws-test", mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m", embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={"faithfulness": 1.0},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1", question="q", contexts=["c"],
        answer="a", ground_truth="gt",
    )
    result = EvaluationResult(
        scenario=scenario, run_id="r1",
        started_at="2026-01-01T00:00:00", finished_at="2026-01-01T00:01:00",
        valid_samples=[sample], invalid_samples=[],
        score_rows=[{
            "sample_id": "s1", "faithfulness": 0.8,
            "weighted_score": 0.8, "sample_weight": 1.0,
            "doc_name": "", "error": "",
        }],
    )
    md = build_summary_markdown(result)
    # TestCase assertions instead of bare `assert`: they are not stripped
    # under `python -O` and show the searched text on failure.
    self.assertIn("weighted_score", md)
    self.assertIn("0.8000", md)
|
||||||
|
|
||||||
|
def test_summary_markdown_hides_weighted_score_without_weights(self):
    """build_summary_markdown preserves unweighted summaries when no weights set."""
    # Imported locally, like the sibling weighted-summary test, so this test
    # does not rely on a module-level import of build_summary_markdown.
    from rag_eval.reporting.summary import build_summary_markdown
    from rag_eval.shared.models import DatasetConfig, EvaluationResult, NormalizedSample, Scenario

    scenario = Scenario(
        scenario_name="plain-test",
        mode="offline",
        dataset=DatasetConfig(path=Path("d.csv")),
        judge_model="m",
        embedding_model="e",
        metrics=["faithfulness"],
        output_dir=Path("out"),
        metric_weights={},
        doc_weights={},
    )
    sample = NormalizedSample(
        sample_id="s1",
        question="q",
        contexts=["c"],
        answer="a",
        ground_truth="gt",
    )
    # The row still contains weighted helper columns; with empty weight maps
    # the summary must not render a weighted_score bullet.
    result = EvaluationResult(
        scenario=scenario,
        run_id="r1",
        started_at="2026-01-01T00:00:00",
        finished_at="2026-01-01T00:01:00",
        valid_samples=[sample],
        invalid_samples=[],
        score_rows=[{
            "sample_id": "s1",
            "faithfulness": 0.8,
            "weighted_score": 0.8,
            "sample_weight": 1.0,
            "doc_name": "",
            "error": "",
        }],
    )

    md = build_summary_markdown(result)

    assert "- **weighted_score" not in md
|
||||||
|
|
||||||
def test_metric_pipeline_scores_sample(self) -> None:
|
def test_metric_pipeline_scores_sample(self) -> None:
|
||||||
pipeline = MetricPipeline(
|
pipeline = MetricPipeline(
|
||||||
metrics={
|
metrics={
|
||||||
|
|||||||
280
tests/test_pipeline.py
Normal file
280
tests/test_pipeline.py
Normal file
@@ -0,0 +1,280 @@
|
|||||||
|
"""Tests for the end-to-end pipeline API and pipeline task manager."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
# ── fixtures ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@pytest.fixture()
def client(tmp_path, monkeypatch):
    """Yield-free fixture: a TestClient wired to a fresh PipelineTaskManager.

    The module-level manager and the pipeline output root are swapped out in
    the service module, and the API module's manager reference is replaced
    with the same fresh instance, before the app is created.
    """
    import webapp.services.pipeline_task_manager as manager_module
    from webapp.services.pipeline_task_manager import PipelineTaskManager

    isolated_manager = PipelineTaskManager(max_workers=2)
    monkeypatch.setattr(manager_module, "pipeline_task_manager", isolated_manager)
    monkeypatch.setattr(manager_module, "_PIPELINE_OUTPUT_ROOT", tmp_path / "pipeline")

    import webapp.api.pipeline as pipeline_api
    monkeypatch.setattr(pipeline_api, "pipeline_task_manager", isolated_manager)

    from webapp.server import create_app
    app = create_app()
    return TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
def _minimal_pdf_dir(tmp_path: Path) -> Path:
    """Create and return an empty ``pdfs`` directory under ``tmp_path``."""
    pdf_dir = tmp_path / "pdfs"
    pdf_dir.mkdir()
    return pdf_dir
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_build_result(tmp_path: Path, job, run_id="r1"):
    """Build a fake DatasetBuildResult backed by small files under ``tmp_path``.

    Writes a one-record ``source_chunks.jsonl`` into both the run directory
    and a ``latest`` directory, plus a one-row ``generated_dataset.csv``, and
    returns a result holding a single draft sample.
    """
    from rag_eval.dataset_builder.models import (
        DatasetBuildArtifactPaths,
        DatasetBuildResult,
        DraftQuestionSample,
    )

    build_root = tmp_path / "build"
    artifact_root = build_root / run_id
    artifact_root.mkdir(parents=True, exist_ok=True)
    latest = build_root / "latest"
    latest.mkdir(parents=True, exist_ok=True)

    # Single source chunk, written as one JSON line.
    chunk_record = {
        "chunk_id": "c1", "doc_id": "d1", "doc_name": "test.pdf",
        "text": "CT scan context.", "page_start": 1, "page_end": 1,
        "section_path": "/", "section_title": "", "source_layout_ids": [],
    }
    chunks_path = artifact_root / "source_chunks.jsonl"
    chunks_path.write_text(json.dumps(chunk_record) + "\n", encoding="utf-8")
    # Mirror the chunk file into the "latest" directory.
    mirrored_chunks = chunks_path.read_text(encoding="utf-8")
    (latest / "source_chunks.jsonl").write_text(mirrored_chunks, encoding="utf-8")

    csv_header = (
        "sample_id,question,ground_truth,scenario,language,doc_id,doc_name,"
        "section_path,page_start,page_end,source_chunk_ids,question_type,difficulty,"
        "review_status,review_notes\n"
    )
    csv_row = (
        's1,"What is CT?","CT is imaging.","test","zh","d1","test.pdf","/",'
        '1,1,"[""c1""]","fact","easy","draft",""\n'
    )
    dataset_csv = tmp_path / "generated_dataset.csv"
    dataset_csv.write_text(csv_header + csv_row, encoding="utf-8")

    draft = DraftQuestionSample(
        sample_id="s1", question="What is CT?", ground_truth="CT is imaging.",
        scenario="test", language="zh", doc_id="d1", doc_name="test.pdf",
        section_path="/", page_start=1, page_end=1, source_chunk_ids=["c1"],
        question_type="fact", difficulty="easy",
    )

    paths = DatasetBuildArtifactPaths(
        root_dir=artifact_root,
        documents_jsonl=artifact_root / "documents.jsonl",
        semantic_blocks_jsonl=artifact_root / "semantic_blocks.jsonl",
        source_chunks_jsonl=chunks_path,
        dataset_draft_csv=artifact_root / "dataset_draft.csv",
        parse_failures_csv=artifact_root / "parse_failures.csv",
        metadata_json=artifact_root / "metadata.json",
    )
    return DatasetBuildResult(
        job=job,
        run_id=run_id,
        artifact_paths=paths,
        documents=[],
        draft_samples=[draft],
        parse_failures=[],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_eval_result(tmp_path: Path, scenario):
    """Return an empty EvaluationResult for ``scenario`` with fixed ids and timestamps.

    ``tmp_path`` is accepted but not used by this helper.
    """
    from rag_eval.shared.models import EvaluationResult

    fields = dict(
        scenario=scenario,
        run_id="eval-r1",
        started_at="2026-01-01T00:00:00",
        finished_at="2026-01-01T00:01:00",
        valid_samples=[],
        invalid_samples=[],
        score_rows=[],
    )
    return EvaluationResult(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
# ── API route tests ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_submit_returns_202_and_job_id(client, tmp_path):
    """POST /api/pipeline/jobs returns 202 with job_id immediately."""
    pdf_dir = _minimal_pdf_dir(tmp_path)
    request_body = {
        "docs_path": str(pdf_dir),
        "job_name": "test-job",
    }

    with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec:
        from webapp.models import PipelineResult
        mock_exec.return_value = PipelineResult(
            build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
            source_chunks_jsonl="/tmp/c.jsonl", total_questions=1,
            parse_failures=0, eval_run_id="r1", eval_output_dir="/tmp/e",
            scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
        )
        response = client.post("/api/pipeline/jobs", json=request_body)

    assert response.status_code == 202
    payload = response.json()
    assert "job_id" in payload
    assert payload["job_name"] == "test-job"
    # status may already be completed by the time the response is read (mock runs instantly)
    assert payload["status"] in ("queued", "completed")
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_nonexistent_job_returns_404(client):
    """GET /api/pipeline/jobs/{id} returns 404 for unknown job."""
    response = client.get("/api/pipeline/jobs/doesnotexist")
    assert response.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_jobs_returns_empty_initially(client):
    """GET /api/pipeline/jobs returns empty list when no jobs submitted."""
    response = client.get("/api/pipeline/jobs")
    assert response.status_code == 200
    listed_jobs = response.json()["jobs"]
    assert listed_jobs == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_job_status_polling(client, tmp_path):
    """Submitted job becomes visible via GET /api/pipeline/jobs/{id}.

    ``PipelineTaskManager._execute`` is replaced by a mock returning a canned
    PipelineResult. Submission and polling both run while the patch is
    active, so a background worker that starts late cannot reach the real
    ``_execute`` after the patch has been undone.
    """
    from webapp.models import PipelineResult

    pdf_dir = _minimal_pdf_dir(tmp_path)
    canned_result = PipelineResult(
        build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
        source_chunks_jsonl="/tmp/c.jsonl", total_questions=3,
        parse_failures=0, eval_run_id="r2", eval_output_dir="/tmp/e",
        scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
    )

    with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec:
        mock_exec.return_value = canned_result
        post_resp = client.post("/api/pipeline/jobs", json={"docs_path": str(pdf_dir)})
        job_id = post_resp.json()["job_id"]

        # Poll until done or timeout (20 polls x 0.25s = max 5s for the mock).
        for _ in range(20):
            status_resp = client.get(f"/api/pipeline/jobs/{job_id}")
            assert status_resp.status_code == 200
            status = status_resp.json()
            if status["status"] in ("completed", "failed"):
                break
            time.sleep(0.25)

    assert status["status"] == "completed"
    assert status["result"]["total_questions"] == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_job_fails_on_invalid_docs_path(client):
    """Job fails quickly if docs_path does not exist."""
    submit_resp = client.post(
        "/api/pipeline/jobs",
        json={"docs_path": "/nonexistent/path/that/does/not/exist"},
    )
    assert submit_resp.status_code == 202
    job_id = submit_resp.json()["job_id"]

    terminal_states = ("completed", "failed")
    # Up to 20 polls, 0.25s apart, waiting for a terminal state.
    for _attempt in range(20):
        status = client.get(f"/api/pipeline/jobs/{job_id}").json()
        if status["status"] in terminal_states:
            break
        time.sleep(0.25)

    assert status["status"] == "failed"
    error_text = status["error"]
    assert "docs_path" in error_text or "not" in error_text.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_jobs_shows_submitted(client, tmp_path):
    """GET /api/pipeline/jobs includes jobs after submission.

    The ``_execute`` patch stays active through the short wait and the list
    request, so a background worker that starts late still hits the mock
    rather than the real pipeline.
    """
    from webapp.models import PipelineResult

    pdf_dir = _minimal_pdf_dir(tmp_path)
    canned_result = PipelineResult(
        build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
        source_chunks_jsonl="/tmp/c.jsonl", total_questions=1,
        parse_failures=0, eval_run_id="r3", eval_output_dir="/tmp/e",
        scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
    )

    with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec:
        mock_exec.return_value = canned_result
        client.post("/api/pipeline/jobs", json={"docs_path": str(pdf_dir), "job_name": "listed-job"})

        # Brief pause before listing, kept from the original test.
        time.sleep(0.5)
        list_resp = client.get("/api/pipeline/jobs")

    assert list_resp.status_code == 200
    jobs = list_resp.json()["jobs"]
    assert len(jobs) >= 1
    names = [j["job_name"] for j in jobs]
    assert "listed-job" in names
|
||||||
|
|
||||||
|
|
||||||
|
# ── execute_dataset_build_job refactor test ────────────────────────────────────
|
||||||
|
|
||||||
|
def test_execute_dataset_build_job_directly(tmp_path):
    """execute_dataset_build_job runs the build without a YAML file.

    A DatasetBuildJob is constructed in code and passed in with a mock parser
    and a mock generator; the test then checks the returned job name and that
    the artifact root directory exists.
    """
    from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
    from rag_eval.dataset_builder.runner import execute_dataset_build_job
    from rag_eval.settings import EvaluationSettings

    pdf_dir = tmp_path / "pdfs"
    pdf_dir.mkdir()
    # Placeholder bytes only; parsing is handled by the mock parser below.
    (pdf_dir / "doc.pdf").write_bytes(b"%PDF-fake")

    job = DatasetBuildJob(
        job_name="direct-test",
        input_path=pdf_dir,
        input_glob="*.pdf",
        parser_provider="aliyun_docmind",
        failure_mode="skip",
        generation_model="test-model",
        output_type="online_question_bank",
        review_mode="draft_with_manual_review",
        max_questions_per_document=5,
        max_source_chunks_per_question=3,
        dataset_path=tmp_path / "out.csv",
        artifact_dir=tmp_path / "artifacts",
        runtime=DatasetBuildRuntime(max_documents=1),
    )

    # Parsed-document stand-in with no chunks, blocks or text.
    mock_doc = MagicMock()
    mock_doc.doc_id = "d1"
    mock_doc.doc_name = "doc.pdf"
    mock_doc.source_chunks = []
    mock_doc.semantic_blocks = []
    mock_doc.raw_text = ""
    mock_doc.structure_nodes = []
    mock_doc.metadata = {}
    mock_doc.to_record.return_value = {
        "doc_id": "d1", "doc_name": "doc.pdf", "raw_text": "",
        "structure_nodes": [], "metadata": {},
        "semantic_block_count": 0, "source_chunk_count": 0,
    }

    mock_parser = MagicMock()
    mock_parser.parse.return_value = mock_doc

    mock_generator = MagicMock()
    mock_generator.generate.return_value = []

    result = execute_dataset_build_job(
        job,
        settings=EvaluationSettings(_env_file=None),
        parser=mock_parser,
        generator=mock_generator,
    )

    assert result.job.job_name == "direct-test"
    assert result.artifact_paths.root_dir.exists()
|
||||||
117
tests/test_webapp_report_builder.py
Normal file
117
tests/test_webapp_report_builder.py
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
"""Regression tests for weighted webapp report aggregation."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from webapp.services.report_builder import build_report
|
||||||
|
from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot
|
||||||
|
|
||||||
|
|
||||||
|
def _write_run_artifacts(run_dir: Path) -> None:
    """Populate ``run_dir`` with scores.csv, summary.md, advice and a weights snapshot."""
    run_dir.mkdir(parents=True, exist_ok=True)

    score_lines = [
        "sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
        "s1,a.pdf,1.0,0.5,0.8333,3.0",
        "s2,b.pdf,0.0,0.5,0.1667,1.0",
    ]
    snapshot_lines = [
        "metrics:",
        "  - faithfulness",
        "  - context_recall",
        "metric_weights:",
        "  faithfulness: 2.0",
        "  context_recall: 1.0",
        "doc_weights:",
        "  a.pdf: 3.0",
        "  b.pdf: 1.0",
    ]

    # Files are written in the same order as before: scores, summary, advice, snapshot.
    (run_dir / "scores.csv").write_text("\n".join(score_lines), encoding="utf-8")
    (run_dir / "summary.md").write_text("summary", encoding="utf-8")
    (run_dir / "optimization_advice.md").write_text("advice", encoding="utf-8")
    (run_dir / "scenario.snapshot.yaml").write_text("\n".join(snapshot_lines), encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
    """Snapshot weight reader returns both weight maps as plain float dicts."""
    run_dir = tmp_path / "run"
    _write_run_artifacts(run_dir)

    weights = _read_weights_from_snapshot(run_dir)
    metric_weights, doc_weights = weights

    expected_metric_weights = {"faithfulness": 2.0, "context_recall": 1.0}
    expected_doc_weights = {"a.pdf": 3.0, "b.pdf": 1.0}
    assert metric_weights == expected_metric_weights
    assert doc_weights == expected_doc_weights
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
    """Report aggregation uses weighted means and surfaces snapshot weights."""
    run_dir = tmp_path / "run"
    _write_run_artifacts(run_dir)
    requested_metrics = ["faithfulness", "context_recall"]

    report = build_report(run_dir, requested_metrics)

    expected_means = {
        "faithfulness": pytest.approx(0.75, rel=1e-4),
        "context_recall": pytest.approx(0.5, rel=1e-4),
    }
    assert report.metric_means == expected_means
    # The composite weighted score is temporarily disabled.
    assert report.weighted_score_mean is None
    assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
    assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
    assert report.summary_markdown == "summary"
    assert report.advice_markdown == "advice"
|
||||||
|
|
||||||
|
|
||||||
|
def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
    """Metric inference excludes weighted helper columns from scores.csv."""
    run_dir = tmp_path / "run"
    run_dir.mkdir(parents=True, exist_ok=True)

    # Only scores.csv is written; no scenario snapshot exists in this run dir.
    csv_lines = [
        "sample_id,doc_name,faithfulness,weighted_score,sample_weight",
        "s1,a.pdf,0.8,0.8,2.0",
    ]
    (run_dir / "scores.csv").write_text("\n".join(csv_lines), encoding="utf-8")

    inferred = _infer_metrics_from_scores(run_dir)
    assert inferred == ["faithfulness"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_report_ranks_noise_sensitivity_with_lower_values_as_better(tmp_path: Path) -> None:
    """Lowest-sample review should treat higher noise sensitivity as worse."""
    run_dir = tmp_path / "run"
    run_dir.mkdir(parents=True, exist_ok=True)

    csv_lines = [
        "sample_id,question,noise_sensitivity",
        "s-good,q1,0.10",
        "s-warn,q2,0.30",
        "s-bad,q3,0.90",
    ]
    (run_dir / "scores.csv").write_text("\n".join(csv_lines), encoding="utf-8")
    (run_dir / "summary.md").write_text("summary", encoding="utf-8")
    (run_dir / "optimization_advice.md").write_text("", encoding="utf-8")

    report = build_report(run_dir, ["noise_sensitivity"])

    # The sample with the highest noise sensitivity must come first.
    worst_first = [sample.sample_id for sample in report.lowest_samples[:3]]
    assert worst_first == [
        "s-bad",
        "s-warn",
        "s-good",
    ]
|
||||||
124
tests/test_weights.py
Normal file
124
tests/test_weights.py
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
"""Unit tests for rag_eval/metrics/weights.py"""
|
||||||
|
import math
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import (
|
||||||
|
compute_overall_weighted_score_mean,
|
||||||
|
compute_weighted_score,
|
||||||
|
resolve_weight,
|
||||||
|
weighted_metric_means,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestResolveWeight:
    """resolve_weight: explicit value when the key exists, otherwise the default."""

    def test_returns_value_when_key_present(self):
        weights = {"faith": 0.5}
        assert resolve_weight(weights, "faith") == 0.5

    def test_returns_default_when_key_missing(self):
        resolved = resolve_weight({}, "faith")
        assert resolved == 1.0

    def test_returns_custom_default_when_key_missing(self):
        resolved = resolve_weight({}, "faith", default=2.0)
        assert resolved == 2.0

    def test_empty_dict_returns_default(self):
        resolved = resolve_weight({}, "anything")
        assert resolved == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
class TestComputeWeightedScore:
    """compute_weighted_score: per-sample weighted mean over the usable metric values."""

    def test_equal_weights_is_simple_mean(self):
        combined = compute_weighted_score({"faithfulness": 0.8, "context_recall": 0.6}, {})
        assert combined == pytest.approx(0.7, rel=1e-4)

    def test_explicit_weights(self):
        combined = compute_weighted_score(
            {"faithfulness": 1.0, "context_recall": 0.0},
            {"faithfulness": 3.0, "context_recall": 1.0},
        )
        assert combined == pytest.approx(0.75, rel=1e-4)

    def test_nan_values_excluded(self):
        combined = compute_weighted_score(
            {"faithfulness": float("nan"), "context_recall": 0.8}, {}
        )
        assert combined == pytest.approx(0.8, rel=1e-4)

    def test_none_values_excluded(self):
        combined = compute_weighted_score({"faithfulness": None, "context_recall": 0.6}, {})
        assert combined == pytest.approx(0.6, rel=1e-4)

    def test_all_nan_returns_none(self):
        nan = float("nan")
        all_nan_scores = {"faithfulness": nan, "context_recall": nan}
        assert compute_weighted_score(all_nan_scores, {}) is None

    def test_empty_scores_returns_none(self):
        assert compute_weighted_score({}, {}) is None

    def test_missing_metric_in_weights_uses_default_1(self):
        # (0.8 * 2.0 + 0.4 * 1.0) / (2.0 + 1.0) == 2.0 / 3
        combined = compute_weighted_score(
            {"faithfulness": 0.8, "context_recall": 0.4},
            {"faithfulness": 2.0},
        )
        assert combined == pytest.approx(2.0 / 3, rel=1e-4)
|
||||||
|
|
||||||
|
|
||||||
|
class TestWeightedMetricMeans:
    """weighted_metric_means: per-metric means across rows, weighted by document."""

    def _rows(self):
        """Two score rows from two different documents."""
        first = {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.5}
        second = {"doc_name": "b.pdf", "faithfulness": 0.6, "context_recall": 0.8}
        return [first, second]

    def test_equal_weights_gives_arithmetic_mean(self):
        means = weighted_metric_means(self._rows(), ["faithfulness", "context_recall"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.65, rel=1e-4)

    def test_doc_weight_amplifies_contribution(self):
        # (1.0 * 3.0 + 0.6 * 1.0) / 4.0 == 0.9
        means = weighted_metric_means(
            self._rows(), ["faithfulness"], {"a.pdf": 3.0, "b.pdf": 1.0}
        )
        assert means["faithfulness"] == pytest.approx(0.9, rel=1e-4)

    def test_nan_rows_skipped_per_metric(self):
        rows_with_nan = [
            {"doc_name": "a.pdf", "faithfulness": float("nan"), "context_recall": 0.5},
            {"doc_name": "b.pdf", "faithfulness": 0.8, "context_recall": 0.9},
        ]
        means = weighted_metric_means(rows_with_nan, ["faithfulness", "context_recall"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.7, rel=1e-4)

    def test_missing_metric_column_returns_none(self):
        single_row = [{"doc_name": "a.pdf", "faithfulness": 0.8}]
        means = weighted_metric_means(single_row, ["faithfulness", "unknown_metric"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["unknown_metric"] is None

    def test_empty_rows_returns_none_for_all(self):
        means = weighted_metric_means([], ["faithfulness"], {})
        assert means["faithfulness"] is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestComputeOverallWeightedScoreMean:
    """compute_overall_weighted_score_mean: run-level mean of per-sample weighted scores."""

    def test_basic_weighted_mean_of_weighted_scores(self):
        sample_rows = [
            {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.0},
            {"doc_name": "b.pdf", "faithfulness": 0.5, "context_recall": 0.5},
        ]
        overall = compute_overall_weighted_score_mean(
            sample_rows, {"faithfulness": 1.0, "context_recall": 1.0}, {}
        )
        assert overall == pytest.approx(0.5, rel=1e-4)

    def test_doc_weight_amplifies_sample(self):
        sample_rows = [
            {"doc_name": "important.pdf", "faithfulness": 1.0},
            {"doc_name": "other.pdf", "faithfulness": 0.0},
        ]
        # (1.0 * 9.0 + 0.0 * 1.0) / 10.0 == 0.9
        overall = compute_overall_weighted_score_mean(
            sample_rows, {}, {"important.pdf": 9.0, "other.pdf": 1.0}
        )
        assert overall == pytest.approx(0.9, rel=1e-4)

    def test_all_nan_returns_none(self):
        nan_only = [{"doc_name": "a.pdf", "faithfulness": float("nan")}]
        assert compute_overall_weighted_score_mean(nan_only, {}, {}) is None
|
||||||
0
tests/webapp/__init__.py
Normal file
0
tests/webapp/__init__.py
Normal file
245
tests/webapp/test_llm_profiles_api.py
Normal file
245
tests/webapp/test_llm_profiles_api.py
Normal file
@@ -0,0 +1,245 @@
|
|||||||
|
"""Integration tests for /api/llm-profiles endpoints."""
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def client(tmp_path, monkeypatch):
    """TestClient whose ProfileManager persists to a file under ``tmp_path``.

    The same fresh manager replaces the ``profile_manager`` attribute of both
    the service module and the API module before the app is created.
    """
    profiles_file = tmp_path / "profiles.json"

    import webapp.services.profile_manager as manager_module
    from webapp.services.profile_manager import ProfileManager
    isolated_manager = ProfileManager(store_path=profiles_file)
    monkeypatch.setattr(manager_module, "profile_manager", isolated_manager)

    import webapp.api.llm_profiles as profiles_api
    monkeypatch.setattr(profiles_api, "profile_manager", isolated_manager)

    from webapp.server import create_app
    app = create_app()
    return TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_empty(client):
    """A fresh store lists no profiles."""
    response = client.get("/api/llm-profiles")
    assert response.status_code == 200
    profiles = response.json()["profiles"]
    assert profiles == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_and_list(client):
    """Creating a profile returns 201 and the profile then appears in the listing."""
    new_profile = {"name": "Test", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}

    create_resp = client.post("/api/llm-profiles", json=new_profile)
    assert create_resp.status_code == 201
    created = create_resp.json()
    assert created["name"] == "Test"
    assert created["profile_id"] != ""

    list_resp = client.get("/api/llm-profiles")
    assert len(list_resp.json()["profiles"]) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_profile(client):
    """PUT updates the stored profile and invalidates the inline scorer cache once."""
    original = {"name": "Old", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
    profile_id = client.post("/api/llm-profiles", json=original).json()["profile_id"]

    changes = {"name": "New", "model": "m2", "base_url": "http://x/v1", "api_key": "k", "timeout_seconds": 60}
    with patch("webapp.services.inline_scorer.inline_scorer.invalidate_cache") as invalidate:
        response = client.put(f"/api/llm-profiles/{profile_id}", json=changes)

    assert response.status_code == 200
    updated = response.json()
    assert updated["name"] == "New"
    assert updated["timeout_seconds"] == 60
    invalidate.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_profile(client):
    """DELETE removes the profile and invalidates the inline scorer cache once."""
    doomed = {"name": "Del", "model": "m", "base_url": "http://x/v1", "api_key": "k"}
    profile_id = client.post("/api/llm-profiles", json=doomed).json()["profile_id"]

    with patch("webapp.services.inline_scorer.inline_scorer.invalidate_cache") as invalidate:
        response = client.delete(f"/api/llm-profiles/{profile_id}")

    assert response.status_code == 200
    assert response.json()["deleted"] is True
    remaining = client.get("/api/llm-profiles").json()["profiles"]
    assert len(remaining) == 0
    invalidate.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_nonexistent(client):
    """PUT on an unknown profile id returns 404."""
    body = {"name": "X", "model": "m", "base_url": "http://x/v1", "api_key": "k"}
    response = client.put("/api/llm-profiles/nope", json=body)
    assert response.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_nonexistent(client):
    """DELETE on an unknown profile id returns 404."""
    response = client.delete("/api/llm-profiles/nope")
    assert response.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# YAML patcher tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
import yaml as yaml_lib
|
||||||
|
from webapp.services.yaml_patcher import apply_profiles_to_scenario
|
||||||
|
from webapp.models import LLMProfile
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_judge_profile(tmp_path):
    """Applying a judge profile patches judge_model in the YAML."""
    scenario_file = tmp_path / "test-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: old-model\nembedding_model: emb\n"
        "dataset: data.csv\nmetrics:\n- faithfulness\noutput_dir: outputs/test\n",
        encoding="utf-8",
    )
    judge_p = LLMProfile(
        profile_id="x", name="J", model="new-model",
        base_url="http://x/v1", api_key="k", created_at="t", updated_at="t",
    )

    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=judge_p,
        answer_profile=None,
        dataset_profile=None,
        _resolve_absolute=True,
    )

    assert "judge_model" in patched
    # Read back with the same explicit encoding the fixture was written with,
    # rather than the platform default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert data["judge_model"] == "new-model"
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_answer_profile(tmp_path):
    """Applying an answer profile patches app_adapter.static_kwargs.model."""
    scenario_file = tmp_path / "online.yaml"
    scenario_file.write_text(
        "scenario_name: online\nmode: online\njudge_model: j\nembedding_model: emb\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n"
        "app_adapter:\n  type: python\n  callable: apps.foo:run\n"
        "  static_kwargs:\n    model: old\n    source_chunks_path: chunks.jsonl\n",
        encoding="utf-8",
    )
    answer_p = LLMProfile(
        profile_id="y", name="A", model="new-answer-model",
        base_url="http://x/v1", api_key="k", created_at="t", updated_at="t",
    )

    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None,
        answer_profile=answer_p,
        dataset_profile=None,
        _resolve_absolute=True,
    )

    assert "app_adapter.static_kwargs.model" in patched
    # Read back with the same explicit encoding the fixture was written with,
    # rather than the platform default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert data["app_adapter"]["static_kwargs"]["model"] == "new-answer-model"
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_no_profiles_returns_empty(tmp_path):
    """With every profile argument set to ``None`` the patcher reports no patched fields."""
    target = tmp_path / "noop.yaml"
    target.write_text("scenario_name: noop\njudge_model: m\n", encoding="utf-8")

    no_profiles = {
        "judge_profile": None,
        "answer_profile": None,
        "dataset_profile": None,
    }
    result = apply_profiles_to_scenario(
        scenario_path=str(target),
        _resolve_absolute=True,
        **no_profiles,
    )

    assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_metric_weights_patches_yaml(tmp_path):
    """Applying ``metric_weights`` writes them into the scenario YAML.

    No LLM profiles are supplied, so ``metric_weights`` is the only override
    passed in. The test checks that it is reported as patched and that the
    ``faithfulness`` weight round-trips through the file.
    """
    import yaml as yaml_lib

    from webapp.services.yaml_patcher import apply_profiles_to_scenario

    scenario_file = tmp_path / "w-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
        encoding="utf-8",
    )
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None, answer_profile=None, dataset_profile=None,
        metric_weights={"faithfulness": 0.7, "context_recall": 0.3},
        _resolve_absolute=True,
    )
    assert "metric_weights" in patched
    # Read back with the encoding the fixture was written in instead of the
    # platform default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    # Float survives a YAML round-trip only approximately; compare with tolerance.
    assert abs(data["metric_weights"]["faithfulness"] - 0.7) < 1e-9
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_doc_weights_patches_yaml(tmp_path):
    """Applying ``doc_weights`` writes them into the scenario YAML.

    No LLM profiles are supplied, so ``doc_weights`` is the only override
    passed in. The test checks that it is reported as patched and that the
    weight for ``doc.pdf`` round-trips through the file.
    """
    import yaml as yaml_lib

    from webapp.services.yaml_patcher import apply_profiles_to_scenario

    scenario_file = tmp_path / "dw-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
        encoding="utf-8",
    )
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None, answer_profile=None, dataset_profile=None,
        doc_weights={"doc.pdf": 2.0},
        _resolve_absolute=True,
    )
    assert "doc_weights" in patched
    # Read back with the encoding the fixture was written in instead of the
    # platform default.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert abs(data["doc_weights"]["doc.pdf"] - 2.0) < 1e-9
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Connectivity test endpoint tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
|
||||||
|
def test_probe_connectivity_success(client):
    """A probe whose mocked completion call succeeds answers 200 with ``ok`` true and a latency."""
    completion = MagicMock()
    completion.choices = [MagicMock()]
    payload = {
        "model": "test-model",
        "base_url": "http://x/v1",
        "api_key": "sk-test",
    }

    # Replace the OpenAI client class used by the endpoint so no network call is made.
    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        openai_cls.return_value.chat.completions.create.return_value = completion
        resp = client.post("/api/llm-profiles/probe", json=payload)

    assert resp.status_code == 200
    body = resp.json()
    assert body["ok"] is True
    assert body["latency_ms"] is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_probe_connectivity_failure(client):
    """When the mocked completion call raises, the probe still answers 200 but with ``ok`` false.

    The exception text is expected to be surfaced in the ``message`` field.
    """
    payload = {
        "model": "test-model",
        "base_url": "http://x/v1",
        "api_key": "sk-test",
    }

    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        create_call = openai_cls.return_value.chat.completions.create
        create_call.side_effect = Exception("connection refused")
        resp = client.post("/api/llm-profiles/probe", json=payload)

    assert resp.status_code == 200
    body = resp.json()
    assert body["ok"] is False
    assert "connection refused" in body["message"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_test_saved_profile_success(client):
    """Testing a profile that was just created answers 200 with ``ok`` true.

    The profile is created through the API first so the ``/test`` route has a
    stored profile id to look up; the OpenAI client is mocked.
    """
    new_profile = {"name": "T", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
    created = client.post("/api/llm-profiles", json=new_profile)
    pid = created.json()["profile_id"]

    completion = MagicMock()
    completion.choices = [MagicMock()]

    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        openai_cls.return_value.chat.completions.create.return_value = completion
        resp = client.post(f"/api/llm-profiles/{pid}/test")

    assert resp.status_code == 200
    assert resp.json()["ok"] is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_test_nonexistent_profile_returns_404(client):
    """The ``/test`` route answers 404 when the profile id is not known."""
    missing_id = "nonexistent"
    outcome = client.post(f"/api/llm-profiles/{missing_id}/test")
    assert outcome.status_code == 404
|
||||||
205
tests/webapp/test_profile_manager.py
Normal file
205
tests/webapp/test_profile_manager.py
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
import pytest
|
||||||
|
from unittest.mock import sentinel
|
||||||
|
|
||||||
|
from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse
|
||||||
|
|
||||||
|
def test_llm_profile_defaults():
    """Fields left out of the constructor get a 30 second timeout and non-empty timestamps."""
    required_fields = {
        "profile_id": "abc",
        "name": "Test",
        "model": "gpt-4",
        "base_url": "http://localhost/v1",
        "api_key": "sk-test",
    }
    profile = LLMProfile(**required_fields)

    assert profile.timeout_seconds == 30
    assert profile.created_at != ""
    assert profile.updated_at != ""
|
||||||
|
|
||||||
|
def test_profile_apply_request_fields():
    """``ProfileApplyRequest`` keeps the supplied ids and accepts ``None`` for an unused role."""
    apply_req = ProfileApplyRequest(
        scenario_path="scenarios/offline/sample.yaml",
        judge_profile_id="id1",
        answer_profile_id="id2",
        dataset_profile_id=None,
    )

    assert apply_req.judge_profile_id == "id1"
    assert apply_req.dataset_profile_id is None
|
||||||
|
|
||||||
|
def test_profile_apply_response():
    """``ProfileApplyResponse`` exposes the patched field names it was built with."""
    response = ProfileApplyResponse(
        scenario_path="scenarios/offline/sample.yaml",
        patched_fields=["judge_model"],
    )
    assert "judge_model" in response.patched_fields
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# ProfileManager service tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
import json
|
||||||
|
from webapp.services.profile_manager import ProfileManager
|
||||||
|
|
||||||
|
|
||||||
|
def _make_manager(tmp_path):
    """Build a ``ProfileManager`` whose store is ``profiles.json`` inside ``tmp_path``."""
    return ProfileManager(store_path=tmp_path / "profiles.json")
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_profile(tmp_path):
    """``create`` returns a profile with a non-empty id and the requested name."""
    manager = _make_manager(tmp_path)
    profile = manager.create(
        name="Local",
        model="deepseek-v4-flash",
        base_url="http://localhost/v1",
        api_key="sk-x",
    )

    assert profile.profile_id != ""
    assert profile.name == "Local"
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_profiles(tmp_path):
    """``list_all`` returns one entry per created profile."""
    manager = _make_manager(tmp_path)
    manager.create(name="A", model="m1", base_url="http://a/v1", api_key="k1")
    manager.create(name="B", model="m2", base_url="http://b/v1", api_key="k2")

    assert len(manager.list_all()) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_profile(tmp_path):
    """``get`` finds a profile by the id that ``create`` returned."""
    manager = _make_manager(tmp_path)
    original = manager.create(name="X", model="m", base_url="http://x/v1", api_key="k")

    looked_up = manager.get(original.profile_id)

    assert looked_up is not None
    assert looked_up.name == "X"
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_profile(tmp_path):
    """``update`` returns the profile carrying the new name, model and timeout."""
    manager = _make_manager(tmp_path)
    existing = manager.create(name="Old", model="m", base_url="http://x/v1", api_key="k")

    changes = {
        "name": "New",
        "model": "m2",
        "base_url": "http://x/v1",
        "api_key": "k",
        "timeout_seconds": 60,
    }
    result = manager.update(existing.profile_id, **changes)

    assert result is not None
    assert result.name == "New"
    assert result.model == "m2"
    assert result.timeout_seconds == 60
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_profile(tmp_path):
    """``delete`` returns ``True`` for an existing id, after which ``get`` returns ``None``."""
    manager = _make_manager(tmp_path)
    doomed = manager.create(name="Del", model="m", base_url="http://x/v1", api_key="k")
    doomed_id = doomed.profile_id

    assert manager.delete(doomed_id) is True
    assert manager.get(doomed_id) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_persistence(tmp_path):
    """A profile created by one manager is visible to a second manager on the same store path."""
    store_file = tmp_path / "profiles.json"

    writer = ProfileManager(store_path=store_file)
    saved = writer.create(name="Persist", model="m", base_url="http://x/v1", api_key="k")

    reader = ProfileManager(store_path=store_file)
    assert reader.get(saved.profile_id) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_nonexistent(tmp_path):
    """``get`` returns ``None`` for an id that was never created."""
    manager = _make_manager(tmp_path)
    lookup = manager.get("does-not-exist")
    assert lookup is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_nonexistent(tmp_path):
    """``delete`` returns ``False`` for an id that was never created."""
    manager = _make_manager(tmp_path)
    removed = manager.delete("does-not-exist")
    assert removed is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_openai_client_kwargs_prefers_matching_profile(tmp_path, monkeypatch):
    """A saved profile whose model matches wins over the env-style settings values.

    A temporary manager holding one ``gpt-5.5`` profile replaces the
    module-level ``profile_manager``; the resolved kwargs must carry that
    profile's key, base URL and timeout instead of the ones in ``settings``.
    """
    import webapp.services.profile_manager as pm_mod
    from rag_eval.metrics.factory import _resolve_openai_client_kwargs
    from rag_eval.settings import EvaluationSettings

    manager = _make_manager(tmp_path)
    manager.create(
        name="Judge",
        model="gpt-5.5",
        base_url="http://39.107.88.131:13000",
        api_key="sk-profile",
        timeout_seconds=300,
    )
    monkeypatch.setattr(pm_mod, "profile_manager", manager)

    env_settings = EvaluationSettings(
        OPENAI_API_KEY="sk-env",
        OPENAI_BASE_URL="http://env-base/v1",
        OPENAI_TIMEOUT_SECONDS=30,
    )

    resolved = _resolve_openai_client_kwargs("gpt-5.5", env_settings)

    assert resolved["api_key"] == "sk-profile"
    assert resolved["base_url"] == "http://39.107.88.131:13000"
    assert resolved["timeout"] == 300.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_openai_client_kwargs_falls_back_to_env(tmp_path, monkeypatch):
    """With no saved profile at all, the resolved kwargs come from the settings object."""
    import webapp.services.profile_manager as pm_mod
    from rag_eval.metrics.factory import _resolve_openai_client_kwargs
    from rag_eval.settings import EvaluationSettings

    # An empty manager stands in for the module-level one, so nothing can match.
    empty_manager = _make_manager(tmp_path)
    monkeypatch.setattr(pm_mod, "profile_manager", empty_manager)

    env_settings = EvaluationSettings(
        OPENAI_API_KEY="sk-env",
        OPENAI_BASE_URL="http://env-base/v1",
        OPENAI_TIMEOUT_SECONDS=45,
    )

    resolved = _resolve_openai_client_kwargs("gpt-5", env_settings)

    assert resolved["api_key"] == "sk-env"
    assert resolved["base_url"] == "http://env-base/v1"
    assert resolved["timeout"] == 45.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_models_uses_high_default_max_tokens_for_structured_judge(monkeypatch):
    """Structured RAGAS judge calls should use a larger completion budget by default.

    ``AsyncOpenAI``, ``llm_factory`` and ``embedding_factory`` are replaced on
    the factory module so nothing real is constructed. The fake LLM factory
    records its arguments, and the test asserts that the only extra keyword
    forwarded is ``max_tokens=4096``.
    """
    import rag_eval.metrics.factory as factory
    from rag_eval.settings import EvaluationSettings

    captured: dict[str, object] = {}

    def fake_llm_factory(model, client=None, **kwargs):
        captured["model"] = model
        captured["client"] = client
        captured["kwargs"] = kwargs
        return sentinel.llm

    monkeypatch.setattr(factory, "AsyncOpenAI", lambda **kwargs: sentinel.client)
    monkeypatch.setattr(factory, "llm_factory", fake_llm_factory)
    monkeypatch.setattr(factory, "embedding_factory", lambda **kwargs: sentinel.embeddings)

    # The assertion below is about the built-in default, so keep the settings
    # hermetic: ignore any local .env file and drop an exported override.
    # NOTE(review): assumes the env var shares the RAGAS_LLM_MAX_TOKENS alias
    # used as the constructor keyword elsewhere in this file - confirm.
    monkeypatch.delenv("RAGAS_LLM_MAX_TOKENS", raising=False)

    llm, embeddings = factory.build_models(
        "gpt-5",
        "text-embedding-3-small",
        EvaluationSettings(_env_file=None),
    )

    assert llm is sentinel.llm
    assert embeddings is sentinel.embeddings
    assert captured["model"] == "gpt-5"
    assert captured["client"] is sentinel.client
    assert captured["kwargs"] == {"max_tokens": 4096}
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_models_allows_env_override_for_judge_max_tokens(monkeypatch):
    """A ``RAGAS_LLM_MAX_TOKENS`` value on the settings object is forwarded to the LLM factory."""
    import rag_eval.metrics.factory as factory
    from rag_eval.settings import EvaluationSettings

    seen: dict[str, object] = {}

    def recording_llm_factory(model, client=None, **kwargs):
        # Only the extra keyword arguments matter for this test.
        seen["kwargs"] = kwargs
        return sentinel.llm

    replacements = (
        ("AsyncOpenAI", lambda **kwargs: sentinel.client),
        ("llm_factory", recording_llm_factory),
        ("embedding_factory", lambda **kwargs: sentinel.embeddings),
    )
    for attr_name, stand_in in replacements:
        monkeypatch.setattr(factory, attr_name, stand_in)

    raised_budget = EvaluationSettings(RAGAS_LLM_MAX_TOKENS=8192)
    factory.build_models("gpt-5", "text-embedding-3-small", raised_budget)

    assert seen["kwargs"] == {"max_tokens": 8192}
|
||||||
341
tests/webapp/test_score_api.py
Normal file
341
tests/webapp/test_score_api.py
Normal file
@@ -0,0 +1,341 @@
|
|||||||
|
"""Tests for POST /api/score endpoint."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
from webapp.models import ScoreRequest, ScoreResponse
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreRequest:
    """Validation, defaults and helper methods of the ``ScoreRequest`` model."""

    def test_minimal_valid_request(self):
        """Question, answer and contexts alone are accepted; the rest takes defaults."""
        request = ScoreRequest(
            question="What is CT?",
            answer="CT is imaging.",
            contexts="CT uses X-rays.",
        )
        default_metrics = [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ]

        assert request.question == "What is CT?"
        assert request.contexts == "CT uses X-rays."
        assert request.ground_truth is None
        assert request.context_separator == " |||| "
        assert request.metrics == default_metrics

    def test_contexts_split_by_separator(self):
        """``contexts_as_list`` splits the contexts string on ``context_separator``."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="ctx1 |||| ctx2 |||| ctx3",
            context_separator=" |||| ",
        )
        pieces = request.contexts_as_list()
        assert pieces == ["ctx1", "ctx2", "ctx3"]

    def test_contexts_split_custom_separator(self):
        """A caller-chosen separator is honoured when splitting."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="a---b---c",
            context_separator="---",
        )
        pieces = request.contexts_as_list()
        assert pieces == ["a", "b", "c"]

    def test_contexts_split_single_item(self):
        """A contexts string without the separator yields a one-element list."""
        request = ScoreRequest(question="q", answer="a", contexts="only one")
        pieces = request.contexts_as_list()
        assert pieces == ["only one"]

    def test_missing_question_raises(self):
        """Leaving out ``question`` is a validation error."""
        incomplete = {"answer": "a", "contexts": "c"}
        with pytest.raises(ValidationError):
            ScoreRequest(**incomplete)

    def test_missing_answer_raises(self):
        """Leaving out ``answer`` is a validation error."""
        incomplete = {"question": "q", "contexts": "c"}
        with pytest.raises(ValidationError):
            ScoreRequest(**incomplete)

    def test_missing_contexts_defaults_to_none(self):
        """``contexts`` is optional: it is ``None`` when omitted and splits to an empty list."""
        request = ScoreRequest(question="q", answer="a")
        assert request.contexts is None
        assert request.contexts_as_list() == []

    def test_custom_metrics_accepted(self):
        """An explicit metrics list replaces the default one."""
        chosen = ["faithfulness"]
        request = ScoreRequest(question="q", answer="a", contexts="c", metrics=chosen)
        assert request.metrics == ["faithfulness"]

    def test_invalid_metric_name_raises(self):
        """A metric name the model does not accept is a validation error."""
        fields = {
            "question": "q",
            "answer": "a",
            "contexts": "c",
            "metrics": ["not_a_metric"],
        }
        with pytest.raises(ValidationError):
            ScoreRequest(**fields)

    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
        """Without ``ground_truth``, the ground-truth dependent metrics are filtered out."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            metrics=[
                "faithfulness",
                "context_recall",
                "factual_correctness",
                "semantic_similarity",
                "noise_sensitivity",
            ],
        )
        effective = request.effective_metrics()

        assert "faithfulness" in effective
        needs_ground_truth = (
            "context_recall",
            "factual_correctness",
            "semantic_similarity",
            "noise_sensitivity",
        )
        for metric_name in needs_ground_truth:
            assert metric_name not in effective

    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
        """With ``ground_truth`` supplied, the requested metrics come back unchanged and in order."""
        requested = ["faithfulness", "context_recall", "factual_correctness"]
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            ground_truth="gt",
            metrics=requested,
        )
        assert request.effective_metrics() == [
            "faithfulness",
            "context_recall",
            "factual_correctness",
        ]

    def test_effective_metrics_drops_context_dependent_when_contexts_absent(self):
        """Without ``contexts``, the context dependent metrics are filtered out."""
        request = ScoreRequest(
            question="q",
            answer="a",
            metrics=["faithfulness", "answer_relevancy", "context_precision"],
        )
        effective = request.effective_metrics()

        assert "answer_relevancy" in effective
        for metric_name in ("faithfulness", "context_precision"):
            assert metric_name not in effective
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreResponse:
    """Shape of the ``ScoreResponse`` model."""

    def test_score_response_structure(self):
        """Scores may hold floats or ``None``; latency is kept as given."""
        per_metric = {"faithfulness": 0.85, "answer_relevancy": None}
        response = ScoreResponse(
            scores=per_metric,
            weighted_score=0.85,
            latency_ms=1200,
        )

        assert response.scores["faithfulness"] == 0.85
        assert response.scores["answer_relevancy"] is None
        assert response.latency_ms == 1200
|
||||||
|
|
||||||
|
|
||||||
|
class TestInlineScorer:
    """``InlineScorer.score`` with the model builders and metric pipeline mocked out."""

    def test_score_returns_dict_with_requested_metrics(self):
        """The result is keyed by the requested metrics and carries the pipeline's values."""
        from unittest.mock import AsyncMock, MagicMock, patch

        from rag_eval.settings import EvaluationSettings
        from webapp.services.inline_scorer import InlineScorer

        sample_score = MagicMock()
        sample_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8}
        sample_score.error = ""

        pipeline = MagicMock()
        pipeline.score_sample = AsyncMock(return_value=sample_score)

        # Patch everything the scorer would otherwise build for real.
        models_patch = patch(
            "webapp.services.inline_scorer.build_models",
            return_value=(MagicMock(), MagicMock()),
        )
        pipeline_patch = patch(
            "webapp.services.inline_scorer.MetricPipeline",
            return_value=pipeline,
        )
        instances_patch = patch(
            "webapp.services.inline_scorer._build_metric_instances",
            return_value={},
        )

        with models_patch, pipeline_patch, instances_patch:
            scorer = InlineScorer()
            result = scorer.score(
                question="q",
                answer="a",
                contexts=["ctx1"],
                ground_truth=None,
                metrics=["faithfulness", "answer_relevancy"],
                judge_model="test-model",
                embedding_model="test-embed",
                settings=EvaluationSettings(_env_file=None),
            )

        assert "faithfulness" in result
        assert "answer_relevancy" in result
        assert result["faithfulness"] == pytest.approx(0.9)

    def test_score_converts_nan_to_none(self):
        """A NaN metric value from the pipeline comes back as ``None``."""
        import math
        from unittest.mock import AsyncMock, MagicMock, patch

        from rag_eval.settings import EvaluationSettings
        from webapp.services.inline_scorer import InlineScorer

        sample_score = MagicMock()
        sample_score.metrics = {"faithfulness": float("nan")}
        sample_score.error = ""

        pipeline = MagicMock()
        pipeline.score_sample = AsyncMock(return_value=sample_score)

        models_patch = patch(
            "webapp.services.inline_scorer.build_models",
            return_value=(MagicMock(), MagicMock()),
        )
        pipeline_patch = patch(
            "webapp.services.inline_scorer.MetricPipeline",
            return_value=pipeline,
        )
        instances_patch = patch(
            "webapp.services.inline_scorer._build_metric_instances",
            return_value={},
        )

        with models_patch, pipeline_patch, instances_patch:
            scorer = InlineScorer()
            result = scorer.score(
                question="q",
                answer="a",
                contexts=["c"],
                ground_truth=None,
                metrics=["faithfulness"],
                judge_model="m",
                embedding_model="e",
                settings=EvaluationSettings(_env_file=None),
            )

        assert result["faithfulness"] is None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Endpoint integration tests ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@pytest.fixture()
def client(monkeypatch):
    """Return a ``TestClient`` for the app with ``inline_scorer`` replaced by a mock.

    The mock's ``score`` always returns fixed ``faithfulness`` and
    ``answer_relevancy`` values, so endpoint tests never reach a real model.
    """
    from unittest.mock import MagicMock

    # Imported locally, like the sibling tests do, so the fixture does not
    # depend on a module-level import that is placed after this definition.
    from fastapi.testclient import TestClient

    import webapp.api.score as score_mod
    from webapp.server import create_app

    mock_scorer = MagicMock()
    mock_scorer.score.return_value = {
        "faithfulness": 0.85,
        "answer_relevancy": 0.90,
    }
    monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)

    return TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreEndpoint:
    """HTTP-level tests for ``POST /api/score`` with the scorer mocked."""

    def test_post_score_returns_200(self, client):
        """A valid request answers 200 with scores and a latency field."""
        payload = {
            "question": "What is CT?",
            "answer": "CT is imaging.",
            "contexts": "CT uses X-rays.",
        }
        resp = client.post("/api/score", json=payload)

        assert resp.status_code == 200
        body = resp.json()
        assert "scores" in body
        assert "latency_ms" in body
        assert body["scores"]["faithfulness"] == pytest.approx(0.85)

    def test_weighted_score_computed(self, client):
        """The response carries a ``weighted_score`` field, currently always null."""
        payload = {"question": "q", "answer": "a", "contexts": "c"}
        resp = client.post("/api/score", json=payload)

        assert resp.status_code == 200
        body = resp.json()
        # The combined weighted score is temporarily disabled and always returns null.
        assert body["weighted_score"] is None

    def test_missing_required_fields_returns_422(self, client):
        """A body with only ``question`` is rejected with 422."""
        resp = client.post("/api/score", json={"question": "q"})
        assert resp.status_code == 422

    def test_invalid_metric_name_returns_422(self, client):
        """An unknown metric name is rejected with 422."""
        payload = {
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["not_a_metric"],
        }
        resp = client.post("/api/score", json=payload)
        assert resp.status_code == 422

    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
        """Without ground truth, ``context_recall`` is listed under ``skipped_metrics``."""
        payload = {
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["faithfulness", "context_recall"],
        }
        resp = client.post("/api/score", json=payload)

        assert resp.status_code == 200
        body = resp.json()
        assert "context_recall" in body["skipped_metrics"]

    def test_contexts_split_on_separator(self, monkeypatch):
        """The contexts string is split into a list before it reaches the scorer."""
        from unittest.mock import MagicMock

        from fastapi.testclient import TestClient

        import webapp.api.score as score_mod
        from webapp.server import create_app

        seen_contexts = []

        def record(**kwargs):
            # Remember what the endpoint passed as ``contexts`` on each call.
            seen_contexts.append(kwargs.get("contexts", []))
            return {"faithfulness": 0.9}

        scorer = MagicMock()
        scorer.score.side_effect = record
        monkeypatch.setattr(score_mod, "inline_scorer", scorer)

        http = TestClient(create_app())
        http.post("/api/score", json={
            "question": "q", "answer": "a",
            "contexts": "ctx1 |||| ctx2",
            "context_separator": " |||| ",
        })

        assert len(seen_contexts) == 1
        assert seen_contexts[0] == ["ctx1", "ctx2"]

    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
        """With ``score_api_token`` set, a missing token gets 401 and the right one gets 200."""
        from unittest.mock import MagicMock

        from fastapi.testclient import TestClient

        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        from webapp.server import create_app

        token_settings = EvaluationSettings(_env_file=None)
        # Set the attribute directly on the instance, bypassing the model's own __setattr__.
        object.__setattr__(token_settings, "score_api_token", "secret-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: token_settings)

        scorer = MagicMock()
        scorer.score.return_value = {"faithfulness": 0.9}
        monkeypatch.setattr(score_mod, "inline_scorer", scorer)

        http = TestClient(create_app())
        payload = {"question": "q", "answer": "a", "contexts": "c"}

        # No auth header -> 401
        anonymous = http.post("/api/score", json=payload)
        assert anonymous.status_code == 401

        # Correct token -> 200
        authorised = http.post(
            "/api/score",
            json=payload,
            headers={"Authorization": "Bearer secret-token"},
        )
        assert authorised.status_code == 200

    def test_wrong_bearer_token_returns_401(self, monkeypatch):
        """With ``score_api_token`` set, a different bearer token gets 401."""
        from unittest.mock import MagicMock

        from fastapi.testclient import TestClient

        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        from webapp.server import create_app

        token_settings = EvaluationSettings(_env_file=None)
        object.__setattr__(token_settings, "score_api_token", "correct-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: token_settings)

        scorer = MagicMock()
        scorer.score.return_value = {}
        monkeypatch.setattr(score_mod, "inline_scorer", scorer)

        http = TestClient(create_app())
        rejected = http.post(
            "/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer wrong-token"},
        )
        assert rejected.status_code == 401
|
||||||
146
tests/webapp/test_score_jobs_api.py
Normal file
146
tests/webapp/test_score_jobs_api.py
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
"""Tests for async score jobs API."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def client(tmp_path, monkeypatch):
    """Return a ``TestClient`` whose score-job manager writes only under ``tmp_path``.

    A fresh ``ScoreJobManager`` replaces the ``score_job_manager`` attribute on
    both the service module and the API module before the app is created.
    """
    import webapp.api.score_jobs as api_mod
    import webapp.services.score_job_manager as mgr_mod
    from webapp.server import create_app
    from webapp.services.score_job_manager import ScoreJobManager

    isolated_manager = ScoreJobManager(
        output_dir=tmp_path / "score-async",
        index_dir=tmp_path / "score-jobs",
        max_workers=2,
    )
    for module in (mgr_mod, api_mod):
        monkeypatch.setattr(module, "score_job_manager", isolated_manager)

    return TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
class TestAsyncScoreEndpoints:
    """HTTP-level tests for the async score-job routes.

    ``ScoreJobManager._run`` is patched wherever a job is submitted, so no
    scoring work is executed.
    """

    def test_submit_returns_202_with_job_id(self, client):
        """Submitting a job answers 202 with a job id and ``queued`` status."""
        payload = {
            "question": "q?",
            "answer": "a.",
            "metrics": ["answer_relevancy"],
        }
        with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
            resp = client.post("/api/score/async", json=payload)

        assert resp.status_code == 202
        body = resp.json()
        assert "job_id" in body
        assert body["status"] == "queued"

    def test_list_jobs_empty_initially(self, client):
        """A fresh manager lists no jobs."""
        resp = client.get("/api/score/jobs")
        assert resp.status_code == 200
        assert resp.json()["jobs"] == []

    def test_get_unknown_job_returns_404(self, client):
        """Looking up an unknown job id answers 404."""
        resp = client.get("/api/score/jobs/nonexistent123")
        assert resp.status_code == 404

    def test_submitted_job_appears_in_list(self, client):
        """A submitted job's id shows up in the job list."""
        payload = {"question": "q?", "answer": "a.", "metrics": ["answer_relevancy"]}
        with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
            submitted = client.post("/api/score/async", json=payload)
            job_id = submitted.json()["job_id"]
            time.sleep(0.1)
            listing = client.get("/api/score/jobs")

        listed_ids = [job["job_id"] for job in listing.json()["jobs"]]
        assert job_id in listed_ids

    def test_get_job_by_id_returns_status(self, client):
        """A submitted job can be fetched by its id."""
        payload = {"question": "q?", "answer": "a.", "metrics": ["answer_relevancy"]}
        with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
            submitted = client.post("/api/score/async", json=payload)
            job_id = submitted.json()["job_id"]
            time.sleep(0.1)
            fetched = client.get(f"/api/score/jobs/{job_id}")

        assert fetched.status_code == 200
        assert fetched.json()["job_id"] == job_id

    def test_missing_required_fields_returns_422(self, client):
        """A body with only ``question`` is rejected with 422."""
        resp = client.post("/api/score/async", json={"question": "q?"})
        assert resp.status_code == 422
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreJobManager:
|
||||||
|
def test_completed_job_persisted_to_index(self, tmp_path):
    """A job that reaches ``completed`` has an index JSON file named after its id.

    ``_run`` is replaced by a fake that immediately marks the job completed
    through ``_update``. The test polls the manager (up to 20 times, 0.1 s
    apart) for that status, then checks the ``<job_id>.json`` file in the
    index directory.
    """
    from webapp.models import ScoreRequest
    from webapp.services.score_job_manager import ScoreJobManager

    mgr = ScoreJobManager(
        output_dir=tmp_path / "runs",
        index_dir=tmp_path / "index",
        max_workers=1,
    )
    req = ScoreRequest(question="q?", answer="a.", metrics=["answer_relevancy"])

    # Patch _run directly — it uses lazy imports internally
    def fake_run(job_id, request):
        mgr._update(job_id, status="completed", finished_at="2026-01-01T00:00:01+00:00",
                    run_id="fake-run-id", scores={"answer_relevancy": 0.85},
                    weighted_score=0.85, latency_ms=500)

    with patch.object(mgr, "_run", side_effect=fake_run):
        status = mgr.submit(req)

        # Keep the patch active while waiting so only the fake can ever run.
        for _ in range(20):
            s = mgr.get(status.job_id)
            if s and s.status == "completed":
                break
            time.sleep(0.1)

    s = mgr.get(status.job_id)
    assert s is not None
    # Fail here with a clear message on timeout instead of later on a missing
    # index file.
    assert s.status == "completed", f"job did not complete in time (status={s.status!r})"

    idx_path = tmp_path / "index" / f"{status.job_id}.json"
    assert idx_path.exists()
    data = json.loads(idx_path.read_text(encoding="utf-8"))
    assert data["job_id"] == status.job_id
    assert data["status"] == "completed"
|
||||||
|
|
||||||
|
def test_loads_existing_index_on_startup(self, tmp_path):
|
||||||
|
"""Manager loads persisted jobs from index dir on init."""
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
from webapp.models import AsyncScoreJobStatus
|
||||||
|
|
||||||
|
idx_dir = tmp_path / "index"
|
||||||
|
idx_dir.mkdir()
|
||||||
|
fake = AsyncScoreJobStatus(
|
||||||
|
job_id="testjob001",
|
||||||
|
status="completed",
|
||||||
|
created_at="2026-01-01T00:00:00+00:00",
|
||||||
|
run_id="some-run-id",
|
||||||
|
scores={"answer_relevancy": 0.9},
|
||||||
|
weighted_score=0.9,
|
||||||
|
latency_ms=1000,
|
||||||
|
)
|
||||||
|
(idx_dir / "testjob001.json").write_text(
|
||||||
|
json.dumps(fake.model_dump(), ensure_ascii=False), encoding="utf-8"
|
||||||
|
)
|
||||||
|
mgr = ScoreJobManager(
|
||||||
|
output_dir=tmp_path / "runs",
|
||||||
|
index_dir=idx_dir,
|
||||||
|
max_workers=1,
|
||||||
|
)
|
||||||
|
loaded = mgr.get("testjob001")
|
||||||
|
assert loaded is not None
|
||||||
|
assert loaded.status == "completed"
|
||||||
|
assert loaded.run_id == "some-run-id"
|
||||||
299
tests/webapp/test_session_score_jobs_api.py
Normal file
299
tests/webapp/test_session_score_jobs_api.py
Normal file
@@ -0,0 +1,299 @@
|
|||||||
|
"""Tests for session-grouped async scoring API and SessionScoreJobManager."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixtures
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture()
def tmp_manager(tmp_path):
    """Provide a SessionScoreJobManager whose output and index dirs live under ``tmp_path``."""
    from webapp.services.session_score_manager import SessionScoreJobManager

    manager_kwargs = {
        "output_dir": tmp_path / "score-session",
        "index_dir": tmp_path / "score-session-jobs",
        "max_workers": 2,
    }
    return SessionScoreJobManager(**manager_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def client(tmp_path, monkeypatch):
    """Provide a TestClient whose app uses a fresh, tmp-backed SessionScoreJobManager."""
    import webapp.services.session_score_manager as mgr_mod
    from webapp.services.session_score_manager import SessionScoreJobManager

    isolated_manager = SessionScoreJobManager(
        output_dir=tmp_path / "score-session",
        index_dir=tmp_path / "score-session-jobs",
        max_workers=2,
    )
    # Swap the module-level manager in the service module ...
    monkeypatch.setattr(mgr_mod, "session_score_manager", isolated_manager)

    # ... and the name of the same manager held by the API module.
    import webapp.api.session_score_jobs as api_mod
    monkeypatch.setattr(api_mod, "session_score_manager", isolated_manager)

    from webapp.server import create_app

    # importorskip skips the test when fastapi.testclient cannot be imported.
    testclient_module = pytest.importorskip("fastapi.testclient")
    return testclient_module.TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Unit tests for SessionScoreJobManager
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestSessionRunId:
|
||||||
|
def test_same_session_always_same_run_id(self, tmp_manager):
|
||||||
|
assert tmp_manager.session_run_id("abc") == tmp_manager.session_run_id("abc")
|
||||||
|
|
||||||
|
def test_different_sessions_different_run_ids(self, tmp_manager):
|
||||||
|
assert tmp_manager.session_run_id("session-A") != tmp_manager.session_run_id("session-B")
|
||||||
|
|
||||||
|
def test_run_id_prefixed_with_session(self, tmp_manager):
|
||||||
|
assert tmp_manager.session_run_id("test123").startswith("session-")
|
||||||
|
|
||||||
|
def test_special_chars_sanitized(self, tmp_manager):
|
||||||
|
run_id = tmp_manager.session_run_id("user@dify:flow/001")
|
||||||
|
assert "/" not in run_id
|
||||||
|
assert "@" not in run_id
|
||||||
|
assert ":" not in run_id
|
||||||
|
|
||||||
|
|
||||||
|
class TestSubmit:
|
||||||
|
def test_submit_returns_job_status_and_run_id(self, tmp_manager):
|
||||||
|
with patch.object(tmp_manager._executor, "submit"):
|
||||||
|
status, run_id = tmp_manager.submit("session-1", _mock_request())
|
||||||
|
assert status.job_id
|
||||||
|
assert status.status == "queued"
|
||||||
|
assert run_id == tmp_manager.session_run_id("session-1")
|
||||||
|
|
||||||
|
def test_submit_adds_job_to_session(self, tmp_manager):
|
||||||
|
with patch.object(tmp_manager._executor, "submit"):
|
||||||
|
status, _ = tmp_manager.submit("session-1", _mock_request())
|
||||||
|
session = tmp_manager.get_session("session-1")
|
||||||
|
assert session is not None
|
||||||
|
assert any(j.job_id == status.job_id for j in session.jobs)
|
||||||
|
|
||||||
|
def test_multiple_submits_same_session_accumulate(self, tmp_manager):
|
||||||
|
with patch.object(tmp_manager._executor, "submit"):
|
||||||
|
tmp_manager.submit("session-X", _mock_request())
|
||||||
|
tmp_manager.submit("session-X", _mock_request())
|
||||||
|
tmp_manager.submit("session-X", _mock_request())
|
||||||
|
session = tmp_manager.get_session("session-X")
|
||||||
|
assert session.call_count == 3
|
||||||
|
|
||||||
|
def test_get_unknown_job_returns_none(self, tmp_manager):
|
||||||
|
assert tmp_manager.get_job("does-not-exist") is None
|
||||||
|
|
||||||
|
def test_get_unknown_session_returns_none(self, tmp_manager):
|
||||||
|
assert tmp_manager.get_session("no-such-session") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestSessionIndexPersistence:
    """On-disk index behaviour of SessionScoreJobManager."""

    def test_session_index_survives_restart(self, tmp_path):
        """Jobs and session mappings loaded from disk on new manager instance."""
        from webapp.services.session_score_manager import SessionScoreJobManager

        mgr1 = SessionScoreJobManager(
            output_dir=tmp_path / "score-session",
            index_dir=tmp_path / "score-session-jobs",
        )
        # The executor's submit is mocked; only the bookkeeping done by
        # ``submit`` itself is under test.
        with patch.object(mgr1._executor, "submit"):
            mgr1.submit("persist-session", _mock_request())
            mgr1.submit("persist-session", _mock_request())

        # New manager instance loads from disk
        mgr2 = SessionScoreJobManager(
            output_dir=tmp_path / "score-session",
            index_dir=tmp_path / "score-session-jobs",
        )
        session = mgr2.get_session("persist-session")
        assert session is not None
        assert session.call_count == 2

    def test_job_index_file_created_on_submit(self, tmp_path):
        """Submitting a job creates ``<index_dir>/<job_id>.json`` holding that job id."""
        from webapp.services.session_score_manager import SessionScoreJobManager

        mgr = SessionScoreJobManager(
            output_dir=tmp_path / "score-session",
            index_dir=tmp_path / "score-session-jobs",
        )
        with patch.object(mgr._executor, "submit"):
            status, _ = mgr.submit("file-test", _mock_request())
        index_file = tmp_path / "score-session-jobs" / f"{status.job_id}.json"
        assert index_file.is_file()
        # Decode explicitly as UTF-8 rather than the platform default encoding.
        data = json.loads(index_file.read_text(encoding="utf-8"))
        assert data["job_id"] == status.job_id
|
||||||
|
|
||||||
|
|
||||||
|
class TestAppendBehaviour:
|
||||||
|
"""Test the CSV append / read-all logic in _append_and_regenerate via _read_score_rows."""
|
||||||
|
|
||||||
|
def test_read_score_rows_returns_empty_for_missing_csv(self, tmp_manager, tmp_path):
|
||||||
|
rows = tmp_manager._read_score_rows(tmp_path / "nonexistent")
|
||||||
|
assert rows == []
|
||||||
|
|
||||||
|
def test_read_score_rows_reads_existing_csv(self, tmp_manager, tmp_path):
|
||||||
|
run_dir = tmp_path / "run1"
|
||||||
|
run_dir.mkdir()
|
||||||
|
df = pd.DataFrame([{"sample_id": "s1", "answer_relevancy": 0.9}])
|
||||||
|
df.to_csv(run_dir / "scores.csv", index=False)
|
||||||
|
rows = tmp_manager._read_score_rows(run_dir)
|
||||||
|
assert len(rows) == 1
|
||||||
|
assert rows[0]["sample_id"] == "s1"
|
||||||
|
|
||||||
|
def test_metric_means_computed_from_csv(self, tmp_manager, tmp_path):
|
||||||
|
run_dir = tmp_path / "run2"
|
||||||
|
run_dir.mkdir()
|
||||||
|
df = pd.DataFrame([
|
||||||
|
{"sample_id": "s1", "answer_relevancy": 0.8},
|
||||||
|
{"sample_id": "s2", "answer_relevancy": 0.6},
|
||||||
|
])
|
||||||
|
df.to_csv(run_dir / "scores.csv", index=False)
|
||||||
|
means = tmp_manager._read_metric_means(run_dir)
|
||||||
|
assert means["answer_relevancy"] == pytest.approx(0.7, abs=1e-4)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# API endpoint tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestSessionAsyncEndpoints:
|
||||||
|
def test_submit_returns_202_with_session_fields(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "test-session-001",
|
||||||
|
"question": "What is CT?",
|
||||||
|
"answer": "CT is computed tomography.",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 202
|
||||||
|
data = resp.json()
|
||||||
|
assert data["session_id"] == "test-session-001"
|
||||||
|
assert "job_id" in data
|
||||||
|
assert "run_id" in data
|
||||||
|
assert data["status"] == "queued"
|
||||||
|
assert data["call_count"] >= 1
|
||||||
|
|
||||||
|
def test_run_id_deterministic_for_session(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
r1 = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "det-session",
|
||||||
|
"question": "Q1",
|
||||||
|
"answer": "A1",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
r2 = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "det-session",
|
||||||
|
"question": "Q2",
|
||||||
|
"answer": "A2",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert r1.json()["run_id"] == r2.json()["run_id"]
|
||||||
|
|
||||||
|
def test_different_sessions_different_run_ids(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
r1 = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "session-A",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
r2 = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "session-B",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert r1.json()["run_id"] != r2.json()["run_id"]
|
||||||
|
|
||||||
|
def test_call_count_increments_per_session(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
for _ in range(3):
|
||||||
|
client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "count-session",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
time.sleep(0.05)
|
||||||
|
resp = client.get("/api/score/sessions/count-session")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["call_count"] == 3
|
||||||
|
|
||||||
|
def test_get_session_returns_jobs_list(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "list-session",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
time.sleep(0.05)
|
||||||
|
resp = client.get("/api/score/sessions/list-session")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert len(data["jobs"]) == 1
|
||||||
|
|
||||||
|
def test_get_unknown_session_returns_404(self, client):
|
||||||
|
resp = client.get("/api/score/sessions/no-such-session-xyz")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
def test_get_session_job_by_id(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "job-lookup-session",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
job_id = resp.json()["job_id"]
|
||||||
|
time.sleep(0.05)
|
||||||
|
get_resp = client.get(f"/api/score/session/jobs/{job_id}")
|
||||||
|
assert get_resp.status_code == 200
|
||||||
|
assert get_resp.json()["job_id"] == job_id
|
||||||
|
|
||||||
|
def test_get_unknown_job_returns_404(self, client):
|
||||||
|
resp = client.get("/api/score/session/jobs/nonexistent-job-id")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
def test_missing_session_id_returns_422(self, client):
|
||||||
|
resp = client.post("/api/score/session_async", json={
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 422
|
||||||
|
|
||||||
|
def test_list_sessions_endpoint(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "list-all-session",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
resp = client.get("/api/score/sessions")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert "sessions" in resp.json()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _mock_request():
    """Return a small ScoreRequest (question, answer, one metric) for the tests above."""
    from webapp.models import ScoreRequest

    fields = {
        "question": "What is dual-source CT?",
        "answer": "It uses two X-ray sources.",
        "metrics": ["answer_relevancy"],
    }
    return ScoreRequest(**fields)
|
||||||
@@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
from webapp.models import (
|
from webapp.models import (
|
||||||
@@ -13,19 +15,23 @@ from webapp.services import scenario_scanner
|
|||||||
from webapp.services.task_manager import task_manager
|
from webapp.services.task_manager import task_manager
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/evaluations", tags=["evaluations"])
|
router = APIRouter(prefix="/api/evaluations", tags=["evaluations"])
|
||||||
|
logger = logging.getLogger("webapp.api.evaluations")
|
||||||
|
|
||||||
|
|
||||||
@router.post("", response_model=TriggerEvaluationResponse)
|
@router.post("", response_model=TriggerEvaluationResponse)
|
||||||
def trigger_evaluation(request: TriggerEvaluationRequest) -> TriggerEvaluationResponse:
|
def trigger_evaluation(request: TriggerEvaluationRequest) -> TriggerEvaluationResponse:
|
||||||
"""Validate the scenario path and queue a background evaluation task."""
|
"""Validate the scenario path and queue a background evaluation task."""
|
||||||
|
logger.info("[trigger] scenario=%s", request.scenario_path)
|
||||||
resolved = scenario_scanner.resolve_scenario_path(request.scenario_path)
|
resolved = scenario_scanner.resolve_scenario_path(request.scenario_path)
|
||||||
if resolved is None:
|
if resolved is None:
|
||||||
|
logger.warning("[trigger] invalid scenario path: %s", request.scenario_path)
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=400,
|
status_code=400,
|
||||||
detail=f"无效或不允许的场景路径: {request.scenario_path}",
|
detail=f"无效或不允许的场景路径: {request.scenario_path}",
|
||||||
)
|
)
|
||||||
|
|
||||||
task_id = task_manager.submit(request.scenario_path)
|
task_id = task_manager.submit(request.scenario_path)
|
||||||
|
logger.info("[trigger] queued task_id=%s scenario=%s", task_id, request.scenario_path)
|
||||||
return TriggerEvaluationResponse(task_id=task_id)
|
return TriggerEvaluationResponse(task_id=task_id)
|
||||||
|
|
||||||
|
|
||||||
@@ -34,11 +40,15 @@ def get_task_status(task_id: str) -> TaskStatus:
|
|||||||
"""Return the current status and logs for one evaluation task."""
|
"""Return the current status and logs for one evaluation task."""
|
||||||
status = task_manager.get(task_id)
|
status = task_manager.get(task_id)
|
||||||
if status is None:
|
if status is None:
|
||||||
|
logger.warning("[task_status] not found task_id=%s", task_id)
|
||||||
raise HTTPException(status_code=404, detail=f"未找到任务: {task_id}")
|
raise HTTPException(status_code=404, detail=f"未找到任务: {task_id}")
|
||||||
|
logger.debug("[task_status] task_id=%s status=%s", task_id, status.status)
|
||||||
return status
|
return status
|
||||||
|
|
||||||
|
|
||||||
@router.get("", response_model=dict)
|
@router.get("", response_model=dict)
|
||||||
def list_tasks() -> dict[str, list]:
|
def list_tasks() -> dict[str, list]:
|
||||||
"""Return all known evaluation tasks for this server session."""
|
"""Return all known evaluation tasks for this server session."""
|
||||||
return {"tasks": [task.model_dump() for task in task_manager.list_tasks()]}
|
tasks = task_manager.list_tasks()
|
||||||
|
logger.info("[list_tasks] count=%d", len(tasks))
|
||||||
|
return {"tasks": [task.model_dump() for task in tasks]}
|
||||||
|
|||||||
245
webapp/api/llm_profiles.py
Normal file
245
webapp/api/llm_profiles.py
Normal file
@@ -0,0 +1,245 @@
|
|||||||
|
"""CRUD routes for LLM profiles plus the scenario-patching apply endpoint."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
from webapp.models import (
|
||||||
|
CreateProfileRequest,
|
||||||
|
LLMProfile,
|
||||||
|
ProfileApplyRequest,
|
||||||
|
ProfileApplyResponse,
|
||||||
|
ProfileProbeRequest,
|
||||||
|
ProfileTestResponse,
|
||||||
|
)
|
||||||
|
from webapp.services.profile_manager import profile_manager
|
||||||
|
from webapp.services.yaml_patcher import apply_profiles_to_scenario
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"])
|
||||||
|
logger = logging.getLogger("webapp.api.llm_profiles")
|
||||||
|
|
||||||
|
|
||||||
|
# 常见 embedding 模型名称关键词,用于自动判断走 /embeddings 端点
|
||||||
|
_EMBEDDING_MODEL_KEYWORDS = (
|
||||||
|
"embedding", "embed", "text-search", "text-similarity",
|
||||||
|
"code-search", "ada-002",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_embedding_model(model: str) -> bool:
|
||||||
|
"""Heuristic: return True if the model name looks like an embedding model."""
|
||||||
|
return any(kw in model.lower() for kw in _EMBEDDING_MODEL_KEYWORDS)
|
||||||
|
|
||||||
|
|
||||||
|
def _do_connectivity_test(
    model: str,
    base_url: str,
    api_key: str,
    timeout_seconds: int,
) -> ProfileTestResponse:
    """Send one minimal request to the endpoint and report whether it worked.

    - Embedding models (as judged by ``_is_embedding_model``) call
      ``embeddings.create`` with a short text.
    - Chat models call ``chat.completions.create`` with ``max_tokens=8`` first.
      If that fails and the error text mentions both ``max_tokens`` and
      ``max_completion_tokens``, the request is retried once with
      ``max_completion_tokens=8`` instead.

    Args:
        model: Model name to probe.
        base_url: Endpoint base URL; a trailing ``/`` is stripped.
        api_key: Credential passed to the OpenAI client.
        timeout_seconds: Timeout handed to the client, as a float.

    Returns:
        ProfileTestResponse with ``ok``, a message (success text or
        ``str(exception)``) and ``latency_ms`` measured from just after the
        client is built, including any retry. Request failures are reported
        through ``ok=False`` rather than raised.
    """
    client = OpenAI(
        api_key=api_key,
        base_url=base_url.rstrip("/"),
        timeout=float(timeout_seconds),
    )
    t0 = time.monotonic()

    def _result(ok: bool, message: str) -> ProfileTestResponse:
        # Stamp every outcome with the elapsed time since t0.
        return ProfileTestResponse(
            ok=ok,
            message=message,
            latency_ms=int((time.monotonic() - t0) * 1000),
        )

    if _is_embedding_model(model):
        # Embedding models are probed through the /embeddings endpoint.
        try:
            client.embeddings.create(model=model, input="test")
        except Exception as exc:  # noqa: BLE001
            return _result(False, str(exc))
        return _result(True, "连接成功(embedding)")

    # Chat models: a small token cap keeps the probe cheap. The original author
    # chose 8 rather than 1 to avoid minimum-output limits on some models.
    messages = [{"role": "user", "content": "hi"}]
    try:
        client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=8,
        )
    except Exception as exc:  # noqa: BLE001
        err_str = str(exc)
        # If the endpoint rejects max_tokens, retry once with max_completion_tokens.
        if "max_tokens" in err_str and "max_completion_tokens" in err_str:
            try:
                client.chat.completions.create(
                    model=model,
                    messages=messages,
                    max_completion_tokens=8,
                )
            except Exception as exc2:  # noqa: BLE001
                return _result(False, str(exc2))
            return _result(True, "连接成功")
        return _result(False, err_str)
    return _result(True, "连接成功")
|
||||||
|
|
||||||
|
|
||||||
|
# Stateless connectivity probe: runs the same check as the saved-profile test,
# but with credentials taken directly from the request body.
@router.post("/probe", response_model=ProfileTestResponse, tags=["llm-profiles"])
def probe_connectivity(request: ProfileProbeRequest) -> ProfileTestResponse:
    """Test LLM connectivity with inline credentials (no saved profile required)."""
    logger.info("[probe] model=%s base_url=%s", request.model, request.base_url)
    probe_args = {
        "model": request.model,
        "base_url": request.base_url,
        "api_key": request.api_key,
        "timeout_seconds": request.timeout_seconds,
    }
    outcome = _do_connectivity_test(**probe_args)
    logger.info("[probe] ok=%s latency=%sms msg=%s", outcome.ok, outcome.latency_ms, outcome.message)
    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=dict)
def list_profiles() -> dict:
    """Return all saved LLM profiles."""
    saved = profile_manager.list_all()
    logger.info("[list_profiles] count=%d", len(saved))
    # NOTE(review): LLMProfile exposes an ``api_key`` attribute (read by
    # test_profile); confirm the dumped payload is safe to return to clients.
    serialized = [profile.model_dump() for profile in saved]
    return {"profiles": serialized}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("", status_code=201, response_model=LLMProfile)
def create_profile(request: CreateProfileRequest) -> LLMProfile:
    """Create a new LLM profile."""
    # The api_key is deliberately not part of the log line.
    logger.info("[create_profile] name=%r model=%s base_url=%s", request.name, request.model, request.base_url)
    new_fields = {
        "name": request.name,
        "model": request.model,
        "base_url": request.base_url,
        "api_key": request.api_key,
        "timeout_seconds": request.timeout_seconds,
    }
    created = profile_manager.create(**new_fields)
    logger.info("[create_profile] created id=%s", created.profile_id)
    return created
|
||||||
|
|
||||||
|
|
||||||
|
# Raises HTTPException(404) when profile_manager.update returns None.
@router.put("/{profile_id}", response_model=LLMProfile)
def update_profile(profile_id: str, request: CreateProfileRequest) -> LLMProfile:
    """Update an existing LLM profile by id."""
    logger.info("[update_profile] id=%s name=%r model=%s", profile_id, request.name, request.model)
    updated = profile_manager.update(
        profile_id=profile_id,
        name=request.name,
        model=request.model,
        base_url=request.base_url,
        api_key=request.api_key,
        timeout_seconds=request.timeout_seconds,
    )
    if updated is None:
        logger.warning("[update_profile] not found id=%s", profile_id)
        raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
    # Invalidate scorer cache so next request picks up the new profile settings.
    # Best-effort: a failure here must not fail the update, but it is logged
    # (with traceback) instead of being silently discarded.
    try:
        from webapp.services.inline_scorer import inline_scorer
        inline_scorer.invalidate_cache()
    except Exception:  # noqa: BLE001
        logger.warning(
            "[update_profile] scorer cache invalidation failed id=%s",
            profile_id,
            exc_info=True,
        )
    else:
        logger.info("[update_profile] scorer cache invalidated id=%s", profile_id)
    logger.info("[update_profile] updated id=%s", profile_id)
    return updated
|
||||||
|
|
||||||
|
|
||||||
|
# Raises HTTPException(404) when profile_manager.delete reports nothing deleted.
@router.delete("/{profile_id}", response_model=dict)
def delete_profile(profile_id: str) -> dict:
    """Delete an LLM profile by id."""
    logger.info("[delete_profile] id=%s", profile_id)
    deleted = profile_manager.delete(profile_id)
    if not deleted:
        logger.warning("[delete_profile] not found id=%s", profile_id)
        raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
    # Invalidate scorer cache in case the deleted profile was in use.
    # Best-effort: a failure here must not fail the delete, but it is logged
    # (with traceback) instead of being silently discarded.
    try:
        from webapp.services.inline_scorer import inline_scorer
        inline_scorer.invalidate_cache()
    except Exception:  # noqa: BLE001
        logger.warning(
            "[delete_profile] scorer cache invalidation failed id=%s",
            profile_id,
            exc_info=True,
        )
    logger.info("[delete_profile] deleted id=%s", profile_id)
    return {"deleted": True}
|
||||||
|
|
||||||
|
|
||||||
|
# Looks the profile up by id (404 when absent) and runs the shared
# connectivity check with the stored credentials.
@router.post("/{profile_id}/test", response_model=ProfileTestResponse)
def test_profile(profile_id: str) -> ProfileTestResponse:
    """Test LLM connectivity for a saved profile."""
    saved = profile_manager.get(profile_id)
    if saved is None:
        logger.warning("[test_profile] not found id=%s", profile_id)
        raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
    logger.info("[test_profile] id=%s model=%s base_url=%s", profile_id, saved.model, saved.base_url)
    probe_args = {
        "model": saved.model,
        "base_url": saved.base_url,
        "api_key": saved.api_key,
        "timeout_seconds": saved.timeout_seconds,
    }
    outcome = _do_connectivity_test(**probe_args)
    logger.info("[test_profile] ok=%s latency=%sms", outcome.ok, outcome.latency_ms)
    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
# Resolves the optional judge / answer / dataset profile ids, rejects the
# request with 400 if any supplied id is unknown, then patches the scenario.
@router.post("/apply", response_model=ProfileApplyResponse)
def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
    """Patch selected LLM profiles into the target scenario YAML file."""
    requested_ids = {
        "judge": request.judge_profile_id,
        "answer": request.answer_profile_id,
        "dataset": request.dataset_profile_id,
    }
    logger.info(
        "[apply_profiles] scenario=%s judge=%s answer=%s dataset=%s metric_weights=%s doc_weights=%s",
        request.scenario_path,
        requested_ids["judge"],
        requested_ids["answer"],
        requested_ids["dataset"],
        bool(request.metric_weights),
        bool(request.doc_weights),
    )

    # A role with no id stays None; a role with an id that cannot be found is
    # collected as missing.
    role_profiles: dict[str, LLMProfile | None] = {}
    missing = []
    for role, pid in requested_ids.items():
        resolved = profile_manager.get(pid) if pid else None
        role_profiles[role] = resolved
        if pid and resolved is None:
            missing.append(role)

    if missing:
        logger.warning("[apply_profiles] missing profiles for roles: %s", missing)
        raise HTTPException(
            status_code=400,
            detail=f"Profile(s) not found for roles: {', '.join(missing)}",
        )

    patched = apply_profiles_to_scenario(
        scenario_path=request.scenario_path,
        judge_profile=role_profiles["judge"],
        answer_profile=role_profiles["answer"],
        dataset_profile=role_profiles["dataset"],
        metric_weights=request.metric_weights,
        doc_weights=request.doc_weights,
    )
    logger.info("[apply_profiles] patched fields: %s", patched)
    response_fields = {
        "scenario_path": request.scenario_path,
        "patched_fields": patched,
    }
    return ProfileApplyResponse(**response_fields)
|
||||||
131
webapp/api/pipeline.py
Normal file
131
webapp/api/pipeline.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
"""Routes for the end-to-end pipeline API (document parse → build → eval)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from webapp.models import (
|
||||||
|
PipelineJobRequest,
|
||||||
|
PipelineJobResponse,
|
||||||
|
PipelineJobStatus,
|
||||||
|
)
|
||||||
|
from webapp.services.pipeline_task_manager import pipeline_task_manager
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/pipeline", tags=["pipeline"])
|
||||||
|
logger = logging.getLogger("webapp.api.pipeline")
|
||||||
|
|
||||||
|
|
||||||
|
# POST /api/pipeline/jobs: hands the request to pipeline_task_manager.submit
# and returns 202 with the job id, name and status.
# The summary, response descriptions and docstring below are passed to FastAPI
# as route metadata, so they are kept verbatim in the original language.
@router.post(
    "/jobs",
    status_code=202,
    response_model=PipelineJobResponse,
    summary="提交全链路评估任务",
    responses={
        202: {
            "description": "任务已成功排队,立即返回 job_id。",
            "content": {
                "application/json": {
                    "example": {
                        "job_id": "a1b2c3d4e5f6",
                        "job_name": "siemens-ct-eval-2026",
                        "status": "queued",
                    }
                }
            },
        },
        422: {"description": "请求参数校验失败(docs_path 等必填字段缺失或格式错误)。"},
    },
)
def submit_pipeline_job(request: PipelineJobRequest) -> PipelineJobResponse:
    """提交一个「解析文档 → 生成题库 → RAGAS 评估 → 输出报告」全链路任务。

    任务在后台线程中异步执行,立即返回 `job_id`。
    通过 `GET /api/pipeline/jobs/{job_id}` 轮询 `status` / `phase` / `logs`。

    **Pipeline 执行阶段**:
    1. `parsing_documents` — 调用阿里云 DocMind 解析每份 PDF
    2. `generating_questions` — LLM 从文档片段生成草稿题库
    3. `evaluating` — RAGAS 在线评测打分(answer_model 答题 + judge_model 评分)
    4. `done` — 所有产物写入磁盘,`status` 变为 `completed`
    """
    logger.info(
        "[submit_pipeline] docs_path=%s job_name=%r gen_model=%s judge=%s max_docs=%s",
        request.docs_path,
        request.job_name,
        request.generation_model,
        request.judge_model,
        request.max_documents,
    )
    queued = pipeline_task_manager.submit(request)
    logger.info("[submit_pipeline] queued job_id=%s job_name=%s", queued.job_id, queued.job_name)
    # Echo back only the identifying fields of the queued task.
    response_fields = {
        "job_id": queued.job_id,
        "job_name": queued.job_name,
        "status": queued.status,
    }
    return PipelineJobResponse(**response_fields)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/jobs/{job_id}",
|
||||||
|
response_model=PipelineJobStatus,
|
||||||
|
summary="查询任务状态",
|
||||||
|
responses={
|
||||||
|
200: {"description": "返回任务当前状态、执行阶段、日志及完成后的产物路径。"},
|
||||||
|
404: {"description": "指定 job_id 的任务不存在。"},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def get_pipeline_job(job_id: str) -> PipelineJobStatus:
|
||||||
|
"""查询一个 Pipeline 任务的当前状态、执行阶段、实时日志和结果。
|
||||||
|
|
||||||
|
**轮询建议**:每 3–5 秒查询一次,直到 `status` 为 `completed` 或 `failed`。
|
||||||
|
|
||||||
|
`result` 字段在任务完成后填充,包含:
|
||||||
|
- `scores_csv` — 每道题目逐项评分
|
||||||
|
- `summary_md` — 评估摘要 Markdown
|
||||||
|
- `dataset_csv` — 生成的题库 CSV
|
||||||
|
- `source_chunks_jsonl` — 文档片段索引
|
||||||
|
"""
|
||||||
|
status = pipeline_task_manager.get(job_id)
|
||||||
|
if status is None:
|
||||||
|
logger.warning("[get_pipeline_job] not found job_id=%s", job_id)
|
||||||
|
raise HTTPException(status_code=404, detail=f"Pipeline job not found: {job_id}")
|
||||||
|
logger.debug("[get_pipeline_job] job_id=%s status=%s phase=%s", job_id, status.status, status.phase)
|
||||||
|
return status
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/jobs",
|
||||||
|
response_model=dict,
|
||||||
|
summary="列出所有任务",
|
||||||
|
responses={
|
||||||
|
200: {
|
||||||
|
"description": "按创建时间倒序返回本次服务器会话中所有的 Pipeline 任务。",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"example": {
|
||||||
|
"jobs": [
|
||||||
|
{
|
||||||
|
"job_id": "a1b2c3d4e5f6",
|
||||||
|
"job_name": "siemens-ct-eval",
|
||||||
|
"status": "completed",
|
||||||
|
"phase": "done",
|
||||||
|
"logs": ["[build] 17 documents parsed", "..."],
|
||||||
|
"result": {
|
||||||
|
"total_questions": 19,
|
||||||
|
"eval_run_id": "2026-06-18T...",
|
||||||
|
"scores_csv": "outputs/pipeline/.../scores.csv",
|
||||||
|
"summary_md": "outputs/pipeline/.../summary.md",
|
||||||
|
},
|
||||||
|
"error": None,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def list_pipeline_jobs() -> dict:
|
||||||
|
"""返回本次服务器会话中所有已提交的 Pipeline 任务,按创建时间倒序排列。"""
|
||||||
|
jobs = pipeline_task_manager.list_jobs()
|
||||||
|
logger.info("[list_pipeline_jobs] count=%d", len(jobs))
|
||||||
|
return {"jobs": [s.model_dump() for s in jobs]}
|
||||||
@@ -2,31 +2,42 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
from webapp.models import RunDetail
|
from webapp.models import RunDetail
|
||||||
from webapp.services import report_builder, run_reader
|
from webapp.services import report_builder, run_reader
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/runs", tags=["runs"])
|
router = APIRouter(prefix="/api/runs", tags=["runs"])
|
||||||
|
logger = logging.getLogger("webapp.api.runs")
|
||||||
|
|
||||||
|
|
||||||
@router.get("")
|
@router.get("")
|
||||||
def get_runs() -> dict[str, list]:
|
def get_runs() -> dict[str, list]:
|
||||||
"""Return summaries for every discoverable evaluation run."""
|
"""Return summaries for every discoverable evaluation run."""
|
||||||
summaries = run_reader.list_run_summaries()
|
summaries = run_reader.list_run_summaries()
|
||||||
|
logger.info("[get_runs] found %d runs", len(summaries))
|
||||||
return {"runs": [summary.model_dump() for summary in summaries]}
|
return {"runs": [summary.model_dump() for summary in summaries]}
|
||||||
|
|
||||||
|
|
||||||
@router.get("/{run_id}")
|
@router.get("/{run_id}")
|
||||||
def get_run_detail(run_id: str) -> RunDetail:
|
def get_run_detail(run_id: str) -> RunDetail:
|
||||||
"""Return the full summary and aggregated report for one run."""
|
"""Return the full summary and aggregated report for one run."""
|
||||||
|
logger.info("[get_run_detail] run_id=%s", run_id)
|
||||||
run_dir = run_reader.find_run_dir(run_id)
|
run_dir = run_reader.find_run_dir(run_id)
|
||||||
if run_dir is None:
|
if run_dir is None:
|
||||||
|
logger.warning("[get_run_detail] not found run_id=%s", run_id)
|
||||||
raise HTTPException(status_code=404, detail=f"未找到运行: {run_id}")
|
raise HTTPException(status_code=404, detail=f"未找到运行: {run_id}")
|
||||||
|
|
||||||
summary = run_reader.build_run_summary(run_dir)
|
summary = run_reader.build_run_summary(run_dir)
|
||||||
if summary is None:
|
if summary is None:
|
||||||
|
logger.warning("[get_run_detail] missing metadata run_id=%s", run_id)
|
||||||
raise HTTPException(status_code=404, detail=f"运行元数据缺失: {run_id}")
|
raise HTTPException(status_code=404, detail=f"运行元数据缺失: {run_id}")
|
||||||
|
|
||||||
report = report_builder.build_report(run_dir, summary.metrics)
|
report = report_builder.build_report(run_dir, summary.metrics)
|
||||||
|
logger.info(
|
||||||
|
"[get_run_detail] ok run_id=%s metrics=%s valid=%d invalid=%d",
|
||||||
|
run_id, summary.metrics, summary.valid_samples, summary.invalid_samples,
|
||||||
|
)
|
||||||
return RunDetail(summary=summary, report=report)
|
return RunDetail(summary=summary, report=report)
|
||||||
|
|||||||
@@ -2,15 +2,20 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
from fastapi import APIRouter
|
from fastapi import APIRouter
|
||||||
|
|
||||||
from webapp.services import scenario_scanner
|
from webapp.services import scenario_scanner
|
||||||
|
|
||||||
router = APIRouter(prefix="/api/scenarios", tags=["scenarios"])
|
router = APIRouter(prefix="/api/scenarios", tags=["scenarios"])
|
||||||
|
logger = logging.getLogger("webapp.api.scenarios")
|
||||||
|
|
||||||
|
|
||||||
@router.get("")
|
@router.get("")
|
||||||
def get_scenarios() -> dict[str, list]:
|
def get_scenarios() -> dict[str, list]:
|
||||||
"""Return every scenario file found under the scenarios/ directory."""
|
"""Return every scenario file found under the scenarios/ directory."""
|
||||||
scenarios = scenario_scanner.list_scenarios()
|
scenarios = scenario_scanner.list_scenarios()
|
||||||
|
valid = sum(1 for s in scenarios if not s.error)
|
||||||
|
logger.info("[get_scenarios] total=%d valid=%d errors=%d", len(scenarios), valid, len(scenarios) - valid)
|
||||||
return {"scenarios": [item.model_dump() for item in scenarios]}
|
return {"scenarios": [item.model_dump() for item in scenarios]}
|
||||||
|
|||||||
176
webapp/api/score.py
Normal file
176
webapp/api/score.py
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Header, HTTPException, Request
|
||||||
|
from fastapi.exceptions import RequestValidationError
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from webapp.models import ScoreRequest, ScoreResponse
|
||||||
|
from webapp.services.inline_scorer import inline_scorer
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
logger = logging.getLogger("webapp.api.score")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_settings() -> EvaluationSettings:
|
||||||
|
"""Return a fresh EvaluationSettings instance (overridable in tests)."""
|
||||||
|
return EvaluationSettings()
|
||||||
|
|
||||||
|
|
||||||
|
def _check_auth(authorization: str | None, token: str) -> None:
|
||||||
|
"""Raise 401 if Bearer token does not match the configured token."""
|
||||||
|
if authorization is None:
|
||||||
|
raise HTTPException(status_code=401, detail="Missing Authorization header.")
|
||||||
|
parts = authorization.split(" ", 1)
|
||||||
|
if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
|
||||||
|
raise HTTPException(status_code=401, detail="Invalid Bearer token.")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"",
|
||||||
|
response_model=ScoreResponse,
|
||||||
|
summary="单题实时评分(Dify 外部 Tool)",
|
||||||
|
responses={
|
||||||
|
200: {
|
||||||
|
"description": "各指标得分、加权综合得分及耗时。",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"example": {
|
||||||
|
"scores": {
|
||||||
|
"faithfulness": 0.875,
|
||||||
|
"answer_relevancy": 0.920,
|
||||||
|
"context_recall": 0.810,
|
||||||
|
"context_precision": 0.850,
|
||||||
|
},
|
||||||
|
"weighted_score": 0.8638,
|
||||||
|
"latency_ms": 3420,
|
||||||
|
"skipped_metrics": [],
|
||||||
|
"error": None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
|
||||||
|
422: {"description": "请求参数校验失败(必填字段缺失或 metrics 名称不合法)。"},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def score_sample(
|
||||||
|
raw_request: Request,
|
||||||
|
request: ScoreRequest,
|
||||||
|
authorization: Annotated[str | None, Header()] = None,
|
||||||
|
) -> ScoreResponse:
|
||||||
|
"""接受单条问答记录,同步运行 RAGAS 指标打分,实时返回各指标得分。
|
||||||
|
|
||||||
|
**主要用途**:供 Dify 外部 Tool 调用。Dify Agent 在生成回答后,将
|
||||||
|
`(question, answer, contexts)` 发送到此端点,即可获得 RAGAS 质量评分,
|
||||||
|
用于日志记录、质量监控或触发 Agent 自我改进流程。
|
||||||
|
|
||||||
|
**contexts 格式**:多个检索片段用 `context_separator`(默认 `" |||| "`)拼接为一个字符串,
|
||||||
|
服务端自动拆分后传入 RAGAS 管道。**contexts 为可选字段**,缺失时自动跳过依赖检索内容的指标
|
||||||
|
(`faithfulness`、`context_recall`、`context_precision`、`noise_sensitivity`)。
|
||||||
|
|
||||||
|
**ground_truth 可选**:
|
||||||
|
- 提供时:所有指定指标均参与计算。
|
||||||
|
- 缺失时:自动跳过依赖参考答案的指标(`context_recall`、
|
||||||
|
`factual_correctness`、`semantic_similarity`、`noise_sensitivity`),
|
||||||
|
跳过的指标在响应的 `skipped_metrics` 列表中列出,对应 `scores` 值为 `null`。
|
||||||
|
|
||||||
|
**支持的 RAGAS 指标**:
|
||||||
|
- `faithfulness` — 回答与检索片段的事实一致性
|
||||||
|
- `answer_relevancy` — 回答与问题的相关性
|
||||||
|
- `context_recall` — 参考答案覆盖到的检索内容比例(需 ground_truth)
|
||||||
|
- `context_precision` — 检索片段中与答案相关的部分占比
|
||||||
|
- `noise_sensitivity` — 对无关噪声片段的敏感度(需 ground_truth)
|
||||||
|
- `factual_correctness` — 回答与参考答案的事实准确性(需 ground_truth)
|
||||||
|
- `semantic_similarity` — 回答与参考答案的语义相似度(需 ground_truth)
|
||||||
|
|
||||||
|
**推荐模型配置**:
|
||||||
|
- `judge_model`: `gpt-5`
|
||||||
|
- `embedding_model`: `text-embedding-3-small`
|
||||||
|
|
||||||
|
**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需在请求头携带
|
||||||
|
`Authorization: Bearer <token>`;留空则无需鉴权(适合内网部署)。
|
||||||
|
"""
|
||||||
|
client = f"{raw_request.client.host}:{raw_request.client.port}" if raw_request.client else "unknown"
|
||||||
|
logger.info(
|
||||||
|
"[score] incoming client=%s method=%s content_type=%s metrics=%s has_gt=%s has_ctx=%s",
|
||||||
|
client,
|
||||||
|
raw_request.method,
|
||||||
|
raw_request.headers.get("content-type", ""),
|
||||||
|
request.metrics,
|
||||||
|
request.ground_truth is not None,
|
||||||
|
bool(request.contexts),
|
||||||
|
)
|
||||||
|
settings = _get_settings()
|
||||||
|
|
||||||
|
# Require Bearer auth only when the deployment configured a shared token.
|
||||||
|
if settings.score_api_token:
|
||||||
|
_check_auth(authorization, settings.score_api_token)
|
||||||
|
|
||||||
|
judge_model = request.judge_model or settings.ragas_judge_model
|
||||||
|
embedding_model = request.embedding_model or settings.ragas_embedding_model
|
||||||
|
effective = request.effective_metrics()
|
||||||
|
requested = set(request.metrics)
|
||||||
|
skipped = sorted(requested - set(effective))
|
||||||
|
|
||||||
|
if not effective:
|
||||||
|
return ScoreResponse(
|
||||||
|
scores={metric_name: None for metric_name in request.metrics},
|
||||||
|
weighted_score=None,
|
||||||
|
latency_ms=0,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
)
|
||||||
|
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
raw_scores = inline_scorer.score(
|
||||||
|
question=request.question,
|
||||||
|
answer=request.answer,
|
||||||
|
contexts=request.contexts_as_list(),
|
||||||
|
ground_truth=request.ground_truth,
|
||||||
|
metrics=effective,
|
||||||
|
judge_model=judge_model,
|
||||||
|
embedding_model=embedding_model,
|
||||||
|
settings=settings,
|
||||||
|
)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
return ScoreResponse(
|
||||||
|
scores={},
|
||||||
|
weighted_score=None,
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
error=f"{type(exc).__name__}: {exc}",
|
||||||
|
)
|
||||||
|
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
|
||||||
|
# Keep skipped metrics visible to callers by emitting them as null scores.
|
||||||
|
all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
|
||||||
|
all_scores.update(raw_scores)
|
||||||
|
|
||||||
|
# 综合加权得分计算(已暂时禁用)
|
||||||
|
# weighted = compute_weighted_score(
|
||||||
|
# {key: value for key, value in raw_scores.items() if value is not None},
|
||||||
|
# {},
|
||||||
|
# )
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"[score] done latency=%dms skipped=%s scores=%s",
|
||||||
|
latency_ms,
|
||||||
|
skipped,
|
||||||
|
{k: (round(v, 4) if v is not None else None) for k, v in all_scores.items()},
|
||||||
|
)
|
||||||
|
return ScoreResponse(
|
||||||
|
scores=all_scores,
|
||||||
|
weighted_score=None, # 综合加权得分已暂时禁用
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
)
|
||||||
89
webapp/api/score_jobs.py
Normal file
89
webapp/api/score_jobs.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
"""Routes for async RAGAS scoring jobs (Dify fire-and-forget integration).
|
||||||
|
|
||||||
|
Dify calls POST /api/score/async → gets job_id immediately (202).
|
||||||
|
Scoring runs in background, result written as a standard run artifact.
|
||||||
|
View full report at GET /api/runs/{run_id} or in the 「运行列表」 page.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from webapp.models import AsyncScoreJobResponse, AsyncScoreJobStatus, ScoreRequest
|
||||||
|
from webapp.services.score_job_manager import score_job_manager
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
logger = logging.getLogger("webapp.api.score_jobs")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/async",
|
||||||
|
status_code=202,
|
||||||
|
response_model=AsyncScoreJobResponse,
|
||||||
|
summary="提交异步评分任务(Dify 推荐方式)",
|
||||||
|
responses={
|
||||||
|
202: {
|
||||||
|
"description": (
|
||||||
|
"任务已排队,立即返回 job_id(202 Accepted)。\n\n"
|
||||||
|
"评分在后台执行,完成后自动生成完整报告(含优化建议)。\n"
|
||||||
|
"通过 `GET /api/score/jobs/{job_id}` 查询状态,"
|
||||||
|
"完成后在「运行列表」页查看完整报告。"
|
||||||
|
),
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"example": {"job_id": "abc123def456", "status": "queued", "run_id": None}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def submit_async_score(request: ScoreRequest) -> AsyncScoreJobResponse:
|
||||||
|
"""提交异步 RAGAS 评分任务,立即返回 job_id。
|
||||||
|
|
||||||
|
**适合 Dify 工作流**:HTTP 节点无需等待评分完成(无超时风险),
|
||||||
|
工作流立即继续,评分结果在 RAGAS 平台「运行列表」中查看。
|
||||||
|
|
||||||
|
评分完成后自动生成:
|
||||||
|
- 各指标得分(`scores.csv`)
|
||||||
|
- 摘要报告(`summary.md`)
|
||||||
|
- LLM 优化建议(`optimization_advice.md`)
|
||||||
|
"""
|
||||||
|
logger.info(
|
||||||
|
"[score_async] submit metrics=%s has_ctx=%s has_gt=%s",
|
||||||
|
request.metrics, bool(request.contexts), bool(request.ground_truth),
|
||||||
|
)
|
||||||
|
status = score_job_manager.submit(request)
|
||||||
|
logger.info("[score_async] queued job_id=%s", status.job_id)
|
||||||
|
return AsyncScoreJobResponse(job_id=status.job_id, status=status.status)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/jobs",
|
||||||
|
response_model=dict,
|
||||||
|
summary="列出所有异步评分记录",
|
||||||
|
)
|
||||||
|
def list_score_jobs() -> dict:
|
||||||
|
"""返回所有异步评分记录,按创建时间倒序排列。"""
|
||||||
|
jobs = score_job_manager.list_jobs()
|
||||||
|
logger.info("[score_jobs] list count=%d", len(jobs))
|
||||||
|
return {"jobs": [j.model_dump() for j in jobs]}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/jobs/{job_id}",
|
||||||
|
response_model=AsyncScoreJobStatus,
|
||||||
|
summary="查询单个异步评分任务状态",
|
||||||
|
responses={404: {"description": "指定 job_id 的评分任务不存在。"}},
|
||||||
|
)
|
||||||
|
def get_score_job(job_id: str) -> AsyncScoreJobStatus:
|
||||||
|
"""查询单个异步评分任务的状态和结果。
|
||||||
|
|
||||||
|
`status` 为 `completed` 时,`run_id` 字段包含对应的运行 ID,
|
||||||
|
可通过 `GET /api/runs/{run_id}` 获取完整评分报告。
|
||||||
|
"""
|
||||||
|
status = score_job_manager.get(job_id)
|
||||||
|
if status is None:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Score job not found: {job_id}")
|
||||||
|
return status
|
||||||
206
webapp/api/session_score_jobs.py
Normal file
206
webapp/api/session_score_jobs.py
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
"""Routes for session-grouped async RAGAS scoring (Dify multi-call integration).
|
||||||
|
|
||||||
|
Use case: Dify evaluates multiple Q&A pairs in a session. Each pair gets its own
|
||||||
|
`POST /api/score/session_async` call with a shared `session_id`. All results are
|
||||||
|
accumulated into one report, visible in 「运行列表」→「报告详情」.
|
||||||
|
|
||||||
|
Key behaviour:
|
||||||
|
- Deterministic run_id: derived from session_id — same session always maps to the
|
||||||
|
same report directory (outputs/score-session/session-<id>/).
|
||||||
|
- Append semantics: each call adds a new sample row. Previous rows are preserved.
|
||||||
|
- Advisor regeneration: optimization_advice.md is regenerated after every call
|
||||||
|
using the full set of accumulated rows.
|
||||||
|
- Each call returns its own `job_id` for individual status polling, plus the
|
||||||
|
shared `run_id` and `session_id`.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
POST /api/score/session_async Submit one call (returns job_id + run_id)
|
||||||
|
GET /api/score/sessions List all sessions
|
||||||
|
GET /api/score/sessions/{session_id} Session aggregate (call_count, metric_means, jobs)
|
||||||
|
GET /api/score/session/jobs/{job_id} Status of one individual call
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from webapp.models import (
|
||||||
|
AsyncScoreJobStatus,
|
||||||
|
ScoreRequest,
|
||||||
|
SessionScoreJobResponse,
|
||||||
|
SessionScoreRequest,
|
||||||
|
SessionStatus,
|
||||||
|
)
|
||||||
|
from webapp.services.session_score_manager import session_score_manager
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
logger = logging.getLogger("webapp.api.session_score_jobs")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/session_async",
|
||||||
|
status_code=202,
|
||||||
|
response_model=SessionScoreJobResponse,
|
||||||
|
summary="提交 Session 异步评分(多样本批量聚合)",
|
||||||
|
description=(
|
||||||
|
"**用途**\n"
|
||||||
|
"- 适合 Dify 循环节点、批量问答评测、同一对话多轮累计评分。\n"
|
||||||
|
"- 相同 `session_id` 的多次调用不会生成多个独立报告,而是持续追加到同一个 session 报告。\n\n"
|
||||||
|
"**请求字段说明**\n"
|
||||||
|
"- `session_id`:会话唯一标识,同一会话必须保持一致。\n"
|
||||||
|
"- `question` / `answer`:本次待评分的问答对。\n"
|
||||||
|
"- `contexts`:检索片段拼接字符串,按 `context_separator` 拆分。\n"
|
||||||
|
"- `ground_truth`:标准答案,可选;缺失时会自动跳过依赖它的指标。\n"
|
||||||
|
"- `metrics`:本次需要计算的指标列表。\n"
|
||||||
|
"- `judge_model` / `embedding_model`:可选;为空时回退到系统默认配置。\n\n"
|
||||||
|
"**处理行为**\n"
|
||||||
|
"1. 服务端立即返回 `202 Accepted`,并生成本次调用的 `job_id`。\n"
|
||||||
|
"2. 系统根据 `session_id` 计算固定 `run_id`,格式为 `session-<sanitized-session_id>`。\n"
|
||||||
|
"3. 本次评分完成后,会向该 session 的 `scores.csv` 追加一行样本数据。\n"
|
||||||
|
"4. 系统会基于当前 session 的全量样本重写 `summary.md`,并重新生成 `optimization_advice.md`。\n"
|
||||||
|
"5. 报告可在「运行列表」中按 `run_id` 查看;同一 session 的后续调用会持续增量更新该报告。\n\n"
|
||||||
|
"**后续查询接口**\n"
|
||||||
|
"- `GET /api/score/session/jobs/{job_id}`:查询本次调用状态与得分。\n"
|
||||||
|
"- `GET /api/score/sessions/{session_id}`:查询整个 session 的累计调用次数、指标均值、所有作业记录。\n"
|
||||||
|
"- `GET /api/runs/{run_id}`:查看完整评估报告内容。\n\n"
|
||||||
|
"**典型请求示例**\n"
|
||||||
|
"```json\n"
|
||||||
|
"{\n"
|
||||||
|
" \"session_id\": \"dify-session-001\",\n"
|
||||||
|
" \"question\": \"单源CT与双源CT在球管配置上有何本质区别?\",\n"
|
||||||
|
" \"answer\": \"单源CT只有一套球管-探测器系统,双源CT有两套独立的球管-探测器系统。\",\n"
|
||||||
|
" \"contexts\": \"双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管\",\n"
|
||||||
|
" \"context_separator\": \" |||| \",\n"
|
||||||
|
" \"metrics\": [\"answer_relevancy\", \"faithfulness\"],\n"
|
||||||
|
" \"judge_model\": \"gpt-5.5\",\n"
|
||||||
|
" \"embedding_model\": \"text-embedding-3-small\"\n"
|
||||||
|
"}\n"
|
||||||
|
"```"
|
||||||
|
),
|
||||||
|
responses={
|
||||||
|
202: {
|
||||||
|
"description": (
|
||||||
|
"调用已排队,立即返回 job_id + run_id(202 Accepted)。\n\n"
|
||||||
|
"相同 `session_id` 的多次调用合并为同一报告,每次调用新增一个样本行。\n"
|
||||||
|
"评分完成后,`summary.md` 和 `optimization_advice.md` 增量更新。\n"
|
||||||
|
"通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态,"
|
||||||
|
"通过 `GET /api/score/session/jobs/{job_id}` 查询单次调用状态,"
|
||||||
|
"在「运行列表」中查看完整报告(run_id 即 `session-<session_id>` 形式)。"
|
||||||
|
),
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"example": {
|
||||||
|
"job_id": "abc123def456",
|
||||||
|
"session_id": "dify-session-001",
|
||||||
|
"run_id": "session-dify-session-001",
|
||||||
|
"status": "queued",
|
||||||
|
"call_count": 1,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def submit_session_async_score(request: SessionScoreRequest) -> SessionScoreJobResponse:
|
||||||
|
"""提交 Session 异步 RAGAS 评分,立即返回 job_id。
|
||||||
|
|
||||||
|
相同 `session_id` 的多次调用合并到同一评估报告中,每次调用:
|
||||||
|
1. 新增一个样本行到 `scores.csv`
|
||||||
|
2. 重写 `summary.md`(包含所有累积样本的指标均值)
|
||||||
|
3. 重新生成 `optimization_advice.md`(基于全量样本的 LLM 优化建议)
|
||||||
|
|
||||||
|
**适合 Dify 工作流**:在循环节点中批量调用,所有轮次共用同一 `session_id`,
|
||||||
|
最终在 RAGAS 平台「运行列表」中查看完整的批量评估报告。
|
||||||
|
"""
|
||||||
|
logger.info(
|
||||||
|
"[session_async] submit session_id=%s metrics=%s has_ctx=%s has_gt=%s",
|
||||||
|
request.session_id,
|
||||||
|
request.metrics,
|
||||||
|
bool(request.contexts),
|
||||||
|
bool(request.ground_truth),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Strip session_id to build a plain ScoreRequest for the manager
|
||||||
|
score_request = ScoreRequest(
|
||||||
|
question=request.question,
|
||||||
|
answer=request.answer,
|
||||||
|
contexts=request.contexts,
|
||||||
|
ground_truth=request.ground_truth,
|
||||||
|
context_separator=request.context_separator,
|
||||||
|
metrics=request.metrics,
|
||||||
|
judge_model=request.judge_model,
|
||||||
|
embedding_model=request.embedding_model,
|
||||||
|
)
|
||||||
|
|
||||||
|
status, run_id = session_score_manager.submit(request.session_id, score_request)
|
||||||
|
|
||||||
|
# Compute call_count from current session state
|
||||||
|
session_status = session_score_manager.get_session(request.session_id)
|
||||||
|
call_count = session_status.call_count if session_status else 1
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"[session_async] queued job_id=%s session_id=%s run_id=%s call=%d",
|
||||||
|
status.job_id, request.session_id, run_id, call_count,
|
||||||
|
)
|
||||||
|
return SessionScoreJobResponse(
|
||||||
|
job_id=status.job_id,
|
||||||
|
session_id=request.session_id,
|
||||||
|
run_id=run_id,
|
||||||
|
status=status.status,
|
||||||
|
call_count=call_count,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/sessions",
|
||||||
|
response_model=dict,
|
||||||
|
summary="列出所有 Session 聚合状态",
|
||||||
|
)
|
||||||
|
def list_sessions() -> dict:
|
||||||
|
"""返回所有 session 的聚合状态,按最近完成时间倒序排列。"""
|
||||||
|
sessions = session_score_manager.list_sessions()
|
||||||
|
logger.info("[session_score] list_sessions count=%d", len(sessions))
|
||||||
|
return {"sessions": [s.model_dump() for s in sessions]}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/sessions/{session_id}",
|
||||||
|
response_model=SessionStatus,
|
||||||
|
summary="查询 Session 聚合状态(指标均值 + 所有调用记录)",
|
||||||
|
responses={404: {"description": "指定 session_id 不存在。"}},
|
||||||
|
)
|
||||||
|
def get_session(session_id: str) -> SessionStatus:
|
||||||
|
"""查询 session 的聚合评分状态。
|
||||||
|
|
||||||
|
返回内容:
|
||||||
|
- `run_id`:在「运行列表」中查看完整报告
|
||||||
|
- `call_count`:本 session 累计调用次数
|
||||||
|
- `metric_means`:所有已累积样本的各指标均值(实时读取 scores.csv)
|
||||||
|
- `jobs`:本 session 所有调用记录列表
|
||||||
|
"""
|
||||||
|
status = session_score_manager.get_session(session_id)
|
||||||
|
if status is None:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Session not found: {session_id}")
|
||||||
|
return status
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/session/jobs/{job_id}",
|
||||||
|
response_model=AsyncScoreJobStatus,
|
||||||
|
summary="查询 Session 单次调用状态",
|
||||||
|
responses={404: {"description": "指定 job_id 不存在。"}},
|
||||||
|
)
|
||||||
|
def get_session_job(job_id: str) -> AsyncScoreJobStatus:
|
||||||
|
"""查询 session 评分中某次调用的状态和评分结果。
|
||||||
|
|
||||||
|
`status` 为 `completed` 时,`run_id` 即所属 session 的报告目录,
|
||||||
|
`scores` 包含本次调用的各指标得分。
|
||||||
|
"""
|
||||||
|
status = session_score_manager.get_job(job_id)
|
||||||
|
if status is None:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404, detail=f"Session score job not found: {job_id}"
|
||||||
|
)
|
||||||
|
return status
|
||||||
490
webapp/models.py
490
webapp/models.py
@@ -2,9 +2,14 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime, timezone
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||||
|
|
||||||
|
|
||||||
|
def _utcnow_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
class RunSummary(BaseModel):
|
class RunSummary(BaseModel):
|
||||||
@@ -68,6 +73,19 @@ class ReportData(BaseModel):
|
|||||||
groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
|
groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
|
||||||
lowest_samples: list[SampleScore] = Field(default_factory=list)
|
lowest_samples: list[SampleScore] = Field(default_factory=list)
|
||||||
summary_markdown: str = ""
|
summary_markdown: str = ""
|
||||||
|
advice_markdown: str = "" # optimization_advice.md content (empty if not generated)
|
||||||
|
weighted_score_mean: float | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="加权综合得分均值(metric_weights × doc_weights 共同作用)。",
|
||||||
|
)
|
||||||
|
metric_weights: dict[str, float] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="该次运行使用的指标权重配置(来自 scenario.snapshot.yaml)。",
|
||||||
|
)
|
||||||
|
doc_weights: dict[str, float] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="该次运行使用的文档权重配置(来自 scenario.snapshot.yaml)。",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class RunDetail(BaseModel):
|
class RunDetail(BaseModel):
|
||||||
@@ -87,6 +105,14 @@ class ScenarioInfo(BaseModel):
|
|||||||
judge_model: str = ""
|
judge_model: str = ""
|
||||||
metrics: list[str] = Field(default_factory=list)
|
metrics: list[str] = Field(default_factory=list)
|
||||||
error: str = ""
|
error: str = ""
|
||||||
|
metric_weights: dict[str, float] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="从场景 YAML 读取的指标权重配置,供前端权重面板预填。",
|
||||||
|
)
|
||||||
|
doc_weights: dict[str, float] = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="从场景 YAML 读取的文档权重配置,供前端权重面板预填。",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TaskStatus(BaseModel):
|
class TaskStatus(BaseModel):
|
||||||
@@ -114,6 +140,70 @@ class TriggerEvaluationResponse(BaseModel):
|
|||||||
task_id: str
|
task_id: str
|
||||||
|
|
||||||
|
|
||||||
|
class LLMProfile(BaseModel):
|
||||||
|
"""A named LLM connection configuration that can be reused across tasks."""
|
||||||
|
|
||||||
|
profile_id: str
|
||||||
|
name: str
|
||||||
|
model: str
|
||||||
|
base_url: str
|
||||||
|
api_key: str
|
||||||
|
timeout_seconds: int = 30
|
||||||
|
created_at: str = Field(default_factory=_utcnow_iso)
|
||||||
|
updated_at: str = Field(default_factory=_utcnow_iso)
|
||||||
|
|
||||||
|
|
||||||
|
class CreateProfileRequest(BaseModel):
|
||||||
|
"""Request body for creating or updating an LLM profile."""
|
||||||
|
|
||||||
|
name: str
|
||||||
|
model: str
|
||||||
|
base_url: str
|
||||||
|
api_key: str
|
||||||
|
timeout_seconds: int = 30
|
||||||
|
|
||||||
|
|
||||||
|
class ProfileApplyRequest(BaseModel):
|
||||||
|
"""Request body to patch LLM profile selections into a scenario YAML."""
|
||||||
|
|
||||||
|
scenario_path: str
|
||||||
|
judge_profile_id: str | None = None
|
||||||
|
answer_profile_id: str | None = None
|
||||||
|
dataset_profile_id: str | None = None
|
||||||
|
metric_weights: dict[str, float] | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="指标权重映射,如 {\"faithfulness\": 0.35}。为 null 时不修改 YAML。",
|
||||||
|
)
|
||||||
|
doc_weights: dict[str, float] | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="文档权重映射,如 {\"doc.pdf\": 2.0}。为 null 时不修改 YAML。",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ProfileApplyResponse(BaseModel):
|
||||||
|
"""Response after patching a scenario YAML with profile settings."""
|
||||||
|
|
||||||
|
scenario_path: str
|
||||||
|
patched_fields: list[str] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class ProfileProbeRequest(BaseModel):
|
||||||
|
"""Inline credentials for testing LLM connectivity without saving a profile."""
|
||||||
|
|
||||||
|
model: str
|
||||||
|
base_url: str
|
||||||
|
api_key: str
|
||||||
|
timeout_seconds: int = 30
|
||||||
|
|
||||||
|
|
||||||
|
class ProfileTestResponse(BaseModel):
|
||||||
|
"""Result of a LLM connectivity test."""
|
||||||
|
|
||||||
|
ok: bool
|
||||||
|
message: str
|
||||||
|
latency_ms: int | None = None
|
||||||
|
|
||||||
|
|
||||||
def jsonable(value: Any) -> Any:
|
def jsonable(value: Any) -> Any:
|
||||||
"""Convert NaN/inf floats into None so the payload stays valid JSON."""
|
"""Convert NaN/inf floats into None so the payload stays valid JSON."""
|
||||||
import math
|
import math
|
||||||
@@ -127,3 +217,401 @@ def jsonable(value: Any) -> Any:
|
|||||||
if isinstance(value, list):
|
if isinstance(value, list):
|
||||||
return [jsonable(item) for item in value]
|
return [jsonable(item) for item in value]
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Full pipeline (build + eval) job models
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class PipelineJobRequest(BaseModel):
    """Request body for launching an end-to-end build + evaluation pipeline job."""

    # Two documented request examples: a full run and a small smoke test.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "summary": "西门子 CT 文档评估(完整参数)",
                    "value": {
                        "docs_path": "datasets/siemens-pdfs",
                        "job_name": "siemens-ct-eval-2026",
                        "generation_model": "qwen3.6-plus",
                        "answer_model": "deepseek-v4-flash",
                        "judge_model": "deepseek-v4-flash",
                        "embedding_model": "text-embedding-v3",
                        "max_questions_per_document": 10,
                        "max_source_chunks_per_question": 3,
                        "max_documents": None,
                        "max_samples": None,
                        "metrics": [
                            "faithfulness",
                            "answer_relevancy",
                            "context_recall",
                            "context_precision",
                        ],
                        "optimization_advisor": False,
                        "failure_mode": "skip",
                    },
                },
                {
                    "summary": "快速冒烟测试(仅 2 份文档、5 道题)",
                    "value": {
                        "docs_path": "datasets/siemens-pdfs",
                        "job_name": "smoke-test",
                        "generation_model": "qwen3.6-plus",
                        "answer_model": "deepseek-v4-flash",
                        "judge_model": "deepseek-v4-flash",
                        "embedding_model": "text-embedding-v3",
                        "max_questions_per_document": 5,
                        "max_source_chunks_per_question": 3,
                        "max_documents": 2,
                        "max_samples": 10,
                        "metrics": ["faithfulness", "answer_relevancy"],
                        "optimization_advisor": False,
                        "failure_mode": "skip",
                    },
                },
            ]
        }
    )

    # --- Input location and naming -------------------------------------
    docs_path: str = Field(
        description="PDF 文档所在文件夹的绝对路径或相对于仓库根目录的相对路径。",
    )
    job_name: str = Field(
        default="",
        description="任务显示名称;留空时系统自动生成唯一标识。",
    )

    # --- Model selection -----------------------------------------------
    generation_model: str = Field(
        default="qwen3.6-plus",
        description="用于从文档片段生成草稿题库的 LLM 模型名称。",
    )
    answer_model: str = Field(
        default="deepseek-v4-flash",
        description="在线评估时调用的答题 LLM 模型名称(siemens_pdf_qa adapter)。",
    )
    judge_model: str = Field(
        default="deepseek-v4-flash",
        description="RAGAS 指标评分时使用的 Judge LLM 模型名称。",
    )
    embedding_model: str = Field(
        default="text-embedding-v3",
        description="RAGAS context-recall / context-precision 使用的 Embedding 模型名称。",
    )

    # --- Size limits (all must be strictly positive when given) ----------
    max_questions_per_document: int = Field(
        default=10,
        gt=0,
        description="每份 PDF 文档最多生成的草稿题目数量。",
    )
    max_source_chunks_per_question: int = Field(
        default=3,
        gt=0,
        description="每道题目最多引用的文档片段(source chunk)数量。",
    )
    max_documents: int | None = Field(
        default=None,
        gt=0,
        description="限制处理的 PDF 文件数量上限(冒烟测试时使用)。",
    )
    max_samples: int | None = Field(
        default=None,
        gt=0,
        description="限制评估的题目数量上限(冒烟测试时使用)。",
    )

    # --- Evaluation behaviour --------------------------------------------
    # NOTE(review): metric names and failure_mode are not validated here.
    metrics: list[str] = Field(
        default_factory=lambda: [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ],
        description=(
            "需要计算的 RAGAS 指标列表。"
            "可选值:faithfulness, answer_relevancy, context_recall, "
            "context_precision, noise_sensitivity, factual_correctness, semantic_similarity。"
        ),
    )
    optimization_advisor: bool = Field(
        default=False,
        description="为 True 时启用 RAGAS 优化建议模块,生成 optimization_advice.md。",
    )
    failure_mode: str = Field(
        default="skip",
        description="PDF 解析失败时的处理策略:skip(跳过继续)或 fail(立即中止)。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineResult(BaseModel):
    """Artifact locations and statistics for a completed pipeline run."""

    # --- Question-bank build stage ---------------------------------------
    build_artifact_dir: str = Field(
        description="题库生成阶段的产物根目录路径。",
    )
    dataset_csv: str = Field(
        description="生成的草稿题库 CSV 文件路径(评估输入)。",
    )
    source_chunks_jsonl: str = Field(
        description="文档片段索引文件路径(在线评估 adapter 使用)。",
    )
    total_questions: int = Field(
        description="成功生成的有效题目总数。",
    )
    parse_failures: int = Field(
        description="文档解析失败的 PDF 数量。",
    )

    # --- Evaluation stage ------------------------------------------------
    eval_run_id: str = Field(
        description="RAGAS 评估运行 ID。",
    )
    eval_output_dir: str = Field(
        description="RAGAS 评估产物根目录路径。",
    )
    scores_csv: str = Field(
        description="每道题目逐项评分的 CSV 文件路径。",
    )
    summary_md: str = Field(
        description="评估结果摘要 Markdown 文件路径。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineJobStatus(BaseModel):
    """State of one end-to-end pipeline job."""

    # Identity.
    job_id: str = Field(
        description="任务唯一标识符。",
    )
    job_name: str = Field(
        description="任务显示名称。",
    )

    # Progress: coarse status plus the current pipeline phase.
    status: str = Field(
        description="任务状态:queued | running | completed | failed。",
    )
    phase: str = Field(
        default="idle",
        description="当前执行阶段:idle | parsing_documents | generating_questions | evaluating | done。",
    )
    logs: list[str] = Field(
        default_factory=list,
        description="实时日志行列表。",
    )

    # Outcome: result and error are both optional and default to None.
    result: PipelineResult | None = Field(
        default=None,
        description="任务完成后填充的产物路径与统计信息。",
    )
    error: str | None = Field(
        default=None,
        description="失败时的错误信息。",
    )

    # Timestamps; both default to the empty string.
    created_at: str = Field(
        default="",
        description="任务创建时间(ISO 8601 UTC)。",
    )
    finished_at: str = Field(
        default="",
        description="任务结束时间(ISO 8601 UTC)。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineJobResponse(BaseModel):
    """Immediate response returned after a pipeline job is queued."""

    # Identifier the client polls with.
    job_id: str = Field(
        description="任务唯一标识符,用于后续轮询状态。",
    )
    job_name: str = Field(
        description="任务显示名称。",
    )
    # Defaults to "queued" when not supplied.
    status: str = Field(
        default="queued",
        description="初始状态,通常为 queued。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Dify 实时评分 API 模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# 需要 ground_truth 才能计算的指标集合
|
||||||
|
_GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
|
||||||
|
"context_recall",
|
||||||
|
"factual_correctness",
|
||||||
|
"semantic_similarity",
|
||||||
|
"noise_sensitivity",
|
||||||
|
})
|
||||||
|
|
||||||
|
# 需要 contexts 才能计算的指标集合
|
||||||
|
_CONTEXT_DEPENDENT_METRICS: frozenset[str] = frozenset({
|
||||||
|
"faithfulness",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
"noise_sensitivity",
|
||||||
|
})
|
||||||
|
|
||||||
|
# 所有合法指标名称
|
||||||
|
_VALID_METRICS: frozenset[str] = frozenset({
|
||||||
|
"faithfulness",
|
||||||
|
"answer_relevancy",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
"noise_sensitivity",
|
||||||
|
"factual_correctness",
|
||||||
|
"semantic_similarity",
|
||||||
|
})
|
||||||
|
|
||||||
|
_DEFAULT_SCORE_METRICS: list[str] = [
|
||||||
|
"faithfulness",
|
||||||
|
"answer_relevancy",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreRequest(BaseModel):
    """Request body for the real-time single-sample scoring endpoint."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "question": "双源CT的时间分辨率是多少?",
                "answer": "双源CT的单扇区时间分辨率为75ms。",
                "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
                "ground_truth": "双源CT单扇区时间分辨率为75ms,需旋转135度。",
                "context_separator": " |||| ",
                "metrics": [
                    "faithfulness",
                    "answer_relevancy",
                    "context_recall",
                    "context_precision",
                ],
                "judge_model": "gpt-5",
                "embedding_model": "text-embedding-3-small",
            }
        }
    )

    question: str = Field(description="问题文本。")
    answer: str = Field(description="待评分的回答。")
    contexts: str | None = Field(
        default=None,
        description="检索上下文字符串,多段之间用 context_separator 拼接。缺失时自动跳过依赖检索内容的指标(faithfulness、context_recall、context_precision、noise_sensitivity)。",
    )
    ground_truth: str | None = Field(
        default=None,
        description="标准参考答案(可选)。缺失时自动跳过需要它的指标。",
    )
    context_separator: str = Field(
        default=" |||| ",
        description="contexts 字段中段落分隔符,默认为四个竖线两侧各一空格。",
    )
    metrics: list[str] = Field(
        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
        description="需要计算的 RAGAS 指标列表。",
    )
    judge_model: str | None = Field(
        default=None,
        description="Judge LLM 模型名称;为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
    )
    embedding_model: str | None = Field(
        default=None,
        description="Embedding 模型名称;为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
    )

    @field_validator("metrics")
    @classmethod
    def validate_metric_names(cls, value: list[str]) -> list[str]:
        """Reject unknown metric names and empty metric lists.

        Args:
            value: Metric names supplied by the client.

        Returns:
            The same list, unchanged, when every name is supported.

        Raises:
            ValueError: If a name is not in ``_VALID_METRICS`` or the list is empty.
        """
        invalid = [metric_name for metric_name in value if metric_name not in _VALID_METRICS]
        if invalid:
            raise ValueError(
                f"不支持的指标名称:{invalid}。"
                f"合法值:{sorted(_VALID_METRICS)}"
            )
        if not value:
            raise ValueError("metrics 不能为空列表。")
        return value

    def contexts_as_list(self) -> list[str]:
        """Split the contexts string into a list of non-empty fragments.

        Fragments are stripped of surrounding whitespace. An empty
        ``context_separator`` falls back to the default ``" |||| "``.

        Returns:
            The fragments in order; an empty list when contexts is None or blank.
        """
        if not self.contexts:
            return []
        separator = self.context_separator or " |||| "
        return [part.strip() for part in self.contexts.split(separator) if part.strip()]

    def effective_metrics(self) -> list[str]:
        """Return the requested metrics that can be computed from the inputs.

        Metrics needing a ground truth are dropped when ``ground_truth`` is
        None or blank, and metrics needing retrieval contexts are dropped
        when ``contexts`` yields no usable fragment. Blank strings are
        treated like missing values so that an empty placeholder does not
        cause a metric to be scored against empty input.

        Returns:
            The surviving metric names, in the order they were requested.
        """
        has_ground_truth = bool(self.ground_truth and self.ground_truth.strip())
        # Use the same notion of "no contexts" as contexts_as_list(), so a
        # whitespace-only or separator-only string counts as absent.
        has_contexts = bool(self.contexts_as_list())
        return [
            metric_name
            for metric_name in self.metrics
            if (has_ground_truth or metric_name not in _GT_DEPENDENT_METRICS)
            and (has_contexts or metric_name not in _CONTEXT_DEPENDENT_METRICS)
        ]
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreResponse(BaseModel):
    """Response payload for the real-time scoring endpoint."""

    # Per-metric scores; a value may be None.
    scores: dict[str, float | None] = Field(
        description="各指标得分(NaN 或计算失败时为 null)。"
    )
    # Aggregate score; optional, defaults to None.
    weighted_score: float | None = Field(
        default=None,
        description="等权加权综合得分(仅对非 null 指标求均值)。",
    )
    latency_ms: int = Field(description="服务端打分耗时(毫秒)。")
    # The request model drops metrics for a missing ground_truth and for
    # missing contexts, so the description names both causes.
    # NOTE(review): confirm the route reports context-skipped metrics here too.
    skipped_metrics: list[str] = Field(
        default_factory=list,
        description="因缺少 ground_truth 或 contexts 而跳过的指标名称列表。",
    )
    error: str | None = Field(
        default=None,
        description="打分异常时的错误信息(HTTP 200 仍返回,scores 为空)。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 异步评分记录模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AsyncScoreJobResponse(BaseModel):
    """Immediate 202 response after submitting an async score job."""

    # Identifier used to look the job up later.
    job_id: str = Field(
        description="任务唯一标识符,用于后续查询结果。",
    )
    # Defaults to "queued" when not supplied.
    status: str = Field(
        default="queued",
        description="初始状态:queued。",
    )
    # Optional run id; defaults to None.
    run_id: str | None = Field(
        default=None,
        description="评分完成后写入的 Run ID,可在「运行列表」中查看完整报告。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Session async 评分模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class SessionScoreRequest(ScoreRequest):
    """Request body for session-grouped async scoring.

    All calls sharing the same session_id are accumulated into one report.
    Each call adds a new sample row to the session's scores.csv.
    """

    # Replaces the parent's schema example with a session-oriented one.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "summary": "Dify 会话批量评分",
                    "value": {
                        "session_id": "dify-session-001",
                        "question": "单源CT与双源CT在球管配置上有何本质区别?",
                        "answer": "单源CT只有一套球管-探测器系统,双源CT有两套独立的球管-探测器系统。",
                        "contexts": "双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管",
                        "context_separator": " |||| ",
                        "metrics": ["answer_relevancy", "faithfulness"],
                        "judge_model": "gpt-5.5",
                        "embedding_model": "text-embedding-3-small",
                    },
                }
            ]
        }
    )

    # Required grouping key; the only field added on top of ScoreRequest.
    session_id: str = Field(
        description=(
            "会话唯一标识符。相同 session_id 的多次调用合并为同一报告,"
            "每次调用新增一个样本行,指标均值和优化建议在每次调用后增量更新。"
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class SessionScoreJobResponse(BaseModel):
    """Immediate 202 response after submitting a session scoring call."""

    # Identifiers: this call, its session, and the session's report run.
    job_id: str = Field(
        description="本次调用的任务唯一标识符。",
    )
    session_id: str = Field(
        description="会话标识符。",
    )
    run_id: str = Field(
        description="本 session 对应的报告 Run ID,可在「运行列表」中查看。",
    )
    # Defaults to "queued" when not supplied.
    status: str = Field(
        default="queued",
        description="初始状态:queued。",
    )
    # Defaults to 1 when not supplied.
    call_count: int = Field(
        default=1,
        description="本 session 当前累计调用次数(包含本次)。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class SessionStatus(BaseModel):
    """Aggregate status and metrics for a scoring session."""

    session_id: str = Field(description="会话标识符。")
    run_id: str = Field(description="对应报告目录的 Run ID。")
    call_count: int = Field(description="本 session 累计调用次数。")
    # Per-metric means; a value may be None.
    metric_means: dict[str, float | None] = Field(
        default_factory=dict, description="所有已累积样本的各指标均值。"
    )
    latest_finished_at: str = Field(default="", description="最近一次评分完成时间(ISO 8601 UTC)。")
    # AsyncScoreJobStatus is declared further down in this module, so it is
    # written as a string forward reference. That keeps this class definition
    # valid whether or not annotations are evaluated lazily.
    jobs: list["AsyncScoreJobStatus"] = Field(
        default_factory=list, description="本 session 所有调用记录,按创建时间排序。"
    )
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncScoreJobStatus(BaseModel):
    """State of one async score job (queued → running → completed/failed)."""

    # Identity and lifecycle.
    job_id: str = Field(
        description="任务唯一标识符。",
    )
    status: str = Field(
        description="queued | running | completed | failed",
    )
    created_at: str = Field(
        default="",
        description="创建时间(ISO 8601 UTC)。",
    )
    finished_at: str = Field(
        default="",
        description="完成时间(ISO 8601 UTC)。",
    )
    run_id: str | None = Field(
        default=None,
        description="完成后对应的 Run ID,可通过 GET /api/runs/{run_id} 查看完整报告。",
    )

    # Snapshot of the request that created the job (free-form dict).
    request_summary: dict = Field(
        default_factory=dict,
        description="请求参数快照(question 前80字、metrics、judge_model 等)。",
    )

    # Scoring outcome; every field has a default.
    scores: dict[str, float | None] = Field(
        default_factory=dict,
        description="各指标得分。",
    )
    weighted_score: float | None = Field(
        default=None,
        description="加权综合得分。",
    )
    latency_ms: int = Field(
        default=0,
        description="评分耗时毫秒。",
    )
    skipped_metrics: list[str] = Field(
        default_factory=list,
    )
    error: str | None = Field(
        default=None,
    )
|
||||||
|
|||||||
136
webapp/server.py
136
webapp/server.py
@@ -7,28 +7,152 @@ the server starts even when the evaluation dependencies are not yet installed.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI, Request
|
||||||
from fastapi.responses import FileResponse
|
from fastapi.encoders import jsonable_encoder
|
||||||
|
from fastapi.exceptions import RequestValidationError
|
||||||
|
from fastapi.responses import FileResponse, JSONResponse
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
|
||||||
from webapp.api import evaluations, runs, scenarios
|
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score, score_jobs, session_score_jobs
|
||||||
|
|
||||||
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||||
|
logger = logging.getLogger("webapp.server")
|
||||||
|
access_logger = logging.getLogger("webapp.access")
|
||||||
|
|
||||||
|
# Tag metadata handed to FastAPI via `openapi_tags`; each entry pairs a tag
# name with the Markdown description shown for that group in /docs.
OPENAPI_TAGS = [
    dict(
        name="pipeline",
        description=(
            "**全链路评估 Pipeline API**\n\n"
            "一次调用完成「解析文档 → 生成题库 → RAGAS 评估 → 输出报告」全流程。\n\n"
            "**使用流程**\n"
            "1. `POST /api/pipeline/jobs` 提交任务,立即拿到 `job_id`。\n"
            "2. `GET /api/pipeline/jobs/{job_id}` 轮询 `status` / `phase` / `logs`。\n"
            "3. 当 `status=completed` 时,`result` 字段包含所有产物路径。\n\n"
            "**Pipeline 阶段**\n"
            "| phase | 说明 |\n"
            "|-------|------|\n"
            "| `parsing_documents` | 调用阿里云 DocMind 解析每份 PDF |\n"
            "| `generating_questions` | LLM 从文档片段生成草稿题库 |\n"
            "| `evaluating` | RAGAS 在线评测打分 |\n"
            "| `done` | 所有产物写入磁盘,任务完成 |"
        ),
    ),
    dict(
        name="evaluations",
        description=(
            "**单场景评估 API**\n\n"
            "基于已有 YAML 场景文件触发评估任务,并查询任务状态与日志。"
        ),
    ),
    dict(
        name="llm-profiles",
        description=(
            "**LLM 配置管理 API**\n\n"
            "增删改查已保存的 LLM 连接配置(模型名称、Base URL、API Key);"
            "支持连通性测试;可将配置一键写入场景 YAML 文件。"
        ),
    ),
    dict(
        name="runs",
        description="**评估运行列表 API**\n\n查询历史评估运行记录及详细报告数据。",
    ),
    dict(
        name="scenarios",
        description="**场景文件 API**\n\n扫描并列出 `scenarios/` 目录下所有可用的 YAML 场景文件。",
    ),
    dict(
        name="score",
        description=(
            "**实时评分 API(同步)** — `POST /api/score`\n\n"
            "**异步评分 API(Dify 推荐)** — `POST /api/score/async`\n\n"
            "异步方式立即返回 job_id(202),评分在后台执行,完成后自动生成完整报告(含优化建议),"
            "在「运行列表」页查看。\n\n"
            "**Session 批量评分 API** — `POST /api/score/session_async`\n\n"
            "适合 Dify 循环节点批量评估:同一 `session_id` 的多次调用合并为一个报告,"
            "每次调用新增一个样本行,指标均值和优化建议增量更新。\n\n"
            "**Session 模式调用流程**\n"
            "1. `POST /api/score/session_async` 提交一条问答评分请求。\n"
            "2. 用 `GET /api/score/session/jobs/{job_id}` 轮询单次调用状态。\n"
            "3. 用 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n"
            "4. 用 `GET /api/runs/{run_id}` 或在「运行列表」中查看完整报告。\n\n"
            "通过 `GET /api/score/jobs` 列出所有异步评分记录,"
            "`GET /api/score/jobs/{job_id}` 查询单个任务状态。\n\n"
            "**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
            "`Authorization: Bearer <token>` 请求头。"
        ),
    ),
    dict(
        name="meta",
        description="**系统 API**\n\n健康检查等基础接口。",
    ),
]
|
||||||
|
|
||||||
|
|
||||||
def create_app() -> FastAPI:
|
def create_app() -> FastAPI:
|
||||||
"""Build and configure the FastAPI application instance."""
|
"""Build and configure the FastAPI application instance."""
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="Siemens RAGAS 评估控制台",
|
title="Siemens RAGAS 评估平台",
|
||||||
description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
|
description=(
|
||||||
version="0.1.0",
|
"西门子医疗影像 RAG 评估平台 API 文档。\n\n"
|
||||||
|
"提供以下能力:\n"
|
||||||
|
"- **Pipeline API** — 一键完成「解析文档 → 生成题库 → RAGAS 评估」全链路\n"
|
||||||
|
"- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
|
||||||
|
"- **评估 API** — 基于 YAML 场景文件触发单次评估\n"
|
||||||
|
"- **LLM 配置 API** — 管理多个 LLM 连接配置,支持连通性测试\n"
|
||||||
|
"- **报告 API** — 查询历史运行记录与评估报告\n\n"
|
||||||
|
"> **快速开始**:调用 `POST /api/pipeline/jobs` 传入 PDF 文件夹路径即可启动完整评估流程。"
|
||||||
|
),
|
||||||
|
version="0.3.0",
|
||||||
|
openapi_tags=OPENAPI_TAGS,
|
||||||
)
|
)
|
||||||
|
|
||||||
app.include_router(runs.router)
|
app.include_router(runs.router)
|
||||||
app.include_router(scenarios.router)
|
app.include_router(scenarios.router)
|
||||||
app.include_router(evaluations.router)
|
app.include_router(evaluations.router)
|
||||||
|
app.include_router(llm_profiles.router)
|
||||||
|
app.include_router(pipeline.router)
|
||||||
|
app.include_router(score.router)
|
||||||
|
app.include_router(score_jobs.router)
|
||||||
|
app.include_router(session_score_jobs.router)
|
||||||
|
|
||||||
|
@app.middleware("http")
async def access_log_middleware(request: Request, call_next):
    """Record method, path, status code and latency for each handled request.

    Requests for the index page, the favicon and anything under /static/
    are written at DEBUG level; everything else is written at INFO.
    """
    started = time.monotonic()
    response = await call_next(request)
    elapsed_ms = int((time.monotonic() - started) * 1000)

    url_path = request.url.path
    # Static assets are noisy, so they are demoted to DEBUG.
    quiet = url_path in ("/", "/favicon.ico") or url_path.startswith("/static/")
    access_logger.log(
        logging.DEBUG if quiet else logging.INFO,
        "%s %s → %d (%dms)",
        request.method,
        url_path,
        response.status_code,
        elapsed_ms,
    )
    return response
|
||||||
|
|
||||||
|
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError) -> JSONResponse:
    """Log the details of a request validation failure and answer with HTTP 422.

    The encoded error list is both written to the server log (with the
    request path and content type) and returned under the "detail" key.
    """
    detail = jsonable_encoder(exc.errors())
    content_type = request.headers.get("content-type", "")
    logger.warning(
        "[422] validation error url=%s content_type=%s errors=%s",
        request.url.path,
        content_type,
        detail,
    )
    body = {"detail": detail}
    return JSONResponse(status_code=422, content=body)
|
||||||
|
|
||||||
@app.get("/api/health", tags=["meta"])
|
@app.get("/api/health", tags=["meta"])
|
||||||
def health() -> dict[str, str]:
|
def health() -> dict[str, str]:
|
||||||
|
|||||||
118
webapp/services/inline_scorer.py
Normal file
118
webapp/services/inline_scorer.py
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
"""LLM-cached inline RAGAS scorer for the real-time /api/score endpoint.
|
||||||
|
|
||||||
|
A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by
|
||||||
|
(judge_model, embedding_model), so repeated Dify Tool calls with the same
|
||||||
|
models reuse existing AsyncOpenAI connections instead of creating new ones.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rag_eval.compat import ensure_ragas_import_compat
|
||||||
|
from rag_eval.metrics.factory import build_models
|
||||||
|
from rag_eval.metrics.pipeline import MetricPipeline
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from rag_eval.shared.models import NormalizedSample
|
||||||
|
|
||||||
|
ensure_ragas_import_compat()
|
||||||
|
|
||||||
|
from ragas.metrics.collections import ( # noqa: E402
|
||||||
|
AnswerRelevancy,
|
||||||
|
ContextPrecision,
|
||||||
|
ContextRecall,
|
||||||
|
FactualCorrectness,
|
||||||
|
Faithfulness,
|
||||||
|
NoiseSensitivity,
|
||||||
|
SemanticSimilarity,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]:
    """Instantiate only the RAGAS metric objects requested.

    Construction is deferred behind zero-argument factories so that a
    metric which was not asked for is never built.

    Args:
        metrics: Metric names to build; names without a factory are ignored.
        llm: Judge LLM client passed to the LLM-based metrics.
        embeddings: Embedding client passed to the embedding-based metrics.

    Returns:
        A mapping of metric name to metric instance, in the order the names
        first appear in ``metrics``.
    """
    factories: dict[str, Any] = {
        "faithfulness": lambda: Faithfulness(llm=llm),
        "answer_relevancy": lambda: AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": lambda: ContextRecall(llm=llm),
        "context_precision": lambda: ContextPrecision(llm=llm),
        "noise_sensitivity": lambda: NoiseSensitivity(llm=llm),
        "factual_correctness": lambda: FactualCorrectness(llm=llm),
        "semantic_similarity": lambda: SemanticSimilarity(embeddings=embeddings),
    }
    instances: dict[str, Any] = {}
    for name in metrics:
        factory = factories.get(name)
        # Skip unknown names, and build a repeated name only once.
        if factory is not None and name not in instances:
            instances[name] = factory()
    return instances
|
||||||
|
|
||||||
|
|
||||||
|
class InlineScorer:
    """Single-sample RAGAS scorer that reuses model clients between calls.

    Access to the client cache is guarded by a lock.
    """

    def __init__(self) -> None:
        """Create an empty client cache and the lock that guards it."""
        self._lock = threading.Lock()
        # (judge_model, embedding_model) -> (llm, embeddings)
        self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}

    def invalidate_cache(self) -> None:
        """Drop every cached client pair so the next lookup rebuilds them."""
        with self._lock:
            self._model_cache.clear()

    def _get_models(
        self,
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> tuple[Any, Any]:
        """Look up, or build and remember, the clients for a model pair.

        The cache key is (judge_model, embedding_model) only, so ``settings``
        is consulted just when a pair is built for the first time; use
        invalidate_cache() to force a rebuild.
        """
        key = (judge_model, embedding_model)
        with self._lock:
            if key in self._model_cache:
                return self._model_cache[key]
            llm, embeddings = build_models(judge_model, embedding_model, settings)
            pair = (llm, embeddings)
            self._model_cache[key] = pair
            return pair

    def score(
        self,
        question: str,
        answer: str,
        contexts: list[str],
        ground_truth: str | None,
        metrics: list[str],
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> dict[str, float | None]:
        """Score a single sample and return a {metric_name: score | None} mapping.

        The scoring coroutine is driven with asyncio.run(), so the call blocks
        until all metrics finish. Non-finite scores (NaN, +/-inf) are reported
        as None; finite scores are rounded to four decimal places.
        """
        llm, embeddings = self._get_models(judge_model, embedding_model, settings)
        pipeline = MetricPipeline(
            metrics=_build_metric_instances(metrics, llm, embeddings),
            metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
        )

        # A missing ground truth is passed on as an empty string.
        reference = ground_truth or ""
        sample = NormalizedSample(
            sample_id="inline-score",
            question=question,
            answer=answer,
            contexts=contexts,
            ground_truth=reference,
        )

        outcome = asyncio.run(pipeline.score_sample(sample))

        cleaned: dict[str, float | None] = {}
        for name, value in outcome.metrics.items():
            # NaN and infinities become None so the payload stays valid JSON.
            cleaned[name] = round(value, 4) if math.isfinite(value) else None
        return cleaned


# Shared module-level instance used by the FastAPI routes.
inline_scorer = InlineScorer()
|
||||||
257
webapp/services/pipeline_task_manager.py
Normal file
257
webapp/services/pipeline_task_manager.py
Normal file
@@ -0,0 +1,257 @@
|
|||||||
|
"""Background task manager for end-to-end pipeline jobs (build + eval).
|
||||||
|
|
||||||
|
Each job runs three sequential phases inside a worker thread:
|
||||||
|
1. parsing_documents — AliyunDocmind parses every PDF
|
||||||
|
2. generating_questions — LLM generates a draft question bank
|
||||||
|
3. evaluating — RAGAS online evaluation scores each question
|
||||||
|
|
||||||
|
The DatasetBuildJob and Scenario objects are constructed entirely from the
|
||||||
|
API request parameters, so no YAML config files are needed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import threading
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from contextlib import redirect_stderr, redirect_stdout
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from webapp.models import (
|
||||||
|
PipelineJobRequest,
|
||||||
|
PipelineJobStatus,
|
||||||
|
PipelineResult,
|
||||||
|
)
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_PIPELINE_OUTPUT_ROOT = _REPO_ROOT / "outputs" / "pipeline"
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
class _LineCapture(io.TextIOBase):
|
||||||
|
"""Write-only stream that appends complete lines to a task's log buffer."""
|
||||||
|
|
||||||
|
def __init__(self, sink: "PipelineTask") -> None:
|
||||||
|
self._sink = sink
|
||||||
|
self._buffer = ""
|
||||||
|
|
||||||
|
def write(self, text: str) -> int:
|
||||||
|
self._buffer += text
|
||||||
|
while "\n" in self._buffer:
|
||||||
|
line, self._buffer = self._buffer.split("\n", 1)
|
||||||
|
self._sink.append_log(line)
|
||||||
|
return len(text)
|
||||||
|
|
||||||
|
def flush(self) -> None:
|
||||||
|
if self._buffer:
|
||||||
|
self._sink.append_log(self._buffer)
|
||||||
|
self._buffer = ""
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineTask:
    """In-memory, mutable record of a single pipeline job (build + eval)."""

    def __init__(self, job_id: str, job_name: str) -> None:
        """Start a job record in the queued/idle state with an empty log."""
        self._lock = threading.Lock()
        self.job_id = job_id
        self.job_name = job_name
        self.created_at = _now_iso()
        self.finished_at = ""
        self.status = "queued"
        self.phase = "idle"
        self.logs: list[str] = []
        self.result: PipelineResult | None = None
        self.error: str | None = None

    def append_log(self, line: str) -> None:
        """Append one line to the job log while holding the task lock."""
        with self._lock:
            self.logs.append(line)

    def snapshot(self) -> PipelineJobStatus:
        """Return the current state as a PipelineJobStatus.

        The fields are read under the task lock and the log list is copied,
        so later appends do not change the returned object's log.
        """
        with self._lock:
            state = dict(
                job_id=self.job_id,
                job_name=self.job_name,
                status=self.status,
                phase=self.phase,
                logs=list(self.logs),
                result=self.result,
                error=self.error,
                created_at=self.created_at,
                finished_at=self.finished_at,
            )
        return PipelineJobStatus(**state)
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineTaskManager:
    """Owns the thread pool and registry of pipeline jobs.

    Jobs are scheduled on a ``ThreadPoolExecutor``; each job's stdout/stderr is
    captured line by line into its ``PipelineTask`` log.
    """

    # redirect_stdout/redirect_stderr replace the *process-wide* sys.stdout and
    # sys.stderr.  If two workers enter and leave the redirect in interleaved
    # order, the second one to exit restores the first worker's capture object,
    # leaving sys.stdout permanently pointing at a finished job, and output of
    # concurrent jobs ends up in each other's logs.  The redirected section is
    # therefore serialised; jobs submitted meanwhile stay "queued".
    _stdio_lock = threading.Lock()

    def __init__(self, max_workers: int = 2) -> None:
        """Create the executor and an empty job registry.

        Args:
            max_workers: Size of the worker thread pool.
        """
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._tasks: dict[str, PipelineTask] = {}
        self._lock = threading.Lock()

    def submit(self, request: PipelineJobRequest) -> PipelineTask:
        """Register and schedule a new pipeline job; return its task object."""
        job_id = uuid.uuid4().hex[:12]
        # Fall back to a generated name when the request's name is blank.
        job_name = request.job_name.strip() or f"pipeline-{job_id[:6]}"
        task = PipelineTask(job_id=job_id, job_name=job_name)
        with self._lock:
            self._tasks[job_id] = task
        self._executor.submit(self._run, task, request)
        return task

    def get(self, job_id: str) -> PipelineJobStatus | None:
        """Return a status snapshot for ``job_id``, or ``None`` if it is unknown."""
        with self._lock:
            task = self._tasks.get(job_id)
        return task.snapshot() if task is not None else None

    def list_jobs(self) -> list[PipelineJobStatus]:
        """Return snapshots of all registered jobs, newest first by ``created_at``."""
        with self._lock:
            tasks = list(self._tasks.values())
        snapshots = [t.snapshot() for t in tasks]
        snapshots.sort(key=lambda s: s.created_at, reverse=True)
        return snapshots

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #

    def _run(self, task: PipelineTask, request: PipelineJobRequest) -> None:
        """Execute the full pipeline end to end inside a worker thread.

        Any ``Exception`` raised by the pipeline is recorded on the task
        (``status="failed"``, ``error`` set) instead of propagating.
        """
        capture = _LineCapture(task)
        try:
            with self._stdio_lock:
                with task._lock:
                    task.status = "running"
                task.append_log(f"[{_now_iso()}] 开始 pipeline 任务: {task.job_name}")
                try:
                    with redirect_stdout(capture), redirect_stderr(capture):
                        result = self._execute(task, request)
                finally:
                    # Emit a trailing partial line on both success and failure.
                    capture.flush()

            finished = _now_iso()
            task.append_log(f"[{finished}] pipeline 任务完成: {task.job_name}")
            # Publish all terminal fields together so a snapshot never shows
            # a "completed" job without its result or finish time.
            with task._lock:
                task.result = result
                task.phase = "done"
                task.finished_at = finished
                task.status = "completed"
        except Exception as exc:  # noqa: BLE001
            error = f"{type(exc).__name__}: {exc}"
            finished = _now_iso()
            task.append_log(f"[{finished}] pipeline 任务失败: {error}")
            with task._lock:
                task.error = error
                task.finished_at = finished
                task.status = "failed"

    def _execute(self, task: PipelineTask, req: PipelineJobRequest) -> PipelineResult:
        """Run build then eval, updating task.phase as we go.

        Raises:
            ValueError: If ``req.docs_path`` does not resolve to an existing directory.
            RuntimeError: If the build phase produced no draft samples.
        """

        # ── resolve paths ──────────────────────────────────────────────
        docs_path = Path(req.docs_path)
        if not docs_path.is_absolute():
            # Relative paths are interpreted against the repository root.
            docs_path = (_REPO_ROOT / docs_path).resolve()
        if not docs_path.is_dir():
            raise ValueError(f"docs_path is not an existing directory: {docs_path}")

        job_output_dir = _PIPELINE_OUTPUT_ROOT / task.job_id
        build_artifact_dir = job_output_dir / "build"
        dataset_csv = job_output_dir / "generated_dataset.csv"
        eval_output_dir = job_output_dir / "eval"

        # ── phase 1 + 2: dataset build (parse & generate) ─────────────
        task.phase = "parsing_documents"
        task.append_log(f" [build] 扫描文档目录: {docs_path}")
        build_result = self._run_build(task, req, docs_path, build_artifact_dir, dataset_csv)

        source_chunks_jsonl = build_artifact_dir / "latest" / "source_chunks.jsonl"
        total_q = len(build_result.draft_samples)
        parse_failures = len(build_result.parse_failures)
        task.append_log(f" [build] 题库生成完毕: {total_q} 道题目, {parse_failures} 份文档解析失败")

        if total_q == 0:
            # Nothing to evaluate; fail the job rather than run an empty evaluation.
            raise RuntimeError("题库为空(所有文档均解析或生成失败),中止评估。")

        # ── phase 3: evaluation ────────────────────────────────────────
        task.phase = "evaluating"
        task.append_log(f" [eval] 开始 RAGAS 评估,共 {total_q} 道题目")
        eval_result = self._run_eval(task, req, dataset_csv, source_chunks_jsonl, eval_output_dir)

        from rag_eval.reporting.artifacts import build_artifact_paths as _build_eval_paths
        eval_artifact_paths = _build_eval_paths(eval_result.scenario.output_dir, eval_result.run_id)

        return PipelineResult(
            build_artifact_dir=build_artifact_dir.as_posix(),
            dataset_csv=dataset_csv.as_posix(),
            source_chunks_jsonl=source_chunks_jsonl.as_posix(),
            total_questions=total_q,
            parse_failures=parse_failures,
            eval_run_id=eval_result.run_id,
            eval_output_dir=eval_result.scenario.output_dir.as_posix(),
            scores_csv=eval_artifact_paths.scores_csv.as_posix(),
            summary_md=eval_artifact_paths.summary_md.as_posix(),
        )

    def _run_build(self, task: PipelineTask, req: PipelineJobRequest,
                   docs_path: Path, artifact_dir: Path, dataset_csv: Path):
        """Construct DatasetBuildJob and run the build phase.

        Returns whatever ``execute_dataset_build_job`` returns; ``_execute``
        reads its ``draft_samples`` and ``parse_failures`` attributes.
        """
        # Imported lazily, inside the worker, rather than at module import time.
        from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
        from rag_eval.dataset_builder.runner import execute_dataset_build_job
        from rag_eval.settings import EvaluationSettings

        settings = EvaluationSettings()
        job = DatasetBuildJob(
            job_name=task.job_name,
            input_path=docs_path,
            input_glob="*.pdf",
            parser_provider="aliyun_docmind",
            failure_mode=req.failure_mode,  # type: ignore[arg-type]
            generation_model=req.generation_model,
            output_type="online_question_bank",
            review_mode="draft_with_manual_review",
            max_questions_per_document=req.max_questions_per_document,
            max_source_chunks_per_question=req.max_source_chunks_per_question,
            dataset_path=dataset_csv,
            artifact_dir=artifact_dir,
            runtime=DatasetBuildRuntime(max_documents=req.max_documents),
        )
        return execute_dataset_build_job(job, settings=settings)

    def _run_eval(self, task: PipelineTask, req: PipelineJobRequest,
                  dataset_csv: Path, source_chunks_jsonl: Path, eval_output_dir: Path):
        """Construct Scenario and run the evaluation phase.

        Returns whatever ``run_scenario_from_scenario_obj`` returns; ``_execute``
        reads its ``run_id`` and ``scenario.output_dir`` attributes.
        """
        from rag_eval.execution.runner import run_scenario_from_scenario_obj
        from rag_eval.settings import EvaluationSettings
        from rag_eval.shared.models import (
            AppAdapterConfig, DatasetConfig, RuntimeConfig, Scenario,
        )

        settings = EvaluationSettings()
        scenario = Scenario(
            scenario_name=task.job_name,
            mode="online",
            dataset=DatasetConfig(path=dataset_csv),
            judge_model=req.judge_model,
            embedding_model=req.embedding_model,
            metrics=list(req.metrics),
            output_dir=eval_output_dir,
            runtime=RuntimeConfig(
                batch_size=4,
                app_concurrency=2,
                metric_concurrency=2,
                max_samples=req.max_samples,
            ),
            app_adapter=AppAdapterConfig(
                type="python",
                callable="apps.siemens_pdf_qa.adapter:run",
                static_kwargs={
                    "source_chunks_path": source_chunks_jsonl,
                    "model": req.answer_model,
                },
            ),
            optimization_advisor=req.optimization_advisor,
        )
        return run_scenario_from_scenario_obj(scenario, settings=settings)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by the FastAPI routes.
|
||||||
|
pipeline_task_manager = PipelineTaskManager()
|
||||||
137
webapp/services/profile_manager.py
Normal file
137
webapp/services/profile_manager.py
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
"""In-memory + JSON-file LLM profile manager.
|
||||||
|
|
||||||
|
Profiles are kept in a dict keyed by profile_id and written to a JSON file
|
||||||
|
on every mutation, so they survive server restarts. The pattern mirrors
|
||||||
|
TaskManager but without threading concerns beyond a simple lock (profiles
|
||||||
|
are only mutated by API calls in FastAPI request handlers).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import threading
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from webapp.models import LLMProfile
|
||||||
|
|
||||||
|
|
||||||
|
_DEFAULT_STORE = Path(__file__).resolve().parents[2] / "configs" / "llm_profiles.json"
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a UTC offset."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
class ProfileManager:
    """Manages LLM profiles with in-memory cache and JSON file persistence.

    Every mutation rewrites the whole store file while holding the manager lock.
    NOTE(review): ``api_key`` values are written to the store file in plain text
    via ``model_dump()`` — confirm the file location is access-restricted.
    """

    def __init__(self, store_path: Path = _DEFAULT_STORE) -> None:
        """Load any existing profiles from ``store_path``.

        Args:
            store_path: JSON file used for persistence.
        """
        self._store_path = store_path
        self._lock = threading.Lock()
        self._profiles: dict[str, LLMProfile] = {}
        self._load()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def list_all(self) -> list[LLMProfile]:
        """Return all profiles sorted by creation time."""
        with self._lock:
            return sorted(self._profiles.values(), key=lambda p: p.created_at)

    def get(self, profile_id: str) -> LLMProfile | None:
        """Return one profile by id, or None if not found."""
        with self._lock:
            return self._profiles.get(profile_id)

    def create(
        self,
        name: str,
        model: str,
        base_url: str,
        api_key: str,
        timeout_seconds: int = 30,
    ) -> LLMProfile:
        """Create and persist a new profile, returning it.

        The profile id is the first 12 hex characters of a random UUID;
        ``created_at`` and ``updated_at`` are both set to the current time.
        """
        now = _now_iso()
        profile = LLMProfile(
            profile_id=uuid.uuid4().hex[:12],
            name=name,
            model=model,
            base_url=base_url,
            api_key=api_key,
            timeout_seconds=timeout_seconds,
            created_at=now,
            updated_at=now,
        )
        with self._lock:
            self._profiles[profile.profile_id] = profile
            self._persist()
        return profile

    def update(
        self,
        profile_id: str,
        name: str,
        model: str,
        base_url: str,
        api_key: str,
        timeout_seconds: int = 30,
    ) -> LLMProfile | None:
        """Replace the editable fields of an existing profile; returns None if not found.

        ``profile_id`` and ``created_at`` are kept; ``updated_at`` is refreshed.
        """
        with self._lock:
            existing = self._profiles.get(profile_id)
            if existing is None:
                return None
            updated = existing.model_copy(update={
                "name": name,
                "model": model,
                "base_url": base_url,
                "api_key": api_key,
                "timeout_seconds": timeout_seconds,
                "updated_at": _now_iso(),
            })
            self._profiles[profile_id] = updated
            self._persist()
        return updated

    def delete(self, profile_id: str) -> bool:
        """Remove a profile; returns True if deleted, False if not found."""
        with self._lock:
            if profile_id not in self._profiles:
                return False
            del self._profiles[profile_id]
            self._persist()
        return True

    # ------------------------------------------------------------------ #
    # Persistence helpers
    # ------------------------------------------------------------------ #

    def _load(self) -> None:
        """Load profiles from the JSON store file.

        A missing file is ignored.  An unreadable or unparsable file, and
        individual entries that fail validation, are logged and skipped so the
        server still starts; valid entries are kept.
        """
        import logging

        logger = logging.getLogger(__name__)
        if not self._store_path.exists():
            return
        try:
            data = json.loads(self._store_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            logger.warning(
                "could not read LLM profile store %s (%s); starting with no profiles",
                self._store_path, type(exc).__name__,
            )
            return

        entries = data.get("profiles", []) if isinstance(data, dict) else []
        if not isinstance(entries, list):
            logger.warning("LLM profile store %s has an unexpected layout; ignoring it", self._store_path)
            return

        for position, raw in enumerate(entries):
            try:
                p = LLMProfile.model_validate(raw)
            except Exception as exc:  # noqa: BLE001
                # Log only the exception type: validation messages may echo
                # the raw input, which can contain an API key.
                logger.warning(
                    "skipping invalid LLM profile entry #%d in %s (%s)",
                    position, self._store_path, type(exc).__name__,
                )
                continue
            self._profiles[p.profile_id] = p

    def _persist(self) -> None:
        """Write current profiles to the JSON store file (called under lock).

        The payload is written to a sibling temporary file and then renamed
        over the store, so an interrupted write cannot leave a truncated store.
        """
        self._store_path.parent.mkdir(parents=True, exist_ok=True)
        payload = {"profiles": [p.model_dump() for p in self._profiles.values()]}
        tmp_path = self._store_path.with_name(self._store_path.name + ".tmp")
        tmp_path.write_text(
            json.dumps(payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        tmp_path.replace(self._store_path)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
|
||||||
|
profile_manager = ProfileManager()
|
||||||
@@ -13,6 +13,11 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import (
|
||||||
|
compute_overall_weighted_score_mean,
|
||||||
|
weighted_metric_means as _weighted_metric_means,
|
||||||
|
)
|
||||||
|
from webapp.services.run_reader import _read_weights_from_snapshot
|
||||||
from webapp.services.text_utils import parse_contexts
|
from webapp.services.text_utils import parse_contexts
|
||||||
from webapp.models import (
|
from webapp.models import (
|
||||||
DistributionBin,
|
DistributionBin,
|
||||||
@@ -32,6 +37,9 @@ GROUPING_FIELDS = ("difficulty", "question_type", "language")
|
|||||||
# How many lowest-scoring samples to surface for manual review.
|
# How many lowest-scoring samples to surface for manual review.
|
||||||
LOWEST_SAMPLE_COUNT = 10
|
LOWEST_SAMPLE_COUNT = 10
|
||||||
|
|
||||||
|
# Metrics whose lower raw value means stronger performance.
|
||||||
|
LOWER_IS_BETTER_METRICS = {"noise_sensitivity"}
|
||||||
|
|
||||||
|
|
||||||
def _round_or_none(value: float | None) -> float | None:
|
def _round_or_none(value: float | None) -> float | None:
|
||||||
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
|
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
|
||||||
@@ -42,17 +50,6 @@ def _round_or_none(value: float | None) -> float | None:
|
|||||||
return round(float(value), 4)
|
return round(float(value), 4)
|
||||||
|
|
||||||
|
|
||||||
def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
|
|
||||||
"""Compute the mean of each metric column across all scored samples."""
|
|
||||||
means: dict[str, float | None] = {}
|
|
||||||
for metric in metrics:
|
|
||||||
if metric in frame.columns:
|
|
||||||
means[metric] = _round_or_none(frame[metric].mean(numeric_only=True))
|
|
||||||
else:
|
|
||||||
means[metric] = None
|
|
||||||
return means
|
|
||||||
|
|
||||||
|
|
||||||
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
|
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
|
||||||
"""Bucket one metric's scores into fixed-width [0,1] histogram bins."""
|
"""Bucket one metric's scores into fixed-width [0,1] histogram bins."""
|
||||||
bins: list[DistributionBin] = []
|
bins: list[DistributionBin] = []
|
||||||
@@ -111,7 +108,7 @@ def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupS
|
|||||||
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
|
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
|
||||||
"""Average a single sample's available metric scores for ranking."""
|
"""Average a single sample's available metric scores for ranking."""
|
||||||
values = [
|
values = [
|
||||||
float(row[metric])
|
(1.0 - float(row[metric])) if metric in LOWER_IS_BETTER_METRICS else float(row[metric])
|
||||||
for metric in metrics
|
for metric in metrics
|
||||||
if metric in row and pd.notna(row[metric])
|
if metric in row and pd.notna(row[metric])
|
||||||
]
|
]
|
||||||
@@ -164,14 +161,31 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
|||||||
"""Build the full aggregated report payload for one run directory."""
|
"""Build the full aggregated report payload for one run directory."""
|
||||||
frame = run_reader.read_scores_frame(run_dir)
|
frame = run_reader.read_scores_frame(run_dir)
|
||||||
summary_markdown = run_reader.read_summary_markdown(run_dir)
|
summary_markdown = run_reader.read_summary_markdown(run_dir)
|
||||||
|
advice_markdown = run_reader.read_advice_markdown(run_dir)
|
||||||
|
metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)
|
||||||
|
|
||||||
if frame.empty or not metrics:
|
if frame.empty or not metrics:
|
||||||
return ReportData(
|
return ReportData(
|
||||||
metrics=metrics,
|
metrics=metrics,
|
||||||
metric_means={metric: None for metric in metrics},
|
metric_means={metric: None for metric in metrics},
|
||||||
summary_markdown=summary_markdown,
|
summary_markdown=summary_markdown,
|
||||||
|
advice_markdown=advice_markdown,
|
||||||
|
metric_weights=metric_weights,
|
||||||
|
doc_weights=doc_weights,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
score_rows_list = frame.to_dict(orient="records")
|
||||||
|
|
||||||
|
# Use weighted metric means (degrades to arithmetic mean when weights are empty).
|
||||||
|
w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
|
||||||
|
rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
|
||||||
|
|
||||||
|
# 综合加权得分计算(已暂时禁用)
|
||||||
|
# overall_ws = compute_overall_weighted_score_mean(
|
||||||
|
# score_rows_list, metric_weights, doc_weights
|
||||||
|
# )
|
||||||
|
overall_ws = None
|
||||||
|
|
||||||
distributions = {
|
distributions = {
|
||||||
metric: _distribution(frame, metric)
|
metric: _distribution(frame, metric)
|
||||||
for metric in metrics
|
for metric in metrics
|
||||||
@@ -180,9 +194,13 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
|||||||
|
|
||||||
return ReportData(
|
return ReportData(
|
||||||
metrics=metrics,
|
metrics=metrics,
|
||||||
metric_means=_metric_means(frame, metrics),
|
metric_means=rounded_means,
|
||||||
distributions=distributions,
|
distributions=distributions,
|
||||||
groupings=_groupings(frame, metrics),
|
groupings=_groupings(frame, metrics),
|
||||||
lowest_samples=_lowest_samples(frame, metrics),
|
lowest_samples=_lowest_samples(frame, metrics),
|
||||||
summary_markdown=summary_markdown,
|
summary_markdown=summary_markdown,
|
||||||
|
advice_markdown=advice_markdown,
|
||||||
|
weighted_score_mean=_round_or_none(overall_ws),
|
||||||
|
metric_weights=metric_weights,
|
||||||
|
doc_weights=doc_weights,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -64,6 +64,27 @@ def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def _read_weights_from_snapshot(run_dir: Path) -> tuple[dict[str, float], dict[str, float]]:
    """Read metric_weights and doc_weights from a scenario snapshot if present.

    Returns a (metric_weights, doc_weights) tuple of plain dicts.
    Both default to empty dicts when the snapshot is absent, unreadable, not a
    YAML mapping, or lacks the fields.  Entries whose value is not an int or
    float are dropped.
    """
    snapshot = run_dir / "scenario.snapshot.yaml"
    if not snapshot.is_file():
        return {}, {}
    try:
        payload = yaml.safe_load(snapshot.read_text(encoding="utf-8")) or {}
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        return {}, {}
    if not isinstance(payload, dict):
        # A top-level list or scalar has no weight fields to read.
        return {}, {}

    def _numeric_mapping(raw: object) -> dict[str, float]:
        """Keep only numeric values from ``raw``; non-mappings yield an empty dict."""
        if not isinstance(raw, dict):
            return {}
        return {str(k): float(v) for k, v in raw.items() if isinstance(v, (int, float))}

    return (
        _numeric_mapping(payload.get("metric_weights")),
        _numeric_mapping(payload.get("doc_weights")),
    )
|
||||||
|
|
||||||
|
|
||||||
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
|
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
|
||||||
"""Find every run directory (one that contains metadata.json) under the roots."""
|
"""Find every run directory (one that contains metadata.json) under the roots."""
|
||||||
run_dirs: list[Path] = []
|
run_dirs: list[Path] = []
|
||||||
@@ -159,6 +180,8 @@ NON_METRIC_COLUMNS = {
|
|||||||
"source_chunk_ids",
|
"source_chunk_ids",
|
||||||
"review_status",
|
"review_status",
|
||||||
"review_notes",
|
"review_notes",
|
||||||
|
"weighted_score",
|
||||||
|
"sample_weight",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -220,3 +243,14 @@ def read_summary_markdown(run_dir: Path) -> str:
|
|||||||
return summary_path.read_text(encoding="utf-8")
|
return summary_path.read_text(encoding="utf-8")
|
||||||
except OSError:
|
except OSError:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def read_advice_markdown(run_dir: Path) -> str:
    """Return the text of ``optimization_advice.md`` in ``run_dir``.

    An empty string is returned when the file does not exist or reading it
    raises ``OSError``.
    """
    candidate = run_dir / "optimization_advice.md"
    if candidate.is_file():
        try:
            return candidate.read_text(encoding="utf-8")
        except OSError:
            pass
    return ""
|
||||||
|
|||||||
@@ -37,6 +37,16 @@ def _summarize_scenario(path: Path) -> ScenarioInfo:
|
|||||||
|
|
||||||
metrics = payload.get("metrics")
|
metrics = payload.get("metrics")
|
||||||
metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []
|
metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []
|
||||||
|
raw_metric_weights = payload.get("metric_weights") or {}
|
||||||
|
raw_doc_weights = payload.get("doc_weights") or {}
|
||||||
|
metric_weights = {
|
||||||
|
str(k): float(v) for k, v in raw_metric_weights.items()
|
||||||
|
if isinstance(v, (int, float))
|
||||||
|
}
|
||||||
|
doc_weights = {
|
||||||
|
str(k): float(v) for k, v in raw_doc_weights.items()
|
||||||
|
if isinstance(v, (int, float))
|
||||||
|
}
|
||||||
|
|
||||||
return ScenarioInfo(
|
return ScenarioInfo(
|
||||||
path=relative,
|
path=relative,
|
||||||
@@ -45,6 +55,8 @@ def _summarize_scenario(path: Path) -> ScenarioInfo:
|
|||||||
dataset=str(payload.get("dataset", "")),
|
dataset=str(payload.get("dataset", "")),
|
||||||
judge_model=str(payload.get("judge_model", "")),
|
judge_model=str(payload.get("judge_model", "")),
|
||||||
metrics=metric_list,
|
metrics=metric_list,
|
||||||
|
metric_weights=metric_weights,
|
||||||
|
doc_weights=doc_weights,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
271
webapp/services/score_job_manager.py
Normal file
271
webapp/services/score_job_manager.py
Normal file
@@ -0,0 +1,271 @@
|
|||||||
|
"""Background task manager for async RAGAS single-sample scoring.
|
||||||
|
|
||||||
|
Each job:
|
||||||
|
1. Runs InlineScorer.score() in a thread pool.
|
||||||
|
2. Constructs a minimal EvaluationResult + Scenario in the standard format.
|
||||||
|
3. Calls write_run_artifacts() — produces metadata.json, scores.csv, summary.md.
|
||||||
|
4. Calls run_advisor() — produces optimization_advice.md.
|
||||||
|
|
||||||
|
The resulting run directory lands under outputs/score-async/<run_id>/ and is
|
||||||
|
automatically picked up by run_reader.list_run_summaries(), so it appears in
|
||||||
|
the existing 「运行列表」 and 「报告详情」 pages without any extra wiring.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from webapp.models import AsyncScoreJobStatus, ScoreRequest
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_DEFAULT_JOBS_DIR = _REPO_ROOT / "outputs" / "score-async"
|
||||||
|
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-jobs" # lightweight job index
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a UTC offset."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreJobManager:
    """Thread-pool manager for async scoring jobs.

    Each job's status is cached in memory and mirrored to one JSON file per job
    under ``index_dir``.  Run artifacts are written through
    ``write_run_artifacts`` with ``output_dir`` as the scenario output directory.
    """

    def __init__(
        self,
        output_dir: Path = _DEFAULT_JOBS_DIR,
        index_dir: Path = _DEFAULT_INDEX_DIR,
        max_workers: int = 4,
    ) -> None:
        """Create the directories and executor, then load previously indexed jobs.

        Args:
            output_dir: Scenario output directory passed to the artifact writer.
            index_dir: Directory holding one ``<job_id>.json`` status file per job.
            max_workers: Size of the worker thread pool.
        """
        self._output_dir = Path(output_dir)
        self._index_dir = Path(index_dir)
        self._output_dir.mkdir(parents=True, exist_ok=True)
        self._index_dir.mkdir(parents=True, exist_ok=True)
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._cache: dict[str, AsyncScoreJobStatus] = {}
        self._lock = threading.Lock()
        self._load_existing()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def submit(self, request: ScoreRequest) -> AsyncScoreJobStatus:
        """Queue one scoring job and return its initial status immediately."""
        job_id = uuid.uuid4().hex[:12]
        status = AsyncScoreJobStatus(
            job_id=job_id,
            status="queued",
            created_at=_now_iso(),
            # Question and answer are truncated to 80 characters in the summary.
            request_summary={
                "question": (request.question or "")[:80],
                "answer": (request.answer or "")[:80],
                "metrics": list(request.metrics),
                "judge_model": request.judge_model or "",
                "embedding_model": request.embedding_model or "",
                "has_contexts": bool(request.contexts),
                "has_ground_truth": bool(request.ground_truth),
            },
        )
        with self._lock:
            self._cache[job_id] = status
            self._persist_index(status)
        self._executor.submit(self._run, job_id, request)
        return status

    def get(self, job_id: str) -> AsyncScoreJobStatus | None:
        """Return current status or None if unknown."""
        with self._lock:
            return self._cache.get(job_id)

    def list_jobs(self) -> list[AsyncScoreJobStatus]:
        """Return all known jobs, newest first."""
        with self._lock:
            jobs = list(self._cache.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #

    def _run(self, job_id: str, request: ScoreRequest) -> None:
        """Execute scoring, write run artifacts, run advisor.

        Every failure — including a failed lazy import or settings construction —
        is recorded on the job as ``status="failed"``; nothing propagates to the
        executor, where it would be silently stored on an unobserved Future.
        """
        import logging

        logger = logging.getLogger("webapp.services.score_job_manager")
        # Provisional start so the failure path can always report a latency.
        t0 = time.monotonic()

        try:
            self._update(job_id, status="running")

            # Lazy imports to keep web server bootable if ragas is not installed.
            # They live inside the try block so a missing dependency marks the
            # job as failed instead of leaving it "running" forever.
            from rag_eval.advisor import run_advisor
            from rag_eval.metrics.factory import build_models
            from rag_eval.metrics.weights import compute_weighted_score  # noqa: F401  (used by the disabled block below)
            from rag_eval.reporting.writers import write_run_artifacts
            from rag_eval.settings import EvaluationSettings
            from rag_eval.shared.models import (
                DatasetConfig, EvaluationResult, NormalizedSample,
                RuntimeConfig, Scenario,
            )
            from rag_eval.shared.utils import utc_now_iso
            from webapp.services.inline_scorer import inline_scorer

            settings = EvaluationSettings()
            judge_model = request.judge_model or settings.ragas_judge_model
            embedding_model = request.embedding_model or settings.ragas_embedding_model
            effective = request.effective_metrics()
            requested = set(request.metrics)
            # Metrics that were requested but are not in the effective set.
            skipped = sorted(requested - set(effective))

            # Restart the clock so latency covers scoring only, not setup.
            t0 = time.monotonic()
            started_at = utc_now_iso()

            if effective:
                raw_scores = inline_scorer.score(
                    question=request.question,
                    answer=request.answer,
                    contexts=request.contexts_as_list(),
                    ground_truth=request.ground_truth,
                    metrics=effective,
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    settings=settings,
                )
            else:
                raw_scores = {}

            latency_ms = int((time.monotonic() - t0) * 1000)
            finished_at = utc_now_iso()

            # Build full scores dict (skipped = None)
            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
            all_scores.update(raw_scores)
            # Overall weighted score calculation (temporarily disabled)
            # weighted_raw = compute_weighted_score(
            #     {k: v for k, v in raw_scores.items() if v is not None}, {}
            # )
            # weighted = round(weighted_raw, 4) if weighted_raw is not None else None
            weighted = None

            # Build a score row compatible with report_builder
            score_row: dict[str, Any] = {
                "sample_id": "async-score-1",
                "question": request.question,
                "answer": request.answer or "",
                "contexts": request.contexts or "",
                "ground_truth": request.ground_truth or "",
                "error": "",
            }
            score_row.update(all_scores)

            # Construct minimal EvaluationResult so write_run_artifacts works
            # NOTE(review): run_id is derived only from the finish timestamp; two
            # jobs finishing at the same timestamp string would share a run_id —
            # confirm the resolution of utc_now_iso().
            run_id = finished_at.replace(":", "-")
            output_dir = self._output_dir

            # Build a minimal Scenario for snapshot + advisor
            scenario = Scenario(
                scenario_name=f"async-score-{job_id}",
                mode="offline",
                dataset=DatasetConfig(path=output_dir / run_id / "dataset.csv"),
                judge_model=judge_model,
                embedding_model=embedding_model,
                metrics=list(request.metrics),
                output_dir=output_dir,
                optimization_advisor=True,  # always generate advice
            )

            sample = NormalizedSample(
                sample_id="async-score-1",
                question=request.question,
                answer=request.answer or "",
                contexts=request.contexts_as_list(),
                ground_truth=request.ground_truth or "",
            )

            result = EvaluationResult(
                scenario=scenario,
                run_id=run_id,
                started_at=started_at,
                finished_at=finished_at,
                valid_samples=[sample],
                invalid_samples=[],
                score_rows=[score_row],
            )

            write_run_artifacts(result)
            logger.info("[score_job] artifacts written job_id=%s run_id=%s", job_id, run_id)

            # Run optimization advisor (builds optimization_advice.md)
            try:
                llm, _ = build_models(judge_model, embedding_model, settings)
                run_advisor(result, scenario, llm)
                logger.info("[score_job] advisor done job_id=%s", job_id)
            except Exception as adv_exc:  # noqa: BLE001
                # Advice is best-effort: scores are already written, so the job still completes.
                logger.warning("[score_job] advisor failed job_id=%s err=%s", job_id, adv_exc)

            self._update(
                job_id,
                status="completed",
                finished_at=finished_at,
                run_id=run_id,
                scores=all_scores,
                weighted_score=weighted,
                latency_ms=latency_ms,
                skipped_metrics=skipped,
            )

        except Exception as exc:  # noqa: BLE001
            latency_ms = int((time.monotonic() - t0) * 1000)
            logger.exception("[score_job] failed job_id=%s err=%s", job_id, exc)
            self._update(
                job_id,
                status="failed",
                finished_at=_now_iso(),
                latency_ms=latency_ms,
                error=f"{type(exc).__name__}: {exc}",
            )

    # ------------------------------------------------------------------ #
    # Persistence helpers
    # ------------------------------------------------------------------ #

    def _update(self, job_id: str, **kwargs: Any) -> None:
        """Merge kwargs into the job status and persist the index.

        Unknown job ids are ignored.
        """
        with self._lock:
            existing = self._cache.get(job_id)
            if existing is None:
                return
            updated = existing.model_copy(update=kwargs)
            self._cache[job_id] = updated
            self._persist_index(updated)

    def _persist_index(self, status: AsyncScoreJobStatus) -> None:
        """Write a lightweight index JSON for this job (survives restarts).

        The file is written to a ``.tmp`` sibling and renamed into place so a
        reader never observes a half-written status file.  The ``.tmp`` suffix
        is not matched by the ``*.json`` glob used when loading.
        """
        path = self._index_dir / f"{status.job_id}.json"
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(
            json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        tmp_path.replace(path)

    def _load_existing(self) -> None:
        """Load existing job index files on startup.

        Files that cannot be read, parsed, or validated are skipped.
        NOTE(review): jobs persisted as "queued"/"running" are loaded as-is and
        are not rescheduled here.
        """
        for path in sorted(self._index_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                status = AsyncScoreJobStatus.model_validate(data)
            except Exception:  # noqa: BLE001
                # Best-effort: one bad index file must not prevent startup.
                continue
            self._cache[status.job_id] = status
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
|
||||||
|
score_job_manager = ScoreJobManager()
|
||||||
452
webapp/services/session_score_manager.py
Normal file
452
webapp/services/session_score_manager.py
Normal file
@@ -0,0 +1,452 @@
|
|||||||
|
"""Background task manager for session-grouped async RAGAS scoring.
|
||||||
|
|
||||||
|
Each session groups multiple scoring calls into one shared run report:
|
||||||
|
|
||||||
|
1. First call: creates outputs/score-session/session-<id>/ and metadata.json.
|
||||||
|
2. Every call: appends a new sample row to scores.csv, rewrites summary.md
|
||||||
|
and optimization_advice.md by re-running write_run_artifacts + run_advisor
|
||||||
|
over ALL accumulated rows.
|
||||||
|
3. The resulting run directory is picked up automatically by run_reader, so the
|
||||||
|
「运行列表」 and 「报告详情」 pages show the live, growing report.
|
||||||
|
|
||||||
|
Concurrency model:
|
||||||
|
- Scoring (LLM network I/O) runs freely in the thread pool — different sessions
|
||||||
|
score concurrently; multiple calls to the same session also start scoring in
|
||||||
|
parallel.
|
||||||
|
- File I/O (CSV append, artifact rewrite, advisor) is serialized per session via
|
||||||
|
a per-session threading.Lock, so no two calls corrupt the same session's CSV.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from webapp.models import AsyncScoreJobStatus, ScoreRequest, SessionStatus
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_DEFAULT_OUTPUT_DIR = _REPO_ROOT / "outputs" / "score-session"
|
||||||
|
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-session-jobs"
|
||||||
|
|
||||||
|
# Columns that are sample metadata rather than metric scores (mirrors run_reader.NON_METRIC_COLUMNS)
|
||||||
|
_NON_METRIC_COLUMNS = {
|
||||||
|
"sample_id", "question", "contexts", "answer", "ground_truth",
|
||||||
|
"scenario", "language", "retrieval_config", "error",
|
||||||
|
"judge_model", "embedding_model", "run_id", "difficulty",
|
||||||
|
"question_type", "doc_id", "doc_name", "section_path",
|
||||||
|
"page_start", "page_end", "source_chunk_ids", "review_status",
|
||||||
|
"review_notes", "weighted_score", "sample_weight",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_session_id(session_id: str) -> str:
|
||||||
|
"""Convert an arbitrary session_id string to a safe directory-name fragment."""
|
||||||
|
return re.sub(r"[^a-zA-Z0-9]", "-", session_id)[:64].strip("-") or "default"
|
||||||
|
|
||||||
|
|
||||||
|
class SessionScoreJobManager:
|
||||||
|
"""Thread-pool manager for session-grouped async scoring jobs.
|
||||||
|
|
||||||
|
All calls sharing a session_id append to one shared run directory, so the
|
||||||
|
report detail page shows all samples and their aggregate metrics together.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    output_dir: Path = _DEFAULT_OUTPUT_DIR,
    index_dir: Path = _DEFAULT_INDEX_DIR,
    max_workers: int = 4,
) -> None:
    """Create the working directories, the worker pool and the in-memory state.

    Args:
        output_dir: Directory under which each session's run directory lives.
        index_dir: Directory holding one JSON file per job, plus a
            ``_sessions`` sub-directory with one JSON file per session.
        max_workers: Size of the thread pool that executes scoring jobs.
    """
    self._output_dir = Path(output_dir)
    self._index_dir = Path(index_dir)
    for folder in (self._output_dir, self._index_dir, self._index_dir / "_sessions"):
        folder.mkdir(parents=True, exist_ok=True)
    self._executor = ThreadPoolExecutor(max_workers=max_workers)

    # The three dictionaries below are shared state guarded by ``self._lock``.
    self._lock = threading.Lock()
    # job_id -> AsyncScoreJobStatus
    self._job_cache: dict[str, AsyncScoreJobStatus] = {}
    # session_id -> job ids in submission order
    self._session_jobs: dict[str, list[str]] = {}
    # session_id -> lock serializing that session's file I/O
    self._session_locks: dict[str, threading.Lock] = {}

    # Restore whatever an earlier process persisted.
    self._load_existing()
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Public API
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def session_run_id(self, session_id: str) -> str:
    """Derive the run id for a session; the same string names its run directory.

    The value depends only on ``session_id``, so repeated calls for one
    session always return the same run id.
    """
    safe_fragment = _sanitize_session_id(session_id)
    return "session-" + safe_fragment
|
||||||
|
|
||||||
|
def submit(self, session_id: str, request: ScoreRequest) -> tuple[AsyncScoreJobStatus, str]:
    """Register one scoring call for a session and hand it to the worker pool.

    A ``queued`` job record is created, cached, written to the job index and
    added to the session's job list before the work is scheduled.

    Returns:
        ``(job_status, run_id)`` where ``run_id`` is derived from ``session_id``.
    """
    run_id = self.session_run_id(session_id)
    job_id = uuid.uuid4().hex[:12]
    created_at = _now_iso()

    # Short, display-oriented digest of the request (texts cut to 80 chars).
    summary = {
        "question": (request.question or "")[:80],
        "answer": (request.answer or "")[:80],
        "metrics": list(request.metrics),
        "judge_model": request.judge_model or "",
        "embedding_model": request.embedding_model or "",
        "has_contexts": bool(request.contexts),
        "has_ground_truth": bool(request.ground_truth),
        "session_id": session_id,
    }
    status = AsyncScoreJobStatus(
        job_id=job_id,
        status="queued",
        created_at=created_at,
        request_summary=summary,
    )

    with self._lock:
        self._job_cache[job_id] = status
        self._session_jobs.setdefault(session_id, []).append(job_id)

    self._persist_job_index(status)
    self._persist_session_index(session_id)
    self._executor.submit(self._run, job_id, session_id, run_id, request)
    return status, run_id
|
||||||
|
|
||||||
|
def get_job(self, job_id: str) -> AsyncScoreJobStatus | None:
    """Look up the cached record of one scoring call; ``None`` if the id is unknown."""
    with self._lock:
        found = self._job_cache.get(job_id)
    return found
|
||||||
|
|
||||||
|
def list_jobs(self) -> list[AsyncScoreJobStatus]:
    """Return every cached job record, ordered by ``created_at`` descending."""
    with self._lock:
        snapshot = list(self._job_cache.values())
    # Sort outside the lock; the snapshot is a private copy.
    return sorted(snapshot, key=lambda job: job.created_at, reverse=True)
|
||||||
|
|
||||||
|
def get_session(self, session_id: str) -> SessionStatus | None:
    """Build the aggregate status of one session; ``None`` if it has no jobs.

    ``call_count`` counts every job id recorded for the session, while
    ``jobs`` only contains those still present in the job cache, ordered by
    ``created_at`` ascending.
    """
    with self._lock:
        known_ids = list(self._session_jobs.get(session_id) or [])
    if not known_ids:
        return None

    run_id = self.session_run_id(session_id)
    # Best effort: the CSV may be in the middle of a rewrite by a worker.
    metric_means = self._read_metric_means(self._output_dir / run_id)

    with self._lock:
        records = [self._job_cache[jid] for jid in known_ids if jid in self._job_cache]

    finish_stamps = [rec.finished_at for rec in records if rec.finished_at]
    return SessionStatus(
        session_id=session_id,
        run_id=run_id,
        call_count=len(known_ids),
        metric_means=metric_means,
        latest_finished_at=max(finish_stamps, default=""),
        jobs=sorted(records, key=lambda rec: rec.created_at),
    )
|
||||||
|
|
||||||
|
def list_sessions(self) -> list[SessionStatus]:
    """Return the aggregate status of every known session.

    Sessions are ordered by ``latest_finished_at`` descending; sessions for
    which ``get_session`` yields ``None`` are left out.
    """
    with self._lock:
        known = list(self._session_jobs)
    statuses = [st for st in map(self.get_session, known) if st is not None]
    return sorted(statuses, key=lambda st: st.latest_finished_at, reverse=True)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Worker
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def _run(self, job_id: str, session_id: str, run_id: str, request: ScoreRequest) -> None:
    """Score one sample and fold it into the session's shared run artifacts.

    Steps:

    1. Mark the job ``running``.
    2. Score the request with ``inline_scorer`` (no session lock held, so
       calls for the same session can score in parallel).
    3. Under the per-session lock: re-read ``scores.csv``, append the new
       row, rewrite the artifacts with ``write_run_artifacts`` and re-run the
       advisor over all accumulated rows. An advisor failure is only logged.
    4. Mark the job ``completed``; if setup, scoring or artifact writing
       raised, mark it ``failed`` with the error text instead.

    Args:
        job_id: Id of the job record to update.
        session_id: Session the call belongs to.
        run_id: Name of the session's run directory under the output dir.
        request: The scoring request to evaluate.
    """
    import logging

    logger = logging.getLogger("webapp.services.session_score_manager")
    self._update_job(job_id, status="running")

    def _cell_text(value: Any) -> str:
        # Empty CSV cells are read back as None (or NaN). Calling ``str()`` on
        # them would put the literal text "None"/"nan" into the samples.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    # Fallback start time so the failure path can always report a latency.
    t0 = time.monotonic()

    try:
        # Lazy imports — keep web server bootable if ragas is not installed.
        # They sit inside the ``try`` so that a missing dependency (or a bad
        # settings object) marks the job as failed instead of leaving it
        # stuck in "running" with the exception lost in the worker thread.
        from rag_eval.advisor import run_advisor
        from rag_eval.metrics.factory import build_models
        from rag_eval.metrics.weights import compute_weighted_score  # noqa: F401 - used by the disabled code below
        from rag_eval.reporting.writers import write_run_artifacts
        from rag_eval.settings import EvaluationSettings
        from rag_eval.shared.models import (  # noqa: F401
            DatasetConfig, EvaluationResult, NormalizedSample,
            RuntimeConfig, Scenario,
        )
        from rag_eval.shared.utils import utc_now_iso
        from webapp.services.inline_scorer import inline_scorer

        settings = EvaluationSettings()
        judge_model = request.judge_model or settings.ragas_judge_model
        embedding_model = request.embedding_model or settings.ragas_embedding_model
        effective = request.effective_metrics()
        requested = set(request.metrics)
        skipped = sorted(requested - set(effective))

        # Restart the clock: reported latency covers scoring only, not setup.
        t0 = time.monotonic()

        # --- Scoring (can run concurrently for the same session) ----------
        if effective:
            raw_scores = inline_scorer.score(
                question=request.question,
                answer=request.answer,
                contexts=request.contexts_as_list(),
                ground_truth=request.ground_truth,
                metrics=effective,
                judge_model=judge_model,
                embedding_model=embedding_model,
                settings=settings,
            )
        else:
            raw_scores = {}

        latency_ms = int((time.monotonic() - t0) * 1000)
        finished_at = utc_now_iso()

        # Build complete scores for this sample (skipped metrics -> None)
        all_scores: dict[str, float | None] = {m: None for m in request.metrics}
        all_scores.update(raw_scores)

        # Weighted composite score (temporarily disabled)
        # weighted_raw = compute_weighted_score(
        #     {k: v for k, v in raw_scores.items() if v is not None}, {}
        # )
        # weighted = round(weighted_raw, 4) if weighted_raw is not None else None
        weighted = None

        # --- File I/O must be serialized per session ----------------------
        session_lock = self._get_session_lock(session_id)
        with session_lock:
            run_dir = self._output_dir / run_id
            run_dir.mkdir(parents=True, exist_ok=True)

            # Read all existing rows, then append the new one
            existing_rows = self._read_score_rows(run_dir)
            call_number = len(existing_rows) + 1

            new_row: dict[str, Any] = {
                "sample_id": f"session-score-{call_number}",
                "question": request.question,
                "answer": request.answer or "",
                "contexts": request.contexts or "",
                "ground_truth": request.ground_truth or "",
                "error": "",
            }
            new_row.update(all_scores)

            all_rows = existing_rows + [new_row]

            # Reconstruct NormalizedSample objects for write_run_artifacts metadata
            valid_samples = [
                NormalizedSample(
                    sample_id=_cell_text(row.get("sample_id")) or f"session-score-{i + 1}",
                    question=_cell_text(row.get("question")),
                    answer=_cell_text(row.get("answer")),
                    contexts=[
                        part.strip()
                        for part in _cell_text(row.get("contexts")).split(" |||| ")
                        if part.strip()
                    ],
                    ground_truth=_cell_text(row.get("ground_truth")),
                )
                for i, row in enumerate(all_rows)
            ]

            # Determine all metric columns (union of all rows' metric keys)
            all_metric_names = sorted({
                k for row in all_rows
                for k in row if k not in _NON_METRIC_COLUMNS
            })

            scenario = Scenario(
                scenario_name=f"session-{_sanitize_session_id(session_id)}",
                mode="offline",
                dataset=DatasetConfig(path=run_dir / "dataset.csv"),
                judge_model=judge_model,
                embedding_model=embedding_model,
                metrics=all_metric_names,
                output_dir=self._output_dir,
                optimization_advisor=True,
            )

            # Use a "_started_at" value from the first stored row when one
            # exists and is a string; otherwise fall back to this call's
            # finish time.
            started_at_val = (
                existing_rows[0].get("_started_at", finished_at)
                if existing_rows else finished_at
            )

            result = EvaluationResult(
                scenario=scenario,
                run_id=run_id,
                started_at=started_at_val if isinstance(started_at_val, str) else finished_at,
                finished_at=finished_at,
                valid_samples=valid_samples,
                invalid_samples=[],
                score_rows=all_rows,
            )

            write_run_artifacts(result)
            logger.info(
                "[session_job] artifacts written job_id=%s session_id=%s call=%d",
                job_id, session_id, call_number,
            )

            # Regenerate optimization advice over all accumulated rows
            try:
                llm, _ = build_models(judge_model, embedding_model, settings)
                run_advisor(result, scenario, llm)
                logger.info("[session_job] advisor done job_id=%s session=%s", job_id, session_id)
            except Exception as adv_exc:  # noqa: BLE001 - advice is optional
                logger.warning(
                    "[session_job] advisor failed job_id=%s err=%s", job_id, adv_exc
                )

        self._update_job(
            job_id,
            status="completed",
            finished_at=finished_at,
            run_id=run_id,
            scores=all_scores,
            weighted_score=weighted,
            latency_ms=latency_ms,
            skipped_metrics=skipped,
        )
        self._persist_session_index(session_id)

    except Exception as exc:  # noqa: BLE001 - any failure is recorded on the job
        latency_ms = int((time.monotonic() - t0) * 1000)
        logger.error("[session_job] failed job_id=%s err=%s", job_id, exc)
        self._update_job(
            job_id,
            status="failed",
            finished_at=_now_iso(),
            latency_ms=latency_ms,
            error=f"{type(exc).__name__}: {exc}",
        )
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Helpers
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def _get_session_lock(self, session_id: str) -> threading.Lock:
|
||||||
|
with self._lock:
|
||||||
|
if session_id not in self._session_locks:
|
||||||
|
self._session_locks[session_id] = threading.Lock()
|
||||||
|
return self._session_locks[session_id]
|
||||||
|
|
||||||
|
def _read_score_rows(self, run_dir: Path) -> list[dict[str, Any]]:
|
||||||
|
"""Read existing scores.csv rows, returning empty list if file doesn't exist."""
|
||||||
|
scores_path = run_dir / "scores.csv"
|
||||||
|
if not scores_path.is_file():
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
frame = pd.read_csv(scores_path)
|
||||||
|
return frame.where(pd.notnull(frame), None).to_dict("records")
|
||||||
|
except (OSError, ValueError):
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _read_metric_means(self, run_dir: Path) -> dict[str, float | None]:
|
||||||
|
"""Compute per-metric means from the session's scores.csv."""
|
||||||
|
scores_path = run_dir / "scores.csv"
|
||||||
|
if not scores_path.is_file():
|
||||||
|
return {}
|
||||||
|
try:
|
||||||
|
frame = pd.read_csv(scores_path)
|
||||||
|
except (OSError, ValueError):
|
||||||
|
return {}
|
||||||
|
means: dict[str, float | None] = {}
|
||||||
|
for col in frame.columns:
|
||||||
|
if col in _NON_METRIC_COLUMNS:
|
||||||
|
continue
|
||||||
|
if pd.api.types.is_numeric_dtype(frame[col]):
|
||||||
|
val = frame[col].mean(numeric_only=True)
|
||||||
|
means[col] = None if pd.isna(val) else round(float(val), 4)
|
||||||
|
return means
|
||||||
|
|
||||||
|
def _update_job(self, job_id: str, **kwargs: Any) -> None:
|
||||||
|
with self._lock:
|
||||||
|
existing = self._job_cache.get(job_id)
|
||||||
|
if existing is None:
|
||||||
|
return
|
||||||
|
updated = existing.model_copy(update=kwargs)
|
||||||
|
self._job_cache[job_id] = updated
|
||||||
|
self._persist_job_index(updated)
|
||||||
|
|
||||||
|
def _persist_job_index(self, status: AsyncScoreJobStatus) -> None:
|
||||||
|
"""Persist a single job's status to the index directory."""
|
||||||
|
path = self._index_dir / f"{status.job_id}.json"
|
||||||
|
path.write_text(
|
||||||
|
json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
def _persist_session_index(self, session_id: str) -> None:
|
||||||
|
"""Persist the session→job_ids mapping."""
|
||||||
|
with self._lock:
|
||||||
|
job_ids = list(self._session_jobs.get(session_id) or [])
|
||||||
|
run_id = self.session_run_id(session_id)
|
||||||
|
data = {"session_id": session_id, "run_id": run_id, "job_ids": job_ids}
|
||||||
|
path = self._index_dir / "_sessions" / f"{_sanitize_session_id(session_id)}.json"
|
||||||
|
path.write_text(
|
||||||
|
json.dumps(data, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
def _load_existing(self) -> None:
|
||||||
|
"""Restore job cache and session mappings from persisted index files on startup."""
|
||||||
|
# Load individual job files
|
||||||
|
for path in sorted(self._index_dir.glob("*.json")):
|
||||||
|
try:
|
||||||
|
data = json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
status = AsyncScoreJobStatus.model_validate(data)
|
||||||
|
self._job_cache[status.job_id] = status
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Load session→job_ids mappings
|
||||||
|
sessions_dir = self._index_dir / "_sessions"
|
||||||
|
if not sessions_dir.is_dir():
|
||||||
|
return
|
||||||
|
for path in sorted(sessions_dir.glob("*.json")):
|
||||||
|
try:
|
||||||
|
data = json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
sid = data.get("session_id", "")
|
||||||
|
job_ids = data.get("job_ids", [])
|
||||||
|
if sid:
|
||||||
|
self._session_jobs[sid] = job_ids
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
|
||||||
|
session_score_manager = SessionScoreJobManager()
|
||||||
84
webapp/services/yaml_patcher.py
Normal file
84
webapp/services/yaml_patcher.py
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
"""Patch LLM profile settings into scenario YAML files in-place.
|
||||||
|
|
||||||
|
Only the fields that correspond to a provided (non-None) profile are touched.
|
||||||
|
All other fields and structure are preserved as much as PyYAML allows
|
||||||
|
(comments are lost on round-trip, which is an accepted trade-off).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from webapp.models import LLMProfile
|
||||||
|
|
||||||
|
|
||||||
|
def _repo_root() -> Path:
|
||||||
|
return Path(__file__).resolve().parents[2]
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_scenario_path(path_str: str) -> Path:
|
||||||
|
"""Resolve a scenario path; absolute paths are used as-is."""
|
||||||
|
candidate = Path(path_str)
|
||||||
|
if candidate.is_absolute():
|
||||||
|
return candidate
|
||||||
|
return (_repo_root() / candidate).resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def apply_profiles_to_scenario(
    scenario_path: str,
    judge_profile: LLMProfile | None,
    answer_profile: LLMProfile | None,
    dataset_profile: LLMProfile | None,
    metric_weights: dict[str, float] | None = None,
    doc_weights: dict[str, float] | None = None,
    _resolve_absolute: bool = False,
) -> list[str]:
    """Patch the YAML file at *scenario_path* with the supplied profiles and weights.

    Args:
        scenario_path: Path of the scenario YAML; relative paths are resolved
            against the repository root unless *_resolve_absolute* is set.
        judge_profile: When given, its ``model`` replaces ``judge_model``.
        answer_profile: When given and ``app_adapter`` is a mapping, its
            ``model`` is written to ``app_adapter.static_kwargs.model``.
        dataset_profile: When given and ``generation`` is a mapping, its
            ``model`` is written to ``generation.model``.
        metric_weights: When given, replaces ``metric_weights``.
        doc_weights: When given, replaces ``doc_weights``.
        _resolve_absolute: Use *scenario_path* as-is, skipping repo-root
            resolution (used in tests).

    Returns:
        The dotted names of the fields that were actually patched.

    Raises:
        FileNotFoundError: If the resolved scenario file does not exist.

    The file is rewritten only when at least one field was patched; the YAML
    round-trip drops comments, so a no-op call leaves the file untouched.
    """
    if _resolve_absolute:
        resolved = Path(scenario_path)
    else:
        resolved = _resolve_scenario_path(scenario_path)

    if not resolved.exists():
        raise FileNotFoundError(f"Scenario file not found: {resolved}")

    data: dict[str, Any] = yaml.safe_load(resolved.read_text(encoding="utf-8")) or {}
    patched: list[str] = []

    if judge_profile is not None:
        data["judge_model"] = judge_profile.model
        patched.append("judge_model")

    if answer_profile is not None:
        adapter = data.get("app_adapter")
        if isinstance(adapter, dict):
            static_kwargs = adapter.setdefault("static_kwargs", {})
            if static_kwargs is None:
                # A bare ``static_kwargs:`` key loads as None; start a fresh
                # mapping rather than failing on item assignment.
                static_kwargs = {}
                adapter["static_kwargs"] = static_kwargs
            static_kwargs["model"] = answer_profile.model
            patched.append("app_adapter.static_kwargs.model")

    if dataset_profile is not None:
        generation = data.get("generation")
        if isinstance(generation, dict):
            generation["model"] = dataset_profile.model
            patched.append("generation.model")

    if metric_weights is not None:
        data["metric_weights"] = dict(metric_weights)
        patched.append("metric_weights")

    if doc_weights is not None:
        data["doc_weights"] = dict(doc_weights)
        patched.append("doc_weights")

    # Skip the write when nothing changed, so comments are not lost for no reason.
    if patched:
        resolved.write_text(
            yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False),
            encoding="utf-8",
        )
    return patched
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
/* Siemens RAGAS 评估控制台 — 样式表
|
/* Siemens RAGAS 评估平台 — 样式表
|
||||||
配色取自西门子品牌色(petrol / 深青)与中性灰,呼应企业语境。 */
|
配色取自西门子品牌色(petrol / 深青)与中性灰,呼应企业语境。 */
|
||||||
|
|
||||||
:root {
|
:root {
|
||||||
@@ -199,6 +199,7 @@ code {
|
|||||||
.metric-value.bad { color: var(--bad); }
|
.metric-value.bad { color: var(--bad); }
|
||||||
.metric-value.na { color: var(--slate-light); }
|
.metric-value.na { color: var(--slate-light); }
|
||||||
.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
|
.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
|
||||||
|
.metric-desc { font-size: 12px; color: #64748b; margin-top: 6px; line-height: 1.45; }
|
||||||
|
|
||||||
.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
||||||
.report-half { margin-bottom: 0; }
|
.report-half { margin-bottom: 0; }
|
||||||
@@ -265,3 +266,283 @@ table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: t
|
|||||||
.sidebar { width: 64px; }
|
.sidebar { width: 64px; }
|
||||||
.brand-sub, .nav-item span:not(.nav-ico), .sidebar-foot span:last-child { display: none; }
|
.brand-sub, .nav-item span:not(.nav-ico), .sidebar-foot span:last-child { display: none; }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---------- LLM 配置管理页 ---------- */
|
||||||
|
.profile-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(300px, 1fr)); gap: 16px; }
|
||||||
|
.profile-card {
|
||||||
|
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
|
||||||
|
padding: 16px; box-shadow: var(--shadow);
|
||||||
|
}
|
||||||
|
.profile-card-head { display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px; }
|
||||||
|
.profile-card-name { font-size: 15px; font-weight: 600; }
|
||||||
|
.profile-card-actions { display: flex; gap: 6px; }
|
||||||
|
.profile-card-field { font-size: 12px; color: var(--slate); margin-top: 4px; }
|
||||||
|
.field-label { font-weight: 600; color: var(--ink); }
|
||||||
|
|
||||||
|
/* Form */
|
||||||
|
.profile-form { display: flex; flex-direction: column; gap: 12px; margin-top: 14px; max-width: 560px; }
|
||||||
|
.form-row { display: flex; flex-direction: column; gap: 4px; }
|
||||||
|
.form-label { font-size: 13px; font-weight: 600; }
|
||||||
|
.req { color: var(--bad); }
|
||||||
|
.form-input {
|
||||||
|
border: 1px solid var(--line); border-radius: 6px; padding: 8px 10px;
|
||||||
|
font-size: 13px; font-family: inherit; width: 100%;
|
||||||
|
}
|
||||||
|
.form-input:focus { outline: none; border-color: var(--petrol); }
|
||||||
|
.form-input-sm { max-width: 120px; }
|
||||||
|
.form-actions { display: flex; gap: 10px; align-items: center; margin-top: 4px; }
|
||||||
|
.form-error { font-size: 12px; color: var(--bad); }
|
||||||
|
.btn-sm { padding: 4px 10px; font-size: 12px; }
|
||||||
|
.btn-danger { color: var(--bad); border-color: var(--bad); }
|
||||||
|
.btn-danger:hover { background: #fee2e2; }
|
||||||
|
.btn-test { color: #0369a1; border-color: #0369a1; }
|
||||||
|
.btn-test:hover { background: #e0f2fe; }
|
||||||
|
|
||||||
|
/* LLM 连通性测试结果 */
|
||||||
|
.profile-test-result {
|
||||||
|
margin-top: 8px;
|
||||||
|
padding: 6px 10px;
|
||||||
|
border-radius: 6px;
|
||||||
|
font-size: 12px;
|
||||||
|
font-weight: 500;
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
.profile-test-result:not([hidden]) { display: block; }
|
||||||
|
.profile-test-result.ok { background: #dcfce7; color: #166534; border: 1px solid #bbf7d0; }
|
||||||
|
.profile-test-result.fail { background: #fee2e2; color: #991b1b; border: 1px solid #fecaca; word-break: break-all; }
|
||||||
|
|
||||||
|
/* 选中态 run 卡片 */
|
||||||
|
.run-card.selected {
|
||||||
|
border-color: var(--petrol);
|
||||||
|
box-shadow: 0 0 0 2px rgba(0,153,153,0.25), var(--shadow);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------- LLM 角色配置面板 ---------- */
|
||||||
|
.llm-assignment-panel { border-left: 3px solid var(--petrol); }
|
||||||
|
.llm-role-rows { display: flex; flex-direction: column; gap: 10px; }
|
||||||
|
.llm-role-row { display: flex; align-items: center; gap: 14px; }
|
||||||
|
.llm-role-label { font-size: 13px; font-weight: 600; min-width: 180px; color: var(--ink); }
|
||||||
|
.llm-role-select { min-width: 240px; }
|
||||||
|
|
||||||
|
/* ---------- API 文档 iframe ---------- */
|
||||||
|
#view-apidocs { padding: 0; display: flex; flex-direction: column; flex: 1; }
|
||||||
|
#view-apidocs[hidden] { display: none; }
|
||||||
|
.apidocs-frame {
|
||||||
|
flex: 1;
|
||||||
|
width: 100%;
|
||||||
|
height: calc(100vh - 64px);
|
||||||
|
border: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.report-actions {
|
||||||
|
display: flex; justify-content: flex-end; margin: 0 0 12px;
|
||||||
|
}
|
||||||
|
.btn-export-pdf {
|
||||||
|
font-size: 13px; display: flex; align-items: center; gap: 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------- 报告历史切换下拉 ---------- */
|
||||||
|
.report-switcher {
|
||||||
|
display: flex; align-items: center; gap: 10px;
|
||||||
|
background: var(--surface); border: 1px solid var(--line);
|
||||||
|
border-radius: var(--radius); padding: 10px 16px;
|
||||||
|
margin-bottom: 14px; box-shadow: var(--shadow);
|
||||||
|
}
|
||||||
|
.report-switcher-label {
|
||||||
|
font-size: 13px; font-weight: 600; color: var(--slate); white-space: nowrap;
|
||||||
|
}
|
||||||
|
.report-switcher-select {
|
||||||
|
flex: 1; min-width: 0;
|
||||||
|
border: 1px solid var(--line); border-radius: 6px; padding: 6px 10px;
|
||||||
|
font-size: 13px; font-family: inherit; background: var(--bg); color: var(--ink);
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
.report-switcher-select:focus { outline: none; border-color: var(--petrol); }
|
||||||
|
|
||||||
|
/* ---------- Weight configuration panel ---------- */
|
||||||
|
.weight-config-panel { margin-top: 12px; }
|
||||||
|
.weight-section-title { font-size: 13px; font-weight: 600; color: var(--text); margin-bottom: 8px; }
|
||||||
|
.weight-rows { display: flex; flex-direction: column; gap: 6px; }
|
||||||
|
.weight-row {
|
||||||
|
display: flex; align-items: center; gap: 10px;
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
.weight-row-label { min-width: 180px; color: var(--slate); font-family: monospace; }
|
||||||
|
.weight-row-input {
|
||||||
|
width: 80px; padding: 4px 8px; border: 1px solid var(--border);
|
||||||
|
border-radius: 6px; font-size: 13px; text-align: right;
|
||||||
|
}
|
||||||
|
.weight-row-input:focus { outline: none; border-color: #6366f1; }
|
||||||
|
.doc-weight-name {
|
||||||
|
flex: 1; padding: 4px 8px; border: 1px solid var(--border);
|
||||||
|
border-radius: 6px; font-size: 13px; min-width: 0;
|
||||||
|
}
|
||||||
|
.weight-row-remove { color: var(--bad); cursor: pointer; font-size: 14px; background: none; border: none; padding: 2px 6px; }
|
||||||
|
.weight-row-remove:hover { background: #fee2e2; border-radius: 4px; }
|
||||||
|
|
||||||
|
/* weighted_score: highlighted metric card */
|
||||||
|
.metric-card.weighted-score-card {
|
||||||
|
border: 2px solid #6366f1;
|
||||||
|
background: #f5f3ff;
|
||||||
|
}
|
||||||
|
.metric-card.weighted-score-card .metric-name { color: #4f46e5; font-weight: 700; }
|
||||||
|
|
||||||
|
/* ================================================================
|
||||||
|
打印样式(导出 PDF 用)
|
||||||
|
浏览器打印时隐藏 UI chrome,保留报告内容,图表 canvas 原样输出
|
||||||
|
================================================================ */
|
||||||
|
@media print {
|
||||||
|
/* ── 页面尺寸与边距 ── */
|
||||||
|
@page {
|
||||||
|
size: A4 portrait;
|
||||||
|
margin: 18mm 16mm 18mm 16mm;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 隐藏所有非报告元素 ── */
|
||||||
|
.sidebar,
|
||||||
|
.topbar,
|
||||||
|
.report-actions,
|
||||||
|
.no-print,
|
||||||
|
#dist-metric-select,
|
||||||
|
.grouping-tabs,
|
||||||
|
#view-runs,
|
||||||
|
#view-new,
|
||||||
|
#view-profiles { display: none !important; }
|
||||||
|
|
||||||
|
/* ── 全局基础 ── */
|
||||||
|
body {
|
||||||
|
font-size: 11pt;
|
||||||
|
line-height: 1.5;
|
||||||
|
color: #0f1b2d;
|
||||||
|
background: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 布局重置:main 全宽 ── */
|
||||||
|
.app { display: block; }
|
||||||
|
.main { display: block; width: 100%; }
|
||||||
|
.view { padding: 0; display: block !important; }
|
||||||
|
#view-apidocs { display: none !important; } /* never print the API docs iframe */
|
||||||
|
#view-report { display: block !important; }
|
||||||
|
|
||||||
|
/* ── 报告内容 ── */
|
||||||
|
#report-content { display: block !important; }
|
||||||
|
#report-empty { display: none !important; }
|
||||||
|
|
||||||
|
/* ── 元信息条 ── */
|
||||||
|
.report-meta {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
border-bottom: 2px solid #009999;
|
||||||
|
padding-bottom: 8pt;
|
||||||
|
margin-bottom: 14pt;
|
||||||
|
}
|
||||||
|
.report-meta-title { font-size: 14pt; font-weight: 700; }
|
||||||
|
.report-meta-info { font-size: 9pt; color: #64748b; }
|
||||||
|
|
||||||
|
/* ── Section 标签 ── */
|
||||||
|
.section-label {
|
||||||
|
font-size: 9pt;
|
||||||
|
font-weight: 700;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
color: #64748b;
|
||||||
|
text-transform: uppercase;
|
||||||
|
margin: 14pt 0 6pt;
|
||||||
|
border-bottom: 1px solid #e2e8f0;
|
||||||
|
padding-bottom: 3pt;
|
||||||
|
break-after: avoid;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── ① 指标均值卡片 ── */
|
||||||
|
.metric-cards {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(90pt, 1fr));
|
||||||
|
gap: 8pt;
|
||||||
|
margin-bottom: 12pt;
|
||||||
|
}
|
||||||
|
.metric-card {
|
||||||
|
border: 1px solid #e2e8f0;
|
||||||
|
border-radius: 6pt;
|
||||||
|
padding: 10pt 8pt;
|
||||||
|
text-align: center;
|
||||||
|
break-inside: avoid;
|
||||||
|
}
|
||||||
|
.metric-value { font-size: 20pt; font-weight: 700; }
|
||||||
|
.metric-name { font-size: 8pt; color: #64748b; margin-top: 2pt; }
|
||||||
|
|
||||||
|
/* ── ② 分布 + ③ 分组:打印时改为纵向排列 ── */
|
||||||
|
.report-row {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
.report-half {
|
||||||
|
margin-bottom: 12pt;
|
||||||
|
break-inside: avoid;
|
||||||
|
}
|
||||||
|
#dist-chart {
|
||||||
|
max-height: 160pt;
|
||||||
|
width: 100% !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── 面板统一 ── */
|
||||||
|
.panel {
|
||||||
|
border: 1px solid #e2e8f0;
|
||||||
|
border-radius: 6pt;
|
||||||
|
padding: 10pt 12pt;
|
||||||
|
margin-bottom: 10pt;
|
||||||
|
break-inside: avoid;
|
||||||
|
box-shadow: none;
|
||||||
|
}
|
||||||
|
.panel h2 { font-size: 12pt; margin-bottom: 4pt; }
|
||||||
|
|
||||||
|
/* ── ④ 最低分样本:打印时全部展开,隐藏点击提示 ── */
|
||||||
|
.lowest-detail { display: block !important; hidden: false; }
|
||||||
|
.lowest-row { break-inside: avoid; }
|
||||||
|
.lowest-detail-inner { padding: 8pt 0; font-size: 10pt; }
|
||||||
|
.detail-label { font-size: 8pt; font-weight: 700; color: #64748b; margin-bottom: 2pt; }
|
||||||
|
.detail-context .ctx-item { border-bottom: 1px dashed #e2e8f0; padding: 2pt 0; font-size: 9pt; }
|
||||||
|
|
||||||
|
/* ── ⑤ 优化建议 ── */
|
||||||
|
#advice-section { display: block !important; }
|
||||||
|
.advice-panel { border: 1px solid #e2e8f0; border-radius: 6pt; padding: 10pt 12pt; }
|
||||||
|
.advice-md h2 { font-size: 12pt; margin-top: 10pt; }
|
||||||
|
.advice-md h3 { font-size: 11pt; }
|
||||||
|
.advice-md ul { margin: 4pt 0 4pt 16pt; }
|
||||||
|
.advice-md li { margin-bottom: 3pt; }
|
||||||
|
|
||||||
|
/* ── 分组表 ── */
|
||||||
|
table.group-table { width: 100%; font-size: 9pt; border-collapse: collapse; }
|
||||||
|
table.group-table th,
|
||||||
|
table.group-table td { padding: 4pt 6pt; border-bottom: 1px solid #e2e8f0; }
|
||||||
|
table.group-table th { font-weight: 700; color: #64748b; }
|
||||||
|
|
||||||
|
/* ── 颜色保留(部分浏览器打印默认去色) ── */
|
||||||
|
.good { color: #16a34a !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.warn { color: #eab308 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.bad { color: #dc2626 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.score-badge.good { background: #dcfce7 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.score-badge.warn { background: #fef9c3 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
.score-badge.bad { background: #fee2e2 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------- ⑤ 优化建议面板 ---------- */
|
||||||
|
.advice-panel { border-left: 3px solid #7c3aed; }
|
||||||
|
.advice-header {
|
||||||
|
display: flex; align-items: center; gap: 10px;
|
||||||
|
margin-bottom: 14px;
|
||||||
|
}
|
||||||
|
.advice-badge {
|
||||||
|
background: #7c3aed; color: #fff;
|
||||||
|
font-size: 11px; font-weight: 700; letter-spacing: 0.5px;
|
||||||
|
padding: 3px 8px; border-radius: 4px; text-transform: uppercase;
|
||||||
|
}
|
||||||
|
.advice-model { font-size: 12px; color: var(--slate); }
|
||||||
|
.advice-body { line-height: 1.7; color: var(--ink); }
|
||||||
|
.advice-md h1 { font-size: 16px; font-weight: 700; margin: 16px 0 8px; color: var(--ink); }
|
||||||
|
.advice-md h2 {
|
||||||
|
font-size: 14px; font-weight: 700; margin: 20px 0 8px;
|
||||||
|
padding-bottom: 4px; border-bottom: 1px solid var(--line); color: var(--ink-soft);
|
||||||
|
}
|
||||||
|
.advice-md h3 { font-size: 13px; font-weight: 600; margin: 12px 0 6px; color: var(--slate); }
|
||||||
|
.advice-md hr { border: none; border-top: 1px solid var(--line); margin: 14px 0; }
|
||||||
|
.advice-md ul { padding-left: 20px; margin: 6px 0; }
|
||||||
|
.advice-md li { margin: 3px 0; font-size: 13px; }
|
||||||
|
.advice-md strong { color: var(--ink); font-weight: 600; }
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>Siemens RAGAS 评估控制台</title>
|
<title>Siemens RAGAS 评估平台</title>
|
||||||
<link rel="stylesheet" href="/static/css/app.css" />
|
<link rel="stylesheet" href="/static/css/app.css" />
|
||||||
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
||||||
</head>
|
</head>
|
||||||
@@ -12,8 +12,8 @@
|
|||||||
<!-- 左侧导航(布局 A) -->
|
<!-- 左侧导航(布局 A) -->
|
||||||
<aside class="sidebar">
|
<aside class="sidebar">
|
||||||
<div class="brand">
|
<div class="brand">
|
||||||
<div class="brand-mark">RAGAS</div>
|
<div class="brand-mark">Siemens RAGAS</div>
|
||||||
<div class="brand-sub">评估控制台</div>
|
<div class="brand-sub">评估平台</div>
|
||||||
</div>
|
</div>
|
||||||
<nav class="nav">
|
<nav class="nav">
|
||||||
<button class="nav-item" data-view="runs">
|
<button class="nav-item" data-view="runs">
|
||||||
@@ -22,9 +22,18 @@
|
|||||||
<button class="nav-item" data-view="new">
|
<button class="nav-item" data-view="new">
|
||||||
<span class="nav-ico">+</span><span>新建评估</span>
|
<span class="nav-ico">+</span><span>新建评估</span>
|
||||||
</button>
|
</button>
|
||||||
<button class="nav-item" data-view="report" data-requires-run="1">
|
<button class="nav-item" data-view="report" data-requires-run="1" disabled>
|
||||||
<span class="nav-ico">▤</span><span>报告详情</span>
|
<span class="nav-ico">▤</span><span>报告详情</span>
|
||||||
</button>
|
</button>
|
||||||
|
<button class="nav-item" data-view="profiles">
|
||||||
|
<span class="nav-ico">⚙</span><span>LLM 配置</span>
|
||||||
|
</button>
|
||||||
|
<button class="nav-item" data-view="scorejobs">
|
||||||
|
<span class="nav-ico">📋</span><span>评分记录</span>
|
||||||
|
</button>
|
||||||
|
<button class="nav-item" data-view="apidocs">
|
||||||
|
<span class="nav-ico">⎔</span><span>API 文档</span>
|
||||||
|
</button>
|
||||||
</nav>
|
</nav>
|
||||||
<div class="sidebar-foot">
|
<div class="sidebar-foot">
|
||||||
<span class="dot" id="health-dot"></span>
|
<span class="dot" id="health-dot"></span>
|
||||||
@@ -59,6 +68,49 @@
|
|||||||
<span class="selected-scenario muted" id="selected-scenario">未选择场景</span>
|
<span class="selected-scenario muted" id="selected-scenario">未选择场景</span>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- LLM 角色配置面板(选中场景后显示) -->
|
||||||
|
<div class="panel llm-assignment-panel" id="llm-assignment-panel" hidden>
|
||||||
|
<h2>LLM 角色配置 <span class="muted" style="font-size:13px;font-weight:400">(可选)</span></h2>
|
||||||
|
<p class="muted" style="margin-bottom:14px">为不同任务角色选择已保存的 LLM 配置,留空则使用场景文件中的原始配置。</p>
|
||||||
|
<div class="llm-role-rows">
|
||||||
|
<div class="llm-role-row">
|
||||||
|
<label class="llm-role-label">评测打分 Judge LLM</label>
|
||||||
|
<select class="select llm-role-select" id="role-judge">
|
||||||
|
<option value="">— 使用场景原始配置 —</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="llm-role-row">
|
||||||
|
<label class="llm-role-label">生成答案 Answer LLM</label>
|
||||||
|
<select class="select llm-role-select" id="role-answer">
|
||||||
|
<option value="">— 使用场景原始配置 —</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="llm-role-row">
|
||||||
|
<label class="llm-role-label">生成题库 Dataset LLM</label>
|
||||||
|
<select class="select llm-role-select" id="role-dataset">
|
||||||
|
<option value="">— 使用场景原始配置 —</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Weight configuration panel (hidden by default) -->
|
||||||
|
<div class="panel weight-config-panel" id="weight-config-panel" hidden>
|
||||||
|
<h2>???? <span class="muted" style="font-size:13px;font-weight:400">???????????????</span></h2>
|
||||||
|
|
||||||
|
<div class="weight-section">
|
||||||
|
<div class="weight-section-title">???? <span class="muted" style="font-size:12px">???????????????????</span></div>
|
||||||
|
<div id="metric-weight-rows" class="weight-rows"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="weight-section" style="margin-top:16px">
|
||||||
|
<div class="weight-section-title">???? <span class="muted" style="font-size:12px">?? PDF ???????????????????????</span></div>
|
||||||
|
<div id="doc-weight-rows" class="weight-rows"></div>
|
||||||
|
<button class="btn btn-sm" id="add-doc-weight-btn" style="margin-top:8px">? ??????</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="panel" id="task-panel" hidden>
|
<div class="panel" id="task-panel" hidden>
|
||||||
<div class="task-head">
|
<div class="task-head">
|
||||||
<h2>评估进度</h2>
|
<h2>评估进度</h2>
|
||||||
@@ -73,12 +125,25 @@
|
|||||||
|
|
||||||
<!-- 报告详情视图 -->
|
<!-- 报告详情视图 -->
|
||||||
<section class="view" id="view-report" hidden>
|
<section class="view" id="view-report" hidden>
|
||||||
|
<!-- 历史报告切换下拉(顶部,始终可见) -->
|
||||||
|
<div class="report-switcher no-print" id="report-switcher">
|
||||||
|
<label class="report-switcher-label">切换报告</label>
|
||||||
|
<select class="select report-switcher-select" id="report-switcher-select">
|
||||||
|
<option value="">— 加载中… —</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="empty" id="report-empty">
|
<div class="empty" id="report-empty">
|
||||||
<p>请先从「运行列表」选择一次运行。</p>
|
<p>请先从「运行列表」选择一次运行。</p>
|
||||||
</div>
|
</div>
|
||||||
<div id="report-content" hidden>
|
<div id="report-content" hidden>
|
||||||
<!-- 顶部元信息条 -->
|
<!-- 顶部元信息条 -->
|
||||||
<div class="report-meta" id="report-meta"></div>
|
<div class="report-meta" id="report-meta"></div>
|
||||||
|
<div class="report-actions no-print">
|
||||||
|
<button class="btn btn-ghost btn-export-pdf" id="export-pdf-btn" onclick="Report.exportPdf()">
|
||||||
|
📄 导出 PDF
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- ① 指标均值卡片 -->
|
<!-- ① 指标均值卡片 -->
|
||||||
<div class="section-label">① 指标均值 OVERVIEW</div>
|
<div class="section-label">① 指标均值 OVERVIEW</div>
|
||||||
@@ -105,14 +170,108 @@
|
|||||||
<!-- ④ 最低分样本逐条复核 -->
|
<!-- ④ 最低分样本逐条复核 -->
|
||||||
<div class="section-label">④ 最低分样本(点击展开逐条复核)</div>
|
<div class="section-label">④ 最低分样本(点击展开逐条复核)</div>
|
||||||
<div class="lowest-table" id="lowest-table"></div>
|
<div class="lowest-table" id="lowest-table"></div>
|
||||||
|
|
||||||
|
<!-- ⑤ 优化建议(optimization_advisor: true 时显示) -->
|
||||||
|
<div id="advice-section" hidden>
|
||||||
|
<div class="section-label">⑤ 优化建议 OPTIMIZATION ADVICE</div>
|
||||||
|
<div class="panel advice-panel">
|
||||||
|
<div class="advice-header">
|
||||||
|
<span class="advice-badge">AI 诊断报告</span>
|
||||||
|
<span class="advice-model" id="advice-model-label"></span>
|
||||||
|
</div>
|
||||||
|
<div class="advice-body" id="advice-body"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
<!-- LLM 配置视图 -->
|
||||||
|
<section class="view" id="view-profiles" hidden>
|
||||||
|
<div class="panel">
|
||||||
|
<div class="panel-head">
|
||||||
|
<h2>LLM 配置管理</h2>
|
||||||
|
<button class="btn btn-primary" id="add-profile-btn">+ 新建配置</button>
|
||||||
|
</div>
|
||||||
|
<p class="muted">保存常用 LLM 连接参数,在运行评估时按角色选择。</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- 新建 / 编辑表单(默认隐藏) -->
|
||||||
|
<div class="panel" id="profile-form-panel" hidden>
|
||||||
|
<h2 id="profile-form-title">新建 LLM 配置</h2>
|
||||||
|
<div class="profile-form">
|
||||||
|
<input type="hidden" id="edit-profile-id" />
|
||||||
|
<div class="form-row">
|
||||||
|
<label class="form-label">配置名称 <span class="req">*</span></label>
|
||||||
|
<input class="form-input" id="pf-name" placeholder="例:DeepSeek Flash(内网)" />
|
||||||
|
</div>
|
||||||
|
<div class="form-row">
|
||||||
|
<label class="form-label">模型名称 <span class="req">*</span></label>
|
||||||
|
<input class="form-input" id="pf-model" placeholder="例:deepseek-v4-flash" />
|
||||||
|
</div>
|
||||||
|
<div class="form-row">
|
||||||
|
<label class="form-label">Base URL <span class="req">*</span></label>
|
||||||
|
<input class="form-input" id="pf-base-url" placeholder="例:http://6.86.80.4:30080/v1" />
|
||||||
|
</div>
|
||||||
|
<div class="form-row">
|
||||||
|
<label class="form-label">API Key <span class="req">*</span></label>
|
||||||
|
<input class="form-input" id="pf-api-key" type="password" placeholder="sk-…" />
|
||||||
|
</div>
|
||||||
|
<div class="form-row">
|
||||||
|
<label class="form-label">超时(秒)</label>
|
||||||
|
<input class="form-input form-input-sm" id="pf-timeout" type="number" value="30" min="5" max="300" />
|
||||||
|
</div>
|
||||||
|
<div class="form-actions">
|
||||||
|
<button class="btn btn-primary" id="save-profile-btn">保存</button>
|
||||||
|
<button class="btn btn-test" id="test-profile-btn">测试连通性</button>
|
||||||
|
<button class="btn" id="cancel-profile-btn">取消</button>
|
||||||
|
<span class="form-error muted" id="profile-form-error"></span>
|
||||||
|
</div>
|
||||||
|
<div class="profile-test-result" id="profile-form-test-result" hidden></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="profile-cards" class="profile-grid"></div>
|
||||||
|
<div class="empty" id="profiles-empty" hidden>
|
||||||
|
<p>尚未添加任何 LLM 配置。</p>
|
||||||
|
<p class="muted">点击「新建配置」添加第一个。</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- 评分记录视图 -->
|
||||||
|
<section class="view" id="view-scorejobs" hidden>
|
||||||
|
<div class="panel">
|
||||||
|
<div class="panel-head">
|
||||||
|
<h2>评分记录</h2>
|
||||||
|
<span class="muted" style="font-size:13px">来自 Dify 异步评分任务(POST /api/score/async)</span>
|
||||||
|
</div>
|
||||||
|
<p class="muted">评分完成后自动生成完整报告(含指标得分与 LLM 优化建议),点击「查看报告」跳转报告详情页。</p>
|
||||||
|
</div>
|
||||||
|
<div id="scorejobs-list"></div>
|
||||||
|
<div class="empty" id="scorejobs-empty" hidden>
|
||||||
|
<p>暂无评分记录。</p>
|
||||||
|
<p class="muted">在 Dify 工作流中调用 <code>POST /api/score/async</code> 后,记录将在此显示。</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- API 文档视图 -->
|
||||||
|
<section class="view" id="view-apidocs" hidden>
|
||||||
|
<iframe
|
||||||
|
id="apidocs-frame"
|
||||||
|
src="/docs"
|
||||||
|
class="apidocs-frame"
|
||||||
|
title="API 文档"
|
||||||
|
allowfullscreen>
|
||||||
|
</iframe>
|
||||||
|
</section>
|
||||||
</main>
|
</main>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script src="/static/js/api.js"></script>
|
<script src="/static/js/api.js"></script>
|
||||||
|
<script src="/static/js/metric_presenter.js"></script>
|
||||||
<script src="/static/js/report.js"></script>
|
<script src="/static/js/report.js"></script>
|
||||||
|
<script src="/static/js/profiles.js"></script>
|
||||||
<script src="/static/js/runner.js"></script>
|
<script src="/static/js/runner.js"></script>
|
||||||
|
<script src="/static/js/score_jobs.js"></script>
|
||||||
<script src="/static/js/app.js"></script>
|
<script src="/static/js/app.js"></script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
@@ -43,4 +43,43 @@ const API = {
|
|||||||
return API.post("/api/evaluations", { scenario_path: scenarioPath });
|
return API.post("/api/evaluations", { scenario_path: scenarioPath });
|
||||||
},
|
},
|
||||||
taskStatus(taskId) { return API.get(`/api/evaluations/${encodeURIComponent(taskId)}`); },
|
taskStatus(taskId) { return API.get(`/api/evaluations/${encodeURIComponent(taskId)}`); },
|
||||||
|
|
||||||
|
// LLM Profile API
|
||||||
|
profiles() { return API.get("/api/llm-profiles"); },
|
||||||
|
createProfile(body) { return API.post("/api/llm-profiles", body); },
|
||||||
|
updateProfile(id, body) {
|
||||||
|
return fetch(`/api/llm-profiles/${encodeURIComponent(id)}`, {
|
||||||
|
method: "PUT",
|
||||||
|
headers: { "Content-Type": "application/json" },
|
||||||
|
body: JSON.stringify(body),
|
||||||
|
}).then(async r => {
|
||||||
|
if (!r.ok) { const d = await API._extractError(r); throw new Error(d); }
|
||||||
|
return r.json();
|
||||||
|
});
|
||||||
|
},
|
||||||
|
deleteProfile(id) {
|
||||||
|
return fetch(`/api/llm-profiles/${encodeURIComponent(id)}`, { method: "DELETE" })
|
||||||
|
.then(async r => {
|
||||||
|
if (!r.ok) { const d = await API._extractError(r); throw new Error(d); }
|
||||||
|
return r.json();
|
||||||
|
});
|
||||||
|
},
|
||||||
|
applyProfiles(body) { return API.post("/api/llm-profiles/apply", body); },
|
||||||
|
|
||||||
|
// 异步评分记录 API
|
||||||
|
scoreJobsAsync(body) { return API.post("/api/score/async", body); },
|
||||||
|
getScoreJob(jobId) { return API.get(`/api/score/jobs/${encodeURIComponent(jobId)}`); },
|
||||||
|
listScoreJobs() { return API.get("/api/score/jobs"); },
|
||||||
|
|
||||||
|
// 测试已保存 profile 的连通性
|
||||||
|
testProfile(id) {
|
||||||
|
return fetch(`/api/llm-profiles/${encodeURIComponent(id)}/test`, { method: "POST" })
|
||||||
|
.then(async r => {
|
||||||
|
if (!r.ok) { const d = await API._extractError(r); throw new Error(d); }
|
||||||
|
return r.json();
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
|
// 测试表单中填写的内联参数(保存前即可测试)
|
||||||
|
probeConnectivity(body) { return API.post("/api/llm-profiles/probe", body); },
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,28 +1,59 @@
|
|||||||
// app.js — 视图路由、运行列表渲染、健康检查。整个控制台的入口编排。
|
// app.js — 视图路由、运行列表渲染、健康检查。整个控制台的入口编排。
|
||||||
|
// 会话保持:URL hash 路由(#runs / #new / #profiles / #report/{runId})
|
||||||
|
// + sessionStorage 兜底,F5 刷新 / 浏览器前进后退均可恢复。
|
||||||
|
|
||||||
const App = {
|
const App = {
|
||||||
currentRunId: null,
|
currentRunId: null,
|
||||||
views: ["runs", "new", "report"],
|
activeView: null,
|
||||||
titles: { runs: "运行列表", new: "新建评估", report: "报告详情" },
|
views: ["runs", "new", "report", "profiles", "scorejobs", "apidocs"],
|
||||||
|
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置", scorejobs: "评分记录", apidocs: "API 文档" },
|
||||||
|
|
||||||
// 初始化:绑定导航、加载首屏、启动健康检查。
|
// 初始化:绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。
|
||||||
init() {
|
init() {
|
||||||
document.querySelectorAll(".nav-item").forEach((btn) => {
|
document.querySelectorAll(".nav-item").forEach((btn) => {
|
||||||
btn.addEventListener("click", () => App.switchView(btn.dataset.view));
|
btn.addEventListener("click", () => App.navigate(btn.dataset.view));
|
||||||
});
|
});
|
||||||
document.getElementById("refresh-btn").addEventListener("click", () => App.refreshCurrent());
|
document.getElementById("refresh-btn").addEventListener("click", () => App.refreshCurrent());
|
||||||
|
|
||||||
Runner.init();
|
Runner.init();
|
||||||
App.switchView("runs");
|
Profiles.init();
|
||||||
|
|
||||||
|
// 恢复上次会话(优先 URL hash,其次 sessionStorage)
|
||||||
|
App._restoreSession();
|
||||||
|
|
||||||
App.checkHealth();
|
App.checkHealth();
|
||||||
setInterval(App.checkHealth, 15000);
|
setInterval(App.checkHealth, 15000);
|
||||||
|
|
||||||
|
// 浏览器前进 / 后退按钮
|
||||||
|
window.addEventListener("popstate", () => App._restoreSession());
|
||||||
},
|
},
|
||||||
|
|
||||||
// 切换主视图,并同步导航高亮与标题。
|
// ----------------------------------------------------------------
|
||||||
switchView(view) {
|
// 路由 —— 有历史记录的主动导航(更新 URL hash)
|
||||||
if (view === "report" && !App.currentRunId) {
|
// ----------------------------------------------------------------
|
||||||
// 没有选中的运行时,报告页显示占位。
|
navigate(view, runId) {
|
||||||
|
if (runId !== undefined) App.currentRunId = runId;
|
||||||
|
const hash = App._buildHash(view, App.currentRunId);
|
||||||
|
if (location.hash !== `#${hash}`) {
|
||||||
|
history.pushState({ view, runId: App.currentRunId }, "", `#${hash}`);
|
||||||
}
|
}
|
||||||
|
App._doSwitch(view);
|
||||||
|
},
|
||||||
|
|
||||||
|
// 供内部调用(不产生历史记录),例如刷新同一视图
|
||||||
|
switchView(view) {
|
||||||
|
App._doSwitch(view);
|
||||||
|
},
|
||||||
|
|
||||||
|
// 刷新当前视图数据
|
||||||
|
refreshCurrent() {
|
||||||
|
App._doSwitch(App.activeView || "runs");
|
||||||
|
},
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// 内部:实际切换 DOM + 触发数据加载
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
_doSwitch(view) {
|
||||||
App.views.forEach((name) => {
|
App.views.forEach((name) => {
|
||||||
const el = document.getElementById(`view-${name}`);
|
const el = document.getElementById(`view-${name}`);
|
||||||
if (el) el.hidden = name !== view;
|
if (el) el.hidden = name !== view;
|
||||||
@@ -33,17 +64,54 @@ const App = {
|
|||||||
document.getElementById("view-title").textContent = App.titles[view] || view;
|
document.getElementById("view-title").textContent = App.titles[view] || view;
|
||||||
App.activeView = view;
|
App.activeView = view;
|
||||||
|
|
||||||
if (view === "runs") App.loadRuns();
|
// 持久化到 sessionStorage(URL 共享场景的备份)
|
||||||
if (view === "new") Runner.loadScenarios();
|
sessionStorage.setItem("rag_view", view);
|
||||||
if (view === "report") Report.render(App.currentRunId);
|
if (App.currentRunId) sessionStorage.setItem("rag_run_id", App.currentRunId);
|
||||||
|
|
||||||
|
if (view === "runs") App.loadRuns();
|
||||||
|
if (view === "new") Runner.loadScenarios();
|
||||||
|
if (view === "report") Report.render(App.currentRunId);
|
||||||
|
if (view === "profiles") Profiles.load();
|
||||||
|
if (view === "scorejobs") ScoreJobs.load();
|
||||||
},
|
},
|
||||||
|
|
||||||
// 刷新当前视图的数据。
|
// ----------------------------------------------------------------
|
||||||
refreshCurrent() {
|
// Hash 工具
|
||||||
App.switchView(App.activeView || "runs");
|
// ----------------------------------------------------------------
|
||||||
|
_buildHash(view, runId) {
|
||||||
|
if (view === "report" && runId) {
|
||||||
|
return `report/${encodeURIComponent(runId)}`;
|
||||||
|
}
|
||||||
|
return view || "runs";
|
||||||
},
|
},
|
||||||
|
|
||||||
// 加载并渲染运行列表。
|
_parseHash() {
|
||||||
|
const raw = location.hash.replace(/^#\/?/, "");
|
||||||
|
if (!raw) return { view: null, runId: null };
|
||||||
|
if (raw.startsWith("report/")) {
|
||||||
|
const runId = decodeURIComponent(raw.slice("report/".length));
|
||||||
|
return { view: "report", runId };
|
||||||
|
}
|
||||||
|
const view = App.views.includes(raw) ? raw : null;
|
||||||
|
return { view, runId: null };
|
||||||
|
},
|
||||||
|
|
||||||
|
// 会话恢复:hash → sessionStorage → 默认 runs
|
||||||
|
_restoreSession() {
|
||||||
|
const { view: hView, runId: hRunId } = App._parseHash();
|
||||||
|
const view = hView || sessionStorage.getItem("rag_view") || "runs";
|
||||||
|
const runId = hRunId || sessionStorage.getItem("rag_run_id") || null;
|
||||||
|
|
||||||
|
if (runId) {
|
||||||
|
App.currentRunId = runId;
|
||||||
|
App.enableReportNav();
|
||||||
|
}
|
||||||
|
App._doSwitch(view);
|
||||||
|
},
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// 运行列表
|
||||||
|
// ----------------------------------------------------------------
|
||||||
async loadRuns() {
|
async loadRuns() {
|
||||||
const container = document.getElementById("runs-container");
|
const container = document.getElementById("runs-container");
|
||||||
const empty = document.getElementById("runs-empty");
|
const empty = document.getElementById("runs-empty");
|
||||||
@@ -64,22 +132,24 @@ const App = {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
// 构造一张运行卡片。
|
|
||||||
renderRunCard(run) {
|
renderRunCard(run) {
|
||||||
const card = document.createElement("div");
|
const card = document.createElement("div");
|
||||||
card.className = "run-card";
|
card.className = "run-card" + (run.run_id === App.currentRunId ? " selected" : "");
|
||||||
|
|
||||||
card.addEventListener("click", () => {
|
card.addEventListener("click", () => {
|
||||||
App.currentRunId = run.run_id;
|
// 更新选中高亮
|
||||||
|
document.querySelectorAll(".run-card").forEach((c) => c.classList.remove("selected"));
|
||||||
|
card.classList.add("selected");
|
||||||
App.enableReportNav();
|
App.enableReportNav();
|
||||||
App.switchView("report");
|
App.navigate("report", run.run_id);
|
||||||
});
|
});
|
||||||
|
|
||||||
const chips = (run.metrics || [])
|
const chips = (run.metrics || [])
|
||||||
.map((m) => {
|
.map((m) => {
|
||||||
const val = run.metric_means ? run.metric_means[m] : null;
|
const val = run.metric_means ? run.metric_means[m] : null;
|
||||||
const cls = App.scoreClass(val);
|
const cls = App.scoreClass(m, val);
|
||||||
const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
|
const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
|
||||||
return `<span class="metric-chip">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
|
return `<span class="metric-chip" title="${App.escape(m)}">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
|
||||||
})
|
})
|
||||||
.join("");
|
.join("");
|
||||||
|
|
||||||
@@ -96,54 +166,51 @@ const App = {
|
|||||||
return card;
|
return card;
|
||||||
},
|
},
|
||||||
|
|
||||||
// 启用报告导航项(选中运行后)。
|
// ----------------------------------------------------------------
|
||||||
|
// 工具方法
|
||||||
|
// ----------------------------------------------------------------
|
||||||
enableReportNav() {
|
enableReportNav() {
|
||||||
const btn = document.querySelector('.nav-item[data-view="report"]');
|
const btn = document.querySelector('.nav-item[data-view="report"]');
|
||||||
if (btn) btn.disabled = false;
|
if (btn) btn.disabled = false;
|
||||||
},
|
},
|
||||||
|
|
||||||
// 根据分值返回 good/warn/bad/na 配色类。
|
scoreClass(metricName, value) {
|
||||||
scoreClass(value) {
|
return MetricPresenter.scoreClass(metricName, value);
|
||||||
if (value === null || value === undefined) return "na";
|
|
||||||
if (value >= 0.8) return "good";
|
|
||||||
if (value >= 0.65) return "warn";
|
|
||||||
return "bad";
|
|
||||||
},
|
},
|
||||||
|
|
||||||
// 指标名缩写,节省卡片横向空间。
|
|
||||||
shortMetric(name) {
|
shortMetric(name) {
|
||||||
const map = {
|
const map = {
|
||||||
faithfulness: "faith.",
|
faithfulness: "faith.",
|
||||||
answer_relevancy: "ans.rel.",
|
answer_relevancy: "ans.rel.",
|
||||||
context_recall: "ctx.recall",
|
context_recall: "ctx.recall",
|
||||||
context_precision: "ctx.prec.",
|
context_precision: "ctx.prec.",
|
||||||
|
noise_sensitivity: "noise.sens.",
|
||||||
|
factual_correctness: "fact.corr.",
|
||||||
|
semantic_similarity: "sem.sim.",
|
||||||
};
|
};
|
||||||
return map[name] || name;
|
return map[name] || name;
|
||||||
},
|
},
|
||||||
|
|
||||||
// 截取时间戳到分钟,便于阅读。
|
|
||||||
shortTime(iso) {
|
shortTime(iso) {
|
||||||
if (!iso) return "—";
|
if (!iso) return "—";
|
||||||
return String(iso).replace("T", " ").slice(0, 16);
|
return String(iso).replace("T", " ").slice(0, 16);
|
||||||
},
|
},
|
||||||
|
|
||||||
// 简单 HTML 转义,防止注入。
|
|
||||||
escape(text) {
|
escape(text) {
|
||||||
const div = document.createElement("div");
|
const div = document.createElement("div");
|
||||||
div.textContent = text == null ? "" : String(text);
|
div.textContent = text == null ? "" : String(text);
|
||||||
return div.innerHTML;
|
return div.innerHTML;
|
||||||
},
|
},
|
||||||
|
|
||||||
// 健康检查,更新左下角状态点。
|
|
||||||
async checkHealth() {
|
async checkHealth() {
|
||||||
const dot = document.getElementById("health-dot");
|
const dot = document.getElementById("health-dot");
|
||||||
const label = document.getElementById("health-text");
|
const label = document.getElementById("health-text");
|
||||||
try {
|
try {
|
||||||
await API.health();
|
await API.health();
|
||||||
dot.className = "dot ok";
|
dot.className = "dot ok";
|
||||||
label.textContent = "服务正常";
|
label.textContent = "服务正常";
|
||||||
} catch (_e) {
|
} catch (_e) {
|
||||||
dot.className = "dot bad";
|
dot.className = "dot bad";
|
||||||
label.textContent = "服务离线";
|
label.textContent = "服务离线";
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user