Compare commits
10 Commits
a781ba1e4a
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9828b1d44c | ||
|
|
1df4010acc | ||
|
|
754a30ad59 | ||
|
|
e1751447df | ||
|
|
4fd515d2d9 | ||
|
|
abcd61ec8f | ||
|
|
363e8b0f27 | ||
|
|
b870ed8730 | ||
|
|
791738bb07 | ||
|
|
630b70cc2a |
@@ -8,10 +8,12 @@ OPENAI_BASE_URL=http://6.86.80.4:30080/v1
|
|||||||
OPENAI_TIMEOUT_SECONDS=180
|
OPENAI_TIMEOUT_SECONDS=180
|
||||||
|
|
||||||
# 默认评测模型(可在场景 YAML 或 Web 控制台 LLM 配置中覆盖)
|
# 默认评测模型(可在场景 YAML 或 Web 控制台 LLM 配置中覆盖)
|
||||||
# RAGAS_JUDGE_MODEL 需支持 max_tokens + json_object(gpt-5、gpt-4.1、gpt-4o 等)
|
# RAGAS_JUDGE_MODEL 需支持 OpenAI 兼容 chat.completions + 结构化 JSON 输出
|
||||||
# 注意:gpt-5.4/5.5/5.2 系列不支持 max_tokens,与 RAGAS 0.4.3 不兼容
|
# RAGAS_LLM_MAX_TOKENS 控制 Judge 评分链路的 completion budget;faithfulness 等
|
||||||
|
# 结构化指标在 GPT-5 系列上通常需要 4096 或更高,避免 IncompleteOutputException
|
||||||
RAGAS_JUDGE_MODEL=gpt-5
|
RAGAS_JUDGE_MODEL=gpt-5
|
||||||
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
|
RAGAS_EMBEDDING_MODEL=text-embedding-3-small
|
||||||
|
RAGAS_LLM_MAX_TOKENS=4096
|
||||||
|
|
||||||
# 评估并发控制(启用 7 个指标时建议 RAGAS_METRIC_TIMEOUT_SECONDS=300)
|
# 评估并发控制(启用 7 个指标时建议 RAGAS_METRIC_TIMEOUT_SECONDS=300)
|
||||||
BATCH_SIZE=8
|
BATCH_SIZE=8
|
||||||
|
|||||||
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
7
.idea/misc.xml
generated
Normal file
7
.idea/misc.xml
generated
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="KubernetesApiProvider"><![CDATA[{}]]></component>
|
||||||
|
<component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="17" project-jdk-type="JavaSDK">
|
||||||
|
<output url="file://$PROJECT_DIR$/out" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/siemens_ragas.iml" filepath="$PROJECT_DIR$/.idea/siemens_ragas.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
9
.idea/siemens_ragas.iml
generated
Normal file
9
.idea/siemens_ragas.iml
generated
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="JAVA_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||||
|
<exclude-output />
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
<h2>优化建议怎么生成?</h2>
|
||||||
|
<p class="subtitle">这决定了模块的核心机制与可维护性</p>
|
||||||
|
|
||||||
|
<div class="options">
|
||||||
|
<div class="option" data-choice="a" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">A</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>纯规则引擎</h3>
|
||||||
|
<p>每个指标设阈值(如 faithfulness &lt; 0.6),触发时给出预设建议文本。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>零 LLM 调用,零额外成本</li>
|
||||||
|
<li>结果可预测、可审计</li>
|
||||||
|
<li>响应极快</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>建议固定,无法结合具体样本</li>
|
||||||
|
<li>不能解释"为什么这批数据这个指标低"</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="b" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">B</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>LLM 分析(全自动)</h3>
|
||||||
|
<p>把评测结果(各指标均值 + 低分样本)一起交给 LLM,生成上下文感知的中文分析报告。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>能结合具体低分样本给出针对性建议</li>
|
||||||
|
<li>可用中文解释西门子场景下的问题</li>
|
||||||
|
<li>建议质量高、内容丰富</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>每次评测多 1 次 LLM 调用</li>
|
||||||
|
<li>依赖 judge_model 的质量</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="c" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">C</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>规则定位 + LLM 解读(推荐)</h3>
|
||||||
|
<p>规则引擎先识别哪些指标异常、触发哪条优化方向;再把"规则诊断 + 低分样本"一起给 LLM 做二次解读,生成中文建议。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>规则保证诊断稳定,不依赖 LLM 自由发挥</li>
|
||||||
|
<li>LLM 在有结构的输入下输出更准确</li>
|
||||||
|
<li>两层可独立测试</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>实现略复杂(两个子模块)</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1,77 @@
|
|||||||
|
<h2>优化顾问模块 — 实现方案对比</h2>
|
||||||
|
<p class="subtitle">三个方案的核心区别在于 LLM 调用边界和代码入侵程度</p>
|
||||||
|
|
||||||
|
<div class="options">
|
||||||
|
<div class="option" data-choice="a" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">A</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>独立后处理器(轻量集成)</h3>
|
||||||
|
<p>新增 <code>rag_eval/advisor/</code> 包,<code>run_scenario()</code> 末尾调用一行 <code>maybe_run_advisor(result, scenario)</code>。</p>
|
||||||
|
<p><strong>文件结构:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li><code>rag_eval/advisor/__init__.py</code></li>
|
||||||
|
<li><code>rag_eval/advisor/rules.py</code> — 规则引擎,输入 score_rows,输出诊断列表</li>
|
||||||
|
<li><code>rag_eval/advisor/llm_analyzer.py</code> — 把规则诊断 + 低分样本交给 judge_model</li>
|
||||||
|
<li><code>rag_eval/advisor/writer.py</code> — 写 optimization_advice.md,打日志摘要</li>
|
||||||
|
</ul>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>改动最小,runner.py 只加 3 行</li>
|
||||||
|
<li>advisor 完全独立,可单独测试</li>
|
||||||
|
<li>与现有分层架构完全吻合</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>无法拿到 per-metric 的原始 NaN 率(需从 score_rows 重新算)</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="b" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">B</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>嵌入 reporting 层(复用写出基础设施)</h3>
|
||||||
|
<p>把 advisor 作为 <code>rag_eval/reporting/</code> 的一部分,<code>write_run_artifacts()</code> 内部判断是否写 advice。</p>
|
||||||
|
<p><strong>文件结构:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li><code>rag_eval/reporting/advisor.py</code> — 规则 + LLM + 写出三合一</li>
|
||||||
|
<li><code>write_run_artifacts()</code> 里追加 <code>if scenario.optimization_advisor: write_advice(...)</code></li>
|
||||||
|
</ul>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>artifacts 路径管理统一,advice 自然进 run 目录</li>
|
||||||
|
<li>文件更少</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>reporting 层本是"无副作用写文件",混入 LLM 调用破坏这一约定</li>
|
||||||
|
<li>advisor 逻辑和写出逻辑耦合,难以单独测试规则引擎</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="c" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">C</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>方案 A 变体:advisor 有独立 settings(推荐)</h3>
|
||||||
|
<p>与方案 A 相同的文件结构,但 LLM 调用使用 <strong>scenario 已有的 judge_model</strong>,不新增任何模型配置——advisor 复用 <code>build_models()</code> 已构建好的 llm 实例。</p>
|
||||||
|
<ul>
|
||||||
|
<li><code>rag_eval/advisor/rules.py</code> — 纯函数,7 条指标诊断规则</li>
|
||||||
|
<li><code>rag_eval/advisor/llm_analyzer.py</code> — 接收已有 llm 实例,不重新建 client</li>
|
||||||
|
<li><code>rag_eval/advisor/writer.py</code> — 写 md + 日志</li>
|
||||||
|
<li><code>rag_eval/advisor/__init__.py</code> — 暴露 <code>run_advisor()</code></li>
|
||||||
|
</ul>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>不重复创建 LLM client(节省资源)</li>
|
||||||
|
<li>advisor 阈值可通过 YAML 的 optimization_advisor 块扩展配置</li>
|
||||||
|
<li>独立包边界清晰,易于单测</li>
|
||||||
|
<li>runner.py 改动最小</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>需把 llm 实例从 runner 传入 advisor(多传一个参数)</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1,53 @@
|
|||||||
|
<h2>优化顾问模块 — 整体架构与数据流</h2>
|
||||||
|
<p class="subtitle">新增 rag_eval/advisor/ 包,插入 run_scenario() 末尾,复用已有 llm 实例</p>
|
||||||
|
|
||||||
|
<div class="mockup">
|
||||||
|
<div class="mockup-header">执行链路(变更前 → 变更后)</div>
|
||||||
|
<div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:2">
|
||||||
|
<span style="color:#94a3b8">run_scenario()</span><br>
|
||||||
|
→ load_scenario() <span style="color:#94a3b8"># 读 YAML,解析 Scenario + optimization_advisor 字段</span><br>
|
||||||
|
→ build_models() <span style="color:#94a3b8"># 已有:创建 llm, embeddings</span><br>
|
||||||
|
→ build_metric_pipeline() <span style="color:#94a3b8"># 已有</span><br>
|
||||||
|
→ Evaluator.evaluate() <span style="color:#94a3b8"># 已有:打分 → EvaluationResult</span><br>
|
||||||
|
→ write_run_artifacts() <span style="color:#94a3b8"># 已有:scores.csv / summary.md / ...</span><br>
|
||||||
|
<span style="color:#4ade80;font-weight:bold">→ run_advisor(result, scenario, llm) # 新增 3 行</span><br>
|
||||||
|
<span style="color:#4ade80"> → rules.diagnose(score_rows) # 规则引擎:识别异常指标 + 方向</span><br>
|
||||||
|
<span style="color:#4ade80"> → llm_analyzer.analyze(diag, samples) # LLM:结合低分样本生成中文建议</span><br>
|
||||||
|
<span style="color:#4ade80"> → writer.write(advice, paths) # 写 optimization_advice.md + 日志</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h3>新增文件一览</h3>
|
||||||
|
<div class="mockup">
|
||||||
|
<div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:1.9">
|
||||||
|
rag_eval/advisor/<br>
|
||||||
|
__init__.py <span style="color:#94a3b8">← 暴露 run_advisor(),是外部唯一入口</span><br>
|
||||||
|
rules.py <span style="color:#94a3b8">← 纯函数,无 LLM,可单独单测</span><br>
|
||||||
|
llm_analyzer.py <span style="color:#94a3b8">← 接收 llm 实例 + 诊断结构 → 中文 Markdown</span><br>
|
||||||
|
writer.py <span style="color:#94a3b8">← 写 optimization_advice.md,打日志摘要</span><br>
|
||||||
|
<br>
|
||||||
|
rag_eval/shared/models.py <span style="color:#fbbf24">← 修改:Scenario 加 optimization_advisor 字段</span><br>
|
||||||
|
rag_eval/config/schema.py <span style="color:#fbbf24">← 修改:ScenarioModel 加字段</span><br>
|
||||||
|
rag_eval/execution/runner.py <span style="color:#fbbf24">← 修改:末尾加 3 行调用</span><br>
|
||||||
|
rag_eval/reporting/artifacts.py <span style="color:#fbbf24">← 修改:RunArtifactPaths 加 advice_md 路径</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="section">
|
||||||
|
<h3>输出产物</h3>
|
||||||
|
<div class="mockup">
|
||||||
|
<div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:1.9">
|
||||||
|
outputs/online/siemens-pdf-question-bank/&lt;run_id&gt;/<br>
|
||||||
|
scenario.snapshot.yaml<br>
|
||||||
|
scores.csv<br>
|
||||||
|
invalid.csv<br>
|
||||||
|
summary.md<br>
|
||||||
|
metadata.json<br>
|
||||||
|
<span style="color:#4ade80;font-weight:bold">optimization_advice.md ← 新增</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p style="margin-top:1rem;color:#94a3b8;font-size:13px">整体看起来 OK 吗?这是新模块与现有链路的接入方式。</p>
|
||||||
@@ -0,0 +1,68 @@
|
|||||||
|
<h2>优化顾问在什么情况下运行?</h2>
|
||||||
|
<p class="subtitle">这决定了模块与现有评测流程的集成方式</p>
|
||||||
|
|
||||||
|
<div class="options">
|
||||||
|
<div class="option" data-choice="a" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">A</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>每次评测自动运行</h3>
|
||||||
|
<p>run_scenario() 结束后自动调用,无需任何额外配置。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>零感知,开箱即用</li>
|
||||||
|
<li>每次跑完都有建议报告</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>每次都多一次 LLM 调用,不管是否需要</li>
|
||||||
|
<li>无法关闭</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="b" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">B</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>YAML 场景中显式开启(推荐)</h3>
|
||||||
|
<p>在 scenario YAML 里加一行 <code>optimization_advisor: true</code>,默认关闭。</p>
|
||||||
|
<div class="mockup">
|
||||||
|
<div class="mockup-header">siemens-pdf-question-bank-online.yaml</div>
|
||||||
|
<div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:1.8">
|
||||||
|
metrics:<br>
|
||||||
|
- faithfulness<br>
|
||||||
|
- noise_sensitivity<br>
|
||||||
|
...<br>
|
||||||
|
<span style="color:#4ade80;font-weight:bold">optimization_advisor: true # 新增</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>显式可见,按需开启</li>
|
||||||
|
<li>与现有 YAML 驱动风格一致</li>
|
||||||
|
<li>可为不同场景独立配置</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>需要手动在 YAML 里加一行</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="option" data-choice="c" onclick="toggleSelect(this)">
|
||||||
|
<div class="letter">C</div>
|
||||||
|
<div class="content">
|
||||||
|
<h3>阈值触发(任一指标低于警戒线时自动激活)</h3>
|
||||||
|
<p>规则引擎先算,若发现有指标低于阈值则自动启动 LLM 分析;一切正常则跳过。</p>
|
||||||
|
<div class="pros-cons">
|
||||||
|
<div class="pros"><h4>优点</h4><ul>
|
||||||
|
<li>"有问题才报警",符合直觉</li>
|
||||||
|
<li>高分场景无额外成本</li>
|
||||||
|
</ul></div>
|
||||||
|
<div class="cons"><h4>缺点</h4><ul>
|
||||||
|
<li>阈值需要维护,不同场景可能不同</li>
|
||||||
|
<li>正常分数时无建议,但用户可能仍想看优化空间</li>
|
||||||
|
</ul></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
<div style="display:flex;align-items:center;justify-content:center;min-height:60vh">
|
||||||
|
<p class="subtitle">Writing spec & moving to implementation...</p>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
<div style="display:flex;align-items:center;justify-content:center;min-height:60vh">
|
||||||
|
<p class="subtitle">Continuing in terminal — 正在设计方案...</p>
|
||||||
|
</div>
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"reason":"idle timeout","timestamp":1781598635371}
|
||||||
1
.superpowers/brainstorm/1625-1781595805/state/server.pid
Normal file
1
.superpowers/brainstorm/1625-1781595805/state/server.pid
Normal file
@@ -0,0 +1 @@
|
|||||||
|
1625
|
||||||
808
docs/superpowers/plans/2026-06-24-async-score-jobs.md
Normal file
808
docs/superpowers/plans/2026-06-24-async-score-jobs.md
Normal file
@@ -0,0 +1,808 @@
|
|||||||
|
# 异步评分记录(Async Score Jobs)Implementation Plan
|
||||||
|
|
||||||
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||||
|
|
||||||
|
**Goal:** 新增 `POST /api/score/async` 异步端点,结果持久化至 `outputs/score-jobs/`,并在前端新增「评分记录」页面展示。
|
||||||
|
|
||||||
|
**Architecture:** 新建 `ScoreJobManager`(复用 `pipeline_task_manager` 线程池模式)在后台执行 `InlineScorer.score()`,写入 JSON 文件;新增三个 REST 端点;前端新增导航页加载并轮询记录。
|
||||||
|
|
||||||
|
**Tech Stack:** Python 3.12, FastAPI, Pydantic v2, threading, Vanilla JS, pytest
|
||||||
|
|
||||||
|
## Global Constraints
|
||||||
|
|
||||||
|
- Python 3.12+,PEP 8,4 空格缩进,类型注解必须
|
||||||
|
- 存储路径:`outputs/score-jobs/<job_id>.json`
|
||||||
|
- 复用现有 `ScoreRequest`(含 `effective_metrics()` 和 `contexts_as_list()` 方法)
|
||||||
|
- 复用现有 `InlineScorer.score()` 和 `compute_weighted_score()`
|
||||||
|
- 所有测试用 pytest,不依赖真实 LLM
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 文件清单
|
||||||
|
|
||||||
|
| 操作 | 文件 | 职责 |
|
||||||
|
|------|------|------|
|
||||||
|
| 新建 | `webapp/services/score_job_manager.py` | ScoreJobManager:线程池 + JSON 持久化 |
|
||||||
|
| 新建 | `webapp/api/score_jobs.py` | 3 个端点路由 |
|
||||||
|
| 新建 | `webapp/static/js/score_jobs.js` | 前端列表 + 轮询逻辑 |
|
||||||
|
| 新建 | `tests/webapp/test_score_jobs_api.py` | API 集成测试 |
|
||||||
|
| 修改 | `webapp/models.py` | 新增 `AsyncScoreJobStatus`、`AsyncScoreJobResponse` |
|
||||||
|
| 修改 | `webapp/server.py` | 注册 score_jobs router,更新 OPENAPI_TAGS 和 description |
|
||||||
|
| 修改 | `webapp/static/index.html` | 新增导航项 + `#view-scorejobs` section |
|
||||||
|
| 修改 | `webapp/static/js/api.js` | 新增 `scoreJobsAsync()`、`getScoreJob()`、`listScoreJobs()` |
|
||||||
|
| 修改 | `webapp/static/js/app.js` | 注册 `scorejobs` 视图、加载调用 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 1: Pydantic 模型 + ScoreJobManager
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `webapp/models.py`
|
||||||
|
- Create: `webapp/services/score_job_manager.py`
|
||||||
|
- Create: `tests/webapp/test_score_jobs_api.py` (partial)
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Produces:
|
||||||
|
- `AsyncScoreJobStatus` Pydantic model
|
||||||
|
- `AsyncScoreJobResponse` Pydantic model
|
||||||
|
- `score_job_manager: ScoreJobManager` singleton
|
||||||
|
- `ScoreJobManager.submit(request: ScoreRequest) -> AsyncScoreJobStatus`
|
||||||
|
- `ScoreJobManager.get(job_id: str) -> AsyncScoreJobStatus | None`
|
||||||
|
- `ScoreJobManager.list_jobs() -> list[AsyncScoreJobStatus]`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add models to `webapp/models.py`**
|
||||||
|
|
||||||
|
Append the following models at the end of the file, after `ScoreResponse`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 异步评分记录模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AsyncScoreJobResponse(BaseModel):
|
||||||
|
"""Immediate response after submitting an async score job."""
|
||||||
|
|
||||||
|
job_id: str = Field(description="任务唯一标识符,用于后续查询结果。")
|
||||||
|
status: str = Field(default="queued", description="初始状态:queued。")
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncScoreJobStatus(BaseModel):
|
||||||
|
"""Full state of one async score job, persisted to disk."""
|
||||||
|
|
||||||
|
job_id: str = Field(description="任务唯一标识符。")
|
||||||
|
status: str = Field(description="queued | running | completed | failed")
|
||||||
|
created_at: str = Field(default="", description="创建时间(ISO 8601 UTC)。")
|
||||||
|
finished_at: str = Field(default="", description="完成时间(ISO 8601 UTC)。")
|
||||||
|
request_summary: dict = Field(
|
||||||
|
default_factory=dict,
|
||||||
|
description="请求参数快照(question 前80字、metrics、judge_model 等)。",
|
||||||
|
)
|
||||||
|
scores: dict[str, float | None] = Field(default_factory=dict, description="各指标得分。")
|
||||||
|
weighted_score: float | None = Field(default=None, description="加权综合得分。")
|
||||||
|
latency_ms: int = Field(default=0, description="评分耗时毫秒。")
|
||||||
|
skipped_metrics: list[str] = Field(default_factory=list)
|
||||||
|
error: str | None = Field(default=None)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Write failing tests**
|
||||||
|
|
||||||
|
Create `tests/webapp/test_score_jobs_api.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Tests for async score jobs API."""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def client(tmp_path, monkeypatch):
|
||||||
|
import webapp.services.score_job_manager as mgr_mod
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
fresh_mgr = ScoreJobManager(jobs_dir=tmp_path / "score-jobs")
|
||||||
|
monkeypatch.setattr(mgr_mod, "score_job_manager", fresh_mgr)
|
||||||
|
import webapp.api.score_jobs as api_mod
|
||||||
|
monkeypatch.setattr(api_mod, "score_job_manager", fresh_mgr)
|
||||||
|
from webapp.server import create_app
|
||||||
|
return TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreJobManager:
|
||||||
|
def test_submit_returns_job_status_with_queued(self, tmp_path):
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
from webapp.models import ScoreRequest
|
||||||
|
mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs")
|
||||||
|
req = ScoreRequest(question="q", answer="a", metrics=["answer_relevancy"])
|
||||||
|
with patch.object(mgr, "_run") as mock_exec:
|
||||||
|
mock_exec.return_value = None
|
||||||
|
status = mgr.submit(req)
|
||||||
|
assert status.status in ("queued", "running", "completed")
|
||||||
|
assert len(status.job_id) > 0
|
||||||
|
|
||||||
|
def test_get_returns_none_for_unknown_id(self, tmp_path):
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs")
|
||||||
|
assert mgr.get("nonexistent") is None
|
||||||
|
|
||||||
|
def test_list_returns_empty_initially(self, tmp_path):
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs")
|
||||||
|
assert mgr.list_jobs() == []
|
||||||
|
|
||||||
|
def test_completed_job_persisted_to_disk(self, tmp_path):
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
from webapp.models import ScoreRequest
|
||||||
|
mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs", max_workers=1)
|
||||||
|
req = ScoreRequest(question="q?", answer="a.", metrics=["answer_relevancy"])
|
||||||
|
mock_scorer = MagicMock()
|
||||||
|
mock_scorer.score.return_value = {"answer_relevancy": 0.85}
|
||||||
|
with patch("webapp.services.score_job_manager.inline_scorer", mock_scorer):
|
||||||
|
with patch("webapp.services.score_job_manager.EvaluationSettings"):
|
||||||
|
status = mgr.submit(req)
|
||||||
|
for _ in range(20):
|
||||||
|
s = mgr.get(status.job_id)
|
||||||
|
if s and s.status in ("completed", "failed"):
|
||||||
|
break
|
||||||
|
time.sleep(0.2)
|
||||||
|
s = mgr.get(status.job_id)
|
||||||
|
assert s is not None
|
||||||
|
json_path = tmp_path / "jobs" / f"{status.job_id}.json"
|
||||||
|
assert json_path.exists()
|
||||||
|
data = json.loads(json_path.read_text(encoding="utf-8"))
|
||||||
|
assert data["job_id"] == status.job_id
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 3: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
cd C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py::TestScoreJobManager -v
|
||||||
|
```
|
||||||
|
Expected: `ModuleNotFoundError: No module named 'webapp.services.score_job_manager'`
|
||||||
|
|
||||||
|
- [ ] **Step 4: Create `webapp/services/score_job_manager.py`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Background task manager for async RAGAS single-sample scoring.
|
||||||
|
|
||||||
|
Each job runs InlineScorer.score() in a thread pool and persists the
|
||||||
|
result as a JSON file under outputs/score-jobs/<job_id>.json so results
|
||||||
|
survive server restarts and can be listed by the frontend.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rag_eval.metrics.weights import compute_weighted_score
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from webapp.models import AsyncScoreJobStatus, ScoreRequest
|
||||||
|
from webapp.services.inline_scorer import inline_scorer
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_DEFAULT_JOBS_DIR = _REPO_ROOT / "outputs" / "score-jobs"
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreJobManager:
|
||||||
|
"""Thread-pool manager for async RAGAS scoring jobs with JSON persistence."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
jobs_dir: Path = _DEFAULT_JOBS_DIR,
|
||||||
|
max_workers: int = 4,
|
||||||
|
) -> None:
|
||||||
|
self._jobs_dir = Path(jobs_dir)
|
||||||
|
self._jobs_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
self._executor = ThreadPoolExecutor(max_workers=max_workers)
|
||||||
|
# In-memory index: job_id -> AsyncScoreJobStatus (authoritative while running)
|
||||||
|
self._cache: dict[str, AsyncScoreJobStatus] = {}
|
||||||
|
self._lock = threading.Lock()
|
||||||
|
self._load_existing()
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Public API
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def submit(self, request: ScoreRequest) -> AsyncScoreJobStatus:
|
||||||
|
"""Queue one scoring job and return its initial status immediately."""
|
||||||
|
job_id = uuid.uuid4().hex[:12]
|
||||||
|
status = AsyncScoreJobStatus(
|
||||||
|
job_id=job_id,
|
||||||
|
status="queued",
|
||||||
|
created_at=_now_iso(),
|
||||||
|
request_summary={
|
||||||
|
"question": request.question[:80],
|
||||||
|
"answer": (request.answer or "")[:80],
|
||||||
|
"metrics": list(request.metrics),
|
||||||
|
"judge_model": request.judge_model or "",
|
||||||
|
"embedding_model": request.embedding_model or "",
|
||||||
|
"has_contexts": bool(request.contexts),
|
||||||
|
"has_ground_truth": bool(request.ground_truth),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
with self._lock:
|
||||||
|
self._cache[job_id] = status
|
||||||
|
self._persist(status)
|
||||||
|
self._executor.submit(self._run, job_id, request)
|
||||||
|
return status
|
||||||
|
|
||||||
|
def get(self, job_id: str) -> AsyncScoreJobStatus | None:
|
||||||
|
"""Return the current status for one job, or None if unknown."""
|
||||||
|
with self._lock:
|
||||||
|
return self._cache.get(job_id)
|
||||||
|
|
||||||
|
def list_jobs(self) -> list[AsyncScoreJobStatus]:
|
||||||
|
"""Return all known jobs sorted newest first."""
|
||||||
|
with self._lock:
|
||||||
|
jobs = list(self._cache.values())
|
||||||
|
jobs.sort(key=lambda j: j.created_at, reverse=True)
|
||||||
|
return jobs
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Internal
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def _run(self, job_id: str, request: ScoreRequest) -> None:
|
||||||
|
"""Execute scoring in the thread pool and persist the result."""
|
||||||
|
self._update(job_id, status="running")
|
||||||
|
settings = EvaluationSettings()
|
||||||
|
judge_model = request.judge_model or settings.ragas_judge_model
|
||||||
|
embedding_model = request.embedding_model or settings.ragas_embedding_model
|
||||||
|
effective = request.effective_metrics()
|
||||||
|
requested = set(request.metrics)
|
||||||
|
skipped = sorted(requested - set(effective))
|
||||||
|
|
||||||
|
import time as _time
|
||||||
|
t0 = _time.monotonic()
|
||||||
|
try:
|
||||||
|
if not effective:
|
||||||
|
scores: dict[str, float | None] = {m: None for m in request.metrics}
|
||||||
|
weighted = None
|
||||||
|
else:
|
||||||
|
raw = inline_scorer.score(
|
||||||
|
question=request.question,
|
||||||
|
answer=request.answer,
|
||||||
|
contexts=request.contexts_as_list(),
|
||||||
|
ground_truth=request.ground_truth,
|
||||||
|
metrics=effective,
|
||||||
|
judge_model=judge_model,
|
||||||
|
embedding_model=embedding_model,
|
||||||
|
settings=settings,
|
||||||
|
)
|
||||||
|
scores = {m: None for m in request.metrics}
|
||||||
|
scores.update(raw)
|
||||||
|
weighted_raw = compute_weighted_score(
|
||||||
|
{k: v for k, v in raw.items() if v is not None}, {}
|
||||||
|
)
|
||||||
|
weighted = round(weighted_raw, 4) if weighted_raw is not None else None
|
||||||
|
|
||||||
|
latency_ms = int((_time.monotonic() - t0) * 1000)
|
||||||
|
self._update(
|
||||||
|
job_id,
|
||||||
|
status="completed",
|
||||||
|
finished_at=_now_iso(),
|
||||||
|
scores=scores,
|
||||||
|
weighted_score=weighted,
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
skipped_metrics=skipped,
|
||||||
|
)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
latency_ms = int((_time.monotonic() - t0) * 1000)
|
||||||
|
self._update(
|
||||||
|
job_id,
|
||||||
|
status="failed",
|
||||||
|
finished_at=_now_iso(),
|
||||||
|
latency_ms=latency_ms,
|
||||||
|
error=f"{type(exc).__name__}: {exc}",
|
||||||
|
)
|
||||||
|
|
||||||
|
def _update(self, job_id: str, **kwargs: Any) -> None:
|
||||||
|
"""Merge kwargs into the job status and persist."""
|
||||||
|
with self._lock:
|
||||||
|
existing = self._cache.get(job_id)
|
||||||
|
if existing is None:
|
||||||
|
return
|
||||||
|
updated = existing.model_copy(update=kwargs)
|
||||||
|
self._cache[job_id] = updated
|
||||||
|
self._persist(updated)
|
||||||
|
|
||||||
|
def _persist(self, status: AsyncScoreJobStatus) -> None:
|
||||||
|
"""Write one job's status to its JSON file."""
|
||||||
|
path = self._jobs_dir / f"{status.job_id}.json"
|
||||||
|
path.write_text(
|
||||||
|
json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
def _load_existing(self) -> None:
|
||||||
|
"""Load completed jobs from disk into memory on startup."""
|
||||||
|
for path in sorted(self._jobs_dir.glob("*.json")):
|
||||||
|
try:
|
||||||
|
data = json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
status = AsyncScoreJobStatus.model_validate(data)
|
||||||
|
self._cache[status.job_id] = status
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass # Corrupt file — skip
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
|
||||||
|
score_job_manager = ScoreJobManager()
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Run to verify tests PASS**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py::TestScoreJobManager -v
|
||||||
|
```
|
||||||
|
Expected: 4 tests PASS
|
||||||
|
|
||||||
|
- [ ] **Step 6: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/models.py webapp/services/score_job_manager.py tests/webapp/test_score_jobs_api.py
|
||||||
|
git commit -m "feat: add AsyncScoreJobStatus model and ScoreJobManager with JSON persistence"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 2: API 端点
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `webapp/api/score_jobs.py`
|
||||||
|
- Modify: `webapp/server.py`
|
||||||
|
- Modify: `tests/webapp/test_score_jobs_api.py`
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Consumes: `score_job_manager: ScoreJobManager`, `AsyncScoreJobResponse`, `AsyncScoreJobStatus`, `ScoreRequest`
|
||||||
|
- Produces: `POST /api/score/async`, `GET /api/score/jobs`, `GET /api/score/jobs/{job_id}`
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add API tests to `tests/webapp/test_score_jobs_api.py`**
|
||||||
|
|
||||||
|
Append this class:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class TestScoreJobsEndpoint:
|
||||||
|
def test_submit_async_returns_202(self, client):
|
||||||
|
with patch("webapp.services.score_job_manager.ScoreJobManager._execute"):
|
||||||
|
resp = client.post("/api/score/async", json={
|
||||||
|
"question": "q?", "answer": "a.",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 202
|
||||||
|
data = resp.json()
|
||||||
|
assert "job_id" in data
|
||||||
|
assert data["status"] == "queued"
|
||||||
|
|
||||||
|
def test_get_unknown_job_returns_404(self, client):
|
||||||
|
resp = client.get("/api/score/jobs/nonexistent")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
def test_list_jobs_returns_empty_initially(self, client):
|
||||||
|
resp = client.get("/api/score/jobs")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["jobs"] == []
|
||||||
|
|
||||||
|
def test_submitted_job_appears_in_list(self, client):
|
||||||
|
with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/async", json={
|
||||||
|
"question": "q?", "answer": "a.",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
job_id = resp.json()["job_id"]
|
||||||
|
list_resp = client.get("/api/score/jobs")
|
||||||
|
ids = [j["job_id"] for j in list_resp.json()["jobs"]]
|
||||||
|
assert job_id in ids
|
||||||
|
|
||||||
|
def test_get_job_by_id(self, client):
|
||||||
|
with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/async", json={
|
||||||
|
"question": "q?", "answer": "a.",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
job_id = resp.json()["job_id"]
|
||||||
|
get_resp = client.get(f"/api/score/jobs/{job_id}")
|
||||||
|
assert get_resp.status_code == 200
|
||||||
|
assert get_resp.json()["job_id"] == job_id
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run to verify FAIL**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py::TestScoreJobsEndpoint -v
|
||||||
|
```
|
||||||
|
Expected: FAIL — `ModuleNotFoundError: No module named 'webapp.api.score_jobs'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: Create `webapp/api/score_jobs.py`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Routes for async RAGAS scoring jobs (Dify fire-and-forget integration)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from webapp.models import AsyncScoreJobResponse, AsyncScoreJobStatus, ScoreRequest
|
||||||
|
from webapp.services.score_job_manager import score_job_manager
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
logger = logging.getLogger("webapp.api.score_jobs")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/async",
|
||||||
|
status_code=202,
|
||||||
|
response_model=AsyncScoreJobResponse,
|
||||||
|
summary="提交异步评分任务(Dify 推荐方式)",
|
||||||
|
responses={
|
||||||
|
202: {
|
||||||
|
"description": "任务已排队,立即返回 job_id。通过 GET /api/score/jobs/{job_id} 查询结果。",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"example": {"job_id": "abc123def456", "status": "queued"}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
def submit_async_score(request: ScoreRequest) -> AsyncScoreJobResponse:
|
||||||
|
"""提交异步 RAGAS 评分任务,立即返回 job_id(202 Accepted)。
|
||||||
|
|
||||||
|
评分在后台线程中执行,结果持久化至 `outputs/score-jobs/<job_id>.json`。
|
||||||
|
在 RAGAS 平台「评分记录」页面可查看所有历史评分记录。
|
||||||
|
|
||||||
|
**Dify 工作流推荐使用此接口**:不等待评分完成,工作流立即继续,
|
||||||
|
避免 HTTP 节点超时。评分结果通过平台界面查看。
|
||||||
|
"""
|
||||||
|
logger.info(
|
||||||
|
"[score_async] submit metrics=%s has_ctx=%s has_gt=%s",
|
||||||
|
request.metrics, bool(request.contexts), bool(request.ground_truth),
|
||||||
|
)
|
||||||
|
status = score_job_manager.submit(request)
|
||||||
|
logger.info("[score_async] queued job_id=%s", status.job_id)
|
||||||
|
return AsyncScoreJobResponse(job_id=status.job_id, status=status.status)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/jobs",
|
||||||
|
response_model=dict,
|
||||||
|
summary="列出所有评分记录",
|
||||||
|
)
|
||||||
|
def list_score_jobs() -> dict:
|
||||||
|
"""返回所有异步评分记录,按创建时间倒序排列。"""
|
||||||
|
jobs = score_job_manager.list_jobs()
|
||||||
|
logger.info("[score_jobs] list count=%d", len(jobs))
|
||||||
|
return {"jobs": [j.model_dump() for j in jobs]}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/jobs/{job_id}",
|
||||||
|
response_model=AsyncScoreJobStatus,
|
||||||
|
summary="查询评分记录详情",
|
||||||
|
responses={404: {"description": "指定 job_id 的评分记录不存在。"}},
|
||||||
|
)
|
||||||
|
def get_score_job(job_id: str) -> AsyncScoreJobStatus:
|
||||||
|
"""返回一个异步评分任务的当前状态和结果。"""
|
||||||
|
status = score_job_manager.get(job_id)
|
||||||
|
if status is None:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Score job not found: {job_id}")
|
||||||
|
return status
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Register router in `webapp/server.py`**
|
||||||
|
|
||||||
|
Add import:
|
||||||
|
```python
|
||||||
|
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score, score_jobs
|
||||||
|
```
|
||||||
|
|
||||||
|
Add after `app.include_router(score.router)`:
|
||||||
|
```python
|
||||||
|
app.include_router(score_jobs.router)
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace the existing `"score"` entry in `OPENAPI_TAGS` (keep it before `"meta"`) with the following, so the tag is not defined twice:
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"name": "score",
|
||||||
|
"description": (
|
||||||
|
"**实时评分 API(同步)** — `POST /api/score`\n\n"
|
||||||
|
"**异步评分 API(Dify 推荐)** — `POST /api/score/async`\n\n"
|
||||||
|
"异步方式立即返回 job_id(202),评分在后台执行,结果在「评分记录」页查看。\n\n"
|
||||||
|
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
|
||||||
|
"`Authorization: Bearer <token>` 请求头。"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
```
|
||||||
|
|
||||||
|
> Note: this replaces the existing `"score"` entry in `OPENAPI_TAGS`.
|
||||||
|
|
||||||
|
- [ ] **Step 5: Verify no route conflict**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -c "
|
||||||
|
from webapp.server import create_app
|
||||||
|
app = create_app()
|
||||||
|
score_routes = [(r.path, list(getattr(r,'methods',[]))) for r in app.routes if 'score' in r.path]
|
||||||
|
print(score_routes)
|
||||||
|
"
|
||||||
|
```
|
||||||
|
Expected: shows `/api/score`, `/api/score/async`, `/api/score/jobs`, `/api/score/jobs/{job_id}`
|
||||||
|
|
||||||
|
- [ ] **Step 6: Run API tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py -v --tb=short
|
||||||
|
```
|
||||||
|
Expected: all 9 tests PASS
|
||||||
|
|
||||||
|
- [ ] **Step 7: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/api/score_jobs.py webapp/server.py tests/webapp/test_score_jobs_api.py
|
||||||
|
git commit -m "feat: add POST /api/score/async and GET /api/score/jobs endpoints"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 3: 前端「评分记录」页
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `webapp/static/index.html`
|
||||||
|
- Modify: `webapp/static/js/api.js`
|
||||||
|
- Modify: `webapp/static/js/app.js`
|
||||||
|
- Create: `webapp/static/js/score_jobs.js`
|
||||||
|
|
||||||
|
**Interfaces:**
|
||||||
|
- Consumes: `GET /api/score/jobs`, `GET /api/score/jobs/{job_id}`
|
||||||
|
- Produces: `#view-scorejobs` section, `ScoreJobs` JS object
|
||||||
|
|
||||||
|
- [ ] **Step 1: Add API methods to `webapp/static/js/api.js`**
|
||||||
|
|
||||||
|
Add before the closing `};`:
|
||||||
|
```javascript
|
||||||
|
// 异步评分记录 API
|
||||||
|
scoreJobsAsync(body) { return API.post("/api/score/async", body); },
|
||||||
|
getScoreJob(jobId) { return API.get(`/api/score/jobs/${encodeURIComponent(jobId)}`); },
|
||||||
|
listScoreJobs() { return API.get("/api/score/jobs"); },
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: Add nav item and section to `webapp/static/index.html`**
|
||||||
|
|
||||||
|
In the `<nav class="nav">` block, add after the `profiles` nav-item and before the `apidocs` nav-item:
|
||||||
|
```html
|
||||||
|
<button class="nav-item" data-view="scorejobs">
|
||||||
|
<span class="nav-ico">📋</span><span>评分记录</span>
|
||||||
|
</button>
|
||||||
|
```
|
||||||
|
|
||||||
|
Add a new section before the `<!-- API 文档视图 -->` comment:
|
||||||
|
```html
|
||||||
|
<!-- 评分记录视图 -->
|
||||||
|
<section class="view" id="view-scorejobs" hidden>
|
||||||
|
<div class="panel">
|
||||||
|
<div class="panel-head">
|
||||||
|
<h2>评分记录</h2>
|
||||||
|
<span class="muted" style="font-size:13px">来自 Dify 异步评分任务(POST /api/score/async)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="scorejobs-container"></div>
|
||||||
|
<div class="empty" id="scorejobs-empty" hidden>
|
||||||
|
<p>暂无评分记录。</p>
|
||||||
|
<p class="muted">在 Dify 工作流中调用 <code>POST /api/score/async</code> 后,记录将在此显示。</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 3: Create `webapp/static/js/score_jobs.js`**
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// score_jobs.js — 评分记录页面逻辑(异步 RAGAS 评分结果列表)
|
||||||
|
|
||||||
|
const ScoreJobs = {
|
||||||
|
_pollTimers: {}, // job_id -> setInterval handle
|
||||||
|
|
||||||
|
async load() {
|
||||||
|
const container = document.getElementById("scorejobs-container");
|
||||||
|
const empty = document.getElementById("scorejobs-empty");
|
||||||
|
container.innerHTML = '<p class="muted">加载中…</p>';
|
||||||
|
try {
|
||||||
|
const data = await API.listScoreJobs();
|
||||||
|
const jobs = data.jobs || [];
|
||||||
|
container.innerHTML = "";
|
||||||
|
if (jobs.length === 0) {
|
||||||
|
empty.hidden = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
empty.hidden = true;
|
||||||
|
jobs.forEach(job => container.appendChild(ScoreJobs.renderRow(job)));
|
||||||
|
// Auto-poll any queued/running jobs
|
||||||
|
jobs.forEach(job => {
|
||||||
|
if (job.status === "queued" || job.status === "running") {
|
||||||
|
ScoreJobs._startPoll(job.job_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
|
container.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
renderRow(job) {
|
||||||
|
const row = document.createElement("div");
|
||||||
|
row.className = "panel score-job-row";
|
||||||
|
row.id = `score-job-${job.job_id}`;
|
||||||
|
row.innerHTML = ScoreJobs._rowHtml(job);
|
||||||
|
return row;
|
||||||
|
},
|
||||||
|
|
||||||
|
_rowHtml(job) {
|
||||||
|
const time = App.shortTime(job.created_at);
|
||||||
|
const question = App.escape((job.request_summary?.question || "—").slice(0, 50));
|
||||||
|
const metrics = (job.request_summary?.metrics || []).join(", ");
|
||||||
|
const statusBadge = `<span class="badge ${job.status}">${job.status}</span>`;
|
||||||
|
|
||||||
|
let scoreHtml = "";
|
||||||
|
if (job.status === "completed") {
|
||||||
|
scoreHtml = Object.entries(job.scores || {})
|
||||||
|
.map(([k, v]) => {
|
||||||
|
const cls = App.scoreClass(v);
|
||||||
|
const text = v === null || v === undefined ? "n/a" : Number(v).toFixed(3);
|
||||||
|
return `<span class="metric-chip" title="${App.escape(k)}">${App.escape(App.shortMetric(k))} <b class="${cls}">${text}</b></span>`;
|
||||||
|
})
|
||||||
|
.join(" ");
|
||||||
|
if (job.weighted_score !== null && job.weighted_score !== undefined) {
|
||||||
|
const cls = App.scoreClass(job.weighted_score);
|
||||||
|
scoreHtml += ` <span class="metric-chip">综合 <b class="${cls}">${Number(job.weighted_score).toFixed(3)}</b></span>`;
|
||||||
|
}
|
||||||
|
} else if (job.status === "failed") {
|
||||||
|
scoreHtml = `<span class="muted" style="color:var(--bad)">${App.escape(job.error || "未知错误")}</span>`;
|
||||||
|
} else {
|
||||||
|
scoreHtml = `<span class="muted">评分中…</span>`;
|
||||||
|
}
|
||||||
|
|
||||||
|
return `
|
||||||
|
<div class="run-card-head">
|
||||||
|
<div class="run-card-title">${question}</div>
|
||||||
|
<div>${statusBadge}</div>
|
||||||
|
</div>
|
||||||
|
<div class="run-card-meta">
|
||||||
|
<div>指标:${App.escape(metrics)} · ${time} · ${job.latency_ms}ms</div>
|
||||||
|
</div>
|
||||||
|
<div class="run-card-metrics">${scoreHtml}</div>
|
||||||
|
`;
|
||||||
|
},
|
||||||
|
|
||||||
|
_startPoll(jobId) {
|
||||||
|
if (ScoreJobs._pollTimers[jobId]) return;
|
||||||
|
ScoreJobs._pollTimers[jobId] = setInterval(async () => {
|
||||||
|
try {
|
||||||
|
const job = await API.getScoreJob(jobId);
|
||||||
|
const el = document.getElementById(`score-job-${jobId}`);
|
||||||
|
if (el) el.innerHTML = ScoreJobs._rowHtml(job);
|
||||||
|
if (job.status === "completed" || job.status === "failed") {
|
||||||
|
clearInterval(ScoreJobs._pollTimers[jobId]);
|
||||||
|
delete ScoreJobs._pollTimers[jobId];
|
||||||
|
}
|
||||||
|
} catch (_e) {
|
||||||
|
clearInterval(ScoreJobs._pollTimers[jobId]);
|
||||||
|
delete ScoreJobs._pollTimers[jobId];
|
||||||
|
}
|
||||||
|
}, 5000);
|
||||||
|
},
|
||||||
|
|
||||||
|
stopAllPolls() {
|
||||||
|
Object.values(ScoreJobs._pollTimers).forEach(t => clearInterval(t));
|
||||||
|
ScoreJobs._pollTimers = {};
|
||||||
|
},
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: Update `webapp/static/js/app.js`**
|
||||||
|
|
||||||
|
Add `"scorejobs"` to the `views` array and `titles` object:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
views: ["runs", "new", "report", "profiles", "scorejobs", "apidocs"],
|
||||||
|
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置", scorejobs: "评分记录", apidocs: "API 文档" },
|
||||||
|
```
|
||||||
|
|
||||||
|
Add in `_doSwitch` after `if (view === "profiles") Profiles.load();`:
|
||||||
|
```javascript
|
||||||
|
if (view === "scorejobs") ScoreJobs.load();
|
||||||
|
```
|
||||||
|
|
||||||
|
In `_doSwitch`, before the view-switching logic, call `ScoreJobs.stopAllPolls();` when leaving the `scorejobs` view so that no poll timers keep running in the background:
|
||||||
|
```javascript
|
||||||
|
// Stop score job pollers when leaving the scorejobs view
|
||||||
|
if (App.activeView === "scorejobs" && view !== "scorejobs") ScoreJobs.stopAllPolls();
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 5: Add script tag to `webapp/static/index.html`**
|
||||||
|
|
||||||
|
Add before `<script src="/static/js/app.js"></script>`:
|
||||||
|
```html
|
||||||
|
<script src="/static/js/score_jobs.js"></script>
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 6: Verify server boots**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -c "from webapp.server import create_app; create_app(); print('OK')"
|
||||||
|
```
|
||||||
|
Expected: `OK`
|
||||||
|
|
||||||
|
Also verify HTML has all new elements:
|
||||||
|
```
|
||||||
|
python -c "
|
||||||
|
c = open('webapp/static/index.html', encoding='utf-8').read()
|
||||||
|
assert 'view-scorejobs' in c
|
||||||
|
assert 'scorejobs-container' in c
|
||||||
|
assert '评分记录' in c
|
||||||
|
print('HTML OK')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 7: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add webapp/static/index.html webapp/static/js/api.js webapp/static/js/app.js webapp/static/js/score_jobs.js
|
||||||
|
git commit -m "feat: add 评分记录 page with async score job list and auto-polling"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 4: 全量回归测试 + Dify 说明注释
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `webapp/static/js/score_jobs.js` (minor: add Dify curl comment at top)
|
||||||
|
|
||||||
|
- [ ] **Step 1: Run full test suite**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/ -v --tb=short -q 2>&1 | tail -15
|
||||||
|
```
|
||||||
|
|
||||||
|
Pre-existing failures to ignore:
|
||||||
|
- `test_normalize_sample_pdf_offline_smoke_row`
|
||||||
|
- `test_evaluator_and_reporting_write_run_assets`
|
||||||
|
- `test_question_generator_rejects_invalid_json`
|
||||||
|
- `test_question_generator_rejects_non_list_samples`
|
||||||
|
|
||||||
|
Any other failure is a regression — fix before proceeding.
|
||||||
|
|
||||||
|
- [ ] **Step 2: Run targeted tests**
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m pytest tests/webapp/test_score_jobs_api.py tests/webapp/test_score_api.py tests/test_pipeline.py -v --tb=short
|
||||||
|
```
|
||||||
|
Expected: all PASS
|
||||||
|
|
||||||
|
- [ ] **Step 3: Final commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
git add .
|
||||||
|
git commit -m "feat: async score jobs complete — POST /api/score/async + 评分记录 page
|
||||||
|
|
||||||
|
- ScoreJobManager: thread pool + JSON persistence (outputs/score-jobs/)
|
||||||
|
- POST /api/score/async: 202 immediate response with job_id
|
||||||
|
- GET /api/score/jobs + GET /api/score/jobs/{id}: query endpoints
|
||||||
|
- Frontend: 评分记录 nav page with 5s auto-polling for pending jobs
|
||||||
|
- Dify integration: change /api/score → /api/score/async, remove response parsing
|
||||||
|
|
||||||
|
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>"
|
||||||
|
```
|
||||||
116
docs/superpowers/specs/2026-06-24-async-score-jobs-design.md
Normal file
116
docs/superpowers/specs/2026-06-24-async-score-jobs-design.md
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
# 异步评分记录功能设计
|
||||||
|
|
||||||
|
**日期**: 2026-06-24
|
||||||
|
**状态**: 已批准,待实现
|
||||||
|
**范围**: 新增 `POST /api/score/async` 异步评分端点,评分结果持久化到磁盘,前端新增「评分记录」页面展示。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
- Dify 工作流调用 `/api/score/async` 立即返回 `job_id`(202),不等待评分完成
|
||||||
|
- 后台异步执行 RAGAS 评分,结果写入 `outputs/score-jobs/<job_id>.json`
|
||||||
|
- RAGAS 平台新增「评分记录」导航页,列表展示所有评分记录及状态
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. 架构
|
||||||
|
|
||||||
|
```
|
||||||
|
Dify → POST /api/score/async → 202 {job_id, status:"queued"}
|
||||||
|
↓
|
||||||
|
ScoreJobManager (线程池)
|
||||||
|
↓
|
||||||
|
InlineScorer.score()
|
||||||
|
↓
|
||||||
|
outputs/score-jobs/<job_id>.json
|
||||||
|
↓
|
||||||
|
GET /api/score/jobs ← 前端「评分记录」页轮询
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. 存储格式
|
||||||
|
|
||||||
|
`outputs/score-jobs/<job_id>.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"job_id": "abc123def456",
|
||||||
|
"status": "completed",
|
||||||
|
"created_at": "2026-06-24T09:00:00+00:00",
|
||||||
|
"finished_at": "2026-06-24T09:00:15+00:00",
|
||||||
|
"request": {
|
||||||
|
"question": "双源CT的时间分辨率是多少?",
|
||||||
|
"answer": "双源CT的单扇区时间分辨率为75ms。",
|
||||||
|
"contexts": null,
|
||||||
|
"ground_truth": null,
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
"judge_model": "gpt-5",
|
||||||
|
"embedding_model": "text-embedding-3-small"
|
||||||
|
},
|
||||||
|
"scores": {"answer_relevancy": 0.9075},
|
||||||
|
"weighted_score": 0.9075,
|
||||||
|
"latency_ms": 12500,
|
||||||
|
"skipped_metrics": [],
|
||||||
|
"error": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. API 端点
|
||||||
|
|
||||||
|
### `POST /api/score/async`
|
||||||
|
|
||||||
|
请求体与 `POST /api/score` 完全相同(`ScoreRequest`)。
|
||||||
|
|
||||||
|
```json
|
||||||
|
// 立即返回 202
|
||||||
|
{"job_id": "abc123def456", "status": "queued"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GET /api/score/jobs`
|
||||||
|
|
||||||
|
返回所有评分记录,按创建时间倒序:
|
||||||
|
```json
|
||||||
|
{"jobs": [{...AsyncScoreJobStatus...}]}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GET /api/score/jobs/{job_id}`
|
||||||
|
|
||||||
|
返回单条评分记录详情。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. 新增文件
|
||||||
|
|
||||||
|
| 文件 | 职责 |
|
||||||
|
|------|------|
|
||||||
|
| `webapp/services/score_job_manager.py` | ScoreJobManager:线程池 + JSON 持久化 |
|
||||||
|
| `webapp/api/score_jobs.py` | 3 个端点路由 |
|
||||||
|
| `webapp/static/js/score_jobs.js` | 前端列表逻辑 + 轮询 |
|
||||||
|
|
||||||
|
## 6. 修改文件
|
||||||
|
|
||||||
|
| 文件 | 改动 |
|
||||||
|
|------|------|
|
||||||
|
| `webapp/models.py` | 新增 `AsyncScoreJobStatus`、`AsyncScoreJobResponse` |
|
||||||
|
| `webapp/server.py` | 注册 score_jobs router,更新 OPENAPI_TAGS |
|
||||||
|
| `webapp/static/index.html` | 新增导航项 + section |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. 前端「评分记录」页
|
||||||
|
|
||||||
|
列表列:时间 / 问题摘要(前50字)/ 指标 / 得分 / 状态
|
||||||
|
|
||||||
|
- 进入页面自动刷新
|
||||||
|
- `queued/running` 记录每 5 秒轮询 `GET /api/score/jobs/{id}` 更新状态
|
||||||
|
- 得分按 scoreClass(good/warn/bad)着色
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Dify 改造
|
||||||
|
|
||||||
|
只改 HTTP 节点 URL:`/api/score` → `/api/score/async`,删除解析响应的代码节点。
|
||||||
1
logs/online_eval.log
Normal file
1
logs/online_eval.log
Normal file
@@ -0,0 +1 @@
|
|||||||
|
Completed run: C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\outputs\online\siemens-pdf-question-bank
|
||||||
24
logs/server_2026-06-23.log
Normal file
24
logs/server_2026-06-23.log
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
2026-06-23 13:55:00 INFO webapp.server Starting RAGAS Console host=127.0.0.1 port=8800 log_level=info log_file=C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\logs\server_2026-06-23.log
|
||||||
|
2026-06-23 13:55:14 INFO uvicorn.error Started server process [83868]
|
||||||
|
2026-06-23 13:55:14 INFO uvicorn.error Waiting for application startup.
|
||||||
|
2026-06-23 13:55:14 INFO uvicorn.error Application startup complete.
|
||||||
|
2026-06-23 13:55:14 INFO uvicorn.error Uvicorn running on http://127.0.0.1:8800 (Press CTRL+C to quit)
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:53487 - "GET / HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:53487 - "GET /static/css/app.css HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:50321 - "GET /static/js/api.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:51325 - "GET /static/js/profiles.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:47 INFO uvicorn.access 127.0.0.1:59869 - "GET /static/js/report.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:48 INFO uvicorn.access 127.0.0.1:50980 - "GET /static/js/runner.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:48 INFO uvicorn.access 127.0.0.1:63223 - "GET /static/js/app.js HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:48 INFO webapp.access GET /docs → 200 (0ms)
|
||||||
|
2026-06-23 13:59:48 INFO uvicorn.access 127.0.0.1:63223 - "GET /docs HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:48 INFO webapp.access GET /api/health → 200 (0ms)
|
||||||
|
2026-06-23 13:59:48 INFO uvicorn.access 127.0.0.1:50321 - "GET /api/health HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:49 INFO webapp.api.runs [get_runs] found 19 runs
|
||||||
|
2026-06-23 13:59:49 INFO webapp.access GET /api/runs → 200 (1094ms)
|
||||||
|
2026-06-23 13:59:49 INFO uvicorn.access 127.0.0.1:63223 - "GET /api/runs HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:49 INFO webapp.access GET /openapi.json → 200 (94ms)
|
||||||
|
2026-06-23 13:59:49 INFO uvicorn.access 127.0.0.1:63223 - "GET /openapi.json HTTP/1.1" 200
|
||||||
|
2026-06-23 13:59:50 INFO webapp.api.llm_profiles [list_profiles] count=6
|
||||||
|
2026-06-23 13:59:50 INFO webapp.access GET /api/llm-profiles → 200 (0ms)
|
||||||
|
2026-06-23 13:59:50 INFO uvicorn.access 127.0.0.1:63223 - "GET /api/llm-profiles HTTP/1.1" 200
|
||||||
35
logs/siemens_build.log
Normal file
35
logs/siemens_build.log
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
[info] generating questions for: 315_1_Flash????????.pdf
|
||||||
|
[info] 315_1_Flash????????.pdf: 6 questions generated (total so far: 6)
|
||||||
|
[info] generating questions for: 316_2_Flash??????_??.pdf
|
||||||
|
[info] 316_2_Flash??????_??.pdf: 10 questions generated (total so far: 16)
|
||||||
|
[info] generating questions for: 317_3_Flash??????_??.pdf
|
||||||
|
[info] 317_3_Flash??????_??.pdf: 9 questions generated (total so far: 25)
|
||||||
|
[info] generating questions for: 318_4_Flash??????_???.pdf
|
||||||
|
[info] 318_4_Flash??????_???.pdf: 9 questions generated (total so far: 34)
|
||||||
|
[info] generating questions for: 319_5_Flash??????_?????.pdf
|
||||||
|
[info] 319_5_Flash??????_?????.pdf: 10 questions generated (total so far: 44)
|
||||||
|
[info] generating questions for: 320_6_Flash??????_??.pdf
|
||||||
|
[info] 320_6_Flash??????_??.pdf: 8 questions generated (total so far: 52)
|
||||||
|
[info] generating questions for: 321_??CT???????????--??.pdf
|
||||||
|
[info] 321_??CT???????????--??.pdf: 5 questions generated (total so far: 57)
|
||||||
|
[info] generating questions for: 322_??CT???????????--??????????.pdf
|
||||||
|
[info] 322_??CT???????????--??????????.pdf: 8 questions generated (total so far: 65)
|
||||||
|
[info] generating questions for: 323_??CT???????????--?????????.pdf
|
||||||
|
[info] 323_??CT???????????--?????????.pdf: 5 questions generated (total so far: 70)
|
||||||
|
[info] generating questions for: 324_??CT???????????--????????.pdf
|
||||||
|
[info] 324_??CT???????????--????????.pdf: 8 questions generated (total so far: 78)
|
||||||
|
[info] generating questions for: 325_??CT???????????--???????.pdf
|
||||||
|
[info] 325_??CT???????????--???????.pdf: 8 questions generated (total so far: 86)
|
||||||
|
[info] generating questions for: 326_??CT???????????--4D????.pdf
|
||||||
|
[info] 326_??CT???????????--4D????.pdf: 7 questions generated (total so far: 93)
|
||||||
|
[info] generating questions for: 327_??CT???????????--??????.pdf
|
||||||
|
[info] 327_??CT???????????--??????.pdf: 8 questions generated (total so far: 101)
|
||||||
|
[info] generating questions for: 749_????01_???????????.pdf
|
||||||
|
[info] 749_????01_???????????.pdf: 8 questions generated (total so far: 109)
|
||||||
|
[info] generating questions for: 804_????02-????????CT?????X-Map??.pdf
|
||||||
|
[info] 804_????02-????????CT?????X-Map??.pdf: 8 questions generated (total so far: 117)
|
||||||
|
[info] generating questions for: 805_????03_????????????????.pdf
|
||||||
|
[info] 805_????03_????????????????.pdf: 6 questions generated (total so far: 123)
|
||||||
|
[info] generating questions for: 807_???CT???????_SJ-L10.2??1-5.pdf
|
||||||
|
[info] 807_???CT???????_SJ-L10.2??1-5.pdf: 9 questions generated (total so far: 132)
|
||||||
|
Completed dataset build: C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\outputs\dataset-builds\siemens-pdf-question-bank\2026-06-15T09-28-35.302231+00-00
|
||||||
@@ -22,22 +22,31 @@ _PROMPT_TEMPLATE = """\
|
|||||||
|
|
||||||
## 报告要求
|
## 报告要求
|
||||||
|
|
||||||
1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
|
1. 按指标分节(## 指标名 [严重程度]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
|
||||||
2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
|
2. 严重程度说明:critical=严重(<阈值50%),warning=警告(<阈值70%),low=待优化(低于0.85,有提升空间)
|
||||||
3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先)
|
3. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
|
||||||
4. 语言简洁,面向工程师,不要废话,不要重复列表内容
|
4. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先),critical 和 warning 项优先于 low 项
|
||||||
|
5. 语言简洁,面向工程师,不要废话,不要重复列表内容
|
||||||
|
|
||||||
只输出 Markdown 报告正文,不要任何前置说明。
|
只输出 Markdown 报告正文,不要任何前置说明。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
_SEVERITY_LABEL_ZH: dict[str, str] = {
|
||||||
|
"critical": "严重",
|
||||||
|
"warning": "警告",
|
||||||
|
"low": "待优化",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
|
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
|
||||||
lines = []
|
lines = []
|
||||||
for d in diagnoses:
|
for d in diagnoses:
|
||||||
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
|
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
|
||||||
|
label = _SEVERITY_LABEL_ZH.get(d.severity, d.severity)
|
||||||
lines.append(
|
lines.append(
|
||||||
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f},"
|
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f},"
|
||||||
f"阈值={d.threshold},严重程度={d.severity}"
|
f"阈值={d.threshold},严重程度={label}"
|
||||||
)
|
)
|
||||||
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
|
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
|
||||||
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
|
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ class MetricRule:
|
|||||||
higher_is_better: bool # False for noise_sensitivity
|
higher_is_better: bool # False for noise_sensitivity
|
||||||
root_causes: list[str]
|
root_causes: list[str]
|
||||||
suggested_actions: list[str]
|
suggested_actions: list[str]
|
||||||
|
# Scores below this threshold trigger a "low" advisory (LLM suggestion requested).
|
||||||
|
# Only applies to higher_is_better metrics; noise_sensitivity uses existing thresholds.
|
||||||
|
advisory_threshold: float = 0.85
|
||||||
|
|
||||||
|
|
||||||
METRIC_RULES: dict[str, MetricRule] = {
|
METRIC_RULES: dict[str, MetricRule] = {
|
||||||
@@ -208,10 +211,14 @@ def diagnose(
|
|||||||
elif mean < rule.warning_threshold:
|
elif mean < rule.warning_threshold:
|
||||||
severity = "warning"
|
severity = "warning"
|
||||||
threshold = rule.warning_threshold
|
threshold = rule.warning_threshold
|
||||||
|
elif mean < rule.advisory_threshold:
|
||||||
|
# Score is acceptable but below 0.85 — request LLM optimization advice.
|
||||||
|
severity = "low"
|
||||||
|
threshold = rule.advisory_threshold
|
||||||
else:
|
else:
|
||||||
continue # above warning threshold → no diagnosis
|
continue # >= advisory_threshold → no diagnosis needed
|
||||||
else:
|
else:
|
||||||
# lower is better (noise_sensitivity)
|
# lower is better (noise_sensitivity): keep existing two-tier logic
|
||||||
if mean > rule.critical_threshold:
|
if mean > rule.critical_threshold:
|
||||||
severity = "critical"
|
severity = "critical"
|
||||||
threshold = rule.critical_threshold
|
threshold = rule.critical_threshold
|
||||||
|
|||||||
@@ -8,12 +8,22 @@ from .rules import Diagnosis
|
|||||||
|
|
||||||
logger = logging.getLogger("rag_eval.advisor")
|
logger = logging.getLogger("rag_eval.advisor")
|
||||||
|
|
||||||
|
# Chinese display labels for each severity tier.
|
||||||
|
_SEVERITY_LABEL: dict[str, str] = {
|
||||||
|
"critical": "严重",
|
||||||
|
"warning": "警告",
|
||||||
|
"low": "待优化",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
|
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
|
||||||
"""Return a single-line log summary of triggered diagnoses."""
|
"""Return a single-line log summary of triggered diagnoses."""
|
||||||
if not diagnoses:
|
if not diagnoses:
|
||||||
return "[advisor] 所有指标正常,无需优化建议。"
|
return "[advisor] 所有指标正常,无需优化建议。"
|
||||||
parts = [f"{d.metric}({d.mean_score:.2f}, {d.severity})" for d in diagnoses]
|
parts = [
|
||||||
|
f"{d.metric}({d.mean_score:.2f},{_SEVERITY_LABEL.get(d.severity, d.severity)})"
|
||||||
|
for d in diagnoses
|
||||||
|
]
|
||||||
triggered = " ".join(parts)
|
triggered = " ".join(parts)
|
||||||
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered} → {advice_path}"
|
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered} → {advice_path}"
|
||||||
|
|
||||||
@@ -24,7 +34,8 @@ def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
|
|||||||
return ""
|
return ""
|
||||||
lines = ["## 规则诊断(LLM 分析不可用)\n"]
|
lines = ["## 规则诊断(LLM 分析不可用)\n"]
|
||||||
for d in diagnoses:
|
for d in diagnoses:
|
||||||
lines.append(f"### {d.metric} [{d.severity}] 均值={d.mean_score:.4f}")
|
label = _SEVERITY_LABEL.get(d.severity, d.severity)
|
||||||
|
lines.append(f"### {d.metric} [{label}] 均值={d.mean_score:.4f}")
|
||||||
lines.append("\n**可能原因:**")
|
lines.append("\n**可能原因:**")
|
||||||
for cause in d.root_causes:
|
for cause in d.root_causes:
|
||||||
lines.append(f"- {cause}")
|
lines.append(f"- {cause}")
|
||||||
|
|||||||
@@ -180,12 +180,12 @@ class Evaluator:
|
|||||||
record["judge_model"] = self.scenario.judge_model
|
record["judge_model"] = self.scenario.judge_model
|
||||||
record["embedding_model"] = self.scenario.embedding_model
|
record["embedding_model"] = self.scenario.embedding_model
|
||||||
record["run_id"] = self.scenario.scenario_name
|
record["run_id"] = self.scenario.scenario_name
|
||||||
# Weighted score columns — enable post-hoc weighted aggregation in reporting.
|
# 综合加权得分列(已暂时禁用)
|
||||||
record["weighted_score"] = compute_weighted_score(
|
# record["weighted_score"] = compute_weighted_score(
|
||||||
score.metrics, self.scenario.metric_weights
|
# score.metrics, self.scenario.metric_weights
|
||||||
)
|
# )
|
||||||
doc_name = str(sample.metadata.get("doc_name", "") or "")
|
# doc_name = str(sample.metadata.get("doc_name", "") or "")
|
||||||
record["sample_weight"] = resolve_weight(
|
# record["sample_weight"] = resolve_weight(
|
||||||
self.scenario.doc_weights, doc_name, default=1.0
|
# self.scenario.doc_weights, doc_name, default=1.0
|
||||||
)
|
# )
|
||||||
return record
|
return record
|
||||||
|
|||||||
@@ -27,14 +27,55 @@ from ragas.metrics.collections import (
|
|||||||
from .pipeline import MetricPipeline
|
from .pipeline import MetricPipeline
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_openai_client_kwargs(
|
||||||
|
judge_model: str,
|
||||||
|
settings: EvaluationSettings,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Return AsyncOpenAI kwargs, preferring a matching LLM Profile over .env settings.
|
||||||
|
|
||||||
|
Lookup order:
|
||||||
|
1. LLM Profile whose model name equals judge_model (exact match)
|
||||||
|
2. Fall back to EvaluationSettings (.env)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Lazy import to avoid circular dependency (webapp -> rag_eval is one-way).
|
||||||
|
from webapp.services.profile_manager import profile_manager
|
||||||
|
profiles = profile_manager.list_all()
|
||||||
|
for profile in profiles:
|
||||||
|
if profile.model == judge_model:
|
||||||
|
kwargs: dict[str, Any] = {
|
||||||
|
"api_key": profile.api_key or "sk-placeholder",
|
||||||
|
"timeout": float(profile.timeout_seconds or 30),
|
||||||
|
}
|
||||||
|
if profile.base_url and profile.base_url.strip():
|
||||||
|
kwargs["base_url"] = profile.base_url.strip()
|
||||||
|
return kwargs
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
# If profile lookup fails for any reason, fall through to .env settings.
|
||||||
|
pass
|
||||||
|
|
||||||
|
return settings.openai_client_kwargs
|
||||||
|
|
||||||
|
|
||||||
def build_models(
|
def build_models(
|
||||||
judge_model: str,
|
judge_model: str,
|
||||||
embedding_model: str,
|
embedding_model: str,
|
||||||
settings: EvaluationSettings,
|
settings: EvaluationSettings,
|
||||||
) -> tuple[Any, Any]:
|
) -> tuple[Any, Any]:
|
||||||
"""Create the LLM and embedding clients required by the selected RAGAS metrics."""
|
"""Create the LLM and embedding clients required by the selected RAGAS metrics.
|
||||||
client = AsyncOpenAI(**settings.openai_client_kwargs)
|
|
||||||
llm = llm_factory(judge_model, client=client)
|
Dynamically resolves connection settings from the stored LLM Profiles first
|
||||||
|
(matched by model name), falling back to .env settings when no profile matches.
|
||||||
|
"""
|
||||||
|
client_kwargs = _resolve_openai_client_kwargs(judge_model, settings)
|
||||||
|
client = AsyncOpenAI(**client_kwargs)
|
||||||
|
# RAGAS structured-output judge calls can be truncated by the upstream default
|
||||||
|
# 1024 completion budget, especially for faithfulness and GPT-5 family models.
|
||||||
|
llm = llm_factory(
|
||||||
|
judge_model,
|
||||||
|
client=client,
|
||||||
|
max_tokens=max(1, int(settings.ragas_llm_max_tokens)),
|
||||||
|
)
|
||||||
embeddings = embedding_factory(provider="openai", model=embedding_model, client=client)
|
embeddings = embedding_factory(provider="openai", model=embedding_model, client=client)
|
||||||
return llm, embeddings
|
return llm, embeddings
|
||||||
|
|
||||||
|
|||||||
@@ -75,15 +75,16 @@ def build_summary_markdown(result: EvaluationResult) -> str:
|
|||||||
else:
|
else:
|
||||||
lines.append(f"- {metric}: `n/a`{weight_note}")
|
lines.append(f"- {metric}: `n/a`{weight_note}")
|
||||||
|
|
||||||
if has_weights:
|
# 综合加权得分(已暂时禁用)
|
||||||
overall_ws = compute_overall_weighted_score_mean(
|
# if has_weights:
|
||||||
score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
|
# overall_ws = compute_overall_weighted_score_mean(
|
||||||
)
|
# score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
|
||||||
weight_suffix = " (加权)"
|
# )
|
||||||
if overall_ws is not None and not math.isnan(overall_ws):
|
# weight_suffix = " (加权)"
|
||||||
lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
|
# if overall_ws is not None and not math.isnan(overall_ws):
|
||||||
else:
|
# lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
|
||||||
lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
|
# else:
|
||||||
|
# lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
|
||||||
|
|
||||||
detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
|
detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
|
||||||
existing_columns = [c for c in detail_columns if c in scores.columns]
|
existing_columns = [c for c in detail_columns if c in scores.columns]
|
||||||
|
|||||||
@@ -26,6 +26,11 @@ class EvaluationSettings(BaseSettings):
|
|||||||
default="text-embedding-3-small",
|
default="text-embedding-3-small",
|
||||||
alias="RAGAS_EMBEDDING_MODEL",
|
alias="RAGAS_EMBEDDING_MODEL",
|
||||||
)
|
)
|
||||||
|
ragas_llm_max_tokens: int = Field(
|
||||||
|
default=4096,
|
||||||
|
alias="RAGAS_LLM_MAX_TOKENS",
|
||||||
|
gt=0,
|
||||||
|
)
|
||||||
openai_timeout_seconds: float = Field(default=30.0, alias="OPENAI_TIMEOUT_SECONDS")
|
openai_timeout_seconds: float = Field(default=30.0, alias="OPENAI_TIMEOUT_SECONDS")
|
||||||
ragas_metric_timeout_seconds: float = Field(default=45.0, alias="RAGAS_METRIC_TIMEOUT_SECONDS")
|
ragas_metric_timeout_seconds: float = Field(default=45.0, alias="RAGAS_METRIC_TIMEOUT_SECONDS")
|
||||||
batch_size: int = Field(default=8, alias="BATCH_SIZE")
|
batch_size: int = Field(default=8, alias="BATCH_SIZE")
|
||||||
|
|||||||
53
rag_eval/shared/profile_store.py
Normal file
53
rag_eval/shared/profile_store.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
"""Lightweight read-only accessor for configs/llm_profiles.json.
|
||||||
|
|
||||||
|
Kept in ``rag_eval`` (not ``webapp``) so the runner can look up per-model
|
||||||
|
credentials without depending on the webapp layer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_PROFILES_PATH = Path(__file__).resolve().parents[2] / "configs" / "llm_profiles.json"
|
||||||
|
|
||||||
|
|
||||||
|
def find_by_model(model_name: str) -> dict[str, Any] | None:
|
||||||
|
"""Return the first profile whose ``model`` field matches *model_name*, or None.
|
||||||
|
|
||||||
|
Returns None (without raising) when the profiles file does not exist or
|
||||||
|
cannot be parsed — callers fall back to environment-variable defaults.
|
||||||
|
"""
|
||||||
|
if not _PROFILES_PATH.exists():
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
data = json.loads(_PROFILES_PATH.read_text(encoding="utf-8"))
|
||||||
|
for profile in data.get("profiles", []):
|
||||||
|
if profile.get("model") == model_name:
|
||||||
|
return profile
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
logger.warning("[profile_store] failed to read %s: %s", _PROFILES_PATH, exc)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def profile_to_client_kwargs(
|
||||||
|
profile: dict[str, Any],
|
||||||
|
fallback_api_key: str | None,
|
||||||
|
fallback_timeout: float,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Convert a profile dict into keyword arguments for ``openai.AsyncOpenAI``.
|
||||||
|
|
||||||
|
Fields present in the profile override the supplied fallback values.
|
||||||
|
"""
|
||||||
|
kwargs: dict[str, Any] = {
|
||||||
|
"api_key": profile.get("api_key") or fallback_api_key or "",
|
||||||
|
"timeout": float(profile.get("timeout_seconds") or fallback_timeout),
|
||||||
|
}
|
||||||
|
base_url = (profile.get("base_url") or "").strip()
|
||||||
|
if base_url:
|
||||||
|
kwargs["base_url"] = base_url
|
||||||
|
return kwargs
|
||||||
1101
siemens-ragas-project-overview.html
Normal file
1101
siemens-ragas-project-overview.html
Normal file
File diff suppressed because it is too large
Load Diff
@@ -10,10 +10,38 @@ class TestDiagnosis(unittest.TestCase):
|
|||||||
for i, s in enumerate(scores)]
|
for i, s in enumerate(scores)]
|
||||||
|
|
||||||
def test_no_diagnosis_when_all_scores_above_threshold(self):
|
def test_no_diagnosis_when_all_scores_above_threshold(self):
|
||||||
|
# Mean exactly 0.85 should NOT trigger any diagnosis (< 0.85 is the condition).
|
||||||
rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
|
rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
|
||||||
result = diagnose(rows, metrics=["faithfulness"])
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
self.assertEqual(result, [])
|
self.assertEqual(result, [])
|
||||||
|
|
||||||
|
def test_no_diagnosis_when_mean_above_advisory_threshold(self):
|
||||||
|
rows = self._make_rows("answer_relevancy", [0.9, 0.92, 0.88])
|
||||||
|
result = diagnose(rows, metrics=["answer_relevancy"])
|
||||||
|
self.assertEqual(result, [])
|
||||||
|
|
||||||
|
def test_low_severity_when_mean_below_advisory_threshold(self):
|
||||||
|
# Score between warning_threshold (0.7) and advisory_threshold (0.85) → "low"
|
||||||
|
rows = self._make_rows("faithfulness", [0.78, 0.80, 0.82])
|
||||||
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0].severity, "low")
|
||||||
|
self.assertAlmostEqual(result[0].threshold, 0.85, places=2)
|
||||||
|
|
||||||
|
def test_low_severity_answer_relevancy_at_0_84(self):
|
||||||
|
rows = self._make_rows("answer_relevancy", [0.84, 0.84, 0.84])
|
||||||
|
result = diagnose(rows, metrics=["answer_relevancy"])
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0].severity, "low")
|
||||||
|
|
||||||
|
def test_low_severity_has_root_causes_and_actions(self):
|
||||||
|
rows = self._make_rows("context_precision", [0.75, 0.76, 0.77])
|
||||||
|
result = diagnose(rows, metrics=["context_precision"])
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0].severity, "low")
|
||||||
|
self.assertTrue(len(result[0].root_causes) > 0)
|
||||||
|
self.assertTrue(len(result[0].suggested_actions) > 0)
|
||||||
|
|
||||||
def test_warning_when_mean_below_warning_threshold(self):
|
def test_warning_when_mean_below_warning_threshold(self):
|
||||||
rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
|
rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
|
||||||
result = diagnose(rows, metrics=["faithfulness"])
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
|
|||||||
@@ -91,9 +91,9 @@ class TestWriteAdvice(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
summary = _format_log_summary(diags, self.advice_path)
|
summary = _format_log_summary(diags, self.advice_path)
|
||||||
self.assertIn("faithfulness", summary)
|
self.assertIn("faithfulness", summary)
|
||||||
self.assertIn("critical", summary)
|
self.assertIn("严重", summary) # "critical" maps to Chinese label
|
||||||
self.assertIn("context_recall", summary)
|
self.assertIn("context_recall", summary)
|
||||||
self.assertIn("warning", summary)
|
self.assertIn("警告", summary) # "warning" maps to Chinese label
|
||||||
|
|
||||||
def test_write_empty_diagnoses_still_creates_file(self):
|
def test_write_empty_diagnoses_still_creates_file(self):
|
||||||
write_advice(
|
write_advice(
|
||||||
|
|||||||
68
tests/test_metric_presenter.py
Normal file
68
tests/test_metric_presenter.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||||
|
|
||||||
|
|
||||||
|
def _run_node(script: str) -> str:
|
||||||
|
"""Execute a short Node.js script and return stdout."""
|
||||||
|
completed = subprocess.run(
|
||||||
|
["node", "-e", script],
|
||||||
|
cwd=REPO_ROOT,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
encoding="utf-8",
|
||||||
|
check=True,
|
||||||
|
)
|
||||||
|
return completed.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def test_metric_presenter_applies_thresholds_and_noise_direction() -> None:
|
||||||
|
"""MetricPresenter should centralize thresholds and inverse noise semantics."""
|
||||||
|
metric_js = (REPO_ROOT / "webapp" / "static" / "js" / "metric_presenter.js").as_posix()
|
||||||
|
script = f"""
|
||||||
|
const fs = require("fs");
|
||||||
|
const vm = require("vm");
|
||||||
|
const code = fs.readFileSync("{metric_js}", "utf8");
|
||||||
|
const sandbox = {{ window: {{}}, console }};
|
||||||
|
vm.runInNewContext(code, sandbox);
|
||||||
|
const p = sandbox.window.MetricPresenter;
|
||||||
|
const result = {{
|
||||||
|
faith085: p.scoreClass("faithfulness", 0.85),
|
||||||
|
faith070: p.scoreClass("faithfulness", 0.70),
|
||||||
|
faith064: p.scoreClass("faithfulness", 0.64),
|
||||||
|
noise010: p.scoreClass("noise_sensitivity", 0.10),
|
||||||
|
noise030: p.scoreClass("noise_sensitivity", 0.30),
|
||||||
|
noise050: p.scoreClass("noise_sensitivity", 0.50),
|
||||||
|
desc: p.describeMetric("faithfulness"),
|
||||||
|
noiseDesc: p.describeMetric("noise_sensitivity"),
|
||||||
|
noiseBin: p.binColor("noise_sensitivity", 0.0),
|
||||||
|
faithBin: p.binColor("faithfulness", 0.8)
|
||||||
|
}};
|
||||||
|
console.log(JSON.stringify(result));
|
||||||
|
"""
|
||||||
|
output = _run_node(script)
|
||||||
|
assert '"faith085":"good"' in output
|
||||||
|
assert '"faith070":"warn"' in output
|
||||||
|
assert '"faith064":"bad"' in output
|
||||||
|
assert '"noise010":"good"' in output
|
||||||
|
assert '"noise030":"warn"' in output
|
||||||
|
assert '"noise050":"bad"' in output
|
||||||
|
assert '"desc":"' in output
|
||||||
|
assert '"noiseDesc":"' in output
|
||||||
|
assert '"noiseBin":"#16a34a"' in output
|
||||||
|
assert '"faithBin":"#16a34a"' in output
|
||||||
|
|
||||||
|
|
||||||
|
def test_report_and_index_load_metric_presenter_helper() -> None:
|
||||||
|
"""The report page should use the shared helper for card descriptions and colors."""
|
||||||
|
index_html = (REPO_ROOT / "webapp" / "static" / "index.html").read_text(encoding="utf-8")
|
||||||
|
report_js = (REPO_ROOT / "webapp" / "static" / "js" / "report.js").read_text(encoding="utf-8")
|
||||||
|
app_js = (REPO_ROOT / "webapp" / "static" / "js" / "app.js").read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
assert "js/metric_presenter.js" in index_html
|
||||||
|
assert "MetricPresenter.describeMetric" in report_js
|
||||||
|
assert "MetricPresenter.scoreClass" in app_js
|
||||||
@@ -184,7 +184,7 @@ class ScenarioAndDatasetTests(unittest.TestCase):
|
|||||||
|
|
||||||
class EvaluatorAndReportingTests(unittest.TestCase):
|
class EvaluatorAndReportingTests(unittest.TestCase):
|
||||||
def test_merge_score_includes_weighted_score_and_sample_weight(self):
|
def test_merge_score_includes_weighted_score_and_sample_weight(self):
|
||||||
"""_merge_score adds weighted_score and sample_weight columns."""
|
"""_merge_score no longer adds weighted_score/sample_weight (feature disabled)."""
|
||||||
from unittest.mock import MagicMock
|
from unittest.mock import MagicMock
|
||||||
from rag_eval.execution.evaluator import Evaluator
|
from rag_eval.execution.evaluator import Evaluator
|
||||||
from rag_eval.shared.models import (
|
from rag_eval.shared.models import (
|
||||||
@@ -212,9 +212,11 @@ class EvaluatorAndReportingTests(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
|
score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
|
||||||
row = evaluator._merge_score(sample, score)
|
row = evaluator._merge_score(sample, score)
|
||||||
# (3*1.0 + 1*0.0) / (3+1) = 0.75
|
# 综合加权得分已暂时禁用,weighted_score 和 sample_weight 不再写入
|
||||||
assert abs(row["weighted_score"] - 0.75) < 1e-4
|
assert "weighted_score" not in row
|
||||||
assert row["sample_weight"] == 2.0
|
assert "sample_weight" not in row
|
||||||
|
assert row["faithfulness"] == 1.0
|
||||||
|
assert row["context_recall"] == 0.0
|
||||||
|
|
||||||
def test_summary_markdown_shows_weighted_score(self):
|
def test_summary_markdown_shows_weighted_score(self):
|
||||||
"""build_summary_markdown includes weighted_score when metric_weights set."""
|
"""build_summary_markdown includes weighted_score when metric_weights set."""
|
||||||
|
|||||||
280
tests/test_pipeline.py
Normal file
280
tests/test_pipeline.py
Normal file
@@ -0,0 +1,280 @@
|
|||||||
|
"""Tests for the end-to-end pipeline API and pipeline task manager."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
# ── fixtures ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def client(tmp_path, monkeypatch):
|
||||||
|
"""TestClient with a fresh PipelineTaskManager backed by tmp_path outputs."""
|
||||||
|
import webapp.services.pipeline_task_manager as mgr_mod
|
||||||
|
from webapp.services.pipeline_task_manager import PipelineTaskManager
|
||||||
|
|
||||||
|
fresh_mgr = PipelineTaskManager(max_workers=2)
|
||||||
|
monkeypatch.setattr(mgr_mod, "pipeline_task_manager", fresh_mgr)
|
||||||
|
monkeypatch.setattr(mgr_mod, "_PIPELINE_OUTPUT_ROOT", tmp_path / "pipeline")
|
||||||
|
|
||||||
|
import webapp.api.pipeline as api_mod
|
||||||
|
monkeypatch.setattr(api_mod, "pipeline_task_manager", fresh_mgr)
|
||||||
|
|
||||||
|
from webapp.server import create_app
|
||||||
|
return TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
def _minimal_pdf_dir(tmp_path: Path) -> Path:
|
||||||
|
"""Create a temp directory that looks like a PDF folder (empty, valid dir)."""
|
||||||
|
d = tmp_path / "pdfs"
|
||||||
|
d.mkdir()
|
||||||
|
return d
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_build_result(tmp_path: Path, job, run_id="r1"):
|
||||||
|
"""Return a fake DatasetBuildResult with a minimal dataset CSV."""
|
||||||
|
from rag_eval.dataset_builder.models import (
|
||||||
|
DatasetBuildArtifactPaths,
|
||||||
|
DatasetBuildResult,
|
||||||
|
DraftQuestionSample,
|
||||||
|
)
|
||||||
|
|
||||||
|
artifact_root = tmp_path / "build" / run_id
|
||||||
|
artifact_root.mkdir(parents=True, exist_ok=True)
|
||||||
|
latest = tmp_path / "build" / "latest"
|
||||||
|
latest.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
chunks_path = artifact_root / "source_chunks.jsonl"
|
||||||
|
chunks_path.write_text(
|
||||||
|
json.dumps({"chunk_id": "c1", "doc_id": "d1", "doc_name": "test.pdf",
|
||||||
|
"text": "CT scan context.", "page_start": 1, "page_end": 1,
|
||||||
|
"section_path": "/", "section_title": "", "source_layout_ids": []}) + "\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
(latest / "source_chunks.jsonl").write_text(chunks_path.read_text(encoding="utf-8"), encoding="utf-8")
|
||||||
|
|
||||||
|
dataset_csv = tmp_path / "generated_dataset.csv"
|
||||||
|
dataset_csv.write_text(
|
||||||
|
"sample_id,question,ground_truth,scenario,language,doc_id,doc_name,"
|
||||||
|
"section_path,page_start,page_end,source_chunk_ids,question_type,difficulty,"
|
||||||
|
"review_status,review_notes\n"
|
||||||
|
's1,"What is CT?","CT is imaging.","test","zh","d1","test.pdf","/",'
|
||||||
|
'1,1,"[""c1""]","fact","easy","draft",""\n',
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
sample = DraftQuestionSample(
|
||||||
|
sample_id="s1", question="What is CT?", ground_truth="CT is imaging.",
|
||||||
|
scenario="test", language="zh", doc_id="d1", doc_name="test.pdf",
|
||||||
|
section_path="/", page_start=1, page_end=1, source_chunk_ids=["c1"],
|
||||||
|
question_type="fact", difficulty="easy",
|
||||||
|
)
|
||||||
|
|
||||||
|
artifact_paths = DatasetBuildArtifactPaths(
|
||||||
|
root_dir=artifact_root,
|
||||||
|
documents_jsonl=artifact_root / "documents.jsonl",
|
||||||
|
semantic_blocks_jsonl=artifact_root / "semantic_blocks.jsonl",
|
||||||
|
source_chunks_jsonl=chunks_path,
|
||||||
|
dataset_draft_csv=artifact_root / "dataset_draft.csv",
|
||||||
|
parse_failures_csv=artifact_root / "parse_failures.csv",
|
||||||
|
metadata_json=artifact_root / "metadata.json",
|
||||||
|
)
|
||||||
|
return DatasetBuildResult(
|
||||||
|
job=job,
|
||||||
|
run_id=run_id,
|
||||||
|
artifact_paths=artifact_paths,
|
||||||
|
documents=[],
|
||||||
|
draft_samples=[sample],
|
||||||
|
parse_failures=[],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_eval_result(tmp_path: Path, scenario):
|
||||||
|
"""Return a fake EvaluationResult."""
|
||||||
|
from rag_eval.shared.models import EvaluationResult
|
||||||
|
|
||||||
|
return EvaluationResult(
|
||||||
|
scenario=scenario,
|
||||||
|
run_id="eval-r1",
|
||||||
|
started_at="2026-01-01T00:00:00",
|
||||||
|
finished_at="2026-01-01T00:01:00",
|
||||||
|
valid_samples=[],
|
||||||
|
invalid_samples=[],
|
||||||
|
score_rows=[],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── API route tests ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_submit_returns_202_and_job_id(client, tmp_path):
|
||||||
|
"""POST /api/pipeline/jobs returns 202 with job_id immediately."""
|
||||||
|
pdf_dir = _minimal_pdf_dir(tmp_path)
|
||||||
|
|
||||||
|
with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec:
|
||||||
|
from webapp.models import PipelineResult
|
||||||
|
mock_exec.return_value = PipelineResult(
|
||||||
|
build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
|
||||||
|
source_chunks_jsonl="/tmp/c.jsonl", total_questions=1,
|
||||||
|
parse_failures=0, eval_run_id="r1", eval_output_dir="/tmp/e",
|
||||||
|
scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
|
||||||
|
)
|
||||||
|
resp = client.post("/api/pipeline/jobs", json={
|
||||||
|
"docs_path": str(pdf_dir),
|
||||||
|
"job_name": "test-job",
|
||||||
|
})
|
||||||
|
|
||||||
|
assert resp.status_code == 202
|
||||||
|
data = resp.json()
|
||||||
|
assert "job_id" in data
|
||||||
|
assert data["job_name"] == "test-job"
|
||||||
|
# status may already be completed by the time the response is read (mock runs instantly)
|
||||||
|
assert data["status"] in ("queued", "completed")
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_nonexistent_job_returns_404(client):
|
||||||
|
"""GET /api/pipeline/jobs/{id} returns 404 for unknown job."""
|
||||||
|
resp = client.get("/api/pipeline/jobs/doesnotexist")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_jobs_returns_empty_initially(client):
|
||||||
|
"""GET /api/pipeline/jobs returns empty list when no jobs submitted."""
|
||||||
|
resp = client.get("/api/pipeline/jobs")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["jobs"] == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_job_status_polling(client, tmp_path):
|
||||||
|
"""Submitted job becomes visible via GET /api/pipeline/jobs/{id}."""
|
||||||
|
pdf_dir = _minimal_pdf_dir(tmp_path)
|
||||||
|
|
||||||
|
with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec:
|
||||||
|
from webapp.models import PipelineResult
|
||||||
|
mock_exec.return_value = PipelineResult(
|
||||||
|
build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
|
||||||
|
source_chunks_jsonl="/tmp/c.jsonl", total_questions=3,
|
||||||
|
parse_failures=0, eval_run_id="r2", eval_output_dir="/tmp/e",
|
||||||
|
scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
|
||||||
|
)
|
||||||
|
post_resp = client.post("/api/pipeline/jobs", json={"docs_path": str(pdf_dir)})
|
||||||
|
|
||||||
|
job_id = post_resp.json()["job_id"]
|
||||||
|
|
||||||
|
# Poll until done or timeout (max 5s for mock)
|
||||||
|
for _ in range(20):
|
||||||
|
status_resp = client.get(f"/api/pipeline/jobs/{job_id}")
|
||||||
|
assert status_resp.status_code == 200
|
||||||
|
status = status_resp.json()
|
||||||
|
if status["status"] in ("completed", "failed"):
|
||||||
|
break
|
||||||
|
time.sleep(0.25)
|
||||||
|
|
||||||
|
assert status["status"] == "completed"
|
||||||
|
assert status["result"]["total_questions"] == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_job_fails_on_invalid_docs_path(client):
|
||||||
|
"""Job fails quickly if docs_path does not exist."""
|
||||||
|
resp = client.post("/api/pipeline/jobs", json={
|
||||||
|
"docs_path": "/nonexistent/path/that/does/not/exist",
|
||||||
|
})
|
||||||
|
assert resp.status_code == 202
|
||||||
|
job_id = resp.json()["job_id"]
|
||||||
|
|
||||||
|
for _ in range(20):
|
||||||
|
status_resp = client.get(f"/api/pipeline/jobs/{job_id}")
|
||||||
|
status = status_resp.json()
|
||||||
|
if status["status"] in ("completed", "failed"):
|
||||||
|
break
|
||||||
|
time.sleep(0.25)
|
||||||
|
|
||||||
|
assert status["status"] == "failed"
|
||||||
|
assert "docs_path" in status["error"] or "not" in status["error"].lower()
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_jobs_shows_submitted(client, tmp_path):
|
||||||
|
"""GET /api/pipeline/jobs includes jobs after submission."""
|
||||||
|
pdf_dir = _minimal_pdf_dir(tmp_path)
|
||||||
|
|
||||||
|
with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec:
|
||||||
|
from webapp.models import PipelineResult
|
||||||
|
mock_exec.return_value = PipelineResult(
|
||||||
|
build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
|
||||||
|
source_chunks_jsonl="/tmp/c.jsonl", total_questions=1,
|
||||||
|
parse_failures=0, eval_run_id="r3", eval_output_dir="/tmp/e",
|
||||||
|
scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
|
||||||
|
)
|
||||||
|
client.post("/api/pipeline/jobs", json={"docs_path": str(pdf_dir), "job_name": "listed-job"})
|
||||||
|
|
||||||
|
time.sleep(0.5)
|
||||||
|
list_resp = client.get("/api/pipeline/jobs")
|
||||||
|
assert list_resp.status_code == 200
|
||||||
|
jobs = list_resp.json()["jobs"]
|
||||||
|
assert len(jobs) >= 1
|
||||||
|
names = [j["job_name"] for j in jobs]
|
||||||
|
assert "listed-job" in names
|
||||||
|
|
||||||
|
|
||||||
|
# ── execute_dataset_build_job refactor test ────────────────────────────────────
|
||||||
|
|
||||||
|
def test_execute_dataset_build_job_directly(tmp_path):
|
||||||
|
"""execute_dataset_build_job runs the build without a YAML file."""
|
||||||
|
from unittest.mock import patch as _patch
|
||||||
|
from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
|
||||||
|
from rag_eval.dataset_builder.runner import execute_dataset_build_job
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
pdf_dir = tmp_path / "pdfs"
|
||||||
|
pdf_dir.mkdir()
|
||||||
|
(pdf_dir / "doc.pdf").write_bytes(b"%PDF-fake")
|
||||||
|
|
||||||
|
job = DatasetBuildJob(
|
||||||
|
job_name="direct-test",
|
||||||
|
input_path=pdf_dir,
|
||||||
|
input_glob="*.pdf",
|
||||||
|
parser_provider="aliyun_docmind",
|
||||||
|
failure_mode="skip",
|
||||||
|
generation_model="test-model",
|
||||||
|
output_type="online_question_bank",
|
||||||
|
review_mode="draft_with_manual_review",
|
||||||
|
max_questions_per_document=5,
|
||||||
|
max_source_chunks_per_question=3,
|
||||||
|
dataset_path=tmp_path / "out.csv",
|
||||||
|
artifact_dir=tmp_path / "artifacts",
|
||||||
|
runtime=DatasetBuildRuntime(max_documents=1),
|
||||||
|
)
|
||||||
|
|
||||||
|
mock_doc = MagicMock()
|
||||||
|
mock_doc.doc_id = "d1"
|
||||||
|
mock_doc.doc_name = "doc.pdf"
|
||||||
|
mock_doc.source_chunks = []
|
||||||
|
mock_doc.semantic_blocks = []
|
||||||
|
mock_doc.raw_text = ""
|
||||||
|
mock_doc.structure_nodes = []
|
||||||
|
mock_doc.metadata = {}
|
||||||
|
mock_doc.to_record.return_value = {
|
||||||
|
"doc_id": "d1", "doc_name": "doc.pdf", "raw_text": "",
|
||||||
|
"structure_nodes": [], "metadata": {},
|
||||||
|
"semantic_block_count": 0, "source_chunk_count": 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
mock_parser = MagicMock()
|
||||||
|
mock_parser.parse.return_value = mock_doc
|
||||||
|
|
||||||
|
mock_generator = MagicMock()
|
||||||
|
mock_generator.generate.return_value = []
|
||||||
|
|
||||||
|
result = execute_dataset_build_job(
|
||||||
|
job,
|
||||||
|
settings=EvaluationSettings(_env_file=None),
|
||||||
|
parser=mock_parser,
|
||||||
|
generator=mock_generator,
|
||||||
|
)
|
||||||
|
assert result.job.job_name == "direct-test"
|
||||||
|
assert result.artifact_paths.root_dir.exists()
|
||||||
@@ -65,7 +65,8 @@ def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path:
|
|||||||
"faithfulness": pytest.approx(0.75, rel=1e-4),
|
"faithfulness": pytest.approx(0.75, rel=1e-4),
|
||||||
"context_recall": pytest.approx(0.5, rel=1e-4),
|
"context_recall": pytest.approx(0.5, rel=1e-4),
|
||||||
}
|
}
|
||||||
assert report.weighted_score_mean == pytest.approx(0.6667, rel=1e-4)
|
# 综合加权得分已暂时禁用
|
||||||
|
assert report.weighted_score_mean is None
|
||||||
assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
|
assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
|
||||||
assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
|
assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
|
||||||
assert report.summary_markdown == "summary"
|
assert report.summary_markdown == "summary"
|
||||||
@@ -87,3 +88,30 @@ def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path)
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert _infer_metrics_from_scores(run_dir) == ["faithfulness"]
|
assert _infer_metrics_from_scores(run_dir) == ["faithfulness"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_report_ranks_noise_sensitivity_with_lower_values_as_better(tmp_path: Path) -> None:
|
||||||
|
"""Lowest-sample review should treat higher noise sensitivity as worse."""
|
||||||
|
run_dir = tmp_path / "run"
|
||||||
|
run_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
(run_dir / "scores.csv").write_text(
|
||||||
|
"\n".join(
|
||||||
|
[
|
||||||
|
"sample_id,question,noise_sensitivity",
|
||||||
|
"s-good,q1,0.10",
|
||||||
|
"s-warn,q2,0.30",
|
||||||
|
"s-bad,q3,0.90",
|
||||||
|
]
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
(run_dir / "summary.md").write_text("summary", encoding="utf-8")
|
||||||
|
(run_dir / "optimization_advice.md").write_text("", encoding="utf-8")
|
||||||
|
|
||||||
|
report = build_report(run_dir, ["noise_sensitivity"])
|
||||||
|
|
||||||
|
assert [sample.sample_id for sample in report.lowest_samples[:3]] == [
|
||||||
|
"s-bad",
|
||||||
|
"s-warn",
|
||||||
|
"s-good",
|
||||||
|
]
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"""Integration tests for /api/llm-profiles endpoints."""
|
"""Integration tests for /api/llm-profiles endpoints."""
|
||||||
import pytest
|
import pytest
|
||||||
from fastapi.testclient import TestClient
|
from fastapi.testclient import TestClient
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
@@ -41,19 +42,23 @@ def test_update_profile(client):
|
|||||||
pid = client.post("/api/llm-profiles", json=body).json()["profile_id"]
|
pid = client.post("/api/llm-profiles", json=body).json()["profile_id"]
|
||||||
|
|
||||||
upd = {"name": "New", "model": "m2", "base_url": "http://x/v1", "api_key": "k", "timeout_seconds": 60}
|
upd = {"name": "New", "model": "m2", "base_url": "http://x/v1", "api_key": "k", "timeout_seconds": 60}
|
||||||
|
with patch("webapp.services.inline_scorer.inline_scorer.invalidate_cache") as invalidate:
|
||||||
resp = client.put(f"/api/llm-profiles/{pid}", json=upd)
|
resp = client.put(f"/api/llm-profiles/{pid}", json=upd)
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
assert resp.json()["name"] == "New"
|
assert resp.json()["name"] == "New"
|
||||||
assert resp.json()["timeout_seconds"] == 60
|
assert resp.json()["timeout_seconds"] == 60
|
||||||
|
invalidate.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
def test_delete_profile(client):
|
def test_delete_profile(client):
|
||||||
body = {"name": "Del", "model": "m", "base_url": "http://x/v1", "api_key": "k"}
|
body = {"name": "Del", "model": "m", "base_url": "http://x/v1", "api_key": "k"}
|
||||||
pid = client.post("/api/llm-profiles", json=body).json()["profile_id"]
|
pid = client.post("/api/llm-profiles", json=body).json()["profile_id"]
|
||||||
|
with patch("webapp.services.inline_scorer.inline_scorer.invalidate_cache") as invalidate:
|
||||||
resp = client.delete(f"/api/llm-profiles/{pid}")
|
resp = client.delete(f"/api/llm-profiles/{pid}")
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
assert resp.json()["deleted"] is True
|
assert resp.json()["deleted"] is True
|
||||||
assert len(client.get("/api/llm-profiles").json()["profiles"]) == 0
|
assert len(client.get("/api/llm-profiles").json()["profiles"]) == 0
|
||||||
|
invalidate.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
def test_update_nonexistent(client):
|
def test_update_nonexistent(client):
|
||||||
@@ -185,7 +190,7 @@ def test_apply_doc_weights_patches_yaml(tmp_path):
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Connectivity test endpoint tests
|
# Connectivity test endpoint tests
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
|
||||||
def test_probe_connectivity_success(client):
|
def test_probe_connectivity_success(client):
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
from unittest.mock import sentinel
|
||||||
|
|
||||||
from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse
|
from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse
|
||||||
|
|
||||||
def test_llm_profile_defaults():
|
def test_llm_profile_defaults():
|
||||||
@@ -98,3 +100,106 @@ def test_get_nonexistent(tmp_path):
|
|||||||
def test_delete_nonexistent(tmp_path):
|
def test_delete_nonexistent(tmp_path):
|
||||||
mgr = _make_manager(tmp_path)
|
mgr = _make_manager(tmp_path)
|
||||||
assert mgr.delete("does-not-exist") is False
|
assert mgr.delete("does-not-exist") is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_openai_client_kwargs_prefers_matching_profile(tmp_path, monkeypatch):
|
||||||
|
"""Metric runtime should prefer the saved LLM Profile over .env defaults."""
|
||||||
|
from rag_eval.metrics.factory import _resolve_openai_client_kwargs
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
import webapp.services.profile_manager as pm_mod
|
||||||
|
|
||||||
|
mgr = _make_manager(tmp_path)
|
||||||
|
mgr.create(
|
||||||
|
name="Judge",
|
||||||
|
model="gpt-5.5",
|
||||||
|
base_url="http://39.107.88.131:13000",
|
||||||
|
api_key="sk-profile",
|
||||||
|
timeout_seconds=300,
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(pm_mod, "profile_manager", mgr)
|
||||||
|
|
||||||
|
settings = EvaluationSettings(
|
||||||
|
OPENAI_API_KEY="sk-env",
|
||||||
|
OPENAI_BASE_URL="http://env-base/v1",
|
||||||
|
OPENAI_TIMEOUT_SECONDS=30,
|
||||||
|
)
|
||||||
|
|
||||||
|
kwargs = _resolve_openai_client_kwargs("gpt-5.5", settings)
|
||||||
|
assert kwargs["api_key"] == "sk-profile"
|
||||||
|
assert kwargs["base_url"] == "http://39.107.88.131:13000"
|
||||||
|
assert kwargs["timeout"] == 300.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_openai_client_kwargs_falls_back_to_env(tmp_path, monkeypatch):
|
||||||
|
"""When no saved profile matches, .env settings remain the fallback."""
|
||||||
|
from rag_eval.metrics.factory import _resolve_openai_client_kwargs
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
import webapp.services.profile_manager as pm_mod
|
||||||
|
|
||||||
|
mgr = _make_manager(tmp_path)
|
||||||
|
monkeypatch.setattr(pm_mod, "profile_manager", mgr)
|
||||||
|
|
||||||
|
settings = EvaluationSettings(
|
||||||
|
OPENAI_API_KEY="sk-env",
|
||||||
|
OPENAI_BASE_URL="http://env-base/v1",
|
||||||
|
OPENAI_TIMEOUT_SECONDS=45,
|
||||||
|
)
|
||||||
|
|
||||||
|
kwargs = _resolve_openai_client_kwargs("gpt-5", settings)
|
||||||
|
assert kwargs["api_key"] == "sk-env"
|
||||||
|
assert kwargs["base_url"] == "http://env-base/v1"
|
||||||
|
assert kwargs["timeout"] == 45.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_models_uses_high_default_max_tokens_for_structured_judge(monkeypatch):
|
||||||
|
"""Structured RAGAS judge calls should use a larger completion budget by default."""
|
||||||
|
import rag_eval.metrics.factory as factory
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
captured: dict[str, object] = {}
|
||||||
|
|
||||||
|
def fake_llm_factory(model, client=None, **kwargs):
|
||||||
|
captured["model"] = model
|
||||||
|
captured["client"] = client
|
||||||
|
captured["kwargs"] = kwargs
|
||||||
|
return sentinel.llm
|
||||||
|
|
||||||
|
monkeypatch.setattr(factory, "AsyncOpenAI", lambda **kwargs: sentinel.client)
|
||||||
|
monkeypatch.setattr(factory, "llm_factory", fake_llm_factory)
|
||||||
|
monkeypatch.setattr(factory, "embedding_factory", lambda **kwargs: sentinel.embeddings)
|
||||||
|
|
||||||
|
llm, embeddings = factory.build_models(
|
||||||
|
"gpt-5",
|
||||||
|
"text-embedding-3-small",
|
||||||
|
EvaluationSettings(),
|
||||||
|
)
|
||||||
|
|
||||||
|
assert llm is sentinel.llm
|
||||||
|
assert embeddings is sentinel.embeddings
|
||||||
|
assert captured["model"] == "gpt-5"
|
||||||
|
assert captured["client"] is sentinel.client
|
||||||
|
assert captured["kwargs"] == {"max_tokens": 4096}
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_models_allows_env_override_for_judge_max_tokens(monkeypatch):
|
||||||
|
"""Operators should be able to raise the judge completion budget via settings."""
|
||||||
|
import rag_eval.metrics.factory as factory
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
captured: dict[str, object] = {}
|
||||||
|
|
||||||
|
def fake_llm_factory(model, client=None, **kwargs):
|
||||||
|
captured["kwargs"] = kwargs
|
||||||
|
return sentinel.llm
|
||||||
|
|
||||||
|
monkeypatch.setattr(factory, "AsyncOpenAI", lambda **kwargs: sentinel.client)
|
||||||
|
monkeypatch.setattr(factory, "llm_factory", fake_llm_factory)
|
||||||
|
monkeypatch.setattr(factory, "embedding_factory", lambda **kwargs: sentinel.embeddings)
|
||||||
|
|
||||||
|
factory.build_models(
|
||||||
|
"gpt-5",
|
||||||
|
"text-embedding-3-small",
|
||||||
|
EvaluationSettings(RAGAS_LLM_MAX_TOKENS=8192),
|
||||||
|
)
|
||||||
|
|
||||||
|
assert captured["kwargs"] == {"max_tokens": 8192}
|
||||||
|
|||||||
@@ -57,9 +57,11 @@ class TestScoreRequest:
|
|||||||
with pytest.raises(ValidationError):
|
with pytest.raises(ValidationError):
|
||||||
ScoreRequest(question="q", contexts="c") # type: ignore[call-arg]
|
ScoreRequest(question="q", contexts="c") # type: ignore[call-arg]
|
||||||
|
|
||||||
def test_missing_contexts_raises(self):
|
def test_missing_contexts_defaults_to_none(self):
|
||||||
with pytest.raises(ValidationError):
|
"""contexts is now optional — missing contexts is allowed."""
|
||||||
ScoreRequest(question="q", answer="a") # type: ignore[call-arg]
|
req = ScoreRequest(question="q", answer="a")
|
||||||
|
assert req.contexts is None
|
||||||
|
assert req.contexts_as_list() == []
|
||||||
|
|
||||||
def test_custom_metrics_accepted(self):
|
def test_custom_metrics_accepted(self):
|
||||||
req = ScoreRequest(
|
req = ScoreRequest(
|
||||||
@@ -115,6 +117,17 @@ class TestScoreRequest:
|
|||||||
"factual_correctness",
|
"factual_correctness",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def test_effective_metrics_drops_context_dependent_when_contexts_absent(self):
|
||||||
|
"""Without contexts, context-dependent metrics are excluded."""
|
||||||
|
req = ScoreRequest(
|
||||||
|
question="q", answer="a",
|
||||||
|
metrics=["faithfulness", "answer_relevancy", "context_precision"],
|
||||||
|
)
|
||||||
|
effective = req.effective_metrics()
|
||||||
|
assert "answer_relevancy" in effective
|
||||||
|
assert "faithfulness" not in effective
|
||||||
|
assert "context_precision" not in effective
|
||||||
|
|
||||||
|
|
||||||
class TestScoreResponse:
|
class TestScoreResponse:
|
||||||
def test_score_response_structure(self):
|
def test_score_response_structure(self):
|
||||||
@@ -228,7 +241,8 @@ class TestScoreEndpoint:
|
|||||||
})
|
})
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
assert data["weighted_score"] is not None
|
# 综合加权得分已暂时禁用,始终返回 null
|
||||||
|
assert data["weighted_score"] is None
|
||||||
|
|
||||||
def test_missing_required_fields_returns_422(self, client):
|
def test_missing_required_fields_returns_422(self, client):
|
||||||
resp = client.post("/api/score", json={"question": "q"})
|
resp = client.post("/api/score", json={"question": "q"})
|
||||||
|
|||||||
146
tests/webapp/test_score_jobs_api.py
Normal file
146
tests/webapp/test_score_jobs_api.py
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
"""Tests for async score jobs API."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def client(tmp_path, monkeypatch):
|
||||||
|
"""TestClient with fresh ScoreJobManager backed by tmp dirs."""
|
||||||
|
import webapp.services.score_job_manager as mgr_mod
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
|
||||||
|
fresh_mgr = ScoreJobManager(
|
||||||
|
output_dir=tmp_path / "score-async",
|
||||||
|
index_dir=tmp_path / "score-jobs",
|
||||||
|
max_workers=2,
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(mgr_mod, "score_job_manager", fresh_mgr)
|
||||||
|
|
||||||
|
import webapp.api.score_jobs as api_mod
|
||||||
|
monkeypatch.setattr(api_mod, "score_job_manager", fresh_mgr)
|
||||||
|
|
||||||
|
from webapp.server import create_app
|
||||||
|
return TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
class TestAsyncScoreEndpoints:
|
||||||
|
def test_submit_returns_202_with_job_id(self, client):
|
||||||
|
"""POST /api/score/async returns 202 immediately."""
|
||||||
|
with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/async", json={
|
||||||
|
"question": "q?",
|
||||||
|
"answer": "a.",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 202
|
||||||
|
data = resp.json()
|
||||||
|
assert "job_id" in data
|
||||||
|
assert data["status"] == "queued"
|
||||||
|
|
||||||
|
def test_list_jobs_empty_initially(self, client):
|
||||||
|
resp = client.get("/api/score/jobs")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["jobs"] == []
|
||||||
|
|
||||||
|
def test_get_unknown_job_returns_404(self, client):
|
||||||
|
resp = client.get("/api/score/jobs/nonexistent123")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
def test_submitted_job_appears_in_list(self, client):
|
||||||
|
with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/async", json={
|
||||||
|
"question": "q?", "answer": "a.", "metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
job_id = resp.json()["job_id"]
|
||||||
|
time.sleep(0.1)
|
||||||
|
list_resp = client.get("/api/score/jobs")
|
||||||
|
ids = [j["job_id"] for j in list_resp.json()["jobs"]]
|
||||||
|
assert job_id in ids
|
||||||
|
|
||||||
|
def test_get_job_by_id_returns_status(self, client):
|
||||||
|
with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/async", json={
|
||||||
|
"question": "q?", "answer": "a.", "metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
job_id = resp.json()["job_id"]
|
||||||
|
time.sleep(0.1)
|
||||||
|
get_resp = client.get(f"/api/score/jobs/{job_id}")
|
||||||
|
assert get_resp.status_code == 200
|
||||||
|
assert get_resp.json()["job_id"] == job_id
|
||||||
|
|
||||||
|
def test_missing_required_fields_returns_422(self, client):
|
||||||
|
resp = client.post("/api/score/async", json={"question": "q?"})
|
||||||
|
assert resp.status_code == 422
|
||||||
|
|
||||||
|
|
||||||
|
class TestScoreJobManager:
|
||||||
|
def test_completed_job_persisted_to_index(self, tmp_path):
|
||||||
|
"""Completed job writes index JSON."""
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
from webapp.models import ScoreRequest
|
||||||
|
|
||||||
|
mgr = ScoreJobManager(
|
||||||
|
output_dir=tmp_path / "runs",
|
||||||
|
index_dir=tmp_path / "index",
|
||||||
|
max_workers=1,
|
||||||
|
)
|
||||||
|
req = ScoreRequest(question="q?", answer="a.", metrics=["answer_relevancy"])
|
||||||
|
|
||||||
|
# Patch _run directly — it uses lazy imports internally
|
||||||
|
def fake_run(job_id, request):
|
||||||
|
mgr._update(job_id, status="completed", finished_at="2026-01-01T00:00:01+00:00",
|
||||||
|
run_id="fake-run-id", scores={"answer_relevancy": 0.85},
|
||||||
|
weighted_score=0.85, latency_ms=500)
|
||||||
|
|
||||||
|
with patch.object(mgr, "_run", side_effect=fake_run):
|
||||||
|
status = mgr.submit(req)
|
||||||
|
|
||||||
|
for _ in range(20):
|
||||||
|
s = mgr.get(status.job_id)
|
||||||
|
if s and s.status == "completed":
|
||||||
|
break
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
s = mgr.get(status.job_id)
|
||||||
|
assert s is not None
|
||||||
|
idx_path = tmp_path / "index" / f"{status.job_id}.json"
|
||||||
|
assert idx_path.exists()
|
||||||
|
data = json.loads(idx_path.read_text(encoding="utf-8"))
|
||||||
|
assert data["job_id"] == status.job_id
|
||||||
|
assert data["status"] == "completed"
|
||||||
|
|
||||||
|
def test_loads_existing_index_on_startup(self, tmp_path):
|
||||||
|
"""Manager loads persisted jobs from index dir on init."""
|
||||||
|
from webapp.services.score_job_manager import ScoreJobManager
|
||||||
|
from webapp.models import AsyncScoreJobStatus
|
||||||
|
|
||||||
|
idx_dir = tmp_path / "index"
|
||||||
|
idx_dir.mkdir()
|
||||||
|
fake = AsyncScoreJobStatus(
|
||||||
|
job_id="testjob001",
|
||||||
|
status="completed",
|
||||||
|
created_at="2026-01-01T00:00:00+00:00",
|
||||||
|
run_id="some-run-id",
|
||||||
|
scores={"answer_relevancy": 0.9},
|
||||||
|
weighted_score=0.9,
|
||||||
|
latency_ms=1000,
|
||||||
|
)
|
||||||
|
(idx_dir / "testjob001.json").write_text(
|
||||||
|
json.dumps(fake.model_dump(), ensure_ascii=False), encoding="utf-8"
|
||||||
|
)
|
||||||
|
mgr = ScoreJobManager(
|
||||||
|
output_dir=tmp_path / "runs",
|
||||||
|
index_dir=idx_dir,
|
||||||
|
max_workers=1,
|
||||||
|
)
|
||||||
|
loaded = mgr.get("testjob001")
|
||||||
|
assert loaded is not None
|
||||||
|
assert loaded.status == "completed"
|
||||||
|
assert loaded.run_id == "some-run-id"
|
||||||
299
tests/webapp/test_session_score_jobs_api.py
Normal file
299
tests/webapp/test_session_score_jobs_api.py
Normal file
@@ -0,0 +1,299 @@
|
|||||||
|
"""Tests for session-grouped async scoring API and SessionScoreJobManager."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixtures
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def tmp_manager(tmp_path):
|
||||||
|
"""Isolated SessionScoreJobManager backed by tmp dirs (no real LLM calls)."""
|
||||||
|
from webapp.services.session_score_manager import SessionScoreJobManager
|
||||||
|
return SessionScoreJobManager(
|
||||||
|
output_dir=tmp_path / "score-session",
|
||||||
|
index_dir=tmp_path / "score-session-jobs",
|
||||||
|
max_workers=2,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def client(tmp_path, monkeypatch):
|
||||||
|
"""TestClient with fresh SessionScoreJobManager backed by tmp dirs."""
|
||||||
|
import webapp.services.session_score_manager as mgr_mod
|
||||||
|
from webapp.services.session_score_manager import SessionScoreJobManager
|
||||||
|
|
||||||
|
fresh_mgr = SessionScoreJobManager(
|
||||||
|
output_dir=tmp_path / "score-session",
|
||||||
|
index_dir=tmp_path / "score-session-jobs",
|
||||||
|
max_workers=2,
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(mgr_mod, "session_score_manager", fresh_mgr)
|
||||||
|
|
||||||
|
import webapp.api.session_score_jobs as api_mod
|
||||||
|
monkeypatch.setattr(api_mod, "session_score_manager", fresh_mgr)
|
||||||
|
|
||||||
|
from webapp.server import create_app
|
||||||
|
return pytest.importorskip("fastapi.testclient").TestClient(create_app())
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Unit tests for SessionScoreJobManager
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestSessionRunId:
|
||||||
|
def test_same_session_always_same_run_id(self, tmp_manager):
|
||||||
|
assert tmp_manager.session_run_id("abc") == tmp_manager.session_run_id("abc")
|
||||||
|
|
||||||
|
def test_different_sessions_different_run_ids(self, tmp_manager):
|
||||||
|
assert tmp_manager.session_run_id("session-A") != tmp_manager.session_run_id("session-B")
|
||||||
|
|
||||||
|
def test_run_id_prefixed_with_session(self, tmp_manager):
|
||||||
|
assert tmp_manager.session_run_id("test123").startswith("session-")
|
||||||
|
|
||||||
|
def test_special_chars_sanitized(self, tmp_manager):
|
||||||
|
run_id = tmp_manager.session_run_id("user@dify:flow/001")
|
||||||
|
assert "/" not in run_id
|
||||||
|
assert "@" not in run_id
|
||||||
|
assert ":" not in run_id
|
||||||
|
|
||||||
|
|
||||||
|
class TestSubmit:
|
||||||
|
def test_submit_returns_job_status_and_run_id(self, tmp_manager):
|
||||||
|
with patch.object(tmp_manager._executor, "submit"):
|
||||||
|
status, run_id = tmp_manager.submit("session-1", _mock_request())
|
||||||
|
assert status.job_id
|
||||||
|
assert status.status == "queued"
|
||||||
|
assert run_id == tmp_manager.session_run_id("session-1")
|
||||||
|
|
||||||
|
def test_submit_adds_job_to_session(self, tmp_manager):
|
||||||
|
with patch.object(tmp_manager._executor, "submit"):
|
||||||
|
status, _ = tmp_manager.submit("session-1", _mock_request())
|
||||||
|
session = tmp_manager.get_session("session-1")
|
||||||
|
assert session is not None
|
||||||
|
assert any(j.job_id == status.job_id for j in session.jobs)
|
||||||
|
|
||||||
|
def test_multiple_submits_same_session_accumulate(self, tmp_manager):
|
||||||
|
with patch.object(tmp_manager._executor, "submit"):
|
||||||
|
tmp_manager.submit("session-X", _mock_request())
|
||||||
|
tmp_manager.submit("session-X", _mock_request())
|
||||||
|
tmp_manager.submit("session-X", _mock_request())
|
||||||
|
session = tmp_manager.get_session("session-X")
|
||||||
|
assert session.call_count == 3
|
||||||
|
|
||||||
|
def test_get_unknown_job_returns_none(self, tmp_manager):
|
||||||
|
assert tmp_manager.get_job("does-not-exist") is None
|
||||||
|
|
||||||
|
def test_get_unknown_session_returns_none(self, tmp_manager):
|
||||||
|
assert tmp_manager.get_session("no-such-session") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestSessionIndexPersistence:
|
||||||
|
def test_session_index_survives_restart(self, tmp_path):
|
||||||
|
"""Jobs and session mappings loaded from disk on new manager instance."""
|
||||||
|
from webapp.services.session_score_manager import SessionScoreJobManager
|
||||||
|
|
||||||
|
mgr1 = SessionScoreJobManager(
|
||||||
|
output_dir=tmp_path / "score-session",
|
||||||
|
index_dir=tmp_path / "score-session-jobs",
|
||||||
|
)
|
||||||
|
with patch.object(mgr1._executor, "submit"):
|
||||||
|
mgr1.submit("persist-session", _mock_request())
|
||||||
|
mgr1.submit("persist-session", _mock_request())
|
||||||
|
|
||||||
|
# New manager instance loads from disk
|
||||||
|
mgr2 = SessionScoreJobManager(
|
||||||
|
output_dir=tmp_path / "score-session",
|
||||||
|
index_dir=tmp_path / "score-session-jobs",
|
||||||
|
)
|
||||||
|
session = mgr2.get_session("persist-session")
|
||||||
|
assert session is not None
|
||||||
|
assert session.call_count == 2
|
||||||
|
|
||||||
|
def test_job_index_file_created_on_submit(self, tmp_path):
|
||||||
|
from webapp.services.session_score_manager import SessionScoreJobManager
|
||||||
|
mgr = SessionScoreJobManager(
|
||||||
|
output_dir=tmp_path / "score-session",
|
||||||
|
index_dir=tmp_path / "score-session-jobs",
|
||||||
|
)
|
||||||
|
with patch.object(mgr._executor, "submit"):
|
||||||
|
status, _ = mgr.submit("file-test", _mock_request())
|
||||||
|
index_file = tmp_path / "score-session-jobs" / f"{status.job_id}.json"
|
||||||
|
assert index_file.is_file()
|
||||||
|
data = json.loads(index_file.read_text())
|
||||||
|
assert data["job_id"] == status.job_id
|
||||||
|
|
||||||
|
|
||||||
|
class TestAppendBehaviour:
|
||||||
|
"""Test the CSV append / read-all logic in _append_and_regenerate via _read_score_rows."""
|
||||||
|
|
||||||
|
def test_read_score_rows_returns_empty_for_missing_csv(self, tmp_manager, tmp_path):
|
||||||
|
rows = tmp_manager._read_score_rows(tmp_path / "nonexistent")
|
||||||
|
assert rows == []
|
||||||
|
|
||||||
|
def test_read_score_rows_reads_existing_csv(self, tmp_manager, tmp_path):
|
||||||
|
run_dir = tmp_path / "run1"
|
||||||
|
run_dir.mkdir()
|
||||||
|
df = pd.DataFrame([{"sample_id": "s1", "answer_relevancy": 0.9}])
|
||||||
|
df.to_csv(run_dir / "scores.csv", index=False)
|
||||||
|
rows = tmp_manager._read_score_rows(run_dir)
|
||||||
|
assert len(rows) == 1
|
||||||
|
assert rows[0]["sample_id"] == "s1"
|
||||||
|
|
||||||
|
def test_metric_means_computed_from_csv(self, tmp_manager, tmp_path):
|
||||||
|
run_dir = tmp_path / "run2"
|
||||||
|
run_dir.mkdir()
|
||||||
|
df = pd.DataFrame([
|
||||||
|
{"sample_id": "s1", "answer_relevancy": 0.8},
|
||||||
|
{"sample_id": "s2", "answer_relevancy": 0.6},
|
||||||
|
])
|
||||||
|
df.to_csv(run_dir / "scores.csv", index=False)
|
||||||
|
means = tmp_manager._read_metric_means(run_dir)
|
||||||
|
assert means["answer_relevancy"] == pytest.approx(0.7, abs=1e-4)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# API endpoint tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestSessionAsyncEndpoints:
|
||||||
|
def test_submit_returns_202_with_session_fields(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "test-session-001",
|
||||||
|
"question": "What is CT?",
|
||||||
|
"answer": "CT is computed tomography.",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 202
|
||||||
|
data = resp.json()
|
||||||
|
assert data["session_id"] == "test-session-001"
|
||||||
|
assert "job_id" in data
|
||||||
|
assert "run_id" in data
|
||||||
|
assert data["status"] == "queued"
|
||||||
|
assert data["call_count"] >= 1
|
||||||
|
|
||||||
|
def test_run_id_deterministic_for_session(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
r1 = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "det-session",
|
||||||
|
"question": "Q1",
|
||||||
|
"answer": "A1",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
r2 = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "det-session",
|
||||||
|
"question": "Q2",
|
||||||
|
"answer": "A2",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert r1.json()["run_id"] == r2.json()["run_id"]
|
||||||
|
|
||||||
|
def test_different_sessions_different_run_ids(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
r1 = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "session-A",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
r2 = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "session-B",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert r1.json()["run_id"] != r2.json()["run_id"]
|
||||||
|
|
||||||
|
def test_call_count_increments_per_session(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
for _ in range(3):
|
||||||
|
client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "count-session",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
time.sleep(0.05)
|
||||||
|
resp = client.get("/api/score/sessions/count-session")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["call_count"] == 3
|
||||||
|
|
||||||
|
def test_get_session_returns_jobs_list(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "list-session",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
time.sleep(0.05)
|
||||||
|
resp = client.get("/api/score/sessions/list-session")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert len(data["jobs"]) == 1
|
||||||
|
|
||||||
|
def test_get_unknown_session_returns_404(self, client):
|
||||||
|
resp = client.get("/api/score/sessions/no-such-session-xyz")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
def test_get_session_job_by_id(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
resp = client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "job-lookup-session",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
job_id = resp.json()["job_id"]
|
||||||
|
time.sleep(0.05)
|
||||||
|
get_resp = client.get(f"/api/score/session/jobs/{job_id}")
|
||||||
|
assert get_resp.status_code == 200
|
||||||
|
assert get_resp.json()["job_id"] == job_id
|
||||||
|
|
||||||
|
def test_get_unknown_job_returns_404(self, client):
|
||||||
|
resp = client.get("/api/score/session/jobs/nonexistent-job-id")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
||||||
|
def test_missing_session_id_returns_422(self, client):
|
||||||
|
resp = client.post("/api/score/session_async", json={
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
assert resp.status_code == 422
|
||||||
|
|
||||||
|
def test_list_sessions_endpoint(self, client):
|
||||||
|
with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
|
||||||
|
client.post("/api/score/session_async", json={
|
||||||
|
"session_id": "list-all-session",
|
||||||
|
"question": "Q",
|
||||||
|
"answer": "A",
|
||||||
|
"metrics": ["answer_relevancy"],
|
||||||
|
})
|
||||||
|
resp = client.get("/api/score/sessions")
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert "sessions" in resp.json()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _mock_request():
|
||||||
|
"""Build a minimal ScoreRequest for testing."""
|
||||||
|
from webapp.models import ScoreRequest
|
||||||
|
return ScoreRequest(
|
||||||
|
question="What is dual-source CT?",
|
||||||
|
answer="It uses two X-ray sources.",
|
||||||
|
metrics=["answer_relevancy"],
|
||||||
|
)
|
||||||
@@ -148,6 +148,13 @@ def update_profile(profile_id: str, request: CreateProfileRequest) -> LLMProfile
|
|||||||
if updated is None:
|
if updated is None:
|
||||||
logger.warning("[update_profile] not found id=%s", profile_id)
|
logger.warning("[update_profile] not found id=%s", profile_id)
|
||||||
raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
|
raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
|
||||||
|
# Invalidate scorer cache so next request picks up the new profile settings.
|
||||||
|
try:
|
||||||
|
from webapp.services.inline_scorer import inline_scorer
|
||||||
|
inline_scorer.invalidate_cache()
|
||||||
|
logger.info("[update_profile] scorer cache invalidated id=%s", profile_id)
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
logger.info("[update_profile] updated id=%s", profile_id)
|
logger.info("[update_profile] updated id=%s", profile_id)
|
||||||
return updated
|
return updated
|
||||||
|
|
||||||
@@ -160,6 +167,12 @@ def delete_profile(profile_id: str) -> dict:
|
|||||||
if not deleted:
|
if not deleted:
|
||||||
logger.warning("[delete_profile] not found id=%s", profile_id)
|
logger.warning("[delete_profile] not found id=%s", profile_id)
|
||||||
raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
|
raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
|
||||||
|
# Invalidate scorer cache in case the deleted profile was in use.
|
||||||
|
try:
|
||||||
|
from webapp.services.inline_scorer import inline_scorer
|
||||||
|
inline_scorer.invalidate_cache()
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
logger.info("[delete_profile] deleted id=%s", profile_id)
|
logger.info("[delete_profile] deleted id=%s", profile_id)
|
||||||
return {"deleted": True}
|
return {"deleted": True}
|
||||||
|
|
||||||
|
|||||||
@@ -73,7 +73,8 @@ def score_sample(
|
|||||||
用于日志记录、质量监控或触发 Agent 自我改进流程。
|
用于日志记录、质量监控或触发 Agent 自我改进流程。
|
||||||
|
|
||||||
**contexts 格式**:多个检索片段用 `context_separator`(默认 `" |||| "`)拼接为一个字符串,
|
**contexts 格式**:多个检索片段用 `context_separator`(默认 `" |||| "`)拼接为一个字符串,
|
||||||
服务端自动拆分后传入 RAGAS 管道。
|
服务端自动拆分后传入 RAGAS 管道。**contexts 为可选字段**,缺失时自动跳过依赖检索内容的指标
|
||||||
|
(`faithfulness`、`context_recall`、`context_precision`、`noise_sensitivity`)。
|
||||||
|
|
||||||
**ground_truth 可选**:
|
**ground_truth 可选**:
|
||||||
- 提供时:所有指定指标均参与计算。
|
- 提供时:所有指定指标均参与计算。
|
||||||
@@ -99,12 +100,13 @@ def score_sample(
|
|||||||
"""
|
"""
|
||||||
client = f"{raw_request.client.host}:{raw_request.client.port}" if raw_request.client else "unknown"
|
client = f"{raw_request.client.host}:{raw_request.client.port}" if raw_request.client else "unknown"
|
||||||
logger.info(
|
logger.info(
|
||||||
"[score] incoming client=%s method=%s content_type=%s metrics=%s has_gt=%s",
|
"[score] incoming client=%s method=%s content_type=%s metrics=%s has_gt=%s has_ctx=%s",
|
||||||
client,
|
client,
|
||||||
raw_request.method,
|
raw_request.method,
|
||||||
raw_request.headers.get("content-type", ""),
|
raw_request.headers.get("content-type", ""),
|
||||||
request.metrics,
|
request.metrics,
|
||||||
request.ground_truth is not None,
|
request.ground_truth is not None,
|
||||||
|
bool(request.contexts),
|
||||||
)
|
)
|
||||||
settings = _get_settings()
|
settings = _get_settings()
|
||||||
|
|
||||||
@@ -154,10 +156,11 @@ def score_sample(
|
|||||||
all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
|
all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
|
||||||
all_scores.update(raw_scores)
|
all_scores.update(raw_scores)
|
||||||
|
|
||||||
weighted = compute_weighted_score(
|
# 综合加权得分计算(已暂时禁用)
|
||||||
{key: value for key, value in raw_scores.items() if value is not None},
|
# weighted = compute_weighted_score(
|
||||||
{},
|
# {key: value for key, value in raw_scores.items() if value is not None},
|
||||||
)
|
# {},
|
||||||
|
# )
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"[score] done latency=%dms skipped=%s scores=%s",
|
"[score] done latency=%dms skipped=%s scores=%s",
|
||||||
@@ -167,7 +170,7 @@ def score_sample(
|
|||||||
)
|
)
|
||||||
return ScoreResponse(
|
return ScoreResponse(
|
||||||
scores=all_scores,
|
scores=all_scores,
|
||||||
weighted_score=round(weighted, 4) if weighted is not None else None,
|
weighted_score=None, # 综合加权得分已暂时禁用
|
||||||
latency_ms=latency_ms,
|
latency_ms=latency_ms,
|
||||||
skipped_metrics=skipped,
|
skipped_metrics=skipped,
|
||||||
)
|
)
|
||||||
|
|||||||
89
webapp/api/score_jobs.py
Normal file
89
webapp/api/score_jobs.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
"""Routes for async RAGAS scoring jobs (Dify fire-and-forget integration).
|
||||||
|
|
||||||
|
Dify calls POST /api/score/async → gets job_id immediately (202).
|
||||||
|
Scoring runs in background, result written as a standard run artifact.
|
||||||
|
View full report at GET /api/runs/{run_id} or in the 「运行列表」 page.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from webapp.models import AsyncScoreJobResponse, AsyncScoreJobStatus, ScoreRequest
|
||||||
|
from webapp.services.score_job_manager import score_job_manager
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
logger = logging.getLogger("webapp.api.score_jobs")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/async",
    status_code=202,
    response_model=AsyncScoreJobResponse,
    summary="提交异步评分任务(Dify 推荐方式)",
    responses={
        202: {
            "description": (
                "任务已排队,立即返回 job_id(202 Accepted)。\n\n"
                "评分在后台执行,完成后自动生成完整报告(含优化建议)。\n"
                "通过 `GET /api/score/jobs/{job_id}` 查询状态,"
                "完成后在「运行列表」页查看完整报告。"
            ),
            "content": {
                "application/json": {
                    "example": {"job_id": "abc123def456", "status": "queued", "run_id": None}
                }
            },
        },
    },
)
def submit_async_score(request: ScoreRequest) -> AsyncScoreJobResponse:
    """提交异步 RAGAS 评分任务,立即返回 job_id。

    **适合 Dify 工作流**:HTTP 节点无需等待评分完成(无超时风险),
    工作流立即继续,评分结果在 RAGAS 平台「运行列表」中查看。

    评分完成后自动生成:
    - 各指标得分(`scores.csv`)
    - 摘要报告(`summary.md`)
    - LLM 优化建议(`optimization_advice.md`)
    """
    # The docstring above is left verbatim on purpose: no `description=` is
    # passed to the route decorator, so FastAPI publishes it as the operation
    # description in the generated OpenAPI docs.
    #
    # Args:
    #     request: Validated scoring payload; handed unchanged to the job manager.
    # Returns:
    #     The new job's id and initial status. `run_id` is not set here and so
    #     keeps the response model's default.
    logger.info(
        "[score_async] submit metrics=%s has_ctx=%s has_gt=%s",
        request.metrics,
        bool(request.contexts),
        # "is not None" (rather than truthiness) matches the has_gt flag that the
        # synchronous score endpoint logs for the same request model.
        request.ground_truth is not None,
    )
    status = score_job_manager.submit(request)
    logger.info("[score_async] queued job_id=%s", status.job_id)
    return AsyncScoreJobResponse(job_id=status.job_id, status=status.status)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/jobs",
    response_model=dict,
    summary="列出所有异步评分记录",
)
def list_score_jobs() -> dict:
    """返回所有异步评分记录,按创建时间倒序排列。"""
    # Docstring kept verbatim: with no `description=` on the route, FastAPI
    # publishes it as the operation description.
    # Returns a dict with a single "jobs" key holding one dumped model per job,
    # in whatever order the job manager returns them.
    records = score_job_manager.list_jobs()
    logger.info("[score_jobs] list count=%d", len(records))
    serialized = [record.model_dump() for record in records]
    return {"jobs": serialized}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/jobs/{job_id}",
    response_model=AsyncScoreJobStatus,
    summary="查询单个异步评分任务状态",
    responses={404: {"description": "指定 job_id 的评分任务不存在。"}},
)
def get_score_job(job_id: str) -> AsyncScoreJobStatus:
    """查询单个异步评分任务的状态和结果。

    `status` 为 `completed` 时,`run_id` 字段包含对应的运行 ID,
    可通过 `GET /api/runs/{run_id}` 获取完整评分报告。
    """
    # Docstring kept verbatim: with no `description=` on the route, FastAPI
    # publishes it as the operation description.
    # Raises HTTPException(404) when the manager has no job under this id.
    job = score_job_manager.get(job_id)
    if job is not None:
        return job
    raise HTTPException(status_code=404, detail=f"Score job not found: {job_id}")
|
||||||
206
webapp/api/session_score_jobs.py
Normal file
206
webapp/api/session_score_jobs.py
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
"""Routes for session-grouped async RAGAS scoring (Dify multi-call integration).
|
||||||
|
|
||||||
|
Use case: Dify evaluates multiple Q&A pairs in a session. Each pair gets its own
|
||||||
|
`POST /api/score/session_async` call with a shared `session_id`. All results are
|
||||||
|
accumulated into one report, visible in 「运行列表」→「报告详情」.
|
||||||
|
|
||||||
|
Key behaviour:
|
||||||
|
- Deterministic run_id: derived from session_id — same session always maps to the
|
||||||
|
same report directory (outputs/score-session/session-<id>/).
|
||||||
|
- Append semantics: each call adds a new sample row. Previous rows are preserved.
|
||||||
|
- Advisor regeneration: optimization_advice.md is regenerated after every call
|
||||||
|
using the full set of accumulated rows.
|
||||||
|
- Each call returns its own `job_id` for individual status polling, plus the
|
||||||
|
shared `run_id` and `session_id`.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
POST /api/score/session_async Submit one call (returns job_id + run_id)
|
||||||
|
GET /api/score/sessions List all sessions
|
||||||
|
GET /api/score/sessions/{session_id} Session aggregate (call_count, metric_means, jobs)
|
||||||
|
GET /api/score/session/jobs/{job_id} Status of one individual call
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from webapp.models import (
|
||||||
|
AsyncScoreJobStatus,
|
||||||
|
ScoreRequest,
|
||||||
|
SessionScoreJobResponse,
|
||||||
|
SessionScoreRequest,
|
||||||
|
SessionStatus,
|
||||||
|
)
|
||||||
|
from webapp.services.session_score_manager import session_score_manager
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/score", tags=["score"])
|
||||||
|
logger = logging.getLogger("webapp.api.session_score_jobs")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/session_async",
    status_code=202,
    response_model=SessionScoreJobResponse,
    summary="提交 Session 异步评分(多样本批量聚合)",
    description=(
        "**用途**\n"
        "- 适合 Dify 循环节点、批量问答评测、同一对话多轮累计评分。\n"
        "- 相同 `session_id` 的多次调用不会生成多个独立报告,而是持续追加到同一个 session 报告。\n\n"
        "**请求字段说明**\n"
        "- `session_id`:会话唯一标识,同一会话必须保持一致。\n"
        "- `question` / `answer`:本次待评分的问答对。\n"
        "- `contexts`:检索片段拼接字符串,按 `context_separator` 拆分。\n"
        "- `ground_truth`:标准答案,可选;缺失时会自动跳过依赖它的指标。\n"
        "- `metrics`:本次需要计算的指标列表。\n"
        "- `judge_model` / `embedding_model`:可选;为空时回退到系统默认配置。\n\n"
        "**处理行为**\n"
        "1. 服务端立即返回 `202 Accepted`,并生成本次调用的 `job_id`。\n"
        "2. 系统根据 `session_id` 计算固定 `run_id`,格式为 `session-<sanitized-session_id>`。\n"
        "3. 本次评分完成后,会向该 session 的 `scores.csv` 追加一行样本数据。\n"
        "4. 系统会基于当前 session 的全量样本重写 `summary.md`,并重新生成 `optimization_advice.md`。\n"
        "5. 报告可在「运行列表」中按 `run_id` 查看;同一 session 的后续调用会持续增量更新该报告。\n\n"
        "**后续查询接口**\n"
        "- `GET /api/score/session/jobs/{job_id}`:查询本次调用状态与得分。\n"
        "- `GET /api/score/sessions/{session_id}`:查询整个 session 的累计调用次数、指标均值、所有作业记录。\n"
        "- `GET /api/runs/{run_id}`:查看完整评估报告内容。\n\n"
        "**典型请求示例**\n"
        "```json\n"
        "{\n"
        " \"session_id\": \"dify-session-001\",\n"
        " \"question\": \"单源CT与双源CT在球管配置上有何本质区别?\",\n"
        " \"answer\": \"单源CT只有一套球管-探测器系统,双源CT有两套独立的球管-探测器系统。\",\n"
        " \"contexts\": \"双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管\",\n"
        " \"context_separator\": \" |||| \",\n"
        " \"metrics\": [\"answer_relevancy\", \"faithfulness\"],\n"
        " \"judge_model\": \"gpt-5.5\",\n"
        " \"embedding_model\": \"text-embedding-3-small\"\n"
        "}\n"
        "```"
    ),
    responses={
        202: {
            "description": (
                "调用已排队,立即返回 job_id + run_id(202 Accepted)。\n\n"
                "相同 `session_id` 的多次调用合并为同一报告,每次调用新增一个样本行。\n"
                "评分完成后,`summary.md` 和 `optimization_advice.md` 增量更新。\n"
                "通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态,"
                "通过 `GET /api/score/session/jobs/{job_id}` 查询单次调用状态,"
                "在「运行列表」中查看完整报告(run_id 即 `session-<session_id>` 形式)。"
            ),
            "content": {
                "application/json": {
                    "example": {
                        "job_id": "abc123def456",
                        "session_id": "dify-session-001",
                        "run_id": "session-dify-session-001",
                        "status": "queued",
                        "call_count": 1,
                    }
                }
            },
        },
    },
)
def submit_session_async_score(request: SessionScoreRequest) -> SessionScoreJobResponse:
    """Queue one session-scoped scoring call and return its identifiers at once.

    The request is reduced to a plain ``ScoreRequest`` (everything except
    ``session_id``) and handed to the session score manager together with the
    session id. The manager returns the job status and the run id.

    Args:
        request: Validated payload carrying the Q&A sample plus ``session_id``.

    Returns:
        The job id, session id, run id, initial status and the session's
        current call count.
    """
    logger.info(
        "[session_async] submit session_id=%s metrics=%s has_ctx=%s has_gt=%s",
        request.session_id,
        request.metrics,
        bool(request.contexts),
        # "is not None" (rather than truthiness) matches the has_gt flag that the
        # synchronous score endpoint logs for the same request fields.
        request.ground_truth is not None,
    )

    # Strip session_id to build a plain ScoreRequest for the manager.
    score_request = ScoreRequest(
        question=request.question,
        answer=request.answer,
        contexts=request.contexts,
        ground_truth=request.ground_truth,
        context_separator=request.context_separator,
        metrics=request.metrics,
        judge_model=request.judge_model,
        embedding_model=request.embedding_model,
    )

    status, run_id = session_score_manager.submit(request.session_id, score_request)

    # The count is read back in a second call after submit().
    # NOTE(review): under concurrent submissions to the same session it may
    # already include later calls - confirm whether callers rely on it as an ordinal.
    session_status = session_score_manager.get_session(request.session_id)
    call_count = session_status.call_count if session_status else 1

    logger.info(
        "[session_async] queued job_id=%s session_id=%s run_id=%s call=%d",
        status.job_id,
        request.session_id,
        run_id,
        call_count,
    )
    return SessionScoreJobResponse(
        job_id=status.job_id,
        session_id=request.session_id,
        run_id=run_id,
        status=status.status,
        call_count=call_count,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/sessions",
    response_model=dict,
    summary="列出所有 Session 聚合状态",
)
def list_sessions() -> dict:
    """返回所有 session 的聚合状态,按最近完成时间倒序排列。"""
    # Docstring kept verbatim: with no `description=` on the route, FastAPI
    # publishes it as the operation description.
    # Returns a dict with a single "sessions" key holding one dumped model per
    # session, in whatever order the session manager returns them.
    aggregates = session_score_manager.list_sessions()
    logger.info("[session_score] list_sessions count=%d", len(aggregates))
    serialized = [aggregate.model_dump() for aggregate in aggregates]
    return {"sessions": serialized}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/sessions/{session_id}",
    response_model=SessionStatus,
    summary="查询 Session 聚合状态(指标均值 + 所有调用记录)",
    responses={404: {"description": "指定 session_id 不存在。"}},
)
def get_session(session_id: str) -> SessionStatus:
    """查询 session 的聚合评分状态。

    返回内容:
    - `run_id`:在「运行列表」中查看完整报告
    - `call_count`:本 session 累计调用次数
    - `metric_means`:所有已累积样本的各指标均值(实时读取 scores.csv)
    - `jobs`:本 session 所有调用记录列表
    """
    # Docstring kept verbatim: with no `description=` on the route, FastAPI
    # publishes it as the operation description.
    # Raises HTTPException(404) when the manager does not know this session id.
    aggregate = session_score_manager.get_session(session_id)
    if aggregate is not None:
        return aggregate
    raise HTTPException(status_code=404, detail=f"Session not found: {session_id}")
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/session/jobs/{job_id}",
    response_model=AsyncScoreJobStatus,
    summary="查询 Session 单次调用状态",
    responses={404: {"description": "指定 job_id 不存在。"}},
)
def get_session_job(job_id: str) -> AsyncScoreJobStatus:
    """查询 session 评分中某次调用的状态和评分结果。

    `status` 为 `completed` 时,`run_id` 即所属 session 的报告目录,
    `scores` 包含本次调用的各指标得分。
    """
    # Docstring kept verbatim: with no `description=` on the route, FastAPI
    # publishes it as the operation description.
    # Raises HTTPException(404) when the session manager has no job under this id.
    job = session_score_manager.get_job(job_id)
    if job is not None:
        return job
    not_found = f"Session score job not found: {job_id}"
    raise HTTPException(status_code=404, detail=not_found)
|
||||||
132
webapp/models.py
132
webapp/models.py
@@ -384,6 +384,14 @@ _GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
|
|||||||
"noise_sensitivity",
|
"noise_sensitivity",
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# 需要 contexts 才能计算的指标集合
|
||||||
|
_CONTEXT_DEPENDENT_METRICS: frozenset[str] = frozenset({
|
||||||
|
"faithfulness",
|
||||||
|
"context_recall",
|
||||||
|
"context_precision",
|
||||||
|
"noise_sensitivity",
|
||||||
|
})
|
||||||
|
|
||||||
# 所有合法指标名称
|
# 所有合法指标名称
|
||||||
_VALID_METRICS: frozenset[str] = frozenset({
|
_VALID_METRICS: frozenset[str] = frozenset({
|
||||||
"faithfulness",
|
"faithfulness",
|
||||||
@@ -428,8 +436,9 @@ class ScoreRequest(BaseModel):
|
|||||||
|
|
||||||
question: str = Field(description="问题文本。")
|
question: str = Field(description="问题文本。")
|
||||||
answer: str = Field(description="待评分的回答。")
|
answer: str = Field(description="待评分的回答。")
|
||||||
contexts: str = Field(
|
contexts: str | None = Field(
|
||||||
description="检索上下文字符串,多段之间用 context_separator 拼接。"
|
default=None,
|
||||||
|
description="检索上下文字符串,多段之间用 context_separator 拼接。缺失时自动跳过依赖检索内容的指标(faithfulness、context_recall、context_precision、noise_sensitivity)。",
|
||||||
)
|
)
|
||||||
ground_truth: str | None = Field(
|
ground_truth: str | None = Field(
|
||||||
default=None,
|
default=None,
|
||||||
@@ -467,15 +476,23 @@ class ScoreRequest(BaseModel):
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
def contexts_as_list(self) -> list[str]:
|
def contexts_as_list(self) -> list[str]:
|
||||||
"""Split the contexts string into a list of non-empty fragments."""
|
"""Split the contexts string into a list of non-empty fragments.
|
||||||
|
|
||||||
|
Returns an empty list when contexts is None or blank.
|
||||||
|
"""
|
||||||
|
if not self.contexts:
|
||||||
|
return []
|
||||||
separator = self.context_separator or " |||| "
|
separator = self.context_separator or " |||| "
|
||||||
return [part.strip() for part in self.contexts.split(separator) if part.strip()]
|
return [part.strip() for part in self.contexts.split(separator) if part.strip()]
|
||||||
|
|
||||||
def effective_metrics(self) -> list[str]:
|
def effective_metrics(self) -> list[str]:
|
||||||
"""Return metrics filtered to exclude GT-dependent ones when ground_truth is absent."""
|
"""Return metrics filtered to exclude GT-dependent or context-dependent ones when inputs are absent."""
|
||||||
if self.ground_truth is not None:
|
result = list(self.metrics)
|
||||||
return list(self.metrics)
|
if self.ground_truth is None:
|
||||||
return [metric_name for metric_name in self.metrics if metric_name not in _GT_DEPENDENT_METRICS]
|
result = [m for m in result if m not in _GT_DEPENDENT_METRICS]
|
||||||
|
if not self.contexts:
|
||||||
|
result = [m for m in result if m not in _CONTEXT_DEPENDENT_METRICS]
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
class ScoreResponse(BaseModel):
|
class ScoreResponse(BaseModel):
|
||||||
@@ -497,3 +514,104 @@ class ScoreResponse(BaseModel):
|
|||||||
default=None,
|
default=None,
|
||||||
description="打分异常时的错误信息(HTTP 200 仍返回,scores 为空)。",
|
description="打分异常时的错误信息(HTTP 200 仍返回,scores 为空)。",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 异步评分记录模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class AsyncScoreJobResponse(BaseModel):
    """Immediate 202 response after submitting an async score job."""

    # Identifier the client uses to look the job up afterwards.
    job_id: str = Field(description="任务唯一标识符,用于后续查询结果。")
    # Defaults to "queued", the state a freshly submitted job starts in.
    status: str = Field(default="queued", description="初始状态:queued。")
    # Defaults to None; per the field description it is written once scoring completes.
    run_id: str | None = Field(
        default=None,
        description="评分完成后写入的 Run ID,可在「运行列表」中查看完整报告。",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Session async 评分模型
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class SessionScoreRequest(ScoreRequest):
    """Request body for session-grouped async scoring.

    All calls sharing the same session_id are accumulated into one report.
    Each call adds a new sample row to the session's scores.csv.
    """

    # Example payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "summary": "Dify 会话批量评分",
                    "value": {
                        "session_id": "dify-session-001",
                        "question": "单源CT与双源CT在球管配置上有何本质区别?",
                        "answer": "单源CT只有一套球管-探测器系统,双源CT有两套独立的球管-探测器系统。",
                        "contexts": "双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管",
                        "context_separator": " |||| ",
                        "metrics": ["answer_relevancy", "faithfulness"],
                        "judge_model": "gpt-5.5",
                        "embedding_model": "text-embedding-3-small",
                    },
                }
            ]
        }
    )

    # Required (no default). This is the only field added on top of ScoreRequest.
    session_id: str = Field(
        description=(
            "会话唯一标识符。相同 session_id 的多次调用合并为同一报告,"
            "每次调用新增一个样本行,指标均值和优化建议在每次调用后增量更新。"
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class SessionScoreJobResponse(BaseModel):
    """Immediate 202 response after submitting a session scoring call."""

    # Identifier of this individual call.
    job_id: str = Field(description="本次调用的任务唯一标识符。")
    # Session identifier supplied by the caller.
    session_id: str = Field(description="会话标识符。")
    # Run id of the report that belongs to the session.
    run_id: str = Field(description="本 session 对应的报告 Run ID,可在「运行列表」中查看。")
    # Defaults to "queued", the state a freshly submitted call starts in.
    status: str = Field(default="queued", description="初始状态:queued。")
    # Defaults to 1; described as the session's running call total including this call.
    call_count: int = Field(default=1, description="本 session 当前累计调用次数(包含本次)。")
|
||||||
|
|
||||||
|
|
||||||
|
# AsyncScoreJobStatus is declared ahead of SessionStatus so that the
# `list[AsyncScoreJobStatus]` annotation below names an already-defined class
# instead of depending on deferred forward-reference resolution.
class AsyncScoreJobStatus(BaseModel):
    """State of one async score job (queued → running → completed/failed)."""

    job_id: str = Field(description="任务唯一标识符。")
    status: str = Field(description="queued | running | completed | failed")
    # Timestamps default to "" until a value is recorded.
    created_at: str = Field(default="", description="创建时间(ISO 8601 UTC)。")
    finished_at: str = Field(default="", description="完成时间(ISO 8601 UTC)。")
    run_id: str | None = Field(
        default=None,
        description="完成后对应的 Run ID,可通过 GET /api/runs/{run_id} 查看完整报告。",
    )
    request_summary: dict = Field(
        default_factory=dict,
        description="请求参数快照(question 前80字、metrics、judge_model 等)。",
    )
    # A metric maps to None when it has no score.
    scores: dict[str, float | None] = Field(default_factory=dict, description="各指标得分。")
    weighted_score: float | None = Field(default=None, description="加权综合得分。")
    latency_ms: int = Field(default=0, description="评分耗时毫秒。")
    skipped_metrics: list[str] = Field(default_factory=list)
    error: str | None = Field(default=None)


class SessionStatus(BaseModel):
    """Aggregate status and metrics for a scoring session."""

    session_id: str = Field(description="会话标识符。")
    run_id: str = Field(description="对应报告目录的 Run ID。")
    call_count: int = Field(description="本 session 累计调用次数。")
    # A metric maps to None when no mean is available for it.
    metric_means: dict[str, float | None] = Field(
        default_factory=dict, description="所有已累积样本的各指标均值。"
    )
    latest_finished_at: str = Field(default="", description="最近一次评分完成时间(ISO 8601 UTC)。")
    jobs: list[AsyncScoreJobStatus] = Field(
        default_factory=list, description="本 session 所有调用记录,按创建时间排序。"
    )
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from fastapi.exceptions import RequestValidationError
|
|||||||
from fastapi.responses import FileResponse, JSONResponse
|
from fastapi.responses import FileResponse, JSONResponse
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
|
||||||
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
|
from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score, score_jobs, session_score_jobs
|
||||||
|
|
||||||
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||||
logger = logging.getLogger("webapp.server")
|
logger = logging.getLogger("webapp.server")
|
||||||
@@ -69,10 +69,20 @@ OPENAPI_TAGS = [
|
|||||||
{
|
{
|
||||||
"name": "score",
|
"name": "score",
|
||||||
"description": (
|
"description": (
|
||||||
"**实时评分 API(Dify 外部 Tool)**\n\n"
|
"**实时评分 API(同步)** — `POST /api/score`\n\n"
|
||||||
"接受单条问答记录 `(question, answer, contexts, ground_truth)`,\n"
|
"**异步评分 API(Dify 推荐)** — `POST /api/score/async`\n\n"
|
||||||
"同步运行 RAGAS 指标打分,返回各指标得分和加权综合得分。\n\n"
|
"异步方式立即返回 job_id(202),评分在后台执行,完成后自动生成完整报告(含优化建议),"
|
||||||
"适用场景:Dify Agent 在回答后即时调用,用于质量监控或自我改进。\n\n"
|
"在「运行列表」页查看。\n\n"
|
||||||
|
"**Session 批量评分 API** — `POST /api/score/session_async`\n\n"
|
||||||
|
"适合 Dify 循环节点批量评估:同一 `session_id` 的多次调用合并为一个报告,"
|
||||||
|
"每次调用新增一个样本行,指标均值和优化建议增量更新。\n\n"
|
||||||
|
"**Session 模式调用流程**\n"
|
||||||
|
"1. `POST /api/score/session_async` 提交一条问答评分请求。\n"
|
||||||
|
"2. 用 `GET /api/score/session/jobs/{job_id}` 轮询单次调用状态。\n"
|
||||||
|
"3. 用 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n"
|
||||||
|
"4. 用 `GET /api/runs/{run_id}` 或在「运行列表」中查看完整报告。\n\n"
|
||||||
|
"通过 `GET /api/score/jobs` 列出所有异步评分记录,"
|
||||||
|
"`GET /api/score/jobs/{job_id}` 查询单个任务状态。\n\n"
|
||||||
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
|
"**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 "
|
||||||
"`Authorization: Bearer <token>` 请求头。"
|
"`Authorization: Bearer <token>` 请求头。"
|
||||||
),
|
),
|
||||||
@@ -87,7 +97,7 @@ OPENAPI_TAGS = [
|
|||||||
def create_app() -> FastAPI:
|
def create_app() -> FastAPI:
|
||||||
"""Build and configure the FastAPI application instance."""
|
"""Build and configure the FastAPI application instance."""
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="RAGAS 评估系统",
|
title="Siemens RAGAS 评估平台",
|
||||||
description=(
|
description=(
|
||||||
"西门子医疗影像 RAG 评估平台 API 文档。\n\n"
|
"西门子医疗影像 RAG 评估平台 API 文档。\n\n"
|
||||||
"提供以下能力:\n"
|
"提供以下能力:\n"
|
||||||
@@ -108,6 +118,8 @@ def create_app() -> FastAPI:
|
|||||||
app.include_router(llm_profiles.router)
|
app.include_router(llm_profiles.router)
|
||||||
app.include_router(pipeline.router)
|
app.include_router(pipeline.router)
|
||||||
app.include_router(score.router)
|
app.include_router(score.router)
|
||||||
|
app.include_router(score_jobs.router)
|
||||||
|
app.include_router(session_score_jobs.router)
|
||||||
|
|
||||||
@app.middleware("http")
|
@app.middleware("http")
|
||||||
async def access_log_middleware(request: Request, call_next):
|
async def access_log_middleware(request: Request, call_next):
|
||||||
|
|||||||
@@ -54,13 +54,22 @@ class InlineScorer:
|
|||||||
self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
|
self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
|
||||||
self._lock = threading.Lock()
|
self._lock = threading.Lock()
|
||||||
|
|
||||||
|
def invalidate_cache(self) -> None:
|
||||||
|
"""Clear the model cache so the next call rebuilds clients from current profiles."""
|
||||||
|
with self._lock:
|
||||||
|
self._model_cache.clear()
|
||||||
|
|
||||||
def _get_models(
|
def _get_models(
|
||||||
self,
|
self,
|
||||||
judge_model: str,
|
judge_model: str,
|
||||||
embedding_model: str,
|
embedding_model: str,
|
||||||
settings: EvaluationSettings,
|
settings: EvaluationSettings,
|
||||||
) -> tuple[Any, Any]:
|
) -> tuple[Any, Any]:
|
||||||
"""Return cached LLM/embedding clients, building them on first use."""
|
"""Return cached LLM/embedding clients, building them on first use.
|
||||||
|
|
||||||
|
Cache is keyed by (judge_model, embedding_model). Call invalidate_cache()
|
||||||
|
after updating an LLM Profile to force a fresh client on the next request.
|
||||||
|
"""
|
||||||
cache_key = (judge_model, embedding_model)
|
cache_key = (judge_model, embedding_model)
|
||||||
with self._lock:
|
with self._lock:
|
||||||
if cache_key not in self._model_cache:
|
if cache_key not in self._model_cache:
|
||||||
|
|||||||
257
webapp/services/pipeline_task_manager.py
Normal file
257
webapp/services/pipeline_task_manager.py
Normal file
@@ -0,0 +1,257 @@
|
|||||||
|
"""Background task manager for end-to-end pipeline jobs (build + eval).
|
||||||
|
|
||||||
|
Each job runs three sequential phases inside a worker thread:
|
||||||
|
1. parsing_documents — AliyunDocmind parses every PDF
|
||||||
|
2. generating_questions — LLM generates a draft question bank
|
||||||
|
3. evaluating — RAGAS online evaluation scores each question
|
||||||
|
|
||||||
|
The DatasetBuildJob and Scenario objects are constructed entirely from the
|
||||||
|
API request parameters, so no YAML config files are needed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import threading
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from contextlib import redirect_stderr, redirect_stdout
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from webapp.models import (
|
||||||
|
PipelineJobRequest,
|
||||||
|
PipelineJobStatus,
|
||||||
|
PipelineResult,
|
||||||
|
)
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_PIPELINE_OUTPUT_ROOT = _REPO_ROOT / "outputs" / "pipeline"
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
class _LineCapture(io.TextIOBase):
|
||||||
|
"""Write-only stream that appends complete lines to a task's log buffer."""
|
||||||
|
|
||||||
|
def __init__(self, sink: "PipelineTask") -> None:
|
||||||
|
self._sink = sink
|
||||||
|
self._buffer = ""
|
||||||
|
|
||||||
|
def write(self, text: str) -> int:
|
||||||
|
self._buffer += text
|
||||||
|
while "\n" in self._buffer:
|
||||||
|
line, self._buffer = self._buffer.split("\n", 1)
|
||||||
|
self._sink.append_log(line)
|
||||||
|
return len(text)
|
||||||
|
|
||||||
|
def flush(self) -> None:
|
||||||
|
if self._buffer:
|
||||||
|
self._sink.append_log(self._buffer)
|
||||||
|
self._buffer = ""
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineTask:
    """Mutable record of a single pipeline job (build + eval).

    ``append_log`` and ``snapshot`` serialise on a per-task lock.
    """

    def __init__(self, job_id: str, job_name: str) -> None:
        self._lock = threading.Lock()
        # Identity
        self.job_id = job_id
        self.job_name = job_name
        # Timestamps: creation is stamped now, completion stays empty for now.
        self.created_at = _now_iso()
        self.finished_at = ""
        # Progress
        self.status = "queued"
        self.phase = "idle"
        self.logs: list[str] = []
        # Outcome
        self.result: PipelineResult | None = None
        self.error: str | None = None

    def append_log(self, line: str) -> None:
        """Append one line to the task log while holding the task lock."""
        with self._lock:
            self.logs.append(line)

    def snapshot(self) -> PipelineJobStatus:
        """Build a ``PipelineJobStatus`` from the current fields under the task lock."""
        with self._lock:
            fields = {
                "job_id": self.job_id,
                "job_name": self.job_name,
                "status": self.status,
                "phase": self.phase,
                # Copy so the returned model does not share the live log list.
                "logs": self.logs.copy(),
                "result": self.result,
                "error": self.error,
                "created_at": self.created_at,
                "finished_at": self.finished_at,
            }
            return PipelineJobStatus(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineTaskManager:
|
||||||
|
"""Owns the thread pool and registry of pipeline jobs."""
|
||||||
|
|
||||||
|
def __init__(self, max_workers: int = 2) -> None:
|
||||||
|
self._executor = ThreadPoolExecutor(max_workers=max_workers)
|
||||||
|
self._tasks: dict[str, PipelineTask] = {}
|
||||||
|
self._lock = threading.Lock()
|
||||||
|
|
||||||
|
def submit(self, request: PipelineJobRequest) -> PipelineTask:
    """Register a new pipeline job, schedule it on the pool, and return its task.

    The job id is the first 12 hex characters of a random UUID. When the
    requested job name is blank after stripping, a name derived from the id
    is used instead.
    """
    new_id = uuid.uuid4().hex[:12]
    display_name = request.job_name.strip()
    if not display_name:
        display_name = f"pipeline-{new_id[:6]}"
    task = PipelineTask(job_id=new_id, job_name=display_name)
    with self._lock:
        self._tasks[new_id] = task
    self._executor.submit(self._run, task, request)
    return task
|
||||||
|
|
||||||
|
def get(self, job_id: str) -> PipelineJobStatus | None:
    """Return a snapshot of the job registered under ``job_id``, or None if unknown."""
    with self._lock:
        found = self._tasks.get(job_id)
    # The snapshot is taken after the registry lock has been released.
    if found is None:
        return None
    return found.snapshot()
|
||||||
|
|
||||||
|
def list_jobs(self) -> list[PipelineJobStatus]:
    """Return snapshots of every registered job, newest ``created_at`` first."""
    with self._lock:
        registered = list(self._tasks.values())
    # Snapshots are taken after the registry lock has been released.
    return sorted(
        (entry.snapshot() for entry in registered),
        key=lambda status: status.created_at,
        reverse=True,
    )
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Worker
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def _run(self, task: PipelineTask, request: PipelineJobRequest) -> None:
|
||||||
|
"""Execute the full pipeline end to end inside a worker thread."""
|
||||||
|
task.status = "running"
|
||||||
|
task.append_log(f"[{_now_iso()}] 开始 pipeline 任务: {task.job_name}")
|
||||||
|
|
||||||
|
capture = _LineCapture(task)
|
||||||
|
try:
|
||||||
|
with redirect_stdout(capture), redirect_stderr(capture):
|
||||||
|
result = self._execute(task, request)
|
||||||
|
capture.flush()
|
||||||
|
task.result = result
|
||||||
|
task.phase = "done"
|
||||||
|
task.status = "completed"
|
||||||
|
task.append_log(f"[{_now_iso()}] pipeline 任务完成: {task.job_name}")
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
capture.flush()
|
||||||
|
task.error = f"{type(exc).__name__}: {exc}"
|
||||||
|
task.append_log(f"[{_now_iso()}] pipeline 任务失败: {task.error}")
|
||||||
|
task.status = "failed"
|
||||||
|
finally:
|
||||||
|
task.finished_at = _now_iso()
|
||||||
|
|
||||||
|
def _execute(self, task: PipelineTask, req: PipelineJobRequest) -> PipelineResult:
|
||||||
|
"""Run build then eval, updating task.phase as we go."""
|
||||||
|
|
||||||
|
# ── resolve paths ──────────────────────────────────────────────
|
||||||
|
docs_path = Path(req.docs_path)
|
||||||
|
if not docs_path.is_absolute():
|
||||||
|
docs_path = (_REPO_ROOT / docs_path).resolve()
|
||||||
|
if not docs_path.is_dir():
|
||||||
|
raise ValueError(f"docs_path is not an existing directory: {docs_path}")
|
||||||
|
|
||||||
|
job_output_dir = _PIPELINE_OUTPUT_ROOT / task.job_id
|
||||||
|
build_artifact_dir = job_output_dir / "build"
|
||||||
|
dataset_csv = job_output_dir / "generated_dataset.csv"
|
||||||
|
eval_output_dir = job_output_dir / "eval"
|
||||||
|
|
||||||
|
# ── phase 1 + 2: dataset build (parse & generate) ─────────────
|
||||||
|
task.phase = "parsing_documents"
|
||||||
|
task.append_log(f" [build] 扫描文档目录: {docs_path}")
|
||||||
|
build_result = self._run_build(task, req, docs_path, build_artifact_dir, dataset_csv)
|
||||||
|
|
||||||
|
source_chunks_jsonl = build_artifact_dir / "latest" / "source_chunks.jsonl"
|
||||||
|
total_q = len(build_result.draft_samples)
|
||||||
|
parse_failures = len(build_result.parse_failures)
|
||||||
|
task.append_log(f" [build] 题库生成完毕: {total_q} 道题目, {parse_failures} 份文档解析失败")
|
||||||
|
|
||||||
|
if total_q == 0:
|
||||||
|
raise RuntimeError("题库为空(所有文档均解析或生成失败),中止评估。")
|
||||||
|
|
||||||
|
# ── phase 3: evaluation ────────────────────────────────────────
|
||||||
|
task.phase = "evaluating"
|
||||||
|
task.append_log(f" [eval] 开始 RAGAS 评估,共 {total_q} 道题目")
|
||||||
|
eval_result = self._run_eval(task, req, dataset_csv, source_chunks_jsonl, eval_output_dir)
|
||||||
|
|
||||||
|
from rag_eval.reporting.artifacts import build_artifact_paths as _build_eval_paths
|
||||||
|
eval_artifact_paths = _build_eval_paths(eval_result.scenario.output_dir, eval_result.run_id)
|
||||||
|
|
||||||
|
return PipelineResult(
|
||||||
|
build_artifact_dir=build_artifact_dir.as_posix(),
|
||||||
|
dataset_csv=dataset_csv.as_posix(),
|
||||||
|
source_chunks_jsonl=source_chunks_jsonl.as_posix(),
|
||||||
|
total_questions=total_q,
|
||||||
|
parse_failures=parse_failures,
|
||||||
|
eval_run_id=eval_result.run_id,
|
||||||
|
eval_output_dir=eval_result.scenario.output_dir.as_posix(),
|
||||||
|
scores_csv=eval_artifact_paths.scores_csv.as_posix(),
|
||||||
|
summary_md=eval_artifact_paths.summary_md.as_posix(),
|
||||||
|
)
|
||||||
|
|
||||||
|
def _run_build(self, task: PipelineTask, req: PipelineJobRequest,
|
||||||
|
docs_path: Path, artifact_dir: Path, dataset_csv: Path):
|
||||||
|
"""Construct DatasetBuildJob and run the build phase."""
|
||||||
|
from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
|
||||||
|
from rag_eval.dataset_builder.runner import execute_dataset_build_job
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
|
||||||
|
settings = EvaluationSettings()
|
||||||
|
job = DatasetBuildJob(
|
||||||
|
job_name=task.job_name,
|
||||||
|
input_path=docs_path,
|
||||||
|
input_glob="*.pdf",
|
||||||
|
parser_provider="aliyun_docmind",
|
||||||
|
failure_mode=req.failure_mode, # type: ignore[arg-type]
|
||||||
|
generation_model=req.generation_model,
|
||||||
|
output_type="online_question_bank",
|
||||||
|
review_mode="draft_with_manual_review",
|
||||||
|
max_questions_per_document=req.max_questions_per_document,
|
||||||
|
max_source_chunks_per_question=req.max_source_chunks_per_question,
|
||||||
|
dataset_path=dataset_csv,
|
||||||
|
artifact_dir=artifact_dir,
|
||||||
|
runtime=DatasetBuildRuntime(max_documents=req.max_documents),
|
||||||
|
)
|
||||||
|
return execute_dataset_build_job(job, settings=settings)
|
||||||
|
|
||||||
|
def _run_eval(self, task: PipelineTask, req: PipelineJobRequest,
|
||||||
|
dataset_csv: Path, source_chunks_jsonl: Path, eval_output_dir: Path):
|
||||||
|
"""Construct Scenario and run the evaluation phase."""
|
||||||
|
from rag_eval.execution.runner import run_scenario_from_scenario_obj
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from rag_eval.shared.models import (
|
||||||
|
AppAdapterConfig, DatasetConfig, RuntimeConfig, Scenario,
|
||||||
|
)
|
||||||
|
|
||||||
|
settings = EvaluationSettings()
|
||||||
|
scenario = Scenario(
|
||||||
|
scenario_name=task.job_name,
|
||||||
|
mode="online",
|
||||||
|
dataset=DatasetConfig(path=dataset_csv),
|
||||||
|
judge_model=req.judge_model,
|
||||||
|
embedding_model=req.embedding_model,
|
||||||
|
metrics=list(req.metrics),
|
||||||
|
output_dir=eval_output_dir,
|
||||||
|
runtime=RuntimeConfig(
|
||||||
|
batch_size=4,
|
||||||
|
app_concurrency=2,
|
||||||
|
metric_concurrency=2,
|
||||||
|
max_samples=req.max_samples,
|
||||||
|
),
|
||||||
|
app_adapter=AppAdapterConfig(
|
||||||
|
type="python",
|
||||||
|
callable="apps.siemens_pdf_qa.adapter:run",
|
||||||
|
static_kwargs={
|
||||||
|
"source_chunks_path": source_chunks_jsonl,
|
||||||
|
"model": req.answer_model,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
optimization_advisor=req.optimization_advisor,
|
||||||
|
)
|
||||||
|
return run_scenario_from_scenario_obj(scenario, settings=settings)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by the FastAPI routes.
|
||||||
|
pipeline_task_manager = PipelineTaskManager()  # built with the default pool size (max_workers=2)
|
||||||
@@ -37,6 +37,9 @@ GROUPING_FIELDS = ("difficulty", "question_type", "language")
|
|||||||
# How many lowest-scoring samples to surface for manual review.
|
# How many lowest-scoring samples to surface for manual review.
|
||||||
LOWEST_SAMPLE_COUNT = 10
|
LOWEST_SAMPLE_COUNT = 10
|
||||||
|
|
||||||
|
# Metrics whose lower raw value means stronger performance.
|
||||||
|
LOWER_IS_BETTER_METRICS = {"noise_sensitivity"}
|
||||||
|
|
||||||
|
|
||||||
def _round_or_none(value: float | None) -> float | None:
|
def _round_or_none(value: float | None) -> float | None:
|
||||||
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
|
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
|
||||||
@@ -105,7 +108,7 @@ def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupS
|
|||||||
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
|
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
|
||||||
"""Average a single sample's available metric scores for ranking."""
|
"""Average a single sample's available metric scores for ranking."""
|
||||||
values = [
|
values = [
|
||||||
float(row[metric])
|
(1.0 - float(row[metric])) if metric in LOWER_IS_BETTER_METRICS else float(row[metric])
|
||||||
for metric in metrics
|
for metric in metrics
|
||||||
if metric in row and pd.notna(row[metric])
|
if metric in row and pd.notna(row[metric])
|
||||||
]
|
]
|
||||||
@@ -177,9 +180,11 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
|
|||||||
w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
|
w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
|
||||||
rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
|
rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
|
||||||
|
|
||||||
overall_ws = compute_overall_weighted_score_mean(
|
# 综合加权得分计算(已暂时禁用)
|
||||||
score_rows_list, metric_weights, doc_weights
|
# overall_ws = compute_overall_weighted_score_mean(
|
||||||
)
|
# score_rows_list, metric_weights, doc_weights
|
||||||
|
# )
|
||||||
|
overall_ws = None
|
||||||
|
|
||||||
distributions = {
|
distributions = {
|
||||||
metric: _distribution(frame, metric)
|
metric: _distribution(frame, metric)
|
||||||
|
|||||||
271
webapp/services/score_job_manager.py
Normal file
271
webapp/services/score_job_manager.py
Normal file
@@ -0,0 +1,271 @@
|
|||||||
|
"""Background task manager for async RAGAS single-sample scoring.
|
||||||
|
|
||||||
|
Each job:
|
||||||
|
1. Runs InlineScorer.score() in a thread pool.
|
||||||
|
2. Constructs a minimal EvaluationResult + Scenario in the standard format.
|
||||||
|
3. Calls write_run_artifacts() — produces metadata.json, scores.csv, summary.md.
|
||||||
|
4. Calls run_advisor() — produces optimization_advice.md.
|
||||||
|
|
||||||
|
The resulting run directory lands under outputs/score-async/<run_id>/ and is
|
||||||
|
automatically picked up by run_reader.list_run_summaries(), so it appears in
|
||||||
|
the existing 「运行列表」 and 「报告详情」 pages without any extra wiring.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from webapp.models import AsyncScoreJobStatus, ScoreRequest
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_DEFAULT_JOBS_DIR = _REPO_ROOT / "outputs" / "score-async"
|
||||||
|
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-jobs" # lightweight job index
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
class ScoreJobManager:
    """Thread-pool manager for async scoring jobs.

    Results are written as standard run artifacts so the report detail page
    can render them with zero additional code. Job statuses are cached in
    memory and mirrored to one JSON file per job under ``index_dir``.
    """

    def __init__(
        self,
        output_dir: Path = _DEFAULT_JOBS_DIR,
        index_dir: Path = _DEFAULT_INDEX_DIR,
        max_workers: int = 4,
    ) -> None:
        """Create the output/index directories, the worker pool and the cache.

        Args:
            output_dir: Directory that receives the run artifacts.
            index_dir: Directory holding one ``<job_id>.json`` status file per job.
            max_workers: Maximum number of scoring jobs executed concurrently.
        """
        self._output_dir = Path(output_dir)
        self._index_dir = Path(index_dir)
        self._output_dir.mkdir(parents=True, exist_ok=True)
        self._index_dir.mkdir(parents=True, exist_ok=True)
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._cache: dict[str, AsyncScoreJobStatus] = {}
        self._lock = threading.Lock()
        self._load_existing()

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #

    def submit(self, request: ScoreRequest) -> AsyncScoreJobStatus:
        """Queue one scoring job and return its initial status immediately.

        The stored request summary keeps only the first 80 characters of the
        question and answer.
        """
        job_id = uuid.uuid4().hex[:12]
        status = AsyncScoreJobStatus(
            job_id=job_id,
            status="queued",
            created_at=_now_iso(),
            request_summary={
                "question": (request.question or "")[:80],
                "answer": (request.answer or "")[:80],
                "metrics": list(request.metrics),
                "judge_model": request.judge_model or "",
                "embedding_model": request.embedding_model or "",
                "has_contexts": bool(request.contexts),
                "has_ground_truth": bool(request.ground_truth),
            },
        )
        with self._lock:
            self._cache[job_id] = status
        # Persist before scheduling so the worker's first update cannot be
        # overwritten by this initial "queued" record.
        self._persist_index(status)
        self._executor.submit(self._run, job_id, request)
        return status

    def get(self, job_id: str) -> AsyncScoreJobStatus | None:
        """Return current status or None if unknown."""
        with self._lock:
            return self._cache.get(job_id)

    def list_jobs(self) -> list[AsyncScoreJobStatus]:
        """Return all known jobs, newest first."""
        with self._lock:
            jobs = list(self._cache.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs

    # ------------------------------------------------------------------ #
    # Worker
    # ------------------------------------------------------------------ #

    def _run(self, job_id: str, request: ScoreRequest) -> None:
        """Execute scoring, write run artifacts, run advisor.

        Everything after the job is marked "running" happens inside one
        ``try`` block, including the lazy imports and settings construction.
        Any failure there marks the job "failed"; raised outside the ``try``
        it would only be stored on the executor's Future and the job would
        stay "running" forever. An advisor failure is logged and does not
        fail the job.
        """
        import logging
        logger = logging.getLogger("webapp.services.score_job_manager")
        self._update(job_id, status="running")

        # Provisional start time so the failure path can always report a
        # latency; reset just before scoring starts.
        t0 = time.monotonic()

        try:
            # Lazy imports to keep web server bootable if ragas is not installed.
            from rag_eval.advisor import run_advisor
            from rag_eval.metrics.factory import build_models
            from rag_eval.reporting.writers import write_run_artifacts
            from rag_eval.settings import EvaluationSettings
            from rag_eval.shared.models import (
                DatasetConfig, EvaluationResult, NormalizedSample, Scenario,
            )
            from rag_eval.shared.utils import utc_now_iso
            from webapp.services.inline_scorer import inline_scorer

            settings = EvaluationSettings()
            judge_model = request.judge_model or settings.ragas_judge_model
            embedding_model = request.embedding_model or settings.ragas_embedding_model
            effective = request.effective_metrics()
            requested = set(request.metrics)
            # Requested metrics that effective_metrics() did not return.
            skipped = sorted(requested - set(effective))

            t0 = time.monotonic()
            started_at = utc_now_iso()

            if effective:
                raw_scores = inline_scorer.score(
                    question=request.question,
                    answer=request.answer,
                    contexts=request.contexts_as_list(),
                    ground_truth=request.ground_truth,
                    metrics=effective,
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    settings=settings,
                )
            else:
                raw_scores = {}

            latency_ms = int((time.monotonic() - t0) * 1000)
            finished_at = utc_now_iso()

            # Build full scores dict (skipped = None)
            all_scores: dict[str, float | None] = {m: None for m in request.metrics}
            all_scores.update(raw_scores)
            # Composite weighted score calculation (temporarily disabled).
            # Re-enabling it requires importing compute_weighted_score from
            # rag_eval.metrics.weights:
            # weighted_raw = compute_weighted_score(
            #     {k: v for k, v in raw_scores.items() if v is not None}, {}
            # )
            # weighted = round(weighted_raw, 4) if weighted_raw is not None else None
            weighted = None

            # Build a score row compatible with report_builder
            score_row: dict[str, Any] = {
                "sample_id": "async-score-1",
                "question": request.question,
                "answer": request.answer or "",
                "contexts": request.contexts or "",
                "ground_truth": request.ground_truth or "",
                "error": "",
            }
            score_row.update(all_scores)

            # Construct minimal EvaluationResult so write_run_artifacts works.
            # The run id is the finish timestamp with ":" replaced by "-".
            run_id = finished_at.replace(":", "-")
            output_dir = self._output_dir

            # Build a minimal Scenario for snapshot + advisor
            scenario = Scenario(
                scenario_name=f"async-score-{job_id}",
                mode="offline",
                dataset=DatasetConfig(path=output_dir / run_id / "dataset.csv"),
                judge_model=judge_model,
                embedding_model=embedding_model,
                metrics=list(request.metrics),
                output_dir=output_dir,
                optimization_advisor=True,  # always generate advice
            )

            sample = NormalizedSample(
                sample_id="async-score-1",
                question=request.question,
                answer=request.answer or "",
                contexts=request.contexts_as_list(),
                ground_truth=request.ground_truth or "",
            )

            result = EvaluationResult(
                scenario=scenario,
                run_id=run_id,
                started_at=started_at,
                finished_at=finished_at,
                valid_samples=[sample],
                invalid_samples=[],
                score_rows=[score_row],
            )

            write_run_artifacts(result)
            logger.info("[score_job] artifacts written job_id=%s run_id=%s", job_id, run_id)

            # Run optimization advisor (builds optimization_advice.md)
            try:
                llm, _ = build_models(judge_model, embedding_model, settings)
                run_advisor(result, scenario, llm)
                logger.info("[score_job] advisor done job_id=%s", job_id)
            except Exception as adv_exc:  # noqa: BLE001
                logger.warning("[score_job] advisor failed job_id=%s err=%s", job_id, adv_exc)

            self._update(
                job_id,
                status="completed",
                finished_at=finished_at,
                run_id=run_id,
                scores=all_scores,
                weighted_score=weighted,
                latency_ms=latency_ms,
                skipped_metrics=skipped,
            )

        except Exception as exc:  # noqa: BLE001
            latency_ms = int((time.monotonic() - t0) * 1000)
            logger.error("[score_job] failed job_id=%s err=%s", job_id, exc)
            self._update(
                job_id,
                status="failed",
                finished_at=_now_iso(),
                latency_ms=latency_ms,
                error=f"{type(exc).__name__}: {exc}",
            )

    # ------------------------------------------------------------------ #
    # Persistence helpers
    # ------------------------------------------------------------------ #

    def _update(self, job_id: str, **kwargs: Any) -> None:
        """Merge kwargs into the job status and persist the index.

        Unknown job ids are ignored. The cached status object is replaced by
        an updated copy rather than mutated in place.
        """
        with self._lock:
            existing = self._cache.get(job_id)
            if existing is None:
                return
            updated = existing.model_copy(update=kwargs)
            self._cache[job_id] = updated
            self._persist_index(updated)

    def _persist_index(self, status: AsyncScoreJobStatus) -> None:
        """Write a lightweight index JSON for this job (survives restarts).

        The payload goes to a sibling ``<job_id>.json.tmp`` file that is then
        renamed over the real index file, so an interrupted write cannot leave
        a truncated index behind. The temporary name does not match the
        ``*.json`` pattern used by ``_load_existing``.
        """
        path = self._index_dir / f"{status.job_id}.json"
        payload = json.dumps(status.model_dump(), ensure_ascii=False, indent=2)
        tmp_path = path.with_name(f"{path.name}.tmp")
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(path)

    def _load_existing(self) -> None:
        """Load existing job index files on startup.

        Index files that cannot be read or validated are skipped with a
        warning instead of being dropped silently.

        NOTE(review): statuses are restored exactly as stored, so a job that
        was "queued" or "running" when the previous process stopped stays in
        that state; confirm whether such jobs should be marked as failed.
        """
        import logging
        logger = logging.getLogger("webapp.services.score_job_manager")
        for path in sorted(self._index_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                status = AsyncScoreJobStatus.model_validate(data)
            except Exception as exc:  # noqa: BLE001
                logger.warning("[score_job] skipping unreadable index file %s: %s", path, exc)
                continue
            self._cache[status.job_id] = status
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
|
||||||
|
score_job_manager = ScoreJobManager()  # import-time side effects: creates the output/index dirs and loads saved job index files
|
||||||
452
webapp/services/session_score_manager.py
Normal file
452
webapp/services/session_score_manager.py
Normal file
@@ -0,0 +1,452 @@
|
|||||||
|
"""Background task manager for session-grouped async RAGAS scoring.
|
||||||
|
|
||||||
|
Each session groups multiple scoring calls into one shared run report:
|
||||||
|
|
||||||
|
1. First call: creates outputs/score-session/session-<id>/ and metadata.json.
|
||||||
|
2. Every call: appends a new sample row to scores.csv, rewrites summary.md
|
||||||
|
and optimization_advice.md by re-running write_run_artifacts + run_advisor
|
||||||
|
over ALL accumulated rows.
|
||||||
|
3. The resulting run directory is picked up automatically by run_reader, so the
|
||||||
|
「运行列表」 and 「报告详情」 pages show the live, growing report.
|
||||||
|
|
||||||
|
Concurrency model:
|
||||||
|
- Scoring (LLM network I/O) runs freely in the thread pool — different sessions
|
||||||
|
score concurrently; multiple calls to the same session also start scoring in
|
||||||
|
parallel.
|
||||||
|
- File I/O (CSV append, artifact rewrite, advisor) is serialized per session via
|
||||||
|
a per-session threading.Lock, so no two calls corrupt the same session's CSV.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from webapp.models import AsyncScoreJobStatus, ScoreRequest, SessionStatus
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_DEFAULT_OUTPUT_DIR = _REPO_ROOT / "outputs" / "score-session"
|
||||||
|
_DEFAULT_INDEX_DIR = _REPO_ROOT / "outputs" / "score-session-jobs"
|
||||||
|
|
||||||
|
# Columns that are sample metadata rather than metric scores (mirrors run_reader.NON_METRIC_COLUMNS)
|
||||||
|
_NON_METRIC_COLUMNS = {
|
||||||
|
"sample_id", "question", "contexts", "answer", "ground_truth",
|
||||||
|
"scenario", "language", "retrieval_config", "error",
|
||||||
|
"judge_model", "embedding_model", "run_id", "difficulty",
|
||||||
|
"question_type", "doc_id", "doc_name", "section_path",
|
||||||
|
"page_start", "page_end", "source_chunk_ids", "review_status",
|
||||||
|
"review_notes", "weighted_score", "sample_weight",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_session_id(session_id: str) -> str:
|
||||||
|
"""Convert an arbitrary session_id string to a safe directory-name fragment."""
|
||||||
|
return re.sub(r"[^a-zA-Z0-9]", "-", session_id)[:64].strip("-") or "default"
|
||||||
|
|
||||||
|
|
||||||
|
class SessionScoreJobManager:
|
||||||
|
"""Thread-pool manager for session-grouped async scoring jobs.
|
||||||
|
|
||||||
|
All calls sharing a session_id append to one shared run directory, so the
|
||||||
|
report detail page shows all samples and their aggregate metrics together.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    output_dir: Path = _DEFAULT_OUTPUT_DIR,
    index_dir: Path = _DEFAULT_INDEX_DIR,
    max_workers: int = 4,
) -> None:
    """Prepare directories, the worker pool and the in-memory registries.

    Creates ``output_dir``, ``index_dir`` and ``index_dir/_sessions`` if they
    are missing, then loads previously persisted state via ``_load_existing``.
    """
    self._output_dir = Path(output_dir)
    self._index_dir = Path(index_dir)
    required_dirs = (
        self._output_dir,
        self._index_dir,
        self._index_dir / "_sessions",
    )
    for directory in required_dirs:
        directory.mkdir(parents=True, exist_ok=True)
    self._executor = ThreadPoolExecutor(max_workers=max_workers)

    # Guards the three registries below.
    self._lock = threading.Lock()
    # job_id -> AsyncScoreJobStatus
    self._job_cache: dict[str, AsyncScoreJobStatus] = {}
    # session_id -> [job_ids in order]
    self._session_jobs: dict[str, list[str]] = {}
    # session_id -> per-session threading.Lock
    self._session_locks: dict[str, threading.Lock] = {}

    self._load_existing()
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Public API
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def session_run_id(self, session_id: str) -> str:
    """Derive the run id for a session: ``session-`` plus the sanitized id.

    The same ``session_id`` always yields the same run id, which is also used
    as the session's directory name.
    """
    safe_fragment = _sanitize_session_id(session_id)
    return f"session-{safe_fragment}"
|
||||||
|
|
||||||
|
def submit(self, session_id: str, request: ScoreRequest) -> tuple[AsyncScoreJobStatus, str]:
    """Register one scoring call under a session and schedule it.

    Returns ``(job_status, run_id)``; ``run_id`` is derived from ``session_id``
    alone. The stored request summary keeps only the first 80 characters of
    the question and answer.
    """
    run_id = self.session_run_id(session_id)
    job_id = uuid.uuid4().hex[:12]

    created_at = _now_iso()
    summary = {
        "question": (request.question or "")[:80],
        "answer": (request.answer or "")[:80],
        "metrics": list(request.metrics),
        "judge_model": request.judge_model or "",
        "embedding_model": request.embedding_model or "",
        "has_contexts": bool(request.contexts),
        "has_ground_truth": bool(request.ground_truth),
        "session_id": session_id,
    }
    status = AsyncScoreJobStatus(
        job_id=job_id,
        status="queued",
        created_at=created_at,
        request_summary=summary,
    )

    with self._lock:
        self._job_cache[job_id] = status
        # Create the session's job list on first use, then record this call.
        self._session_jobs.setdefault(session_id, []).append(job_id)

    self._persist_job_index(status)
    self._persist_session_index(session_id)
    self._executor.submit(self._run, job_id, session_id, run_id, request)
    return status, run_id
|
||||||
|
|
||||||
|
def get_job(self, job_id: str) -> AsyncScoreJobStatus | None:
    """Look up one scoring call by id; ``None`` when the id is not cached."""
    with self._lock:
        cached = self._job_cache.get(job_id)
    return cached
|
||||||
|
|
||||||
|
def list_jobs(self) -> list[AsyncScoreJobStatus]:
    """Return every cached job record ordered by ``created_at``, newest first."""
    with self._lock:
        cached = list(self._job_cache.values())
    return sorted(cached, key=lambda job: job.created_at, reverse=True)
|
||||||
|
|
||||||
|
def get_session(self, session_id: str) -> SessionStatus | None:
    """Assemble the aggregate status of one session, or ``None`` if unknown.

    A session with no recorded job ids is treated as unknown. Metric means are
    read from the session's run directory outside the registry lock, so they
    are a best-effort view while a call is still being written.
    """
    with self._lock:
        job_ids = list(self._session_jobs.get(session_id) or [])
    if len(job_ids) == 0:
        return None

    run_id = self.session_run_id(session_id)

    # Compute live metric means from the CSV (may be mid-update — best effort)
    metric_means = self._read_metric_means(self._output_dir / run_id)

    with self._lock:
        # Ids without a cached status are left out of the job list.
        jobs = [self._job_cache[jid] for jid in job_ids if jid in self._job_cache]

    finished_stamps = [job.finished_at for job in jobs if job.finished_at]
    latest = max(finished_stamps) if finished_stamps else ""
    jobs_by_creation = sorted(jobs, key=lambda job: job.created_at)
    return SessionStatus(
        session_id=session_id,
        run_id=run_id,
        call_count=len(job_ids),
        metric_means=metric_means,
        latest_finished_at=latest,
        jobs=jobs_by_creation,
    )
|
||||||
|
|
||||||
|
def list_sessions(self) -> list[SessionStatus]:
    """Return aggregate statuses for all known sessions.

    Sessions for which ``get_session`` returns ``None`` are omitted. The
    result is ordered by ``latest_finished_at``, most recent first.
    """
    with self._lock:
        known_ids = list(self._session_jobs.keys())
    statuses = [
        status
        for status in map(self.get_session, known_ids)
        if status is not None
    ]
    statuses.sort(key=lambda status: status.latest_finished_at, reverse=True)
    return statuses
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Worker
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def _run(self, job_id: str, session_id: str, run_id: str, request: ScoreRequest) -> None:
    """Score one sample, then append it to the session's shared run artifacts.

    Steps:
      1. Mark the job as "running".
      2. Score the sample with ``inline_scorer`` for the effective metrics.
      3. Under the per-session lock, re-read ``scores.csv``, append the new
         row, rewrite the run artifacts and regenerate the advisor output.
      4. Mark the job "completed" (or "failed" on any exception) and persist
         the session index.

    Any exception — including a failing lazy import — is recorded on the job
    as ``status="failed"``; nothing is re-raised to the worker thread.
    """
    import logging

    logger = logging.getLogger("webapp.services.session_score_manager")
    self._update_job(job_id, status="running")

    # Defined before the ``try`` so the failure path can always report latency.
    t0 = time.monotonic()

    try:
        # Lazy imports — keep web server bootable if ragas is not installed.
        # They live inside the ``try`` so that an ImportError (or a settings
        # error) marks the job as failed instead of leaving it "running".
        from rag_eval.advisor import run_advisor
        from rag_eval.metrics.factory import build_models
        from rag_eval.reporting.writers import write_run_artifacts
        from rag_eval.settings import EvaluationSettings
        from rag_eval.shared.models import (
            DatasetConfig, EvaluationResult, NormalizedSample, Scenario,
        )
        from rag_eval.shared.utils import utc_now_iso
        from webapp.services.inline_scorer import inline_scorer

        settings = EvaluationSettings()
        judge_model = request.judge_model or settings.ragas_judge_model
        embedding_model = request.embedding_model or settings.ragas_embedding_model
        effective = request.effective_metrics()
        requested = set(request.metrics)
        skipped = sorted(requested - set(effective))

        # Restart the clock so the reported latency covers scoring only.
        t0 = time.monotonic()

        # --- Scoring (can run concurrently for the same session) ----------
        if effective:
            raw_scores = inline_scorer.score(
                question=request.question,
                answer=request.answer,
                contexts=request.contexts_as_list(),
                ground_truth=request.ground_truth,
                metrics=effective,
                judge_model=judge_model,
                embedding_model=embedding_model,
                settings=settings,
            )
        else:
            raw_scores = {}

        latency_ms = int((time.monotonic() - t0) * 1000)
        finished_at = utc_now_iso()

        # Build complete scores for this sample (skipped metrics → None)
        all_scores: dict[str, float | None] = {m: None for m in request.metrics}
        all_scores.update(raw_scores)

        # The composite weighted score is temporarily disabled; re-enabling it
        # requires importing ``compute_weighted_score`` from
        # ``rag_eval.metrics.weights`` again.
        weighted = None

        # --- File I/O must be serialized per session ----------------------
        session_lock = self._get_session_lock(session_id)
        with session_lock:
            run_dir = self._output_dir / run_id
            run_dir.mkdir(parents=True, exist_ok=True)

            # Read all existing rows, then append the new one
            existing_rows = self._read_score_rows(run_dir)
            call_number = len(existing_rows) + 1

            new_row: dict[str, Any] = {
                "sample_id": f"session-score-{call_number}",
                "question": request.question,
                "answer": request.answer or "",
                "contexts": request.contexts or "",
                "ground_truth": request.ground_truth or "",
                "error": "",
            }
            new_row.update(all_scores)

            all_rows = existing_rows + [new_row]

            # Reconstruct NormalizedSample objects for write_run_artifacts metadata
            valid_samples = [
                NormalizedSample(
                    sample_id=str(row.get("sample_id", f"session-score-{i + 1}")),
                    question=str(row.get("question", "")),
                    answer=str(row.get("answer", "")),
                    contexts=[
                        part.strip()
                        for part in str(row.get("contexts", "")).split(" |||| ")
                        if part.strip()
                    ],
                    ground_truth=str(row.get("ground_truth", "")),
                )
                for i, row in enumerate(all_rows)
            ]

            # Determine all metric columns (union of all rows' metric keys)
            all_metric_names = sorted({
                k for row in all_rows
                for k in row if k not in _NON_METRIC_COLUMNS
            })

            scenario = Scenario(
                scenario_name=f"session-{_sanitize_session_id(session_id)}",
                mode="offline",
                dataset=DatasetConfig(path=run_dir / "dataset.csv"),
                judge_model=judge_model,
                embedding_model=embedding_model,
                metrics=all_metric_names,
                output_dir=self._output_dir,
                optimization_advisor=True,
            )

            # NOTE(review): no row written here carries "_started_at", so this
            # falls back to ``finished_at`` unless the CSV gains that column —
            # confirm the intended source of the session start time.
            started_at_val = (
                existing_rows[0].get("_started_at", finished_at)
                if existing_rows else finished_at
            )

            result = EvaluationResult(
                scenario=scenario,
                run_id=run_id,
                started_at=started_at_val if isinstance(started_at_val, str) else finished_at,
                finished_at=finished_at,
                valid_samples=valid_samples,
                invalid_samples=[],
                score_rows=all_rows,
            )

            write_run_artifacts(result)
            logger.info(
                "[session_job] artifacts written job_id=%s session_id=%s call=%d",
                job_id, session_id, call_number,
            )

            # Regenerate optimization advice over all accumulated rows.
            # Best-effort: an advisor failure must not fail the scoring job.
            # NOTE(review): kept under the session lock on the assumption that
            # the advisor writes into the same run directory — confirm.
            try:
                llm, _ = build_models(judge_model, embedding_model, settings)
                run_advisor(result, scenario, llm)
                logger.info("[session_job] advisor done job_id=%s session=%s", job_id, session_id)
            except Exception as adv_exc:  # noqa: BLE001
                logger.warning(
                    "[session_job] advisor failed job_id=%s err=%s", job_id, adv_exc
                )

        self._update_job(
            job_id,
            status="completed",
            finished_at=finished_at,
            run_id=run_id,
            scores=all_scores,
            weighted_score=weighted,
            latency_ms=latency_ms,
            skipped_metrics=skipped,
        )
        self._persist_session_index(session_id)

    except Exception as exc:  # noqa: BLE001
        latency_ms = int((time.monotonic() - t0) * 1000)
        # ``exc_info`` keeps the traceback in the log; the message is unchanged.
        logger.error(
            "[session_job] failed job_id=%s err=%s", job_id, exc, exc_info=True
        )
        self._update_job(
            job_id,
            status="failed",
            finished_at=_now_iso(),
            latency_ms=latency_ms,
            error=f"{type(exc).__name__}: {exc}",
        )
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Helpers
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def _get_session_lock(self, session_id: str) -> threading.Lock:
|
||||||
|
with self._lock:
|
||||||
|
if session_id not in self._session_locks:
|
||||||
|
self._session_locks[session_id] = threading.Lock()
|
||||||
|
return self._session_locks[session_id]
|
||||||
|
|
||||||
|
def _read_score_rows(self, run_dir: Path) -> list[dict[str, Any]]:
|
||||||
|
"""Read existing scores.csv rows, returning empty list if file doesn't exist."""
|
||||||
|
scores_path = run_dir / "scores.csv"
|
||||||
|
if not scores_path.is_file():
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
frame = pd.read_csv(scores_path)
|
||||||
|
return frame.where(pd.notnull(frame), None).to_dict("records")
|
||||||
|
except (OSError, ValueError):
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _read_metric_means(self, run_dir: Path) -> dict[str, float | None]:
    """Compute per-metric means from the session's ``scores.csv``.

    Returns an empty dict when the file is missing or unreadable. Columns
    listed in ``_NON_METRIC_COLUMNS`` and non-numeric columns are ignored.
    A metric whose mean is NaN (no usable values) maps to ``None``; other
    means are rounded to 4 decimal places.
    """
    scores_path = run_dir / "scores.csv"
    if not scores_path.is_file():
        return {}
    try:
        frame = pd.read_csv(scores_path)
    except (OSError, ValueError):
        return {}

    means: dict[str, float | None] = {}
    for col in frame.columns:
        if col in _NON_METRIC_COLUMNS:
            continue
        if not pd.api.types.is_numeric_dtype(frame[col]):
            continue
        val = frame[col].mean(numeric_only=True)
        means[col] = None if pd.isna(val) else round(float(val), 4)
    return means
|
||||||
|
|
||||||
|
def _update_job(self, job_id: str, **kwargs: Any) -> None:
|
||||||
|
with self._lock:
|
||||||
|
existing = self._job_cache.get(job_id)
|
||||||
|
if existing is None:
|
||||||
|
return
|
||||||
|
updated = existing.model_copy(update=kwargs)
|
||||||
|
self._job_cache[job_id] = updated
|
||||||
|
self._persist_job_index(updated)
|
||||||
|
|
||||||
|
def _persist_job_index(self, status: AsyncScoreJobStatus) -> None:
|
||||||
|
"""Persist a single job's status to the index directory."""
|
||||||
|
path = self._index_dir / f"{status.job_id}.json"
|
||||||
|
path.write_text(
|
||||||
|
json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
def _persist_session_index(self, session_id: str) -> None:
    """Persist the session→job_ids mapping under ``<index_dir>/_sessions/``.

    The job id list is copied under ``self._lock``; the file write happens
    after the lock is released. The ``_sessions`` directory is created if
    needed, and the file is written via a temporary sibling plus rename so a
    partially written index is never visible.
    """
    with self._lock:
        job_ids = list(self._session_jobs.get(session_id) or [])
    run_id = self.session_run_id(session_id)
    data = {"session_id": session_id, "run_id": run_id, "job_ids": job_ids}

    path = self._index_dir / "_sessions" / f"{_sanitize_session_id(session_id)}.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    # Thread id in the temp name: concurrent writers for one session must not
    # share a temp file.
    tmp_path = path.with_name(f"{path.name}.{threading.get_ident()}.tmp")
    tmp_path.write_text(
        json.dumps(data, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    tmp_path.replace(path)
|
||||||
|
|
||||||
|
def _load_existing(self) -> None:
|
||||||
|
"""Restore job cache and session mappings from persisted index files on startup."""
|
||||||
|
# Load individual job files
|
||||||
|
for path in sorted(self._index_dir.glob("*.json")):
|
||||||
|
try:
|
||||||
|
data = json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
status = AsyncScoreJobStatus.model_validate(data)
|
||||||
|
self._job_cache[status.job_id] = status
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Load session→job_ids mappings
|
||||||
|
sessions_dir = self._index_dir / "_sessions"
|
||||||
|
if not sessions_dir.is_dir():
|
||||||
|
return
|
||||||
|
for path in sorted(sessions_dir.glob("*.json")):
|
||||||
|
try:
|
||||||
|
data = json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
sid = data.get("session_id", "")
|
||||||
|
job_ids = data.get("job_ids", [])
|
||||||
|
if sid:
|
||||||
|
self._session_jobs[sid] = job_ids
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by FastAPI routes.
session_score_manager = SessionScoreJobManager()
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
/* Siemens RAGAS 评估控制台 — 样式表
|
/* Siemens RAGAS 评估平台 — 样式表
|
||||||
配色取自西门子品牌色(petrol / 深青)与中性灰,呼应企业语境。 */
|
配色取自西门子品牌色(petrol / 深青)与中性灰,呼应企业语境。 */
|
||||||
|
|
||||||
:root {
|
:root {
|
||||||
@@ -199,6 +199,7 @@ code {
|
|||||||
.metric-value.bad { color: var(--bad); }
|
.metric-value.bad { color: var(--bad); }
|
||||||
.metric-value.na { color: var(--slate-light); }
|
.metric-value.na { color: var(--slate-light); }
|
||||||
.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
|
.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
|
||||||
|
.metric-desc { font-size: 12px; color: #64748b; margin-top: 6px; line-height: 1.45; }
|
||||||
|
|
||||||
.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
||||||
.report-half { margin-bottom: 0; }
|
.report-half { margin-bottom: 0; }
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>RAGAS 评估控制台</title>
|
<title>Siemens RAGAS 评估平台</title>
|
||||||
<link rel="stylesheet" href="/static/css/app.css" />
|
<link rel="stylesheet" href="/static/css/app.css" />
|
||||||
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
||||||
</head>
|
</head>
|
||||||
@@ -12,8 +12,8 @@
|
|||||||
<!-- 左侧导航(布局 A) -->
|
<!-- 左侧导航(布局 A) -->
|
||||||
<aside class="sidebar">
|
<aside class="sidebar">
|
||||||
<div class="brand">
|
<div class="brand">
|
||||||
<div class="brand-mark">RAGAS</div>
|
<div class="brand-mark">Siemens RAGAS</div>
|
||||||
<div class="brand-sub">评估控制台</div>
|
<div class="brand-sub">评估平台</div>
|
||||||
</div>
|
</div>
|
||||||
<nav class="nav">
|
<nav class="nav">
|
||||||
<button class="nav-item" data-view="runs">
|
<button class="nav-item" data-view="runs">
|
||||||
@@ -28,6 +28,9 @@
|
|||||||
<button class="nav-item" data-view="profiles">
|
<button class="nav-item" data-view="profiles">
|
||||||
<span class="nav-ico">⚙</span><span>LLM 配置</span>
|
<span class="nav-ico">⚙</span><span>LLM 配置</span>
|
||||||
</button>
|
</button>
|
||||||
|
<button class="nav-item" data-view="scorejobs">
|
||||||
|
<span class="nav-ico">📋</span><span>评分记录</span>
|
||||||
|
</button>
|
||||||
<button class="nav-item" data-view="apidocs">
|
<button class="nav-item" data-view="apidocs">
|
||||||
<span class="nav-ico">⎔</span><span>API 文档</span>
|
<span class="nav-ico">⎔</span><span>API 文档</span>
|
||||||
</button>
|
</button>
|
||||||
@@ -234,6 +237,22 @@
|
|||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
|
<!-- 评分记录视图 -->
|
||||||
|
<section class="view" id="view-scorejobs" hidden>
|
||||||
|
<div class="panel">
|
||||||
|
<div class="panel-head">
|
||||||
|
<h2>评分记录</h2>
|
||||||
|
<span class="muted" style="font-size:13px">来自 Dify 异步评分任务(POST /api/score/async)</span>
|
||||||
|
</div>
|
||||||
|
<p class="muted">评分完成后自动生成完整报告(含指标得分与 LLM 优化建议),点击「查看报告」跳转报告详情页。</p>
|
||||||
|
</div>
|
||||||
|
<div id="scorejobs-list"></div>
|
||||||
|
<div class="empty" id="scorejobs-empty" hidden>
|
||||||
|
<p>暂无评分记录。</p>
|
||||||
|
<p class="muted">在 Dify 工作流中调用 <code>POST /api/score/async</code> 后,记录将在此显示。</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
<!-- API 文档视图 -->
|
<!-- API 文档视图 -->
|
||||||
<section class="view" id="view-apidocs" hidden>
|
<section class="view" id="view-apidocs" hidden>
|
||||||
<iframe
|
<iframe
|
||||||
@@ -248,9 +267,11 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script src="/static/js/api.js"></script>
|
<script src="/static/js/api.js"></script>
|
||||||
|
<script src="/static/js/metric_presenter.js"></script>
|
||||||
<script src="/static/js/report.js"></script>
|
<script src="/static/js/report.js"></script>
|
||||||
<script src="/static/js/profiles.js"></script>
|
<script src="/static/js/profiles.js"></script>
|
||||||
<script src="/static/js/runner.js"></script>
|
<script src="/static/js/runner.js"></script>
|
||||||
|
<script src="/static/js/score_jobs.js"></script>
|
||||||
<script src="/static/js/app.js"></script>
|
<script src="/static/js/app.js"></script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
@@ -66,6 +66,11 @@ const API = {
|
|||||||
},
|
},
|
||||||
applyProfiles(body) { return API.post("/api/llm-profiles/apply", body); },
|
applyProfiles(body) { return API.post("/api/llm-profiles/apply", body); },
|
||||||
|
|
||||||
|
// 异步评分记录 API
|
||||||
|
scoreJobsAsync(body) { return API.post("/api/score/async", body); },
|
||||||
|
getScoreJob(jobId) { return API.get(`/api/score/jobs/${encodeURIComponent(jobId)}`); },
|
||||||
|
listScoreJobs() { return API.get("/api/score/jobs"); },
|
||||||
|
|
||||||
// 测试已保存 profile 的连通性
|
// 测试已保存 profile 的连通性
|
||||||
testProfile(id) {
|
testProfile(id) {
|
||||||
return fetch(`/api/llm-profiles/${encodeURIComponent(id)}/test`, { method: "POST" })
|
return fetch(`/api/llm-profiles/${encodeURIComponent(id)}/test`, { method: "POST" })
|
||||||
|
|||||||
@@ -5,8 +5,8 @@
|
|||||||
const App = {
|
const App = {
|
||||||
currentRunId: null,
|
currentRunId: null,
|
||||||
activeView: null,
|
activeView: null,
|
||||||
views: ["runs", "new", "report", "profiles", "apidocs"],
|
views: ["runs", "new", "report", "profiles", "scorejobs", "apidocs"],
|
||||||
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置", apidocs: "API 文档" },
|
titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置", scorejobs: "评分记录", apidocs: "API 文档" },
|
||||||
|
|
||||||
// 初始化:绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。
|
// 初始化:绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。
|
||||||
init() {
|
init() {
|
||||||
@@ -72,6 +72,7 @@ const App = {
|
|||||||
if (view === "new") Runner.loadScenarios();
|
if (view === "new") Runner.loadScenarios();
|
||||||
if (view === "report") Report.render(App.currentRunId);
|
if (view === "report") Report.render(App.currentRunId);
|
||||||
if (view === "profiles") Profiles.load();
|
if (view === "profiles") Profiles.load();
|
||||||
|
if (view === "scorejobs") ScoreJobs.load();
|
||||||
},
|
},
|
||||||
|
|
||||||
// ----------------------------------------------------------------
|
// ----------------------------------------------------------------
|
||||||
@@ -146,7 +147,7 @@ const App = {
|
|||||||
const chips = (run.metrics || [])
|
const chips = (run.metrics || [])
|
||||||
.map((m) => {
|
.map((m) => {
|
||||||
const val = run.metric_means ? run.metric_means[m] : null;
|
const val = run.metric_means ? run.metric_means[m] : null;
|
||||||
const cls = App.scoreClass(val);
|
const cls = App.scoreClass(m, val);
|
||||||
const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
|
const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
|
||||||
return `<span class="metric-chip" title="${App.escape(m)}">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
|
return `<span class="metric-chip" title="${App.escape(m)}">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
|
||||||
})
|
})
|
||||||
@@ -173,11 +174,8 @@ const App = {
|
|||||||
if (btn) btn.disabled = false;
|
if (btn) btn.disabled = false;
|
||||||
},
|
},
|
||||||
|
|
||||||
scoreClass(value) {
|
scoreClass(metricName, value) {
|
||||||
if (value === null || value === undefined) return "na";
|
return MetricPresenter.scoreClass(metricName, value);
|
||||||
if (value >= 0.8) return "good";
|
|
||||||
if (value >= 0.65) return "warn";
|
|
||||||
return "bad";
|
|
||||||
},
|
},
|
||||||
|
|
||||||
shortMetric(name) {
|
shortMetric(name) {
|
||||||
|
|||||||
77
webapp/static/js/metric_presenter.js
Normal file
77
webapp/static/js/metric_presenter.js
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
// metric_presenter.js — single source of truth for metric semantics
// (higher-is-better vs lower-is-better), colour thresholds and short descriptions.

(function attachMetricPresenter(globalObj) {
  // Per-metric metadata. `direction` decides which end of the 0..1 scale is "good".
  const METRIC_META = Object.freeze({
    faithfulness: {
      direction: "higher_better",
      description: "回答是否被检索内容直接支持，越高越可靠。",
    },
    answer_relevancy: {
      direction: "higher_better",
      description: "回答与问题是否紧密相关，越高越切题。",
    },
    context_recall: {
      direction: "higher_better",
      description: "检索片段覆盖标准答案关键信息的程度，越高越完整。",
    },
    context_precision: {
      direction: "higher_better",
      description: "检索片段中有效信息的占比，越高越精准。",
    },
    noise_sensitivity: {
      direction: "lower_better",
      description: "对噪声上下文的敏感程度，越低说明抗干扰能力越强。",
    },
    factual_correctness: {
      direction: "higher_better",
      description: "回答与标准答案在事实层面的吻合程度，越高越准确。",
    },
    semantic_similarity: {
      direction: "higher_better",
      description: "回答与标准答案在语义上的相似程度，越高越接近。",
    },
  });

  const FALLBACK_DESCRIPTION = "该指标用于衡量当前问答样本的评估表现。";

  /**
   * Look up metadata for a metric, ignoring inherited keys such as
   * "constructor" so arbitrary metric names never resolve to prototype members.
   * @param {string} metricName
   * @returns {{direction: string, description: string} | undefined}
   */
  function metaFor(metricName) {
    return Object.prototype.hasOwnProperty.call(METRIC_META, metricName)
      ? METRIC_META[metricName]
      : undefined;
  }

  /** @returns {boolean} true when a lower score is the better outcome. */
  function isLowerBetter(metricName) {
    return metaFor(metricName)?.direction === "lower_better";
  }

  /**
   * Normalise a raw score to a finite number, or null when there is no score.
   * Blank strings count as "no score": Number("") is 0, which would otherwise
   * be rated "good" for lower-is-better metrics.
   */
  function toScore(value) {
    if (value === null || value === undefined) return null;
    if (typeof value === "string" && value.trim() === "") return null;
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : null;
  }

  /**
   * Map a score to a CSS class.
   * @param {string} metricName
   * @param {*} value - raw score; null/undefined/blank/non-numeric yield "na".
   * @returns {"good" | "warn" | "bad" | "na"}
   */
  function scoreClass(metricName, value) {
    const numeric = toScore(value);
    if (numeric === null) return "na";
    if (isLowerBetter(metricName)) {
      if (numeric <= 0.15) return "good";
      if (numeric <= 0.35) return "warn";
      return "bad";
    }
    if (numeric >= 0.85) return "good";
    if (numeric >= 0.65) return "warn";
    return "bad";
  }

  /**
   * Short human-readable description of a metric.
   * @param {string} metricName
   * @returns {string} the metric's description, or a generic fallback.
   */
  function describeMetric(metricName) {
    return metaFor(metricName)?.description || FALLBACK_DESCRIPTION;
  }

  /**
   * Colour for a histogram bin, from the bin's lower bound. The green→red
   * ramp is mirrored for lower-is-better metrics.
   * @param {string} metricName
   * @param {number} lower - lower bound of the bin.
   * @returns {string} hex colour.
   */
  function binColor(metricName, lower) {
    const numeric = Number(lower);
    if (isLowerBetter(metricName)) {
      if (numeric < 0.2) return "#16a34a";
      if (numeric < 0.4) return "#84cc16";
      if (numeric < 0.6) return "#eab308";
      if (numeric < 0.8) return "#f97316";
      return "#dc2626";
    }
    if (numeric >= 0.8) return "#16a34a";
    if (numeric >= 0.6) return "#84cc16";
    if (numeric >= 0.4) return "#eab308";
    if (numeric >= 0.2) return "#f97316";
    return "#dc2626";
  }

  globalObj.MetricPresenter = {
    scoreClass,
    describeMetric,
    binColor,
  };
  // Fall back to globalThis so the file also loads where `window` is not defined.
})(typeof window !== "undefined" ? window : globalThis);
|
||||||
@@ -117,28 +117,30 @@ const Report = {
|
|||||||
const metrics = report.metrics && report.metrics.length ? report.metrics : summary.metrics;
|
const metrics = report.metrics && report.metrics.length ? report.metrics : summary.metrics;
|
||||||
metrics.forEach((metric) => {
|
metrics.forEach((metric) => {
|
||||||
const value = report.metric_means ? report.metric_means[metric] : null;
|
const value = report.metric_means ? report.metric_means[metric] : null;
|
||||||
const cls = App.scoreClass(value);
|
const cls = App.scoreClass(metric, value);
|
||||||
const text = value === null || value === undefined ? "n/a" : value.toFixed(2);
|
const text = value === null || value === undefined ? "n/a" : value.toFixed(2);
|
||||||
|
const description = MetricPresenter.describeMetric(metric);
|
||||||
const card = document.createElement("div");
|
const card = document.createElement("div");
|
||||||
card.className = "metric-card";
|
card.className = "metric-card";
|
||||||
card.innerHTML = `
|
card.innerHTML = `
|
||||||
<div class="metric-value ${cls}">${text}</div>
|
<div class="metric-value ${cls}">${text}</div>
|
||||||
<div class="metric-name">${App.escape(metric)}</div>
|
<div class="metric-name">${App.escape(metric)}</div>
|
||||||
|
<div class="metric-desc">${App.escape(description)}</div>
|
||||||
`;
|
`;
|
||||||
wrap.appendChild(card);
|
wrap.appendChild(card);
|
||||||
});
|
});
|
||||||
|
|
||||||
// 综合加权得分卡片
|
// 综合加权得分卡片(已暂时隐藏)
|
||||||
const wsValue = (report && report.weighted_score_mean !== undefined) ? report.weighted_score_mean : null;
|
// const wsValue = (report && report.weighted_score_mean !== undefined) ? report.weighted_score_mean : null;
|
||||||
const wsCard = document.createElement("div");
|
// const wsCard = document.createElement("div");
|
||||||
wsCard.className = "metric-card weighted-score-card";
|
// wsCard.className = "metric-card weighted-score-card";
|
||||||
const wsCls = App.scoreClass(wsValue);
|
// const wsCls = App.scoreClass(wsValue);
|
||||||
const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2);
|
// const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2);
|
||||||
wsCard.innerHTML = `
|
// wsCard.innerHTML = `
|
||||||
<div class="metric-value ${wsCls}">${wsText}</div>
|
// <div class="metric-value ${wsCls}">${wsText}</div>
|
||||||
<div class="metric-name">综合加权得分</div>
|
// <div class="metric-name">综合加权得分</div>
|
||||||
`;
|
// `;
|
||||||
wrap.appendChild(wsCard);
|
// wrap.appendChild(wsCard);
|
||||||
},
|
},
|
||||||
|
|
||||||
// ② 分数分布直方图(可切换指标)。
|
// ② 分数分布直方图(可切换指标)。
|
||||||
@@ -168,17 +170,13 @@ const Report = {
|
|||||||
const bins = distributions[metric] || [];
|
const bins = distributions[metric] || [];
|
||||||
const labels = bins.map((b) => b.label);
|
const labels = bins.map((b) => b.label);
|
||||||
const counts = bins.map((b) => b.count);
|
const counts = bins.map((b) => b.count);
|
||||||
const colors = bins.map((b) => Report._binColor(b.lower));
|
const colors = bins.map((b) => Report._binColor(metric, b.lower));
|
||||||
Report._drawDistChart(labels, counts, colors);
|
Report._drawDistChart(labels, counts, colors);
|
||||||
},
|
},
|
||||||
|
|
||||||
// 低分箱偏红、高分箱偏绿,直观暴露长尾。
|
// 低分箱偏红、高分箱偏绿,直观暴露长尾。
|
||||||
_binColor(lower) {
|
_binColor(metric, lower) {
|
||||||
if (lower >= 0.8) return "#16a34a";
|
return MetricPresenter.binColor(metric, lower);
|
||||||
if (lower >= 0.6) return "#84cc16";
|
|
||||||
if (lower >= 0.4) return "#eab308";
|
|
||||||
if (lower >= 0.2) return "#f97316";
|
|
||||||
return "#dc2626";
|
|
||||||
},
|
},
|
||||||
|
|
||||||
// 实际绘制 Chart.js 柱状图。
|
// 实际绘制 Chart.js 柱状图。
|
||||||
@@ -247,7 +245,7 @@ const Report = {
|
|||||||
body += `<tr><td>${App.escape(stat.key)}</td><td>${stat.count}</td>`;
|
body += `<tr><td>${App.escape(stat.key)}</td><td>${stat.count}</td>`;
|
||||||
metrics.forEach((m) => {
|
metrics.forEach((m) => {
|
||||||
const v = stat.means ? stat.means[m] : null;
|
const v = stat.means ? stat.means[m] : null;
|
||||||
const cls = App.scoreClass(v);
|
const cls = App.scoreClass(m, v);
|
||||||
const text = v === null || v === undefined ? "—" : v.toFixed(2);
|
const text = v === null || v === undefined ? "—" : v.toFixed(2);
|
||||||
body += `<td class="${cls}">${text}</td>`;
|
body += `<td class="${cls}">${text}</td>`;
|
||||||
});
|
});
|
||||||
@@ -271,7 +269,7 @@ const Report = {
|
|||||||
const scoreBadges = metrics
|
const scoreBadges = metrics
|
||||||
.map((m) => {
|
.map((m) => {
|
||||||
const v = sample.metrics ? sample.metrics[m] : null;
|
const v = sample.metrics ? sample.metrics[m] : null;
|
||||||
const cls = App.scoreClass(v);
|
const cls = App.scoreClass(m, v);
|
||||||
const text = v === null || v === undefined ? "—" : v.toFixed(2);
|
const text = v === null || v === undefined ? "—" : v.toFixed(2);
|
||||||
return `<span class="score-badge ${cls}" title="${App.escape(m)}">${text}</span>`;
|
return `<span class="score-badge ${cls}" title="${App.escape(m)}">${text}</span>`;
|
||||||
})
|
})
|
||||||
|
|||||||
126
webapp/static/js/score_jobs.js
Normal file
126
webapp/static/js/score_jobs.js
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
// score_jobs.js — "score records" page: list of asynchronous RAGAS scoring jobs.
// Each completed job has standard run artifacts written for it, so the
// "查看报告" button reuses the existing report detail view.

const ScoreJobs = {
  _pollTimers: {}, // job_id -> setInterval handle
  _maxPollFailures: 3, // consecutive poll errors tolerated before giving up

  /**
   * Fetch all score jobs, render one card per job and start polling the jobs
   * that are still queued or running.
   */
  async load() {
    const list = document.getElementById("scorejobs-list");
    const empty = document.getElementById("scorejobs-empty");
    // Hide the empty-state up front so it never shows next to the loading or
    // error message.
    empty.hidden = true;
    list.innerHTML = '<p class="muted">加载中…</p>';
    try {
      const data = await API.listScoreJobs();
      const jobs = data.jobs || [];
      list.innerHTML = "";
      if (jobs.length === 0) {
        empty.hidden = false;
        return;
      }
      for (const job of jobs) {
        list.appendChild(ScoreJobs.renderCard(job));
        // Auto-poll any pending jobs
        if (job.status === "queued" || job.status === "running") {
          ScoreJobs._startPoll(job.job_id);
        }
      }
    } catch (err) {
      list.innerHTML = `<p class="muted">加载失败：${App.escape(err.message)}</p>`;
    }
  },

  /**
   * Build the DOM card for one job.
   * @param {object} job - job status object as returned by the score-jobs API.
   * @returns {HTMLDivElement}
   */
  renderCard(job) {
    const card = document.createElement("div");
    card.className = "run-card";
    card.id = `score-job-${job.job_id}`;
    card.innerHTML = ScoreJobs._cardHtml(job);
    // Bind report button if already completed
    ScoreJobs._bindReportBtn(card, job);
    return card;
  },

  /**
   * Render the inner HTML of a job card.
   * @param {object} job
   * @returns {string} HTML markup; job-supplied text is escaped via App.escape.
   */
  _cardHtml(job) {
    const time = App.shortTime(job.created_at);
    const question = App.escape(String(job.request_summary?.question || "—").slice(0, 60));
    const metrics = (job.request_summary?.metrics || []).join(", ");
    const status = App.escape(String(job.status ?? ""));

    const statusBadge = `<span class="badge ${status}">${status}</span>`;

    let scoreHtml = "";
    if (job.status === "completed") {
      // NOTE: the composite weighted-score chip is intentionally hidden for now.
      scoreHtml = Object.entries(job.scores || {})
        .map(([k, v]) => {
          const cls = App.scoreClass(k, v);
          const text = v === null || v === undefined ? "n/a" : Number(v).toFixed(3);
          return `<span class="metric-chip" title="${App.escape(k)}">${App.escape(App.shortMetric(k))} <b class="${cls}">${text}</b></span>`;
        })
        .join(" ");
    } else if (job.status === "failed") {
      scoreHtml = `<span style="color:var(--bad);font-size:12px">${App.escape((job.error || "").slice(0, 80))}</span>`;
    } else {
      scoreHtml = `<span class="muted">评分中，请稍候…</span>`;
    }

    const reportBtn = job.status === "completed" && job.run_id
      ? `<button class="btn btn-sm btn-primary score-job-report-btn" data-run-id="${App.escape(job.run_id)}">查看报告</button>`
      : "";

    // Queued/running jobs have no latency yet; omit it rather than print "nullms".
    const metaParts = [`指标：${App.escape(metrics)}`, time];
    if (typeof job.latency_ms === "number" && Number.isFinite(job.latency_ms)) {
      metaParts.push(`${job.latency_ms}ms`);
    }

    return `
      <div class="run-card-head">
        <div class="run-card-title">${question}</div>
        <div style="display:flex;gap:8px;align-items:center">${statusBadge}${reportBtn}</div>
      </div>
      <div class="run-card-meta">
        <div>${metaParts.join(" · ")}</div>
      </div>
      <div class="run-card-metrics">${scoreHtml}</div>
    `;
  },

  /**
   * Wire the "view report" button of a card (if present) to the report view.
   * `job` is accepted for call-site symmetry; the run id is read from the
   * button's data attribute.
   */
  _bindReportBtn(card, job) {
    const btn = card.querySelector(".score-job-report-btn");
    if (!btn) return;
    btn.addEventListener("click", () => {
      const runId = btn.dataset.runId;
      if (runId) {
        App.enableReportNav();
        App.navigate("report", runId);
      }
    });
  },

  /**
   * Poll one job every 5 seconds until it completes or fails, refreshing its
   * card on each response. A slow request is never overlapped by the next
   * tick, and polling stops only after `_maxPollFailures` consecutive errors.
   * @param {string} jobId
   */
  _startPoll(jobId) {
    if (ScoreJobs._pollTimers[jobId]) return;

    let inFlight = false;
    let failures = 0;

    const stop = () => {
      clearInterval(timer);
      // Only drop the registry entry if it is still ours (stopAllPolls or a
      // later _startPoll may have replaced it).
      if (ScoreJobs._pollTimers[jobId] === timer) {
        delete ScoreJobs._pollTimers[jobId];
      }
    };

    const timer = setInterval(async () => {
      if (inFlight) return;
      inFlight = true;
      try {
        const job = await API.getScoreJob(jobId);
        failures = 0;
        const card = document.getElementById(`score-job-${jobId}`);
        if (card) {
          card.innerHTML = ScoreJobs._cardHtml(job);
          ScoreJobs._bindReportBtn(card, job);
        }
        if (job.status === "completed" || job.status === "failed") {
          stop();
          // If completed, pre-enable report nav
          if (job.status === "completed" && job.run_id) {
            App.enableReportNav();
          }
        }
      } catch (_err) {
        failures += 1;
        if (failures >= ScoreJobs._maxPollFailures) stop();
      } finally {
        inFlight = false;
      }
    }, 5000);

    ScoreJobs._pollTimers[jobId] = timer;
  },

  /** Cancel every active poll timer. */
  stopAllPolls() {
    Object.values(ScoreJobs._pollTimers).forEach((t) => clearInterval(t));
    ScoreJobs._pollTimers = {};
  },
};
|
||||||
17
webserver.log
Normal file
17
webserver.log
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
INFO: Started server process [82284]
|
||||||
|
INFO: Waiting for application startup.
|
||||||
|
INFO: Application startup complete.
|
||||||
|
INFO: Uvicorn running on http://127.0.0.1:8811 (Press CTRL+C to quit)
|
||||||
|
INFO: 127.0.0.1:56164 - "GET /api/health HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:53350 - "GET / HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:53351 - "GET /api/runs HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:53352 - "GET /api/scenarios HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:64689 - "GET /api/runs/2026-06-15T08-30-00%2B00-00 HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:64700 - "POST /api/evaluations HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:64703 - "GET /api/evaluations/a3243f2443d7 HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:58440 - "GET /api/evaluations/a3243f2443d7 HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:64454 - "GET /static/css/app.css HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:64455 - "GET /static/js/api.js HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:56825 - "GET /static/js/app.js HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:56829 - "GET /static/js/report.js HTTP/1.1" 200 OK
|
||||||
|
INFO: 127.0.0.1:56830 - "GET /static/js/runner.js HTTP/1.1" 200 OK
|
||||||
Reference in New Issue
Block a user