update

fix(llm): resolve score runtime config from saved profiles
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-27 14:31:45 +08:00 · 2026-06-26 20:34:01 +08:00 · 2026-06-26 16:09:33 +08:00 · 2026-06-25 11:35:49 +08:00 · 2026-06-24 17:24:22 +08:00 · 2026-06-24 17:08:01 +08:00
124 changed files with 19857 additions and 80 deletions
--- a/.env.example
+++ b/.env.example
@@ -1,11 +1,26 @@
 # ===== LLM 连接配置（RAGAS 评测 + 生成） =====
 # 所有模型共用同一个 OpenAI 兼容 endpoint
 # 在 Web 控制台的「LLM 配置」页面可以保存多个命名配置，
 # 并在运行评估时按角色（Judge / Answer / Dataset）分别选择覆盖。
 OPENAI_API_KEY=your-api-key
 OPENAI_BASE_URL=http://6.86.80.4:30080/v1
-RAGAS_JUDGE_MODEL=deepseek-v4-flash
+OPENAI_TIMEOUT_SECONDS=180
-RAGAS_EMBEDDING_MODEL=text-embedding-v3
+
 # 默认评测模型（可在场景 YAML 或 Web 控制台 LLM 配置中覆盖）
 # RAGAS_JUDGE_MODEL 需支持 OpenAI 兼容 chat.completions + 结构化 JSON 输出
 # RAGAS_LLM_MAX_TOKENS 控制 Judge 评分链路的 completion budget；faithfulness 等
 # 结构化指标在 GPT-5 系列上通常需要 4096 或更高，避免 IncompleteOutputException
 RAGAS_JUDGE_MODEL=gpt-5
 RAGAS_EMBEDDING_MODEL=text-embedding-3-small
 RAGAS_LLM_MAX_TOKENS=4096
 # 评估并发控制（启用 7 个指标时建议 RAGAS_METRIC_TIMEOUT_SECONDS=300）
 BATCH_SIZE=8
 RAGAS_METRIC_TIMEOUT_SECONDS=300
-# ===== 阿里云文档解析 =====
+# ===== 阿里云文档解析（dataset build 功能需要） =====
 ALIBABA_ACCESS_KEY_ID=
 ALIBABA_ACCESS_KEY_SECRET=
 ALIBABA_ENDPOINT=docmind-api.cn-hangzhou.aliyuncs.com
@@ -14,6 +29,13 @@ ALIYUN_PARSE_TIMEOUT_SECONDS=900
 ALIYUN_PARSE_LAYOUT_STEP_SIZE=50
 ALIYUN_LLM_ENHANCEMENT=true
 ALIYUN_ENHANCEMENT_MODE=VLM
-DOCUMENT_PARSE_ARTIFACT_PREFIX=artifacts
+DOCUMENT_PARSE_ARTIFACT_PREFIX=outputs/dataset-builds
 PARSER_FAILURE_MODE=fail
 # 生成题库时使用的模型（可在 Web 控制台 LLM 配置中按场景覆盖）
 DATASET_GENERATOR_MODEL=qwen3.6-plus
 # ===== Dify 集成 — 实时评分 API =====
 # 为 /api/score 端点设置 Bearer Token 鉴权（留空则不鉴权，适合内网部署）
 # Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
 SCORE_API_TOKEN=
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,26 @@
 # 默认：文本文件使用 LF（Linux/macOS 风格）
 * text=auto eol=lf
 # Shell 脚本强制 LF，无论在哪个平台 checkout
 *.sh text eol=lf
 # Python 和 YAML 也用 LF
 *.py text eol=lf
 *.yaml text eol=lf
 *.yml text eol=lf
 *.md text eol=lf
 *.json text eol=lf
 *.toml text eol=lf
 *.txt text eol=lf
 *.env text eol=lf
 *.env.example text eol=lf
 # Windows 脚本保留 CRLF
 *.ps1 text eol=crlf
 *.bat text eol=crlf
 # 二进制文件不转换
 *.pdf binary
 *.png binary
 *.jpg binary
 *.csv binary
--- a/.gitignore
+++ b/.gitignore
@@ -17,5 +17,7 @@ wheels/
 # outputs
 outputs/
-# datasets
+# datasets — raw/normalized data files (large, not committed)
 # Note: rag_eval/datasets/ is source code and IS committed (see negation below)
 datasets/
 !rag_eval/datasets/
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
 # Default ignored files
 /shelf/
 /workspace.xml
 # Editor-based HTTP Client requests
 /httpRequests/
 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="KubernetesApiProvider"><![CDATA[{}]]></component>
  <component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="17" project-jdk-type="JavaSDK">
    <output url="file://$PROJECT_DIR$/out" />
  </component>
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/siemens_ragas.iml" filepath="$PROJECT_DIR$/.idea/siemens_ragas.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/siemens_ragas.iml
+++ b/.idea/siemens_ragas.iml
@@ -0,0 +1,9 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
 </project>
--- a/.superpowers/brainstorm/1625-1781595805/content/analysis-approach.html
+++ b/.superpowers/brainstorm/1625-1781595805/content/analysis-approach.html
@@ -0,0 +1,60 @@
 <h2>优化建议怎么生成？</h2>
 <p class="subtitle">这决定了模块的核心机制与可维护性</p>
 <div class="options">
  <div class="option" data-choice="a" onclick="toggleSelect(this)">
    <div class="letter">A</div>
    <div class="content">
      <h3>纯规则引擎</h3>
      <p>每个指标设阈值（如 faithfulness &lt; 0.6），触发时给出预设建议文本。</p>
      <div class="pros-cons">
        <div class="pros"><h4>优点</h4><ul>
          <li>零 LLM 调用，零额外成本</li>
          <li>结果可预测、可审计</li>
          <li>响应极快</li>
        </ul></div>
        <div class="cons"><h4>缺点</h4><ul>
          <li>建议固定，无法结合具体样本</li>
          <li>不能解释"为什么这批数据这个指标低"</li>
        </ul></div>
      </div>
    </div>
  </div>
  <div class="option" data-choice="b" onclick="toggleSelect(this)">
    <div class="letter">B</div>
    <div class="content">
      <h3>LLM 分析（全自动）</h3>
      <p>把评测结果（各指标均值 + 低分样本）一起交给 LLM，生成上下文感知的中文分析报告。</p>
      <div class="pros-cons">
        <div class="pros"><h4>优点</h4><ul>
          <li>能结合具体低分样本给出针对性建议</li>
          <li>可用中文解释西门子场景下的问题</li>
          <li>建议质量高、内容丰富</li>
        </ul></div>
        <div class="cons"><h4>缺点</h4><ul>
          <li>每次评测多 1 次 LLM 调用</li>
          <li>依赖 judge_model 的质量</li>
        </ul></div>
      </div>
    </div>
  </div>
  <div class="option" data-choice="c" onclick="toggleSelect(this)">
    <div class="letter">C</div>
    <div class="content">
      <h3>规则定位 + LLM 解读（推荐）</h3>
      <p>规则引擎先识别哪些指标异常、触发哪条优化方向；再把"规则诊断 + 低分样本"一起给 LLM 做二次解读，生成中文建议。</p>
      <div class="pros-cons">
        <div class="pros"><h4>优点</h4><ul>
          <li>规则保证诊断稳定，不依赖 LLM 自由发挥</li>
          <li>LLM 在有结构的输入下输出更准确</li>
          <li>两层可独立测试</li>
        </ul></div>
        <div class="cons"><h4>缺点</h4><ul>
          <li>实现略复杂（两个子模块）</li>
        </ul></div>
      </div>
    </div>
  </div>
 </div>
--- a/.superpowers/brainstorm/1625-1781595805/content/approaches.html
+++ b/.superpowers/brainstorm/1625-1781595805/content/approaches.html
@@ -0,0 +1,77 @@
 <h2>优化顾问模块 — 实现方案对比</h2>
 <p class="subtitle">三个方案的核心区别在于 LLM 调用边界和代码侵入程度</p>
 <div class="options">
  <div class="option" data-choice="a" onclick="toggleSelect(this)">
    <div class="letter">A</div>
    <div class="content">
      <h3>独立后处理器（轻量集成）</h3>
      <p>新增 <code>rag_eval/advisor/</code> 包，<code>run_scenario()</code> 末尾调用一行 <code>maybe_run_advisor(result, scenario)</code>。</p>
      <p><strong>文件结构：</strong></p>
      <ul>
        <li><code>rag_eval/advisor/__init__.py</code></li>
        <li><code>rag_eval/advisor/rules.py</code> — 规则引擎，输入 score_rows，输出诊断列表</li>
        <li><code>rag_eval/advisor/llm_analyzer.py</code> — 把规则诊断 + 低分样本交给 judge_model</li>
        <li><code>rag_eval/advisor/writer.py</code> — 写 optimization_advice.md，打日志摘要</li>
      </ul>
      <div class="pros-cons">
        <div class="pros"><h4>优点</h4><ul>
          <li>改动最小，runner.py 只加 3 行</li>
          <li>advisor 完全独立，可单独测试</li>
          <li>与现有分层架构完全吻合</li>
        </ul></div>
        <div class="cons"><h4>缺点</h4><ul>
          <li>无法拿到 per-metric 的原始 NaN 率（需从 score_rows 重新算）</li>
        </ul></div>
      </div>
    </div>
  </div>
  <div class="option" data-choice="b" onclick="toggleSelect(this)">
    <div class="letter">B</div>
    <div class="content">
      <h3>嵌入 reporting 层（复用写出基础设施）</h3>
      <p>把 advisor 作为 <code>rag_eval/reporting/</code> 的一部分，<code>write_run_artifacts()</code> 内部判断是否写 advice。</p>
      <p><strong>文件结构：</strong></p>
      <ul>
        <li><code>rag_eval/reporting/advisor.py</code> — 规则 + LLM + 写出三合一</li>
        <li><code>write_run_artifacts()</code> 里追加 <code>if scenario.optimization_advisor: write_advice(...)</code></li>
      </ul>
      <div class="pros-cons">
        <div class="pros"><h4>优点</h4><ul>
          <li>artifacts 路径管理统一，advice 自然进 run 目录</li>
          <li>文件更少</li>
        </ul></div>
        <div class="cons"><h4>缺点</h4><ul>
          <li>reporting 层本是"无副作用写文件"，混入 LLM 调用破坏这一约定</li>
          <li>advisor 逻辑和写出逻辑耦合，难以单独测试规则引擎</li>
        </ul></div>
      </div>
    </div>
  </div>
  <div class="option" data-choice="c" onclick="toggleSelect(this)">
    <div class="letter">C</div>
    <div class="content">
      <h3>方案 A 变体：advisor 复用已有 llm 实例，阈值可独立配置（推荐）</h3>
      <p>与方案 A 相同的文件结构，但 LLM 调用使用 <strong>scenario 已有的 judge_model</strong>，不新增任何模型配置——advisor 复用 <code>build_models()</code> 已构建好的 llm 实例。</p>
      <ul>
        <li><code>rag_eval/advisor/rules.py</code> — 纯函数，7 条指标诊断规则</li>
        <li><code>rag_eval/advisor/llm_analyzer.py</code> — 接收已有 llm 实例，不重新建 client</li>
        <li><code>rag_eval/advisor/writer.py</code> — 写 md + 日志</li>
        <li><code>rag_eval/advisor/__init__.py</code> — 暴露 <code>run_advisor()</code></li>
      </ul>
      <div class="pros-cons">
        <div class="pros"><h4>优点</h4><ul>
          <li>不重复创建 LLM client（节省资源）</li>
          <li>advisor 阈值可通过 YAML 的 optimization_advisor 块扩展配置</li>
          <li>独立包边界清晰，易于单测</li>
          <li>runner.py 改动最小</li>
        </ul></div>
        <div class="cons"><h4>缺点</h4><ul>
          <li>需把 llm 实例从 runner 传入 advisor（多传一个参数）</li>
        </ul></div>
      </div>
    </div>
  </div>
 </div>
--- a/.superpowers/brainstorm/1625-1781595805/content/design-architecture.html
+++ b/.superpowers/brainstorm/1625-1781595805/content/design-architecture.html
@@ -0,0 +1,53 @@
 <h2>优化顾问模块 — 整体架构与数据流</h2>
 <p class="subtitle">新增 rag_eval/advisor/ 包，插入 run_scenario() 末尾，复用已有 llm 实例</p>
 <div class="mockup">
  <div class="mockup-header">执行链路（变更前 → 变更后）</div>
  <div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:2">
    <span style="color:#94a3b8">run_scenario()</span><br>
    &nbsp;&nbsp;→ load_scenario()&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#94a3b8"># 读 YAML，解析 Scenario + optimization_advisor 字段</span><br>
    &nbsp;&nbsp;→ build_models()&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#94a3b8"># 已有：创建 llm, embeddings</span><br>
    &nbsp;&nbsp;→ build_metric_pipeline()&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#94a3b8"># 已有</span><br>
    &nbsp;&nbsp;→ Evaluator.evaluate()&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#94a3b8"># 已有：打分 → EvaluationResult</span><br>
    &nbsp;&nbsp;→ write_run_artifacts()&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#94a3b8"># 已有：scores.csv / summary.md / ...</span><br>
    &nbsp;&nbsp;<span style="color:#4ade80;font-weight:bold">→ run_advisor(result, scenario, llm)&nbsp;&nbsp;&nbsp;# 新增 3 行</span><br>
    &nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#4ade80">&nbsp;&nbsp;→ rules.diagnose(score_rows)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;# 规则引擎：识别异常指标 + 方向</span><br>
    &nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#4ade80">&nbsp;&nbsp;→ llm_analyzer.analyze(diag, samples)&nbsp;# LLM：结合低分样本生成中文建议</span><br>
    &nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#4ade80">&nbsp;&nbsp;→ writer.write(advice, paths)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;# 写 optimization_advice.md + 日志</span>
  </div>
 </div>
 <div class="section">
  <h3>新增文件一览</h3>
  <div class="mockup">
    <div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:1.9">
      rag_eval/advisor/<br>
      &nbsp;&nbsp;__init__.py&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#94a3b8">← 暴露 run_advisor()，是外部唯一入口</span><br>
      &nbsp;&nbsp;rules.py&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#94a3b8">← 纯函数，无 LLM，可单独单测</span><br>
      &nbsp;&nbsp;llm_analyzer.py <span style="color:#94a3b8">← 接收 llm 实例 + 诊断结构 → 中文 Markdown</span><br>
      &nbsp;&nbsp;writer.py&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color:#94a3b8">← 写 optimization_advice.md，打日志摘要</span><br>
      <br>
      rag_eval/shared/models.py&nbsp;&nbsp;&nbsp;<span style="color:#fbbf24">← 修改：Scenario 加 optimization_advisor 字段</span><br>
      rag_eval/config/schema.py&nbsp;&nbsp;&nbsp;<span style="color:#fbbf24">← 修改：ScenarioModel 加字段</span><br>
      rag_eval/execution/runner.py&nbsp;<span style="color:#fbbf24">← 修改：末尾加 3 行调用</span><br>
      rag_eval/reporting/artifacts.py <span style="color:#fbbf24">← 修改：RunArtifactPaths 加 advice_md 路径</span>
    </div>
  </div>
 </div>
 <div class="section">
  <h3>输出产物</h3>
  <div class="mockup">
    <div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:1.9">
      outputs/online/siemens-pdf-question-bank/&lt;run_id&gt;/<br>
      &nbsp;&nbsp;scenario.snapshot.yaml<br>
      &nbsp;&nbsp;scores.csv<br>
      &nbsp;&nbsp;invalid.csv<br>
      &nbsp;&nbsp;summary.md<br>
      &nbsp;&nbsp;metadata.json<br>
      &nbsp;&nbsp;<span style="color:#4ade80;font-weight:bold">optimization_advice.md&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;← 新增</span>
    </div>
  </div>
 </div>
 <p style="margin-top:1rem;color:#94a3b8;font-size:13px">整体看起来 OK 吗？这是新模块与现有链路的接入方式。</p>
--- a/.superpowers/brainstorm/1625-1781595805/content/trigger-mode.html
+++ b/.superpowers/brainstorm/1625-1781595805/content/trigger-mode.html
@@ -0,0 +1,68 @@
 <h2>优化顾问在什么情况下运行？</h2>
 <p class="subtitle">这决定了模块与现有评测流程的集成方式</p>
 <div class="options">
  <div class="option" data-choice="a" onclick="toggleSelect(this)">
    <div class="letter">A</div>
    <div class="content">
      <h3>每次评测自动运行</h3>
      <p>run_scenario() 结束后自动调用，无需任何额外配置。</p>
      <div class="pros-cons">
        <div class="pros"><h4>优点</h4><ul>
          <li>零感知，开箱即用</li>
          <li>每次跑完都有建议报告</li>
        </ul></div>
        <div class="cons"><h4>缺点</h4><ul>
          <li>每次都多一次 LLM 调用，不管是否需要</li>
          <li>无法关闭</li>
        </ul></div>
      </div>
    </div>
  </div>
  <div class="option" data-choice="b" onclick="toggleSelect(this)">
    <div class="letter">B</div>
    <div class="content">
      <h3>YAML 场景中显式开启（推荐）</h3>
      <p>在 scenario YAML 里加一行 <code>optimization_advisor: true</code>，默认关闭。</p>
      <div class="mockup">
        <div class="mockup-header">siemens-pdf-question-bank-online.yaml</div>
        <div class="mockup-body" style="font-family:monospace;font-size:13px;line-height:1.8">
          metrics:<br>
          &nbsp;&nbsp;- faithfulness<br>
          &nbsp;&nbsp;- noise_sensitivity<br>
          &nbsp;&nbsp;...<br>
          <span style="color:#4ade80;font-weight:bold">optimization_advisor: true  # 新增</span>
        </div>
      </div>
      <div class="pros-cons">
        <div class="pros"><h4>优点</h4><ul>
          <li>显式可见，按需开启</li>
          <li>与现有 YAML 驱动风格一致</li>
          <li>可为不同场景独立配置</li>
        </ul></div>
        <div class="cons"><h4>缺点</h4><ul>
          <li>需要手动在 YAML 里加一行</li>
        </ul></div>
      </div>
    </div>
  </div>
  <div class="option" data-choice="c" onclick="toggleSelect(this)">
    <div class="letter">C</div>
    <div class="content">
      <h3>阈值触发（任一指标低于警戒线时自动激活）</h3>
      <p>规则引擎先算，若发现有指标低于阈值则自动启动 LLM 分析；一切正常则跳过。</p>
      <div class="pros-cons">
        <div class="pros"><h4>优点</h4><ul>
          <li>"有问题才报警"，符合直觉</li>
          <li>高分场景无额外成本</li>
        </ul></div>
        <div class="cons"><h4>缺点</h4><ul>
          <li>阈值需要维护，不同场景可能不同</li>
          <li>正常分数时无建议，但用户可能仍想看优化空间</li>
        </ul></div>
      </div>
    </div>
  </div>
 </div>
--- a/.superpowers/brainstorm/1625-1781595805/content/waiting-2.html
+++ b/.superpowers/brainstorm/1625-1781595805/content/waiting-2.html
@@ -0,0 +1,3 @@
 <div style="display:flex;align-items:center;justify-content:center;min-height:60vh">
  <p class="subtitle">Writing spec &amp; moving to implementation...</p>
 </div>
--- a/.superpowers/brainstorm/1625-1781595805/content/waiting.html
+++ b/.superpowers/brainstorm/1625-1781595805/content/waiting.html
@@ -0,0 +1,3 @@
 <div style="display:flex;align-items:center;justify-content:center;min-height:60vh">
  <p class="subtitle">Continuing in terminal — 正在设计方案...</p>
 </div>
--- a/.superpowers/brainstorm/1625-1781595805/state/server-stopped
+++ b/.superpowers/brainstorm/1625-1781595805/state/server-stopped
@@ -0,0 +1 @@
 {"reason":"idle timeout","timestamp":1781598635371}
--- a/.superpowers/brainstorm/1625-1781595805/state/server.pid
+++ b/.superpowers/brainstorm/1625-1781595805/state/server.pid
@@ -0,0 +1 @@
 1625
--- a/apps/siemens_pdf_qa/init.py
+++ b/apps/siemens_pdf_qa/init.py
@@ -0,0 +1,6 @@
 """Siemens PDF question bank adapter for online evaluation.
 Mirrors the generic pdf_question_bank adapter, but with a Siemens-specific system
 prompt that instructs the model to answer in the same language as the question
 (Chinese for Chinese CT documentation) and to cite only the provided evidence.
 """
--- a/apps/siemens_pdf_qa/adapter.py
+++ b/apps/siemens_pdf_qa/adapter.py
@@ -0,0 +1,170 @@
 """Online evaluation adapter for the Siemens medical-imaging PDF question bank.
 Functionally identical to apps/pdf_question_bank/adapter.py but uses a
 Siemens-specific system prompt that:
  - Instructs the model to answer in the same language as the question
    (important for Chinese CT documentation).
  - Emphasises citation of source chunks and refusal when evidence is absent.
  - Adds domain context (medical imaging / CT terminology).
 The adapter contract is the same as all other adapters:
  run(question, **kwargs) -> {"answer": str, "contexts": [str], "raw_response": {}}
 """
 from __future__ import annotations
 import json
 from pathlib import Path
 from typing import Any
 from openai import OpenAI
 from rag_eval.settings import EvaluationSettings
 from rag_eval.shared.utils import parse_contexts
 # ── chunk cache (module-level, lives for the process lifetime) ────────────────
 _CHUNK_CACHE: dict[Path, dict[str, dict[str, Any]]] = {}
 def _resolve_source_chunks_path(source_chunks_path: str) -> Path:
    """Resolve the chunk artifact path; fall back to the latest timestamped run."""
    resolved = Path(source_chunks_path).resolve()
    if resolved.exists():
        return resolved
    if resolved.parent.name != "latest":
        raise FileNotFoundError(resolved)
    artifact_root = resolved.parent.parent
    if not artifact_root.exists():
        raise FileNotFoundError(resolved)
    candidates = sorted(
        [d for d in artifact_root.iterdir() if d.is_dir() and d.name != "latest"],
        key=lambda p: p.name,
        reverse=True,
    )
    for run_dir in candidates:
        candidate = run_dir / resolved.name
        if candidate.exists():
            return candidate
    raise FileNotFoundError(resolved)
 def _load_source_chunks(source_chunks_path: str) -> dict[str, dict[str, Any]]:
    """Load the JSONL chunk artifact into a ``chunk_id -> row`` mapping.

    The path is first resolved through ``_resolve_source_chunks_path``; the
    parsed mapping is memoised in the module-level ``_CHUNK_CACHE`` keyed by
    that resolved path, so each artifact is read at most once per process.
    Blank lines are skipped. When a chunk_id occurs more than once the last
    row wins.

    Raises:
        FileNotFoundError: propagated from path resolution.
        ValueError: when a row is not a JSON object or has no ``chunk_id``
            (``json.JSONDecodeError`` for unparsable rows is a subclass).
    """
    resolved = _resolve_source_chunks_path(source_chunks_path)
    cached = _CHUNK_CACHE.get(resolved)
    if cached is not None:
        return cached
    lookup: dict[str, dict[str, Any]] = {}
    with resolved.open(encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, 1):
            text = line.strip()
            if not text:
                continue
            payload = json.loads(text)
            # A row such as `[]` or `"x"` is valid JSON but has no .get();
            # report it with its row number instead of an AttributeError.
            if not isinstance(payload, dict):
                raise ValueError(
                    f"source_chunks.jsonl row {lineno} is not a JSON object: {resolved}"
                )
            chunk_id = str(payload.get("chunk_id", "")).strip()
            if not chunk_id:
                raise ValueError(f"source_chunks.jsonl row {lineno} missing chunk_id: {resolved}")
            lookup[chunk_id] = payload
    # Cache only after the whole file parsed, so a failed load is never memoised.
    _CHUNK_CACHE[resolved] = lookup
    return lookup
 def _resolve_chunk_ids(raw: Any) -> list[str]:
    """Turn the raw ``source_chunk_ids`` value into a list of usable ids.

    The value is handed to ``parse_contexts`` and falsy entries are dropped.

    Raises:
        ValueError: when nothing remains after filtering.
    """
    chunk_ids = list(filter(None, parse_contexts(raw)))
    if chunk_ids:
        return chunk_ids
    raise ValueError("source_chunk_ids is required for siemens_pdf_qa adapter.")
 def _build_messages(
    question: str,
    contexts: list[str],
    metadata: dict[str, Any],
 ) -> list[dict[str, str]]:
    """Build a Siemens-domain grounded prompt for the answer model."""
    evidence_lines = [f"[chunk {i}] {ctx}" for i, ctx in enumerate(contexts, 1)]
    meta_lines = [
        f"doc_name: {metadata.get('doc_name', '')}",
        f"section_path: {metadata.get('section_path', '')}",
        f"page_range: {metadata.get('page_start', '')}–{metadata.get('page_end', '')}",
    ]
    # Siemens-specific system prompt: bilingual awareness, medical domain, strict grounding
    system_prompt = (
        "你是西门子医疗影像知识库的问答助手（Siemens Healthineers CT Knowledge Base QA）。"
        "请严格根据下方【证据片段】回答问题，不得使用片段之外的任何知识。"
        "若证据不足以回答，请明确说明「根据现有资料无法回答」。"
        "请用与问题相同的语言（中文或英文）作答，简洁准确，必要时引用片段编号。"
    )
    user_prompt = "\n".join([
        "【问题】",
        question,
        "",
        "【文档元信息】",
        *meta_lines,
        "",
        "【证据片段】",
        *evidence_lines,
        "",
        "请基于以上证据片段作答。",
    ])
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
 def run(
    question: str,
    *,
    source_chunks_path: str,
    model: str | None = None,
    client: OpenAI | None = None,
    **kwargs: Any,
 ) -> dict[str, Any]:
    """Answer one question from its cited chunks via an OpenAI-compatible model.

    Args:
        question: The question text placed in the user prompt.
        source_chunks_path: Path to the JSONL chunk artifact.
        model: Model name; falls back to ``EvaluationSettings().ragas_judge_model``.
        client: Pre-built client; when omitted one is created from
            ``settings.openai_client_kwargs``.
        **kwargs: Remaining row columns. ``source_chunk_ids`` is required;
            ``doc_name``, ``section_path``, ``page_start``, ``page_end`` and
            ``doc_id`` are read when present.

    Returns:
        ``{"answer": str, "contexts": [str], "raw_response": {...}}``.

    Raises:
        ValueError: when chunk ids are missing or unknown, the resolved
            chunks carry no text, or no model name is available.
        FileNotFoundError: when the chunk artifact cannot be located.
    """
    chunk_ids = _resolve_chunk_ids(kwargs.get("source_chunk_ids"))
    chunk_lookup = _load_source_chunks(source_chunks_path)
    missing = [cid for cid in chunk_ids if cid not in chunk_lookup]
    if missing:
        raise ValueError("source_chunk_ids not found in artifact: " + ", ".join(missing))
    # Normalise each chunk text once, then keep only the non-empty ones.
    stripped_texts = (str(chunk_lookup[cid].get("text", "")).strip() for cid in chunk_ids)
    contexts = [text for text in stripped_texts if text]
    if not contexts:
        raise ValueError("resolved source chunks contain no usable text.")
    settings = EvaluationSettings()
    # `or ""` keeps the ValueError below reachable when neither the argument
    # nor the settings supply a model (previously .strip() hit None).
    # NOTE(review): assumes ragas_judge_model may be unset/None — confirm in settings.
    target_model = (model or settings.ragas_judge_model or "").strip()
    if not target_model:
        raise ValueError("A model name is required for siemens_pdf_qa adapter.")
    llm_client = client or OpenAI(**settings.openai_client_kwargs)
    completion = llm_client.chat.completions.create(
        model=target_model,
        # All extra row columns are passed as metadata; only the document
        # fields are actually read by _build_messages.
        messages=_build_messages(question, contexts, kwargs),
        temperature=0,
    )
    answer = str(completion.choices[0].message.content or "").strip()
    return {
        "answer": answer,
        "contexts": contexts,
        "raw_response": {
            "resolved_chunk_ids": chunk_ids,
            "doc_id": kwargs.get("doc_id", ""),
            "doc_name": kwargs.get("doc_name", ""),
            "model": target_model,
            "response_text": answer,
        },
    }
--- a/configs/llm_profiles.json
+++ b/configs/llm_profiles.json
@@ -0,0 +1,64 @@
 {
  "profiles": [
    {
      "profile_id": "c8e185a64fa0",
      "name": "glm-5",
      "model": "glm-5",
      "base_url": "http://6.86.80.4:30080/v1",
      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
      "timeout_seconds": 600,
      "created_at": "2026-06-16T09:16:22.438297+00:00",
      "updated_at": "2026-06-16T09:19:03.089865+00:00"
    },
    {
      "profile_id": "54ddfe5aeb46",
      "name": "deepseek-v4-pro",
      "model": "deepseek-v4-pro",
      "base_url": "http://6.86.80.4:30080/v1",
      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
      "timeout_seconds": 600,
      "created_at": "2026-06-16T09:17:08.473904+00:00",
      "updated_at": "2026-06-16T09:19:07.504082+00:00"
    },
    {
      "profile_id": "25d035eef194",
      "name": "qwen3.5-flash",
      "model": "qwen3.5-flash",
      "base_url": "http://6.86.80.4:30080/v1",
      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
      "timeout_seconds": 600,
      "created_at": "2026-06-16T09:18:24.265619+00:00",
      "updated_at": "2026-06-16T09:18:24.265619+00:00"
    },
    {
      "profile_id": "ff1d0f417a5d",
      "name": "deepseek-v4-flash",
      "model": "deepseek-v4-flash",
      "base_url": "http://6.86.80.4:30080/v1",
      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
      "timeout_seconds": 600,
      "created_at": "2026-06-16T09:18:57.091549+00:00",
      "updated_at": "2026-06-16T09:18:57.091549+00:00"
    },
    {
      "profile_id": "5b04c49df9df",
      "name": "text-embedding-v4",
      "model": "text-embedding-v4",
      "base_url": "http://6.86.80.4:30080/v1",
      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
      "timeout_seconds": 600,
      "created_at": "2026-06-16T09:19:49.104004+00:00",
      "updated_at": "2026-06-16T09:19:49.104004+00:00"
    },
    {
      "profile_id": "b4f7c82859d5",
      "name": "text-embedding-v3",
      "model": "text-embedding-v3",
      "base_url": "http://6.86.80.4:30080/v1",
      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
      "timeout_seconds": 600,
      "created_at": "2026-06-16T09:20:18.266540+00:00",
      "updated_at": "2026-06-16T09:20:18.266540+00:00"
    }
  ]
 }
--- a/deploy.sh
+++ b/deploy.sh
@@ -0,0 +1,173 @@
 #!/usr/bin/env bash
 # deploy.sh — one-shot deployment script for Siemens RAGAS (Linux)
 # Usage: bash deploy.sh
 # Steps: check environment -> install dependencies -> initialise config -> start background service
 set -o errexit -o nounset -o pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 # Every relative path below is resolved against the script's own directory.
 cd "$SCRIPT_DIR"
 # ── Coloured output ───────────────────────────────────────────────
 # Colour codes stay empty unless stdout is a terminal.
 GREEN=''
 YELLOW=''
 RED=''
 CYAN=''
 NC=''
 if [ -t 1 ]; then
    GREEN='\033[0;32m'
    YELLOW='\033[1;33m'
    RED='\033[0;31m'
    CYAN='\033[0;36m'
    NC='\033[0m'
 fi
 # Logging helpers: each prints a tagged line; err writes to stderr.
 ok() {
    echo -e "${GREEN}[OK]${NC}    $*"
 }
 warn() {
    echo -e "${YELLOW}[WARN]${NC}  $*"
 }
 err() {
    echo -e "${RED}[ERROR]${NC} $*" >&2
 }
 info() {
    echo -e "${CYAN}[INFO]${NC}  $*"
 }
 echo ""
 echo -e "${CYAN}============================================================${NC}"
 echo -e "${CYAN}  Siemens RAGAS Console  —  Linux 一键部署${NC}"
 echo -e "${CYAN}============================================================${NC}"
 echo ""
 # ── Stage 1: Python version check ─────────────────────────────────
 info "阶段 1/7：检查 Python 版本..."
 PYTHON_BIN=""
 # Take the first candidate on PATH that reports major >= 3 and minor >= 12.
 for candidate in python3.12 python3.13 python3.14 python3; do
    # Candidates that are not installed are skipped outright.
    command -v "$candidate" &>/dev/null || continue
    version=$("$candidate" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || true)
    major=$(cut -d. -f1 <<<"$version")
    minor=$(cut -d. -f2 <<<"$version")
    if [ "${major:-0}" -ge 3 ] && [ "${minor:-0}" -ge 12 ]; then
        PYTHON_BIN="$candidate"
        ok "Python $version ($candidate)"
        break
    fi
 done
 # Abort with install hints when no suitable interpreter was found.
 if [ -z "$PYTHON_BIN" ]; then
    err "未找到 Python 3.12+。请安装后重试。"
    err "  Ubuntu/Debian: sudo apt install python3.12 python3.12-venv"
    err "  CentOS/RHEL:   sudo dnf install python3.12"
    exit 1
 fi
 # ── Stage 2: virtual environment ──────────────────────────────────
 info "阶段 2/7：准备虚拟环境..."
 # Create .venv unless both the directory and its interpreter already exist.
 if [ ! -d ".venv" ] || [ ! -f ".venv/bin/python" ]; then
    info "创建 .venv..."
    "$PYTHON_BIN" -m venv .venv
    ok ".venv 创建完成"
 else
    ok ".venv 已存在，跳过创建"
 fi
 # All later stages use the venv's own pip and interpreter.
 PIP=".venv/bin/pip"
 PYTHON=".venv/bin/python"
 # ── Stage 3: install dependencies ─────────────────────────────────
 info "阶段 3/7：安装项目依赖（可能需要几分钟）..."
 # Upgrade pip first, then install the project in editable mode.
 "$PIP" install --quiet --upgrade pip
 ok "pip 已升级"
 "$PIP" install --quiet -e .
 ok "项目依赖安装完成（pyproject.toml）"
 # The web-console packages are installed explicitly as a separate step.
 "$PIP" install --quiet fastapi uvicorn httpx
 ok "Web 服务依赖安装完成（fastapi / uvicorn / httpx）"
 # ── Stage 4: configuration file ───────────────────────────────────
 info "阶段 4/7：初始化配置文件..."
 # An existing .env is never overwritten; otherwise seed it from the template.
 if [ -f ".env" ]; then
    ok ".env 已存在，跳过"
 else
    cp .env.example .env
    warn ".env 已从 .env.example 复制，请编辑填写实际的 API Key 等配置后再启动："
    warn "  nano .env   或   vim .env"
    warn "  关键字段：OPENAI_API_KEY, OPENAI_BASE_URL, ALIBABA_ACCESS_KEY_ID, ALIBABA_ACCESS_KEY_SECRET"
 fi
 # ── Stage 5: directory layout ─────────────────────────────────────
 info "阶段 5/7：初始化目录结构..."
 mkdir -p configs logs outputs datasets
 ok "目录就绪：configs/ logs/ outputs/ datasets/"
 # Make the companion scripts executable; ones that do not exist are skipped.
 for script in start.sh stop.sh run_eval.sh; do
    if [ -f "$script" ]; then
        chmod +x "$script"
    fi
 done
 ok "辅助脚本已设置执行权限"
 # ── Stage 6: demo data ────────────────────────────────────────────
 info "阶段 6/7：初始化演示数据..."
 DEMO_DIR="outputs/kba-knowledge-base-offline-baseline"
 # Seed only when the demo run directory is absent; a seeding failure is
 # reported as a warning and does not stop the deployment.
 if [ ! -d "$DEMO_DIR" ]; then
    info "生成演示数据（scripts/seed_sample_run.py）..."
    if ! "$PYTHON" scripts/seed_sample_run.py; then
        warn "演示数据生成失败，控制台报告页将为空（服务仍可正常启动）"
    else
        ok "演示数据生成完成"
    fi
 else
    ok "演示数据已存在，跳过"
 fi
 # ── Stage 7: start the service ────────────────────────────────────
 info "阶段 7/7：启动 Web 服务..."
 # Warn when .env still carries the template placeholder.
 if grep -q "your-api-key" .env 2>/dev/null; then
    warn ".env 中仍包含默认占位符，部分功能（评估执行）将不可用"
    warn "请编辑 .env 后重新运行 start.sh"
 fi
 # Stop a previous instance BEFORE probing ports. Otherwise a redeploy finds
 # its own old server on 8800, hops to 8801, and only then kills the old one.
 if [ -f ".server.pid" ]; then
    OLD_PID=$(cat .server.pid)
    if [ -n "$OLD_PID" ] && kill -0 "$OLD_PID" 2>/dev/null; then
        warn "检测到已有服务进程 (PID=$OLD_PID)，停止旧进程..."
        kill "$OLD_PID" 2>/dev/null || true
        # Give the old process up to 10 seconds to exit and release its port.
        for _ in 1 2 3 4 5 6 7 8 9 10; do
            kill -0 "$OLD_PID" 2>/dev/null || break
            sleep 1
        done
    fi
    rm -f .server.pid
 fi
 # Return 0 when something listens on TCP port $1. The listener list is
 # captured first and grep reads it from a here-string: with pipefail,
 # `ss | grep -q` can fail spuriously when grep exits early (SIGPIPE).
 port_in_use() {
    local listeners
    listeners=$(ss -tlnp 2>/dev/null || true; netstat -tlnp 2>/dev/null || true)
    grep -q ":$1 " <<<"$listeners"
 }
 # Port selection: prefer 8800, fall back to 8801, otherwise give up.
 PORT=8800
 if port_in_use "$PORT"; then
    warn "端口 $PORT 已被占用，尝试 8801..."
    PORT=8801
    if port_in_use "$PORT"; then
        err "端口 8800 和 8801 均被占用。请手动运行："
        err "  .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
        exit 1
    fi
 fi
 # Launch in the background; output is appended to logs/server.log.
 nohup "$PYTHON" webmain.py --host 0.0.0.0 --port "$PORT" >> logs/server.log 2>&1 &
 SERVER_PID=$!
 echo "$SERVER_PID" > .server.pid
 # Wait 3 seconds, then verify the process is still alive.
 sleep 3
 if kill -0 "$SERVER_PID" 2>/dev/null; then
    ok "服务已启动 (PID=$SERVER_PID)"
    # `hostname -I` is not available everywhere; fall back to loopback so the
    # printed URL is never malformed.
    HOST_IP=$(hostname -I 2>/dev/null | awk '{print $1}' || true)
    echo ""
    echo -e "${CYAN}============================================================${NC}"
    echo -e "${GREEN}  部署成功！${NC}"
    echo -e "${GREEN}  访问地址: http://${HOST_IP:-127.0.0.1}:${PORT}${NC}"
    echo -e "${GREEN}  本机访问: http://127.0.0.1:${PORT}${NC}"
    echo -e "${CYAN}  服务日志: tail -f logs/server.log${NC}"
    echo -e "${CYAN}  停止服务: bash stop.sh${NC}"
    echo -e "${CYAN}============================================================${NC}"
    echo ""
 else
    err "服务启动失败，请查看日志："
    err "  tail -20 logs/server.log"
    rm -f .server.pid
    exit 1
 fi
--- a/docs/rag-eval-architecture.md
+++ b/docs/rag-eval-architecture.md
@@ -318,6 +318,10 @@ metrics:
  - answer_relevancy
  - context_recall
  - context_precision
  # 可选：鲁棒性 / 端到端指标（需数据集含 ground_truth），完整列表见 §9.4
  # - noise_sensitivity
  # - factual_correctness
  # - semantic_similarity
 output_dir: runs/legal-assistant-offline-baseline
 runtime:
  batch_size: 4
@@ -338,7 +342,7 @@ runtime:
 - `embedding_model`
  - 负责向量相关指标的模型
 - `metrics`
-  - 本次启用的指标列表
+  - 本次启用的指标列表（完整可选项与依赖见 §9.4）
 - `output_dir`
  - 本次运行结果输出目录
 - `runtime.batch_size`
@@ -399,6 +403,32 @@ app_adapter:
 - embedding model
 - 指标实例
 当前支持的指标（`rag_eval/metrics/registry.py` 中的 `SUPPORTED_METRICS`）：
 | 指标名 | 层面 | 依赖 |
 |---|---|---|
 | `faithfulness` | 生成 | judge model |
 | `answer_relevancy` | 生成 | judge model + embedding |
 | `context_recall` | 检索 | judge model + ground_truth |
 | `context_precision` | 检索 | judge model + ground_truth |
 | `noise_sensitivity` | 鲁棒性 | judge model + ground_truth |
 | `factual_correctness` | 端到端 | judge model + ground_truth |
 | `semantic_similarity` | 端到端 | embedding + ground_truth（无 LLM 调用） |
 后五项（`context_recall` 至 `semantic_similarity`）以 `ground_truth`（标准答案）为参照，数据集必须提供该字段。新增指标统一在 `registry.py` / `factory.py` / `pipeline.py` 三处对齐装配。
 **Optimization Advisor（§11 优化策略落地）：**
 评测结束后，若场景配置 `optimization_advisor: true`，则自动调用 `rag_eval/advisor/` 模块：
 - 规则引擎（`rules.py`）对 7 个指标各自设阈值，识别触发项并选取 top-3 低分样本
 - LLM 分析器（`llm_analyzer.py`）结合低分样本生成中文 Markdown 优化建议（复用 judge_model，失败自动降级为纯规则报告）
 - 写出层（`writer.py`）输出 `optimization_advice.md` 并打日志摘要
 ```yaml
 # 场景配置示例
 optimization_advisor: true
 ```
 ### 9.5 并发控制
 执行层负责并发上限，不把并发策略散落到各指标实现中。
--- a/docs/rag-eval-engine-flow.md
+++ b/docs/rag-eval-engine-flow.md
@@ -316,11 +316,21 @@ adapter 层的目标是：**把不同类型的目标应用，统一成同一套
 当前支持的指标包括：
 核心检索 / 生成指标（始终可用）：
 - `faithfulness`
 - `answer_relevancy`
 - `context_recall`
 - `context_precision`
 鲁棒性 / 端到端指标（架构设计 §10.2，需数据集含 `ground_truth`）：
 - `noise_sensitivity` —— 鲁棒性：对检索噪声的敏感度
 - `factual_correctness` —— 端到端：回答相对标准答案的事实正确性
 - `semantic_similarity` —— 端到端：回答与标准答案的语义相似度（基于 embedding，无 LLM 调用）
 所有指标都通过同一套装配点接入：`registry.py`（校验白名单）、`factory.py`（实例化）、`pipeline.py`（`ascore` 入参分发），新增指标只需在这三处对齐即可。
 所以 metric pipeline 的职责可以总结为：
 **把标准样本转换成结构化评分结果。**
@@ -414,3 +424,39 @@ main.py
 - 可以把每次实验的资产稳定留住
 这也是它和一次性离线脚本的根本区别。
 ---
 ## 15. Optimization Advisor 链路
 相关代码：
 - `rag_eval/advisor/__init__.py` — 外部入口 `run_advisor()`
 - `rag_eval/advisor/rules.py` — 规则引擎（纯函数，无 LLM），7 条指标诊断规则
 - `rag_eval/advisor/llm_analyzer.py` — LLM 分析器（复用 judge_model llm 实例，失败自动降级）
 - `rag_eval/advisor/writer.py` — 写出 `optimization_advice.md` + 日志摘要
 Advisor 在 `write_run_artifacts()` 之后触发，仅当场景配置 `optimization_advisor: true` 时生效，默认关闭。
 执行链路：
 ```text
 run_advisor(result, scenario, llm)
  -> rules.diagnose(score_rows, metrics)     # 识别异常指标，选取 top-3 低分样本
  -> llm_analyzer.analyze(diagnoses, llm)   # LLM 生成中文建议（失败自动降级为纯规则报告）
  -> writer.write_advice(...)               # 写 optimization_advice.md + 日志摘要
 ```
 输出产物追加在现有 run 目录：
 ```text
 outputs/online/siemens-pdf-question-bank/<run_id>/
  scenario.snapshot.yaml
  scores.csv
  invalid.csv
  summary.md
  metadata.json
  optimization_advice.md    <- 新增（optimization_advisor: true 时生成）
 ```
 规则引擎对 7 个指标各自设 warning / critical 双档阈值，`noise_sensitivity` 为"越低越好"（方向相反）。所有诊断均附带 top-3 低分样本，喂给 LLM 生成针对具体内容的中文建议。
--- a/docs/superpowers/plans/2026-06-16-llm-profile-manager.md
+++ b/docs/superpowers/plans/2026-06-16-llm-profile-manager.md
--- a/docs/superpowers/plans/2026-06-16-optimization-advisor.md
+++ b/docs/superpowers/plans/2026-06-16-optimization-advisor.md
--- a/docs/superpowers/plans/2026-06-18-metric-doc-weights.md
+++ b/docs/superpowers/plans/2026-06-18-metric-doc-weights.md
--- a/docs/superpowers/plans/2026-06-22-dify-score-api.md
+++ b/docs/superpowers/plans/2026-06-22-dify-score-api.md
@@ -0,0 +1,974 @@
 # Dify 实时评分 API Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** 新增 `POST /api/score` 端点，供 Dify 外部 Tool 调用，接受单条问答记录并同步返回 RAGAS 各指标得分。
 **Architecture:** 新增 `inline_scorer.py` 服务层封装 RAGAS 打分逻辑，以 `(judge_model, embedding_model)` 为 key 缓存 LLM 客户端；新增 `webapp/api/score.py` 路由；`ScoreRequest`/`ScoreResponse` 放入 `webapp/models.py`；`SCORE_API_TOKEN` 加入 `EvaluationSettings`。
 **Tech Stack:** Python 3.12, FastAPI, Pydantic v2, RAGAS 0.4.3, pytest
 ## Global Constraints
 - Python 3.12+，PEP 8，4 空格缩进，类型注解必须
 - contexts 用 `context_separator`（默认 `" |||| "`）拆分为 list[str]
 - ground_truth 为可选；缺失时跳过 context_recall / factual_correctness / semantic_similarity / noise_sensitivity
 - SCORE_API_TOKEN 为空时不鉴权（内网部署场景）
 - 所有测试用 pytest，不依赖真实 LLM
 ---
 ## 文件清单
 | 操作 | 文件 | 职责 |
 |------|------|------|
 | 新建 | `webapp/services/inline_scorer.py` | LLM 客户端缓存 + 单题打分 |
 | 新建 | `webapp/api/score.py` | `/api/score` 路由 |
 | 新建 | `tests/webapp/test_score_api.py` | 端点测试（全 mock） |
 | 修改 | `webapp/models.py` | 新增 ScoreRequest / ScoreResponse |
 | 修改 | `rag_eval/settings.py` | 新增 score_api_token 字段 |
 | 修改 | `webapp/server.py` | 注册 score router，更新 OPENAPI_TAGS 和 description |
 ---
 ## Task 1: ScoreRequest / ScoreResponse 模型 + settings 字段
 **Files:**
 - Modify: `webapp/models.py`
 - Modify: `rag_eval/settings.py`
 - Test: `tests/webapp/test_score_api.py` (partial — model validation tests)
 **Interfaces:**
 - Produces:
  - `ScoreRequest` Pydantic model（见下方字段）
  - `ScoreResponse` Pydantic model
  - `EvaluationSettings.score_api_token: str | None`
 - [ ] **Step 1: Write failing model-validation tests**
 Create `tests/webapp/test_score_api.py`:
 ```python
 """Tests for POST /api/score endpoint."""
 from __future__ import annotations
 import math
 import pytest
 from pydantic import ValidationError
 from webapp.models import ScoreRequest, ScoreResponse
 class TestScoreRequest:
    def test_minimal_valid_request(self):
        """Only required fields — question, answer, contexts."""
        req = ScoreRequest(
            question="What is CT?",
            answer="CT is imaging.",
            contexts="CT uses X-rays.",
        )
        assert req.question == "What is CT?"
        assert req.contexts == "CT uses X-rays."
        assert req.ground_truth is None
        assert req.context_separator == " |||| "
        assert req.metrics == ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]
    def test_contexts_split_by_separator(self):
        """contexts_as_list() splits on context_separator."""
        req = ScoreRequest(
            question="q", answer="a",
            contexts="ctx1 |||| ctx2 |||| ctx3",
            context_separator=" |||| ",
        )
        assert req.contexts_as_list() == ["ctx1", "ctx2", "ctx3"]
    def test_contexts_split_custom_separator(self):
        req = ScoreRequest(
            question="q", answer="a",
            contexts="a---b---c",
            context_separator="---",
        )
        assert req.contexts_as_list() == ["a", "b", "c"]
    def test_contexts_split_single_item(self):
        req = ScoreRequest(question="q", answer="a", contexts="only one")
        assert req.contexts_as_list() == ["only one"]
    def test_missing_question_raises(self):
        with pytest.raises(ValidationError):
            ScoreRequest(answer="a", contexts="c")  # type: ignore[call-arg]
    def test_missing_answer_raises(self):
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", contexts="c")  # type: ignore[call-arg]
    def test_missing_contexts_raises(self):
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", answer="a")  # type: ignore[call-arg]
    def test_custom_metrics_accepted(self):
        req = ScoreRequest(
            question="q", answer="a", contexts="c",
            metrics=["faithfulness"],
        )
        assert req.metrics == ["faithfulness"]
    def test_invalid_metric_name_raises(self):
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", answer="a", contexts="c", metrics=["not_a_metric"])
    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
        """Without ground_truth, GT-dependent metrics are excluded."""
        req = ScoreRequest(
            question="q", answer="a", contexts="c",
            metrics=["faithfulness", "context_recall", "factual_correctness", "semantic_similarity", "noise_sensitivity"],
        )
        effective = req.effective_metrics()
        assert "faithfulness" in effective
        assert "context_recall" not in effective
        assert "factual_correctness" not in effective
        assert "semantic_similarity" not in effective
        assert "noise_sensitivity" not in effective
    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
        req = ScoreRequest(
            question="q", answer="a", contexts="c", ground_truth="gt",
            metrics=["faithfulness", "context_recall", "factual_correctness"],
        )
        effective = req.effective_metrics()
        assert effective == ["faithfulness", "context_recall", "factual_correctness"]
 class TestScoreResponse:
    def test_score_response_structure(self):
        resp = ScoreResponse(
            scores={"faithfulness": 0.85, "answer_relevancy": None},
            weighted_score=0.85,
            latency_ms=1200,
        )
        assert resp.scores["faithfulness"] == 0.85
        assert resp.scores["answer_relevancy"] is None
        assert resp.latency_ms == 1200
 ```
 - [ ] **Step 2: Run to verify FAIL**
 ```
 cd C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas
 python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
 ```
 Expected: `ImportError: cannot import name 'ScoreRequest' from 'webapp.models'`
 - [ ] **Step 3: Add ScoreRequest and ScoreResponse to `webapp/models.py`**
 Append to the end of `webapp/models.py` (after `PipelineJobResponse`):
 ```python
 # ---------------------------------------------------------------------------
 # Models for the Dify real-time scoring API
 # ---------------------------------------------------------------------------
 # Metrics that can only be computed when a ground_truth reference is supplied.
 # NOTE(review): docs/rag-eval-architecture.md also lists context_precision as
 # depending on ground_truth, yet it is not part of this set — confirm whether
 # it should be skipped as well when ground_truth is missing.
 _GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
    "context_recall",
    "factual_correctness",
    "semantic_similarity",
    "noise_sensitivity",
 })
 # Every metric name accepted by the request validator.
 _VALID_METRICS: frozenset[str] = frozenset({
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
    "noise_sensitivity",
    "factual_correctness",
    "semantic_similarity",
 })
 # Metrics scored when the caller does not pass an explicit `metrics` list.
 _DEFAULT_SCORE_METRICS: list[str] = [
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
 ]
 class ScoreRequest(BaseModel):
    """Request body for the real-time single-sample scoring endpoint.

    ``contexts`` is received as one string and split on ``context_separator``
    by :meth:`contexts_as_list`. ``ground_truth`` is optional: when it is
    missing or blank, :meth:`effective_metrics` drops every metric listed in
    ``_GT_DEPENDENT_METRICS``.
    """

    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "summary": "基础评分请求",
                    "value": {
                        "question": "双源CT的时间分辨率是多少?",
                        "answer": "双源CT的单扇区时间分辨率为75ms。",
                        "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
                        "ground_truth": "双源CT单扇区时间分辨率为75ms，需旋转135度。",
                        "context_separator": " |||| ",
                        "metrics": ["faithfulness", "answer_relevancy", "context_recall", "context_precision"],
                        "judge_model": "deepseek-v4-flash",
                        "embedding_model": "text-embedding-v3",
                    },
                }
            ]
        }
    )

    question: str = Field(description="问题文本。")
    answer: str = Field(description="待评分的回答。")
    contexts: str = Field(
        description="检索上下文字符串，多段之间用 context_separator 拼接。"
    )
    ground_truth: str | None = Field(
        default=None,
        description="标准参考答案（可选）。缺失时自动跳过需要它的指标。",
    )
    context_separator: str = Field(
        default=" |||| ",
        description="contexts 字段中段落分隔符，默认为四个竖线两侧各一空格。",
    )
    metrics: list[str] = Field(
        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
        description="需要计算的 RAGAS 指标列表。",
    )
    judge_model: str | None = Field(
        default=None,
        description="Judge LLM 模型名称；为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
    )
    embedding_model: str | None = Field(
        default=None,
        description="Embedding 模型名称；为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
    )

    @field_validator("metrics")
    @classmethod
    def validate_metric_names(cls, value: list[str]) -> list[str]:
        """Reject an empty list and any name outside ``_VALID_METRICS``.

        Raises:
            ValueError: if ``value`` is empty or contains an unsupported name
                (pydantic surfaces this as a validation error).
        """
        if not value:
            raise ValueError("metrics 不能为空列表。")
        invalid = [m for m in value if m not in _VALID_METRICS]
        if invalid:
            raise ValueError(
                f"不支持的指标名称：{invalid}。"
                f"合法值：{sorted(_VALID_METRICS)}"
            )
        return value

    def contexts_as_list(self) -> list[str]:
        """Split ``contexts`` on the separator, trimming and dropping blanks."""
        # An empty separator would make str.split raise; fall back to the default.
        sep = self.context_separator or " |||| "
        return [s.strip() for s in self.contexts.split(sep) if s.strip()]

    def effective_metrics(self) -> list[str]:
        """Return the requested metrics that can actually be computed.

        A ``ground_truth`` that is ``None``, empty, or whitespace-only counts
        as absent, so ground-truth-dependent metrics are never scored against
        an empty reference (callers may send ``""`` for an unset optional
        field).
        """
        if self.ground_truth is not None and self.ground_truth.strip():
            return list(self.metrics)
        return [m for m in self.metrics if m not in _GT_DEPENDENT_METRICS]
 class ScoreResponse(BaseModel):
    """Response payload for the real-time scoring endpoint.

    Carries the per-metric scores, an optional aggregate score, the
    server-side latency, the metrics skipped for lack of ground truth, and an
    optional error message.
    """
    # Per-metric result; a value of None means the metric has no usable score.
    scores: dict[str, float | None] = Field(
        description="各指标得分（NaN 或计算失败时为 null）。"
    )
    # Aggregate over the non-null scores; None when nothing could be aggregated.
    weighted_score: float | None = Field(
        default=None,
        description="等权加权综合得分（仅对非 null 指标求均值）。",
    )
    # Scoring time measured on the server, in milliseconds.
    latency_ms: int = Field(description="服务端打分耗时（毫秒）。")
    # Requested metrics that were not run because ground_truth was absent.
    skipped_metrics: list[str] = Field(
        default_factory=list,
        description="因缺少 ground_truth 而跳过的指标名称列表。",
    )
    # Populated instead of raising when scoring fails; None on success.
    error: str | None = Field(
        default=None,
        description="打分异常时的错误信息（HTTP 200 仍返回，scores 为空）。",
    )
 ```
 Also add `field_validator` to the import line at the top of `webapp/models.py`:
 ```python
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 ```
 - [ ] **Step 4: Add `score_api_token` to `rag_eval/settings.py`**
 Add after the `dataset_generator_model` field:
 ```python
 score_api_token: str | None = Field(
    default=None,
    alias="SCORE_API_TOKEN",
    description="Bearer token for /api/score endpoint. Empty = no auth.",
 )
 ```
 - [ ] **Step 5: Run to verify PASS**
 ```
 python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
 ```
 Expected: all 12 tests PASS.
 - [ ] **Step 6: Commit**
 ```
 git add webapp/models.py rag_eval/settings.py tests/webapp/test_score_api.py
 git commit -m "feat: add ScoreRequest/ScoreResponse models and SCORE_API_TOKEN setting"
 ```
 ---
 ## Task 2: InlineScorer 服务（LLM 缓存 + 打分）
 **Files:**
 - Create: `webapp/services/inline_scorer.py`
 **Interfaces:**
 - Consumes:
  - `build_models(judge_model, embedding_model, settings) -> tuple[Any, Any]` from `rag_eval.metrics.factory`
  - `MetricPipeline(metrics, metric_timeout_seconds)` from `rag_eval.metrics.pipeline`
  - `NormalizedSample` from `rag_eval.shared.models`
  - `compute_weighted_score(scores, metric_weights) -> float | None` from `rag_eval.metrics.weights`
  - `EvaluationSettings` from `rag_eval.settings`
 - Produces:
  - `inline_scorer: InlineScorer` (module-level singleton)
  - `InlineScorer.score(question, answer, contexts, ground_truth, metrics, judge_model, embedding_model, settings) -> dict[str, float | None]`
 - [ ] **Step 1: Write failing test**
 Add to `tests/webapp/test_score_api.py`:
 ```python
 class TestInlineScorer:
    def test_score_returns_dict_with_requested_metrics(self):
        """InlineScorer.score returns a dict keyed by the requested metrics."""
        from unittest.mock import AsyncMock, MagicMock, patch
        from webapp.services.inline_scorer import InlineScorer
        from rag_eval.settings import EvaluationSettings
        mock_score = MagicMock()
        mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8}
        mock_score.error = ""
        mock_pipeline = MagicMock()
        mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
        with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
            with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
                with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
                    scorer = InlineScorer()
                    result = scorer.score(
                        question="q", answer="a",
                        contexts=["ctx1"],
                        ground_truth=None,
                        metrics=["faithfulness", "answer_relevancy"],
                        judge_model="test-model",
                        embedding_model="test-embed",
                        settings=EvaluationSettings(_env_file=None),
                    )
        assert "faithfulness" in result
        assert "answer_relevancy" in result
        assert result["faithfulness"] == pytest.approx(0.9)
    def test_score_converts_nan_to_none(self):
        """NaN scores are converted to None in the returned dict."""
        import math
        from unittest.mock import AsyncMock, MagicMock, patch
        from webapp.services.inline_scorer import InlineScorer
        from rag_eval.settings import EvaluationSettings
        mock_score = MagicMock()
        mock_score.metrics = {"faithfulness": float("nan")}
        mock_score.error = ""
        mock_pipeline = MagicMock()
        mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
        with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
            with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
                with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
                    scorer = InlineScorer()
                    result = scorer.score(
                        question="q", answer="a", contexts=["c"],
                        ground_truth=None,
                        metrics=["faithfulness"],
                        judge_model="m", embedding_model="e",
                        settings=EvaluationSettings(_env_file=None),
                    )
        assert result["faithfulness"] is None
 ```
 - [ ] **Step 2: Run to verify FAIL**
 ```
 python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
 ```
 Expected: `ModuleNotFoundError: No module named 'webapp.services.inline_scorer'`
 - [ ] **Step 3: Create `webapp/services/inline_scorer.py`**
 ```python
 """LLM-cached inline RAGAS scorer for the real-time /api/score endpoint.
 A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by
 (judge_model, embedding_model), so repeated Dify Tool calls with the same
 models reuse existing AsyncOpenAI connections instead of creating new ones.
 """
 from __future__ import annotations
 import asyncio
 import math
 import threading
 from typing import Any
 from rag_eval.compat import ensure_ragas_import_compat
 from rag_eval.metrics.factory import build_models
 from rag_eval.metrics.pipeline import MetricPipeline
 from rag_eval.metrics.weights import compute_weighted_score
 from rag_eval.settings import EvaluationSettings
 from rag_eval.shared.models import NormalizedSample
 ensure_ragas_import_compat()
 from ragas.metrics.collections import (  # noqa: E402
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
    FactualCorrectness,
    Faithfulness,
    NoiseSensitivity,
    SemanticSimilarity,
 )
 def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]:
    """Instantiate only the RAGAS metric objects requested."""
    registry: dict[str, Any] = {
        "faithfulness": Faithfulness(llm=llm),
        "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": ContextRecall(llm=llm),
        "context_precision": ContextPrecision(llm=llm),
        "noise_sensitivity": NoiseSensitivity(llm=llm),
        "factual_correctness": FactualCorrectness(llm=llm),
        "semantic_similarity": SemanticSimilarity(embeddings=embeddings),
    }
    return {name: registry[name] for name in metrics if name in registry}
 class InlineScorer:
    """Single-sample RAGAS scorer that caches LLM/embedding clients.

    Clients are cached per ``(judge_model, embedding_model)`` pair and the
    cache is guarded by a lock so concurrent requests build each pair once.
    """

    def __init__(self) -> None:
        # Cache keyed by (judge_model, embedding_model) -> (llm, embeddings)
        self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
        self._lock = threading.Lock()

    def _get_models(
        self,
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> tuple[Any, Any]:
        """Return cached LLM/embedding clients, building them on first use.

        The cache key contains only the two model names, so the ``settings``
        passed on the first call for a pair are the ones its clients keep;
        ``settings`` given on later calls for the same pair are not consulted.
        """
        cache_key = (judge_model, embedding_model)
        with self._lock:
            if cache_key not in self._model_cache:
                llm, embeddings = build_models(judge_model, embedding_model, settings)
                self._model_cache[cache_key] = (llm, embeddings)
            return self._model_cache[cache_key]

    @staticmethod
    def _to_json_score(value: Any) -> float | None:
        """Convert one raw metric value to a JSON-safe float or ``None``.

        ``None``, non-numeric values, NaN and +/-inf all map to ``None``;
        anything else is returned as a plain ``float`` rounded to 4 decimals.
        """
        if value is None:
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        if not math.isfinite(number):
            return None
        return round(number, 4)

    def score(
        self,
        question: str,
        answer: str,
        contexts: list[str],
        ground_truth: str | None,
        metrics: list[str],
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> dict[str, float | None]:
        """Score one sample synchronously and return {metric_name: score | None}.

        Args:
            question: Question text of the sample.
            answer: Answer to be scored.
            contexts: Retrieved context fragments.
            ground_truth: Reference answer; ``None`` is passed on as ``""``.
            metrics: Names of the metrics to compute.
            judge_model: Judge LLM model name.
            embedding_model: Embedding model name.
            settings: Supplies the per-metric timeout and client configuration.

        Returns:
            One entry per metric reported by the pipeline. Missing,
            non-numeric, NaN and infinite values become ``None`` so the
            result serializes to clean JSON.
        """
        llm, embeddings = self._get_models(judge_model, embedding_model, settings)
        metric_instances = _build_metric_instances(metrics, llm, embeddings)
        pipeline = MetricPipeline(
            metrics=metric_instances,
            metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
        )
        sample = NormalizedSample(
            sample_id="inline-score",
            question=question,
            answer=answer,
            contexts=contexts,
            ground_truth=ground_truth or "",
        )
        # asyncio.run starts a fresh event loop per call and must not be
        # invoked from a thread that already runs one.
        # NOTE(review): the cached clients are reused across these loops —
        # confirm the underlying async HTTP client tolerates that.
        metric_score = asyncio.run(pipeline.score_sample(sample))
        return {
            name: self._to_json_score(value)
            for name, value in metric_score.metrics.items()
        }
 # Module-level singleton shared by FastAPI routes.
 inline_scorer = InlineScorer()
 ```
 - [ ] **Step 4: Run to verify PASS**
 ```
 python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
 ```
 Expected: both tests PASS.
 - [ ] **Step 5: Commit**
 ```
 git add webapp/services/inline_scorer.py tests/webapp/test_score_api.py
 git commit -m "feat: add InlineScorer service with LLM client caching"
 ```
 ---
 ## Task 3: `/api/score` 路由 + 鉴权 + 集成测试
 **Files:**
 - Create: `webapp/api/score.py`
 - Modify: `webapp/server.py`
 **Interfaces:**
 - Consumes:
  - `ScoreRequest`, `ScoreResponse` from `webapp.models`
  - `inline_scorer: InlineScorer` from `webapp.services.inline_scorer`
  - `EvaluationSettings` from `rag_eval.settings`
  - `compute_weighted_score(scores, {}) -> float | None` from `rag_eval.metrics.weights`
 - Produces: `POST /api/score` endpoint
 - [ ] **Step 1: Write failing endpoint tests**
 Add to `tests/webapp/test_score_api.py`:
 ```python
 # ── Fixtures ─────────────────────────────────────────────────────────────────
 import pytest
 from fastapi.testclient import TestClient
 from unittest.mock import MagicMock, patch
@pytest.fixture()
 def client(monkeypatch):
    """TestClient with mocked InlineScorer."""
    import webapp.api.score as score_mod
    mock_scorer = MagicMock()
    mock_scorer.score.return_value = {
        "faithfulness": 0.85,
        "answer_relevancy": 0.90,
    }
    monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
    from webapp.server import create_app
    return TestClient(create_app())
 class TestScoreEndpoint:
    def test_post_score_returns_200(self, client):
        resp = client.post("/api/score", json={
            "question": "What is CT?",
            "answer": "CT is imaging.",
            "contexts": "CT uses X-rays.",
        })
        assert resp.status_code == 200
        data = resp.json()
        assert "scores" in data
        assert "latency_ms" in data
        assert data["scores"]["faithfulness"] == pytest.approx(0.85)
    def test_weighted_score_computed(self, client):
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
        })
        assert resp.status_code == 200
        data = resp.json()
        # weighted_score is the mean of all non-null scores
        assert data["weighted_score"] is not None
    def test_missing_required_fields_returns_422(self, client):
        resp = client.post("/api/score", json={"question": "q"})
        assert resp.status_code == 422
    def test_invalid_metric_name_returns_422(self, client):
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["not_a_metric"],
        })
        assert resp.status_code == 422
    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["faithfulness", "context_recall"],
        })
        assert resp.status_code == 200
        data = resp.json()
        assert "context_recall" in data["skipped_metrics"]
    def test_contexts_split_on_separator(self, client, monkeypatch):
        """contexts string is split before passing to scorer."""
        import webapp.api.score as score_mod
        calls = []
        def capture(*args, **kwargs):
            calls.append(kwargs.get("contexts", []))
            return {"faithfulness": 0.9}
        monkeypatch.setattr(score_mod.inline_scorer, "score", capture)
        client.post("/api/score", json={
            "question": "q", "answer": "a",
            "contexts": "ctx1 |||| ctx2",
            "context_separator": " |||| ",
        })
        assert calls[0] == ["ctx1", "ctx2"]
    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
        """When SCORE_API_TOKEN is set, requests without token get 401."""
        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        mock_settings = EvaluationSettings(_env_file=None)
        object.__setattr__(mock_settings, "score_api_token", "secret-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
        mock_scorer = MagicMock()
        mock_scorer.score.return_value = {"faithfulness": 0.9}
        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
        from webapp.server import create_app
        test_client = TestClient(create_app())
        # No auth header → 401
        resp = test_client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
        })
        assert resp.status_code == 401
        # Correct token → 200
        resp = test_client.post("/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer secret-token"},
        )
        assert resp.status_code == 200
    def test_wrong_bearer_token_returns_401(self, monkeypatch):
        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        mock_settings = EvaluationSettings(_env_file=None)
        object.__setattr__(mock_settings, "score_api_token", "correct-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
        mock_scorer = MagicMock()
        mock_scorer.score.return_value = {}
        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
        from webapp.server import create_app
        test_client = TestClient(create_app())
        resp = test_client.post("/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer wrong-token"},
        )
        assert resp.status_code == 401
 ```
 - [ ] **Step 2: Run to verify FAIL**
 ```
 python -m pytest tests/webapp/test_score_api.py::TestScoreEndpoint -v
 ```
 Expected: `ModuleNotFoundError: No module named 'webapp.api.score'`
 - [ ] **Step 3: Create `webapp/api/score.py`**
 ```python
 """Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
 from __future__ import annotations
 import time
 from fastapi import APIRouter, Header, HTTPException
 from typing import Annotated
 from rag_eval.metrics.weights import compute_weighted_score
 from rag_eval.settings import EvaluationSettings
 from webapp.models import ScoreRequest, ScoreResponse
 from webapp.services.inline_scorer import inline_scorer
 router = APIRouter(prefix="/api/score", tags=["score"])
 def _get_settings() -> EvaluationSettings:
    """Return a fresh EvaluationSettings instance (overridable in tests).

    A new instance is constructed on every call; the route obtains its
    settings only through this function, which gives tests a single name to
    monkeypatch.
    """
    return EvaluationSettings()
 def _check_auth(authorization: str | None, token: str) -> None:
    """Raise 401 if Bearer token does not match the configured token."""
    if authorization is None:
        raise HTTPException(status_code=401, detail="Missing Authorization header.")
    parts = authorization.split(" ", 1)
    if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
        raise HTTPException(status_code=401, detail="Invalid Bearer token.")
@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分（Dify 外部 Tool）",
    responses={
        200: {"description": "各指标得分和加权综合得分。"},
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败。"},
    },
 )
 # NOTE(review): the docstring below is left untranslated on purpose — FastAPI
 # presumably surfaces it as the operation description in the OpenAPI docs.
 def score_sample(
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
 ) -> ScoreResponse:
    """接受单条问答记录，同步运行 RAGAS 指标打分，实时返回各指标得分。
    供 Dify 外部 Tool 调用。将 `contexts` 字段按 `context_separator` 拆分后传入
    RAGAS 管道；`ground_truth` 缺失时自动跳过依赖它的指标。
    """
    settings = _get_settings()
    # Authenticate only when a token is configured; an empty token disables auth.
    if settings.score_api_token:
        _check_auth(authorization, settings.score_api_token)
    # Per-request model overrides win over the configured defaults.
    judge_model = request.judge_model or settings.ragas_judge_model
    embedding_model = request.embedding_model or settings.ragas_embedding_model
    # Metrics dropped by effective_metrics() are reported back as skipped.
    effective = request.effective_metrics()
    requested = set(request.metrics)
    skipped = sorted(requested - set(effective))
    if not effective:
        # All requested metrics require ground_truth which is absent.
        return ScoreResponse(
            scores={m: None for m in request.metrics},
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=skipped,
        )
    # Monotonic clock: latency is unaffected by wall-clock adjustments.
    t0 = time.monotonic()
    try:
        raw_scores = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=effective,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=settings,
        )
    except Exception as exc:  # noqa: BLE001
        # Scoring failures are returned in the `error` field of a normal
        # response rather than raised; `scores` is left empty in that case.
        latency_ms = int((time.monotonic() - t0) * 1000)
        return ScoreResponse(
            scores={},
            weighted_score=None,
            latency_ms=latency_ms,
            skipped_metrics=skipped,
            error=f"{type(exc).__name__}: {exc}",
        )
    latency_ms = int((time.monotonic() - t0) * 1000)
    # Merge: skipped metrics appear as null in final scores dict.
    all_scores: dict[str, float | None] = {m: None for m in request.metrics}
    all_scores.update(raw_scores)
    # Weighted score = equal-weight mean of non-null effective scores.
    # The empty dict is the metric_weights argument (no custom weights).
    weighted = compute_weighted_score(
        {k: v for k, v in raw_scores.items() if v is not None},
        {},
    )
    return ScoreResponse(
        scores=all_scores,
        weighted_score=round(weighted, 4) if weighted is not None else None,
        latency_ms=latency_ms,
        skipped_metrics=skipped,
    )
 ```
 - [ ] **Step 4: Register router in `webapp/server.py`**
 Add `score` to the import line:
 ```python
 from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
 ```
 Add the router registration after `pipeline.router`:
 ```python
 app.include_router(score.router)
 ```
 Add `"score"` tag to `OPENAPI_TAGS` list (insert before `"meta"`):
 ```python
    {
        "name": "score",
        "description": (
            "**实时评分 API（Dify 外部 Tool）**\n\n"
            "接受单条问答记录 `(question, answer, contexts, ground_truth)`，\n"
            "同步运行 RAGAS 指标打分，返回各指标得分和加权综合得分。\n\n"
            "适用场景：Dify Agent 在回答后即时调用，用于质量监控或自我改进。\n\n"
            "**鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需携带 "
            "`Authorization: Bearer <token>` 请求头。"
        ),
    },
 ```
 Also update the `description` field in `FastAPI(...)` to add a bullet:
 ```python
 "- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
 ```
 - [ ] **Step 5: Run to verify PASS**
 ```
 python -m pytest tests/webapp/test_score_api.py -v
 ```
 Expected: all tests PASS.
 - [ ] **Step 6: Verify server boots and route appears**
 ```
 python -c "
 from webapp.server import create_app
 app = create_app()
 routes = [(r.path, list(getattr(r,'methods',[]))) for r in app.routes]
 score_routes = [(p,m) for p,m in routes if 'score' in p]
 print('Score routes:', score_routes)
 "
 ```
 Expected output:
 ```
 Score routes: [('/api/score', ['POST'])]
 ```
 - [ ] **Step 7: Commit**
 ```
 git add webapp/api/score.py webapp/server.py tests/webapp/test_score_api.py
 git commit -m "feat: add POST /api/score endpoint for Dify real-time scoring"
 ```
 ---
 ## Task 4: 全量回归 + `.env.example` 更新
 **Files:**
 - Modify: `.env.example`
 - [ ] **Step 1: Add SCORE_API_TOKEN to `.env.example`**
 Add this block after `DATASET_GENERATOR_MODEL=qwen3.6-plus`:
 ```
 # ===== Dify 集成 — 实时评分 API =====
 # 为 /api/score 端点设置 Bearer Token 鉴权（留空则不鉴权，适合内网部署）
 # Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
 SCORE_API_TOKEN=
 ```
 - [ ] **Step 2: Run full test suite**
 ```
 python -m pytest tests/ -v --tb=short
 ```
 Pre-existing failures to ignore:
 - `test_normalize_sample_pdf_offline_smoke_row` — 缺少 CSV fixture
 - `test_evaluator_and_reporting_write_run_assets` — 预存在的断言不匹配
 - `test_question_generator_rejects_invalid_json` — retry 循环吞掉了 ValueError
 - `test_question_generator_rejects_non_list_samples` — 同上
 **零新增失败**即为通过。
 - [ ] **Step 3: Final commit**
 ```
 git add .env.example
 git commit -m "feat: Dify score API complete — add SCORE_API_TOKEN to .env.example
 - POST /api/score: real-time RAGAS scoring for Dify external Tool
 - ScoreRequest/ScoreResponse Pydantic models with full field docs
 - InlineScorer with (judge_model, embedding_model) client cache
 - Bearer token auth via SCORE_API_TOKEN env var (optional)
 - contexts split by configurable separator (default ' |||| ')
 - GT-dependent metrics auto-skipped when ground_truth absent
 - Full test coverage (22 new tests)
 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>"
 ```
 ---
 ## Dify 侧配置参考
 任务完成后，在 Dify 「工具」→「自定义工具」中填写如下 OpenAPI Schema：
 ```yaml
 openapi: 3.1.0
 info:
  title: RAGAS 实时评分
  version: 1.0.0
 servers:
  - url: http://<your-server>:8800
 paths:
  /api/score:
    post:
      operationId: scoreQA
      summary: 对一条问答记录进行 RAGAS 评分
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required: [question, answer, contexts]
              properties:
                question:       { type: string }
                answer:         { type: string }
                contexts:       { type: string, description: "多段上下文用 ' |||| ' 拼接" }
                ground_truth:   { type: string }
                metrics:
                  type: array
                  items: { type: string }
                  default: [faithfulness, answer_relevancy, context_recall, context_precision]
      responses:
        '200':
          description: 评分结果
          content:
            application/json:
              schema:
                type: object
                properties:
                  scores:         { type: object }
                  weighted_score: { type: number }
                  latency_ms:     { type: integer }
                  skipped_metrics: { type: array, items: { type: string } }
 ```
--- a/docs/superpowers/plans/2026-06-24-async-score-jobs.md
+++ b/docs/superpowers/plans/2026-06-24-async-score-jobs.md
@@ -0,0 +1,808 @@
 # 异步评分记录（Async Score Jobs）Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** 新增 `POST /api/score/async` 异步端点，结果持久化至 `outputs/score-jobs/`，并在前端新增「评分记录」页面展示。
 **Architecture:** 新建 `ScoreJobManager`（复用 `pipeline_task_manager` 线程池模式）在后台执行 `InlineScorer.score()`，写入 JSON 文件；新增三个 REST 端点；前端新增导航页加载并轮询记录。
 **Tech Stack:** Python 3.12, FastAPI, Pydantic v2, threading, Vanilla JS, pytest
 ## Global Constraints
 - Python 3.12+，PEP 8，4 空格缩进，类型注解必须
 - 存储路径：`outputs/score-jobs/<job_id>.json`
 - 复用现有 `ScoreRequest`（含 `effective_metrics()` 和 `contexts_as_list()` 方法）
 - 复用现有 `InlineScorer.score()` 和 `compute_weighted_score()`
 - 所有测试用 pytest，不依赖真实 LLM
 ---
 ## 文件清单
 | 操作 | 文件 | 职责 |
 |------|------|------|
 | 新建 | `webapp/services/score_job_manager.py` | ScoreJobManager：线程池 + JSON 持久化 |
 | 新建 | `webapp/api/score_jobs.py` | 3 个端点路由 |
 | 新建 | `webapp/static/js/score_jobs.js` | 前端列表 + 轮询逻辑 |
 | 新建 | `tests/webapp/test_score_jobs_api.py` | API 集成测试 |
 | 修改 | `webapp/models.py` | 新增 `AsyncScoreJobStatus`、`AsyncScoreJobResponse` |
 | 修改 | `webapp/server.py` | 注册 score_jobs router，更新 OPENAPI_TAGS 和 description |
 | 修改 | `webapp/static/index.html` | 新增导航项 + `#view-scorejobs` section |
 | 修改 | `webapp/static/js/api.js` | 新增 `scoreJobsAsync()`、`getScoreJob()`、`listScoreJobs()` |
 | 修改 | `webapp/static/js/app.js` | 注册 `scorejobs` 视图、加载调用 |
 ---
 ## Task 1: Pydantic 模型 + ScoreJobManager
 **Files:**
 - Modify: `webapp/models.py`
 - Create: `webapp/services/score_job_manager.py`
 - Create: `tests/webapp/test_score_jobs_api.py` (partial)
 **Interfaces:**
 - Produces:
  - `AsyncScoreJobStatus` Pydantic model
  - `AsyncScoreJobResponse` Pydantic model
  - `score_job_manager: ScoreJobManager` singleton
  - `ScoreJobManager.submit(request: ScoreRequest) -> AsyncScoreJobStatus`
  - `ScoreJobManager.get(job_id: str) -> AsyncScoreJobStatus | None`
  - `ScoreJobManager.list_jobs() -> list[AsyncScoreJobStatus]`
 - [ ] **Step 1: Add models to `webapp/models.py`**
 Append at the end of the file, after the existing `ScoreResponse` model:
 ```python
 # ---------------------------------------------------------------------------
 # 异步评分记录模型
 # ---------------------------------------------------------------------------
 class AsyncScoreJobResponse(BaseModel):
    """Immediate response after submitting an async score job."""
    # Identifier the caller uses to query the job afterwards.
    job_id: str = Field(description="任务唯一标识符，用于后续查询结果。")
    # Status at submission time; defaults to "queued".
    status: str = Field(default="queued", description="初始状态：queued。")
 class AsyncScoreJobStatus(BaseModel):
    """Full state of one async score job, persisted to disk."""
    # Unique job identifier; also the stem of the job's JSON file name.
    job_id: str = Field(description="任务唯一标识符。")
    # Lifecycle state: queued | running | completed | failed.
    status: str = Field(description="queued | running | completed | failed")
    # Creation / completion timestamps as ISO 8601 strings; empty until set.
    created_at: str = Field(default="", description="创建时间（ISO 8601 UTC）。")
    finished_at: str = Field(default="", description="完成时间（ISO 8601 UTC）。")
    # Truncated snapshot of the request (question/answer prefix, metrics, models).
    request_summary: dict = Field(
        default_factory=dict,
        description="请求参数快照（question 前80字、metrics、judge_model 等）。",
    )
    # Per-metric score; None for a metric that produced no value.
    scores: dict[str, float | None] = Field(default_factory=dict, description="各指标得分。")
    # Weighted aggregate of the metric scores, or None when unavailable.
    weighted_score: float | None = Field(default=None, description="加权综合得分。")
    # Wall-clock scoring duration in milliseconds.
    latency_ms: int = Field(default=0, description="评分耗时毫秒。")
    # Requested metrics that were not part of the effective metric set.
    skipped_metrics: list[str] = Field(default_factory=list)
    # "<ExceptionType>: <message>" when the job failed, otherwise None.
    error: str | None = Field(default=None)
 ```
 - [ ] **Step 2: Write failing tests**
 Create `tests/webapp/test_score_jobs_api.py`:
 ```python
 """Tests for async score jobs API."""
 from __future__ import annotations
 import json
 import time
 import pytest
 from unittest.mock import MagicMock, patch
 from fastapi.testclient import TestClient
@pytest.fixture()
 def client(tmp_path, monkeypatch):
    import webapp.services.score_job_manager as mgr_mod
    from webapp.services.score_job_manager import ScoreJobManager
    fresh_mgr = ScoreJobManager(jobs_dir=tmp_path / "score-jobs")
    monkeypatch.setattr(mgr_mod, "score_job_manager", fresh_mgr)
    import webapp.api.score_jobs as api_mod
    monkeypatch.setattr(api_mod, "score_job_manager", fresh_mgr)
    from webapp.server import create_app
    return TestClient(create_app())
 class TestScoreJobManager:
    """Unit tests for ScoreJobManager; scoring is always mocked, no real LLM is called."""
    def test_submit_returns_job_status_with_queued(self, tmp_path):
        """submit() returns immediately with a job id and a valid status."""
        from webapp.services.score_job_manager import ScoreJobManager
        from webapp.models import ScoreRequest
        mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs")
        req = ScoreRequest(question="q", answer="a", metrics=["answer_relevancy"])
        # Stub the worker entry point that submit() actually schedules (_run).
        # patch.object raises AttributeError for a name the manager does not
        # define, so the target must match the real method name.
        with patch.object(mgr, "_run") as mock_run:
            mock_run.return_value = None
            status = mgr.submit(req)
        assert status.status in ("queued", "running", "completed")
        assert len(status.job_id) > 0
    def test_get_returns_none_for_unknown_id(self, tmp_path):
        """get() returns None for an id that was never submitted."""
        from webapp.services.score_job_manager import ScoreJobManager
        mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs")
        assert mgr.get("nonexistent") is None
    def test_list_returns_empty_initially(self, tmp_path):
        """A manager created on an empty directory lists no jobs."""
        from webapp.services.score_job_manager import ScoreJobManager
        mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs")
        assert mgr.list_jobs() == []
    def test_completed_job_persisted_to_disk(self, tmp_path):
        """A finished job is written to <jobs_dir>/<job_id>.json."""
        from webapp.services.score_job_manager import ScoreJobManager
        from webapp.models import ScoreRequest
        mgr = ScoreJobManager(jobs_dir=tmp_path / "jobs", max_workers=1)
        req = ScoreRequest(question="q?", answer="a.", metrics=["answer_relevancy"])
        mock_scorer = MagicMock()
        mock_scorer.score.return_value = {"answer_relevancy": 0.85}
        with patch("webapp.services.score_job_manager.inline_scorer", mock_scorer):
            with patch("webapp.services.score_job_manager.EvaluationSettings"):
                status = mgr.submit(req)
                # Wait inside the patch context: the worker thread resolves
                # inline_scorer / EvaluationSettings when it runs, so leaving
                # the context early could let the real scorer execute.
                for _ in range(20):
                    s = mgr.get(status.job_id)
                    if s and s.status in ("completed", "failed"):
                        break
                    time.sleep(0.2)
        s = mgr.get(status.job_id)
        assert s is not None
        assert s.status in ("completed", "failed")
        json_path = tmp_path / "jobs" / f"{status.job_id}.json"
        assert json_path.exists()
        data = json.loads(json_path.read_text(encoding="utf-8"))
        assert data["job_id"] == status.job_id
 ```
 - [ ] **Step 3: Run to verify FAIL**
 ```
 cd C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas
 python -m pytest tests/webapp/test_score_jobs_api.py::TestScoreJobManager -v
 ```
 Expected: `ModuleNotFoundError: No module named 'webapp.services.score_job_manager'`
 - [ ] **Step 4: Create `webapp/services/score_job_manager.py`**
 ```python
 """Background task manager for async RAGAS single-sample scoring.
 Each job runs InlineScorer.score() in a thread pool and persists the
 result as a JSON file under outputs/score-jobs/<job_id>.json so results
 survive server restarts and can be listed by the frontend.
 """
 from __future__ import annotations
 import json
 import math
 import threading
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 from rag_eval.metrics.weights import compute_weighted_score
 from rag_eval.settings import EvaluationSettings
 from webapp.models import AsyncScoreJobStatus, ScoreRequest
 from webapp.services.inline_scorer import inline_scorer
 _REPO_ROOT = Path(__file__).resolve().parents[2]
 _DEFAULT_JOBS_DIR = _REPO_ROOT / "outputs" / "score-jobs"
 def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()
 class ScoreJobManager:
    """Thread-pool manager for async RAGAS scoring jobs with JSON persistence.
    Every job is tracked in an in-memory cache and mirrored to
    ``<jobs_dir>/<job_id>.json`` on each state change. Lifecycle:
    ``queued`` -> ``running`` -> ``completed`` | ``failed``.
    """
    # Statuses after which a job is never updated again.
    _TERMINAL_STATUSES = frozenset({"completed", "failed"})
    def __init__(
        self,
        jobs_dir: Path = _DEFAULT_JOBS_DIR,
        max_workers: int = 4,
    ) -> None:
        """Create the jobs directory, start the pool and reload saved jobs.
        Args:
            jobs_dir: Directory that holds one JSON file per job.
            max_workers: Maximum number of scoring jobs executed concurrently.
        """
        self._jobs_dir = Path(jobs_dir)
        self._jobs_dir.mkdir(parents=True, exist_ok=True)
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        # In-memory index: job_id -> AsyncScoreJobStatus (authoritative while running)
        self._cache: dict[str, AsyncScoreJobStatus] = {}
        self._lock = threading.Lock()
        self._load_existing()
    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #
    def submit(self, request: ScoreRequest) -> AsyncScoreJobStatus:
        """Queue one scoring job and return its initial status immediately.
        Args:
            request: The scoring request; only a truncated summary is stored.
        Returns:
            The freshly created job status (``status == "queued"``).
        """
        job_id = uuid.uuid4().hex[:12]
        status = AsyncScoreJobStatus(
            job_id=job_id,
            status="queued",
            created_at=_now_iso(),
            request_summary={
                "question": request.question[:80],
                "answer": (request.answer or "")[:80],
                "metrics": list(request.metrics),
                "judge_model": request.judge_model or "",
                "embedding_model": request.embedding_model or "",
                "has_contexts": bool(request.contexts),
                "has_ground_truth": bool(request.ground_truth),
            },
        )
        with self._lock:
            self._cache[job_id] = status
        # Persist before scheduling so the worker's writes always come later.
        self._persist(status)
        self._executor.submit(self._run, job_id, request)
        return status
    def get(self, job_id: str) -> AsyncScoreJobStatus | None:
        """Return the current status for one job, or None if unknown."""
        with self._lock:
            return self._cache.get(job_id)
    def list_jobs(self) -> list[AsyncScoreJobStatus]:
        """Return all known jobs sorted newest first (by ``created_at``)."""
        with self._lock:
            jobs = list(self._cache.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs
    # ------------------------------------------------------------------ #
    # Internal
    # ------------------------------------------------------------------ #
    @staticmethod
    def _clean_score(value: Any) -> float | None:
        """Return *value* as a finite float, or None for missing/NaN/inf/non-numeric.
        NaN and infinity are not valid JSON, so they are stored as None.
        """
        if value is None:
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        return number if math.isfinite(number) else None
    def _run(self, job_id: str, request: ScoreRequest) -> None:
        """Execute scoring in the thread pool and persist the result.
        All work happens inside one ``try`` block: an exception escaping a
        pool task is only stored on its (discarded) Future, which would
        leave the job stuck in ``running``.
        """
        import time as _time
        t0 = _time.monotonic()
        try:
            self._update(job_id, status="running")
            settings = EvaluationSettings()
            judge_model = request.judge_model or settings.ragas_judge_model
            embedding_model = request.embedding_model or settings.ragas_embedding_model
            effective = request.effective_metrics()
            requested = set(request.metrics)
            skipped = sorted(requested - set(effective))
            # Every requested metric gets a key; unevaluated ones stay None.
            scores: dict[str, float | None] = {m: None for m in request.metrics}
            weighted: float | None = None
            if effective:
                raw = inline_scorer.score(
                    question=request.question,
                    answer=request.answer,
                    contexts=request.contexts_as_list(),
                    ground_truth=request.ground_truth,
                    metrics=effective,
                    judge_model=judge_model,
                    embedding_model=embedding_model,
                    settings=settings,
                )
                cleaned = {name: self._clean_score(value) for name, value in raw.items()}
                scores.update(cleaned)
                weighted_raw = self._clean_score(
                    compute_weighted_score(
                        {k: v for k, v in cleaned.items() if v is not None}, {}
                    )
                )
                weighted = round(weighted_raw, 4) if weighted_raw is not None else None
            latency_ms = int((_time.monotonic() - t0) * 1000)
            self._update(
                job_id,
                status="completed",
                finished_at=_now_iso(),
                scores=scores,
                weighted_score=weighted,
                latency_ms=latency_ms,
                skipped_metrics=skipped,
            )
        except Exception as exc:  # noqa: BLE001
            # Boundary of the worker thread: record the failure on the job.
            latency_ms = int((_time.monotonic() - t0) * 1000)
            self._update(
                job_id,
                status="failed",
                finished_at=_now_iso(),
                latency_ms=latency_ms,
                error=f"{type(exc).__name__}: {exc}",
            )
    def _update(self, job_id: str, **kwargs: Any) -> None:
        """Merge kwargs into the job status and persist; unknown ids are ignored."""
        with self._lock:
            existing = self._cache.get(job_id)
            if existing is None:
                return
            updated = existing.model_copy(update=kwargs)
            self._cache[job_id] = updated
        # Disk I/O happens outside the lock so readers are not blocked.
        self._persist(updated)
    def _persist(self, status: AsyncScoreJobStatus) -> None:
        """Write one job's status to its JSON file atomically.
        The payload goes to a sibling ``.tmp`` file that is then renamed over
        the target, so a crash mid-write cannot leave a truncated JSON file.
        """
        path = self._jobs_dir / f"{status.job_id}.json"
        # "<id>.json.tmp" does not match the "*.json" glob used on startup.
        tmp_path = path.with_name(f"{path.name}.tmp")
        tmp_path.write_text(
            json.dumps(status.model_dump(), ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        tmp_path.replace(path)
    def _load_existing(self) -> None:
        """Load previously persisted jobs from disk into memory on startup.
        Jobs saved as ``queued``/``running`` were scheduled on an executor
        that no longer exists and can never finish; they are marked
        ``failed`` so clients polling them reach a terminal state.
        """
        for path in sorted(self._jobs_dir.glob("*.json")):
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                status = AsyncScoreJobStatus.model_validate(data)
            except Exception:  # noqa: BLE001
                continue  # Corrupt or unreadable file — best-effort skip
            if status.status not in self._TERMINAL_STATUSES:
                status = status.model_copy(
                    update={
                        "status": "failed",
                        "finished_at": status.finished_at or _now_iso(),
                        "error": status.error
                        or "Interrupted: server restarted before the job finished",
                    }
                )
                try:
                    self._persist(status)
                except OSError:
                    pass  # Keep the corrected in-memory state even if the disk write fails
            self._cache[status.job_id] = status
 # Module-level singleton shared by FastAPI routes.
 score_job_manager = ScoreJobManager()
 ```
 - [ ] **Step 5: Run to verify tests PASS**
 ```
 python -m pytest tests/webapp/test_score_jobs_api.py::TestScoreJobManager -v
 ```
 Expected: 4 tests PASS
 - [ ] **Step 6: Commit**
 ```
 git add webapp/models.py webapp/services/score_job_manager.py tests/webapp/test_score_jobs_api.py
 git commit -m "feat: add AsyncScoreJobStatus model and ScoreJobManager with JSON persistence"
 ```
 ---
 ## Task 2: API 端点
 **Files:**
 - Create: `webapp/api/score_jobs.py`
 - Modify: `webapp/server.py`
 - Modify: `tests/webapp/test_score_jobs_api.py`
 **Interfaces:**
 - Consumes: `score_job_manager: ScoreJobManager`, `AsyncScoreJobResponse`, `AsyncScoreJobStatus`, `ScoreRequest`
 - Produces: `POST /api/score/async`, `GET /api/score/jobs`, `GET /api/score/jobs/{job_id}`
 - [ ] **Step 1: Add API tests to `tests/webapp/test_score_jobs_api.py`**
 Append this class:
 ```python
 class TestScoreJobsEndpoint:
    """HTTP-level tests for the async score job routes (worker stubbed out)."""
    def test_submit_async_returns_202(self, client):
        """POST /api/score/async answers 202 with a job id and status "queued"."""
        # Patch the worker method that actually exists (_run), as the other
        # tests in this class do; patching a missing attribute raises
        # AttributeError before the request is even sent.
        with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
            resp = client.post("/api/score/async", json={
                "question": "q?", "answer": "a.",
                "metrics": ["answer_relevancy"],
            })
        assert resp.status_code == 202
        data = resp.json()
        assert "job_id" in data
        assert data["status"] == "queued"
    def test_get_unknown_job_returns_404(self, client):
        """GET on an unknown job id answers 404."""
        resp = client.get("/api/score/jobs/nonexistent")
        assert resp.status_code == 404
    def test_list_jobs_returns_empty_initially(self, client):
        """GET /api/score/jobs returns an empty list on a fresh manager."""
        resp = client.get("/api/score/jobs")
        assert resp.status_code == 200
        assert resp.json()["jobs"] == []
    def test_submitted_job_appears_in_list(self, client):
        """A submitted job is visible in the list endpoint."""
        with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
            resp = client.post("/api/score/async", json={
                "question": "q?", "answer": "a.",
                "metrics": ["answer_relevancy"],
            })
        job_id = resp.json()["job_id"]
        list_resp = client.get("/api/score/jobs")
        ids = [j["job_id"] for j in list_resp.json()["jobs"]]
        assert job_id in ids
    def test_get_job_by_id(self, client):
        """A submitted job can be fetched by its id."""
        with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
            resp = client.post("/api/score/async", json={
                "question": "q?", "answer": "a.",
                "metrics": ["answer_relevancy"],
            })
        job_id = resp.json()["job_id"]
        get_resp = client.get(f"/api/score/jobs/{job_id}")
        assert get_resp.status_code == 200
        assert get_resp.json()["job_id"] == job_id
 ```
 - [ ] **Step 2: Run to verify FAIL**
 ```
 python -m pytest tests/webapp/test_score_jobs_api.py::TestScoreJobsEndpoint -v
 ```
 Expected: FAIL — `ModuleNotFoundError: No module named 'webapp.api.score_jobs'`
 - [ ] **Step 3: Create `webapp/api/score_jobs.py`**
 ```python
 """Routes for async RAGAS scoring jobs (Dify fire-and-forget integration)."""
 from __future__ import annotations
 import logging
 from fastapi import APIRouter, HTTPException
 from webapp.models import AsyncScoreJobResponse, AsyncScoreJobStatus, ScoreRequest
 from webapp.services.score_job_manager import score_job_manager
 router = APIRouter(prefix="/api/score", tags=["score"])
 logger = logging.getLogger("webapp.api.score_jobs")
@router.post(
    "/async",
    status_code=202,
    response_model=AsyncScoreJobResponse,
    summary="提交异步评分任务（Dify 推荐方式）",
    responses={
        202: {
            "description": "任务已排队，立即返回 job_id。通过 GET /api/score/jobs/{job_id} 查询结果。",
            "content": {
                "application/json": {
                    "example": {"job_id": "abc123def456", "status": "queued"}
                }
            },
        },
    },
 )
 def submit_async_score(request: ScoreRequest) -> AsyncScoreJobResponse:
    """提交异步 RAGAS 评分任务，立即返回 job_id（202 Accepted）。
    评分在后台线程中执行，结果持久化至 `outputs/score-jobs/<job_id>.json`。
    在 RAGAS 平台「评分记录」页面可查看所有历史评分记录。
    **Dify 工作流推荐使用此接口**：不等待评分完成，工作流立即继续，
    避免 HTTP 节点超时。评分结果通过平台界面查看。
    """
    # The docstring above is left untranslated on purpose: FastAPI publishes
    # it as this operation's OpenAPI description.
    # NOTE(review): no Bearer-token check is visible on this route, while the
    # "score" tag description states that SCORE_API_TOKEN protects the scoring
    # API — confirm whether this endpoint needs the same auth as POST /api/score.
    # Log only metric names and presence flags, never the question/answer text.
    logger.info(
        "[score_async] submit  metrics=%s  has_ctx=%s  has_gt=%s",
        request.metrics, bool(request.contexts), bool(request.ground_truth),
    )
    # submit() queues the job on the manager's thread pool and returns at once.
    status = score_job_manager.submit(request)
    logger.info("[score_async] queued  job_id=%s", status.job_id)
    return AsyncScoreJobResponse(job_id=status.job_id, status=status.status)
@router.get(
    "/jobs",
    response_model=dict,
    summary="列出所有评分记录",
 )
 def list_score_jobs() -> dict:
    """返回所有异步评分记录，按创建时间倒序排列。"""
    # Docstring kept as-is: FastAPI publishes it as the OpenAPI description.
    # list_jobs() already returns the jobs sorted newest first.
    jobs = score_job_manager.list_jobs()
    logger.info("[score_jobs] list  count=%d", len(jobs))
    # Wrap the list in an object: {"jobs": [<job status dict>, ...]}.
    return {"jobs": [j.model_dump() for j in jobs]}
@router.get(
    "/jobs/{job_id}",
    response_model=AsyncScoreJobStatus,
    summary="查询评分记录详情",
    responses={404: {"description": "指定 job_id 的评分记录不存在。"}},
 )
 def get_score_job(job_id: str) -> AsyncScoreJobStatus:
    """返回一个异步评分任务的当前状态和结果。"""
    # Docstring kept as-is: FastAPI publishes it as the OpenAPI description.
    status = score_job_manager.get(job_id)
    # get() returns None for an id the manager does not know -> 404.
    if status is None:
        raise HTTPException(status_code=404, detail=f"Score job not found: {job_id}")
    return status
 ```
 - [ ] **Step 4: Register router in `webapp/server.py`**
 Add import:
 ```python
 from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score, score_jobs
 ```
 Add after `app.include_router(score.router)`:
 ```python
    app.include_router(score_jobs.router)
 ```
 Update `OPENAPI_TAGS`: replace the existing `"score"` entry (keep it before `"meta"`) with:
 ```python
    {
        "name": "score",
        "description": (
            "**实时评分 API（同步）** — `POST /api/score`\n\n"
            "**异步评分 API（Dify 推荐）** — `POST /api/score/async`\n\n"
            "异步方式立即返回 job_id（202），评分在后台执行，结果在「评分记录」页查看。\n\n"
            "**鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需携带 "
            "`Authorization: Bearer <token>` 请求头。"
        ),
    },
 ```
 > Note: this replaces the existing `"score"` entry in `OPENAPI_TAGS`.
 - [ ] **Step 5: Verify no route conflict**
 ```
 python -c "
 from webapp.server import create_app
 app = create_app()
 score_routes = [(r.path, list(getattr(r,'methods',[]))) for r in app.routes if 'score' in r.path]
 print(score_routes)
 "
 ```
 Expected: shows `/api/score`, `/api/score/async`, `/api/score/jobs`, `/api/score/jobs/{job_id}`
 - [ ] **Step 6: Run API tests**
 ```
 python -m pytest tests/webapp/test_score_jobs_api.py -v --tb=short
 ```
 Expected: all 9 tests PASS
 - [ ] **Step 7: Commit**
 ```
 git add webapp/api/score_jobs.py webapp/server.py tests/webapp/test_score_jobs_api.py
 git commit -m "feat: add POST /api/score/async and GET /api/score/jobs endpoints"
 ```
 ---
 ## Task 3: 前端「评分记录」页
 **Files:**
 - Modify: `webapp/static/index.html`
 - Modify: `webapp/static/js/api.js`
 - Modify: `webapp/static/js/app.js`
 - Create: `webapp/static/js/score_jobs.js`
 **Interfaces:**
 - Consumes: `GET /api/score/jobs`, `GET /api/score/jobs/{job_id}`
 - Produces: `#view-scorejobs` section, `ScoreJobs` JS object
 - [ ] **Step 1: Add API methods to `webapp/static/js/api.js`**
 Add before the closing `};`:
 ```javascript
  // 异步评分记录 API
  scoreJobsAsync(body) { return API.post("/api/score/async", body); },
  getScoreJob(jobId) { return API.get(`/api/score/jobs/${encodeURIComponent(jobId)}`); },
  listScoreJobs() { return API.get("/api/score/jobs"); },
 ```
 - [ ] **Step 2: Add nav item and section to `webapp/static/index.html`**
 In the `<nav class="nav">` block, add after the `profiles` nav-item and before the `apidocs` nav-item:
 ```html
        <button class="nav-item" data-view="scorejobs">
          <span class="nav-ico">📋</span><span>评分记录</span>
        </button>
 ```
 Add a new section before the `<!-- API 文档视图 -->` comment:
 ```html
      <!-- 评分记录视图 -->
      <section class="view" id="view-scorejobs" hidden>
        <div class="panel">
          <div class="panel-head">
            <h2>评分记录</h2>
            <span class="muted" style="font-size:13px">来自 Dify 异步评分任务（POST /api/score/async）</span>
          </div>
        </div>
        <div id="scorejobs-container"></div>
        <div class="empty" id="scorejobs-empty" hidden>
          <p>暂无评分记录。</p>
          <p class="muted">在 Dify 工作流中调用 <code>POST /api/score/async</code> 后，记录将在此显示。</p>
        </div>
      </section>
 ```
 - [ ] **Step 3: Create `webapp/static/js/score_jobs.js`**
 ```javascript
 // score_jobs.js — 评分记录页面逻辑（异步 RAGAS 评分结果列表）
 const ScoreJobs = {
  _pollTimers: {},   // job_id -> setInterval handle
  // Fetch all score jobs, render one row per job and start polling every
  // job that is still queued or running.
  async load() {
    const container = document.getElementById("scorejobs-container");
    const empty = document.getElementById("scorejobs-empty");
    // Hide the empty state up front so a block left visible by an earlier
    // load never shows next to the loading or error message.
    empty.hidden = true;
    container.innerHTML = '<p class="muted">加载中…</p>';
    try {
      const data = await API.listScoreJobs();
      const jobs = data.jobs || [];
      container.innerHTML = "";
      if (jobs.length === 0) {
        empty.hidden = false;
        return;
      }
      jobs.forEach(job => container.appendChild(ScoreJobs.renderRow(job)));
      // Auto-poll any queued/running jobs
      jobs.forEach(job => {
        if (job.status === "queued" || job.status === "running") {
          ScoreJobs._startPoll(job.job_id);
        }
      });
    } catch (err) {
      container.innerHTML = `<p class="muted">加载失败：${App.escape(err.message)}</p>`;
    }
  },
  // Build the row element for one job; its id lets the poller find it later.
  renderRow(job) {
    const row = document.createElement("div");
    row.className = "panel score-job-row";
    row.id = `score-job-${job.job_id}`;
    row.innerHTML = ScoreJobs._rowHtml(job);
    return row;
  },
  // Return the inner HTML of one row: title, status badge, meta line and
  // either the score chips, the error text or a "scoring" placeholder.
  _rowHtml(job) {
    const time = App.shortTime(job.created_at);
    const question = App.escape((job.request_summary?.question || "—").slice(0, 50));
    const metrics = (job.request_summary?.metrics || []).join(", ");
    // Escape the status like every other server-provided value in this row.
    const status = App.escape(String(job.status));
    const statusBadge = `<span class="badge ${status}">${status}</span>`;
    let scoreHtml = "";
    if (job.status === "completed") {
      scoreHtml = Object.entries(job.scores || {})
        .map(([k, v]) => {
          const cls = App.scoreClass(v);
          const text = v === null || v === undefined ? "n/a" : Number(v).toFixed(3);
          return `<span class="metric-chip" title="${App.escape(k)}">${App.escape(App.shortMetric(k))} <b class="${cls}">${text}</b></span>`;
        })
        .join(" ");
      if (job.weighted_score !== null && job.weighted_score !== undefined) {
        const cls = App.scoreClass(job.weighted_score);
        scoreHtml += ` <span class="metric-chip">综合 <b class="${cls}">${Number(job.weighted_score).toFixed(3)}</b></span>`;
      }
    } else if (job.status === "failed") {
      scoreHtml = `<span class="muted" style="color:var(--bad)">${App.escape(job.error || "未知错误")}</span>`;
    } else {
      scoreHtml = `<span class="muted">评分中…</span>`;
    }
    return `
      <div class="run-card-head">
        <div class="run-card-title">${question}</div>
        <div>${statusBadge}</div>
      </div>
      <div class="run-card-meta">
        <div>指标：${App.escape(metrics)} · ${time} · ${job.latency_ms}ms</div>
      </div>
      <div class="run-card-metrics">${scoreHtml}</div>
    `;
  },
  // Poll one job every 5 s and refresh its row; stops when the job reaches
  // completed/failed or when a request fails. At most one timer per job.
  _startPoll(jobId) {
    if (ScoreJobs._pollTimers[jobId]) return;
    ScoreJobs._pollTimers[jobId] = setInterval(async () => {
      try {
        const job = await API.getScoreJob(jobId);
        const el = document.getElementById(`score-job-${jobId}`);
        if (el) el.innerHTML = ScoreJobs._rowHtml(job);
        if (job.status === "completed" || job.status === "failed") {
          clearInterval(ScoreJobs._pollTimers[jobId]);
          delete ScoreJobs._pollTimers[jobId];
        }
      } catch (_e) {
        clearInterval(ScoreJobs._pollTimers[jobId]);
        delete ScoreJobs._pollTimers[jobId];
      }
    }, 5000);
  },
  // Cancel every active poll timer (used when leaving the view).
  stopAllPolls() {
    Object.values(ScoreJobs._pollTimers).forEach(t => clearInterval(t));
    ScoreJobs._pollTimers = {};
  },
 };
 ```
 - [ ] **Step 4: Update `webapp/static/js/app.js`**
 Add `"scorejobs"` to the `views` array and `titles` object:
 ```javascript
  views: ["runs", "new", "report", "profiles", "scorejobs", "apidocs"],
  titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置", scorejobs: "评分记录", apidocs: "API 文档" },
 ```
 Add in `_doSwitch` after `if (view === "profiles") Profiles.load();`:
 ```javascript
    if (view === "scorejobs") ScoreJobs.load();
 ```
 Add `ScoreJobs.stopAllPolls();` when switching away, in `_doSwitch` before view switching logic:
 ```javascript
    // Stop score job pollers when leaving the scorejobs view
    if (App.activeView === "scorejobs" && view !== "scorejobs") ScoreJobs.stopAllPolls();
 ```
 - [ ] **Step 5: Add script tag to `webapp/static/index.html`**
 Add before `<script src="/static/js/app.js"></script>`:
 ```html
  <script src="/static/js/score_jobs.js"></script>
 ```
 - [ ] **Step 6: Verify server boots**
 ```
 python -c "from webapp.server import create_app; create_app(); print('OK')"
 ```
 Expected: `OK`
 Also verify HTML has all new elements:
 ```
 python -c "
 c = open('webapp/static/index.html', encoding='utf-8').read()
 assert 'view-scorejobs' in c
 assert 'scorejobs-container' in c
 assert '评分记录' in c
 print('HTML OK')
 "
 ```
 - [ ] **Step 7: Commit**
 ```
 git add webapp/static/index.html webapp/static/js/api.js webapp/static/js/app.js webapp/static/js/score_jobs.js
 git commit -m "feat: add 评分记录 page with async score job list and auto-polling"
 ```
 ---
 ## Task 4: 全量回归测试 + Dify 说明注释
 **Files:**
 - Modify: `webapp/static/js/score_jobs.js` (minor: add Dify curl comment at top)
 - [ ] **Step 1: Run full test suite**
 ```
 python -m pytest tests/ -v --tb=short -q 2>&1 | tail -15
 ```
 Pre-existing failures to ignore:
 - `test_normalize_sample_pdf_offline_smoke_row`
 - `test_evaluator_and_reporting_write_run_assets`
 - `test_question_generator_rejects_invalid_json`
 - `test_question_generator_rejects_non_list_samples`
 Any other failure is a regression — fix before proceeding.
 - [ ] **Step 2: Run targeted tests**
 ```
 python -m pytest tests/webapp/test_score_jobs_api.py tests/webapp/test_score_api.py tests/test_pipeline.py -v --tb=short
 ```
 Expected: all PASS
 - [ ] **Step 3: Final commit**
 ```
 git add .
 git commit -m "feat: async score jobs complete — POST /api/score/async + 评分记录 page
 - ScoreJobManager: thread pool + JSON persistence (outputs/score-jobs/)
 - POST /api/score/async: 202 immediate response with job_id
 - GET /api/score/jobs + GET /api/score/jobs/{id}: query endpoints
 - Frontend: 评分记录 nav page with 5s auto-polling for pending jobs
 - Dify integration: change /api/score → /api/score/async, remove response parsing
 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>"
 ```
--- a/docs/superpowers/specs/2026-06-15-siemens-scenario-design.md
+++ b/docs/superpowers/specs/2026-06-15-siemens-scenario-design.md
@@ -0,0 +1,59 @@
 # Siemens PDF 场景设计 Spec
 - 日期：2026-06-15
 - 状态：已确认，进入实现。
 ## 1. 目标
 基于 `datasets/siemens-pdfs/`（17 个西门子医疗 CT 中文 PDF），跑通完整三步流水线：
 ```
 dataset_build（PDF→题库）→ offline smoke 评估 → online 评估
 ```
 完全镜像现有 `sample-pdf-*` 模式（方案 A），不改动任何现有文件。
 ## 2. 参数决策
 | 项目 | 值 |
 |---|---|
 | 输入 PDF | `datasets/siemens-pdfs/*.pdf`（17 个） |
 | failure_mode | `skip`（单个文档解析失败不中断整批） |
 | max_questions_per_document | 10（共 ~170 题） |
 | max_source_chunks_per_question | 3 |
 | generation model | `.env` 的 `DATASET_GENERATOR_MODEL`（qwen3.6-plus） |
 | judge model | `.env` 的 `RAGAS_JUDGE_MODEL`（deepseek-v4-flash） |
 | embedding model | `.env` 的 `RAGAS_EMBEDDING_MODEL`（text-embedding-v3） |
 | online answer model | `.env` 的 `RAGAS_JUDGE_MODEL` |
 | metrics | faithfulness / answer_relevancy / context_recall / context_precision |
 ## 3. 新增文件（5 个）
 ```
 scenarios/siemens_build/siemens-pdf-build.yaml
 scenarios/offline/siemens-pdf-offline-smoke.yaml
 scenarios/online/siemens-pdf-question-bank-online.yaml
 apps/siemens_pdf_qa/__init__.py
 apps/siemens_pdf_qa/adapter.py
 ```
 加上辅助脚本：
 ```
 scripts/build_siemens_offline_smoke.py   ← 从 build 产物生成 offline smoke CSV
 ```
 ## 4. 运行顺序
 ```
 # 步骤 1：dataset build（PDF → 题库草稿 + source_chunks.jsonl）
 python main.py --dataset-build-config scenarios/siemens_build/siemens-pdf-build.yaml
 # 步骤 2：生成 offline smoke 数据集（一次性脚本，build 跑完后执行）
 python scripts/build_siemens_offline_smoke.py
 # 步骤 3：offline 评估（用 source chunks 作为 contexts，ground_truth 作为 answer）
 python main.py --scenario scenarios/offline/siemens-pdf-offline-smoke.yaml
 # 步骤 4：online 评估（实时调用 LLM 生成 answer，再评分）
 python main.py --scenario scenarios/online/siemens-pdf-question-bank-online.yaml
 ```
--- a/docs/superpowers/specs/2026-06-16-optimization-advisor-design.md
+++ b/docs/superpowers/specs/2026-06-16-optimization-advisor-design.md
@@ -0,0 +1,225 @@
 # 优化顾问模块设计 Spec
 - 日期：2026-06-16
 - 状态：已确认，进入实现。
 ## 1. 目标
 在现有 RAG 评测流程结束后，新增一个**优化顾问模块**（Optimization Advisor），根据本次评测的多项指标分数与低分样本，自动诊断指标偏低的原因并给出针对性的优化建议，输出为中文 Markdown 报告 + 日志摘要。
 对应架构设计 §11（优化策略）：将"指标到动作的映射"（§11.2）从文档形式落地为代码自动执行。
 ---
 ## 2. 决策摘要
 | 决策点 | 选择 |
 |---|---|
 | 输出形式 | `optimization_advice.md`（文件）+ 控制台/日志摘要（双输出） |
 | 生成机制 | 规则引擎定位异常指标 → LLM 结合低分样本二次解读（两层） |
 | 触发方式 | YAML 场景文件显式声明 `optimization_advisor: true`，默认关闭 |
 | LLM 实例 | 复用 `build_models()` 已创建的 `llm` 实例，不重建 client |
 | 包位置 | `rag_eval/advisor/`（独立包，对外暴露 `run_advisor()` 单一入口） |
 ---
 ## 3. 架构
 ### 3.1 执行链路
 ```
 run_scenario()
  → load_scenario()           # 读 YAML，解析 optimization_advisor 字段
  → build_models()            # 已有：创建 llm, embeddings
  → build_metric_pipeline()   # 已有
  → Evaluator.evaluate()      # 已有：打分 → EvaluationResult
  → write_run_artifacts()     # 已有：scores.csv / summary.md / ...
  → run_advisor(              # 新增（3 行）
        result, scenario, llm, artifact_paths
    )
      → rules.diagnose(score_rows)           # 规则引擎：返回 Diagnosis 列表
      → llm_analyzer.analyze(diags, samples) # LLM：生成中文 Markdown 建议
      → writer.write(advice, paths)          # 写文件 + 打日志
 ```
 ### 3.2 新增文件
 ```
 rag_eval/advisor/
  __init__.py          ← 暴露 run_advisor()，外部唯一入口
  rules.py             ← 纯函数规则引擎，无 LLM，可单独单测
  llm_analyzer.py      ← 接收 llm 实例 + 诊断结构 → 中文 Markdown
  writer.py            ← 写 optimization_advice.md，打日志摘要
 ```
 ### 3.3 修改文件（最小改动）
 | 文件 | 改动 |
 |---|---|
 | `rag_eval/shared/models.py` | `Scenario` 加 `optimization_advisor: bool = False` 字段 |
 | `rag_eval/config/schema.py` | `ScenarioModel` 加同名字段 + 透传到 `Scenario` |
 | `rag_eval/config/loader.py` | 透传 `optimization_advisor` 到 `Scenario` 构造 |
 | `rag_eval/reporting/artifacts.py` | `RunArtifactPaths` 加 `advice_md: Path` 字段 + `build_artifact_paths()` 加赋值 |
 | `rag_eval/execution/runner.py` | `run_scenario()` 末尾：`build_models` 返回 llm 传入，条件调用 `run_advisor()` |
 ### 3.4 输出产物
 ```
 outputs/online/siemens-pdf-question-bank/<run_id>/
  scenario.snapshot.yaml
  scores.csv
  invalid.csv
  summary.md
  metadata.json
  optimization_advice.md    ← 新增（optimization_advisor: true 时生成）
 ```
 ---
 ## 4. 规则引擎（rules.py）
 ### 4.1 数据结构
 ```python
@dataclass
 class Diagnosis:
    metric: str           # 指标名
    mean_score: float     # 本次均值
    threshold: float      # 警戒阈值
    severity: str         # "warning" | "critical"
    root_causes: list[str]  # 可能原因（来自架构设计 §11.2）
    suggested_actions: list[str]  # 对应可调阶段
    low_samples: list[dict]  # 分数最低的 N 条样本（含 question/answer/ground_truth）
 ```
 ### 4.2 七条指标诊断规则
 阈值参考 RAG 评测最佳实践，分 warning / critical 两档：
 | 指标 | warning | critical | 根因方向 | 对应优化阶段（§11.2） |
 |---|---|---|---|---|
 | `faithfulness` | < 0.7 | < 0.5 | 生成未严格基于检索片段 / 幻觉 | 生成 prompt grounding、开启校验 |
 | `answer_relevancy` | < 0.7 | < 0.5 | 回答偏离问题 / 格式冗余 | 查询改写、生成 prompt 格式 |
 | `context_recall` | < 0.7 | < 0.5 | 检索遗漏关键信息 | 多查询、问题分解、Step-back、加大过召回 |
 | `context_precision` | < 0.6 | < 0.4 | 检索引入过多噪声 / 排序差 | 后检索重排、压缩、相关性过滤 |
 | `noise_sensitivity` | > 0.3 | > 0.5 | 回答被噪声片段干扰（越低越好） | 后检索相关性过滤、重排 |
 | `factual_correctness` | < 0.6 | < 0.4 | 回答事实与标准答案偏差大 | 检索与生成综合优化 |
 | `semantic_similarity` | < 0.7 | < 0.5 | 回答语义与标准答案差距大 | 生成 prompt、检索质量 |
 > 注：`noise_sensitivity` 越低越好（0=完全不受噪声影响），其阈值方向与其余相反。
 ### 4.3 低分样本选取
 每个触发诊断的指标，取该指标分数最低的 **top-3** 样本（排除 NaN）附入 `Diagnosis.low_samples`，字段包含 `sample_id / question / answer / ground_truth / <metric_score>`。
 ---
 ## 5. LLM 分析器（llm_analyzer.py）
 ### 5.1 输入
 - `diagnoses: list[Diagnosis]` — 规则引擎输出（仅触发阈值的指标）
 - `llm` — 已有 RAGAS LLM 实例（scenario 的 judge_model）
 - `scenario_name: str` — 用于报告标题
 ### 5.2 Prompt 设计
 使用**一次 LLM 调用**，把所有触发诊断的指标和低分样本一起发送：
 ```
 你是一个 RAG 系统优化专家，正在分析西门子医疗 CT 文档问答系统的评测结果。
 请用中文撰写一份优化建议报告，格式为 Markdown。
 ## 评测诊断摘要
 {for each diagnosis: 指标名、均值、阈值、可能原因、建议动作}
 ## 低分样本示例
 {for each diagnosis: top-3 低分样本的 question / answer / ground_truth}
 ## 要求
 1. 按指标分节（## 指标名），先解释"为什么低"，再给出"具体怎么改"
 2. "具体怎么改"要结合低分样本的具体内容，而不只是泛泛建议
 3. 最后写一节 ## 优先优化次序，按性价比排序（参考：不增加调用次数的优先）
 4. 语言简洁，面向工程师，不要废话
 ```
 ### 5.3 输出
 LLM 返回的 Markdown 字符串，直接写入 `optimization_advice.md`（在报告头部追加运行元信息）。
 ### 5.4 失败降级
 LLM 调用失败（超时/异常）时：降级为**纯规则报告**（只输出规则引擎的诊断结构，不含 LLM 解读），文件照常写出，错误信息写入报告末尾，不阻断整个评测流程。
 ---
 ## 6. 写出层（writer.py）
 ### 6.1 文件写出
 `optimization_advice.md` 结构：
 ```markdown
 # 优化建议报告 — <scenario_name>
 - run_id: `<run_id>`
 - 生成时间: `<timestamp>`
 - judge_model: `<model>`
 ---
 <LLM 生成的 Markdown 正文>
 ```
 ### 6.2 日志摘要
 `run_advisor()` 完成后向 `logger.info` 打印一条精简摘要（单行，适合 `run_eval.bat` 结束后一眼扫到）：
 ```
 [advisor] 触发诊断 3 项: faithfulness(0.42, critical) context_recall(0.58, warning) noise_sensitivity(0.41, critical)
 [advisor] 优化建议已写出: outputs/online/.../optimization_advice.md
 ```
 ---
 ## 7. YAML 配置
 场景文件新增一个顶层字段：
 ```yaml
 optimization_advisor: true   # 默认 false；true 时评测结束后自动生成优化建议
 ```
 后续若需精细配置（阈值覆盖、top-N 低分样本数），可扩展为：
 ```yaml
 optimization_advisor:
  enabled: true
  top_low_samples: 3          # 每个指标取几条低分样本（默认 3）
  # thresholds:               # 可选：覆盖默认阈值
  #   faithfulness: 0.65
 ```
 本轮实现仅支持 `optimization_advisor: true/false`，扩展接口预留但不实现。
 ---
 ## 8. 测试策略
 | 测试 | 文件 | 说明 |
 |---|---|---|
 | 规则引擎单测 | `tests/test_advisor_rules.py` | 纯函数，无 LLM，覆盖每条规则的 warning/critical 触发、NaN 跳过、low_samples 选取 |
 | writer 单测 | `tests/test_advisor_writer.py` | mock Diagnosis 列表，验证 md 文件写出格式和日志输出 |
 | 集成（可选） | 现有 `tests/test_online_eval.py` | 验证 `optimization_advisor: true` 场景下 advice_md 存在 |
 LLM 分析器不写单测（依赖网络），由集成场景覆盖。
 ---
 ## 9. 不覆盖（本轮边界）
 - 不支持跨版本对比分析（只分析本次 run）
 - 不支持批量场景聚合建议
 - 不建设 Web UI 展示
 - LLM 分析器 prompt 本轮不做多语言适配（直接中文）
 - advisor 阈值本轮硬编码在 `rules.py`，不从 YAML 读取
--- a/docs/superpowers/specs/2026-06-18-metric-doc-weights-design.md
+++ b/docs/superpowers/specs/2026-06-18-metric-doc-weights-design.md
@@ -0,0 +1,240 @@
 # 指标权重 & 文档片段权重功能设计
 **日期**: 2026-06-18  
 **状态**: 已批准，待实现  
 **范围**: 在「新建评估」运行评估时，支持为 RAGAS 指标和文档配置权重，计算加权综合得分并在报告中展示。
 ---
 ## 1. 目标
 1. **指标权重（Metric Weights）**：允许为每个 RAGAS 指标配置浮点权重（如 faithfulness: 0.35），计算每道题的加权综合得分 `weighted_score`。
 2. **文档权重（Doc Weights）**：允许为特定 PDF 文档名称配置权重（如 `"322_双源CT.pdf": 2.0`），该文档的题目在汇总指标均值时按权重放大贡献。
 3. **前端覆盖**：在「新建评估」页面选中场景后，展示可编辑的权重面板，运行前可临时覆盖 YAML 中的权重。
 4. **完全向后兼容**：两个字段均为可选，省略时退化为等权行为，现有场景 YAML 无需修改。
 ---
 ## 2. 数据模型
 ### 2.1 场景 YAML（新增可选字段）
 ```yaml
 # 可选。缺省时所有指标权重 = 1.0
 metric_weights:
  faithfulness: 0.35
  context_recall: 0.25
  context_precision: 0.20
  answer_relevancy: 0.20
 # 可选。缺省时所有文档权重 = 1.0
 doc_weights:
  "322_双源CT成像技术.pdf": 2.0
  "323_单源CT对比.pdf": 1.5
 ```
 ### 2.2 Pydantic Schema（`rag_eval/config/schema.py`）
 `ScenarioModel` 新增：
 ```python
 metric_weights: dict[str, float] = Field(default_factory=dict)
 doc_weights:    dict[str, float] = Field(default_factory=dict)
 ```
 `ConfigDict(extra="ignore")` 不变，新字段不影响既有 YAML 的加载。
 ### 2.3 内部 Scenario dataclass（`rag_eval/shared/models.py`）
 `Scenario` 新增：
 ```python
 metric_weights: dict[str, float] = field(default_factory=dict)
 doc_weights:    dict[str, float] = field(default_factory=dict)
 ```
 随 `scenario.snapshot()` 序列化，供 `run_reader` / 报告层读取。
 ---
 ## 3. 后端：权重计算逻辑
 ### 3.1 新模块 `rag_eval/metrics/weights.py`
 纯函数模块，无外部依赖，独立可测：
 ```python
 def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float:
    """返回 key 对应的权重，缺失时返回 default。"""
 def compute_weighted_score(
    scores: dict[str, float | None],
    metric_weights: dict[str, float],
 ) -> float | None:
    """
    给定各指标得分和权重，返回加权综合得分。
    - 忽略 NaN / None 值
    - metric_weights 为空时退化为等权均值
    - 全部 NaN 时返回 None
    公式: Σ(w_i * s_i) / Σ(w_i)，只对非 NaN 项求和
    """
 def weighted_metric_means(
    score_rows: list[dict],
    metrics: list[str],
    doc_weights: dict[str, float],
 ) -> dict[str, float | None]:
    """
    对每个指标计算文档加权均值。
    - sample_weight = doc_weights.get(row["doc_name"], 1.0)
    - 公式: Σ(sample_weight_j * score_m_j) / Σ(sample_weight_j)
    - doc_weights 为空时退化为普通算术均值
    """
 ```
 ### 3.2 评估器（`rag_eval/execution/evaluator.py`）
 `_merge_score()` 新增两列：
 ```python
 record["weighted_score"] = compute_weighted_score(
    score.metrics, self.scenario.metric_weights
 )
 record["sample_weight"] = self.scenario.doc_weights.get(
    sample.metadata.get("doc_name", ""), 1.0
 )
 ```
 `scores.csv` 新增 `weighted_score`、`sample_weight` 两列。
 ### 3.3 报告摘要（`rag_eval/reporting/summary.py`）
 `build_summary_markdown()` 改用 `weighted_metric_means()` 计算各指标均值；
 新增 `weighted_score` 整体均值行：
 ```
 ## Metric Means（加权）
 - faithfulness:     0.8123  (w=0.35)
 - context_recall:   0.7654  (w=0.25)
 - context_precision: 0.7200  (w=0.20)
 - answer_relevancy: 0.7400  (w=0.20)
 - **weighted_score: 0.7789**
 ```
 ---
 ## 4. yaml_patcher 扩展（`webapp/services/yaml_patcher.py`）
 `apply_profiles_to_scenario()` 扩展签名，新增可选参数：
 ```python
 def apply_profiles_to_scenario(
    scenario_path: str,
    judge_profile: LLMProfile | None,
    answer_profile: LLMProfile | None,
    dataset_profile: LLMProfile | None,
    metric_weights: dict[str, float] | None = None,   # 新增
    doc_weights: dict[str, float] | None = None,       # 新增
    _resolve_absolute: bool = False,
 ) -> list[str]:
 ```
 - `metric_weights` 非 None 时写入 `data["metric_weights"]`，追加 `"metric_weights"` 到 patched 列表
 - `doc_weights` 非 None 时写入 `data["doc_weights"]`，追加 `"doc_weights"` 到 patched 列表
 ---
 ## 5. Webapp 模型与 API 扩展
 ### 5.1 `webapp/models.py`
 `ProfileApplyRequest` 新增：
 ```python
 metric_weights: dict[str, float] | None = None
 doc_weights:    dict[str, float] | None = None
 ```
 `ProfileApplyResponse` 结构不变：新写入的 `metric_weights` / `doc_weights` 会作为字段名出现在已有的 `patched_fields` 列表中。
 ### 5.2 `webapp/api/llm_profiles.py` — `apply_profiles()`
 透传 `metric_weights` / `doc_weights` 给 `apply_profiles_to_scenario()`。
 ---
 ## 6. 前端：权重配置面板
 ### 6.1 HTML（`index.html`）
 在 `#llm-assignment-panel` 下方新增 `#weight-config-panel`（选中场景后显示）：
 ```
 ┌─────────────────────────────────────────────┐
 │ 权重配置  （可选，留空使用场景原始配置）         │
 ├─────────────────────────────────────────────┤
 │ 指标权重                                     │
 │  faithfulness        [____1.0____]           │
 │  context_recall      [____1.0____]           │
 │  ...（根据选中场景的 metrics 动态生成）         │
 │                                              │
 │ 文档权重（doc_weights）                       │
 │  [doc名称_______________] [权重__] [＋] [✕]  │
 │  [doc名称_______________] [权重__] [＋] [✕]  │
 │  ＋ 添加文档权重规则                          │
 └─────────────────────────────────────────────┘
 ```
 ### 6.2 `runner.js`
 - `renderScenarioItem()` 选中后调用 `Runner._renderWeightPanel(sc)` 动态生成指标行
 - `_applyProfilesIfNeeded()` 同时读取权重输入，追加到 `apply` 请求 body
 - `Runner._collectWeights()` 收集 metric_weights / doc_weights，全部为 1.0 时不发送（跳过）
 ### 6.3 CSS（`app.css`）
 新增 `.weight-config-panel`、`.weight-row`、`.weight-input` 样式，与现有 `.llm-role-row` 风格一致。
 ---
 ## 7. 报告展示（`webapp/services/report_builder.py`）
 - `RunSummary.metric_means` 改用 `weighted_metric_means()` 计算（需从 `scenario.snapshot.yaml` 读取 `doc_weights` / `metric_weights`）
 - `RunSummary` 新增 `weighted_score_mean: float | None` 字段
 - 前端 `report.js` 的指标卡片区新增「综合加权得分」卡片，使用 `good/warn/bad` 配色
 ---
 ## 8. 测试计划
 | 测试文件 | 覆盖内容 |
 |----------|---------|
 | `tests/test_weights.py` | `compute_weighted_score` / `weighted_metric_means` 纯函数，含 NaN 边界、空权重、全 NaN |
 | `tests/test_dataset_build.py` | 无改动（隔离良好） |
 | `tests/test_offline_eval.py` | `_merge_score` 新增 weighted_score / sample_weight 列断言 |
 | `tests/webapp/test_llm_profiles_api.py` | `apply_profiles` 带 metric_weights / doc_weights 的 patching 测试 |
 ---
 ## 9. 改动文件清单
 | 文件 | 改动类型 |
 |------|---------|
 | `rag_eval/config/schema.py` | 新增字段 |
 | `rag_eval/shared/models.py` | 新增字段 |
 | `rag_eval/config/loader.py` | 透传新字段到 Scenario |
 | `rag_eval/metrics/weights.py` | **新建** |
 | `rag_eval/execution/evaluator.py` | `_merge_score` 新增两列 |
 | `rag_eval/reporting/summary.py` | 改用加权均值 |
 | `webapp/services/yaml_patcher.py` | 新增 metric_weights / doc_weights 参数 |
 | `webapp/models.py` | ProfileApplyRequest 新增字段；RunSummary 新增 weighted_score_mean |
 | `webapp/api/llm_profiles.py` | 透传新参数 |
 | `webapp/services/report_builder.py` | 加权均值计算 |
 | `webapp/static/index.html` | 新增权重配置面板 |
 | `webapp/static/js/runner.js` | 权重面板逻辑 |
 | `webapp/static/css/app.css` | 新增权重面板样式 |
 | `tests/test_weights.py` | **新建** |
 ---
 ## 10. 向后兼容保证
 - `metric_weights: {}` + `doc_weights: {}` → 所有权重 = 1.0，行为与当前完全一致
 - 现有场景 YAML 不含这两个字段 → Pydantic `default_factory=dict` 填充空字典
 - `scores.csv` 新增两列不影响现有报告读取（`run_reader` 只读已知列）
--- a/docs/superpowers/specs/2026-06-22-dify-score-api-design.md
+++ b/docs/superpowers/specs/2026-06-22-dify-score-api-design.md
@@ -0,0 +1,138 @@
 # Dify 集成 — 单题实时评分 API 设计
 **日期**: 2026-06-22  
 **状态**: 已批准，待实现  
 **范围**: 在现有 FastAPI 服务中新增 `POST /api/score` 端点，供 Dify 外部 Tool 调用，实现单条问答记录的实时 RAGAS 指标评分。
 ---
 ## 1. 目标
 让 Dify Agent 能在回答完问题后，将 `(question, answer, contexts, ground_truth)` 发给 siemens_ragas 服务，实时获取各 RAGAS 指标得分，用于质量监控或 Agent 自我改进。
 ---
 ## 2. API 规范
 ### `POST /api/score`
 **请求体：**
 ```json
 {
  "question":          "双源CT的时间分辨率是多少?",
  "answer":            "双源CT的单扇区时间分辨率为75ms。",
  "contexts":          "片段1：双源CT采用两套管-探测器系统... |||| 片段2：单扇区采集旋转135度...",
  "ground_truth":      "双源CT单扇区时间分辨率为75ms，需旋转135度。",
  "context_separator": " |||| ",
  "metrics":           ["faithfulness", "answer_relevancy"],
  "judge_model":       "deepseek-v4-flash",
  "embedding_model":   "text-embedding-v3"
 }
 ```
 **字段说明：**
 | 字段 | 类型 | 必填 | 说明 |
 |------|------|------|------|
 | `question` | str | ✅ | 问题文本 |
 | `answer` | str | ✅ | 待评分的回答 |
 | `contexts` | str | ✅ | 检索到的上下文，多段用 `context_separator` 拼接 |
 | `ground_truth` | str | ❌ | 标准答案；缺失时跳过依赖它的指标（context_recall、factual_correctness、semantic_similarity、noise_sensitivity），详见 §4 |
 | `context_separator` | str | ❌ | 默认 `" \|\|\|\| "`（四个竖线，两侧各一空格） |
 | `metrics` | list[str] | ❌ | 默认 `["faithfulness", "answer_relevancy", "context_recall", "context_precision"]` |
 | `judge_model` | str | ❌ | 默认读 `.env` 中 `RAGAS_JUDGE_MODEL` |
 | `embedding_model` | str | ❌ | 默认读 `.env` 中 `RAGAS_EMBEDDING_MODEL` |
 **响应体（200 OK）：**
 ```json
 {
  "scores": {
    "faithfulness":     0.8750,
    "answer_relevancy": 0.9200
  },
  "weighted_score": 0.8975,
  "latency_ms": 3420
 }
 ```
 **错误响应：**
 | 状态码 | 场景 |
 |--------|------|
 | 400 | 必填字段缺失、metrics 名称不合法 |
 | 401 | 配置了 `SCORE_API_TOKEN` 但请求未携带有效 Bearer Token |
 | 422 | 请求体 JSON 格式错误（Pydantic 校验） |
 | 500 | RAGAS 内部评分异常，附带 error 字段 |
 **鉴权（可选）：**  
 若 `.env` 中 `SCORE_API_TOKEN` 非空，则要求请求头携带 `Authorization: Bearer <token>`。为空则不鉴权（内网部署场景）。
 ---
 ## 3. 架构与文件改动
 ### 新文件
 | 文件 | 职责 |
 |------|------|
 | `webapp/api/score.py` | 路由定义，请求验证，调用 InlineScorer |
 | `webapp/services/inline_scorer.py` | LLM 客户端缓存 + RAGAS 评分逻辑封装 |
 ### 修改文件
 | 文件 | 改动 |
 |------|------|
 | `webapp/models.py` | 新增 `ScoreRequest`、`ScoreResponse` |
 | `webapp/server.py` | 注册 `score.router`，更新 `openapi_tags` |
 | `rag_eval/settings.py` | 新增 `score_api_token: str | None` 字段 |
 ---
 ## 4. `inline_scorer.py` 设计
 ```python
 class InlineScorer:
    """同步执行 RAGAS 单题评分，内部缓存 LLM 客户端。"""
    def score(
        self,
        question: str,
        answer: str,
        contexts: list[str],
        ground_truth: str | None,
        metrics: list[str],
        judge_model: str,
        embedding_model: str,
        settings: EvaluationSettings,
    ) -> dict[str, float | None]:
        """返回 {metric_name: score} 字典，NaN 记为 None。"""
 ```
 **客户端缓存策略：**  
 以 `(judge_model, embedding_model)` 为 key，缓存 `(llm, embeddings)` 对象，避免每次请求都重建 AsyncOpenAI 连接。缓存为模块级单例（`_scorer_cache: dict`），线程安全（加 `threading.Lock`）。
 **评分执行：**  
 复用 `build_metric_pipeline` 构建 `MetricPipeline`，然后 `asyncio.run(pipeline.score_sample(sample))` 执行。与现有 `evaluator.py` 模式一致。
 **ground_truth 为空时的指标跳过逻辑：**  
 `context_recall`、`factual_correctness`、`semantic_similarity`、`noise_sensitivity` 需要 ground_truth；若请求中未提供，自动从 metrics 列表中移除这些指标，并在响应中对应字段返回 `null`。
 ---
 ## 5. Dify 侧配置方法
 1. 在 Dify 「工具」→「自定义工具」中创建新工具
 2. 填写 OpenAPI Schema（与 `/api/score` 端点对齐）
 3. 鉴权方式：API Key（Bearer）或无鉴权
 4. 在 Agent / Workflow 节点中引用该工具，将 `question`、`answer`、`contexts` 变量映射到工具输入
 ---
 ## 6. 不在范围内
 - 批量评分接口（异步 job）
 - Dify Workflow 节点插件（需要 Dify 插件开发框架）
 - 评分结果持久化到 scores.csv
 - 与现有 report_builder 集成展示
--- a/docs/superpowers/specs/2026-06-22-linux-deploy-design.md
+++ b/docs/superpowers/specs/2026-06-22-linux-deploy-design.md
@@ -0,0 +1,173 @@
 # Linux 一键部署脚本设计
 **日期**: 2026-06-22  
 **状态**: 已批准，待实现  
 **范围**: 为 siemens_ragas 项目提供 Linux 环境的部署与运维脚本（无 Docker，无 systemd）。
 ---
 ## 1. 目标
 提供四个 Bash 脚本，覆盖 Linux 服务器上的完整生命周期：
 | 脚本 | 职责 |
 |------|------|
 | `deploy.sh` | 一键完成环境检查、依赖安装、配置初始化、启动服务 |
 | `start.sh` | 仅启动 Web 服务（已部署后复用，不重装依赖） |
 | `stop.sh` | 停止后台 Web 服务 |
 | `run_eval.sh` | 运行单次评估（对应 Windows 的 `run_eval.ps1`） |
 ---
 ## 2. 约束与假设
 - Linux 目标环境有 PyPI 网络访问（pip 可直接安装）
 - 代码已通过 `git clone` 或文件拷贝到服务器
 - 使用 `pip + venv`（不使用 uv）
 - Web 服务监听 `0.0.0.0:8800`（内网可达）
 - 后台运行使用 `nohup`，PID 写入 `.server.pid`，日志追加到 `logs/server.log`
 - 所有脚本均放在仓库根目录，路径相对于 `$SCRIPT_DIR`
 ---
 ## 3. `deploy.sh` 详细设计
 ### 3.1 阶段 1：Python 版本检查
 ```
 require Python >= 3.12
 ```
 - `python3 --version` 解析 major.minor
 - 不满足则打印错误并 `exit 1`
 - 满足则打印 `[OK] Python X.Y.Z`
 ### 3.2 阶段 2：虚拟环境
 - 目标路径：`$SCRIPT_DIR/.venv`
 - 已存在则跳过创建（打印 `[OK] .venv already exists`）
 - 不存在则 `python3 -m venv .venv`
 ### 3.3 阶段 3：依赖安装
 ```bash
 .venv/bin/pip install --upgrade pip -q
 .venv/bin/pip install -e . -q          # 安装 pyproject.toml 中的依赖
 .venv/bin/pip install fastapi uvicorn httpx -q  # Web 服务额外依赖
 ```
 - 失败则打印错误并 `exit 1`
 - `fastapi`、`uvicorn`、`httpx` 在 `pyproject.toml` 中未列，需单独安装
 ### 3.4 阶段 4：配置文件
 - 若 `.env` 不存在：`cp .env.example .env`，打印警告提示用户编辑后再启动
 - 若 `.env` 已存在：跳过，打印 `[OK] .env found`
 ### 3.5 阶段 5：目录初始化
 创建以下目录（`mkdir -p`，幂等）：
 - `configs/` — LLM Profile 持久化存储
 - `logs/` — 评估日志 + 服务器日志
 - `outputs/` — 评估运行产物
 - `datasets/` — 原始数据集
 ### 3.6 阶段 6：Demo 数据
 - 检查 `outputs/kba-knowledge-base-offline-baseline/` 是否存在
 - 不存在则运行 `.venv/bin/python scripts/seed_sample_run.py`
 - 失败时打印 `[WARN]`（非致命，报告页为空但服务可启动）
 ### 3.7 阶段 7：端口检测
 - 默认端口 `8800`
 - 用 `ss -tlnp` 或 `netstat -tlnp` 检查是否占用
 - 占用则尝试 `8801`，仍占用则报错退出
 ### 3.8 阶段 8：启动服务
 ```bash
 nohup .venv/bin/python webmain.py \
    --host 0.0.0.0 \
    --port $PORT \
    >> logs/server.log 2>&1 &
 echo $! > .server.pid
 ```
 - 等待 2 秒后用 `kill -0 $PID` 检测进程是否存活
 - 存活则打印 URL 和 stop 方法
 - 未存活则打印 `[ERROR] Server failed to start. Check logs/server.log.` 并 `exit 1`
 ---
 ## 4. `start.sh` 详细设计
 单独负责启动，不做任何环境初始化。
 ```bash
 #!/usr/bin/env bash
 # 检查 .venv 存在
 # 端口检测（同 deploy.sh 逻辑）
 # 检查 .env 存在（不存在则 warn 但不阻止）
 # nohup 启动 + PID 文件 + 存活验证
 # 打印 URL
 ```
 ---
 ## 5. `stop.sh` 详细设计
 ```bash
 #!/usr/bin/env bash
 # 读取 .server.pid
 # 若文件不存在：打印 "No server PID file found." 退出
 # kill $PID
 # 等待 2 秒，若进程仍存活用 kill -9
 # 删除 .server.pid
 # 打印 "Server stopped."
 ```
 ---
 ## 6. `run_eval.sh` 详细设计
 对应 Windows 的 `run_eval.ps1`。
 ```
 用法:
  ./run_eval.sh                          # online eval (默认)
  ./run_eval.sh offline                  # offline smoke
  ./run_eval.sh scenarios/xxx.yaml       # 自定义场景
  ./run_eval.sh online DEBUG             # 自定义日志级别
 ```
 - 参数 1（Scenario）：`online` / `offline` / 文件路径，默认 `online`
 - 参数 2（LogLevel）：`DEBUG` / `INFO` / `WARNING` / `ERROR`，默认 `INFO`
 - 场景别名映射：
  - `online` → `scenarios/online/siemens-pdf-question-bank-online.yaml`
  - `offline` → `scenarios/offline/siemens-pdf-offline-smoke.yaml`
 - 时间戳日志文件：`logs/eval_$(date +%Y-%m-%d_%H%M%S).log`
 - 环境变量：`PYTHONIOENCODING=utf-8 PYTHONPATH=.`
 - 调用：`.venv/bin/python main.py --scenario $SCENARIO --log-file $LOG_FILE --log-level $LOG_LEVEL`
 - 非零退出码时打印错误并 `exit 1`
 ---
 ## 7. 通用约定
 - 所有脚本首行：`#!/usr/bin/env bash`
 - `set -euo pipefail` — 错误立即退出，未定义变量报错，管道错误传播
 - `SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"` — 从任意目录执行均正确
 - `cd "$SCRIPT_DIR"` — 切换到仓库根目录
 - 颜色输出：绿色 `[OK]`、黄色 `[WARN]`、红色 `[ERROR]`（检测 tty，非交互式终端降级为无色）
 - 执行权限：脚本自身需要 `chmod +x`（在 deploy.sh 内对其他脚本自动 chmod）
 ---
 ## 8. 不在范围内
 - Docker / docker-compose 支持
 - systemd service 配置
 - Nginx 反向代理配置
 - SSL/TLS 配置
 - 离线/内网镜像源配置
--- a/docs/superpowers/specs/2026-06-24-async-score-jobs-design.md
+++ b/docs/superpowers/specs/2026-06-24-async-score-jobs-design.md
@@ -0,0 +1,116 @@
 # 异步评分记录功能设计
 **日期**: 2026-06-24  
 **状态**: 已批准，待实现  
 **范围**: 新增 `POST /api/score/async` 异步评分端点，评分结果持久化到磁盘，前端新增「评分记录」页面展示。
 ---
 ## 1. 目标
 - Dify 工作流调用 `/api/score/async` 立即返回 `job_id`（202），不等待评分完成
 - 后台异步执行 RAGAS 评分，结果写入 `outputs/score-jobs/<job_id>.json`
 - RAGAS 平台新增「评分记录」导航页，列表展示所有评分记录及状态
 ---
 ## 2. 架构
 ```
 Dify → POST /api/score/async → 202 {job_id, status:"queued"}
                                      ↓
                              ScoreJobManager (线程池)
                                      ↓
                              InlineScorer.score()
                                      ↓
                        outputs/score-jobs/<job_id>.json
                                      ↓
              GET /api/score/jobs ← 前端「评分记录」页轮询
 ```
 ---
 ## 3. 存储格式
 `outputs/score-jobs/<job_id>.json`:
 ```json
 {
  "job_id": "abc123def456",
  "status": "completed",
  "created_at": "2026-06-24T09:00:00+00:00",
  "finished_at": "2026-06-24T09:00:15+00:00",
  "request": {
    "question": "双源CT的时间分辨率是多少?",
    "answer": "双源CT的单扇区时间分辨率为75ms。",
    "contexts": null,
    "ground_truth": null,
    "metrics": ["answer_relevancy"],
    "judge_model": "gpt-5",
    "embedding_model": "text-embedding-3-small"
  },
  "scores": {"answer_relevancy": 0.9075},
  "weighted_score": 0.9075,
  "latency_ms": 12500,
  "skipped_metrics": [],
  "error": null
 }
 ```
 ---
 ## 4. API 端点
 ### `POST /api/score/async`
 请求体与 `POST /api/score` 完全相同（`ScoreRequest`）。
 ```json
 // 立即返回 202
 {"job_id": "abc123def456", "status": "queued"}
 ```
 ### `GET /api/score/jobs`
 返回所有评分记录，按创建时间倒序：
 ```json
 {"jobs": [{...ScoreJobStatus...}]}
 ```
 ### `GET /api/score/jobs/{job_id}`
 返回单条评分记录详情。
 ---
 ## 5. 新增文件
 | 文件 | 职责 |
 |------|------|
 | `webapp/services/score_job_manager.py` | ScoreJobManager：线程池 + JSON 持久化 |
 | `webapp/api/score_jobs.py` | 3 个端点路由 |
 | `webapp/static/js/score_jobs.js` | 前端列表逻辑 + 轮询 |
 ## 6. 修改文件
 | 文件 | 改动 |
 |------|------|
 | `webapp/models.py` | 新增 `AsyncScoreJobStatus`、`AsyncScoreJobResponse` |
 | `webapp/server.py` | 注册 score_jobs router，更新 OPENAPI_TAGS |
 | `webapp/static/index.html` | 新增导航项 + section |
 ---
 ## 7. 前端「评分记录」页
 列表列：时间 / 问题摘要（前40字）/ 指标 / 得分 / 状态
 - 进入页面自动刷新
 - `queued/running` 记录每 5 秒轮询 `GET /api/score/jobs/{id}` 更新状态
 - 得分按 scoreClass（good/warn/bad）着色
 ---
 ## 8. Dify 改造
 只改 HTTP 节点 URL：`/api/score` → `/api/score/async`，删除解析响应的代码节点。
--- a/logs/online_eval.log
+++ b/logs/online_eval.log
@@ -0,0 +1 @@
 Completed run: C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\outputs\online\siemens-pdf-question-bank
--- a/logs/server_2026-06-23.log
+++ b/logs/server_2026-06-23.log
@@ -0,0 +1,24 @@
 2026-06-23 13:55:00  INFO      webapp.server  Starting RAGAS Console  host=127.0.0.1  port=8800  log_level=info  log_file=C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\logs\server_2026-06-23.log
 2026-06-23 13:55:14  INFO      uvicorn.error  Started server process [83868]
 2026-06-23 13:55:14  INFO      uvicorn.error  Waiting for application startup.
 2026-06-23 13:55:14  INFO      uvicorn.error  Application startup complete.
 2026-06-23 13:55:14  INFO      uvicorn.error  Uvicorn running on http://127.0.0.1:8800 (Press CTRL+C to quit)
 2026-06-23 13:59:47  INFO      uvicorn.access  127.0.0.1:53487 - "GET / HTTP/1.1" 200
 2026-06-23 13:59:47  INFO      uvicorn.access  127.0.0.1:53487 - "GET /static/css/app.css HTTP/1.1" 200
 2026-06-23 13:59:47  INFO      uvicorn.access  127.0.0.1:50321 - "GET /static/js/api.js HTTP/1.1" 200
 2026-06-23 13:59:47  INFO      uvicorn.access  127.0.0.1:51325 - "GET /static/js/profiles.js HTTP/1.1" 200
 2026-06-23 13:59:47  INFO      uvicorn.access  127.0.0.1:59869 - "GET /static/js/report.js HTTP/1.1" 200
 2026-06-23 13:59:48  INFO      uvicorn.access  127.0.0.1:50980 - "GET /static/js/runner.js HTTP/1.1" 200
 2026-06-23 13:59:48  INFO      uvicorn.access  127.0.0.1:63223 - "GET /static/js/app.js HTTP/1.1" 200
 2026-06-23 13:59:48  INFO      webapp.access  GET /docs → 200  (0ms)
 2026-06-23 13:59:48  INFO      uvicorn.access  127.0.0.1:63223 - "GET /docs HTTP/1.1" 200
 2026-06-23 13:59:48  INFO      webapp.access  GET /api/health → 200  (0ms)
 2026-06-23 13:59:48  INFO      uvicorn.access  127.0.0.1:50321 - "GET /api/health HTTP/1.1" 200
 2026-06-23 13:59:49  INFO      webapp.api.runs  [get_runs] found 19 runs
 2026-06-23 13:59:49  INFO      webapp.access  GET /api/runs → 200  (1094ms)
 2026-06-23 13:59:49  INFO      uvicorn.access  127.0.0.1:63223 - "GET /api/runs HTTP/1.1" 200
 2026-06-23 13:59:49  INFO      webapp.access  GET /openapi.json → 200  (94ms)
 2026-06-23 13:59:49  INFO      uvicorn.access  127.0.0.1:63223 - "GET /openapi.json HTTP/1.1" 200
 2026-06-23 13:59:50  INFO      webapp.api.llm_profiles  [list_profiles] count=6
 2026-06-23 13:59:50  INFO      webapp.access  GET /api/llm-profiles → 200  (0ms)
 2026-06-23 13:59:50  INFO      uvicorn.access  127.0.0.1:63223 - "GET /api/llm-profiles HTTP/1.1" 200
--- a/logs/siemens_build.log
+++ b/logs/siemens_build.log
@@ -0,0 +1,35 @@
  [info] generating questions for: 315_1_Flash????????.pdf
  [info] 315_1_Flash????????.pdf: 6 questions generated (total so far: 6)
  [info] generating questions for: 316_2_Flash??????_??.pdf
  [info] 316_2_Flash??????_??.pdf: 10 questions generated (total so far: 16)
  [info] generating questions for: 317_3_Flash??????_??.pdf
  [info] 317_3_Flash??????_??.pdf: 9 questions generated (total so far: 25)
  [info] generating questions for: 318_4_Flash??????_???.pdf
  [info] 318_4_Flash??????_???.pdf: 9 questions generated (total so far: 34)
  [info] generating questions for: 319_5_Flash??????_?????.pdf
  [info] 319_5_Flash??????_?????.pdf: 10 questions generated (total so far: 44)
  [info] generating questions for: 320_6_Flash??????_??.pdf
  [info] 320_6_Flash??????_??.pdf: 8 questions generated (total so far: 52)
  [info] generating questions for: 321_??CT???????????--??.pdf
  [info] 321_??CT???????????--??.pdf: 5 questions generated (total so far: 57)
  [info] generating questions for: 322_??CT???????????--??????????.pdf
  [info] 322_??CT???????????--??????????.pdf: 8 questions generated (total so far: 65)
  [info] generating questions for: 323_??CT???????????--?????????.pdf
  [info] 323_??CT???????????--?????????.pdf: 5 questions generated (total so far: 70)
  [info] generating questions for: 324_??CT???????????--????????.pdf
  [info] 324_??CT???????????--????????.pdf: 8 questions generated (total so far: 78)
  [info] generating questions for: 325_??CT???????????--???????.pdf
  [info] 325_??CT???????????--???????.pdf: 8 questions generated (total so far: 86)
  [info] generating questions for: 326_??CT???????????--4D????.pdf
  [info] 326_??CT???????????--4D????.pdf: 7 questions generated (total so far: 93)
  [info] generating questions for: 327_??CT???????????--??????.pdf
  [info] 327_??CT???????????--??????.pdf: 8 questions generated (total so far: 101)
  [info] generating questions for: 749_????01_???????????.pdf
  [info] 749_????01_???????????.pdf: 8 questions generated (total so far: 109)
  [info] generating questions for: 804_????02-????????CT?????X-Map??.pdf
  [info] 804_????02-????????CT?????X-Map??.pdf: 8 questions generated (total so far: 117)
  [info] generating questions for: 805_????03_????????????????.pdf
  [info] 805_????03_????????????????.pdf: 6 questions generated (total so far: 123)
  [info] generating questions for: 807_???CT???????_SJ-L10.2??1-5.pdf
  [info] 807_???CT???????_SJ-L10.2??1-5.pdf: 9 questions generated (total so far: 132)
 Completed dataset build: C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\outputs\dataset-builds\siemens-pdf-question-bank\2026-06-15T09-28-35.302231+00-00
--- a/main.py
+++ b/main.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 import argparse
 import logging
 from pathlib import Path
 from rag_eval.dataset_builder.runner import run_dataset_build
 from rag_eval.execution.runner import run_scenario
@@ -18,18 +20,33 @@ def parse_args() -> argparse.Namespace:
        "--dataset-build-config",
        help="Path to a YAML dataset build config file.",
    )
    parser.add_argument(
        "--log-file",
        default=None,
        help="Write evaluation logs to this file (in addition to stderr). "
             "Example: logs/eval.log",
    )
    parser.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging verbosity level (default: INFO). Use DEBUG for per-metric detail.",
    )
    return parser.parse_args()
 def main() -> None:
    """Dispatch the CLI call to the requested workflow."""
    args = parse_args()
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    log_file = Path(args.log_file) if args.log_file else None
    if args.dataset_build_config:
        result = run_dataset_build(args.dataset_build_config)
        print(f"Completed dataset build: {result.artifact_paths.root_dir}")
        return
-    result = run_scenario(args.scenario)
+    result = run_scenario(args.scenario, log_file=log_file, log_level=log_level)
    print(f"Completed run: {result.scenario.output_dir}")
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,3 +17,8 @@ dependencies = [
    "pydantic-settings>=2.14.1",
    "ragas==0.4.3",
 ]
 [tool.setuptools.packages.find]
 # 只打包源码目录，排除运行时产生的数据目录
 include = ["rag_eval*", "apps*", "webapp*"]
 exclude = ["logs*", "outputs*", "datasets*", "configs*", "scenarios*", "scripts*", "tests*"]
--- a/rag_eval/advisor/init.py
+++ b/rag_eval/advisor/init.py
@@ -0,0 +1,67 @@
 """Optimization advisor: rule-based diagnosis + LLM-powered recommendations."""
 from __future__ import annotations
 import asyncio
 import logging
 from typing import Any
 from rag_eval.reporting.artifacts import build_artifact_paths
 from rag_eval.shared.models import EvaluationResult, Scenario
 from .llm_analyzer import analyze
 from .rules import Diagnosis, diagnose
 from .writer import write_advice
 logger = logging.getLogger("rag_eval.advisor")
 __all__ = ["run_advisor", "Diagnosis", "diagnose"]
 def run_advisor(
    result: EvaluationResult,
    scenario: Scenario,
    llm: Any,
 ) -> None:
    """Produce the optimization-advice report for a finished evaluation run.

    Returns immediately when ``scenario.optimization_advisor`` is falsy.
    Otherwise the rule engine is run over ``result.score_rows``, the LLM is
    asked for an interpretation when at least one rule fired, and the report
    is handed to ``write_advice()``. Any exception raised inside that pipeline
    is caught and logged as a warning so the evaluation result is unaffected.

    Args:
        result: Completed evaluation result; ``run_id`` and ``score_rows``
            are read from it.
        scenario: Resolved scenario; ``output_dir``, ``metrics``,
            ``scenario_name`` and ``judge_model`` are read from it.
        llm: LLM object forwarded unchanged to ``analyze()``.
    """
    advisor_enabled = scenario.optimization_advisor
    if not advisor_enabled:
        return

    logger.info("[advisor] starting optimization analysis  scenario=%s", scenario.scenario_name)
    try:
        paths = build_artifact_paths(scenario.output_dir, result.run_id)
        advice_target = paths.advice_md
        if advice_target is None:
            logger.warning("[advisor] advice_md path not set in RunArtifactPaths — skipping")
            return

        findings = diagnose(result.score_rows, scenario.metrics)
        logger.info("[advisor] rule diagnosis complete: %d metric(s) triggered", len(findings))

        # Only spend an LLM call when the rule engine flagged something.
        report_body = (
            asyncio.run(analyze(findings, llm, scenario.scenario_name))
            if findings
            else ""
        )

        write_advice(
            diagnoses=findings,
            llm_markdown=report_body,
            advice_path=advice_target,
            scenario_name=scenario.scenario_name,
            run_id=result.run_id,
            judge_model=scenario.judge_model,
        )
    except Exception as exc:
        # Best-effort boundary: the advisor must never break the evaluation run.
        logger.warning(
            "[advisor] advisor failed (%s: %s) — evaluation result is unaffected",
            type(exc).__name__, exc,
        )
--- a/rag_eval/advisor/llm_analyzer.py
+++ b/rag_eval/advisor/llm_analyzer.py
@@ -0,0 +1,109 @@
 """LLM-powered analysis of rule diagnostics and low-score samples."""
 from __future__ import annotations
 import logging
 from typing import Any
 from .rules import Diagnosis
 logger = logging.getLogger("rag_eval.advisor")
 # Prompt sent to the judge LLM by analyze(). It is a str.format() template with
 # two placeholders, {diagnosis_summary} and {low_sample_text}. The prompt text is
 # Chinese and asks the model to return only a Markdown report body.
 _PROMPT_TEMPLATE = """\
 你是一个 RAG 系统优化专家，正在分析西门子医疗 CT 文档问答系统的评测结果。
 请用中文撰写一份优化建议报告，格式为 Markdown。
 ## 评测诊断摘要
 {diagnosis_summary}
 ## 低分样本示例
 {low_sample_text}
 ## 报告要求
 1. 按指标分节（## 指标名  [严重程度]），先解释"为什么低"（结合低分样本具体分析），再给出"具体怎么改"
 2. 严重程度说明：critical=严重（<阈值50%），warning=警告（<阈值70%），low=待优化（低于0.85，有提升空间）
 3. "具体怎么改"要结合低分样本的实际内容，而不只是泛泛建议
 4. 最后写一节 **## 优先优化次序**，按性价比排序（不增加 LLM 调用次数的优化优先），critical 和 warning 项优先于 low 项
 5. 语言简洁，面向工程师，不要废话，不要重复列表内容
 只输出 Markdown 报告正文，不要任何前置说明。
 """
 # Chinese display labels for Diagnosis.severity values. _build_diagnosis_summary()
 # falls back to the raw severity string for any key missing from this mapping.
 _SEVERITY_LABEL_ZH: dict[str, str] = {
    "critical": "严重",
    "warning": "警告",
    "low": "待优化",
 }
 def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
    """Render the diagnoses as a Markdown bullet list for the LLM prompt.

    Each diagnosis yields three lines: a headline with the metric name, mean
    score (4 decimals), threshold and translated severity, followed by one
    sub-bullet for root causes and one for suggested actions. The
    ``noise_sensitivity`` headline carries a "lower is better" hint.

    Args:
        diagnoses: Diagnoses to render; may be empty.

    Returns:
        The lines joined with newlines, or "" when ``diagnoses`` is empty.
    """
    rendered: list[str] = []
    for diag in diagnoses:
        if diag.metric == "noise_sensitivity":
            hint = "（越低越好）"
        else:
            hint = ""
        # Unknown severities are shown verbatim rather than dropped.
        severity_zh = _SEVERITY_LABEL_ZH.get(diag.severity, diag.severity)
        headline = (
            f"- **{diag.metric}** {hint} 均值={diag.mean_score:.4f}，"
            f"阈值={diag.threshold}，严重程度={severity_zh}"
        )
        causes = "; ".join(diag.root_causes)
        actions = "; ".join(diag.suggested_actions)
        rendered.extend(
            (
                headline,
                f"  - 可能原因：{causes}",
                f"  - 建议动作：{actions}",
            )
        )
    return "\n".join(rendered)
 def _clip_text(value: Any, limit: int) -> str:
    """Return *value* as text cut to *limit* characters; ``None`` becomes ""."""
    if value is None:
        return ""
    # str() is a no-op for strings and keeps non-string cells from breaking slicing.
    return str(value)[:limit]


 def _build_low_sample_text(diagnoses: list[Diagnosis]) -> str:
    """Render the low-score samples of each diagnosis as Markdown for the prompt.

    Diagnoses without ``low_samples`` are skipped. For every remaining
    diagnosis a heading is emitted, followed by one entry per sample showing
    its score, question, answer (first 300 characters) and ground truth
    (first 200 characters).

    A sample whose ``answer`` or ``ground_truth`` is missing, ``None`` or not
    a string is rendered as text instead of raising ``TypeError``, so one
    incomplete row cannot abort the whole advisor run.

    Args:
        diagnoses: Diagnoses whose ``low_samples`` are dicts keyed by
            ``question`` / ``answer`` / ``ground_truth`` and the metric name.

    Returns:
        The rendered lines joined with newlines, or "" when nothing is rendered.
    """
    lines: list[str] = []
    for d in diagnoses:
        if not d.low_samples:
            continue
        lines.append(f"### {d.metric} 低分样本（最多 3 条）")
        for i, s in enumerate(d.low_samples, 1):
            score = s.get(d.metric, "N/A")
            lines.append(f"\n**样本 {i}**（分数={score}）")
            lines.append(f"- 问题：{s.get('question', '')}")
            lines.append(f"- 回答：{_clip_text(s.get('answer'), 300)}")
            lines.append(f"- 标准答案：{_clip_text(s.get('ground_truth'), 200)}")
    return "\n".join(lines)
 async def analyze(
    diagnoses: list[Diagnosis],
    llm: Any,
    scenario_name: str,
 ) -> str:
    """Ask the judge LLM for a Chinese Markdown optimization report.

    The diagnoses and their low-score samples are rendered into
    ``_PROMPT_TEMPLATE`` and sent as a single human message.

    Args:
        diagnoses: Diagnoses from ``rules.diagnose()``; an empty list
            short-circuits to "" without calling the LLM.
        llm: Wrapper object exposing ``langchain_llm``, a chat model with an
            async ``ainvoke()`` method.
        scenario_name: Used only for logging.

    Returns:
        The stripped text of the LLM response, or "" when there is nothing to
        analyze or when any step fails (prompt assembly, import, LLM call).
        An empty string lets the caller fall back to the rule-only report.
    """
    if not diagnoses:
        return ""
    try:
        # Prompt assembly is inside the try block on purpose: a malformed score
        # row must degrade to the rule-only report, not abort the advisor.
        prompt = _PROMPT_TEMPLATE.format(
            diagnosis_summary=_build_diagnosis_summary(diagnoses),
            low_sample_text=_build_low_sample_text(diagnoses),
        )
        logger.info("[advisor] calling LLM for optimization analysis  scenario=%s", scenario_name)
        # Imported lazily so a missing langchain_core also ends in the fallback.
        from langchain_core.messages import HumanMessage
        # Use the underlying langchain chat model directly (RAGAS LangchainLLMWrapper wraps BaseChatModel)
        response = await llm.langchain_llm.ainvoke([HumanMessage(content=prompt)])
        text = response.content.strip()
        logger.info("[advisor] LLM analysis complete  chars=%d", len(text))
        return text
    except Exception as exc:
        logger.warning(
            "[advisor] LLM analysis failed (%s: %s) — falling back to rule report",
            type(exc).__name__, exc,
        )
        return ""
--- a/rag_eval/advisor/rules.py
+++ b/rag_eval/advisor/rules.py
@@ -0,0 +1,243 @@
 """Rule-based diagnostic engine for RAG evaluation metric scores."""
 from __future__ import annotations
 import math
 from dataclasses import dataclass, field
 from typing import Any
@dataclass
 class MetricRule:
    """Threshold configuration and diagnostic text for one metric.

    diagnose() compares a metric's mean score against these thresholds; the
    comparison direction is controlled by ``higher_is_better``.
    """
    # Mean past this value (below it when higher_is_better, above it otherwise)
    # yields a "warning" diagnosis.
    warning_threshold: float
    # Mean past this value yields a "critical" diagnosis; checked before warning.
    critical_threshold: float
    higher_is_better: bool  # False for noise_sensitivity
    # Canned explanations copied into every Diagnosis for this metric.
    root_causes: list[str]
    # Canned remediation steps copied into every Diagnosis for this metric.
    suggested_actions: list[str]
    # Scores below this threshold trigger a "low" advisory (LLM suggestion requested).
    # Only applies to higher_is_better metrics; noise_sensitivity uses existing thresholds.
    advisory_threshold: float = 0.85
 # Diagnostic rules keyed by metric name. diagnose() looks each requested metric
 # up here and silently skips names that have no entry. Every rule except
 # noise_sensitivity is higher-is-better, and all of them rely on the default
 # advisory_threshold of MetricRule.
 METRIC_RULES: dict[str, MetricRule] = {
    "faithfulness": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "生成回答包含检索片段中不支持的陈述（幻觉）",
            "生成阶段未严格遵循 grounding 约束",
            "校验阶段未开启或未生效",
        ],
        suggested_actions=[
            "强化生成 prompt 的 grounding 约束（'只依据参考资料作答'）",
            "开启校验阶段（validation: by_scenario）",
            "检查低分样本中模型是否引用了片段外的知识",
        ],
    ),
    "answer_relevancy": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "回答偏离问题主旨或包含大量冗余内容",
            "查询改写后问题语义漂移",
            "生成 prompt 格式约束不足",
        ],
        suggested_actions=[
            "优化查询改写 prompt，确保改写后语义不偏移",
            "在生成 prompt 中加入'简洁准确、直接回答问题'的约束",
            "检查低分样本的回答是否存在格式冗余或话题偏移",
        ],
    ),
    "context_recall": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "检索未能召回标准答案所涉及的关键信息",
            "单一查询未能覆盖问题的多个角度",
            "过召回数量不足，关键片段被截断",
        ],
        suggested_actions=[
            "启用多查询扩展（use_multi_query）覆盖不同措辞",
            "对多跳问题启用问题分解（sub_questions）",
            "加大过召回宽度（recall_top_k）",
            "对颗粒度细的问题尝试 Step-back 双路检索",
        ],
    ),
    "context_precision": MetricRule(
        warning_threshold=0.6,
        critical_threshold=0.4,
        higher_is_better=True,
        root_causes=[
            "检索引入过多与问题无关的片段",
            "重排未能将相关片段排在前列",
            "缺少相关性过滤，噪声片段进入上下文",
        ],
        suggested_actions=[
            "启用或优化 listwise 重排，将相关片段排在前列",
            "启用上下文压缩（compression）过滤无关句子",
            "启用相关性过滤（relevance_filter）丢弃明确无关片段",
            "缩小 rerank_keep_k（如从 8 降到 5）",
        ],
    ),
    "noise_sensitivity": MetricRule(
        warning_threshold=0.3,   # higher is worse; trigger when mean > threshold
        critical_threshold=0.5,
        higher_is_better=False,
        root_causes=[
            "回答中包含检索到的噪声片段所引入的错误陈述",
            "相关性过滤未能拦截干扰性片段",
            "生成阶段对噪声片段未加区分地引用",
        ],
        suggested_actions=[
            "启用相关性过滤（relevance_filter）拦截噪声",
            "优化重排，将不相关片段排到截断点之后",
            "在生成 prompt 中强调'来源冲突时并列陈述，不擅自下定论'",
        ],
    ),
    "factual_correctness": MetricRule(
        warning_threshold=0.6,
        critical_threshold=0.4,
        higher_is_better=True,
        root_causes=[
            "回答的事实陈述与标准答案存在偏差",
            "检索未能命中标准答案所依据的关键片段",
            "生成阶段对多个来源综合时产生事实错误",
        ],
        suggested_actions=[
            "重点检查低分样本，确认是检索遗漏还是生成错误",
            "提升 context_recall 以确保关键信息被检索到",
            "对事实型问题将 temperature 降至 0",
        ],
    ),
    "semantic_similarity": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "回答语义与标准答案差距较大",
            "回答过于简短或过于冗长，语义偏移",
            "检索到的片段质量不足，导致生成内容偏离",
        ],
        suggested_actions=[
            "检查低分样本的回答与标准答案的表述差异",
            "优化生成 prompt 使回答更贴近标准表述风格",
            "提升检索质量（context_recall / context_precision）",
        ],
    ),
 }
@dataclass
 class Diagnosis:
    """Diagnostic result for one metric that triggered a threshold.

    Instances are produced by diagnose(), one per metric whose mean score
    crossed one of its MetricRule thresholds.
    """
    metric: str
    mean_score: float         # NaN-ignoring mean, rounded to 4 decimals by diagnose()
    threshold: float          # the triggered threshold
    severity: str             # "critical" | "warning" | "low"
    root_causes: list[str] = field(default_factory=list)
    suggested_actions: list[str] = field(default_factory=list)
    # Worst-scoring rows for this metric, trimmed to a few identifying keys.
    low_samples: list[dict[str, Any]] = field(default_factory=list)
 def _mean_ignoring_nan(values: list[float]) -> float | None:
    valid = [v for v in values if not math.isnan(v)]
    if not valid:
        return None
    return sum(valid) / len(valid)
 def _select_low_samples(
    rows: list[dict[str, Any]],
    metric: str,
    top_n: int,
    higher_is_better: bool,
 ) -> list[dict[str, Any]]:
    """Return the top_n worst-scoring rows for a metric, excluding NaN."""
    valid = [r for r in rows if metric in r and not math.isnan(float(r[metric]))]
    sorted_rows = sorted(valid, key=lambda r: float(r[metric]), reverse=not higher_is_better)
    worst = sorted_rows[:top_n]
    keep_keys = {"sample_id", "question", "answer", "ground_truth", metric}
    return [{k: v for k, v in row.items() if k in keep_keys} for row in worst]
 def _grade_mean(rule: MetricRule, mean: float) -> tuple[str, float] | None:
    """Return the (severity, threshold) pair triggered by *mean*, or None."""
    if rule.higher_is_better:
        # Most severe tier first; a tier triggers when the mean falls below it.
        # The "low" tier marks an acceptable score that still warrants advice.
        tiers = (
            ("critical", rule.critical_threshold),
            ("warning", rule.warning_threshold),
            ("low", rule.advisory_threshold),
        )
        for severity, threshold in tiers:
            if mean < threshold:
                return severity, threshold
        return None
    # Lower is better (noise_sensitivity): two tiers, triggered from above.
    tiers = (
        ("critical", rule.critical_threshold),
        ("warning", rule.warning_threshold),
    )
    for severity, threshold in tiers:
        if mean > threshold:
            return severity, threshold
    return None
 def diagnose(
    score_rows: list[dict[str, Any]],
    metrics: list[str],
    top_low_samples: int = 3,
 ) -> list[Diagnosis]:
    """Build a Diagnosis for every metric whose mean score crosses a threshold.

    Args:
        score_rows: Per-sample score dicts.
        metrics: Metric names to inspect; names without a METRIC_RULES entry
            are skipped.
        top_low_samples: Number of worst-scoring rows attached to each Diagnosis.

    Returns:
        One Diagnosis per triggered metric, in the order of *metrics*. Empty
        when nothing triggered.
    """
    findings: list[Diagnosis] = []
    for metric in metrics:
        rule = METRIC_RULES.get(metric)
        if rule is None:
            continue  # no rule registered for this metric

        # Collect every cell that converts to float; None and junk are dropped.
        numeric: list[float] = []
        for row in score_rows:
            cell = row.get(metric)
            if cell is None:
                continue
            try:
                numeric.append(float(cell))
            except (TypeError, ValueError):
                pass

        mean = _mean_ignoring_nan(numeric) if numeric else None
        if mean is None:
            continue

        verdict = _grade_mean(rule, mean)
        if verdict is None:
            continue  # mean is within the healthy range
        severity, threshold = verdict

        worst_rows = _select_low_samples(score_rows, metric, top_low_samples, rule.higher_is_better)
        findings.append(
            Diagnosis(
                metric=metric,
                mean_score=round(mean, 4),
                threshold=threshold,
                severity=severity,
                root_causes=list(rule.root_causes),
                suggested_actions=list(rule.suggested_actions),
                low_samples=worst_rows,
            )
        )
    return findings
--- a/rag_eval/advisor/writer.py
+++ b/rag_eval/advisor/writer.py
@@ -0,0 +1,93 @@
 """Write optimization advice to markdown file and emit log summary."""
 from __future__ import annotations
 import logging
 from pathlib import Path
 from .rules import Diagnosis
 logger = logging.getLogger("rag_eval.advisor")
 # Chinese display labels for each severity tier.
 _SEVERITY_LABEL: dict[str, str] = {
    "critical": "严重",
    "warning": "警告",
    "low": "待优化",
 }
 def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
    """Return a single-line log summary of triggered diagnoses."""
    if not diagnoses:
        return "[advisor] 所有指标正常，无需优化建议。"
    parts = [
        f"{d.metric}({d.mean_score:.2f},{_SEVERITY_LABEL.get(d.severity, d.severity)})"
        for d in diagnoses
    ]
    triggered = " ".join(parts)
    return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered}  →  {advice_path}"
 def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
    """Build a rules-only report when LLM analysis is unavailable."""
    if not diagnoses:
        return ""
    lines = ["## 规则诊断（LLM 分析不可用）\n"]
    for d in diagnoses:
        label = _SEVERITY_LABEL.get(d.severity, d.severity)
        lines.append(f"### {d.metric}  [{label}]  均值={d.mean_score:.4f}")
        lines.append("\n**可能原因：**")
        for cause in d.root_causes:
            lines.append(f"- {cause}")
        lines.append("\n**建议动作：**")
        for action in d.suggested_actions:
            lines.append(f"- {action}")
        lines.append("")
    return "\n".join(lines)
 def write_advice(
    diagnoses: list[Diagnosis],
    llm_markdown: str,
    advice_path: Path,
    scenario_name: str,
    run_id: str,
    judge_model: str,
 ) -> None:
    """Write the optimization advice Markdown file and log a summary.

    Args:
        diagnoses: Diagnosis objects to report; an empty list produces an
            "all clear" body.
        llm_markdown: Report body from the LLM. When empty, the rules-only
            fallback report is written instead.
        advice_path: Destination file; missing parent directories are created.
        scenario_name: Shown in the report title.
        run_id: Shown in the report header.
        judge_model: Shown in the report header.
    """
    advice_path.parent.mkdir(parents=True, exist_ok=True)
    from rag_eval.shared.utils import utc_now_iso

    # The trailing "" makes the joined header end with a newline after "---".
    header = "\n".join(
        (
            f"# 优化建议报告 — {scenario_name}",
            "",
            f"- run_id: `{run_id}`",
            f"- 生成时间: `{utc_now_iso()}`",
            f"- judge_model: `{judge_model}`",
            "",
            "---",
            "",
        )
    )
    if diagnoses:
        # Prefer the LLM text; fall back to the canned rule report when it is empty.
        body = llm_markdown if llm_markdown else _build_fallback_report(diagnoses)
    else:
        body = "## ✅ 未发现明显指标异常\n\n所有指标均在正常范围内，当前 RAG 链路表现良好。\n"
    advice_path.write_text(header + body, encoding="utf-8")
    logger.info(_format_log_summary(diagnoses, advice_path))
    logger.info("[advisor] 优化建议已写出: %s", advice_path)
--- a/rag_eval/config/loader.py
+++ b/rag_eval/config/loader.py
@@ -61,6 +61,9 @@ def load_scenario(path: str | Path) -> Scenario:
            max_samples=model.runtime.max_samples,
        ),
        source_path=scenario_path,
        optimization_advisor=model.optimization_advisor,
        metric_weights=dict(model.metric_weights),
        doc_weights=dict(model.doc_weights),
    )
    # Run cross-field checks after all relative paths have been resolved.
    validate_scenario(scenario)
--- a/rag_eval/config/schema.py
+++ b/rag_eval/config/schema.py
@@ -54,6 +54,9 @@ class ScenarioModel(BaseModel):
    metrics: list[str]
    output_dir: str
    runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
    optimization_advisor: bool = False
    metric_weights: dict[str, float] = Field(default_factory=dict)
    doc_weights: dict[str, float] = Field(default_factory=dict)
    @field_validator("metrics")
    @classmethod
--- a/rag_eval/dataset_builder/generator/question_generator.py
+++ b/rag_eval/dataset_builder/generator/question_generator.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 import json
 import time
 from abc import ABC, abstractmethod
 from typing import Any
@@ -150,24 +151,39 @@ class OpenAIQuestionGenerator(QuestionGenerator):
        max_questions: int,
        max_chunks_per_question: int,
        job_name: str,
        max_retries: int = 3,
        retry_delay: float = 5.0,
    ) -> list[DraftQuestionSample]:
-        """Generate draft questions for one parsed document."""
+        """Generate draft questions for one parsed document, with retry on timeout/server errors."""
        prompt = self._build_prompt(
            document,
            max_questions=max_questions,
            max_chunks_per_question=max_chunks_per_question,
        )
-        response = self.client.chat.completions.create(
+        last_exc: Exception | None = None
-            model=self.model,
+        for attempt in range(1, max_retries + 1):
-            messages=[
+            try:
-                {"role": "system", "content": "You generate structured draft question banks from source documents."},
+                response = self.client.chat.completions.create(
-                {"role": "user", "content": prompt},
+                    model=self.model,
-            ],
+                    messages=[
-            response_format={"type": "json_object"},
+                        {"role": "system", "content": "You generate structured draft question banks from source documents."},
-        )
+                        {"role": "user", "content": prompt},
-        content = response.choices[0].message.content or "{}"
+                    ],
-        payload = self._parse_response_payload(content)
+                    response_format={"type": "json_object"},
-        return [
+                )
-            self._build_sample(document=document, payload=item, index=index, job_name=job_name)
+                content = response.choices[0].message.content or "{}"
-            for index, item in enumerate(payload[:max_questions], start=1)
+                payload = self._parse_response_payload(content)
-        ]
+                return [
                    self._build_sample(document=document, payload=item, index=index, job_name=job_name)
                    for index, item in enumerate(payload[:max_questions], start=1)
                ]
            except Exception as exc:
                last_exc = exc
                if attempt < max_retries:
                    wait = retry_delay * attempt
                    doc_name_safe = document.doc_name.encode("ascii", "replace").decode("ascii")
                    print(f"  [warn] generate attempt {attempt}/{max_retries} failed for {doc_name_safe!r}: {exc}. Retrying in {wait:.0f}s...")
                    time.sleep(wait)
        raise RuntimeError(
            f"Question generation failed for {document.doc_name!r} after {max_retries} attempts"
        ) from last_exc
--- a/rag_eval/dataset_builder/runner.py
+++ b/rag_eval/dataset_builder/runner.py
@@ -111,12 +111,32 @@ def run_dataset_build(
            continue
        documents.append(document)
-        generated = generator.generate(
+        doc_name_safe = pdf_path.name.encode("ascii", "replace").decode("ascii")
-            document,
+        print(f"  [info] generating questions for: {doc_name_safe}")
-            max_questions=job.max_questions_per_document,
+        try:
-            max_chunks_per_question=job.max_source_chunks_per_question,
+            generated = generator.generate(
-            job_name=job.job_name,
+                document,
-        )
+                max_questions=job.max_questions_per_document,
                max_chunks_per_question=job.max_source_chunks_per_question,
                job_name=job.job_name,
            )
        except Exception as exc:
            gen_failure = ParseFailure(file_path=pdf_path.as_posix(), error=f"generation failed: {exc}")
            failures.append(gen_failure)
            print(f"  [warn] skipping {doc_name_safe} after generation failure: {exc}")
            if job.failure_mode == "fail":
                result = DatasetBuildResult(
                    job=job,
                    run_id=run_id,
                    artifact_paths=artifact_paths,
                    documents=documents,
                    draft_samples=draft_samples,
                    parse_failures=failures,
                )
                write_dataset_build_artifacts(result)
                raise
            continue
        valid_generated = []
        for sample in generated:
            errors = validate_draft_sample(
@@ -126,9 +146,9 @@ def run_dataset_build(
            )
            if not errors:
                valid_generated.append(sample)
-        draft_samples.extend(
+        new_samples = dedupe_samples(valid_generated)[: job.max_questions_per_document]
-            dedupe_samples(valid_generated)[: job.max_questions_per_document]
+        draft_samples.extend(new_samples)
-        )
+        print(f"  [info] {doc_name_safe}: {len(new_samples)} questions generated (total so far: {len(draft_samples)})")
    result = DatasetBuildResult(
        job=job,
--- a/rag_eval/datasets/init.py
+++ b/rag_eval/datasets/init.py
@@ -0,0 +1 @@
 """Dataset loading and normalization for the RAG evaluation platform."""
--- a/rag_eval/datasets/loader.py
+++ b/rag_eval/datasets/loader.py
@@ -0,0 +1,56 @@
 """Load raw evaluation dataset records from disk.
 Supports CSV and JSONL formats. Returns a list of plain dicts — normalization
 into NormalizedSample is handled by normalizers.py.
 """
 from __future__ import annotations
 import csv
 import json
 from pathlib import Path
 from typing import Any
 def load_dataset_records(path: Path | str) -> list[dict[str, Any]]:
    """Load raw dataset rows from disk as plain dicts.

    Files ending in ``.jsonl`` or ``.ndjson`` (case-insensitive) are read as
    JSON Lines; every other extension, ``.csv`` included, is read as CSV.
    Values are returned untouched; parsing of list-like columns is left to
    the normalizers.

    Raises:
        FileNotFoundError: If *path* is not an existing regular file.
    """
    source = Path(path)
    if not source.is_file():
        raise FileNotFoundError(f"Dataset file not found: {source}")
    is_json_lines = source.suffix.lower() in (".jsonl", ".ndjson")
    # CSV is both the explicit choice for ".csv" and the fallback for the rest.
    reader = _load_jsonl if is_json_lines else _load_csv
    return reader(source)
 def _load_csv(path: Path) -> list[dict[str, Any]]:
    """Read a CSV file into a list of row dicts."""
    with path.open(encoding="utf-8", newline="") as fh:
        reader = csv.DictReader(fh)
        return [dict(row) for row in reader]
 def _load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSONL file into a list of record dicts."""
    records: list[dict[str, Any]] = []
    with path.open(encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, 1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc
            if not isinstance(obj, dict):
                raise ValueError(f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}")
            records.append(obj)
    return records
--- a/rag_eval/datasets/normalizers.py
+++ b/rag_eval/datasets/normalizers.py
@@ -0,0 +1,105 @@
 """Normalize raw dataset records into NormalizedSample and InvalidSample objects.
 Handles both offline mode (records already contain answer + contexts) and online
 mode (records only contain question + ground_truth; adapter fills the rest).
 """
 from __future__ import annotations
 import uuid
 from typing import Any
 from rag_eval.shared.models import InvalidSample, NormalizedSample
 from rag_eval.shared.utils import parse_contexts
 # Fields we always strip from the raw record before storing it in metadata.
 _CORE_FIELDS = {
    "sample_id",
    "question",
    "contexts",
    "answer",
    "ground_truth",
    "scenario",
    "language",
    "retrieval_config",
 }
 def _get_str(record: dict[str, Any], key: str, default: str = "") -> str:
    """Return a string field from the record, coercing None/NaN to the default."""
    value = record.get(key)
    if value is None:
        return default
    text = str(value).strip()
    return default if text.lower() == "nan" else text
 def normalize_records(
    records: list[dict[str, Any]],
    mode: str = "offline",
    max_samples: int | None = None,
 ) -> tuple[list[NormalizedSample], list[InvalidSample]]:
    """Split raw dicts into NormalizedSample and InvalidSample lists.

    A record without a question is always invalid. In "offline" mode a record
    must also carry ground_truth, answer and non-empty contexts; in any other
    mode those fields may be empty at this stage.

    Args:
        records: Raw row dicts.
        mode: "offline" enables the completeness checks described above.
        max_samples: When not None, only ``records[:max_samples]`` is processed.

    Returns:
        ``(valid, invalid)`` in input order. Records lacking a sample_id get a
        random 12-character hex id.
    """
    selected = records if max_samples is None else records[:max_samples]
    accepted: list[NormalizedSample] = []
    rejected: list[InvalidSample] = []

    for raw in selected:
        sample_id = _get_str(raw, "sample_id") or uuid.uuid4().hex[:12]
        question = _get_str(raw, "question")
        if not question:
            rejected.append(
                InvalidSample(
                    sample_id=sample_id,
                    error="missing required field: question",
                    raw=raw,
                )
            )
            continue

        ground_truth = _get_str(raw, "ground_truth")
        contexts = parse_contexts(raw.get("contexts"))
        answer = _get_str(raw, "answer")

        if mode == "offline":
            # Offline rows are scored as-is, so they must already be complete.
            checks = (
                (ground_truth, "missing ground_truth"),
                (answer, "missing answer"),
                (contexts, "missing or empty contexts"),
            )
            problems = [message for present, message in checks if not present]
            if problems:
                rejected.append(
                    InvalidSample(
                        sample_id=sample_id,
                        error="; ".join(problems),
                        raw=raw,
                    )
                )
                continue

        # Every non-core column is carried along as opaque metadata.
        extras = {name: cell for name, cell in raw.items() if name not in _CORE_FIELDS}
        accepted.append(
            NormalizedSample(
                sample_id=sample_id,
                question=question,
                contexts=contexts,
                answer=answer,
                ground_truth=ground_truth,
                scenario=_get_str(raw, "scenario"),
                language=_get_str(raw, "language"),
                retrieval_config=_get_str(raw, "retrieval_config"),
                metadata=extras,
                raw=raw,
            )
        )
    return accepted, rejected
--- a/rag_eval/execution/evaluator.py
+++ b/rag_eval/execution/evaluator.py
@@ -3,6 +3,8 @@
 from __future__ import annotations
 import asyncio
 import logging
 import time
 from typing import Any
 from rag_eval.adapters.base import AppAdapter
@@ -10,9 +12,12 @@ from rag_eval.datasets.loader import load_dataset_records
 from rag_eval.datasets.normalizers import normalize_records
 from rag_eval.execution.concurrency import gather_with_limit
 from rag_eval.metrics.pipeline import MetricPipeline
 from rag_eval.metrics.weights import compute_weighted_score, resolve_weight
 from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
 from rag_eval.shared.utils import utc_now_iso
 logger = logging.getLogger("rag_eval.execution.evaluator")
 class Evaluator:
    """Coordinate dataset loading, optional app execution, and metric scoring."""
@@ -31,27 +36,61 @@ class Evaluator:
    def evaluate(self) -> EvaluationResult:
        """Execute the full evaluation flow and return the collected results."""
        started_at = utc_now_iso()
        scenario_name = self.scenario.scenario_name
        mode = self.scenario.mode
        logger.info("=" * 60)
        logger.info("[eval] START  scenario=%s  mode=%s", scenario_name, mode)
        logger.info("[eval] dataset=%s", self.scenario.dataset.path)
        logger.info("[eval] metrics=%s", list(self.scenario.metrics))
        logger.info("[eval] judge=%s  embed=%s", self.scenario.judge_model, self.scenario.embedding_model)
        raw_records = load_dataset_records(self.scenario.dataset.path)
        logger.info("[eval] raw_records=%d", len(raw_records))
        samples, invalid_samples = normalize_records(
            raw_records,
            mode=self.scenario.mode,
            max_samples=self.scenario.runtime.max_samples,
        )
        logger.info("[eval] normalized: valid=%d  invalid=%d", len(samples), len(invalid_samples))
        if self.scenario.mode == "online":
-            # Online mode enriches each sample by calling the target application first.
+            logger.info("[eval] online mode: calling app adapter for %d samples ...", len(samples))
            t0 = time.monotonic()
            samples, online_invalids = asyncio.run(self._enrich_online_samples(samples))
            elapsed = time.monotonic() - t0
            invalid_samples.extend(online_invalids)
            logger.info(
                "[eval] adapter done: enriched=%d  adapter_invalids=%d  elapsed=%.1fs",
                len(samples), len(online_invalids), elapsed,
            )
        logger.info("[eval] scoring %d samples with metric pipeline ...", len(samples))
        t0 = time.monotonic()
        metric_scores = asyncio.run(
            self.metric_pipeline.score_samples(
                samples,
                max_concurrency=self.scenario.runtime.metric_limit(),
            )
        )
        elapsed = time.monotonic() - t0
        logger.info("[eval] metric scoring done  elapsed=%.1fs", elapsed)
        finished_at = utc_now_iso()
        score_rows = [self._merge_score(sample, score) for sample, score in zip(samples, metric_scores)]
        # Summary of NaN rates per metric
        import math
        for metric_name in self.scenario.metrics:
            nan_count = sum(1 for row in score_rows if math.isnan(float(row.get(metric_name, float("nan")) or float("nan"))))
            logger.info("[eval] %-22s  NaN=%d/%d (%.0f%%)",
                        metric_name, nan_count, len(score_rows),
                        100 * nan_count / len(score_rows) if score_rows else 0)
        run_id = finished_at.replace(":", "-")
        logger.info("[eval] DONE  run_id=%s  total_valid=%d  total_invalid=%d",
                    run_id, len(samples), len(invalid_samples))
        logger.info("=" * 60)
        return EvaluationResult(
            scenario=self.scenario,
            run_id=run_id,
@@ -72,13 +111,27 @@ class Evaluator:
        valid: list[NormalizedSample] = []
        invalid: list[InvalidSample] = []
        total = len(samples)
-        async def enrich_with_capture(sample: NormalizedSample) -> NormalizedSample | InvalidSample:
+        async def enrich_with_capture(idx: int, sample: NormalizedSample) -> NormalizedSample | InvalidSample:
            """Convert adapter exceptions into invalid samples instead of aborting the run."""
            sid = sample.sample_id[:12]
            logger.debug("[adapter] [%d/%d] calling adapter  sample=%s  question=%r",
                         idx + 1, total, sid, (sample.question or "")[:60])
            t0 = time.monotonic()
            try:
-                return await self.app_adapter.enrich_sample(sample)
+                result = await self.app_adapter.enrich_sample(sample)
                elapsed = time.monotonic() - t0
                ans_len = len(result.answer or "")
                ctx_count = len(result.contexts or [])
                logger.info("[adapter] [%d/%d] OK  sample=%-12s  ans_len=%d  ctx_count=%d  elapsed=%.1fs",
                            idx + 1, total, sid, ans_len, ctx_count, elapsed)
                return result
            except Exception as exc:
                elapsed = time.monotonic() - t0
                error_type = type(exc).__name__
                logger.warning("[adapter] [%d/%d] FAIL  sample=%-12s  %s: %s  (elapsed=%.1fs)",
                               idx + 1, total, sid, error_type, exc, elapsed)
                return InvalidSample(
                    sample_id=sample.sample_id,
                    error=f"adapter failed [{error_type}]: {exc}",
@@ -86,8 +139,8 @@ class Evaluator:
                )
        factories = [
-            (lambda sample=sample: enrich_with_capture(sample))
+            (lambda _idx=i, _sample=sample: enrich_with_capture(_idx, _sample))
-            for sample in samples
+            for i, sample in enumerate(samples)
        ]
        results = await gather_with_limit(factories, self.scenario.runtime.app_limit())
@@ -102,6 +155,8 @@ class Evaluator:
            if not sample.contexts:
                errors.append("adapter returned empty contexts")
            if errors:
                logger.warning("[adapter] incomplete payload  sample=%s  errors=%s",
                               sample.sample_id[:12], errors)
                invalid.append(
                    InvalidSample(
                        sample_id=sample.sample_id,
@@ -111,10 +166,13 @@ class Evaluator:
                )
                continue
            valid.append(sample)
        logger.info("[adapter] enrichment summary: valid=%d  invalid=%d  of total=%d",
                    len(valid), len(invalid), total)
        return valid, invalid
    def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
-        """Combine sample data, metric results, and run metadata into one output row."""
+        """Combine sample data, metric results, run metadata, and weight columns."""
        record = sample.to_record()
        record["contexts"] = sample.contexts
        record.update(score.metrics)
@@ -122,4 +180,12 @@ class Evaluator:
        record["judge_model"] = self.scenario.judge_model
        record["embedding_model"] = self.scenario.embedding_model
        record["run_id"] = self.scenario.scenario_name
        # 综合加权得分列（已暂时禁用）
        # record["weighted_score"] = compute_weighted_score(
        #     score.metrics, self.scenario.metric_weights
        # )
        # doc_name = str(sample.metadata.get("doc_name", "") or "")
        # record["sample_weight"] = resolve_weight(
        #     self.scenario.doc_weights, doc_name, default=1.0
        # )
        return record
--- a/rag_eval/execution/runner.py
+++ b/rag_eval/execution/runner.py
@@ -2,16 +2,42 @@
 from __future__ import annotations
 import logging
 import sys
 from pathlib import Path
 from rag_eval.adapters.http import HttpAppAdapter
 from rag_eval.adapters.python import PythonFunctionAdapter
 from rag_eval.advisor import run_advisor
 from rag_eval.config.loader import load_scenario
-from rag_eval.metrics.factory import build_metric_pipeline
+from rag_eval.metrics.factory import build_models, build_metric_pipeline
 from rag_eval.reporting.writers import write_run_artifacts
 from rag_eval.settings import EvaluationSettings
 from rag_eval.shared.models import Scenario
 from .evaluator import Evaluator
 logger = logging.getLogger("rag_eval.execution.runner")
 def _setup_logging(log_file: Path | None = None, level: int = logging.INFO) -> None:
    """Configure root logger: always write to stderr, optionally also to a file."""
    fmt = "%(asctime)s  %(levelname)-8s  %(name)s  %(message)s"
    datefmt = "%H:%M:%S"
    handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
    if log_file is not None:
        log_file.parent.mkdir(parents=True, exist_ok=True)
        fh = logging.FileHandler(log_file, encoding="utf-8")
        fh.setFormatter(logging.Formatter(fmt, datefmt=datefmt))
        handlers.append(fh)
    logging.basicConfig(level=level, format=fmt, datefmt=datefmt, handlers=handlers, force=True)
    # Also show ragas internal logs at WARNING so we can see LLM errors
    logging.getLogger("ragas").setLevel(logging.WARNING)
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("openai").setLevel(logging.WARNING)
 def build_adapter(scenario: Scenario):
    """Instantiate the adapter required by the resolved scenario, if any."""
@@ -27,16 +53,32 @@ def build_adapter(scenario: Scenario):
 def run_scenario(
    scenario_path: str,
    settings: EvaluationSettings | None = None,
    log_file: Path | None = None,
    log_level: int = logging.INFO,
 ):
    """Run one scenario end to end and persist its reporting artifacts."""
    _setup_logging(log_file=log_file, level=log_level)
    logger.info("[runner] run_scenario  path=%s", scenario_path)
    settings = settings or EvaluationSettings()
    if not settings.openai_api_key:
        raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")
    scenario = load_scenario(scenario_path)
    logger.info("[runner] scenario loaded: name=%s  mode=%s  max_samples=%s",
                scenario.scenario_name, scenario.mode, scenario.runtime.max_samples)
    # Build models once; reuse llm in both MetricPipeline and advisor.
    llm, embeddings = build_models(scenario.judge_model, scenario.embedding_model, settings)
    adapter = build_adapter(scenario)
-    pipeline = build_metric_pipeline(scenario, settings)
+    pipeline = build_metric_pipeline(scenario, settings, llm=llm, embeddings=embeddings)
    evaluator = Evaluator(scenario=scenario, metric_pipeline=pipeline, app_adapter=adapter)
    result = evaluator.evaluate()
    write_run_artifacts(result)
    logger.info("[runner] artifacts written for run_id=%s", result.run_id)
    # Optimization advisor — runs only if scenario.optimization_advisor is True.
    run_advisor(result, scenario, llm)
    return result
--- a/rag_eval/metrics/factory.py
+++ b/rag_eval/metrics/factory.py
@@ -18,20 +18,64 @@ from ragas.metrics.collections import (
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
    FactualCorrectness,
    Faithfulness,
    NoiseSensitivity,
    SemanticSimilarity,
 )
 from .pipeline import MetricPipeline
 def _resolve_openai_client_kwargs(
    judge_model: str,
    settings: EvaluationSettings,
 ) -> dict[str, Any]:
    """Return AsyncOpenAI kwargs, preferring a matching LLM Profile over .env settings.

    Lookup order:
      1. The first LLM Profile whose model name equals ``judge_model`` (exact match).
      2. ``settings.openai_client_kwargs`` (.env) when no profile matches or the
         profile lookup itself fails.

    Args:
        judge_model: Model name used to select a profile.
        settings: Evaluation settings supplying the fallback client kwargs.

    Returns:
        For a matched profile: ``api_key`` and ``timeout``, plus ``base_url`` when
        the profile defines a non-blank one. Otherwise
        ``settings.openai_client_kwargs`` unchanged.

    The lookup is best-effort and never raises. Failures are logged instead of
    being silently discarded so a broken profile store can be diagnosed.
    """
    import logging
    log = logging.getLogger(__name__)
    try:
        # Lazy import to avoid circular dependency (webapp -> rag_eval is one-way).
        from webapp.services.profile_manager import profile_manager
        for profile in profile_manager.list_all():
            if profile.model != judge_model:
                continue
            kwargs: dict[str, Any] = {
                "api_key": profile.api_key or "sk-placeholder",
                "timeout": float(profile.timeout_seconds or 30),
            }
            if profile.base_url and profile.base_url.strip():
                kwargs["base_url"] = profile.base_url.strip()
            return kwargs
    except ImportError as exc:
        # The webapp layer is optional here, so its absence is only worth a
        # debug-level note.
        log.debug("LLM profile manager unavailable, using .env settings: %s", exc)
    except Exception as exc:  # noqa: BLE001
        # Keep the best-effort fallback, but make the failure visible.
        log.warning(
            "LLM profile lookup failed for model %r, using .env settings: %s",
            judge_model,
            exc,
        )
    return settings.openai_client_kwargs
 def build_models(
    judge_model: str,
    embedding_model: str,
    settings: EvaluationSettings,
 ) -> tuple[Any, Any]:
-    """Create the LLM and embedding clients required by the selected RAGAS metrics."""
+    """Create the LLM and embedding clients required by the selected RAGAS metrics.
-    client = AsyncOpenAI(**settings.openai_client_kwargs)
+
-    llm = llm_factory(judge_model, client=client)
+    Dynamically resolves connection settings from the stored LLM Profiles first
    (matched by model name), falling back to .env settings when no profile matches.
    """
    client_kwargs = _resolve_openai_client_kwargs(judge_model, settings)
    client = AsyncOpenAI(**client_kwargs)
    # RAGAS structured-output judge calls can be truncated by the upstream default
    # 1024 completion budget, especially for faithfulness and GPT-5 family models.
    llm = llm_factory(
        judge_model,
        client=client,
        max_tokens=max(1, int(settings.ragas_llm_max_tokens)),
    )
    embeddings = embedding_factory(provider="openai", model=embedding_model, client=client)
    return llm, embeddings
@@ -39,19 +83,34 @@ def build_models(
 def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
    llm: Any | None = None,
    embeddings: Any | None = None,
 ) -> MetricPipeline:
-    """Build a metric pipeline containing only the metrics requested by the scenario."""
+    """Build a metric pipeline containing only the metrics requested by the scenario.
-    llm, embeddings = build_models(
+
-        scenario.judge_model,
+    If llm and embeddings are provided (pre-built by the caller), they are reused.
-        scenario.embedding_model,
+    Otherwise, new instances are created from scenario + settings.
-        settings,
+    """
-    )
+    if llm is None or embeddings is None:
        llm, embeddings = build_models(
            scenario.judge_model,
            scenario.embedding_model,
            settings,
        )
    # Build the full registry once, then slice it by configured metric names.
    registry: dict[str, Any] = {
        "faithfulness": Faithfulness(llm=llm),
        "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": ContextRecall(llm=llm),
        "context_precision": ContextPrecision(llm=llm),
        # Robustness / end-to-end metrics (架构设计 §10.2).
        # NoiseSensitivity mode='relevant': sensitivity to noise from relevant contexts.
        "noise_sensitivity": NoiseSensitivity(llm=llm),
        # FactualCorrectness mode='f1': balances claim precision and recall vs. ground truth.
        "factual_correctness": FactualCorrectness(llm=llm),
        # SemanticSimilarity: embedding cosine between answer and ground truth (no LLM call).
        "semantic_similarity": SemanticSimilarity(embeddings=embeddings),
    }
    return MetricPipeline(
        metrics={name: registry[name] for name in scenario.metrics},
--- a/rag_eval/metrics/pipeline.py
+++ b/rag_eval/metrics/pipeline.py
@@ -3,12 +3,16 @@
 from __future__ import annotations
 import asyncio
 import logging
 import math
 import time
 from dataclasses import dataclass
 from typing import Any
 from rag_eval.shared.models import MetricScore, NormalizedSample
 logger = logging.getLogger("rag_eval.metrics.pipeline")
@dataclass(slots=True)
 class MetricPipeline:
@@ -22,12 +26,43 @@ class MetricPipeline:
        results = {name: math.nan for name in self.metrics}
        errors: list[str] = []
        sid = sample.sample_id[:12]
        ans_len = len(sample.answer or "")
        ctx_count = len(sample.contexts or [])
        logger.debug(
            "[score] sample=%s  ans_len=%d  ctx_count=%d  question=%r",
            sid, ans_len, ctx_count,
            (sample.question or "")[:80],
        )
        for name, metric in self.metrics.items():
            t0 = time.monotonic()
            try:
                result = await self._run_metric(name, metric, sample)
-                results[name] = float(result.value)
+                score_val = float(result.value)
                results[name] = score_val
                elapsed = time.monotonic() - t0
                logger.info(
                    "[metric OK ] sample=%-12s  %-20s  score=%.4f  elapsed=%.1fs",
                    sid, name, score_val, elapsed,
                )
            except asyncio.TimeoutError:
                elapsed = time.monotonic() - t0
                msg = f"timeout after {self.metric_timeout_seconds}s"
                errors.append(f"{name}: {msg}")
                logger.warning(
                    "[metric TMO] sample=%-12s  %-20s  TIMEOUT after %.1fs",
                    sid, name, elapsed,
                )
            except Exception as exc:
                elapsed = time.monotonic() - t0
                exc_type = type(exc).__name__
                errors.append(f"{name}: {exc}")
                logger.warning(
                    "[metric ERR] sample=%-12s  %-20s  %s: %s  (elapsed=%.1fs)",
                    sid, name, exc_type, exc, elapsed,
                )
        return MetricScore(metrics=results, error=" | ".join(errors))
    async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
@@ -59,6 +94,23 @@ class MetricPipeline:
                reference=sample.ground_truth,
                retrieved_contexts=sample.contexts,
            )
        elif name == "noise_sensitivity":
            coroutine = metric.ascore(
                user_input=sample.question,
                response=sample.answer,
                reference=sample.ground_truth,
                retrieved_contexts=sample.contexts,
            )
        elif name == "factual_correctness":
            coroutine = metric.ascore(
                response=sample.answer,
                reference=sample.ground_truth,
            )
        elif name == "semantic_similarity":
            coroutine = metric.ascore(
                reference=sample.ground_truth,
                response=sample.answer,
            )
        else:
            raise ValueError(f"Unsupported metric: {name}")
@@ -72,11 +124,22 @@ class MetricPipeline:
        max_concurrency: int,
    ) -> list[MetricScore]:
        """Score all samples while respecting the configured concurrency limit."""
        total = len(samples)
        logger.info("[pipeline] scoring %d samples  concurrency=%d  timeout=%ss",
                    total, max_concurrency, self.metric_timeout_seconds)
        semaphore = asyncio.Semaphore(max(1, max_concurrency))
        completed = 0
-        async def guarded(sample: NormalizedSample) -> MetricScore:
+        async def guarded(idx: int, sample: NormalizedSample) -> MetricScore:
            """Throttle a single sample-scoring coroutine with the shared semaphore."""
            nonlocal completed
            async with semaphore:
-                return await self.score_sample(sample)
+                result = await self.score_sample(sample)
                completed += 1
                nan_metrics = [k for k, v in result.metrics.items() if math.isnan(v)]
                status = f"NaN={nan_metrics}" if nan_metrics else "all OK"
                logger.info("[pipeline] progress %d/%d  sample=%-12s  %s",
                            completed, total, sample.sample_id[:12], status)
                return result
-        return await asyncio.gather(*(guarded(sample) for sample in samples))
+        return await asyncio.gather(*(guarded(i, s) for i, s in enumerate(samples)))
--- a/rag_eval/metrics/registry.py
+++ b/rag_eval/metrics/registry.py
@@ -1,8 +1,13 @@
 """Supported metric names recognized by scenario validation and pipeline setup."""
 SUPPORTED_METRICS = {
    # Core retrieval / generation metrics (always available).
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
    # Robustness and end-to-end metrics (see the architecture design doc, §10.2).
    "noise_sensitivity",      # Robustness: sensitivity to retrieval noise
    "factual_correctness",    # End-to-end: factual correctness of the answer relative to the reference answer
    "semantic_similarity",    # End-to-end: semantic similarity between the answer and the reference answer (embedding-based, no LLM call)
 }
--- a/rag_eval/metrics/weights.py
+++ b/rag_eval/metrics/weights.py
@@ -0,0 +1,152 @@
 """Utility functions for weighted metric aggregation.
 All functions are pure (no side effects, no I/O) and operate on plain dicts/lists.
 Weights do not need to be pre-normalised — normalisation is done internally.
 """
 from __future__ import annotations
 import math
 def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float:
    """Look up *key* in *weights* and return it as a ``float``.

    Falls back to *default* when *key* is not present.
    """
    chosen = weights[key] if key in weights else default
    return float(chosen)
 def compute_weighted_score(
    scores: dict[str, float | None],
    metric_weights: dict[str, float],
 ) -> float | None:
    """Average the usable metric scores, weighting each by its metric weight.

    A score is usable when it is not None, converts to ``float``, and is finite
    (neither NaN nor infinite). Unusable scores are skipped entirely.

    Args:
        scores: mapping of metric_name -> raw score (may be NaN or None).
        metric_weights: per-metric weights; metrics without an entry weigh 1.0.

    Returns:
        The weighted mean, or None when the accumulated weight is zero
        (for example when no score is usable).
    """
    weighted_sum = 0.0
    weight_sum = 0.0
    for name, raw in scores.items():
        if raw is None:
            continue
        try:
            numeric = float(raw)
        except (TypeError, ValueError):
            continue
        if not math.isfinite(numeric):
            continue
        metric_weight = resolve_weight(metric_weights, name, default=1.0)
        weight_sum += metric_weight
        weighted_sum += metric_weight * numeric
    return None if weight_sum == 0.0 else weighted_sum / weight_sum
 def weighted_metric_means(
    score_rows: list[dict],
    metrics: list[str],
    doc_weights: dict[str, float],
 ) -> dict[str, float | None]:
    """Aggregate each metric into a document-weighted mean over all rows.

    Every row is weighted by the entry of *doc_weights* for its ``doc_name``
    (1.0 when absent). A row contributes to a metric only when its value for
    that metric is not None, converts to ``float``, and is finite.

    Args:
        score_rows: list of score record dicts (from scores.csv).
        metrics: ordered list of metric names to aggregate.
        doc_weights: mapping doc_name -> weight multiplier.

    Returns:
        Dict mapping metric_name -> weighted mean, or None for a metric whose
        accumulated weight is not positive.
    """
    numerators: dict[str, float] = dict.fromkeys(metrics, 0.0)
    denominators: dict[str, float] = dict.fromkeys(metrics, 0.0)
    for record in score_rows:
        document = str(record.get("doc_name", "") or "")
        row_weight = resolve_weight(doc_weights, document, default=1.0)
        for name in metrics:
            cell = record.get(name)
            if cell is None:
                continue
            try:
                number = float(cell)
            except (TypeError, ValueError):
                continue
            if math.isfinite(number):
                numerators[name] += row_weight * number
                denominators[name] += row_weight
    means: dict[str, float | None] = {}
    for name in metrics:
        denominator = denominators[name]
        means[name] = numerators[name] / denominator if denominator > 0 else None
    return means
 def compute_overall_weighted_score_mean(
    score_rows: list[dict],
    metric_weights: dict[str, float],
    doc_weights: dict[str, float],
 ) -> float | None:
    """Return the document-weighted mean of the per-sample weighted scores.

    For each row, every column not listed in ``_META_COLUMNS`` is treated as a
    metric score and combined through ``compute_weighted_score``. Rows without
    a usable score are skipped. The remaining scores are averaged, each
    weighted by the doc weight of the row's ``doc_name`` (1.0 when absent).

    Returns:
        The overall mean, or None when the accumulated weight is not positive.
    """
    weight_acc = 0.0
    score_acc = 0.0
    for record in score_rows:
        candidate_scores = {
            column: cell
            for column, cell in record.items()
            if column not in _META_COLUMNS
        }
        row_score = compute_weighted_score(candidate_scores, metric_weights)
        if row_score is None:
            continue
        document = str(record.get("doc_name", "") or "")
        row_weight = resolve_weight(doc_weights, document, default=1.0)
        weight_acc += row_weight
        score_acc += row_weight * row_score
    if weight_acc > 0:
        return score_acc / weight_acc
    return None
 # Columns in scores.csv that are sample metadata, not metric scores.
 # compute_overall_weighted_score_mean skips these keys when collecting a row's
 # metric scores. The name is defined after that function in this module, so it
 # is only resolved when the function is called.
 _META_COLUMNS = frozenset(
    {
        "sample_id",
        "question",
        "contexts",
        "answer",
        "ground_truth",
        "scenario",
        "language",
        "retrieval_config",
        "error",
        "judge_model",
        "embedding_model",
        "run_id",
        "difficulty",
        "question_type",
        "doc_id",
        "doc_name",
        "section_path",
        "page_start",
        "page_end",
        "source_chunk_ids",
        "review_status",
        "review_notes",
        "weighted_score",
        "sample_weight",
    }
 )
--- a/rag_eval/reporting/artifacts.py
+++ b/rag_eval/reporting/artifacts.py
@@ -17,4 +17,5 @@ def build_artifact_paths(output_dir: Path, run_id: str) -> RunArtifactPaths:
        invalid_csv=run_dir / "invalid.csv",
        summary_md=run_dir / "summary.md",
        metadata_json=run_dir / "metadata.json",
        advice_md=run_dir / "optimization_advice.md",
    )
--- a/rag_eval/reporting/summary.py
+++ b/rag_eval/reporting/summary.py
@@ -6,6 +6,10 @@ import math
 import pandas as pd
 from rag_eval.metrics.weights import (
    compute_overall_weighted_score_mean,
    weighted_metric_means,
 )
 from rag_eval.shared.models import EvaluationResult
@@ -55,24 +59,42 @@ def build_summary_markdown(result: EvaluationResult) -> str:
        lines.append("No valid samples were scored.")
        return "\n".join(lines) + "\n"
-    for metric in result.scenario.metrics:
+    score_rows_list = scores.to_dict(orient="records")
-        mean_value = scores[metric].mean(numeric_only=True)
+    w_means = weighted_metric_means(
-        if isinstance(mean_value, float) and not math.isnan(mean_value):
+        score_rows_list, result.scenario.metrics, result.scenario.doc_weights
            lines.append(f"- {metric}: `{mean_value:.4f}`")
        else:
            lines.append(f"- {metric}: `n/a`")
    # Keep the summary self-sufficient by including every scored sample and its errors.
    detail_columns = ["sample_id", *result.scenario.metrics, "error"]
    detail = scores[detail_columns]
    lines.extend(
        [
            "",
            "## Per-sample Scores",
            "",
            "```text",
            _table_from_frame(detail),
            "```",
        ]
    )
    has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights)
    for metric in result.scenario.metrics:
        mean_value = w_means.get(metric)
        w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0
        weight_note = f"  (w={w:.2f})" if result.scenario.metric_weights else ""
        if mean_value is not None and not math.isnan(mean_value):
            lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}")
        else:
            lines.append(f"- {metric}: `n/a`{weight_note}")
    # 综合加权得分（已暂时禁用）
    # if has_weights:
    #     overall_ws = compute_overall_weighted_score_mean(
    #         score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
    #     )
    #     weight_suffix = " (加权)"
    #     if overall_ws is not None and not math.isnan(overall_ws):
    #         lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
    #     else:
    #         lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
    detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
    existing_columns = [c for c in detail_columns if c in scores.columns]
    detail = scores[existing_columns]
    lines.extend([
        "",
        "## Per-sample Scores",
        "",
        "```text",
        _table_from_frame(detail),
        "```",
    ])
    return "\n".join(lines) + "\n"
--- a/rag_eval/settings.py
+++ b/rag_eval/settings.py
@@ -21,11 +21,16 @@ class EvaluationSettings(BaseSettings):
    openai_api_key: str | None = Field(default=None, alias="OPENAI_API_KEY")
    openai_base_url: str = Field(default="http://6.86.80.4:30080/v1", alias="OPENAI_BASE_URL")
-    ragas_judge_model: str = Field(default="deepseek-v4-flash", alias="RAGAS_JUDGE_MODEL")
+    ragas_judge_model: str = Field(default="gpt-5", alias="RAGAS_JUDGE_MODEL")
    ragas_embedding_model: str = Field(
-        default="text-embedding-v3",
+        default="text-embedding-3-small",
        alias="RAGAS_EMBEDDING_MODEL",
    )
    ragas_llm_max_tokens: int = Field(
        default=4096,
        alias="RAGAS_LLM_MAX_TOKENS",
        gt=0,
    )
    openai_timeout_seconds: float = Field(default=30.0, alias="OPENAI_TIMEOUT_SECONDS")
    ragas_metric_timeout_seconds: float = Field(default=45.0, alias="RAGAS_METRIC_TIMEOUT_SECONDS")
    batch_size: int = Field(default=8, alias="BATCH_SIZE")
@@ -52,6 +57,11 @@ class EvaluationSettings(BaseSettings):
    )
    parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
    dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL")
    score_api_token: str | None = Field(
        default=None,
        alias="SCORE_API_TOKEN",
        description="Bearer token for /api/score endpoint. Empty = no auth.",
    )
    @property
    def openai_client_kwargs(self) -> dict[str, str | float]:
--- a/rag_eval/shared/models.py
+++ b/rag_eval/shared/models.py
@@ -76,6 +76,9 @@ class Scenario:
    runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
    app_adapter: AppAdapterConfig | None = None
    source_path: Path | None = None
    optimization_advisor: bool = False
    metric_weights: dict[str, float] = field(default_factory=dict)
    doc_weights: dict[str, float] = field(default_factory=dict)
    def snapshot(self) -> dict[str, Any]:
        """Serialize the scenario into a reporting-friendly dictionary snapshot."""
@@ -159,3 +162,4 @@ class RunArtifactPaths:
    invalid_csv: Path
    summary_md: Path
    metadata_json: Path
    advice_md: Path | None = None
--- a/rag_eval/shared/profile_store.py
+++ b/rag_eval/shared/profile_store.py
@@ -0,0 +1,53 @@
 """Lightweight read-only accessor for configs/llm_profiles.json.
 Kept in ``rag_eval`` (not ``webapp``) so the runner can look up per-model
 credentials without depending on the webapp layer.
 """
 from __future__ import annotations
 import json
 import logging
 from pathlib import Path
 from typing import Any
 logger = logging.getLogger(__name__)
 _PROFILES_PATH = Path(__file__).resolve().parents[2] / "configs" / "llm_profiles.json"
 def find_by_model(model_name: str) -> dict[str, Any] | None:
    """Return the first profile whose ``model`` field matches *model_name*, or None.

    Returns None (without raising) when the profiles file does not exist or
    cannot be read or parsed — callers fall back to environment-variable
    defaults. A missing file is treated as "no profiles configured" and is not
    logged; any other failure is logged as a warning.

    Args:
        model_name: Model name compared against each profile's ``model`` value.

    Returns:
        The matching profile dict, or None when nothing matches.
    """
    try:
        # Read directly instead of checking exists() first: that avoids a
        # check-then-use race and keeps every filesystem error inside the try.
        data = json.loads(_PROFILES_PATH.read_text(encoding="utf-8"))
        for profile in data.get("profiles", []):
            # Skip malformed entries so one bad item cannot hide a valid match.
            if isinstance(profile, dict) and profile.get("model") == model_name:
                return profile
    except (FileNotFoundError, NotADirectoryError):
        return None
    except Exception as exc:  # noqa: BLE001
        logger.warning("[profile_store] failed to read %s: %s", _PROFILES_PATH, exc)
    return None
 def profile_to_client_kwargs(
    profile: dict[str, Any],
    fallback_api_key: str | None,
    fallback_timeout: float,
 ) -> dict[str, Any]:
    """Build ``openai.AsyncOpenAI`` keyword arguments from a profile dict.

    Truthy profile values win over the supplied fallbacks.

    Args:
        profile: Profile dict; ``api_key``, ``timeout_seconds`` and ``base_url``
            are read from it.
        fallback_api_key: Key used when the profile has no truthy ``api_key``;
            an empty string is used when this is falsy as well.
        fallback_timeout: Timeout used when the profile has no truthy
            ``timeout_seconds``.

    Returns:
        Dict with ``api_key`` and ``timeout`` (as ``float``), plus ``base_url``
        when the profile's value is non-blank after stripping whitespace.
    """
    api_key = profile.get("api_key") or fallback_api_key or ""
    timeout = float(profile.get("timeout_seconds") or fallback_timeout)
    client_kwargs: dict[str, Any] = {"api_key": api_key, "timeout": timeout}
    endpoint = (profile.get("base_url") or "").strip()
    if endpoint:
        client_kwargs["base_url"] = endpoint
    return client_kwargs
--- a/run_eval.bat
+++ b/run_eval.bat
@@ -0,0 +1,107 @@
@echo off
 setlocal enabledelayedexpansion
 :: ============================================================
 ::  run_eval.bat  -  Run a RAGAS evaluation scenario with logs
 ::
 ::  Usage:
 ::    run_eval.bat                          (uses default online scenario)
 ::    run_eval.bat offline                  (runs offline smoke scenario)
 ::    run_eval.bat path\to\scenario.yaml    (any custom scenario)
 ::    run_eval.bat offline DEBUG            (second arg = log level)
 :: ============================================================
 cd /d "%~dp0"
 echo.
 echo ============================================================
 echo   Siemens RAGAS  -  Evaluation Runner
 echo ============================================================
 echo.
 :: ----------------------------------------------------------------
 :: 1. Resolve scenario path  (arg1)
 :: ----------------------------------------------------------------
 set "SCENARIO=%~1"
 if "%SCENARIO%"=="" set "SCENARIO=online"
 if /i "%SCENARIO%"=="online" (
    set "SCENARIO=scenarios\online\siemens-pdf-question-bank-online.yaml"
 )
 if /i "%SCENARIO%"=="offline" (
    set "SCENARIO=scenarios\offline\siemens-pdf-offline-smoke.yaml"
 )
 if not exist "%SCENARIO%" (
    echo [ERROR] Scenario file not found: %SCENARIO%
    echo.
    echo Usage examples:
    rem Parentheses in echoed text must be escaped inside a parenthesized
    rem block: a bare closing parenthesis would end the block early and let
    rem the goto below run on every invocation.
    echo   run_eval.bat                    - online eval ^(default^)
    echo   run_eval.bat offline            - offline smoke
    echo   run_eval.bat path\to\file.yaml  - custom scenario
    goto :error
 )
 echo [OK] Scenario : %SCENARIO%
 :: ----------------------------------------------------------------
 :: 2. Resolve log level  (arg2, default INFO)
 :: ----------------------------------------------------------------
 set "LOG_LEVEL=%~2"
 if "%LOG_LEVEL%"=="" set "LOG_LEVEL=INFO"
 echo [OK] Log level: %LOG_LEVEL%
 :: ----------------------------------------------------------------
 :: 3. Create logs dir and build timestamped log filename
 ::    The DATE and TIME variables follow the user's regional settings, so
 ::    parsing them gives different (or malformed) names per locale.
 ::    Ask PowerShell for a fixed-format stamp, same format as run_eval.ps1.
 :: ----------------------------------------------------------------
 if not exist "logs" mkdir logs
 set "STAMP="
 for /f "usebackq delims=" %%t in (`powershell -NoProfile -Command "Get-Date -Format 'yyyy-MM-dd_HHmmss'"`) do set "STAMP=%%t"
 :: Fall back to a pseudo-random suffix when PowerShell is unavailable.
 if not defined STAMP set "STAMP=%RANDOM%"
 set "LOG_FILE=logs\eval_%STAMP%.log"
 echo [OK] Log file : %LOG_FILE%
 echo.
 echo ============================================================
 echo   Starting evaluation...
 echo   (Logs also written to %LOG_FILE%)
 echo   Press Ctrl+C to abort
 echo ============================================================
 echo.
 :: ----------------------------------------------------------------
 :: 4. Run evaluation with UTF-8 and logging
 :: ----------------------------------------------------------------
 set PYTHONIOENCODING=utf-8
 set PYTHONPATH=.
 python main.py ^
    --scenario "%SCENARIO%" ^
    --log-file "%LOG_FILE%" ^
    --log-level %LOG_LEVEL%
 if errorlevel 1 (
    echo.
    echo [ERROR] Evaluation failed. Check log: %LOG_FILE%
    goto :error
 )
 echo.
 echo ============================================================
 echo   Evaluation complete!
 echo   Log saved to: %LOG_FILE%
 echo   Open the web console to view results: start.bat
 echo ============================================================
 echo.
 pause
 exit /b 0
 :error
 echo.
 echo ============================================================
 echo   Evaluation failed. See error above or check log file.
 echo ============================================================
 pause
 exit /b 1
--- a/run_eval.ps1
+++ b/run_eval.ps1
@@ -0,0 +1,96 @@
 # run_eval.ps1 - Siemens RAGAS Evaluation Runner
 # Usage:
 #   .\run_eval.ps1                         # online eval (default)
 #   .\run_eval.ps1 offline                 # offline smoke
 #   .\run_eval.ps1 path\to\scenario.yaml   # custom scenario
 #   .\run_eval.ps1 online DEBUG            # second arg = log level (DEBUG/INFO/WARNING)
 # Or: powershell -ExecutionPolicy Bypass -File run_eval.ps1 [scenario] [log-level]
 # Exit code: 0 on success, 1 when the scenario file is missing or python fails.
 param(
    [string]$Scenario = "online",
    [string]$LogLevel = "INFO"
 )
 # Stop on PowerShell errors and run from the script's own directory so the
 # relative paths below (scenarios\, logs\, main.py) resolve.
 $ErrorActionPreference = "Stop"
 Set-Location $PSScriptRoot
 Write-Host ""
 Write-Host "============================================================" -ForegroundColor Cyan
 Write-Host "  Siemens RAGAS  -  Evaluation Runner" -ForegroundColor Cyan
 Write-Host "============================================================" -ForegroundColor Cyan
 Write-Host ""
 # ----------------------------------------------------------------
 # 1. Resolve scenario path
 #    "online" / "offline" (case-insensitive) are aliases for bundled
 #    scenario files; any other value is used as a path as given.
 # ----------------------------------------------------------------
 $scenarioMap = @{
    "online"  = "scenarios\online\siemens-pdf-question-bank-online.yaml"
    "offline" = "scenarios\offline\siemens-pdf-offline-smoke.yaml"
 }
 if ($scenarioMap.ContainsKey($Scenario.ToLower())) {
    $Scenario = $scenarioMap[$Scenario.ToLower()]
 }
 if (-not (Test-Path $Scenario)) {
    Write-Host "[ERROR] Scenario file not found: $Scenario" -ForegroundColor Red
    Write-Host ""
    Write-Host "Usage examples:"
    Write-Host "  .\run_eval.ps1                    - online eval (default)"
    Write-Host "  .\run_eval.ps1 offline            - offline smoke"
    Write-Host "  .\run_eval.ps1 path\to\file.yaml  - custom scenario"
    Read-Host "Press Enter to exit"
    exit 1
 }
 Write-Host "[OK] Scenario : $Scenario" -ForegroundColor Green
 # ----------------------------------------------------------------
 # 2. Validate log level
 #    Unknown values are replaced by INFO with a warning, not rejected.
 # ----------------------------------------------------------------
 $validLevels = @("DEBUG", "INFO", "WARNING", "ERROR")
 if ($validLevels -notcontains $LogLevel.ToUpper()) {
    Write-Host "[WARN] Unknown log level '$LogLevel', defaulting to INFO" -ForegroundColor Yellow
    $LogLevel = "INFO"
 }
 Write-Host "[OK] Log level: $LogLevel" -ForegroundColor Green
 # ----------------------------------------------------------------
 # 3. Create logs dir with timestamped filename
 # ----------------------------------------------------------------
 if (-not (Test-Path "logs")) { New-Item -ItemType Directory "logs" | Out-Null }
 $timestamp = Get-Date -Format "yyyy-MM-dd_HHmmss"
 $logFile = "logs\eval_$timestamp.log"
 Write-Host "[OK] Log file : $logFile" -ForegroundColor Green
 Write-Host ""
 Write-Host "============================================================" -ForegroundColor Cyan
 Write-Host "  Starting evaluation..." -ForegroundColor Cyan
 Write-Host "  Logs also written to: $logFile" -ForegroundColor Cyan
 Write-Host "  Press Ctrl+C to abort" -ForegroundColor Yellow
 Write-Host "============================================================" -ForegroundColor Cyan
 Write-Host ""
 # ----------------------------------------------------------------
 # 4. Run evaluation
 #    The environment variables are set for the python child process:
 #    UTF-8 stdio encoding, and the current directory on the import path.
 # ----------------------------------------------------------------
 $env:PYTHONIOENCODING = "utf-8"
 $env:PYTHONPATH = "."
 & python main.py `
    --scenario $Scenario `
    --log-file $logFile `
    --log-level $LogLevel.ToUpper()
 # $LASTEXITCODE holds the exit code of the python process that just ran.
 if ($LASTEXITCODE -ne 0) {
    Write-Host ""
    Write-Host "[ERROR] Evaluation failed. Check log: $logFile" -ForegroundColor Red
    Read-Host "Press Enter to exit"
    exit 1
 }
 Write-Host ""
 Write-Host "============================================================" -ForegroundColor Green
 Write-Host "  Evaluation complete!" -ForegroundColor Green
 Write-Host "  Log saved to: $logFile" -ForegroundColor Green
 Write-Host "  Open the web console to view results: start.bat" -ForegroundColor Cyan
 Write-Host "============================================================" -ForegroundColor Green
 Write-Host ""
 Read-Host "Press Enter to exit"
--- a/run_eval.sh
+++ b/run_eval.sh
@@ -0,0 +1,147 @@
 #!/usr/bin/env bash
 # run_eval.sh — Siemens RAGAS evaluation runner (Linux)
 # Counterpart of run_eval.ps1 on Windows.
 #
 # Usage:
 #   bash run_eval.sh                          # online evaluation (default)
 #   bash run_eval.sh offline                  # offline smoke test
 #   bash run_eval.sh scenarios/xxx.yaml       # custom scenario file
 #   bash run_eval.sh online DEBUG             # choose the log level
 #   bash run_eval.sh build scenarios/siemens_build/siemens-pdf-build.yaml
 #                                             # question-bank generation
 # Abort on any failing command, unset variable, or failing pipeline stage.
 set -euo pipefail
 # Run from the directory containing this script so relative paths resolve.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR"
 # ── Colored output ────────────────────────────────────────────────
 # Use ANSI colors only when stdout is a terminal; otherwise leave the color
 # variables empty so redirected output contains no escape sequences.
 if [ -t 1 ]; then
    GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
 else
    GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
 fi
 # Tagged log helpers; each prints all of its arguments. err writes to stderr.
 ok()   { echo -e "${GREEN}[OK]${NC}    $*"; }
 warn() { echo -e "${YELLOW}[WARN]${NC}  $*"; }
 err()  { echo -e "${RED}[ERROR]${NC} $*" >&2; }
 info() { echo -e "${CYAN}[INFO]${NC}  $*"; }
 # ── Argument parsing ──────────────────────────────────────────────
 # $1: scenario alias, scenario file path, or the literal "build" (default: online)
 # $2: log level (default: INFO); in build mode $2 is the build config instead
 SCENARIO="${1:-online}"
 LOG_LEVEL="${2:-INFO}"
 # Scenario alias -> scenario file mapping
 declare -A SCENARIO_MAP=(
    ["online"]="scenarios/online/siemens-pdf-question-bank-online.yaml"
    ["offline"]="scenarios/offline/siemens-pdf-offline-smoke.yaml"
 )
 # Detect dataset-build mode
 BUILD_MODE=false
 BUILD_CONFIG=""
 if [ "$SCENARIO" = "build" ]; then
    # Build mode shifts the positional arguments: $2 = config path, $3 = log level.
    BUILD_MODE=true
    BUILD_CONFIG="${2:-scenarios/siemens_build/siemens-pdf-build.yaml}"
    LOG_LEVEL="${3:-INFO}"
 elif [ -v "SCENARIO_MAP[$SCENARIO]" ]; then
    # Known alias: replace it with the mapped file. Any other value is kept
    # as-is and treated as a scenario file path by the validation below.
    SCENARIO="${SCENARIO_MAP[$SCENARIO]}"
 fi
 # ── Validation ────────────────────────────────────────────────────
 echo ""
 echo -e "${CYAN}============================================================${NC}"
 echo -e "${CYAN}  Siemens RAGAS  —  评估运行${NC}"
 echo -e "${CYAN}============================================================${NC}"
 echo ""
 # Check the virtual environment; the interpreter inside .venv is used for the run.
 if [ ! -f ".venv/bin/python" ]; then
    err "未找到 .venv，请先执行部署：bash deploy.sh"
    exit 1
 fi
 PYTHON=".venv/bin/python"
 # Build-mode validation: the build config file must exist.
 if [ "$BUILD_MODE" = true ]; then
    if [ ! -f "$BUILD_CONFIG" ]; then
        err "题库生成配置文件不存在：$BUILD_CONFIG"
        echo ""
        echo "可用配置："
        # List up to 20 YAML files under scenarios/ as suggestions.
        find scenarios/ -name "*.yaml" 2>/dev/null | head -20 | sed 's/^/  /'
        exit 1
    fi
    ok "模式      : 题库生成 (dataset build)"
    ok "配置文件  : $BUILD_CONFIG"
 else
    # Scenario file validation: print usage examples when the file is missing.
    if [ ! -f "$SCENARIO" ]; then
        err "场景文件不存在：$SCENARIO"
        echo ""
        echo "用法示例："
        echo "  bash run_eval.sh                          # online 评估"
        echo "  bash run_eval.sh offline                  # offline 冒烟"
        echo "  bash run_eval.sh scenarios/xxx.yaml       # 自定义场景"
        echo "  bash run_eval.sh build [config.yaml]      # 题库生成"
        exit 1
    fi
    ok "场景文件  : $SCENARIO"
 fi
 # Log level validation: upper-case the value and fall back to INFO when it is
 # not one of DEBUG / INFO / WARNING / ERROR.
 LOG_LEVEL_UPPER="${LOG_LEVEL^^}"
 case "$LOG_LEVEL_UPPER" in
    DEBUG|INFO|WARNING|ERROR) ;;
    *)
        warn "未知日志级别 '$LOG_LEVEL'，使用默认值 INFO"
        LOG_LEVEL_UPPER="INFO"
        ;;
 esac
 ok "日志级别  : $LOG_LEVEL_UPPER"
 # Create the log directory and derive a timestamped log file name.
 # NOTE(review): the log file path is printed in build mode too, although the
 # build invocation further down does not pass --log-file — confirm intent.
 mkdir -p logs
 TIMESTAMP=$(date +%Y-%m-%d_%H%M%S)
 LOG_FILE="logs/eval_${TIMESTAMP}.log"
 ok "日志文件  : $LOG_FILE"
 echo ""
 echo -e "${CYAN}============================================================${NC}"
 echo -e "${CYAN}  开始运行，按 Ctrl+C 中止${NC}"
 echo -e "${CYAN}============================================================${NC}"
 echo ""
 # ── Run ───────────────────────────────────────────────────────────
 export PYTHONIOENCODING="utf-8"
 export PYTHONPATH="."
 # `set -e` is active in this script, so a bare failing command would end the
 # script on the spot and the failure report below would never be printed.
 # Capturing the status with `|| EXIT_CODE=$?` keeps errexit from firing and
 # lets the script report the failure before exiting with the same code.
 EXIT_CODE=0
 if [ "$BUILD_MODE" = true ]; then
    # Dataset build: only the build config is passed to main.py.
    "$PYTHON" main.py \
        --dataset-build-config "$BUILD_CONFIG" || EXIT_CODE=$?
 else
    # Evaluation: pass the scenario file, log file and validated log level.
    "$PYTHON" main.py \
        --scenario "$SCENARIO" \
        --log-file "$LOG_FILE" \
        --log-level "$LOG_LEVEL_UPPER" || EXIT_CODE=$?
 fi
 echo ""
 if [ "$EXIT_CODE" -eq 0 ]; then
    echo -e "${GREEN}============================================================${NC}"
    echo -e "${GREEN}  运行完成！${NC}"
    # The log file is only written for evaluation runs, not for builds.
    if [ "$BUILD_MODE" = false ]; then
        echo -e "${GREEN}  日志已保存到：$LOG_FILE${NC}"
    fi
    echo -e "${CYAN}  在 Web 控制台查看报告：bash start.sh${NC}"
    echo -e "${GREEN}============================================================${NC}"
 else
    err "运行失败（exit code=$EXIT_CODE）"
    if [ "$BUILD_MODE" = false ]; then
        err "查看日志：cat $LOG_FILE"
    fi
    # Propagate the Python process's exit status to the caller.
    exit "$EXIT_CODE"
 fi
 echo ""
--- a/scenarios/offline/siemens-pdf-offline-smoke.yaml
+++ b/scenarios/offline/siemens-pdf-offline-smoke.yaml
@@ -0,0 +1,19 @@
 scenario_name: siemens-pdf-offline-smoke
 mode: offline
 app_adapter: null
 dataset: ../../datasets/normalized/siemens_pdf_offline_smoke.csv
 judge_model: deepseek-v4-flash
 embedding_model: text-embedding-v3
 metrics:
  - faithfulness
  - answer_relevancy
  - context_recall
  - context_precision
  # 可选：鲁棒性 / 端到端指标（数据集已含 ground_truth，取消注释即可启用）
  # - noise_sensitivity      # 鲁棒性：对检索噪声的敏感度
  # - factual_correctness    # 端到端：事实正确性（相对标准答案）
  # - semantic_similarity    # 端到端：语义相似度（embedding，无 LLM 调用）
 output_dir: ../../outputs/siemens-pdf-offline-smoke
 runtime:
  batch_size: 4
  max_samples: 30
--- a/scenarios/online/sample-pdf-question-bank-online.yaml
+++ b/scenarios/online/sample-pdf-question-bank-online.yaml
@@ -1,13 +1,13 @@
 scenario_name: sample-pdf-question-bank-online
 mode: online
 dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv
-judge_model: deepseek-v4-pro
+judge_model: qwen3.5-flash
 embedding_model: text-embedding-v3
 metrics:
-  - faithfulness
+- faithfulness
-  - answer_relevancy
+- answer_relevancy
-  - context_recall
+- context_recall
-  - context_precision
+- context_precision
 output_dir: ../../outputs/online/sample-pdf-question-bank
 runtime:
  batch_size: 2
@@ -19,4 +19,4 @@ app_adapter:
  callable: apps.pdf_question_bank.adapter:run
  static_kwargs:
    source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl
-    model: deepseek-v4-flash
+    model: glm-5
--- a/scenarios/online/siemens-pdf-question-bank-online.yaml
+++ b/scenarios/online/siemens-pdf-question-bank-online.yaml
@@ -0,0 +1,26 @@
 scenario_name: siemens-pdf-question-bank-online
 mode: online
 dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
 judge_model: deepseek-v4-flash
 embedding_model: text-embedding-v3
 optimization_advisor: true
 metrics:
 - faithfulness
 - answer_relevancy
 - context_recall
 - context_precision
 - noise_sensitivity
 - factual_correctness
 - semantic_similarity
 output_dir: ../../outputs/online/siemens-pdf-question-bank
 runtime:
  batch_size: 3
  app_concurrency: 3
  metric_concurrency: 3
  max_samples: 10
 app_adapter:
  type: python
  callable: apps.siemens_pdf_qa.adapter:run
  static_kwargs:
    source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
    model: glm-5
--- a/scenarios/siemens_build/siemens-pdf-build.yaml
+++ b/scenarios/siemens_build/siemens-pdf-build.yaml
@@ -0,0 +1,17 @@
 job_name: siemens-pdf-question-bank
 input:
  path: ../../datasets/siemens-pdfs
  glob: "*.pdf"
 parser:
  provider: aliyun_docmind
  failure_mode: skip
 generation:
  output_type: online_question_bank
  review_mode: draft_with_manual_review
  max_questions_per_document: 10
  max_source_chunks_per_question: 3
 output:
  dataset_path: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
  artifact_dir: ../../outputs/dataset-builds/siemens-pdf-question-bank
 runtime:
  max_documents: 17
--- a/scripts/build_siemens_offline_smoke.py
+++ b/scripts/build_siemens_offline_smoke.py
@@ -0,0 +1,72 @@
 """Build the Siemens offline smoke dataset from a completed dataset_build run.
 Must be run AFTER `python main.py --dataset-build-config
 scenarios/siemens_build/siemens-pdf-build.yaml` has completed successfully.
 It uses the stable `latest/` alias so you don't need to know the run_id.
 Usage:
    python scripts/build_siemens_offline_smoke.py
 Output:
    datasets/normalized/siemens_pdf_offline_smoke.csv
    (referenced by scenarios/offline/siemens-pdf-offline-smoke.yaml)
 """
 from __future__ import annotations
 from pathlib import Path
 # ---------------------------------------------------------------------------
 # Filesystem layout — every path is anchored at the repository root, i.e. the
 # directory one level above the directory that holds this script.
 # ---------------------------------------------------------------------------
 REPO_ROOT = Path(__file__).resolve().parents[1]
 # Draft question bank produced by the latest dataset build.
 DRAFT_DATASET_PATH = REPO_ROOT.joinpath(
    "outputs", "dataset-builds", "siemens-pdf-question-bank", "latest", "dataset_draft.csv"
 )
 # Source chunks produced by the same build.
 SOURCE_CHUNKS_PATH = REPO_ROOT.joinpath(
    "outputs", "dataset-builds", "siemens-pdf-question-bank", "latest", "source_chunks.jsonl"
 )
 # Destination of the converted offline smoke dataset.
 OUTPUT_PATH = REPO_ROOT.joinpath("datasets", "normalized", "siemens_pdf_offline_smoke.csv")
 def main() -> None:
    """Convert the Siemens build artefacts into an offline-evaluable dataset.

    Verifies that the draft dataset and the source chunks of the ``latest``
    build exist, runs ``build_offline_smoke_dataset`` on them, then prints the
    output location, the row count and — when there are rows — the language
    and difficulty distributions (empty if the column is absent).

    Raises:
        FileNotFoundError: If the draft dataset or the source chunks file is
            missing, i.e. the dataset build has not been run yet.
    """
    if not DRAFT_DATASET_PATH.exists():
        raise FileNotFoundError(
            f"Draft dataset not found: {DRAFT_DATASET_PATH}\n"
            "Run the dataset build first:\n"
            "  python main.py --dataset-build-config "
            "scenarios/siemens_build/siemens-pdf-build.yaml"
        )
    if not SOURCE_CHUNKS_PATH.exists():
        raise FileNotFoundError(
            f"Source chunks not found: {SOURCE_CHUNKS_PATH}\n"
            "Run the dataset build first."
        )
    # The documented invocation is `python scripts/build_siemens_offline_smoke.py`,
    # which puts scripts/ — not the repository root — on sys.path.  Add the root
    # so the rag_eval package resolves without PYTHONPATH having to be set.
    import sys
    repo_root = str(REPO_ROOT)
    if repo_root not in sys.path:
        sys.path.insert(0, repo_root)
    # Import here so the script is importable even before rag_eval is fully set up.
    from rag_eval.dataset_builder.offline_converter import build_offline_smoke_dataset
    output = build_offline_smoke_dataset(
        draft_dataset_path=DRAFT_DATASET_PATH,
        source_chunks_path=SOURCE_CHUNKS_PATH,
        output_path=OUTPUT_PATH,
    )
    # pandas is only needed for the summary printout below.
    import pandas as pd
    frame = pd.read_csv(output)
    print(f"Offline smoke dataset written to: {output}")
    print(f"Total rows: {len(frame)}")
    if len(frame) > 0:
        lang_counts = frame["language"].value_counts().to_dict() if "language" in frame.columns else {}
        diff_counts = frame["difficulty"].value_counts().to_dict() if "difficulty" in frame.columns else {}
        print(f"Language distribution: {lang_counts}")
        print(f"Difficulty distribution: {diff_counts}")
 if __name__ == "__main__":
    main()
--- a/scripts/seed_sample_run.py
+++ b/scripts/seed_sample_run.py
@@ -0,0 +1,236 @@
 """Generate a realistic sample evaluation run so the console has demo data.
 This writes the standard run artifacts (metadata.json, scores.csv, summary.md,
 scenario.snapshot.yaml) under outputs/, exactly mirroring what the reporting
 layer produces, but without needing ragas or any network calls. It lets the
 report board render immediately for demos and local development.
 Usage:
    python scripts/seed_sample_run.py
 """
 from __future__ import annotations
 import csv
 import json
 from pathlib import Path
 REPO_ROOT = Path(__file__).resolve().parents[1]
 SCENARIO_NAME = "kba-knowledge-base-offline-baseline"
 RUN_ID = "2026-06-15T08-30-00+00-00"
 JUDGE_MODEL = "deepseek-distill-qwen-32b"
 EMBEDDING_MODEL = "text-embedding-v3"
 METRICS = ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]
 # Each row mirrors a scores.csv record: sample fields + metric scores + metadata.
 # Scores are hand-tuned to exercise the full UI (greens, yellows, reds, a long
 # tail in the distribution, and clear weak groups by difficulty).
 SAMPLES = [
    {
        "sample_id": "kba-001", "language": "zh", "difficulty": "easy", "question_type": "fact",
        "question": "员工入职满3年可享受多少天年休假？",
        "contexts": ["员工入司满1年不满10年的，年休假5天。", "年休假在每年1月1日起可申请。"],
        "answer": "根据规定，入职满3年的员工可享受5天年休假。",
        "ground_truth": "员工入司满1年不满10年的，年休假5天。",
        "faithfulness": 0.98, "answer_relevancy": 0.95, "context_recall": 0.97, "context_precision": 0.92,
    },
    {
        "sample_id": "kba-002", "language": "zh", "difficulty": "easy", "question_type": "fact",
        "question": "公司报销差旅费的截止提交时间是什么时候？",
        "contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"],
        "answer": "差旅费需在出差结束后30天内提交报销。",
        "ground_truth": "差旅费报销须在出差结束后30天内提交。",
        "faithfulness": 0.96, "answer_relevancy": 0.93, "context_recall": 0.90, "context_precision": 0.88,
    },
    {
        "sample_id": "kba-003", "language": "zh", "difficulty": "medium", "question_type": "procedure",
        "question": "申请远程办公需要经过哪些审批流程？",
        "contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"],
        "answer": "需先由直属主管审批，再提交人力资源部备案，每月不超过8天。",
        "ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案，每月上限8天。",
        "faithfulness": 0.91, "answer_relevancy": 0.88, "context_recall": 0.85, "context_precision": 0.79,
    },
    {
        "sample_id": "kba-004", "language": "en", "difficulty": "medium", "question_type": "fact",
        "question": "How many days of paternity leave are employees entitled to?",
        "contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."],
        "answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.",
        "ground_truth": "Employees are entitled to 15 days of paternity leave.",
        "faithfulness": 0.89, "answer_relevancy": 0.86, "context_recall": 0.82, "context_precision": 0.74,
    },
    {
        "sample_id": "kba-005", "language": "zh", "difficulty": "medium", "question_type": "comparison",
        "question": "正式员工与试用期员工在医疗保险待遇上有何区别？",
        "contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"],
        "answer": "正式员工额外享受补充医疗保险，试用期员工仅有基础医疗保险。",
        "ground_truth": "正式员工在基础医疗保险外另享补充医疗保险，试用期员工仅享基础医疗保险。",
        "faithfulness": 0.84, "answer_relevancy": 0.83, "context_recall": 0.78, "context_precision": 0.71,
    },
    {
        "sample_id": "kba-006", "language": "zh", "difficulty": "hard", "question_type": "summary",
        "question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。",
        "contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"],
        "answer": "第三方共享需签保密协议，敏感数据须经数据保护官批准，记录留存3年。",
        "ground_truth": "向第三方共享数据须签署保密协议，敏感数据共享须经数据保护官批准，且共享记录至少留存3年。",
        "faithfulness": 0.79, "answer_relevancy": 0.81, "context_recall": 0.70, "context_precision": 0.62,
    },
    {
        "sample_id": "kba-007", "language": "zh", "difficulty": "hard", "question_type": "procedure",
        "question": "跨部门项目预算超支时的审批升级路径是怎样的？",
        "contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"],
        "answer": "超支10%以内项目经理批，10%-20%需总监批，超20%需财务委员会审批。",
        "ground_truth": "超支10%以内由项目经理审批，10%-20%由部门总监审批，超过20%须提交财务委员会审批。",
        "faithfulness": 0.58, "answer_relevancy": 0.72, "context_recall": 0.55, "context_precision": 0.48,
    },
    {
        "sample_id": "kba-008", "language": "zh", "difficulty": "hard", "question_type": "fact",
        "question": "员工持股计划的最低锁定期是多少年？",
        "contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"],
        "answer": "员工持股计划的最低锁定期为3年。",
        "ground_truth": "员工持股计划的最低锁定期为4年。",
        "faithfulness": 0.22, "answer_relevancy": 0.65, "context_recall": 0.18, "context_precision": 0.30,
    },
    {
        "sample_id": "kba-009", "language": "en", "difficulty": "hard", "question_type": "comparison",
        "question": "What is the difference in notice period between voluntary and involuntary termination?",
        "contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."],
        "answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.",
        "ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.",
        "faithfulness": 0.35, "answer_relevancy": 0.70, "context_recall": 0.40, "context_precision": 0.33,
    },
    {
        "sample_id": "kba-010", "language": "zh", "difficulty": "medium", "question_type": "fact",
        "question": "公司规定的标准工作时间是每周多少小时？",
        "contexts": ["标准工作时间为每周40小时。", "加班需事先申请。"],
        "answer": "公司标准工作时间为每周40小时。",
        "ground_truth": "公司标准工作时间为每周40小时。",
        "faithfulness": 0.99, "answer_relevancy": 0.96, "context_recall": 0.95, "context_precision": 0.90,
    },
 ]
 # Two samples that failed normalization, to exercise the invalid count display.
 INVALID_SAMPLES = [
    {"sample_id": "kba-011", "error": "missing ground_truth", "question": "公司年会在什么时候举办？"},
    {"sample_id": "kba-012", "error": "empty contexts after retrieval", "question": "停车场如何申请月卡？"},
 ]
 def _output_dir() -> Path:
    """Build the path of the run directory that receives the sample artifacts."""
    outputs_root = REPO_ROOT / "outputs"
    return outputs_root / SCENARIO_NAME / RUN_ID
 def _write_scores_csv(path: Path) -> None:
    """Write scores.csv with sample fields, metric scores, and metadata columns.

    One row is written per entry of ``SAMPLES``.  ``contexts`` is serialized as
    a JSON list, ``error`` is left empty, and the run-level columns
    (``scenario``, ``judge_model``, ``embedding_model``, ``run_id``) are filled
    from the module constants.

    Args:
        path: Destination file; it is created or overwritten.
    """
    fieldnames = [
        "sample_id", "question", "contexts", "answer", "ground_truth",
        "scenario", "language", "difficulty", "question_type",
        *METRICS, "error", "judge_model", "embedding_model", "run_id",
    ]
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for sample in SAMPLES:
            row = {
                "sample_id": sample["sample_id"],
                "question": sample["question"],
                # Serialize contexts as a JSON list, matching engine CSV output.
                "contexts": json.dumps(sample["contexts"], ensure_ascii=False),
                "answer": sample["answer"],
                "ground_truth": sample["ground_truth"],
                "scenario": SCENARIO_NAME,
                "language": sample["language"],
                "difficulty": sample["difficulty"],
                "question_type": sample["question_type"],
                "error": "",
                "judge_model": JUDGE_MODEL,
                "embedding_model": EMBEDDING_MODEL,
                # The run identifier, not the scenario name: this column must
                # agree with the run_id recorded in metadata.json.
                "run_id": RUN_ID,
            }
            row.update({metric: sample[metric] for metric in METRICS})
            writer.writerow(row)
 def _write_invalid_csv(path: Path) -> None:
    """Dump the unscored samples from ``INVALID_SAMPLES`` into invalid.csv."""
    columns = ["sample_id", "error", "question"]
    with path.open("w", encoding="utf-8", newline="") as out:
        invalid_writer = csv.DictWriter(out, fieldnames=columns)
        invalid_writer.writeheader()
        for record in INVALID_SAMPLES:
            invalid_writer.writerow(record)
 def _metric_mean(metric: str) -> float:
    """Return the average of ``metric`` over ``SAMPLES``, rounded to 4 decimals."""
    total = sum(entry[metric] for entry in SAMPLES)
    average = total / len(SAMPLES)
    return round(average, 4)
 def _write_metadata(path: Path) -> None:
    """Serialize the run-level metadata record to ``path`` as indented JSON."""
    valid_count = len(SAMPLES)
    invalid_count = len(INVALID_SAMPLES)
    # Key order is significant: it is the order the keys appear in the file.
    record = dict(
        run_id=RUN_ID,
        scenario_name=SCENARIO_NAME,
        mode="offline",
        judge_model=JUDGE_MODEL,
        embedding_model=EMBEDDING_MODEL,
        started_at="2026-06-15T08:29:12+00:00",
        finished_at="2026-06-15T08:31:45+00:00",
        dataset="datasets/normalized/kba_knowledge_base_baseline.csv",
        valid_samples=valid_count,
        invalid_samples=invalid_count,
    )
    rendered = json.dumps(record, ensure_ascii=False, indent=2)
    path.write_text(rendered, encoding="utf-8")
 def _write_summary(path: Path) -> None:
    """Render summary.md: a run header followed by one mean line per metric."""
    valid_count = len(SAMPLES)
    invalid_count = len(INVALID_SAMPLES)
    header = [
        f"# {SCENARIO_NAME}",
        "",
        f"- run_id: `{RUN_ID}`",
        "- mode: `offline`",
        f"- total_samples: `{valid_count + invalid_count}`",
        f"- valid_samples: `{valid_count}`",
        f"- invalid_samples: `{invalid_count}`",
        f"- judge_model: `{JUDGE_MODEL}`",
        "",
        "## Metric Means",
        "",
    ]
    # One bullet per metric, with the mean formatted to four decimals.
    metric_lines = [f"- {name}: `{_metric_mean(name):.4f}`" for name in METRICS]
    document = "\n".join(header + metric_lines) + "\n"
    path.write_text(document, encoding="utf-8")
 def _write_scenario_snapshot(path: Path) -> None:
    """Dump the scenario settings, including the metric list, as YAML to ``path``."""
    import yaml
    # Keyword order is kept in the output because sort_keys is disabled.
    snapshot = dict(
        scenario_name=SCENARIO_NAME,
        mode="offline",
        judge_model=JUDGE_MODEL,
        embedding_model=EMBEDDING_MODEL,
        metrics=METRICS,
    )
    rendered = yaml.safe_dump(snapshot, sort_keys=False, allow_unicode=True)
    path.write_text(rendered, encoding="utf-8")
 def main() -> None:
    """Create the run directory and populate it with every sample artifact."""
    target = _output_dir()
    target.mkdir(parents=True, exist_ok=True)
    # Artifact file name paired with the writer that produces it, in write order.
    artifact_writers = (
        ("scores.csv", _write_scores_csv),
        ("invalid.csv", _write_invalid_csv),
        ("metadata.json", _write_metadata),
        ("summary.md", _write_summary),
        ("scenario.snapshot.yaml", _write_scenario_snapshot),
    )
    for filename, write_artifact in artifact_writers:
        write_artifact(target / filename)
    print(f"Sample run written to: {target}")
    print("Start the console with: python webmain.py")
 if __name__ == "__main__":
    main()
--- a/scripts/smoke_advisor.py
+++ b/scripts/smoke_advisor.py
@@ -0,0 +1,59 @@
 """Offline smoke-check for the advisor module wiring (no network required)."""
 import math
 import sys
 import tempfile
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from rag_eval.advisor.rules import diagnose
 from rag_eval.advisor.writer import write_advice, _format_log_summary
 # Simulate score_rows with low faithfulness and high noise_sensitivity.
 # Five synthetic rows: faithfulness climbs from about 0.30 to 0.50 and
 # noise_sensitivity from about 0.40 to 0.48, while context_recall and
 # semantic_similarity are constant across rows.
 rows = [
    {
        "sample_id": f"s{i}",
        "question": f"问题{i}：西门子CT扫描的Flash技术原理是什么？",
        "answer": f"答案{i}：Flash技术采用双源CT扫描",
        "ground_truth": f"标准答案{i}：Flash扫描利用双源CT和大螺距实现超低辐射剂量扫描",
        "faithfulness": 0.3 + i * 0.05,
        "noise_sensitivity": 0.4 + i * 0.02,
        "context_recall": 0.75,
        "semantic_similarity": 0.65,
    }
    for i in range(5)
 ]
 # Run the rule-based diagnosis over the four metrics present in the rows.
 diags = diagnose(rows, metrics=["faithfulness", "noise_sensitivity", "context_recall", "semantic_similarity"])
 print(f"Diagnosed {len(diags)} metric(s):")
 for d in diags:
    print(f"  {d.metric}: mean={d.mean_score}, severity={d.severity}, low_samples={len(d.low_samples)}")
 # The smoke check expects at least faithfulness and noise_sensitivity to be
 # reported for this data.
 assert len(diags) >= 2, f"Expected at least 2 diagnoses, got {len(diags)}"
 metrics_hit = {d.metric for d in diags}
 assert "faithfulness" in metrics_hit, "faithfulness should be triggered"
 assert "noise_sensitivity" in metrics_hit, "noise_sensitivity should be triggered"
 # Write the advice report into a throwaway directory and verify that it
 # mentions the scenario name and both triggered metrics.
 with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "optimization_advice.md"
    write_advice(
        diagnoses=diags,
        llm_markdown="",  # fallback mode (no LLM)
        advice_path=path,
        scenario_name="smoke-test-siemens",
        run_id="2026-06-16T00-00-00",
        judge_model="deepseek-v4-flash",
    )
    content = path.read_text(encoding="utf-8")
    assert "smoke-test-siemens" in content, "scenario name missing from report"
    assert "faithfulness" in content, "faithfulness missing from report"
    assert "noise_sensitivity" in content, "noise_sensitivity missing from report"
    print(f"\nAdvice file ({len(content)} chars) — assertions OK")
 # Verify log summary format: the summary text must contain the "触发诊断"
 # marker and name the faithfulness metric.
 summary = _format_log_summary(diags, Path("optimization_advice.md"))
 print(f"\nLog summary length: {len(summary)} chars, faithfulness present: {'faithfulness' in summary}")
 assert "触发诊断" in summary
 assert "faithfulness" in summary
 print("\nSmoke check PASSED")
--- a/siemens-ragas-project-overview.html
+++ b/siemens-ragas-project-overview.html
--- a/start.bat
+++ b/start.bat
@@ -0,0 +1,123 @@
@echo off
 setlocal
 echo.
 echo ============================================================
 echo   Siemens RAGAS Console  -  Starting...
 echo ============================================================
 echo.
 :: Change to the directory where this script lives (siemens_ragas/)
 cd /d "%~dp0"
 echo Working directory: %CD%
 echo.
 :: ----------------------------------------------------------------
 :: 1. Check Python
 :: ----------------------------------------------------------------
 python --version >nul 2>&1
 if errorlevel 1 (
    echo [ERROR] Python not found. Please install Python 3.12+ and add it to PATH.
    goto :error
 )
 for /f "tokens=*" %%v in ('python --version 2^>^&1') do echo [OK] %%v
 :: ----------------------------------------------------------------
 :: 2. Check FastAPI / uvicorn
 :: ----------------------------------------------------------------
 :: Probe with the same interpreter that will run the server. When the
 :: packages are missing, install them through "python -m pip" so they land
 :: in that interpreter's environment; a bare "pip" on PATH may belong to a
 :: different Python installation.
 python -c "import fastapi, uvicorn" >nul 2>&1
 if errorlevel 1 (
    echo [INFO] Installing fastapi and uvicorn...
    python -m pip install fastapi uvicorn --quiet
    if errorlevel 1 (
        echo [ERROR] Failed to install fastapi/uvicorn.
        echo         Run manually: python -m pip install fastapi uvicorn
        goto :error
    )
    echo [OK] fastapi and uvicorn installed.
 ) else (
    echo [OK] fastapi / uvicorn ready.
 )
 :: ----------------------------------------------------------------
 :: 3. Check ragas version
 :: ----------------------------------------------------------------
 :: The probe fails when ragas is missing or is not exactly version 0.4.3.
 :: Install through "python -m pip" so the package lands in the interpreter
 :: that was probed; a bare "pip" on PATH may belong to a different Python.
 :: A failed install is only a warning: the script continues to start the
 :: console.
 python -c "import ragas; assert ragas.__version__ == '0.4.3', ragas.__version__" >nul 2>&1
 if errorlevel 1 (
    echo [INFO] Installing ragas==0.4.3 ...
    python -m pip install "ragas==0.4.3" --quiet
    if errorlevel 1 (
        echo [WARN] ragas install failed. Dashboard still works; evaluation trigger will show an error.
    ) else (
        echo [OK] ragas 0.4.3 installed.
    )
 ) else (
    echo [OK] ragas 0.4.3 ready.
 )
 :: ----------------------------------------------------------------
 :: 4. Ensure configs/ directory exists for LLM profile storage
 :: ----------------------------------------------------------------
 if not exist "configs" (
    mkdir configs
    echo [OK] Created configs/ directory for LLM profile storage.
 ) else (
    echo [OK] configs/ directory ready.
 )
 :: ----------------------------------------------------------------
 :: 5. Seed demo data if no runs exist yet
 :: ----------------------------------------------------------------
 if not exist "outputs\kba-knowledge-base-offline-baseline" (
    echo [INFO] No run data found. Generating demo data...
    python scripts\seed_sample_run.py
    if errorlevel 1 (
        echo [WARN] Demo data generation failed. Dashboard may be empty.
    ) else (
        echo [OK] Demo data generated.
    )
 ) else (
    echo [OK] Run data found, skipping demo generation.
 )
 :: ----------------------------------------------------------------
 :: 6. Pick an available port
 :: ----------------------------------------------------------------
 set PORT=8800
 netstat -ano 2>nul | findstr ":8800" | findstr "LISTENING" >nul 2>&1
 if not errorlevel 1 (
    echo [WARN] Port 8800 in use, trying 8801...
    set PORT=8801
    netstat -ano 2>nul | findstr ":8801" | findstr "LISTENING" >nul 2>&1
    if not errorlevel 1 (
        echo [ERROR] Ports 8800 and 8801 are both in use.
        echo         Run manually: python webmain.py --port 8802
        goto :error
    )
 )
 echo.
 echo ============================================================
 echo   Console URL : http://127.0.0.1:%PORT%
 echo   Press Ctrl+C to stop the server
 echo ============================================================
 echo.
 :: Open browser after 2-second delay (non-blocking)
 start /b cmd /c "timeout /t 2 >nul && start http://127.0.0.1:%PORT%"
 :: Launch uvicorn (blocking — window stays open while server runs)
 python webmain.py --host 127.0.0.1 --port %PORT%
 echo.
 echo Server stopped.
 pause
 exit /b 0
 :error
 echo.
 echo ============================================================
 echo   Startup failed. See error above.
 echo ============================================================
 pause
 exit /b 1
--- a/start.ps1
+++ b/start.ps1
@@ -0,0 +1,121 @@
 # start.ps1 — Siemens RAGAS Console launcher for Windows PowerShell
 # Usage: Right-click -> "Run with PowerShell", or: powershell -ExecutionPolicy Bypass -File start.ps1
 $ErrorActionPreference = "Stop"
 Set-Location $PSScriptRoot
 Write-Host ""
 Write-Host "============================================================" -ForegroundColor Cyan
 Write-Host "  Siemens RAGAS Console  -  Starting..." -ForegroundColor Cyan
 Write-Host "============================================================" -ForegroundColor Cyan
 Write-Host ""
 Write-Host "Working directory: $PSScriptRoot"
 Write-Host ""
 # ----------------------------------------------------------------
 # 1. Check Python
 # ----------------------------------------------------------------
 try {
    $pyver = & python --version 2>&1
    Write-Host "[OK] $pyver" -ForegroundColor Green
 } catch {
    Write-Host "[ERROR] Python not found. Please install Python 3.12+ and add to PATH." -ForegroundColor Red
    Read-Host "Press Enter to exit"
    exit 1
 }
 # ----------------------------------------------------------------
 # 2. Check FastAPI / uvicorn
 # ----------------------------------------------------------------
 # The import probe writes a traceback to stderr when a package is missing.
 # In Windows PowerShell 5.1, redirecting a native command's stderr with 2>&1
 # while $ErrorActionPreference is "Stop" turns that output into a terminating
 # error, which would abort the script before the install fallback can run.
 # Relax the preference for the probe only and restore it right afterwards.
 $savedErrorPreference = $ErrorActionPreference
 $ErrorActionPreference = "Continue"
 $check = & python -c "import fastapi, uvicorn" 2>&1
 $probeExitCode = $LASTEXITCODE
 $ErrorActionPreference = $savedErrorPreference
 if ($probeExitCode -ne 0) {
    Write-Host "[INFO] Installing fastapi and uvicorn..." -ForegroundColor Yellow
    # "python -m pip" installs into the interpreter that was just probed;
    # a bare "pip" on PATH may belong to a different Python installation.
    & python -m pip install fastapi uvicorn --quiet
    if ($LASTEXITCODE -ne 0) {
        Write-Host "[ERROR] Failed to install fastapi/uvicorn. Run: python -m pip install fastapi uvicorn" -ForegroundColor Red
        Read-Host "Press Enter to exit"
        exit 1
    }
    Write-Host "[OK] fastapi / uvicorn installed." -ForegroundColor Green
 } else {
    Write-Host "[OK] fastapi / uvicorn ready." -ForegroundColor Green
 }
 # ----------------------------------------------------------------
 # 3. Check ragas version
 # ----------------------------------------------------------------
 # The probe fails (and prints to stderr) when ragas is missing or is not
 # exactly version 0.4.3. In Windows PowerShell 5.1, 2>&1 on a native command
 # combined with $ErrorActionPreference = "Stop" would turn that stderr output
 # into a terminating error, so relax the preference for the probe only.
 $savedErrorPreference = $ErrorActionPreference
 $ErrorActionPreference = "Continue"
 $check = & python -c "import ragas; assert ragas.__version__ == '0.4.3', ragas.__version__" 2>&1
 $probeExitCode = $LASTEXITCODE
 $ErrorActionPreference = $savedErrorPreference
 if ($probeExitCode -ne 0) {
    Write-Host "[INFO] Installing ragas==0.4.3 (evaluation engine)..." -ForegroundColor Yellow
    # "python -m pip" installs into the interpreter that was just probed;
    # a bare "pip" on PATH may belong to a different Python installation.
    & python -m pip install "ragas==0.4.3" --quiet
    if ($LASTEXITCODE -ne 0) {
        # Not fatal: the script continues to start the console.
        Write-Host "[WARN] ragas install failed. Dashboard works; evaluation trigger will show error." -ForegroundColor Yellow
    } else {
        Write-Host "[OK] ragas 0.4.3 installed." -ForegroundColor Green
    }
 } else {
    Write-Host "[OK] ragas 0.4.3 ready." -ForegroundColor Green
 }
 # ----------------------------------------------------------------
 # 4. Ensure configs/ directory exists for LLM profile storage
 # ----------------------------------------------------------------
 if (-not (Test-Path "configs")) {
    New-Item -ItemType Directory "configs" | Out-Null
    Write-Host "[OK] Created configs/ directory for LLM profile storage." -ForegroundColor Green
 } else {
    Write-Host "[OK] configs/ directory ready." -ForegroundColor Green
 }
 # ----------------------------------------------------------------
 # 5. Seed demo data if missing
 # ----------------------------------------------------------------
 if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
    Write-Host "[INFO] No run data found. Generating demo data..." -ForegroundColor Yellow
    & python scripts\seed_sample_run.py
    if ($LASTEXITCODE -ne 0) {
        Write-Host "[WARN] Demo data generation failed. Dashboard may be empty." -ForegroundColor Yellow
    } else {
        Write-Host "[OK] Demo data generated." -ForegroundColor Green
    }
 } else {
    Write-Host "[OK] Run data found, skipping demo generation." -ForegroundColor Green
 }
 # ----------------------------------------------------------------
 # 6. Pick an available port
 # ----------------------------------------------------------------
 $PORT = 8800
 $inUse = netstat -ano 2>$null | Select-String ":$PORT\s" | Select-String "LISTENING"
 if ($inUse) {
    Write-Host "[WARN] Port $PORT in use, trying 8801..." -ForegroundColor Yellow
    $PORT = 8801
    $inUse = netstat -ano 2>$null | Select-String ":$PORT\s" | Select-String "LISTENING"
    if ($inUse) {
        Write-Host "[ERROR] Ports 8800 and 8801 are both in use." -ForegroundColor Red
        Write-Host "        Run manually: python webmain.py --port 8802"
        Read-Host "Press Enter to exit"
        exit 1
    }
 }
 Write-Host ""
 Write-Host "============================================================" -ForegroundColor Cyan
 Write-Host "  Console URL : http://127.0.0.1:$PORT" -ForegroundColor Green
 Write-Host "  Press Ctrl+C to stop the server" -ForegroundColor Cyan
 Write-Host "============================================================" -ForegroundColor Cyan
 Write-Host ""
 # Open browser after 2-second delay
 Start-Job -ScriptBlock {
    param($port)
    Start-Sleep 2
    Start-Process "http://127.0.0.1:$port"
 } -ArgumentList $PORT | Out-Null
 # Launch uvicorn (blocking)
 & python webmain.py --host 127.0.0.1 --port $PORT
 Write-Host ""
 Write-Host "Server stopped."
 Read-Host "Press Enter to exit"
--- a/start.sh
+++ b/start.sh
@@ -0,0 +1,94 @@
 #!/usr/bin/env bash
 # start.sh — start the Siemens RAGAS web service (runs in the background)
 # Prerequisite: deploy.sh has already been run (.venv and dependencies are ready)
 # Usage: bash start.sh
 set -euo pipefail
 # Always operate from the directory that contains this script.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR"
 # ── Colored output ────────────────────────────────────────────────
 # Colors stay empty unless stdout is a terminal.
 GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
 if [ -t 1 ]; then
    GREEN='\033[0;32m'
    YELLOW='\033[1;33m'
    RED='\033[0;31m'
    CYAN='\033[0;36m'
    NC='\033[0m'
 fi
 # Log helpers: ok/warn write to stdout, err writes to stderr.
 ok() {
    echo -e "${GREEN}[OK]${NC}    $*"
 }
 warn() {
    echo -e "${YELLOW}[WARN]${NC}  $*"
 }
 err() {
    echo -e "${RED}[ERROR]${NC} $*" >&2
 }
 # Startup banner
 echo ""
 echo -e "${CYAN}============================================================${NC}"
 echo -e "${CYAN}  Siemens RAGAS Console  —  启动服务${NC}"
 echo -e "${CYAN}============================================================${NC}"
 echo ""
 # Check the virtual environment: the interpreter must exist before anything else runs.
 PYTHON=".venv/bin/python"
 if [ ! -f "$PYTHON" ]; then
    err "未找到 .venv，请先执行部署：bash deploy.sh"
    exit 1
 fi
 # Check .env: a missing file or a leftover placeholder only produces warnings.
 if [ ! -f ".env" ]; then
    warn ".env 不存在，请先复制并编辑配置："
    warn "  cp .env.example .env && nano .env"
 fi
 if grep -q "your-api-key" .env 2>/dev/null; then
    warn ".env 中仍包含默认占位符，部分功能（评估执行）将不可用"
 fi
 # Check whether a server started earlier is still running.
 if [ -f ".server.pid" ]; then
    EXISTING_PID="$(cat .server.pid)"
    if kill -0 "$EXISTING_PID" 2>/dev/null; then
        warn "服务已在运行 (PID=$EXISTING_PID)，无需重复启动"
        warn "如需重启请先执行：bash stop.sh"
        exit 0
    fi
    # Stale PID file left behind by a dead process: clean it up.
    rm -f .server.pid
 fi
 # Create required directories
 mkdir -p logs
 # Port detection
 # port_in_use PORT — succeed when `ss` or `netstat` lists a listening TCP
 # socket on PORT. The listing is captured before it is searched: with
 # `set -o pipefail`, piping straight into `grep -q` can report a busy port
 # as free, because grep exits on its first match and the producer may then
 # die from SIGPIPE, failing the whole pipeline.
 port_in_use() {
    local tool listing
    for tool in ss netstat; do
        # A missing tool simply yields an empty listing.
        listing="$("$tool" -tlnp 2>/dev/null || true)"
        if grep -q ":$1 " <<<"$listing"; then
            return 0
        fi
    done
    return 1
 }
 PORT=8800
 if port_in_use "$PORT"; then
    warn "端口 $PORT 已被占用，尝试 8801..."
    PORT=8801
    if port_in_use "$PORT"; then
        err "端口 8800 和 8801 均被占用，请手动指定端口："
        err "  .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
        exit 1
    fi
 fi
 # Launch in the background; stdout and stderr are appended to logs/server.log.
 nohup "$PYTHON" webmain.py --host 0.0.0.0 --port "$PORT" >> logs/server.log 2>&1 &
 SERVER_PID=$!
 echo "$SERVER_PID" > .server.pid
 # Wait 3 seconds, then verify the process is still alive.
 sleep 3
 if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    err "服务启动失败，请查看日志："
    err "  tail -20 logs/server.log"
    rm -f .server.pid
    exit 1
 fi
 ok "服务已启动 (PID=$SERVER_PID)"
 echo ""
 echo -e "${CYAN}  访问地址: http://$(hostname -I | awk '{print $1}'):${PORT}${NC}"
 echo -e "${CYAN}  本机访问: http://127.0.0.1:${PORT}${NC}"
 echo -e "${CYAN}  查看日志: tail -f logs/server.log${NC}"
 echo -e "${CYAN}  停止服务: bash stop.sh${NC}"
 echo ""
--- a/stop.sh
+++ b/stop.sh
@@ -0,0 +1,68 @@
 #!/usr/bin/env bash
 # stop.sh — stop the Siemens RAGAS background web service
 # Usage: bash stop.sh
 set -uo pipefail
 # Always operate from the directory that contains this script.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR"
 # ── Colored output ────────────────────────────────────────────────
 # Colors stay empty unless stdout is a terminal.
 GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
 if [ -t 1 ]; then
    GREEN='\033[0;32m'
    YELLOW='\033[1;33m'
    RED='\033[0;31m'
    CYAN='\033[0;36m'
    NC='\033[0m'
 fi
 # Log helpers: ok/warn write to stdout, err writes to stderr.
 ok() {
    echo -e "${GREEN}[OK]${NC}    $*"
 }
 warn() {
    echo -e "${YELLOW}[WARN]${NC}  $*"
 }
 err() {
    echo -e "${RED}[ERROR]${NC} $*" >&2
 }
 echo ""
 echo -e "${CYAN}  Siemens RAGAS Console  —  停止服务${NC}"
 echo ""
 PID_FILE="$SCRIPT_DIR/.server.pid"
 # Nothing to do when no PID file was recorded.
 if [ ! -f "$PID_FILE" ]; then
    warn "未找到 .server.pid，服务可能未启动或已停止"
    exit 0
 fi
 PID=$(cat "$PID_FILE")
 # The recorded process is already gone: only the stale PID file needs removing.
 if ! kill -0 "$PID" 2>/dev/null; then
    warn "进程 $PID 已不存在，清理 PID 文件"
    rm -f "$PID_FILE"
    exit 0
 fi
 # Graceful stop (SIGTERM)
 echo -e "  正在停止进程 (PID=$PID)..."
 kill "$PID" 2>/dev/null || true
 # Wait up to 5 seconds
 for i in 1 2 3 4 5; do
    sleep 1
    if ! kill -0 "$PID" 2>/dev/null; then
        break
    fi
    echo -e "  等待进程退出... ($i/5)"
 done
 # Force-kill when the process is still alive
 if kill -0 "$PID" 2>/dev/null; then
    warn "进程未响应，强制终止 (SIGKILL)..."
    kill -9 "$PID" 2>/dev/null || true
    sleep 1
 fi
 # Keep the PID file when the process survived, so the still-running server
 # remains tracked; remove it only once the process is confirmed gone.
 if kill -0 "$PID" 2>/dev/null; then
    err "无法停止进程 $PID，请手动执行：kill -9 $PID"
    exit 1
 fi
 rm -f "$PID_FILE"
 ok "服务已停止"
 echo ""
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/test_advisor_rules.py
+++ b/tests/test_advisor_rules.py
@@ -0,0 +1,100 @@
 import math
 import unittest
 from rag_eval.advisor.rules import Diagnosis, diagnose, METRIC_RULES
 class TestDiagnosis(unittest.TestCase):
    """Threshold and severity behaviour of ``diagnose`` for individual metrics."""
    def _make_rows(self, metric: str, scores: list[float]) -> list[dict]:
        """Build one score row per value, keyed by ``metric`` plus placeholder sample fields."""
        return [
            {
                metric: score,
                "question": f"q{idx}",
                "answer": f"a{idx}",
                "ground_truth": f"gt{idx}",
                "sample_id": f"s{idx}",
            }
            for idx, score in enumerate(scores)
        ]
    def test_no_diagnosis_when_all_scores_above_threshold(self):
        # Mean exactly 0.85 should NOT trigger any diagnosis (< 0.85 is the condition).
        rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
        result = diagnose(rows, metrics=["faithfulness"])
        self.assertEqual(result, [])
    def test_no_diagnosis_when_mean_above_advisory_threshold(self):
        rows = self._make_rows("answer_relevancy", [0.9, 0.92, 0.88])
        result = diagnose(rows, metrics=["answer_relevancy"])
        self.assertEqual(result, [])
    def test_low_severity_when_mean_below_advisory_threshold(self):
        # Score between warning_threshold (0.7) and advisory_threshold (0.85) → "low"
        rows = self._make_rows("faithfulness", [0.78, 0.80, 0.82])
        result = diagnose(rows, metrics=["faithfulness"])
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].severity, "low")
        self.assertAlmostEqual(result[0].threshold, 0.85, places=2)
    def test_low_severity_answer_relevancy_at_0_84(self):
        rows = self._make_rows("answer_relevancy", [0.84, 0.84, 0.84])
        result = diagnose(rows, metrics=["answer_relevancy"])
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].severity, "low")
    def test_low_severity_has_root_causes_and_actions(self):
        rows = self._make_rows("context_precision", [0.75, 0.76, 0.77])
        result = diagnose(rows, metrics=["context_precision"])
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].severity, "low")
        self.assertGreater(len(result[0].root_causes), 0)
        self.assertGreater(len(result[0].suggested_actions), 0)
    def test_warning_when_mean_below_warning_threshold(self):
        rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
        result = diagnose(rows, metrics=["faithfulness"])
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].metric, "faithfulness")
        self.assertEqual(result[0].severity, "warning")
        self.assertAlmostEqual(result[0].mean_score, 0.65, places=2)
    def test_critical_when_mean_below_critical_threshold(self):
        rows = self._make_rows("faithfulness", [0.3, 0.4, 0.45])
        result = diagnose(rows, metrics=["faithfulness"])
        # Guard the index below so an empty result fails the test instead of raising IndexError.
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].severity, "critical")
    def test_low_samples_selected_are_bottom_three(self):
        rows = self._make_rows("faithfulness", [0.1, 0.2, 0.3, 0.8, 0.9])
        result = diagnose(rows, metrics=["faithfulness"])
        # Guard the index below so an empty result fails the test instead of raising IndexError.
        self.assertEqual(len(result), 1)
        self.assertEqual(len(result[0].low_samples), 3)
        scores = [s["faithfulness"] for s in result[0].low_samples]
        self.assertEqual(sorted(scores), [0.1, 0.2, 0.3])
    def test_nan_scores_excluded_from_mean_and_low_samples(self):
        rows = self._make_rows("faithfulness", [0.3, float("nan"), 0.4])
        result = diagnose(rows, metrics=["faithfulness"])
        self.assertEqual(len(result), 1)
        for s in result[0].low_samples:
            self.assertFalse(math.isnan(s["faithfulness"]))
    def test_noise_sensitivity_direction_inverted(self):
        # noise_sensitivity: higher is worse; threshold > 0.3 is warning
        rows = self._make_rows("noise_sensitivity", [0.4, 0.45, 0.5])
        result = diagnose(rows, metrics=["noise_sensitivity"])
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].metric, "noise_sensitivity")
    def test_noise_sensitivity_no_diagnosis_when_low(self):
        rows = self._make_rows("noise_sensitivity", [0.1, 0.15, 0.2])
        result = diagnose(rows, metrics=["noise_sensitivity"])
        self.assertEqual(result, [])
    def test_skips_metric_not_in_rows(self):
        # Only "faithfulness" is present in the row; "context_recall" is requested but absent.
        rows = [{"faithfulness": 0.3, "question": "q", "answer": "a",
                 "ground_truth": "gt", "sample_id": "s1"}]
        result = diagnose(rows, metrics=["faithfulness", "context_recall"])
        metrics_found = [d.metric for d in result]
        self.assertIn("faithfulness", metrics_found)
        self.assertNotIn("context_recall", metrics_found)
    def test_all_seven_metrics_have_rules(self):
        expected = {"faithfulness", "answer_relevancy", "context_recall",
                    "context_precision", "noise_sensitivity",
                    "factual_correctness", "semantic_similarity"}
        self.assertEqual(set(METRIC_RULES.keys()), expected)
 # Allow running this test module directly as a script.
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_advisor_writer.py
+++ b/tests/test_advisor_writer.py
@@ -0,0 +1,113 @@
 import shutil
 import unittest
 from pathlib import Path
 from rag_eval.advisor.rules import Diagnosis
 from rag_eval.advisor.writer import write_advice, _format_log_summary
 class TestWriteAdvice(unittest.TestCase):
    """File-output tests for ``write_advice`` and ``_format_log_summary``."""
    def setUp(self):
        # Every test starts from an empty scratch directory.
        self.tmp = Path("tests/.tmp/test_advisor_writer")
        shutil.rmtree(self.tmp, ignore_errors=True)
        self.tmp.mkdir(parents=True, exist_ok=True)
        self.advice_path = self.tmp / "optimization_advice.md"
    def tearDown(self):
        shutil.rmtree(self.tmp, ignore_errors=True)
    def _make_diagnosis(self, metric="faithfulness", severity="warning"):
        """Return a Diagnosis for ``metric`` with fixed scores and a single low sample."""
        low_sample = {
            "sample_id": "s1",
            "question": "问题1",
            "answer": "答案1",
            "ground_truth": "标准1",
            metric: 0.4,
        }
        return Diagnosis(
            metric=metric,
            mean_score=0.55,
            threshold=0.7,
            severity=severity,
            root_causes=["原因1", "原因2"],
            suggested_actions=["建议1", "建议2"],
            low_samples=[low_sample],
        )
    def _write(self, diagnoses, llm_markdown, scenario_name="test", run_id="rid", judge_model="model"):
        """Call ``write_advice`` targeting this test's advice file."""
        write_advice(
            diagnoses=diagnoses,
            llm_markdown=llm_markdown,
            advice_path=self.advice_path,
            scenario_name=scenario_name,
            run_id=run_id,
            judge_model=judge_model,
        )
    def _read_advice(self):
        """Return the advice file's text."""
        return self.advice_path.read_text(encoding="utf-8")
    def test_write_creates_file(self):
        self._write(
            [self._make_diagnosis()],
            "## faithfulness\n\nLLM 建议内容",
            scenario_name="test-scenario",
            run_id="2026-01-01T00-00-00",
            judge_model="deepseek-v4-flash",
        )
        self.assertTrue(self.advice_path.exists())
    def test_write_contains_scenario_name_and_run_id(self):
        self._write(
            [self._make_diagnosis()],
            "## faithfulness\n\nLLM 建议",
            scenario_name="siemens-test",
            run_id="2026-01-01T00-00-00",
            judge_model="deepseek-v4-flash",
        )
        content = self._read_advice()
        self.assertIn("siemens-test", content)
        self.assertIn("2026-01-01T00-00-00", content)
    def test_write_contains_llm_markdown(self):
        self._write([self._make_diagnosis()], "## faithfulness\n\n具体建议文本")
        self.assertIn("具体建议文本", self._read_advice())
    def test_write_fallback_when_no_llm_markdown(self):
        """When llm_markdown is empty, writer emits rule-only report."""
        self._write([self._make_diagnosis()], "")
        content = self._read_advice()
        self.assertIn("faithfulness", content)
        self.assertIn("原因1", content)
    def test_log_summary_format(self):
        diags = [
            self._make_diagnosis("faithfulness", "critical"),
            self._make_diagnosis("context_recall", "warning"),
        ]
        summary = _format_log_summary(diags, self.advice_path)
        # "critical" and "warning" are expected to appear as their Chinese labels.
        for expected in ("faithfulness", "严重", "context_recall", "警告"):
            self.assertIn(expected, summary)
    def test_write_empty_diagnoses_still_creates_file(self):
        self._write([], "")
        self.assertTrue(self.advice_path.exists())
        self.assertIn("未发现明显指标异常", self._read_advice())
 # Allow running this test module directly as a script.
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_metric_presenter.py
+++ b/tests/test_metric_presenter.py
@@ -0,0 +1,68 @@
 from __future__ import annotations
 import subprocess
 from pathlib import Path
 REPO_ROOT = Path(__file__).resolve().parents[1]
 def _run_node(script: str) -> str:
    """Run ``script`` with ``node -e`` from the repository root and return trimmed stdout.

    ``check=True`` makes a non-zero Node exit raise ``subprocess.CalledProcessError``.
    """
    node_command = ["node", "-e", script]
    proc = subprocess.run(
        node_command,
        cwd=REPO_ROOT,
        capture_output=True,
        text=True,
        encoding="utf-8",
        check=True,
    )
    return proc.stdout.strip()
 def test_metric_presenter_applies_thresholds_and_noise_direction() -> None:
    """MetricPresenter should centralize thresholds and inverse noise semantics."""
    # as_posix() keeps the path free of backslashes inside the JS string literal.
    metric_js = (REPO_ROOT / "webapp" / "static" / "js" / "metric_presenter.js").as_posix()
    # Load the browser script into a VM sandbox with a stub ``window`` and dump
    # a handful of presenter results as JSON.
    script = f"""
 const fs = require("fs");
 const vm = require("vm");
 const code = fs.readFileSync("{metric_js}", "utf8");
 const sandbox = {{ window: {{}}, console }};
 vm.runInNewContext(code, sandbox);
 const p = sandbox.window.MetricPresenter;
 const result = {{
  faith085: p.scoreClass("faithfulness", 0.85),
  faith070: p.scoreClass("faithfulness", 0.70),
  faith064: p.scoreClass("faithfulness", 0.64),
  noise010: p.scoreClass("noise_sensitivity", 0.10),
  noise030: p.scoreClass("noise_sensitivity", 0.30),
  noise050: p.scoreClass("noise_sensitivity", 0.50),
  desc: p.describeMetric("faithfulness"),
  noiseDesc: p.describeMetric("noise_sensitivity"),
  noiseBin: p.binColor("noise_sensitivity", 0.0),
  faithBin: p.binColor("faithfulness", 0.8)
 }};
 console.log(JSON.stringify(result));
 """
    output = _run_node(script)
    expected_fragments = (
        '"faith085":"good"',
        '"faith070":"warn"',
        '"faith064":"bad"',
        '"noise010":"good"',
        '"noise030":"warn"',
        '"noise050":"bad"',
        '"desc":"',
        '"noiseDesc":"',
        '"noiseBin":"#16a34a"',
        '"faithBin":"#16a34a"',
    )
    for fragment in expected_fragments:
        assert fragment in output
 def test_report_and_index_load_metric_presenter_helper() -> None:
    """The report page should use the shared helper for card descriptions and colors."""
    static_dir = REPO_ROOT / "webapp" / "static"
    js_dir = static_dir / "js"
    index_html = (static_dir / "index.html").read_text(encoding="utf-8")
    report_js = (js_dir / "report.js").read_text(encoding="utf-8")
    app_js = (js_dir / "app.js").read_text(encoding="utf-8")
    # Each asset must reference the shared MetricPresenter helper.
    assert "js/metric_presenter.js" in index_html
    assert "MetricPresenter.describeMetric" in report_js
    assert "MetricPresenter.scoreClass" in app_js
--- a/tests/test_offline_eval.py
+++ b/tests/test_offline_eval.py
@@ -80,6 +80,64 @@ class ScenarioAndDatasetTests(unittest.TestCase):
        self.assertTrue(scenario.dataset.path.name.endswith(".csv"))
        self.assertTrue(scenario.output_dir.name == "sample-offline-baseline")
    def test_load_scenario_metric_and_doc_weights(self) -> None:
        """load_scenario passes metric_weights and doc_weights into Scenario."""
        import os
        import tempfile
        import yaml
        from rag_eval.config.loader import load_scenario
        payload = {
            "scenario_name": "w-test",
            "mode": "offline",
            "dataset": "nonexistent.csv",
            "judge_model": "m",
            "embedding_model": "e",
            "metrics": ["faithfulness"],
            "output_dir": "out",
            "metric_weights": {"faithfulness": 0.7},
            "doc_weights": {"doc.pdf": 2.0},
        }
        # delete=False keeps the file on disk after the handle closes so it can
        # be loaded by path; it is removed explicitly in the finally block.
        with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", encoding="utf-8", delete=False) as f:
            yaml.dump(payload, f, allow_unicode=True)
            tmp_path = f.name
        try:
            scenario = load_scenario(tmp_path)
            # TestCase assertions are not stripped under ``python -O``.
            self.assertEqual(scenario.metric_weights, {"faithfulness": 0.7})
            self.assertEqual(scenario.doc_weights, {"doc.pdf": 2.0})
        finally:
            os.unlink(tmp_path)
    def test_load_scenario_defaults_to_empty_weights(self) -> None:
        """load_scenario defaults metric_weights and doc_weights to empty dicts."""
        import os
        import tempfile
        import yaml
        from rag_eval.config.loader import load_scenario
        # No weight keys in the payload at all.
        payload = {
            "scenario_name": "no-w",
            "mode": "offline",
            "dataset": "nonexistent.csv",
            "judge_model": "m",
            "embedding_model": "e",
            "metrics": ["faithfulness"],
            "output_dir": "out",
        }
        # delete=False keeps the file on disk after the handle closes so it can
        # be loaded by path; it is removed explicitly in the finally block.
        with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", encoding="utf-8", delete=False) as f:
            yaml.dump(payload, f, allow_unicode=True)
            tmp_path = f.name
        try:
            scenario = load_scenario(tmp_path)
            # TestCase assertions are not stripped under ``python -O``.
            self.assertEqual(scenario.metric_weights, {})
            self.assertEqual(scenario.doc_weights, {})
        finally:
            os.unlink(tmp_path)
    def test_scenario_snapshot_serializes_path_static_kwargs(self) -> None:
        scenario = load_scenario("scenarios/online/sample-pdf-question-bank-online.yaml")
        snapshot = scenario.snapshot()
@@ -125,6 +183,119 @@ class ScenarioAndDatasetTests(unittest.TestCase):
 class EvaluatorAndReportingTests(unittest.TestCase):
    def test_merge_score_includes_weighted_score_and_sample_weight(self):
        """_merge_score no longer adds weighted_score/sample_weight (feature disabled)."""
        # NOTE: the method name predates the change; the assertions below check
        # that the weighted fields are absent.
        from unittest.mock import MagicMock
        from rag_eval.execution.evaluator import Evaluator
        from rag_eval.shared.models import (
            MetricScore, NormalizedSample, RuntimeConfig, Scenario, DatasetConfig,
        )
        scenario = Scenario(
            scenario_name="w-test", mode="offline",
            dataset=DatasetConfig(path=Path("d.csv")),
            judge_model="m", embedding_model="e",
            metrics=["faithfulness", "context_recall"],
            output_dir=Path("out"),
            metric_weights={"faithfulness": 3.0, "context_recall": 1.0},
            doc_weights={"doc.pdf": 2.0},
        )
        evaluator = Evaluator(
            scenario=scenario,
            metric_pipeline=MagicMock(),
            app_adapter=None,
        )
        sample = NormalizedSample(
            sample_id="s1", question="q", contexts=["ctx"],
            answer="a", ground_truth="gt",
            metadata={"doc_name": "doc.pdf"},
        )
        score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
        row = evaluator._merge_score(sample, score)
        # The composite weighted score is temporarily disabled, so weighted_score
        # and sample_weight are no longer written to the row.
        self.assertNotIn("weighted_score", row)
        self.assertNotIn("sample_weight", row)
        self.assertEqual(row["faithfulness"], 1.0)
        self.assertEqual(row["context_recall"], 0.0)
    def test_summary_markdown_shows_weighted_score(self):
        """build_summary_markdown includes weighted_score when metric_weights set."""
        from rag_eval.reporting.summary import build_summary_markdown
        from rag_eval.shared.models import (
            EvaluationResult, NormalizedSample, DatasetConfig, Scenario,
        )
        from pathlib import Path
        scenario = Scenario(
            scenario_name="ws-test", mode="offline",
            dataset=DatasetConfig(path=Path("d.csv")),
            judge_model="m", embedding_model="e",
            metrics=["faithfulness"],
            output_dir=Path("out"),
            metric_weights={"faithfulness": 1.0},
            doc_weights={},
        )
        sample = NormalizedSample(
            sample_id="s1", question="q", contexts=["c"],
            answer="a", ground_truth="gt",
        )
        result = EvaluationResult(
            scenario=scenario, run_id="r1",
            started_at="2026-01-01T00:00:00", finished_at="2026-01-01T00:01:00",
            valid_samples=[sample], invalid_samples=[],
            score_rows=[{
                "sample_id": "s1", "faithfulness": 0.8,
                "weighted_score": 0.8, "sample_weight": 1.0,
                "doc_name": "", "error": "",
            }],
        )
        md = build_summary_markdown(result)
        # TestCase assertions are not stripped under ``python -O``.
        self.assertIn("weighted_score", md)
        self.assertIn("0.8000", md)
    def test_summary_markdown_hides_weighted_score_without_weights(self):
        """build_summary_markdown preserves unweighted summaries when no weights set."""
        # Imported locally so this test does not rely on a module-level import.
        from rag_eval.reporting.summary import build_summary_markdown
        from rag_eval.shared.models import DatasetConfig, EvaluationResult, NormalizedSample, Scenario
        scenario = Scenario(
            scenario_name="plain-test",
            mode="offline",
            dataset=DatasetConfig(path=Path("d.csv")),
            judge_model="m",
            embedding_model="e",
            metrics=["faithfulness"],
            output_dir=Path("out"),
            metric_weights={},
            doc_weights={},
        )
        sample = NormalizedSample(
            sample_id="s1",
            question="q",
            contexts=["c"],
            answer="a",
            ground_truth="gt",
        )
        # The row still carries weighted fields; only the scenario has no weights.
        result = EvaluationResult(
            scenario=scenario,
            run_id="r1",
            started_at="2026-01-01T00:00:00",
            finished_at="2026-01-01T00:01:00",
            valid_samples=[sample],
            invalid_samples=[],
            score_rows=[{
                "sample_id": "s1",
                "faithfulness": 0.8,
                "weighted_score": 0.8,
                "sample_weight": 1.0,
                "doc_name": "",
                "error": "",
            }],
        )
        md = build_summary_markdown(result)
        # TestCase assertions are not stripped under ``python -O``.
        self.assertNotIn("- **weighted_score", md)
    def test_metric_pipeline_scores_sample(self) -> None:
        pipeline = MetricPipeline(
            metrics={
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -0,0 +1,280 @@
 """Tests for the end-to-end pipeline API and pipeline task manager."""
 from __future__ import annotations
 import json
 import time
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 import pytest
 from fastapi.testclient import TestClient
 # ── fixtures ──────────────────────────────────────────────────────────────────
@pytest.fixture()
 def client(tmp_path, monkeypatch):
    """TestClient with a fresh PipelineTaskManager backed by tmp_path outputs."""
    import webapp.services.pipeline_task_manager as mgr_mod
    from webapp.services.pipeline_task_manager import PipelineTaskManager
    fresh_mgr = PipelineTaskManager(max_workers=2)
    monkeypatch.setattr(mgr_mod, "pipeline_task_manager", fresh_mgr)
    monkeypatch.setattr(mgr_mod, "_PIPELINE_OUTPUT_ROOT", tmp_path / "pipeline")
    import webapp.api.pipeline as api_mod
    monkeypatch.setattr(api_mod, "pipeline_task_manager", fresh_mgr)
    from webapp.server import create_app
    return TestClient(create_app())
 def _minimal_pdf_dir(tmp_path: Path) -> Path:
    """Create an empty ``pdfs`` directory under ``tmp_path`` and return its path."""
    pdf_dir = tmp_path / "pdfs"
    pdf_dir.mkdir()
    return pdf_dir
 def _mock_build_result(tmp_path: Path, job, run_id="r1"):
    """Build a fake DatasetBuildResult backed by minimal artifacts written under ``tmp_path``."""
    from rag_eval.dataset_builder.models import (
        DatasetBuildArtifactPaths,
        DatasetBuildResult,
        DraftQuestionSample,
    )
    build_root = tmp_path / "build"
    artifact_root = build_root / run_id
    artifact_root.mkdir(parents=True, exist_ok=True)
    latest = build_root / "latest"
    latest.mkdir(parents=True, exist_ok=True)
    # One source chunk, written to the run directory and mirrored into "latest".
    chunk_record = {
        "chunk_id": "c1", "doc_id": "d1", "doc_name": "test.pdf",
        "text": "CT scan context.", "page_start": 1, "page_end": 1,
        "section_path": "/", "section_title": "", "source_layout_ids": [],
    }
    chunks_path = artifact_root / "source_chunks.jsonl"
    chunks_path.write_text(json.dumps(chunk_record) + "\n", encoding="utf-8")
    (latest / "source_chunks.jsonl").write_text(chunks_path.read_text(encoding="utf-8"), encoding="utf-8")
    # Single-row dataset CSV matching the draft sample below.
    csv_header = (
        "sample_id,question,ground_truth,scenario,language,doc_id,doc_name,"
        "section_path,page_start,page_end,source_chunk_ids,question_type,difficulty,"
        "review_status,review_notes\n"
    )
    csv_row = (
        's1,"What is CT?","CT is imaging.","test","zh","d1","test.pdf","/",'
        '1,1,"[""c1""]","fact","easy","draft",""\n'
    )
    dataset_csv = tmp_path / "generated_dataset.csv"
    dataset_csv.write_text(csv_header + csv_row, encoding="utf-8")
    draft = DraftQuestionSample(
        sample_id="s1", question="What is CT?", ground_truth="CT is imaging.",
        scenario="test", language="zh", doc_id="d1", doc_name="test.pdf",
        section_path="/", page_start=1, page_end=1, source_chunk_ids=["c1"],
        question_type="fact", difficulty="easy",
    )
    # Only source_chunks.jsonl is actually written; the other paths are just named.
    paths = DatasetBuildArtifactPaths(
        root_dir=artifact_root,
        documents_jsonl=artifact_root / "documents.jsonl",
        semantic_blocks_jsonl=artifact_root / "semantic_blocks.jsonl",
        source_chunks_jsonl=chunks_path,
        dataset_draft_csv=artifact_root / "dataset_draft.csv",
        parse_failures_csv=artifact_root / "parse_failures.csv",
        metadata_json=artifact_root / "metadata.json",
    )
    return DatasetBuildResult(
        job=job,
        run_id=run_id,
        artifact_paths=paths,
        documents=[],
        draft_samples=[draft],
        parse_failures=[],
    )
 def _mock_eval_result(tmp_path: Path, scenario):
    """Build a fake EvaluationResult for ``scenario`` with no samples and no score rows."""
    from rag_eval.shared.models import EvaluationResult
    # ``tmp_path`` is accepted but not used here.
    fields = {
        "scenario": scenario,
        "run_id": "eval-r1",
        "started_at": "2026-01-01T00:00:00",
        "finished_at": "2026-01-01T00:01:00",
        "valid_samples": [],
        "invalid_samples": [],
        "score_rows": [],
    }
    return EvaluationResult(**fields)
 # ── API route tests ────────────────────────────────────────────────────────────
 def test_submit_returns_202_and_job_id(client, tmp_path):
    """POST /api/pipeline/jobs returns 202 with job_id immediately."""
    from webapp.models import PipelineResult
    docs_dir = _minimal_pdf_dir(tmp_path)
    fake_result = PipelineResult(
        build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
        source_chunks_jsonl="/tmp/c.jsonl", total_questions=1,
        parse_failures=0, eval_run_id="r1", eval_output_dir="/tmp/e",
        scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
    )
    request_body = {"docs_path": str(docs_dir), "job_name": "test-job"}
    # Stub out the pipeline body so the submission returns without doing real work.
    with patch(
        "webapp.services.pipeline_task_manager.PipelineTaskManager._execute",
        return_value=fake_result,
    ):
        response = client.post("/api/pipeline/jobs", json=request_body)
    assert response.status_code == 202
    payload = response.json()
    assert "job_id" in payload
    assert payload["job_name"] == "test-job"
    # status may already be completed by the time the response is read (mock runs instantly)
    assert payload["status"] in ("queued", "completed")
 def test_get_nonexistent_job_returns_404(client):
    """GET /api/pipeline/jobs/{id} returns 404 for unknown job."""
    response = client.get("/api/pipeline/jobs/doesnotexist")
    assert response.status_code == 404
 def test_list_jobs_returns_empty_initially(client):
    """GET /api/pipeline/jobs returns empty list when no jobs submitted."""
    response = client.get("/api/pipeline/jobs")
    assert response.status_code == 200
    listed = response.json()["jobs"]
    assert listed == []
 def test_job_status_polling(client, tmp_path):
    """Submitted job becomes visible via GET /api/pipeline/jobs/{id}."""
    pdf_dir = _minimal_pdf_dir(tmp_path)
    with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec:
        from webapp.models import PipelineResult
        mock_exec.return_value = PipelineResult(
            build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
            source_chunks_jsonl="/tmp/c.jsonl", total_questions=3,
            parse_failures=0, eval_run_id="r2", eval_output_dir="/tmp/e",
            scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
        )
        post_resp = client.post("/api/pipeline/jobs", json={"docs_path": str(pdf_dir)})
        job_id = post_resp.json()["job_id"]
        # Poll while the patch is still active: a job that starts late in the
        # background must hit the mock, not the real ``_execute``.
        # Poll until done or timeout (max 5s for mock)
        for _ in range(20):
            status_resp = client.get(f"/api/pipeline/jobs/{job_id}")
            assert status_resp.status_code == 200
            status = status_resp.json()
            if status["status"] in ("completed", "failed"):
                break
            time.sleep(0.25)
    assert status["status"] == "completed"
    assert status["result"]["total_questions"] == 3
 def test_job_fails_on_invalid_docs_path(client):
    """Job fails quickly if docs_path does not exist."""
    request_body = {"docs_path": "/nonexistent/path/that/does/not/exist"}
    response = client.post("/api/pipeline/jobs", json=request_body)
    assert response.status_code == 202
    job_id = response.json()["job_id"]
    terminal_states = ("completed", "failed")
    # Poll for up to 20 x 0.25s until the job reaches a terminal state.
    for _ in range(20):
        status = client.get(f"/api/pipeline/jobs/{job_id}").json()
        if status["status"] in terminal_states:
            break
        time.sleep(0.25)
    assert status["status"] == "failed"
    error_text = status["error"]
    assert "docs_path" in error_text or "not" in error_text.lower()
 def test_list_jobs_shows_submitted(client, tmp_path):
    """GET /api/pipeline/jobs includes jobs after submission."""
    pdf_dir = _minimal_pdf_dir(tmp_path)
    with patch("webapp.services.pipeline_task_manager.PipelineTaskManager._execute") as mock_exec:
        from webapp.models import PipelineResult
        mock_exec.return_value = PipelineResult(
            build_artifact_dir="/tmp/b", dataset_csv="/tmp/d.csv",
            source_chunks_jsonl="/tmp/c.jsonl", total_questions=1,
            parse_failures=0, eval_run_id="r3", eval_output_dir="/tmp/e",
            scores_csv="/tmp/scores.csv", summary_md="/tmp/summary.md",
        )
        client.post("/api/pipeline/jobs", json={"docs_path": str(pdf_dir), "job_name": "listed-job"})
        # Wait and list while the patch is still active: a job that starts late
        # in the background must hit the mock, not the real ``_execute``.
        time.sleep(0.5)
        list_resp = client.get("/api/pipeline/jobs")
    assert list_resp.status_code == 200
    jobs = list_resp.json()["jobs"]
    assert len(jobs) >= 1
    names = [j["job_name"] for j in jobs]
    assert "listed-job" in names
 # ── execute_dataset_build_job refactor test ────────────────────────────────────
 def test_execute_dataset_build_job_directly(tmp_path):
    """execute_dataset_build_job runs the build without a YAML file."""
    from rag_eval.dataset_builder.models import DatasetBuildJob, DatasetBuildRuntime
    from rag_eval.dataset_builder.runner import execute_dataset_build_job
    from rag_eval.settings import EvaluationSettings
    # One placeholder PDF so the input glob has something to match.
    pdf_dir = tmp_path / "pdfs"
    pdf_dir.mkdir()
    (pdf_dir / "doc.pdf").write_bytes(b"%PDF-fake")
    job = DatasetBuildJob(
        job_name="direct-test",
        input_path=pdf_dir,
        input_glob="*.pdf",
        parser_provider="aliyun_docmind",
        failure_mode="skip",
        generation_model="test-model",
        output_type="online_question_bank",
        review_mode="draft_with_manual_review",
        max_questions_per_document=5,
        max_source_chunks_per_question=3,
        dataset_path=tmp_path / "out.csv",
        artifact_dir=tmp_path / "artifacts",
        runtime=DatasetBuildRuntime(max_documents=1),
    )
    # Stand-in parsed document with no chunks, blocks or text.
    mock_doc = MagicMock()
    mock_doc.doc_id = "d1"
    mock_doc.doc_name = "doc.pdf"
    mock_doc.source_chunks = []
    mock_doc.semantic_blocks = []
    mock_doc.raw_text = ""
    mock_doc.structure_nodes = []
    mock_doc.metadata = {}
    mock_doc.to_record.return_value = {
        "doc_id": "d1", "doc_name": "doc.pdf", "raw_text": "",
        "structure_nodes": [], "metadata": {},
        "semantic_block_count": 0, "source_chunk_count": 0,
    }
    # The parser and generator are injected as mocks.
    mock_parser = MagicMock()
    mock_parser.parse.return_value = mock_doc
    mock_generator = MagicMock()
    mock_generator.generate.return_value = []
    result = execute_dataset_build_job(
        job,
        settings=EvaluationSettings(_env_file=None),
        parser=mock_parser,
        generator=mock_generator,
    )
    assert result.job.job_name == "direct-test"
    assert result.artifact_paths.root_dir.exists()
--- a/tests/test_webapp_report_builder.py
+++ b/tests/test_webapp_report_builder.py
@@ -0,0 +1,117 @@
 """Regression tests for weighted webapp report aggregation."""
 from __future__ import annotations
 from pathlib import Path
 import pytest
 from webapp.services.report_builder import build_report
 from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot
 def _write_run_artifacts(run_dir: Path) -> None:
    """Populate *run_dir* with the files a weighted run leaves behind.

    Writes ``scores.csv`` (two samples with weighted helper columns),
    ``summary.md``, ``optimization_advice.md`` and a
    ``scenario.snapshot.yaml`` declaring metric and document weights.
    """
    run_dir.mkdir(parents=True, exist_ok=True)

    score_lines = (
        "sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
        "s1,a.pdf,1.0,0.5,0.8333,3.0",
        "s2,b.pdf,0.0,0.5,0.1667,1.0",
    )
    snapshot_lines = (
        "metrics:",
        "  - faithfulness",
        "  - context_recall",
        "metric_weights:",
        "  faithfulness: 2.0",
        "  context_recall: 1.0",
        "doc_weights:",
        "  a.pdf: 3.0",
        "  b.pdf: 1.0",
    )

    scores_file = run_dir / "scores.csv"
    scores_file.write_text("\n".join(score_lines), encoding="utf-8")

    summary_file = run_dir / "summary.md"
    summary_file.write_text("summary", encoding="utf-8")

    advice_file = run_dir / "optimization_advice.md"
    advice_file.write_text("advice", encoding="utf-8")

    snapshot_file = run_dir / "scenario.snapshot.yaml"
    snapshot_file.write_text("\n".join(snapshot_lines), encoding="utf-8")
 def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
    """The snapshot reader yields the metric and document weight maps."""
    snapshot_run = tmp_path / "run"
    _write_run_artifacts(snapshot_run)

    by_metric, by_doc = _read_weights_from_snapshot(snapshot_run)

    assert by_metric == {"faithfulness": 2.0, "context_recall": 1.0}
    assert by_doc == {"a.pdf": 3.0, "b.pdf": 1.0}
 def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
    """build_report weights per-metric means and echoes the snapshot weights."""
    weighted_run = tmp_path / "run"
    _write_run_artifacts(weighted_run)

    built = build_report(weighted_run, ["faithfulness", "context_recall"])

    expected_means = {
        "faithfulness": pytest.approx(0.75, rel=1e-4),
        "context_recall": pytest.approx(0.5, rel=1e-4),
    }
    assert built.metric_means == expected_means
    # The overall weighted score is temporarily disabled.
    assert built.weighted_score_mean is None
    assert built.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
    assert built.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
    assert built.summary_markdown == "summary"
    assert built.advice_markdown == "advice"
 def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
    """Weighted helper columns in scores.csv are not reported as metrics."""
    bare_run = tmp_path / "run"
    bare_run.mkdir(parents=True, exist_ok=True)

    csv_lines = (
        "sample_id,doc_name,faithfulness,weighted_score,sample_weight",
        "s1,a.pdf,0.8,0.8,2.0",
    )
    (bare_run / "scores.csv").write_text("\n".join(csv_lines), encoding="utf-8")

    inferred = _infer_metrics_from_scores(bare_run)
    assert inferred == ["faithfulness"]
 def test_build_report_ranks_noise_sensitivity_with_lower_values_as_better(tmp_path: Path) -> None:
    """The lowest-sample list puts the highest noise sensitivity first."""
    noisy_run = tmp_path / "run"
    noisy_run.mkdir(parents=True, exist_ok=True)

    csv_lines = (
        "sample_id,question,noise_sensitivity",
        "s-good,q1,0.10",
        "s-warn,q2,0.30",
        "s-bad,q3,0.90",
    )
    (noisy_run / "scores.csv").write_text("\n".join(csv_lines), encoding="utf-8")
    (noisy_run / "summary.md").write_text("summary", encoding="utf-8")
    (noisy_run / "optimization_advice.md").write_text("", encoding="utf-8")

    built = build_report(noisy_run, ["noise_sensitivity"])

    worst_first = [entry.sample_id for entry in built.lowest_samples[:3]]
    assert worst_first == [
        "s-bad",
        "s-warn",
        "s-good",
    ]
--- a/tests/test_weights.py
+++ b/tests/test_weights.py
@@ -0,0 +1,124 @@
 """Unit tests for rag_eval/metrics/weights.py"""
 import math
 import pytest
 from rag_eval.metrics.weights import (
    compute_overall_weighted_score_mean,
    compute_weighted_score,
    resolve_weight,
    weighted_metric_means,
 )
 class TestResolveWeight:
    """resolve_weight looks up a key and falls back to a default."""

    def test_returns_value_when_key_present(self):
        """A configured key yields its own weight."""
        configured = {"faith": 0.5}
        assert resolve_weight(configured, "faith") == 0.5

    def test_returns_default_when_key_missing(self):
        """An absent key yields the implicit default of 1.0."""
        no_weights = {}
        assert resolve_weight(no_weights, "faith") == 1.0

    def test_returns_custom_default_when_key_missing(self):
        """An absent key yields the caller-supplied default."""
        no_weights = {}
        assert resolve_weight(no_weights, "faith", default=2.0) == 2.0

    def test_empty_dict_returns_default(self):
        """Any key looked up in an empty mapping yields 1.0."""
        no_weights = {}
        assert resolve_weight(no_weights, "anything") == 1.0
 class TestComputeWeightedScore:
    """compute_weighted_score combines one sample's metric scores."""

    def test_equal_weights_is_simple_mean(self):
        """With no weights configured the result is the plain average."""
        sample = {"faithfulness": 0.8, "context_recall": 0.6}
        combined = compute_weighted_score(sample, {})
        assert combined == pytest.approx(0.7, rel=1e-4)

    def test_explicit_weights(self):
        """A 3:1 weighting of scores 1.0 and 0.0 gives 0.75."""
        sample = {"faithfulness": 1.0, "context_recall": 0.0}
        metric_weights = {"faithfulness": 3.0, "context_recall": 1.0}
        combined = compute_weighted_score(sample, metric_weights)
        assert combined == pytest.approx(0.75, rel=1e-4)

    def test_nan_values_excluded(self):
        """NaN metrics are left out of the average."""
        sample = {"faithfulness": float("nan"), "context_recall": 0.8}
        combined = compute_weighted_score(sample, {})
        assert combined == pytest.approx(0.8, rel=1e-4)

    def test_none_values_excluded(self):
        """None metrics are left out of the average."""
        sample = {"faithfulness": None, "context_recall": 0.6}
        combined = compute_weighted_score(sample, {})
        assert combined == pytest.approx(0.6, rel=1e-4)

    def test_all_nan_returns_none(self):
        """When every metric is NaN there is nothing to average."""
        nan = float("nan")
        sample = {"faithfulness": nan, "context_recall": nan}
        assert compute_weighted_score(sample, {}) is None

    def test_empty_scores_returns_none(self):
        """An empty score mapping yields None."""
        assert compute_weighted_score({}, {}) is None

    def test_missing_metric_in_weights_uses_default_1(self):
        """A metric without a configured weight counts with weight 1."""
        sample = {"faithfulness": 0.8, "context_recall": 0.4}
        metric_weights = {"faithfulness": 2.0}
        combined = compute_weighted_score(sample, metric_weights)
        # (0.8 * 2 + 0.4 * 1) / (2 + 1)
        assert combined == pytest.approx(2.0 / 3, rel=1e-4)
 class TestWeightedMetricMeans:
    """weighted_metric_means averages each metric across rows by doc weight."""

    def _rows(self):
        """Return two score rows, one per document."""
        first = {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.5}
        second = {"doc_name": "b.pdf", "faithfulness": 0.6, "context_recall": 0.8}
        return [first, second]

    def test_equal_weights_gives_arithmetic_mean(self):
        """Without document weights each metric is a plain average."""
        means = weighted_metric_means(
            self._rows(), ["faithfulness", "context_recall"], {}
        )
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.65, rel=1e-4)

    def test_doc_weight_amplifies_contribution(self):
        """A 3:1 document weighting pulls the mean towards a.pdf."""
        weights_by_doc = {"a.pdf": 3.0, "b.pdf": 1.0}
        means = weighted_metric_means(self._rows(), ["faithfulness"], weights_by_doc)
        assert means["faithfulness"] == pytest.approx(0.9, rel=1e-4)

    def test_nan_rows_skipped_per_metric(self):
        """A NaN cell removes that row from that metric only."""
        table = [
            {"doc_name": "a.pdf", "faithfulness": float("nan"), "context_recall": 0.5},
            {"doc_name": "b.pdf", "faithfulness": 0.8, "context_recall": 0.9},
        ]
        means = weighted_metric_means(table, ["faithfulness", "context_recall"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.7, rel=1e-4)

    def test_missing_metric_column_returns_none(self):
        """A requested metric absent from every row maps to None."""
        table = [{"doc_name": "a.pdf", "faithfulness": 0.8}]
        means = weighted_metric_means(table, ["faithfulness", "unknown_metric"], {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["unknown_metric"] is None

    def test_empty_rows_returns_none_for_all(self):
        """With no rows every requested metric maps to None."""
        means = weighted_metric_means([], ["faithfulness"], {})
        assert means["faithfulness"] is None
 class TestComputeOverallWeightedScoreMean:
    """compute_overall_weighted_score_mean aggregates per-sample weighted scores."""

    def test_basic_weighted_mean_of_weighted_scores(self):
        """Two samples that each score 0.5 average to 0.5."""
        table = [
            {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.0},
            {"doc_name": "b.pdf", "faithfulness": 0.5, "context_recall": 0.5},
        ]
        equal_metric_weights = {"faithfulness": 1.0, "context_recall": 1.0}
        overall = compute_overall_weighted_score_mean(table, equal_metric_weights, {})
        assert overall == pytest.approx(0.5, rel=1e-4)

    def test_doc_weight_amplifies_sample(self):
        """A 9:1 document weighting of scores 1.0 and 0.0 gives 0.9."""
        table = [
            {"doc_name": "important.pdf", "faithfulness": 1.0},
            {"doc_name": "other.pdf", "faithfulness": 0.0},
        ]
        weights_by_doc = {"important.pdf": 9.0, "other.pdf": 1.0}
        overall = compute_overall_weighted_score_mean(table, {}, weights_by_doc)
        assert overall == pytest.approx(0.9, rel=1e-4)

    def test_all_nan_returns_none(self):
        """When no sample has a usable score the result is None."""
        table = [{"doc_name": "a.pdf", "faithfulness": float("nan")}]
        assert compute_overall_weighted_score_mean(table, {}, {}) is None
--- a/tests/webapp/__init__.py
+++ b/tests/webapp/__init__.py
--- a/tests/webapp/test_llm_profiles_api.py
+++ b/tests/webapp/test_llm_profiles_api.py
@@ -0,0 +1,245 @@
 """Integration tests for /api/llm-profiles endpoints."""
 import pytest
 from fastapi.testclient import TestClient
 from unittest.mock import patch
@pytest.fixture()
 def client(tmp_path, monkeypatch):
    """TestClient with a fresh ProfileManager backed by a temp file."""
    store = tmp_path / "profiles.json"
    import webapp.services.profile_manager as pm_mod
    from webapp.services.profile_manager import ProfileManager
    fresh_mgr = ProfileManager(store_path=store)
    monkeypatch.setattr(pm_mod, "profile_manager", fresh_mgr)
    import webapp.api.llm_profiles as api_mod
    monkeypatch.setattr(api_mod, "profile_manager", fresh_mgr)
    from webapp.server import create_app
    return TestClient(create_app())
 def test_list_empty(client):
    """GET /api/llm-profiles on a fresh store returns an empty list."""
    response = client.get("/api/llm-profiles")
    assert response.status_code == 200
    listed = response.json()["profiles"]
    assert listed == []
 def test_create_and_list(client):
    """A created profile gets an id and shows up in the listing."""
    payload = {"name": "Test", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
    created = client.post("/api/llm-profiles", json=payload)
    assert created.status_code == 201
    created_body = created.json()
    assert created_body["name"] == "Test"
    assert created_body["profile_id"] != ""
    listing = client.get("/api/llm-profiles")
    assert len(listing.json()["profiles"]) == 1
 def test_update_profile(client):
    """PUT replaces the profile fields and invalidates the scorer cache once."""
    original = {"name": "Old", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
    profile_id = client.post("/api/llm-profiles", json=original).json()["profile_id"]
    replacement = {"name": "New", "model": "m2", "base_url": "http://x/v1", "api_key": "k", "timeout_seconds": 60}
    with patch("webapp.services.inline_scorer.inline_scorer.invalidate_cache") as invalidate_mock:
        response = client.put(f"/api/llm-profiles/{profile_id}", json=replacement)
    assert response.status_code == 200
    updated = response.json()
    assert updated["name"] == "New"
    assert updated["timeout_seconds"] == 60
    invalidate_mock.assert_called_once()
 def test_delete_profile(client):
    """DELETE removes the profile and invalidates the scorer cache once."""
    payload = {"name": "Del", "model": "m", "base_url": "http://x/v1", "api_key": "k"}
    profile_id = client.post("/api/llm-profiles", json=payload).json()["profile_id"]
    with patch("webapp.services.inline_scorer.inline_scorer.invalidate_cache") as invalidate_mock:
        response = client.delete(f"/api/llm-profiles/{profile_id}")
    assert response.status_code == 200
    assert response.json()["deleted"] is True
    remaining = client.get("/api/llm-profiles").json()["profiles"]
    assert len(remaining) == 0
    invalidate_mock.assert_called_once()
 def test_update_nonexistent(client):
    """PUT on an unknown profile id answers 404."""
    payload = {"name": "X", "model": "m", "base_url": "http://x/v1", "api_key": "k"}
    response = client.put("/api/llm-profiles/nope", json=payload)
    assert response.status_code == 404
 def test_delete_nonexistent(client):
    """DELETE on an unknown profile id answers 404."""
    response = client.delete("/api/llm-profiles/nope")
    assert response.status_code == 404
 # ---------------------------------------------------------------------------
 # YAML patcher tests
 # ---------------------------------------------------------------------------
 import yaml as yaml_lib
 from webapp.services.yaml_patcher import apply_profiles_to_scenario
 from webapp.models import LLMProfile
 def test_apply_judge_profile(tmp_path):
    """Applying a judge profile patches judge_model in the YAML."""
    scenario_file = tmp_path / "test-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: old-model\nembedding_model: emb\n"
        "dataset: data.csv\nmetrics:\n- faithfulness\noutput_dir: outputs/test\n",
        encoding="utf-8",
    )
    judge_p = LLMProfile(
        profile_id="x", name="J", model="new-model",
        base_url="http://x/v1", api_key="k", created_at="t", updated_at="t",
    )
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=judge_p,
        answer_profile=None,
        dataset_profile=None,
        _resolve_absolute=True,
    )
    assert "judge_model" in patched
    # Read back with the same encoding the fixture was written in, so the
    # test does not depend on the platform's default locale encoding.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert data["judge_model"] == "new-model"
 def test_apply_answer_profile(tmp_path):
    """Applying an answer profile patches app_adapter.static_kwargs.model."""
    scenario_file = tmp_path / "online.yaml"
    scenario_file.write_text(
        "scenario_name: online\nmode: online\njudge_model: j\nembedding_model: emb\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n"
        "app_adapter:\n  type: python\n  callable: apps.foo:run\n"
        "  static_kwargs:\n    model: old\n    source_chunks_path: chunks.jsonl\n",
        encoding="utf-8",
    )
    answer_p = LLMProfile(
        profile_id="y", name="A", model="new-answer-model",
        base_url="http://x/v1", api_key="k", created_at="t", updated_at="t",
    )
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None,
        answer_profile=answer_p,
        dataset_profile=None,
        _resolve_absolute=True,
    )
    assert "app_adapter.static_kwargs.model" in patched
    # Read back with the same encoding the fixture was written in, so the
    # test does not depend on the platform's default locale encoding.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert data["app_adapter"]["static_kwargs"]["model"] == "new-answer-model"
 def test_apply_no_profiles_returns_empty(tmp_path):
    """With every profile argument None, no field is reported as patched."""
    noop_scenario = tmp_path / "noop.yaml"
    noop_scenario.write_text("scenario_name: noop\njudge_model: m\n", encoding="utf-8")

    touched_fields = apply_profiles_to_scenario(
        scenario_path=str(noop_scenario),
        judge_profile=None,
        answer_profile=None,
        dataset_profile=None,
        _resolve_absolute=True,
    )

    assert touched_fields == []
 def test_apply_metric_weights_patches_yaml(tmp_path):
    """Applying metric_weights writes them into the YAML."""
    scenario_file = tmp_path / "w-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
        encoding="utf-8",
    )
    # yaml_lib and apply_profiles_to_scenario come from the module-level
    # imports used by the other patcher tests in this file.
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None, answer_profile=None, dataset_profile=None,
        metric_weights={"faithfulness": 0.7, "context_recall": 0.3},
        _resolve_absolute=True,
    )
    assert "metric_weights" in patched
    # Read back with the same encoding the fixture was written in, so the
    # test does not depend on the platform's default locale encoding.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert abs(data["metric_weights"]["faithfulness"] - 0.7) < 1e-9
 def test_apply_doc_weights_patches_yaml(tmp_path):
    """Applying doc_weights writes them into the YAML."""
    scenario_file = tmp_path / "dw-scenario.yaml"
    scenario_file.write_text(
        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
        encoding="utf-8",
    )
    # yaml_lib and apply_profiles_to_scenario come from the module-level
    # imports used by the other patcher tests in this file.
    patched = apply_profiles_to_scenario(
        scenario_path=str(scenario_file),
        judge_profile=None, answer_profile=None, dataset_profile=None,
        doc_weights={"doc.pdf": 2.0},
        _resolve_absolute=True,
    )
    assert "doc_weights" in patched
    # Read back with the same encoding the fixture was written in, so the
    # test does not depend on the platform's default locale encoding.
    data = yaml_lib.safe_load(scenario_file.read_text(encoding="utf-8"))
    assert abs(data["doc_weights"]["doc.pdf"] - 2.0) < 1e-9
 # ---------------------------------------------------------------------------
 # Connectivity test endpoint tests
 # ---------------------------------------------------------------------------
 from unittest.mock import MagicMock
 def test_probe_connectivity_success(client):
    """POST /api/llm-profiles/probe reports ok=True when the completion call succeeds."""
    completion = MagicMock()
    completion.choices = [MagicMock()]
    probe_payload = {
        "model": "test-model",
        "base_url": "http://x/v1",
        "api_key": "sk-test",
    }
    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        openai_cls.return_value.chat.completions.create.return_value = completion
        response = client.post("/api/llm-profiles/probe", json=probe_payload)
    assert response.status_code == 200
    outcome = response.json()
    assert outcome["ok"] is True
    assert outcome["latency_ms"] is not None
 def test_probe_connectivity_failure(client):
    """POST /api/llm-profiles/probe reports ok=False and the error text when the call raises."""
    probe_payload = {
        "model": "test-model",
        "base_url": "http://x/v1",
        "api_key": "sk-test",
    }
    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        failing_create = openai_cls.return_value.chat.completions.create
        failing_create.side_effect = Exception("connection refused")
        response = client.post("/api/llm-profiles/probe", json=probe_payload)
    assert response.status_code == 200
    outcome = response.json()
    assert outcome["ok"] is False
    assert "connection refused" in outcome["message"]
 def test_test_saved_profile_success(client):
    """POST /api/llm-profiles/{id}/test reports ok=True for a stored profile."""
    payload = {"name": "T", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
    profile_id = client.post("/api/llm-profiles", json=payload).json()["profile_id"]
    completion = MagicMock()
    completion.choices = [MagicMock()]
    with patch("webapp.api.llm_profiles.OpenAI") as openai_cls:
        openai_cls.return_value.chat.completions.create.return_value = completion
        response = client.post(f"/api/llm-profiles/{profile_id}/test")
    assert response.status_code == 200
    assert response.json()["ok"] is True
 def test_test_nonexistent_profile_returns_404(client):
    """POST /api/llm-profiles/{id}/test answers 404 when the id is unknown."""
    response = client.post("/api/llm-profiles/nonexistent/test")
    assert response.status_code == 404
--- a/tests/webapp/test_profile_manager.py
+++ b/tests/webapp/test_profile_manager.py
@@ -0,0 +1,205 @@
 import pytest
 from unittest.mock import sentinel
 from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse
 def test_llm_profile_defaults():
    """LLMProfile fills in a 30 s timeout and non-empty timestamps."""
    required_fields = {
        "profile_id": "abc",
        "name": "Test",
        "model": "gpt-4",
        "base_url": "http://localhost/v1",
        "api_key": "sk-test",
    }
    profile = LLMProfile(**required_fields)
    assert profile.timeout_seconds == 30
    assert profile.created_at != ""
    assert profile.updated_at != ""
 def test_profile_apply_request_fields():
    """ProfileApplyRequest keeps the given profile ids, including None."""
    apply_request = ProfileApplyRequest(
        scenario_path="scenarios/offline/sample.yaml",
        judge_profile_id="id1",
        answer_profile_id="id2",
        dataset_profile_id=None,
    )
    assert apply_request.judge_profile_id == "id1"
    assert apply_request.dataset_profile_id is None
 def test_profile_apply_response():
    """ProfileApplyResponse exposes the list of patched field names."""
    apply_response = ProfileApplyResponse(
        scenario_path="scenarios/offline/sample.yaml",
        patched_fields=["judge_model"],
    )
    assert "judge_model" in apply_response.patched_fields
 # ---------------------------------------------------------------------------
 # ProfileManager service tests
 # ---------------------------------------------------------------------------
 import json
 from webapp.services.profile_manager import ProfileManager
 def _make_manager(tmp_path):
    """Build a ProfileManager whose store is ``profiles.json`` under *tmp_path*."""
    return ProfileManager(store_path=tmp_path / "profiles.json")
 def test_create_profile(tmp_path):
    """create() returns a profile with a generated id and the given name."""
    manager = _make_manager(tmp_path)
    created = manager.create(
        name="Local",
        model="deepseek-v4-flash",
        base_url="http://localhost/v1",
        api_key="sk-x",
    )
    assert created.profile_id != ""
    assert created.name == "Local"
 def test_list_profiles(tmp_path):
    """list_all() returns every profile that was created."""
    manager = _make_manager(tmp_path)
    manager.create(name="A", model="m1", base_url="http://a/v1", api_key="k1")
    manager.create(name="B", model="m2", base_url="http://b/v1", api_key="k2")
    stored = manager.list_all()
    assert len(stored) == 2
 def test_get_profile(tmp_path):
    """get() finds a profile by the id create() handed out."""
    manager = _make_manager(tmp_path)
    original = manager.create(name="X", model="m", base_url="http://x/v1", api_key="k")
    looked_up = manager.get(original.profile_id)
    assert looked_up is not None
    assert looked_up.name == "X"
 def test_update_profile(tmp_path):
    """update() returns the profile carrying the new field values."""
    manager = _make_manager(tmp_path)
    original = manager.create(name="Old", model="m", base_url="http://x/v1", api_key="k")
    changed = manager.update(
        original.profile_id,
        name="New",
        model="m2",
        base_url="http://x/v1",
        api_key="k",
        timeout_seconds=60,
    )
    assert changed is not None
    assert changed.name == "New"
    assert changed.model == "m2"
    assert changed.timeout_seconds == 60
 def test_delete_profile(tmp_path):
    """delete() returns True and the profile can no longer be fetched."""
    manager = _make_manager(tmp_path)
    doomed = manager.create(name="Del", model="m", base_url="http://x/v1", api_key="k")
    was_deleted = manager.delete(doomed.profile_id)
    assert was_deleted is True
    assert manager.get(doomed.profile_id) is None
 def test_persistence(tmp_path):
    """A second manager on the same store file sees the first one's profile."""
    store_file = tmp_path / "profiles.json"
    writer = ProfileManager(store_path=store_file)
    saved = writer.create(name="Persist", model="m", base_url="http://x/v1", api_key="k")
    reader = ProfileManager(store_path=store_file)
    assert reader.get(saved.profile_id) is not None
 def test_get_nonexistent(tmp_path):
    """get() returns None for an id that was never created."""
    manager = _make_manager(tmp_path)
    missing = manager.get("does-not-exist")
    assert missing is None
 def test_delete_nonexistent(tmp_path):
    """delete() returns False for an id that was never created."""
    manager = _make_manager(tmp_path)
    was_deleted = manager.delete("does-not-exist")
    assert was_deleted is False
 def test_resolve_openai_client_kwargs_prefers_matching_profile(tmp_path, monkeypatch):
    """A saved profile for the requested model wins over the settings values."""
    import webapp.services.profile_manager as pm_mod
    from rag_eval.metrics.factory import _resolve_openai_client_kwargs
    from rag_eval.settings import EvaluationSettings

    manager = _make_manager(tmp_path)
    manager.create(
        name="Judge",
        model="gpt-5.5",
        base_url="http://39.107.88.131:13000",
        api_key="sk-profile",
        timeout_seconds=300,
    )
    monkeypatch.setattr(pm_mod, "profile_manager", manager)

    env_settings = EvaluationSettings(
        OPENAI_API_KEY="sk-env",
        OPENAI_BASE_URL="http://env-base/v1",
        OPENAI_TIMEOUT_SECONDS=30,
    )
    client_kwargs = _resolve_openai_client_kwargs("gpt-5.5", env_settings)

    assert client_kwargs["api_key"] == "sk-profile"
    assert client_kwargs["base_url"] == "http://39.107.88.131:13000"
    assert client_kwargs["timeout"] == 300.0
 def test_resolve_openai_client_kwargs_falls_back_to_env(tmp_path, monkeypatch):
    """With an empty profile store the settings values are used as-is."""
    import webapp.services.profile_manager as pm_mod
    from rag_eval.metrics.factory import _resolve_openai_client_kwargs
    from rag_eval.settings import EvaluationSettings

    empty_manager = _make_manager(tmp_path)
    monkeypatch.setattr(pm_mod, "profile_manager", empty_manager)

    env_settings = EvaluationSettings(
        OPENAI_API_KEY="sk-env",
        OPENAI_BASE_URL="http://env-base/v1",
        OPENAI_TIMEOUT_SECONDS=45,
    )
    client_kwargs = _resolve_openai_client_kwargs("gpt-5", env_settings)

    assert client_kwargs["api_key"] == "sk-env"
    assert client_kwargs["base_url"] == "http://env-base/v1"
    assert client_kwargs["timeout"] == 45.0
 def test_build_models_uses_high_default_max_tokens_for_structured_judge(monkeypatch):
    """Structured RAGAS judge calls should use a larger completion budget by default.

    The OpenAI client and both model factories are replaced by stubs; the
    test records what ``build_models`` passes to ``llm_factory`` and checks
    that ``max_tokens`` is 4096 when nothing overrides it.
    """
    import rag_eval.metrics.factory as factory
    from rag_eval.settings import EvaluationSettings
    # The assertion pins the built-in default, so an override must not leak
    # in from the process environment or from a developer's local .env file.
    monkeypatch.delenv("RAGAS_LLM_MAX_TOKENS", raising=False)
    captured: dict[str, object] = {}
    def fake_llm_factory(model, client=None, **kwargs):
        # Record every argument so the assertions can inspect the call.
        captured["model"] = model
        captured["client"] = client
        captured["kwargs"] = kwargs
        return sentinel.llm
    monkeypatch.setattr(factory, "AsyncOpenAI", lambda **kwargs: sentinel.client)
    monkeypatch.setattr(factory, "llm_factory", fake_llm_factory)
    monkeypatch.setattr(factory, "embedding_factory", lambda **kwargs: sentinel.embeddings)
    llm, embeddings = factory.build_models(
        "gpt-5",
        "text-embedding-3-small",
        EvaluationSettings(_env_file=None),
    )
    assert llm is sentinel.llm
    assert embeddings is sentinel.embeddings
    assert captured["model"] == "gpt-5"
    assert captured["client"] is sentinel.client
    assert captured["kwargs"] == {"max_tokens": 4096}
 def test_build_models_allows_env_override_for_judge_max_tokens(monkeypatch):
    """A RAGAS_LLM_MAX_TOKENS value on the settings reaches llm_factory unchanged."""
    import rag_eval.metrics.factory as factory
    from rag_eval.settings import EvaluationSettings

    recorded: dict[str, object] = {}

    def recording_llm_factory(model, client=None, **kwargs):
        recorded["kwargs"] = kwargs
        return sentinel.llm

    stubs = {
        "AsyncOpenAI": lambda **kwargs: sentinel.client,
        "llm_factory": recording_llm_factory,
        "embedding_factory": lambda **kwargs: sentinel.embeddings,
    }
    for attr_name, stub in stubs.items():
        monkeypatch.setattr(factory, attr_name, stub)

    raised_budget = EvaluationSettings(RAGAS_LLM_MAX_TOKENS=8192)
    factory.build_models("gpt-5", "text-embedding-3-small", raised_budget)

    assert recorded["kwargs"] == {"max_tokens": 8192}
--- a/tests/webapp/test_score_api.py
+++ b/tests/webapp/test_score_api.py
@@ -0,0 +1,341 @@
 """Tests for POST /api/score endpoint."""
 from __future__ import annotations
 import pytest
 from pydantic import ValidationError
 from webapp.models import ScoreRequest, ScoreResponse
 class TestScoreRequest:
    """Validation and helper behaviour of the ScoreRequest model."""

    def test_minimal_valid_request(self):
        """A request with question, answer and contexts gets the documented defaults."""
        request = ScoreRequest(
            question="What is CT?",
            answer="CT is imaging.",
            contexts="CT uses X-rays.",
        )
        default_metrics = [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ]
        assert request.question == "What is CT?"
        assert request.contexts == "CT uses X-rays."
        assert request.ground_truth is None
        assert request.context_separator == " |||| "
        assert request.metrics == default_metrics

    def test_contexts_split_by_separator(self):
        """contexts_as_list() splits on the given context_separator."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="ctx1 |||| ctx2 |||| ctx3",
            context_separator=" |||| ",
        )
        pieces = request.contexts_as_list()
        assert pieces == ["ctx1", "ctx2", "ctx3"]

    def test_contexts_split_custom_separator(self):
        """contexts_as_list() honours a caller-chosen separator."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="a---b---c",
            context_separator="---",
        )
        pieces = request.contexts_as_list()
        assert pieces == ["a", "b", "c"]

    def test_contexts_split_single_item(self):
        """A contexts string without the separator yields one item."""
        request = ScoreRequest(question="q", answer="a", contexts="only one")
        pieces = request.contexts_as_list()
        assert pieces == ["only one"]

    def test_missing_question_raises(self):
        """Omitting question fails validation."""
        with pytest.raises(ValidationError):
            ScoreRequest(answer="a", contexts="c")  # type: ignore[call-arg]

    def test_missing_answer_raises(self):
        """Omitting answer fails validation."""
        with pytest.raises(ValidationError):
            ScoreRequest(question="q", contexts="c")  # type: ignore[call-arg]

    def test_missing_contexts_defaults_to_none(self):
        """contexts is optional: it defaults to None and lists as empty."""
        request = ScoreRequest(question="q", answer="a")
        assert request.contexts is None
        assert request.contexts_as_list() == []

    def test_custom_metrics_accepted(self):
        """An explicit metrics list is stored as given."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            metrics=["faithfulness"],
        )
        assert request.metrics == ["faithfulness"]

    def test_invalid_metric_name_raises(self):
        """An unknown metric name fails validation."""
        with pytest.raises(ValidationError):
            ScoreRequest(
                question="q",
                answer="a",
                contexts="c",
                metrics=["not_a_metric"],
            )

    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
        """Without ground_truth, the metrics that need it are dropped."""
        requested = [
            "faithfulness",
            "context_recall",
            "factual_correctness",
            "semantic_similarity",
            "noise_sensitivity",
        ]
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            metrics=requested,
        )
        kept = request.effective_metrics()
        assert "faithfulness" in kept
        assert "context_recall" not in kept
        assert "factual_correctness" not in kept
        assert "semantic_similarity" not in kept
        assert "noise_sensitivity" not in kept

    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
        """With ground_truth and contexts, the requested metrics are kept in order."""
        request = ScoreRequest(
            question="q",
            answer="a",
            contexts="c",
            ground_truth="gt",
            metrics=["faithfulness", "context_recall", "factual_correctness"],
        )
        kept = request.effective_metrics()
        assert kept == [
            "faithfulness",
            "context_recall",
            "factual_correctness",
        ]

    def test_effective_metrics_drops_context_dependent_when_contexts_absent(self):
        """Without contexts, the metrics that need them are dropped."""
        request = ScoreRequest(
            question="q",
            answer="a",
            metrics=["faithfulness", "answer_relevancy", "context_precision"],
        )
        kept = request.effective_metrics()
        assert "answer_relevancy" in kept
        assert "faithfulness" not in kept
        assert "context_precision" not in kept
 class TestScoreResponse:
    """Shape of the ScoreResponse model."""

    def test_score_response_structure(self):
        """Scores may hold floats or None, and latency_ms is kept as given."""
        per_metric = {"faithfulness": 0.85, "answer_relevancy": None}
        response = ScoreResponse(
            scores=per_metric,
            weighted_score=0.85,
            latency_ms=1200,
        )
        assert response.scores["faithfulness"] == 0.85
        assert response.scores["answer_relevancy"] is None
        assert response.latency_ms == 1200
 class TestInlineScorer:
    """InlineScorer.score with the model builders and metric pipeline mocked out."""

    def test_score_returns_dict_with_requested_metrics(self):
        """InlineScorer.score returns a dict keyed by the requested metrics."""
        from unittest.mock import AsyncMock, MagicMock, patch
        from webapp.services.inline_scorer import InlineScorer
        from rag_eval.settings import EvaluationSettings
        # Canned pipeline result: both requested metrics scored, no error.
        mock_score = MagicMock()
        mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8}
        mock_score.error = ""
        mock_pipeline = MagicMock()
        mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
        # One flat ``with`` holds all three patches for the scorer's lifetime.
        with patch(
            "webapp.services.inline_scorer.build_models",
            return_value=(MagicMock(), MagicMock()),
        ), patch(
            "webapp.services.inline_scorer.MetricPipeline",
            return_value=mock_pipeline,
        ), patch(
            "webapp.services.inline_scorer._build_metric_instances",
            return_value={},
        ):
            scorer = InlineScorer()
            result = scorer.score(
                question="q", answer="a",
                contexts=["ctx1"],
                ground_truth=None,
                metrics=["faithfulness", "answer_relevancy"],
                judge_model="test-model",
                embedding_model="test-embed",
                settings=EvaluationSettings(_env_file=None),
            )
        assert "faithfulness" in result
        assert "answer_relevancy" in result
        assert result["faithfulness"] == pytest.approx(0.9)

    def test_score_converts_nan_to_none(self):
        """NaN scores are converted to None in the returned dict."""
        from unittest.mock import AsyncMock, MagicMock, patch
        from webapp.services.inline_scorer import InlineScorer
        from rag_eval.settings import EvaluationSettings
        # Canned pipeline result whose only metric is NaN.
        mock_score = MagicMock()
        mock_score.metrics = {"faithfulness": float("nan")}
        mock_score.error = ""
        mock_pipeline = MagicMock()
        mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
        with patch(
            "webapp.services.inline_scorer.build_models",
            return_value=(MagicMock(), MagicMock()),
        ), patch(
            "webapp.services.inline_scorer.MetricPipeline",
            return_value=mock_pipeline,
        ), patch(
            "webapp.services.inline_scorer._build_metric_instances",
            return_value={},
        ):
            scorer = InlineScorer()
            result = scorer.score(
                question="q", answer="a", contexts=["c"],
                ground_truth=None,
                metrics=["faithfulness"],
                judge_model="m", embedding_model="e",
                settings=EvaluationSettings(_env_file=None),
            )
        assert result["faithfulness"] is None
 # ── Endpoint integration tests ────────────────────────────────────────────────
@pytest.fixture()
 def client(monkeypatch):
    """TestClient with mocked InlineScorer."""
    import webapp.api.score as score_mod
    from unittest.mock import MagicMock
    mock_scorer = MagicMock()
    mock_scorer.score.return_value = {
        "faithfulness": 0.85,
        "answer_relevancy": 0.90,
    }
    monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
    from webapp.server import create_app
    return TestClient(create_app())
 from fastapi.testclient import TestClient
 class TestScoreEndpoint:
    """Integration tests for POST /api/score with the scorer replaced by a mock."""
    def test_post_score_returns_200(self, client):
        """A well-formed request yields 200 with ``scores`` and ``latency_ms``."""
        resp = client.post("/api/score", json={
            "question": "What is CT?",
            "answer": "CT is imaging.",
            "contexts": "CT uses X-rays.",
        })
        assert resp.status_code == 200
        data = resp.json()
        assert "scores" in data
        assert "latency_ms" in data
        assert data["scores"]["faithfulness"] == pytest.approx(0.85)
    def test_weighted_score_computed(self, client):
        """``weighted_score`` is present in the response and currently null."""
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
        })
        assert resp.status_code == 200
        data = resp.json()
        # The composite weighted score is temporarily disabled; it is always returned as null.
        assert data["weighted_score"] is None
    def test_missing_required_fields_returns_422(self, client):
        """A body with only ``question`` is rejected with 422."""
        resp = client.post("/api/score", json={"question": "q"})
        assert resp.status_code == 422
    def test_invalid_metric_name_returns_422(self, client):
        """An unknown metric name is rejected with 422."""
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["not_a_metric"],
        })
        assert resp.status_code == 422
    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
        """``context_recall`` requested without ground truth is listed in ``skipped_metrics``."""
        resp = client.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
            "metrics": ["faithfulness", "context_recall"],
        })
        assert resp.status_code == 200
        data = resp.json()
        assert "context_recall" in data["skipped_metrics"]
    def test_contexts_split_on_separator(self, monkeypatch):
        """contexts string is split before passing to scorer."""
        import webapp.api.score as score_mod
        from unittest.mock import MagicMock
        calls = []
        def capture(**kwargs):
            # Record the contexts the endpoint hands to the scorer.
            calls.append(kwargs.get("contexts", []))
            return {"faithfulness": 0.9}
        mock_scorer = MagicMock()
        # capture already accepts keyword arguments only, so it can serve as the side effect directly.
        mock_scorer.score.side_effect = capture
        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
        from webapp.server import create_app
        from fastapi.testclient import TestClient
        tc = TestClient(create_app())
        tc.post("/api/score", json={
            "question": "q", "answer": "a",
            "contexts": "ctx1 |||| ctx2",
            "context_separator": " |||| ",
        })
        assert len(calls) == 1
        assert calls[0] == ["ctx1", "ctx2"]
    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
        """When SCORE_API_TOKEN is set, requests without token get 401."""
        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        from unittest.mock import MagicMock
        mock_settings = EvaluationSettings(_env_file=None)
        # object.__setattr__ sets the token directly on the settings instance.
        object.__setattr__(mock_settings, "score_api_token", "secret-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
        mock_scorer = MagicMock()
        mock_scorer.score.return_value = {"faithfulness": 0.9}
        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
        from webapp.server import create_app
        from fastapi.testclient import TestClient
        tc = TestClient(create_app())
        # No auth header -> 401
        resp = tc.post("/api/score", json={
            "question": "q", "answer": "a", "contexts": "c",
        })
        assert resp.status_code == 401
        # Correct token -> 200
        resp = tc.post("/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer secret-token"},
        )
        assert resp.status_code == 200
    def test_wrong_bearer_token_returns_401(self, monkeypatch):
        """A Bearer token that differs from the configured one is rejected with 401."""
        import webapp.api.score as score_mod
        from rag_eval.settings import EvaluationSettings
        from unittest.mock import MagicMock
        mock_settings = EvaluationSettings(_env_file=None)
        object.__setattr__(mock_settings, "score_api_token", "correct-token")
        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
        mock_scorer = MagicMock()
        mock_scorer.score.return_value = {}
        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
        from webapp.server import create_app
        from fastapi.testclient import TestClient
        tc = TestClient(create_app())
        resp = tc.post("/api/score",
            json={"question": "q", "answer": "a", "contexts": "c"},
            headers={"Authorization": "Bearer wrong-token"},
        )
        assert resp.status_code == 401
--- a/tests/webapp/test_score_jobs_api.py
+++ b/tests/webapp/test_score_jobs_api.py
@@ -0,0 +1,146 @@
 """Tests for async score jobs API."""
 from __future__ import annotations
 import json
 import time
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 import pytest
 from fastapi.testclient import TestClient
@pytest.fixture()
 def client(tmp_path, monkeypatch):
    """TestClient with fresh ScoreJobManager backed by tmp dirs."""
    import webapp.services.score_job_manager as mgr_mod
    from webapp.services.score_job_manager import ScoreJobManager
    fresh_mgr = ScoreJobManager(
        output_dir=tmp_path / "score-async",
        index_dir=tmp_path / "score-jobs",
        max_workers=2,
    )
    monkeypatch.setattr(mgr_mod, "score_job_manager", fresh_mgr)
    import webapp.api.score_jobs as api_mod
    monkeypatch.setattr(api_mod, "score_job_manager", fresh_mgr)
    from webapp.server import create_app
    return TestClient(create_app())
 class TestAsyncScoreEndpoints:
    """HTTP-level tests for the async score job endpoints (submit, list, get)."""
    def test_submit_returns_202_with_job_id(self, client):
        """POST /api/score/async returns 202 immediately."""
        # _run is patched out so the submitted job performs no real scoring.
        with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
            resp = client.post("/api/score/async", json={
                "question": "q?",
                "answer": "a.",
                "metrics": ["answer_relevancy"],
            })
        assert resp.status_code == 202
        data = resp.json()
        assert "job_id" in data
        assert data["status"] == "queued"
    def test_list_jobs_empty_initially(self, client):
        """GET /api/score/jobs returns an empty list for a fresh manager."""
        resp = client.get("/api/score/jobs")
        assert resp.status_code == 200
        assert resp.json()["jobs"] == []
    def test_get_unknown_job_returns_404(self, client):
        """GET /api/score/jobs/<id> returns 404 for an id that was never submitted."""
        resp = client.get("/api/score/jobs/nonexistent123")
        assert resp.status_code == 404
    def test_submitted_job_appears_in_list(self, client):
        """A submitted job id shows up in the jobs listing."""
        with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
            resp = client.post("/api/score/async", json={
                "question": "q?", "answer": "a.", "metrics": ["answer_relevancy"],
            })
        job_id = resp.json()["job_id"]
        # Short pause before querying the listing.
        time.sleep(0.1)
        list_resp = client.get("/api/score/jobs")
        ids = [j["job_id"] for j in list_resp.json()["jobs"]]
        assert job_id in ids
    def test_get_job_by_id_returns_status(self, client):
        """GET /api/score/jobs/<id> returns 200 and echoes the job id."""
        with patch("webapp.services.score_job_manager.ScoreJobManager._run"):
            resp = client.post("/api/score/async", json={
                "question": "q?", "answer": "a.", "metrics": ["answer_relevancy"],
            })
        job_id = resp.json()["job_id"]
        time.sleep(0.1)
        get_resp = client.get(f"/api/score/jobs/{job_id}")
        assert get_resp.status_code == 200
        assert get_resp.json()["job_id"] == job_id
    def test_missing_required_fields_returns_422(self, client):
        """A body with only ``question`` is rejected with 422."""
        resp = client.post("/api/score/async", json={"question": "q?"})
        assert resp.status_code == 422
 class TestScoreJobManager:
    """Unit tests for ScoreJobManager index persistence and reload."""
    def test_completed_job_persisted_to_index(self, tmp_path):
        """Completed job writes index JSON."""
        from webapp.services.score_job_manager import ScoreJobManager
        from webapp.models import ScoreRequest
        mgr = ScoreJobManager(
            output_dir=tmp_path / "runs",
            index_dir=tmp_path / "index",
            max_workers=1,
        )
        req = ScoreRequest(question="q?", answer="a.", metrics=["answer_relevancy"])
        # Patch _run directly — it uses lazy imports internally
        def fake_run(job_id, request):
            # Stand-in for the real worker: mark the job completed with fixed results.
            mgr._update(job_id, status="completed", finished_at="2026-01-01T00:00:01+00:00",
                        run_id="fake-run-id", scores={"answer_relevancy": 0.85},
                        weighted_score=0.85, latency_ms=500)
        with patch.object(mgr, "_run", side_effect=fake_run):
            status = mgr.submit(req)
        # Poll for up to ~2 seconds until the job reports completion.
        for _ in range(20):
            s = mgr.get(status.job_id)
            if s and s.status == "completed":
                break
            time.sleep(0.1)
        s = mgr.get(status.job_id)
        assert s is not None
        # The index file is named after the job id.
        idx_path = tmp_path / "index" / f"{status.job_id}.json"
        assert idx_path.exists()
        data = json.loads(idx_path.read_text(encoding="utf-8"))
        assert data["job_id"] == status.job_id
        assert data["status"] == "completed"
    def test_loads_existing_index_on_startup(self, tmp_path):
        """Manager loads persisted jobs from index dir on init."""
        from webapp.services.score_job_manager import ScoreJobManager
        from webapp.models import AsyncScoreJobStatus
        idx_dir = tmp_path / "index"
        idx_dir.mkdir()
        # Pre-seed the index directory with one serialized job before the manager exists.
        fake = AsyncScoreJobStatus(
            job_id="testjob001",
            status="completed",
            created_at="2026-01-01T00:00:00+00:00",
            run_id="some-run-id",
            scores={"answer_relevancy": 0.9},
            weighted_score=0.9,
            latency_ms=1000,
        )
        (idx_dir / "testjob001.json").write_text(
            json.dumps(fake.model_dump(), ensure_ascii=False), encoding="utf-8"
        )
        mgr = ScoreJobManager(
            output_dir=tmp_path / "runs",
            index_dir=idx_dir,
            max_workers=1,
        )
        loaded = mgr.get("testjob001")
        assert loaded is not None
        assert loaded.status == "completed"
        assert loaded.run_id == "some-run-id"
--- a/tests/webapp/test_session_score_jobs_api.py
+++ b/tests/webapp/test_session_score_jobs_api.py
@@ -0,0 +1,299 @@
 """Tests for session-grouped async scoring API and SessionScoreJobManager."""
 from __future__ import annotations
 import json
 import threading
 import time
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 import pandas as pd
 import pytest
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
@pytest.fixture()
 def tmp_manager(tmp_path):
    """Isolated SessionScoreJobManager backed by tmp dirs (no real LLM calls)."""
    from webapp.services.session_score_manager import SessionScoreJobManager
    return SessionScoreJobManager(
        output_dir=tmp_path / "score-session",
        index_dir=tmp_path / "score-session-jobs",
        max_workers=2,
    )
@pytest.fixture()
 def client(tmp_path, monkeypatch):
    """TestClient with fresh SessionScoreJobManager backed by tmp dirs."""
    import webapp.services.session_score_manager as mgr_mod
    from webapp.services.session_score_manager import SessionScoreJobManager
    fresh_mgr = SessionScoreJobManager(
        output_dir=tmp_path / "score-session",
        index_dir=tmp_path / "score-session-jobs",
        max_workers=2,
    )
    monkeypatch.setattr(mgr_mod, "session_score_manager", fresh_mgr)
    import webapp.api.session_score_jobs as api_mod
    monkeypatch.setattr(api_mod, "session_score_manager", fresh_mgr)
    from webapp.server import create_app
    return pytest.importorskip("fastapi.testclient").TestClient(create_app())
 # ---------------------------------------------------------------------------
 # Unit tests for SessionScoreJobManager
 # ---------------------------------------------------------------------------
 class TestSessionRunId:
    """Checks on the run id that ``session_run_id`` derives from a session id."""
    def test_same_session_always_same_run_id(self, tmp_manager):
        """Two calls with the same session id give equal run ids."""
        first = tmp_manager.session_run_id("abc")
        second = tmp_manager.session_run_id("abc")
        assert first == second
    def test_different_sessions_different_run_ids(self, tmp_manager):
        """Distinct session ids give distinct run ids."""
        run_a = tmp_manager.session_run_id("session-A")
        run_b = tmp_manager.session_run_id("session-B")
        assert run_a != run_b
    def test_run_id_prefixed_with_session(self, tmp_manager):
        """The run id starts with ``session-``."""
        run_id = tmp_manager.session_run_id("test123")
        assert run_id.startswith("session-")
    def test_special_chars_sanitized(self, tmp_manager):
        """``/``, ``@`` and ``:`` never appear in the run id."""
        run_id = tmp_manager.session_run_id("user@dify:flow/001")
        for forbidden in ("/", "@", ":"):
            assert forbidden not in run_id
 class TestSubmit:
    """Behaviour of ``SessionScoreJobManager.submit`` with the executor stubbed out."""
    def test_submit_returns_job_status_and_run_id(self, tmp_manager):
        """submit() returns a queued status plus the session's run id."""
        with patch.object(tmp_manager._executor, "submit"):
            job_status, returned_run_id = tmp_manager.submit("session-1", _mock_request())
        assert job_status.job_id
        assert job_status.status == "queued"
        expected_run_id = tmp_manager.session_run_id("session-1")
        assert returned_run_id == expected_run_id
    def test_submit_adds_job_to_session(self, tmp_manager):
        """The submitted job is listed under its session."""
        with patch.object(tmp_manager._executor, "submit"):
            job_status, _ = tmp_manager.submit("session-1", _mock_request())
        session = tmp_manager.get_session("session-1")
        assert session is not None
        job_ids = [job.job_id for job in session.jobs]
        assert job_status.job_id in job_ids
    def test_multiple_submits_same_session_accumulate(self, tmp_manager):
        """Three submits to one session give ``call_count == 3``."""
        with patch.object(tmp_manager._executor, "submit"):
            for _ in range(3):
                tmp_manager.submit("session-X", _mock_request())
        session = tmp_manager.get_session("session-X")
        assert session.call_count == 3
    def test_get_unknown_job_returns_none(self, tmp_manager):
        """get_job() returns None for an id that was never submitted."""
        missing_job = tmp_manager.get_job("does-not-exist")
        assert missing_job is None
    def test_get_unknown_session_returns_none(self, tmp_manager):
        """get_session() returns None for a session that was never used."""
        missing_session = tmp_manager.get_session("no-such-session")
        assert missing_session is None
 class TestSessionIndexPersistence:
    """Checks that submitted jobs reach the index directory and are reloaded from it."""
    def test_session_index_survives_restart(self, tmp_path):
        """Jobs and session mappings loaded from disk on new manager instance."""
        from webapp.services.session_score_manager import SessionScoreJobManager
        mgr1 = SessionScoreJobManager(
            output_dir=tmp_path / "score-session",
            index_dir=tmp_path / "score-session-jobs",
        )
        # The executor is stubbed so the jobs stay queued and nothing is scored.
        with patch.object(mgr1._executor, "submit"):
            mgr1.submit("persist-session", _mock_request())
            mgr1.submit("persist-session", _mock_request())
        # New manager instance loads from disk
        mgr2 = SessionScoreJobManager(
            output_dir=tmp_path / "score-session",
            index_dir=tmp_path / "score-session-jobs",
        )
        session = mgr2.get_session("persist-session")
        assert session is not None
        assert session.call_count == 2
    def test_job_index_file_created_on_submit(self, tmp_path):
        """submit() creates ``<job_id>.json`` in the index directory."""
        from webapp.services.session_score_manager import SessionScoreJobManager
        mgr = SessionScoreJobManager(
            output_dir=tmp_path / "score-session",
            index_dir=tmp_path / "score-session-jobs",
        )
        with patch.object(mgr._executor, "submit"):
            status, _ = mgr.submit("file-test", _mock_request())
        index_file = tmp_path / "score-session-jobs" / f"{status.job_id}.json"
        assert index_file.is_file()
        # Decode explicitly as UTF-8 so the read does not depend on the platform default encoding.
        data = json.loads(index_file.read_text(encoding="utf-8"))
        assert data["job_id"] == status.job_id
 class TestAppendBehaviour:
    """Test the CSV append / read-all logic in _append_and_regenerate via _read_score_rows."""
    def test_read_score_rows_returns_empty_for_missing_csv(self, tmp_manager, tmp_path):
        """A run directory that does not exist yields an empty row list."""
        missing_dir = tmp_path / "nonexistent"
        assert tmp_manager._read_score_rows(missing_dir) == []
    def test_read_score_rows_reads_existing_csv(self, tmp_manager, tmp_path):
        """Rows written to scores.csv are returned with their columns."""
        run_dir = tmp_path / "run1"
        run_dir.mkdir()
        records = [{"sample_id": "s1", "answer_relevancy": 0.9}]
        pd.DataFrame(records).to_csv(run_dir / "scores.csv", index=False)
        loaded = tmp_manager._read_score_rows(run_dir)
        assert len(loaded) == 1
        assert loaded[0]["sample_id"] == "s1"
    def test_metric_means_computed_from_csv(self, tmp_manager, tmp_path):
        """The mean of 0.8 and 0.6 is reported as roughly 0.7."""
        run_dir = tmp_path / "run2"
        run_dir.mkdir()
        records = [
            {"sample_id": "s1", "answer_relevancy": 0.8},
            {"sample_id": "s2", "answer_relevancy": 0.6},
        ]
        pd.DataFrame(records).to_csv(run_dir / "scores.csv", index=False)
        metric_means = tmp_manager._read_metric_means(run_dir)
        assert metric_means["answer_relevancy"] == pytest.approx(0.7, abs=1e-4)
 # ---------------------------------------------------------------------------
 # API endpoint tests
 # ---------------------------------------------------------------------------
 class TestSessionAsyncEndpoints:
    """HTTP-level tests for the session-grouped async scoring endpoints."""
    def test_submit_returns_202_with_session_fields(self, client):
        """POST /api/score/session_async returns 202 with session, job and run ids."""
        # _run is patched out so the submitted job performs no real scoring.
        with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
            resp = client.post("/api/score/session_async", json={
                "session_id": "test-session-001",
                "question": "What is CT?",
                "answer": "CT is computed tomography.",
                "metrics": ["answer_relevancy"],
            })
        assert resp.status_code == 202
        data = resp.json()
        assert data["session_id"] == "test-session-001"
        assert "job_id" in data
        assert "run_id" in data
        assert data["status"] == "queued"
        assert data["call_count"] >= 1
    def test_run_id_deterministic_for_session(self, client):
        """Two submits with the same session id report the same run id."""
        with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
            r1 = client.post("/api/score/session_async", json={
                "session_id": "det-session",
                "question": "Q1",
                "answer": "A1",
                "metrics": ["answer_relevancy"],
            })
            r2 = client.post("/api/score/session_async", json={
                "session_id": "det-session",
                "question": "Q2",
                "answer": "A2",
                "metrics": ["answer_relevancy"],
            })
        assert r1.json()["run_id"] == r2.json()["run_id"]
    def test_different_sessions_different_run_ids(self, client):
        """Submits with different session ids report different run ids."""
        with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
            r1 = client.post("/api/score/session_async", json={
                "session_id": "session-A",
                "question": "Q",
                "answer": "A",
                "metrics": ["answer_relevancy"],
            })
            r2 = client.post("/api/score/session_async", json={
                "session_id": "session-B",
                "question": "Q",
                "answer": "A",
                "metrics": ["answer_relevancy"],
            })
        assert r1.json()["run_id"] != r2.json()["run_id"]
    def test_call_count_increments_per_session(self, client):
        """Three submits to one session are reflected as ``call_count == 3``."""
        with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
            for _ in range(3):
                client.post("/api/score/session_async", json={
                    "session_id": "count-session",
                    "question": "Q",
                    "answer": "A",
                    "metrics": ["answer_relevancy"],
                })
        # Short pause before querying the session.
        time.sleep(0.05)
        resp = client.get("/api/score/sessions/count-session")
        assert resp.status_code == 200
        assert resp.json()["call_count"] == 3
    def test_get_session_returns_jobs_list(self, client):
        """GET /api/score/sessions/<id> lists the single submitted job."""
        with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
            client.post("/api/score/session_async", json={
                "session_id": "list-session",
                "question": "Q",
                "answer": "A",
                "metrics": ["answer_relevancy"],
            })
        time.sleep(0.05)
        resp = client.get("/api/score/sessions/list-session")
        assert resp.status_code == 200
        data = resp.json()
        assert len(data["jobs"]) == 1
    def test_get_unknown_session_returns_404(self, client):
        """GET /api/score/sessions/<id> returns 404 for an unused session id."""
        resp = client.get("/api/score/sessions/no-such-session-xyz")
        assert resp.status_code == 404
    def test_get_session_job_by_id(self, client):
        """GET /api/score/session/jobs/<job_id> returns 200 and echoes the job id."""
        with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
            resp = client.post("/api/score/session_async", json={
                "session_id": "job-lookup-session",
                "question": "Q",
                "answer": "A",
                "metrics": ["answer_relevancy"],
            })
        job_id = resp.json()["job_id"]
        time.sleep(0.05)
        get_resp = client.get(f"/api/score/session/jobs/{job_id}")
        assert get_resp.status_code == 200
        assert get_resp.json()["job_id"] == job_id
    def test_get_unknown_job_returns_404(self, client):
        """GET /api/score/session/jobs/<job_id> returns 404 for an unknown job id."""
        resp = client.get("/api/score/session/jobs/nonexistent-job-id")
        assert resp.status_code == 404
    def test_missing_session_id_returns_422(self, client):
        """A body without ``session_id`` is rejected with 422."""
        resp = client.post("/api/score/session_async", json={
            "question": "Q",
            "answer": "A",
            "metrics": ["answer_relevancy"],
        })
        assert resp.status_code == 422
    def test_list_sessions_endpoint(self, client):
        """GET /api/score/sessions returns 200 with a ``sessions`` key."""
        with patch("webapp.services.session_score_manager.SessionScoreJobManager._run"):
            client.post("/api/score/session_async", json={
                "session_id": "list-all-session",
                "question": "Q",
                "answer": "A",
                "metrics": ["answer_relevancy"],
            })
        resp = client.get("/api/score/sessions")
        assert resp.status_code == 200
        assert "sessions" in resp.json()
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _mock_request():
    """Return a small ScoreRequest (question, answer, one metric) for the tests."""
    from webapp.models import ScoreRequest
    fields = {
        "question": "What is dual-source CT?",
        "answer": "It uses two X-ray sources.",
        "metrics": ["answer_relevancy"],
    }
    return ScoreRequest(**fields)
--- a/webapp/__init__.py
+++ b/webapp/__init__.py
@@ -0,0 +1,5 @@
 """Lightweight FastAPI web console layered on top of the rag_eval platform.
 This package is additive and non-invasive: it imports rag_eval as a library and
 reads run artifacts from disk. It never modifies the core evaluation modules.
 """
--- a/webapp/api/__init__.py
+++ b/webapp/api/__init__.py
@@ -0,0 +1 @@
 """API router package for the evaluation console."""
--- a/webapp/api/evaluations.py
+++ b/webapp/api/evaluations.py
@@ -0,0 +1,54 @@
 """Routes for triggering evaluations and polling background task status."""
 from __future__ import annotations
 import logging
 from fastapi import APIRouter, HTTPException
 from webapp.models import (
    TaskStatus,
    TriggerEvaluationRequest,
    TriggerEvaluationResponse,
 )
 from webapp.services import scenario_scanner
 from webapp.services.task_manager import task_manager
 router = APIRouter(prefix="/api/evaluations", tags=["evaluations"])
 logger = logging.getLogger("webapp.api.evaluations")
@router.post("", response_model=TriggerEvaluationResponse)
 def trigger_evaluation(request: TriggerEvaluationRequest) -> TriggerEvaluationResponse:
    """Validate the scenario path and queue a background evaluation task."""
    logger.info("[trigger] scenario=%s", request.scenario_path)
    resolved = scenario_scanner.resolve_scenario_path(request.scenario_path)
    if resolved is None:
        logger.warning("[trigger] invalid scenario path: %s", request.scenario_path)
        raise HTTPException(
            status_code=400,
            detail=f"无效或不允许的场景路径: {request.scenario_path}",
        )
    task_id = task_manager.submit(request.scenario_path)
    logger.info("[trigger] queued  task_id=%s  scenario=%s", task_id, request.scenario_path)
    return TriggerEvaluationResponse(task_id=task_id)
@router.get("/{task_id}", response_model=TaskStatus)
 def get_task_status(task_id: str) -> TaskStatus:
    """Return the current status and logs for one evaluation task."""
    status = task_manager.get(task_id)
    if status is None:
        logger.warning("[task_status] not found  task_id=%s", task_id)
        raise HTTPException(status_code=404, detail=f"未找到任务: {task_id}")
    logger.debug("[task_status] task_id=%s  status=%s", task_id, status.status)
    return status
@router.get("", response_model=dict)
 def list_tasks() -> dict[str, list]:
    """Return all known evaluation tasks for this server session."""
    tasks = task_manager.list_tasks()
    logger.info("[list_tasks] count=%d", len(tasks))
    return {"tasks": [task.model_dump() for task in tasks]}
--- a/webapp/api/llm_profiles.py
+++ b/webapp/api/llm_profiles.py
@@ -0,0 +1,245 @@
 """CRUD routes for LLM profiles plus the scenario-patching apply endpoint."""
 from __future__ import annotations
 import logging
 import time
 from fastapi import APIRouter, HTTPException
 from openai import OpenAI
 from webapp.models import (
    CreateProfileRequest,
    LLMProfile,
    ProfileApplyRequest,
    ProfileApplyResponse,
    ProfileProbeRequest,
    ProfileTestResponse,
 )
 from webapp.services.profile_manager import profile_manager
 from webapp.services.yaml_patcher import apply_profiles_to_scenario
 router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"])
 logger = logging.getLogger("webapp.api.llm_profiles")
 # Keywords commonly found in embedding model names; used to decide automatically
 # whether the connectivity test should go through the /embeddings endpoint.
 _EMBEDDING_MODEL_KEYWORDS = (
    "embedding", "embed", "text-search", "text-similarity",
    "code-search", "ada-002",
 )
 def _is_embedding_model(model: str) -> bool:
    """Guess from the name whether *model* is an embedding model.

    The check is a case-insensitive substring match against
    ``_EMBEDDING_MODEL_KEYWORDS``.
    """
    lowered = model.lower()
    for keyword in _EMBEDDING_MODEL_KEYWORDS:
        if keyword in lowered:
            return True
    return False
 def _do_connectivity_test(
    model: str,
    base_url: str,
    api_key: str,
    timeout_seconds: int,
 ) -> ProfileTestResponse:
    """Send a minimal request and return the connectivity test result.

    - Embedding models (per ``_is_embedding_model``) call
      ``embeddings.create`` with a short text.
    - Chat models call ``chat.completions.create`` with ``max_tokens=8``. If
      that fails and the error text mentions both ``max_tokens`` and
      ``max_completion_tokens``, the call is retried once with
      ``max_completion_tokens=8`` instead.

    Args:
        model: Model name sent to the endpoint.
        base_url: Endpoint base URL; trailing slashes are stripped.
        api_key: API key passed to the OpenAI client.
        timeout_seconds: Client timeout in seconds.

    Returns:
        A ``ProfileTestResponse``. Exceptions raised by the request calls are
        not propagated; they are reported as ``ok=False`` with the exception
        text as the message. ``latency_ms`` is measured from just before the
        first request, so it includes the retry when one happens.
    """
    client = OpenAI(
        api_key=api_key,
        base_url=base_url.rstrip("/"),
        timeout=float(timeout_seconds),
    )
    t0 = time.monotonic()

    def _result(ok: bool, message: str) -> ProfileTestResponse:
        # Single place that stamps the elapsed time onto the response.
        latency_ms = int((time.monotonic() - t0) * 1000)
        return ProfileTestResponse(ok=ok, message=message, latency_ms=latency_ms)

    if _is_embedding_model(model):
        # Embedding models are tested through the embeddings endpoint.
        try:
            client.embeddings.create(model=model, input="test")
        except Exception as exc:  # noqa: BLE001
            return _result(False, str(exc))
        return _result(True, "连接成功（embedding）")
    chat_kwargs = {"model": model, "messages": [{"role": "user", "content": "hi"}]}
    # First attempt uses max_tokens=8: small enough to keep the probe cheap.
    try:
        client.chat.completions.create(**chat_kwargs, max_tokens=8)
    except Exception as exc:  # noqa: BLE001
        err_str = str(exc)
        # Only an error naming both parameters is treated as "max_tokens is
        # unsupported"; timeouts, auth errors etc. are returned as-is.
        if not ("max_tokens" in err_str and "max_completion_tokens" in err_str):
            return _result(False, err_str)
    else:
        return _result(True, "连接成功")
    # Retry once with max_completion_tokens in place of max_tokens.
    try:
        client.chat.completions.create(**chat_kwargs, max_completion_tokens=8)
    except Exception as exc:  # noqa: BLE001
        return _result(False, str(exc))
    return _result(True, "连接成功")
@router.post("/probe", response_model=ProfileTestResponse, tags=["llm-profiles"])
 def probe_connectivity(request: ProfileProbeRequest) -> ProfileTestResponse:
    """Test LLM connectivity with inline credentials (no saved profile required)."""
    logger.info("[probe] model=%s  base_url=%s", request.model, request.base_url)
    result = _do_connectivity_test(
        model=request.model,
        base_url=request.base_url,
        api_key=request.api_key,
        timeout_seconds=request.timeout_seconds,
    )
    logger.info("[probe] ok=%s  latency=%sms  msg=%s", result.ok, result.latency_ms, result.message)
    return result
@router.get("", response_model=dict)
 def list_profiles() -> dict:
    """Return all saved LLM profiles."""
    profiles = profile_manager.list_all()
    logger.info("[list_profiles] count=%d", len(profiles))
    return {"profiles": [p.model_dump() for p in profiles]}
@router.post("", status_code=201, response_model=LLMProfile)
 def create_profile(request: CreateProfileRequest) -> LLMProfile:
    """Create a new LLM profile."""
    logger.info("[create_profile] name=%r  model=%s  base_url=%s", request.name, request.model, request.base_url)
    profile = profile_manager.create(
        name=request.name,
        model=request.model,
        base_url=request.base_url,
        api_key=request.api_key,
        timeout_seconds=request.timeout_seconds,
    )
    logger.info("[create_profile] created  id=%s", profile.profile_id)
    return profile
@router.put("/{profile_id}", response_model=LLMProfile)
 def update_profile(profile_id: str, request: CreateProfileRequest) -> LLMProfile:
    """Update an existing LLM profile by id."""
    logger.info("[update_profile] id=%s  name=%r  model=%s", profile_id, request.name, request.model)
    updated = profile_manager.update(
        profile_id=profile_id,
        name=request.name,
        model=request.model,
        base_url=request.base_url,
        api_key=request.api_key,
        timeout_seconds=request.timeout_seconds,
    )
    if updated is None:
        logger.warning("[update_profile] not found  id=%s", profile_id)
        raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
    # Invalidate scorer cache so next request picks up the new profile settings.
    try:
        from webapp.services.inline_scorer import inline_scorer
        inline_scorer.invalidate_cache()
        logger.info("[update_profile] scorer cache invalidated  id=%s", profile_id)
    except Exception:  # noqa: BLE001
        pass
    logger.info("[update_profile] updated  id=%s", profile_id)
    return updated
@router.delete("/{profile_id}", response_model=dict)
 def delete_profile(profile_id: str) -> dict:
    """Delete an LLM profile by id."""
    logger.info("[delete_profile] id=%s", profile_id)
    deleted = profile_manager.delete(profile_id)
    if not deleted:
        logger.warning("[delete_profile] not found  id=%s", profile_id)
        raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
    # Invalidate scorer cache in case the deleted profile was in use.
    try:
        from webapp.services.inline_scorer import inline_scorer
        inline_scorer.invalidate_cache()
    except Exception:  # noqa: BLE001
        pass
    logger.info("[delete_profile] deleted  id=%s", profile_id)
    return {"deleted": True}
@router.post("/{profile_id}/test", response_model=ProfileTestResponse)
 def test_profile(profile_id: str) -> ProfileTestResponse:
    """Test LLM connectivity for a saved profile."""
    profile = profile_manager.get(profile_id)
    if profile is None:
        logger.warning("[test_profile] not found  id=%s", profile_id)
        raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
    logger.info("[test_profile] id=%s  model=%s  base_url=%s", profile_id, profile.model, profile.base_url)
    result = _do_connectivity_test(
        model=profile.model,
        base_url=profile.base_url,
        api_key=profile.api_key,
        timeout_seconds=profile.timeout_seconds,
    )
    logger.info("[test_profile] ok=%s  latency=%sms", result.ok, result.latency_ms)
    return result
@router.post("/apply", response_model=ProfileApplyResponse)
 def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
    """Patch selected LLM profiles into the target scenario YAML file."""
    logger.info(
        "[apply_profiles] scenario=%s  judge=%s  answer=%s  dataset=%s  metric_weights=%s  doc_weights=%s",
        request.scenario_path,
        request.judge_profile_id,
        request.answer_profile_id,
        request.dataset_profile_id,
        bool(request.metric_weights),
        bool(request.doc_weights),
    )
    role_profiles: dict[str, LLMProfile | None] = {
        "judge": profile_manager.get(request.judge_profile_id) if request.judge_profile_id else None,
        "answer": profile_manager.get(request.answer_profile_id) if request.answer_profile_id else None,
        "dataset": profile_manager.get(request.dataset_profile_id) if request.dataset_profile_id else None,
    }
    missing = [
        role
        for role, pid in [
            ("judge", request.judge_profile_id),
            ("answer", request.answer_profile_id),
            ("dataset", request.dataset_profile_id),
        ]
        if pid and role_profiles[role] is None
    ]
    if missing:
        logger.warning("[apply_profiles] missing profiles for roles: %s", missing)
        raise HTTPException(
            status_code=400,
            detail=f"Profile(s) not found for roles: {', '.join(missing)}",
        )
    patched = apply_profiles_to_scenario(
        scenario_path=request.scenario_path,
        judge_profile=role_profiles["judge"],
        answer_profile=role_profiles["answer"],
        dataset_profile=role_profiles["dataset"],
        metric_weights=request.metric_weights,
        doc_weights=request.doc_weights,
    )
    logger.info("[apply_profiles] patched fields: %s", patched)
    return ProfileApplyResponse(
        scenario_path=request.scenario_path,
        patched_fields=patched,
    )
--- a/webapp/api/pipeline.py
+++ b/webapp/api/pipeline.py
@@ -0,0 +1,131 @@
 """Routes for the end-to-end pipeline API (document parse → build → eval)."""
 from __future__ import annotations
 import logging
 from fastapi import APIRouter, HTTPException
 from webapp.models import (
    PipelineJobRequest,
    PipelineJobResponse,
    PipelineJobStatus,
 )
 from webapp.services.pipeline_task_manager import pipeline_task_manager
 router = APIRouter(prefix="/api/pipeline", tags=["pipeline"])
 logger = logging.getLogger("webapp.api.pipeline")
@router.post(
    "/jobs",
    status_code=202,
    response_model=PipelineJobResponse,
    summary="提交全链路评估任务",
    responses={
        202: {
            "description": "任务已成功排队，立即返回 job_id。",
            "content": {
                "application/json": {
                    "example": {
                        "job_id": "a1b2c3d4e5f6",
                        "job_name": "siemens-ct-eval-2026",
                        "status": "queued",
                    }
                }
            },
        },
        422: {"description": "请求参数校验失败（docs_path 等必填字段缺失或格式错误）。"},
    },
 )
 def submit_pipeline_job(request: PipelineJobRequest) -> PipelineJobResponse:
    """提交一个「解析文档 → 生成题库 → RAGAS 评估 → 输出报告」全链路任务。
    任务在后台线程中异步执行，立即返回 `job_id`。
    通过 `GET /api/pipeline/jobs/{job_id}` 轮询 `status` / `phase` / `logs`。
    **Pipeline 执行阶段**：
    1. `parsing_documents` — 调用阿里云 DocMind 解析每份 PDF
    2. `generating_questions` — LLM 从文档片段生成草稿题库
    3. `evaluating` — RAGAS 在线评测打分（answer_model 答题 + judge_model 评分）
    4. `done` — 所有产物写入磁盘，`status` 变为 `completed`
    """
    logger.info(
        "[submit_pipeline] docs_path=%s  job_name=%r  gen_model=%s  judge=%s  max_docs=%s",
        request.docs_path, request.job_name, request.generation_model,
        request.judge_model, request.max_documents,
    )
    task = pipeline_task_manager.submit(request)
    logger.info("[submit_pipeline] queued  job_id=%s  job_name=%s", task.job_id, task.job_name)
    return PipelineJobResponse(
        job_id=task.job_id,
        job_name=task.job_name,
        status=task.status,
    )
@router.get(
    "/jobs/{job_id}",
    response_model=PipelineJobStatus,
    summary="查询任务状态",
    responses={
        200: {"description": "返回任务当前状态、执行阶段、日志及完成后的产物路径。"},
        404: {"description": "指定 job_id 的任务不存在。"},
    },
 )
 def get_pipeline_job(job_id: str) -> PipelineJobStatus:
    """查询一个 Pipeline 任务的当前状态、执行阶段、实时日志和结果。
    **轮询建议**：每 3–5 秒查询一次，直到 `status` 为 `completed` 或 `failed`。
    `result` 字段在任务完成后填充，包含：
    - `scores_csv` — 每道题目逐项评分
    - `summary_md` — 评估摘要 Markdown
    - `dataset_csv` — 生成的题库 CSV
    - `source_chunks_jsonl` — 文档片段索引
    """
    status = pipeline_task_manager.get(job_id)
    if status is None:
        logger.warning("[get_pipeline_job] not found  job_id=%s", job_id)
        raise HTTPException(status_code=404, detail=f"Pipeline job not found: {job_id}")
    logger.debug("[get_pipeline_job] job_id=%s  status=%s  phase=%s", job_id, status.status, status.phase)
    return status
@router.get(
    "/jobs",
    response_model=dict,
    summary="列出所有任务",
    responses={
        200: {
            "description": "按创建时间倒序返回本次服务器会话中所有的 Pipeline 任务。",
            "content": {
                "application/json": {
                    "example": {
                        "jobs": [
                            {
                                "job_id": "a1b2c3d4e5f6",
                                "job_name": "siemens-ct-eval",
                                "status": "completed",
                                "phase": "done",
                                "logs": ["[build] 17 documents parsed", "..."],
                                "result": {
                                    "total_questions": 19,
                                    "eval_run_id": "2026-06-18T...",
                                    "scores_csv": "outputs/pipeline/.../scores.csv",
                                    "summary_md": "outputs/pipeline/.../summary.md",
                                },
                                "error": None,
                            }
                        ]
                    }
                }
            },
        }
    },
 )
 def list_pipeline_jobs() -> dict:
    """返回本次服务器会话中所有已提交的 Pipeline 任务，按创建时间倒序排列。"""
    jobs = pipeline_task_manager.list_jobs()
    logger.info("[list_pipeline_jobs] count=%d", len(jobs))
    return {"jobs": [s.model_dump() for s in jobs]}
--- a/webapp/api/runs.py
+++ b/webapp/api/runs.py
@@ -0,0 +1,43 @@
 """Routes for listing evaluation runs and fetching a single run's report."""
 from __future__ import annotations
 import logging
 from fastapi import APIRouter, HTTPException
 from webapp.models import RunDetail
 from webapp.services import report_builder, run_reader
 router = APIRouter(prefix="/api/runs", tags=["runs"])
 logger = logging.getLogger("webapp.api.runs")
@router.get("")
 def get_runs() -> dict[str, list]:
    """Return summaries for every discoverable evaluation run."""
    summaries = run_reader.list_run_summaries()
    logger.info("[get_runs] found %d runs", len(summaries))
    return {"runs": [summary.model_dump() for summary in summaries]}
@router.get("/{run_id}")
 def get_run_detail(run_id: str) -> RunDetail:
    """Return the full summary and aggregated report for one run."""
    logger.info("[get_run_detail] run_id=%s", run_id)
    run_dir = run_reader.find_run_dir(run_id)
    if run_dir is None:
        logger.warning("[get_run_detail] not found  run_id=%s", run_id)
        raise HTTPException(status_code=404, detail=f"未找到运行: {run_id}")
    summary = run_reader.build_run_summary(run_dir)
    if summary is None:
        logger.warning("[get_run_detail] missing metadata  run_id=%s", run_id)
        raise HTTPException(status_code=404, detail=f"运行元数据缺失: {run_id}")
    report = report_builder.build_report(run_dir, summary.metrics)
    logger.info(
        "[get_run_detail] ok  run_id=%s  metrics=%s  valid=%d  invalid=%d",
        run_id, summary.metrics, summary.valid_samples, summary.invalid_samples,
    )
    return RunDetail(summary=summary, report=report)
--- a/webapp/api/scenarios.py
+++ b/webapp/api/scenarios.py
@@ -0,0 +1,21 @@
 """Route for discovering scenario YAML files that can be evaluated."""
 from __future__ import annotations
 import logging
 from fastapi import APIRouter
 from webapp.services import scenario_scanner
 router = APIRouter(prefix="/api/scenarios", tags=["scenarios"])
 logger = logging.getLogger("webapp.api.scenarios")
@router.get("")
 def get_scenarios() -> dict[str, list]:
    """Return every scenario file found under the scenarios/ directory."""
    scenarios = scenario_scanner.list_scenarios()
    valid = sum(1 for s in scenarios if not s.error)
    logger.info("[get_scenarios] total=%d  valid=%d  errors=%d", len(scenarios), valid, len(scenarios) - valid)
    return {"scenarios": [item.model_dump() for item in scenarios]}
--- a/webapp/api/score.py
+++ b/webapp/api/score.py
@@ -0,0 +1,176 @@
 """Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
 from __future__ import annotations
 import logging
 import time
 from typing import Annotated
 from fastapi import APIRouter, Header, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
 from rag_eval.metrics.weights import compute_weighted_score
 from rag_eval.settings import EvaluationSettings
 from webapp.models import ScoreRequest, ScoreResponse
 from webapp.services.inline_scorer import inline_scorer
 router = APIRouter(prefix="/api/score", tags=["score"])
 logger = logging.getLogger("webapp.api.score")
 def _get_settings() -> EvaluationSettings:
    """Build a new ``EvaluationSettings`` on every call.

    Kept as a module-level function so tests can replace it.
    """
    fresh_settings = EvaluationSettings()
    return fresh_settings
 def _check_auth(authorization: str | None, token: str) -> None:
    """Raise 401 if Bearer token does not match the configured token."""
    if authorization is None:
        raise HTTPException(status_code=401, detail="Missing Authorization header.")
    parts = authorization.split(" ", 1)
    if len(parts) != 2 or parts[0].lower() != "bearer" or parts[1] != token:
        raise HTTPException(status_code=401, detail="Invalid Bearer token.")
@router.post(
    "",
    response_model=ScoreResponse,
    summary="单题实时评分（Dify 外部 Tool）",
    responses={
        200: {
            "description": "各指标得分、加权综合得分及耗时。",
            "content": {
                "application/json": {
                    "example": {
                        "scores": {
                            "faithfulness": 0.875,
                            "answer_relevancy": 0.920,
                            "context_recall": 0.810,
                            "context_precision": 0.850,
                        },
                        "weighted_score": 0.8638,
                        "latency_ms": 3420,
                        "skipped_metrics": [],
                        "error": None,
                    }
                }
            },
        },
        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
        422: {"description": "请求参数校验失败（必填字段缺失或 metrics 名称不合法）。"},
    },
 )
 def score_sample(
    raw_request: Request,
    request: ScoreRequest,
    authorization: Annotated[str | None, Header()] = None,
 ) -> ScoreResponse:
    """接受单条问答记录，同步运行 RAGAS 指标打分，实时返回各指标得分。
    **主要用途**：供 Dify 外部 Tool 调用。Dify Agent 在生成回答后，将
    `(question, answer, contexts)` 发送到此端点，即可获得 RAGAS 质量评分，
    用于日志记录、质量监控或触发 Agent 自我改进流程。
    **contexts 格式**：多个检索片段用 `context_separator`（默认 `" |||| "`）拼接为一个字符串，
    服务端自动拆分后传入 RAGAS 管道。**contexts 为可选字段**，缺失时自动跳过依赖检索内容的指标
    （`faithfulness`、`context_recall`、`context_precision`、`noise_sensitivity`）。
    **ground_truth 可选**：
    - 提供时：所有指定指标均参与计算。
    - 缺失时：自动跳过依赖参考答案的指标（`context_recall`、
      `factual_correctness`、`semantic_similarity`、`noise_sensitivity`），
      跳过的指标在响应的 `skipped_metrics` 列表中列出，对应 `scores` 值为 `null`。
    **支持的 RAGAS 指标**：
    - `faithfulness` — 回答与检索片段的事实一致性
    - `answer_relevancy` — 回答与问题的相关性
    - `context_recall` — 参考答案覆盖到的检索内容比例（需 ground_truth）
    - `context_precision` — 检索片段中与答案相关的部分占比
    - `noise_sensitivity` — 对无关噪声片段的敏感度（需 ground_truth）
    - `factual_correctness` — 回答与参考答案的事实准确性（需 ground_truth）
    - `semantic_similarity` — 回答与参考答案的语义相似度（需 ground_truth）
    **推荐模型配置**：
    - `judge_model`: `gpt-5`
    - `embedding_model`: `text-embedding-3-small`
    **鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需在请求头携带
    `Authorization: Bearer <token>`；留空则无需鉴权（适合内网部署）。
    """
    client = f"{raw_request.client.host}:{raw_request.client.port}" if raw_request.client else "unknown"
    logger.info(
        "[score] incoming  client=%s  method=%s  content_type=%s  metrics=%s  has_gt=%s  has_ctx=%s",
        client,
        raw_request.method,
        raw_request.headers.get("content-type", ""),
        request.metrics,
        request.ground_truth is not None,
        bool(request.contexts),
    )
    settings = _get_settings()
    # Require Bearer auth only when the deployment configured a shared token.
    if settings.score_api_token:
        _check_auth(authorization, settings.score_api_token)
    judge_model = request.judge_model or settings.ragas_judge_model
    embedding_model = request.embedding_model or settings.ragas_embedding_model
    effective = request.effective_metrics()
    requested = set(request.metrics)
    skipped = sorted(requested - set(effective))
    if not effective:
        return ScoreResponse(
            scores={metric_name: None for metric_name in request.metrics},
            weighted_score=None,
            latency_ms=0,
            skipped_metrics=skipped,
        )
    t0 = time.monotonic()
    try:
        raw_scores = inline_scorer.score(
            question=request.question,
            answer=request.answer,
            contexts=request.contexts_as_list(),
            ground_truth=request.ground_truth,
            metrics=effective,
            judge_model=judge_model,
            embedding_model=embedding_model,
            settings=settings,
        )
    except Exception as exc:  # noqa: BLE001
        latency_ms = int((time.monotonic() - t0) * 1000)
        return ScoreResponse(
            scores={},
            weighted_score=None,
            latency_ms=latency_ms,
            skipped_metrics=skipped,
            error=f"{type(exc).__name__}: {exc}",
        )
    latency_ms = int((time.monotonic() - t0) * 1000)
    # Keep skipped metrics visible to callers by emitting them as null scores.
    all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
    all_scores.update(raw_scores)
    # 综合加权得分计算（已暂时禁用）
    # weighted = compute_weighted_score(
    #     {key: value for key, value in raw_scores.items() if value is not None},
    #     {},
    # )
    logger.info(
        "[score] done  latency=%dms  skipped=%s  scores=%s",
        latency_ms,
        skipped,
        {k: (round(v, 4) if v is not None else None) for k, v in all_scores.items()},
    )
    return ScoreResponse(
        scores=all_scores,
        weighted_score=None,  # 综合加权得分已暂时禁用
        latency_ms=latency_ms,
        skipped_metrics=skipped,
    )
--- a/webapp/api/score_jobs.py
+++ b/webapp/api/score_jobs.py
@@ -0,0 +1,89 @@
 """Routes for async RAGAS scoring jobs (Dify fire-and-forget integration).
 Dify calls POST /api/score/async → gets job_id immediately (202).
 Scoring runs in background, result written as a standard run artifact.
 View full report at GET /api/runs/{run_id} or in the 「运行列表」 page.
 """
 from __future__ import annotations
 import logging
 from fastapi import APIRouter, HTTPException
 from webapp.models import AsyncScoreJobResponse, AsyncScoreJobStatus, ScoreRequest
 from webapp.services.score_job_manager import score_job_manager
 router = APIRouter(prefix="/api/score", tags=["score"])
 logger = logging.getLogger("webapp.api.score_jobs")
@router.post(
    "/async",
    status_code=202,
    response_model=AsyncScoreJobResponse,
    summary="提交异步评分任务（Dify 推荐方式）",
    responses={
        202: {
            "description": (
                "任务已排队，立即返回 job_id（202 Accepted）。\n\n"
                "评分在后台执行，完成后自动生成完整报告（含优化建议）。\n"
                "通过 `GET /api/score/jobs/{job_id}` 查询状态，"
                "完成后在「运行列表」页查看完整报告。"
            ),
            "content": {
                "application/json": {
                    "example": {"job_id": "abc123def456", "status": "queued", "run_id": None}
                }
            },
        },
    },
 )
 def submit_async_score(request: ScoreRequest) -> AsyncScoreJobResponse:
    """提交异步 RAGAS 评分任务，立即返回 job_id。
    **适合 Dify 工作流**：HTTP 节点无需等待评分完成（无超时风险），
    工作流立即继续，评分结果在 RAGAS 平台「运行列表」中查看。
    评分完成后自动生成：
    - 各指标得分（`scores.csv`）
    - 摘要报告（`summary.md`）
    - LLM 优化建议（`optimization_advice.md`）
    """
    logger.info(
        "[score_async] submit  metrics=%s  has_ctx=%s  has_gt=%s",
        request.metrics, bool(request.contexts), bool(request.ground_truth),
    )
    status = score_job_manager.submit(request)
    logger.info("[score_async] queued  job_id=%s", status.job_id)
    return AsyncScoreJobResponse(job_id=status.job_id, status=status.status)
@router.get(
    "/jobs",
    response_model=dict,
    summary="列出所有异步评分记录",
 )
 def list_score_jobs() -> dict:
    """返回所有异步评分记录，按创建时间倒序排列。"""
    jobs = score_job_manager.list_jobs()
    logger.info("[score_jobs] list  count=%d", len(jobs))
    return {"jobs": [j.model_dump() for j in jobs]}
@router.get(
    "/jobs/{job_id}",
    response_model=AsyncScoreJobStatus,
    summary="查询单个异步评分任务状态",
    responses={404: {"description": "指定 job_id 的评分任务不存在。"}},
 )
 def get_score_job(job_id: str) -> AsyncScoreJobStatus:
    """查询单个异步评分任务的状态和结果。
    `status` 为 `completed` 时，`run_id` 字段包含对应的运行 ID，
    可通过 `GET /api/runs/{run_id}` 获取完整评分报告。
    """
    status = score_job_manager.get(job_id)
    if status is None:
        raise HTTPException(status_code=404, detail=f"Score job not found: {job_id}")
    return status
--- a/webapp/api/session_score_jobs.py
+++ b/webapp/api/session_score_jobs.py
@@ -0,0 +1,206 @@
 """Routes for session-grouped async RAGAS scoring (Dify multi-call integration).
 Use case: Dify evaluates multiple Q&A pairs in a session. Each pair gets its own
 `POST /api/score/session_async` call with a shared `session_id`. All results are
 accumulated into one report, visible in 「运行列表」→「报告详情」.
 Key behaviour:
  - Deterministic run_id: derived from session_id — same session always maps to the
    same report directory (outputs/score-session/session-<id>/).
  - Append semantics: each call adds a new sample row. Previous rows are preserved.
  - Advisor regeneration: optimization_advice.md is regenerated after every call
    using the full set of accumulated rows.
  - Each call returns its own `job_id` for individual status polling, plus the
    shared `run_id` and `session_id`.
 Endpoints:
  POST /api/score/session_async         Submit one call (returns job_id + run_id)
  GET  /api/score/sessions              List all sessions
  GET  /api/score/sessions/{session_id} Session aggregate (call_count, metric_means, jobs)
  GET  /api/score/session/jobs/{job_id} Status of one individual call
 """
 from __future__ import annotations
 import logging
 from fastapi import APIRouter, HTTPException
 from webapp.models import (
    AsyncScoreJobStatus,
    ScoreRequest,
    SessionScoreJobResponse,
    SessionScoreRequest,
    SessionStatus,
 )
 from webapp.services.session_score_manager import session_score_manager
 router = APIRouter(prefix="/api/score", tags=["score"])
 logger = logging.getLogger("webapp.api.session_score_jobs")
@router.post(
    "/session_async",
    status_code=202,
    response_model=SessionScoreJobResponse,
    summary="提交 Session 异步评分（多样本批量聚合）",
    description=(
        "**用途**\n"
        "- 适合 Dify 循环节点、批量问答评测、同一对话多轮累计评分。\n"
        "- 相同 `session_id` 的多次调用不会生成多个独立报告，而是持续追加到同一个 session 报告。\n\n"
        "**请求字段说明**\n"
        "- `session_id`：会话唯一标识，同一会话必须保持一致。\n"
        "- `question` / `answer`：本次待评分的问答对。\n"
        "- `contexts`：检索片段拼接字符串，按 `context_separator` 拆分。\n"
        "- `ground_truth`：标准答案，可选；缺失时会自动跳过依赖它的指标。\n"
        "- `metrics`：本次需要计算的指标列表。\n"
        "- `judge_model` / `embedding_model`：可选；为空时回退到系统默认配置。\n\n"
        "**处理行为**\n"
        "1. 服务端立即返回 `202 Accepted`，并生成本次调用的 `job_id`。\n"
        "2. 系统根据 `session_id` 计算固定 `run_id`，格式为 `session-<sanitized-session_id>`。\n"
        "3. 本次评分完成后，会向该 session 的 `scores.csv` 追加一行样本数据。\n"
        "4. 系统会基于当前 session 的全量样本重写 `summary.md`，并重新生成 `optimization_advice.md`。\n"
        "5. 报告可在「运行列表」中按 `run_id` 查看；同一 session 的后续调用会持续增量更新该报告。\n\n"
        "**后续查询接口**\n"
        "- `GET /api/score/session/jobs/{job_id}`：查询本次调用状态与得分。\n"
        "- `GET /api/score/sessions/{session_id}`：查询整个 session 的累计调用次数、指标均值、所有作业记录。\n"
        "- `GET /api/runs/{run_id}`：查看完整评估报告内容。\n\n"
        "**典型请求示例**\n"
        "```json\n"
        "{\n"
        "  \"session_id\": \"dify-session-001\",\n"
        "  \"question\": \"单源CT与双源CT在球管配置上有何本质区别？\",\n"
        "  \"answer\": \"单源CT只有一套球管-探测器系统，双源CT有两套独立的球管-探测器系统。\",\n"
        "  \"contexts\": \"双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管\",\n"
        "  \"context_separator\": \" |||| \",\n"
        "  \"metrics\": [\"answer_relevancy\", \"faithfulness\"],\n"
        "  \"judge_model\": \"gpt-5.5\",\n"
        "  \"embedding_model\": \"text-embedding-3-small\"\n"
        "}\n"
        "```"
    ),
    responses={
        202: {
            "description": (
                "调用已排队，立即返回 job_id + run_id（202 Accepted）。\n\n"
                "相同 `session_id` 的多次调用合并为同一报告，每次调用新增一个样本行。\n"
                "评分完成后，`summary.md` 和 `optimization_advice.md` 增量更新。\n"
                "通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态，"
                "通过 `GET /api/score/session/jobs/{job_id}` 查询单次调用状态，"
                "在「运行列表」中查看完整报告（run_id 即 `session-<session_id>` 形式）。"
            ),
            "content": {
                "application/json": {
                    "example": {
                        "job_id": "abc123def456",
                        "session_id": "dify-session-001",
                        "run_id": "session-dify-session-001",
                        "status": "queued",
                        "call_count": 1,
                    }
                }
            },
        },
    },
 )
 def submit_session_async_score(request: SessionScoreRequest) -> SessionScoreJobResponse:
    """提交 Session 异步 RAGAS 评分，立即返回 job_id。
    相同 `session_id` 的多次调用合并到同一评估报告中，每次调用：
    1. 新增一个样本行到 `scores.csv`
    2. 重写 `summary.md`（包含所有累积样本的指标均值）
    3. 重新生成 `optimization_advice.md`（基于全量样本的 LLM 优化建议）
    **适合 Dify 工作流**：在循环节点中批量调用，所有轮次共用同一 `session_id`，
    最终在 RAGAS 平台「运行列表」中查看完整的批量评估报告。
    """
    logger.info(
        "[session_async] submit  session_id=%s  metrics=%s  has_ctx=%s  has_gt=%s",
        request.session_id,
        request.metrics,
        bool(request.contexts),
        bool(request.ground_truth),
    )
    # Strip session_id to build a plain ScoreRequest for the manager
    score_request = ScoreRequest(
        question=request.question,
        answer=request.answer,
        contexts=request.contexts,
        ground_truth=request.ground_truth,
        context_separator=request.context_separator,
        metrics=request.metrics,
        judge_model=request.judge_model,
        embedding_model=request.embedding_model,
    )
    status, run_id = session_score_manager.submit(request.session_id, score_request)
    # Compute call_count from current session state
    session_status = session_score_manager.get_session(request.session_id)
    call_count = session_status.call_count if session_status else 1
    logger.info(
        "[session_async] queued  job_id=%s  session_id=%s  run_id=%s  call=%d",
        status.job_id, request.session_id, run_id, call_count,
    )
    return SessionScoreJobResponse(
        job_id=status.job_id,
        session_id=request.session_id,
        run_id=run_id,
        status=status.status,
        call_count=call_count,
    )
@router.get(
    "/sessions",
    response_model=dict,
    summary="列出所有 Session 聚合状态",
 )
 def list_sessions() -> dict:
    """返回所有 session 的聚合状态，按最近完成时间倒序排列。"""
    sessions = session_score_manager.list_sessions()
    logger.info("[session_score] list_sessions  count=%d", len(sessions))
    return {"sessions": [s.model_dump() for s in sessions]}
@router.get(
    "/sessions/{session_id}",
    response_model=SessionStatus,
    summary="查询 Session 聚合状态（指标均值 + 所有调用记录）",
    responses={404: {"description": "指定 session_id 不存在。"}},
 )
 def get_session(session_id: str) -> SessionStatus:
    """查询 session 的聚合评分状态。
    返回内容：
    - `run_id`：在「运行列表」中查看完整报告
    - `call_count`：本 session 累计调用次数
    - `metric_means`：所有已累积样本的各指标均值（实时读取 scores.csv）
    - `jobs`：本 session 所有调用记录列表
    """
    status = session_score_manager.get_session(session_id)
    if status is None:
        raise HTTPException(status_code=404, detail=f"Session not found: {session_id}")
    return status
@router.get(
    "/session/jobs/{job_id}",
    response_model=AsyncScoreJobStatus,
    summary="查询 Session 单次调用状态",
    responses={404: {"description": "指定 job_id 不存在。"}},
 )
 def get_session_job(job_id: str) -> AsyncScoreJobStatus:
    """查询 session 评分中某次调用的状态和评分结果。
    `status` 为 `completed` 时，`run_id` 即所属 session 的报告目录，
    `scores` 包含本次调用的各指标得分。
    """
    status = session_score_manager.get_job(job_id)
    if status is None:
        raise HTTPException(
            status_code=404, detail=f"Session score job not found: {job_id}"
        )
    return status
--- a/webapp/models.py
+++ b/webapp/models.py
@@ -0,0 +1,617 @@
 """Pydantic response models for the evaluation console HTTP API."""
 from __future__ import annotations
 from datetime import datetime, timezone
 from typing import Any
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 def _utcnow_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    current_utc = datetime.now(tz=timezone.utc)
    return current_utc.isoformat()
 class RunSummary(BaseModel):
    """Compact description of a single evaluation run for list views."""
    # Only run_id and scenario_name are required; every other field has an
    # empty or zero default.
    run_id: str
    scenario_name: str
    mode: str = ""
    judge_model: str = ""
    embedding_model: str = ""
    # Timestamps are plain strings; this model does not enforce a format.
    started_at: str = ""
    finished_at: str = ""
    dataset: str = ""
    # Sample counters; the model does not check that valid + invalid == total.
    total_samples: int = 0
    valid_samples: int = 0
    invalid_samples: int = 0
    metrics: list[str] = Field(default_factory=list)
    # Metric name -> mean value; None is an accepted value for a metric.
    metric_means: dict[str, float | None] = Field(default_factory=dict)
    output_path: str = ""
 class GroupStat(BaseModel):
    """Mean metric values for one slice of samples grouped by a metadata field."""
    # key: the group's value for the grouping field; count: samples in the group.
    key: str
    count: int
    # Metric name -> mean within this group; None is an accepted value.
    means: dict[str, float | None] = Field(default_factory=dict)
 class DistributionBin(BaseModel):
    """One histogram bucket of sample counts for a single metric."""
    # All four fields are required.
    # NOTE(review): whether lower/upper are inclusive or exclusive bounds is
    # decided by the code that builds the bins, not by this model.
    label: str
    lower: float
    upper: float
    count: int
 class SampleScore(BaseModel):
    """Per-sample row used for the lowest-score review table."""
    # sample_id is the only required field; the rest default to empty values.
    sample_id: str
    question: str = ""
    contexts: list[str] = Field(default_factory=list)
    answer: str = ""
    ground_truth: str = ""
    # Metadata columns, stored as free-form strings.
    language: str = ""
    difficulty: str = ""
    question_type: str = ""
    # Metric name -> score for this sample; None is an accepted value.
    metrics: dict[str, float | None] = Field(default_factory=dict)
    # Defaults to None when no mean has been supplied.
    mean_score: float | None = None
    error: str = ""
 class ReportData(BaseModel):
    """Aggregated report payload rendered by the report detail page."""
    # Every field has a default, so an empty ReportData() is valid.
    metrics: list[str] = Field(default_factory=list)
    metric_means: dict[str, float | None] = Field(default_factory=dict)
    # Keyed by string; each value is that key's list of histogram bins.
    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)
    # Keyed by string; each value is that key's list of per-group statistics.
    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
    lowest_samples: list[SampleScore] = Field(default_factory=list)
    summary_markdown: str = ""
    advice_markdown: str = ""  # optimization_advice.md content (empty if not generated)
    # Description (EN): mean weighted composite score, with metric_weights and
    # doc_weights both applied.
    weighted_score_mean: float | None = Field(
        default=None,
        description="加权综合得分均值（metric_weights × doc_weights 共同作用）。",
    )
    # Description (EN): metric weights used by this run (from scenario.snapshot.yaml).
    metric_weights: dict[str, float] = Field(
        default_factory=dict,
        description="该次运行使用的指标权重配置（来自 scenario.snapshot.yaml）。",
    )
    # Description (EN): document weights used by this run (from scenario.snapshot.yaml).
    doc_weights: dict[str, float] = Field(
        default_factory=dict,
        description="该次运行使用的文档权重配置（来自 scenario.snapshot.yaml）。",
    )
 class RunDetail(BaseModel):
    """Full payload for a single run: summary metadata plus the report."""
    # Both parts are required; neither has a default.
    summary: RunSummary
    report: ReportData
 class ScenarioInfo(BaseModel):
    """One discoverable scenario YAML file that can be evaluated from the UI."""
    # path is the only required field.
    path: str
    scenario_name: str = ""
    mode: str = ""
    dataset: str = ""
    judge_model: str = ""
    metrics: list[str] = Field(default_factory=list)
    # NOTE(review): presumably carries the load/parse failure message for a
    # scenario that could not be read — confirm against the scenario scanner.
    error: str = ""
    # Description (EN): metric weights read from the scenario YAML, used to
    # pre-fill the frontend weight panel.
    metric_weights: dict[str, float] = Field(
        default_factory=dict,
        description="从场景 YAML 读取的指标权重配置，供前端权重面板预填。",
    )
    # Description (EN): document weights read from the scenario YAML, used to
    # pre-fill the frontend weight panel.
    doc_weights: dict[str, float] = Field(
        default_factory=dict,
        description="从场景 YAML 读取的文档权重配置，供前端权重面板预填。",
    )
 class TaskStatus(BaseModel):
    """State of a background evaluation task tracked by the task manager."""
    task_id: str
    scenario_path: str
    # Free-form string; the set of allowed status values is not constrained here.
    status: str
    logs: list[str] = Field(default_factory=list)
    # Both default to None until a value is supplied.
    run_id: str | None = None
    error: str | None = None
    # Timestamps are plain strings defaulting to empty.
    created_at: str = ""
    finished_at: str = ""
 class TriggerEvaluationRequest(BaseModel):
    """Request body for launching an evaluation run from the UI."""
    # Required; path validity is not checked by this model.
    scenario_path: str
 class TriggerEvaluationResponse(BaseModel):
    """Response returned immediately after queuing an evaluation task."""
    # Identifier of the queued task.
    task_id: str
 class LLMProfile(BaseModel):
    """A named LLM connection configuration that can be reused across tasks."""
    profile_id: str
    name: str
    model: str
    base_url: str
    # NOTE(review): api_key is a plain str, so it is included verbatim whenever
    # this model is serialized — confirm API responses mask or omit it.
    api_key: str
    timeout_seconds: int = 30
    # Both timestamps default to the current UTC time at instantiation.
    created_at: str = Field(default_factory=_utcnow_iso)
    updated_at: str = Field(default_factory=_utcnow_iso)
 class CreateProfileRequest(BaseModel):
    """Request body for creating or updating an LLM profile."""
    # Same user-editable fields as LLMProfile, without profile_id or timestamps.
    name: str
    model: str
    base_url: str
    api_key: str
    timeout_seconds: int = 30
 class ProfileApplyRequest(BaseModel):
    """Request body to patch LLM profile selections into a scenario YAML."""
    scenario_path: str
    # One optional profile id per role (judge / answer / dataset); each
    # defaults to None.
    judge_profile_id: str | None = None
    answer_profile_id: str | None = None
    dataset_profile_id: str | None = None
    # Description (EN): metric weight mapping, e.g. {"faithfulness": 0.35};
    # null leaves the YAML unchanged.
    metric_weights: dict[str, float] | None = Field(
        default=None,
        description="指标权重映射，如 {\"faithfulness\": 0.35}。为 null 时不修改 YAML。",
    )
    # Description (EN): document weight mapping, e.g. {"doc.pdf": 2.0};
    # null leaves the YAML unchanged.
    doc_weights: dict[str, float] | None = Field(
        default=None,
        description="文档权重映射，如 {\"doc.pdf\": 2.0}。为 null 时不修改 YAML。",
    )
 class ProfileApplyResponse(BaseModel):
    """Response after patching a scenario YAML with profile settings."""
    scenario_path: str
    # Defaults to an empty list.
    patched_fields: list[str] = Field(default_factory=list)
 class ProfileProbeRequest(BaseModel):
    """Inline credentials for testing LLM connectivity without saving a profile."""
    # Connection fields only; there is no name or profile_id.
    model: str
    base_url: str
    api_key: str
    timeout_seconds: int = 30
 class ProfileTestResponse(BaseModel):
    """Result of a LLM connectivity test."""
    ok: bool
    message: str
    # Defaults to None when no latency value is supplied.
    latency_ms: int | None = None
 def jsonable(value: Any) -> Any:
    """Recursively replace NaN/inf floats with None so the payload stays valid JSON.

    Dicts, lists and tuples are walked recursively. Dict keys are kept as-is
    and only values are sanitized. Tuples come back as plain tuples (a
    namedtuple is returned as an ordinary tuple). Any other value, including
    ints, strings, bools and None, is returned unchanged.
    """
    import math

    if isinstance(value, float):
        # math.isfinite is False for NaN, +inf and -inf alike.
        return value if math.isfinite(value) else None
    if isinstance(value, dict):
        return {key: jsonable(item) for key, item in value.items()}
    if isinstance(value, list):
        return [jsonable(item) for item in value]
    if isinstance(value, tuple):
        # Tuples serialize as JSON arrays too, so non-finite floats inside
        # them must be sanitized just like list items.
        return tuple(jsonable(item) for item in value)
    return value
 # ---------------------------------------------------------------------------
 # Full pipeline (build + eval) job models
 # ---------------------------------------------------------------------------
 class PipelineJobRequest(BaseModel):
    """Request body for launching an end-to-end build + evaluation pipeline job."""
    # JSON Schema `examples` must be a list of example *instances*. The
    # {"summary": ..., "value": ...} wrapper belongs to OpenAPI's
    # `openapi_examples`, so each entry below is the request body itself.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                # Siemens CT document evaluation (full parameter set).
                {
                    "docs_path": "datasets/siemens-pdfs",
                    "job_name": "siemens-ct-eval-2026",
                    "generation_model": "qwen3.6-plus",
                    "answer_model": "deepseek-v4-flash",
                    "judge_model": "deepseek-v4-flash",
                    "embedding_model": "text-embedding-v3",
                    "max_questions_per_document": 10,
                    "max_source_chunks_per_question": 3,
                    "max_documents": None,
                    "max_samples": None,
                    "metrics": [
                        "faithfulness",
                        "answer_relevancy",
                        "context_recall",
                        "context_precision",
                    ],
                    "optimization_advisor": False,
                    "failure_mode": "skip",
                },
                # Quick smoke test (only 2 documents, 5 questions each).
                {
                    "docs_path": "datasets/siemens-pdfs",
                    "job_name": "smoke-test",
                    "generation_model": "qwen3.6-plus",
                    "answer_model": "deepseek-v4-flash",
                    "judge_model": "deepseek-v4-flash",
                    "embedding_model": "text-embedding-v3",
                    "max_questions_per_document": 5,
                    "max_source_chunks_per_question": 3,
                    "max_documents": 2,
                    "max_samples": 10,
                    "metrics": ["faithfulness", "answer_relevancy"],
                    "optimization_advisor": False,
                    "failure_mode": "skip",
                },
            ]
        }
    )
    # Description (EN): folder holding the PDF documents, absolute or relative
    # to the repository root. This is the only required field.
    docs_path: str = Field(
        description="PDF 文档所在文件夹的绝对路径或相对于仓库根目录的相对路径。"
    )
    # Description (EN): display name; a unique identifier is generated when empty.
    job_name: str = Field(
        default="",
        description="任务显示名称；留空时系统自动生成唯一标识。",
    )
    # Description (EN): LLM used to generate the draft question bank from chunks.
    generation_model: str = Field(
        default="qwen3.6-plus",
        description="用于从文档片段生成草稿题库的 LLM 模型名称。",
    )
    # Description (EN): answering LLM called during online evaluation.
    answer_model: str = Field(
        default="deepseek-v4-flash",
        description="在线评估时调用的答题 LLM 模型名称（siemens_pdf_qa adapter）。",
    )
    # Description (EN): judge LLM used for RAGAS metric scoring.
    judge_model: str = Field(
        default="deepseek-v4-flash",
        description="RAGAS 指标评分时使用的 Judge LLM 模型名称。",
    )
    # Description (EN): embedding model used by RAGAS.
    embedding_model: str = Field(
        default="text-embedding-v3",
        description="RAGAS context-recall / context-precision 使用的 Embedding 模型名称。",
    )
    # The four limits below must be > 0 when given (gt=0).
    max_questions_per_document: int = Field(
        default=10, gt=0,
        description="每份 PDF 文档最多生成的草稿题目数量。",
    )
    max_source_chunks_per_question: int = Field(
        default=3, gt=0,
        description="每道题目最多引用的文档片段（source chunk）数量。",
    )
    # None means no cap on the number of PDFs processed.
    max_documents: int | None = Field(
        default=None, gt=0,
        description="限制处理的 PDF 文件数量上限（冒烟测试时使用）。",
    )
    # None means no cap on the number of questions evaluated.
    max_samples: int | None = Field(
        default=None, gt=0,
        description="限制评估的题目数量上限（冒烟测试时使用）。",
    )
    # NOTE(review): unlike ScoreRequest.metrics, these names are not validated
    # by this model — confirm the pipeline rejects unknown metrics downstream.
    metrics: list[str] = Field(
        default_factory=lambda: [
            "faithfulness",
            "answer_relevancy",
            "context_recall",
            "context_precision",
        ],
        description=(
            "需要计算的 RAGAS 指标列表。"
            "可选值：faithfulness, answer_relevancy, context_recall, "
            "context_precision, noise_sensitivity, factual_correctness, semantic_similarity。"
        ),
    )
    # Description (EN): when True, enable the optimization advisor, which
    # generates optimization_advice.md.
    optimization_advisor: bool = Field(
        default=False,
        description="为 True 时启用 RAGAS 优化建议模块，生成 optimization_advice.md。",
    )
    # Description (EN): policy on PDF parse failure: skip (continue) or fail
    # (abort). The value is a free-form str and is not validated here.
    failure_mode: str = Field(
        default="skip",
        description="PDF 解析失败时的处理策略：skip（跳过继续）或 fail（立即中止）。",
    )
 class PipelineResult(BaseModel):
    """Artifact locations and statistics for a completed pipeline run."""
    # All fields are required. English summaries of the descriptions follow.
    # Build stage: artifact root, draft question-bank CSV (evaluation input),
    # and the source-chunk index file used by the online evaluation adapter.
    build_artifact_dir: str = Field(description="题库生成阶段的产物根目录路径。")
    dataset_csv: str = Field(description="生成的草稿题库 CSV 文件路径（评估输入）。")
    source_chunks_jsonl: str = Field(description="文档片段索引文件路径（在线评估 adapter 使用）。")
    # Counts: valid questions generated, and PDFs that failed to parse.
    total_questions: int = Field(description="成功生成的有效题目总数。")
    parse_failures: int = Field(description="文档解析失败的 PDF 数量。")
    # Evaluation stage: run id, artifact root, per-question scores CSV, and
    # the summary Markdown file.
    eval_run_id: str = Field(description="RAGAS 评估运行 ID。")
    eval_output_dir: str = Field(description="RAGAS 评估产物根目录路径。")
    scores_csv: str = Field(description="每道题目逐项评分的 CSV 文件路径。")
    summary_md: str = Field(description="评估结果摘要 Markdown 文件路径。")
 class PipelineJobStatus(BaseModel):
    """State of one end-to-end pipeline job."""
    job_id: str = Field(description="任务唯一标识符。")
    job_name: str = Field(description="任务显示名称。")
    # Documented values: queued | running | completed | failed. The field is a
    # plain str, so the model itself does not enforce them.
    status: str = Field(description="任务状态：queued | running | completed | failed。")
    # Documented values: idle | parsing_documents | generating_questions |
    # evaluating | done. Also a plain str.
    phase: str = Field(default="idle", description="当前执行阶段：idle | parsing_documents | generating_questions | evaluating | done。")
    logs: list[str] = Field(default_factory=list, description="实时日志行列表。")
    # Description (EN): artifact paths and statistics, filled in on completion.
    result: PipelineResult | None = Field(default=None, description="任务完成后填充的产物路径与统计信息。")
    # Description (EN): error message on failure.
    error: str | None = Field(default=None, description="失败时的错误信息。")
    # Descriptions state ISO 8601 UTC; the fields are unvalidated strings.
    created_at: str = Field(default="", description="任务创建时间（ISO 8601 UTC）。")
    finished_at: str = Field(default="", description="任务结束时间（ISO 8601 UTC）。")
 class PipelineJobResponse(BaseModel):
    """Immediate response returned after a pipeline job is queued."""
    # Description (EN): unique job identifier, used to poll status afterwards.
    job_id: str = Field(description="任务唯一标识符，用于后续轮询状态。")
    job_name: str = Field(description="任务显示名称。")
    # Defaults to "queued".
    status: str = Field(default="queued", description="初始状态，通常为 queued。")
 # ---------------------------------------------------------------------------
 # Dify real-time scoring API models
 # ---------------------------------------------------------------------------
 # Metrics that can only be computed when a ground_truth is provided
 _GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
    "context_recall",
    "factual_correctness",
    "semantic_similarity",
    "noise_sensitivity",
 })
 # Metrics that can only be computed when contexts are provided
 _CONTEXT_DEPENDENT_METRICS: frozenset[str] = frozenset({
    "faithfulness",
    "context_recall",
    "context_precision",
    "noise_sensitivity",
 })
 # All valid metric names
 _VALID_METRICS: frozenset[str] = frozenset({
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
    "noise_sensitivity",
    "factual_correctness",
    "semantic_similarity",
 })
 # Metrics used when a score request does not specify any (a list, so order
 # is preserved)
 _DEFAULT_SCORE_METRICS: list[str] = [
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
 ]
 class ScoreRequest(BaseModel):
    """Request body for the real-time single-sample scoring endpoint."""
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "question": "双源CT的时间分辨率是多少?",
                "answer": "双源CT的单扇区时间分辨率为75ms。",
                "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
                "ground_truth": "双源CT单扇区时间分辨率为75ms，需旋转135度。",
                "context_separator": " |||| ",
                "metrics": [
                    "faithfulness",
                    "answer_relevancy",
                    "context_recall",
                    "context_precision",
                ],
                "judge_model": "gpt-5",
                "embedding_model": "text-embedding-3-small",
            }
        }
    )
    question: str = Field(description="问题文本。")
    answer: str = Field(description="待评分的回答。")
    # Description (EN): retrieved contexts as one string, segments joined with
    # context_separator. When missing, metrics that depend on retrieved
    # content are skipped automatically.
    contexts: str | None = Field(
        default=None,
        description="检索上下文字符串，多段之间用 context_separator 拼接。缺失时自动跳过依赖检索内容的指标（faithfulness、context_recall、context_precision、noise_sensitivity）。",
    )
    # Description (EN): optional reference answer. When missing, metrics that
    # need it are skipped automatically.
    ground_truth: str | None = Field(
        default=None,
        description="标准参考答案（可选）。缺失时自动跳过需要它的指标。",
    )
    # Description (EN): segment separator inside `contexts`; the default is
    # four pipes with one space on each side.
    context_separator: str = Field(
        default=" |||| ",
        description="contexts 字段中段落分隔符，默认为四个竖线两侧各一空格。",
    )
    # list(...) gives every request its own copy of the default metric list.
    metrics: list[str] = Field(
        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
        description="需要计算的 RAGAS 指标列表。",
    )
    # Description (EN): judge model name; null falls back to RAGAS_JUDGE_MODEL
    # from .env.
    judge_model: str | None = Field(
        default=None,
        description="Judge LLM 模型名称；为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
    )
    # Description (EN): embedding model name; null falls back to
    # RAGAS_EMBEDDING_MODEL from .env.
    embedding_model: str | None = Field(
        default=None,
        description="Embedding 模型名称；为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
    )
    @field_validator("metrics")
    @classmethod
    def validate_metric_names(cls, value: list[str]) -> list[str]:
        """Reject unknown metric names and an empty metric list.

        Raises:
            ValueError: if any name is not in ``_VALID_METRICS`` or the list
                is empty.
        """
        invalid = [metric_name for metric_name in value if metric_name not in _VALID_METRICS]
        if invalid:
            raise ValueError(
                f"不支持的指标名称：{invalid}。"
                f"合法值：{sorted(_VALID_METRICS)}"
            )
        if not value:
            raise ValueError("metrics 不能为空列表。")
        return value
    def contexts_as_list(self) -> list[str]:
        """Split the contexts string into a list of non-empty fragments.

        Fragments are stripped of surrounding whitespace. Returns an empty
        list when contexts is None or blank.
        """
        if not self.contexts:
            return []
        # Fall back to the default separator if the client sent an empty one;
        # str.split("") would raise ValueError.
        separator = self.context_separator or " |||| "
        return [part.strip() for part in self.contexts.split(separator) if part.strip()]
    def effective_metrics(self) -> list[str]:
        """Return the requested metrics that can be computed from the inputs.

        Ground-truth-dependent metrics are dropped when ``ground_truth`` is
        None or blank. Context-dependent metrics are dropped when
        ``contexts_as_list()`` yields no fragments, which covers None, empty,
        whitespace-only and separator-only input. The requested order is
        preserved.
        """
        # Clients such as workflow tools often send "" for an unset optional
        # variable; a blank reference is as unusable as a missing one.
        has_ground_truth = bool(self.ground_truth and self.ground_truth.strip())
        # Use the same notion of "has contexts" the scorer gets from
        # contexts_as_list(), so no metric runs on an empty context list.
        has_contexts = bool(self.contexts_as_list())
        return [
            metric_name
            for metric_name in self.metrics
            if (has_ground_truth or metric_name not in _GT_DEPENDENT_METRICS)
            and (has_contexts or metric_name not in _CONTEXT_DEPENDENT_METRICS)
        ]
 class ScoreResponse(BaseModel):
    """Response payload for the real-time scoring endpoint."""
    # Description (EN): score per metric (null for NaN or a failed computation).
    scores: dict[str, float | None] = Field(
        description="各指标得分（NaN 或计算失败时为 null）。"
    )
    # Description (EN): equal-weight composite score (mean over non-null
    # metrics only).
    weighted_score: float | None = Field(
        default=None,
        description="等权加权综合得分（仅对非 null 指标求均值）。",
    )
    # Description (EN): server-side scoring time in milliseconds.
    latency_ms: int = Field(description="服务端打分耗时（毫秒）。")
    # Description (EN): metrics skipped because ground_truth or contexts was
    # missing. The request model skips metrics for either reason, so the
    # description names both.
    skipped_metrics: list[str] = Field(
        default_factory=list,
        description="因缺少 ground_truth 或 contexts 而跳过的指标名称列表。",
    )
    # Description (EN): error message when scoring fails (HTTP 200 is still
    # returned, with empty scores).
    error: str | None = Field(
        default=None,
        description="打分异常时的错误信息（HTTP 200 仍返回，scores 为空）。",
    )
 # ---------------------------------------------------------------------------
 # Async score job record models
 # ---------------------------------------------------------------------------
 class AsyncScoreJobResponse(BaseModel):
    """Immediate 202 response after submitting an async score job."""
    # Description (EN): unique job identifier, used to query the result later.
    job_id: str = Field(description="任务唯一标识符，用于后续查询结果。")
    # Defaults to "queued".
    status: str = Field(default="queued", description="初始状态：queued。")
    # Description (EN): Run ID written once scoring completes; the full report
    # can be viewed in the run list. Defaults to None.
    run_id: str | None = Field(
        default=None,
        description="评分完成后写入的 Run ID，可在「运行列表」中查看完整报告。",
    )
 # ---------------------------------------------------------------------------
 # Session async scoring models
 # ---------------------------------------------------------------------------
 class SessionScoreRequest(ScoreRequest):
    """Request body for session-grouped async scoring.
    All calls sharing the same session_id are accumulated into one report.
    Each call adds a new sample row to the session's scores.csv.
    """
    # JSON Schema `examples` must be a list of example *instances*. The
    # {"summary": ..., "value": ...} wrapper belongs to OpenAPI's
    # `openapi_examples`, so the entry below is the request body itself.
    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                # Batch scoring of a Dify conversation.
                {
                    "session_id": "dify-session-001",
                    "question": "单源CT与双源CT在球管配置上有何本质区别？",
                    "answer": "单源CT只有一套球管-探测器系统，双源CT有两套独立的球管-探测器系统。",
                    "contexts": "双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管",
                    "context_separator": " |||| ",
                    "metrics": ["answer_relevancy", "faithfulness"],
                    "judge_model": "gpt-5.5",
                    "embedding_model": "text-embedding-3-small",
                }
            ]
        }
    )
    # Description (EN): unique session identifier. Calls with the same
    # session_id are merged into one report; each call adds one sample row,
    # and metric means and optimization advice are updated after every call.
    session_id: str = Field(
        description=(
            "会话唯一标识符。相同 session_id 的多次调用合并为同一报告，"
            "每次调用新增一个样本行，指标均值和优化建议在每次调用后增量更新。"
        ),
    )
 class SessionScoreJobResponse(BaseModel):
    """Immediate 202 response after submitting a session scoring call."""
    # Description (EN): unique identifier of this particular call.
    job_id: str = Field(description="本次调用的任务唯一标识符。")
    session_id: str = Field(description="会话标识符。")
    # Description (EN): Run ID of the report for this session. Required here,
    # unlike AsyncScoreJobResponse.run_id which may be None.
    run_id: str = Field(description="本 session 对应的报告 Run ID，可在「运行列表」中查看。")
    status: str = Field(default="queued", description="初始状态：queued。")
    # Description (EN): cumulative call count for the session, including this call.
    call_count: int = Field(default=1, description="本 session 当前累计调用次数（包含本次）。")
 class SessionStatus(BaseModel):
    """Aggregate status and metrics for a scoring session."""
    session_id: str = Field(description="会话标识符。")
    # Description (EN): Run ID of the corresponding report directory.
    run_id: str = Field(description="对应报告目录的 Run ID。")
    # Description (EN): cumulative number of calls in this session.
    call_count: int = Field(description="本 session 累计调用次数。")
    # Description (EN): per-metric mean over all accumulated samples.
    metric_means: dict[str, float | None] = Field(
        default_factory=dict, description="所有已累积样本的各指标均值。"
    )
    # Description (EN): time the most recent scoring finished (ISO 8601 UTC).
    latest_finished_at: str = Field(default="", description="最近一次评分完成时间（ISO 8601 UTC）。")
    # AsyncScoreJobStatus is declared further down in this module, so this
    # annotation is a forward reference (the module uses
    # `from __future__ import annotations`).
    # NOTE(review): confirm the model is fully resolved before first use, or
    # move AsyncScoreJobStatus above this class.
    jobs: list[AsyncScoreJobStatus] = Field(
        default_factory=list, description="本 session 所有调用记录，按创建时间排序。"
    )
 class AsyncScoreJobStatus(BaseModel):
    """State of one async score job (queued → running → completed/failed)."""
    job_id: str = Field(description="任务唯一标识符。")
    # Plain str; the documented values are not enforced by the model.
    status: str = Field(description="queued | running | completed | failed")
    # Descriptions state ISO 8601 UTC; the fields are unvalidated strings.
    created_at: str = Field(default="", description="创建时间（ISO 8601 UTC）。")
    finished_at: str = Field(default="", description="完成时间（ISO 8601 UTC）。")
    # Description (EN): Run ID available after completion; the full report can
    # be fetched via GET /api/runs/{run_id}.
    run_id: str | None = Field(
        default=None,
        description="完成后对应的 Run ID，可通过 GET /api/runs/{run_id} 查看完整报告。",
    )
    # Description (EN): snapshot of request parameters (first 80 characters of
    # the question, metrics, judge_model, etc.). Untyped dict, so its keys are
    # not validated here.
    request_summary: dict = Field(
        default_factory=dict,
        description="请求参数快照（question 前80字、metrics、judge_model 等）。",
    )
    scores: dict[str, float | None] = Field(default_factory=dict, description="各指标得分。")
    weighted_score: float | None = Field(default=None, description="加权综合得分。")
    # Description (EN): scoring time in milliseconds; defaults to 0.
    latency_ms: int = Field(default=0, description="评分耗时毫秒。")
    # These two fields carry no schema description.
    skipped_metrics: list[str] = Field(default_factory=list)
    error: str | None = Field(default=None)
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1 @@`
							`{"reason":"idle timeout","timestamp":1781598635371}`
		`@@ -0,0 +1 @@`
							`Completed run: C:\Projects\AIProjects\Siemens-AIPOC\siemens_ragas\outputs\online\siemens-pdf-question-bank`
		`@@ -0,0 +1 @@`
							`"""Dataset loading and normalization for the RAG evaluation platform."""`
		`@@ -0,0 +1 @@`
							`"""API router package for the evaluation console."""`