<!-- siemens_ragas/project-overview.html -->

<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Siemens RAGAS 项目总览</title>
  <style>
    /* Design tokens (CSS custom properties) shared by the rules in this stylesheet. */
    :root {
      /* Brand accent colours; combined in gradients, badges and heading markers. */
      --siemens-teal: #009999;
      --siemens-blue: #0066CC;
      /* NOTE(review): --bg, --surface-soft and --radius are declared here but no rule
         in this stylesheet reads them through var(); confirm before removing. */
      --bg: #f8fafc;
      --surface: #ffffff;
      --surface-soft: #eef6f8;
      /* Primary and secondary text colours. */
      --text: #0f172a;
      --muted: #475569;
      --border: #dbe4ee;
      /* Dark palette for the .diagram, .code-block and .tree panels. */
      --code-bg: #1e293b;
      --code-text: #e2e8f0;
      --shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
      --radius: 18px;
      /* Width of the fixed sidebar; .main uses the same value for its left offset. */
      --sidebar-width: 260px;
      /* Upper bound for the width of .container. */
      --content-max: 1360px;
    }

    /* Size every box by its border edge so padding never widens an element. */
    * { box-sizing: border-box; }
    /* Animate in-page anchor jumps only for users who have not asked the
       operating system to reduce motion. */
    @media (prefers-reduced-motion: no-preference) {
      html { scroll-behavior: smooth; }
    }
    /* Page-wide typography and the soft vertical background gradient. */
    body {
      margin: 0;
      font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
      color: var(--text);
      background: linear-gradient(180deg, #f8fafc 0%, #eff7fb 100%);
    }

    /* Links take the colour of their surroundings and drop the underline. */
    a { color: inherit; text-decoration: none; }
    code, pre { font-family: "Cascadia Code", "SFMono-Regular", Consolas, monospace; }

    .layout {
      display: flex;
      min-height: 100vh;
    }

    .sidebar {
      position: fixed;
      inset: 0 auto 0 0;
      width: var(--sidebar-width);
      background: linear-gradient(180deg, #062630 0%, #083947 42%, #0a4a65 100%);
      color: #e6fffb;
      padding: 26px 18px 24px;
      overflow-y: auto;
      box-shadow: 4px 0 18px rgba(0, 0, 0, 0.12);
      z-index: 20;
    }

    .brand {
      margin-bottom: 22px;
      padding-bottom: 18px;
      border-bottom: 1px solid rgba(255, 255, 255, 0.14);
    }

    .brand h1 {
      margin: 0;
      font-size: 1.22rem;
      line-height: 1.35;
    }

    .brand p {
      margin: 10px 0 0;
      font-size: 0.92rem;
      color: rgba(230, 255, 251, 0.78);
    }

    .sidebar nav {
      display: grid;
      gap: 8px;
    }

    /* Sidebar navigation entries: block-level pills with a transparent border so
       the highlighted state does not shift the layout. */
    .sidebar nav a {
      display: block;
      padding: 10px 12px;
      border-radius: 12px;
      color: rgba(255, 255, 255, 0.82);
      font-size: 0.95rem;
      transition: 0.2s ease;
      border: 1px solid transparent;
    }

    /* Highlight shared by pointer hover and the scroll-spy "active" class. */
    .sidebar nav a:hover,
    .sidebar nav a.active {
      background: rgba(255, 255, 255, 0.12);
      color: #fff;
      border-color: rgba(255, 255, 255, 0.18);
      transform: translateX(2px);
    }

    /* Keyboard focus gets the same highlight plus a high-contrast ring. Kept as a
       separate rule so a browser that does not know :focus-visible still applies
       the hover/active rule above. */
    .sidebar nav a:focus-visible {
      background: rgba(255, 255, 255, 0.12);
      color: #fff;
      border-color: rgba(255, 255, 255, 0.18);
      outline: 2px solid #fff;
      outline-offset: 2px;
    }

    .sidebar .meta {
      margin-top: 22px;
      padding-top: 16px;
      border-top: 1px solid rgba(255, 255, 255, 0.14);
      font-size: 0.84rem;
      color: rgba(230, 255, 251, 0.72);
      line-height: 1.7;
    }

    .main {
      margin-left: var(--sidebar-width);
      width: calc(100% - var(--sidebar-width));
      padding: 28px;
    }

    .container {
      max-width: var(--content-max);
      margin: 0 auto;
    }

    .hero {
      position: relative;
      overflow: hidden;
      background: radial-gradient(circle at top right, rgba(0, 102, 204, 0.22), transparent 34%),
                  linear-gradient(135deg, #ffffff 0%, #effafb 55%, #e6f2ff 100%);
      border: 1px solid rgba(0, 153, 153, 0.15);
      border-radius: 26px;
      padding: 34px;
      box-shadow: var(--shadow);
      margin-bottom: 28px;
    }

    .hero::after {
      content: "";
      position: absolute;
      right: -60px;
      top: -60px;
      width: 220px;
      height: 220px;
      border-radius: 50%;
      background: radial-gradient(circle, rgba(0, 153, 153, 0.18), transparent 70%);
      pointer-events: none;
    }

    .eyebrow {
      display: inline-flex;
      align-items: center;
      gap: 8px;
      padding: 6px 12px;
      border-radius: 999px;
      background: rgba(0, 153, 153, 0.08);
      color: var(--siemens-teal);
      font-weight: 700;
      font-size: 0.86rem;
      margin-bottom: 16px;
    }

    .hero h2 {
      margin: 0;
      font-size: clamp(2rem, 3vw, 3.1rem);
      line-height: 1.12;
      letter-spacing: -0.03em;
      max-width: 900px;
    }

    .hero p {
      max-width: 900px;
      font-size: 1.02rem;
      line-height: 1.8;
      color: var(--muted);
      margin: 16px 0 0;
    }

    .hero-grid {
      display: grid;
      grid-template-columns: 1.5fr 1fr;
      gap: 24px;
      align-items: end;
      margin-top: 24px;
    }

    .hero-stats {
      display: grid;
      grid-template-columns: repeat(4, minmax(0, 1fr));
      gap: 14px;
    }

    .stat {
      background: rgba(255, 255, 255, 0.8);
      border: 1px solid rgba(0, 102, 204, 0.1);
      border-radius: 18px;
      padding: 16px;
      box-shadow: var(--shadow);
    }

    .stat b {
      display: block;
      font-size: 1.55rem;
      color: var(--siemens-blue);
      margin-bottom: 6px;
    }

    .stat span {
      color: var(--muted);
      font-size: 0.92rem;
    }

    .hero-note {
      background: linear-gradient(135deg, rgba(0, 153, 153, 0.12), rgba(0, 102, 204, 0.1));
      border: 1px solid rgba(0, 153, 153, 0.14);
      border-radius: 20px;
      padding: 20px;
      line-height: 1.8;
      color: #103451;
    }

    .hero-note b { color: var(--siemens-blue); }

    section {
      background: var(--surface);
      border: 1px solid var(--border);
      border-radius: 22px;
      padding: 28px;
      box-shadow: var(--shadow);
      margin-bottom: 24px;
      scroll-margin-top: 18px;
    }

    .section-title {
      display: flex;
      align-items: center;
      gap: 14px;
      margin: 0 0 18px;
      font-size: 1.5rem;
      letter-spacing: -0.02em;
    }

    .section-title::before {
      content: "";
      width: 6px;
      height: 28px;
      border-radius: 999px;
      background: linear-gradient(180deg, var(--siemens-teal), var(--siemens-blue));
      flex: 0 0 auto;
    }

    .section-intro {
      margin: 0 0 18px;
      color: var(--muted);
      line-height: 1.85;
    }

    .badges,
    .stack-badges {
      display: flex;
      flex-wrap: wrap;
      gap: 10px;
    }

    .badge {
      display: inline-flex;
      align-items: center;
      gap: 8px;
      padding: 9px 14px;
      border-radius: 999px;
      background: linear-gradient(135deg, rgba(0, 153, 153, 0.12), rgba(0, 102, 204, 0.1));
      color: #0f4060;
      border: 1px solid rgba(0, 153, 153, 0.15);
      font-size: 0.93rem;
      font-weight: 600;
    }

    .grid-2,
    .grid-3,
    .grid-4 {
      display: grid;
      gap: 18px;
    }

    .grid-2 { grid-template-columns: repeat(2, minmax(0, 1fr)); }
    .grid-3 { grid-template-columns: repeat(3, minmax(0, 1fr)); }
    .grid-4 { grid-template-columns: repeat(4, minmax(0, 1fr)); }

    .card {
      background: linear-gradient(180deg, #ffffff 0%, #fbfdff 100%);
      border: 1px solid var(--border);
      border-radius: 18px;
      padding: 18px;
      box-shadow: var(--shadow);
    }

    .card h3,
    .card h4 {
      margin: 0 0 12px;
      font-size: 1.05rem;
    }

    .card p,
    .card li {
      color: var(--muted);
      line-height: 1.8;
      margin: 0;
    }

    .card ul {
      margin: 0;
      padding-left: 20px;
    }

    .card .mini {
      font-size: 0.86rem;
      color: #64748b;
      margin-top: 8px;
    }

    .diagram,
    .code-block,
    .tree {
      background: var(--code-bg);
      color: var(--code-text);
      padding: 20px;
      border-radius: 18px;
      overflow-x: auto;
      border: 1px solid rgba(148, 163, 184, 0.16);
      box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.04);
    }

    .diagram { font-size: 0.95rem; line-height: 1.5; }
    .code-block { line-height: 1.7; }
    .tree { line-height: 1.65; }

    .pill-heading {
      display: inline-flex;
      align-items: center;
      gap: 8px;
      padding: 6px 12px;
      border-radius: 999px;
      background: rgba(0, 102, 204, 0.08);
      color: var(--siemens-blue);
      font-weight: 700;
      margin: 14px 0 10px;
      font-size: 0.9rem;
    }

    /* Rounded, bordered data tables. User agents are not required to honour
       border-radius on a table whose borders are collapsed, so the separated
       borders model with zero spacing is used to keep the rounded outline. */
    table {
      width: 100%;
      border-collapse: separate;
      border-spacing: 0;
      overflow: hidden;
      border-radius: 16px;
      border: 1px solid var(--border);
    }

    /* Cells: top-aligned text with a single rule underneath each row. */
    th,
    td {
      padding: 14px 14px;
      text-align: left;
      vertical-align: top;
      border-bottom: 1px solid var(--border);
      line-height: 1.7;
      font-size: 0.95rem;
    }

    th {
      background: linear-gradient(180deg, #edf8f8, #e8f1ff);
      color: #103451;
      font-size: 0.92rem;
    }

    /* The last row sits directly on the table border, so drop its own rule. */
    tr:last-child td { border-bottom: none; }

    .flow-grid {
      display: grid;
      gap: 16px;
    }

    .flow-card {
      border: 1px solid var(--border);
      border-radius: 18px;
      padding: 18px;
      background: linear-gradient(180deg, #ffffff 0%, #fbfdff 100%);
    }

    .flow-card h3 {
      margin: 0 0 12px;
      font-size: 1.06rem;
    }

    .flow-card pre {
      margin: 0;
      white-space: pre-wrap;
      word-break: break-word;
      line-height: 1.85;
    }

    .metric-card {
      border: 1px solid var(--border);
      border-radius: 18px;
      padding: 18px;
      background: linear-gradient(180deg, #ffffff 0%, #fafdff 100%);
      box-shadow: var(--shadow);
    }

    .metric-card h3 {
      margin: 0 0 10px;
      font-size: 1.02rem;
      color: var(--siemens-blue);
    }

    .metric-card .need-gt {
      display: inline-block;
      padding: 4px 10px;
      border-radius: 999px;
      font-size: 0.82rem;
      font-weight: 700;
      margin-bottom: 12px;
    }

    .need-gt.yes { background: rgba(0, 102, 204, 0.1); color: var(--siemens-blue); }
    .need-gt.no { background: rgba(0, 153, 153, 0.12); color: var(--siemens-teal); }

    .formula {
      background: linear-gradient(180deg, #f3fbfb 0%, #eef5ff 100%);
      border: 1px solid rgba(0, 102, 204, 0.14);
      color: #0f4060;
      padding: 18px;
      border-radius: 18px;
      line-height: 1.9;
    }

    .steps {
      margin: 0;
      padding-left: 18px;
      color: var(--muted);
      line-height: 1.9;
    }

    .muted { color: var(--muted); }
    .small { font-size: 0.9rem; }

    .footer {
      text-align: center;
      color: #64748b;
      font-size: 0.88rem;
      padding: 8px 0 24px;
    }

    /* Sticky header holding the menu toggle. Hidden by default; the
       max-width: 980px media query switches it to flex. The negative margins
       cancel the 28px padding of .main so the bar spans the full width. */
    .mobile-topbar {
      display: none;
      position: sticky;
      top: 0;
      z-index: 25;
      background: rgba(248, 250, 252, 0.92);
      /* Prefixed form first, for Safari releases that predate the unprefixed property. */
      -webkit-backdrop-filter: blur(10px);
      backdrop-filter: blur(10px);
      border-bottom: 1px solid rgba(15, 23, 42, 0.08);
      padding: 12px 16px;
      margin: -28px -28px 18px;
    }

    .mobile-topbar button {
      border: none;
      background: linear-gradient(135deg, var(--siemens-teal), var(--siemens-blue));
      color: #fff;
      border-radius: 12px;
      padding: 10px 14px;
      font-weight: 700;
      cursor: pointer;
    }

    .sidebar-backdrop {
      display: none;
      position: fixed;
      inset: 0;
      background: rgba(15, 23, 42, 0.42);
      z-index: 15;
    }

    @media (max-width: 1180px) {
      .hero-grid,
      .grid-4 {
        grid-template-columns: repeat(2, minmax(0, 1fr));
      }
    }

    /* Narrow layout: the sidebar becomes an off-canvas drawer, the top bar with
       the menu toggle appears, and every grid collapses to a single column. */
    @media (max-width: 980px) {
      .sidebar {
        transform: translateX(-100%);
        /* A translated element stays focusable; hiding it keeps the closed
           drawer out of the tab order and the accessibility tree. The delay lets
           the slide-out animation finish before the drawer disappears. */
        visibility: hidden;
        transition: transform 0.24s ease, visibility 0s linear 0.24s;
        /* Stack above .mobile-topbar (z-index 25) so the open drawer's heading
           is not covered by the sticky bar. */
        z-index: 30;
      }
      .sidebar.open {
        transform: translateX(0);
        visibility: visible;
        transition: transform 0.24s ease, visibility 0s;
      }
      .sidebar-backdrop.show { display: block; }
      .main {
        margin-left: 0;
        width: 100%;
      }
      .mobile-topbar { display: flex; justify-content: space-between; align-items: center; }
      .grid-2,
      .grid-3,
      .grid-4,
      .hero-grid,
      .hero-stats {
        grid-template-columns: 1fr;
      }
    }

    @media (max-width: 640px) {
      .main { padding: 18px; }
      .hero, section { padding: 20px; }
      .mobile-topbar { margin: -18px -18px 16px; }
      th, td { padding: 12px 10px; }
    }
  </style>
</head>
<body>
  <div class="sidebar-backdrop" id="sidebar-backdrop"></div>
  <div class="layout">
    <aside class="sidebar" id="sidebar">
      <div class="brand">
        <h1>Siemens RAGAS<br />项目文档</h1>
        <p>西门子医疗影像 CT 知识库 RAG 评估平台</p>
      </div>
      <!-- In-page section navigation; the label gives the landmark an accessible name. -->
      <nav aria-label="章节导航">
        <a href="#overview">1. 项目概述</a>
        <a href="#architecture">2. 系统架构</a>
        <a href="#modules">3. 核心模块说明</a>
        <a href="#flows">4. 数据流说明</a>
        <a href="#metrics">5. RAGAS 评估指标</a>
        <a href="#apis">6. API 接口文档</a>
        <a href="#weights">7. 指标权重配置</a>
        <a href="#deployment">8. 部署说明</a>
        <a href="#stack">9. 技术栈</a>
        <a href="#structure">10. 目录结构</a>
      </nav>
      <div class="meta">
        <div><b>生成时间</b><br />2026-06-24</div>
        <div style="margin-top:10px;"><b>输出文件</b><br />project-overview.html</div>
        <div style="margin-top:10px;"><b>来源</b><br />README / pyproject / main.py / webmain.py / rag_eval / webapp / scenarios / .env.example</div>
      </div>
    </aside>

    <main class="main">
      <div class="mobile-topbar">
        <strong>Siemens RAGAS</strong>
        <button id="menu-toggle" type="button">目录</button>
      </div>

      <div class="container">
        <header class="hero">
          <div class="eyebrow">Siemens Healthineers · RAG Evaluation Platform</div>
          <h2>Siemens RAGAS RAG 评估平台：面向 CT 知识库的自动化质量评估闭环</h2>
          <p>
            本项目将 <b>PDF 文档解析</b>、<b>题库生成</b>、<b>在线/离线 RAGAS 评测</b>、<b>报告沉淀与 Web 可视化</b>
            统一进一个可复用平台。CLI 与 FastAPI Web 控制台共享同一套 <code>rag_eval</code> 核心引擎，适合批量评估、持续优化与 Dify 实时评分集成。
          </p>
          <div class="hero-grid">
            <div>
              <div class="hero-stats">
                <div class="stat"><b>3</b><span>入口形态<br />CLI / Web / REST API</span></div>
                <div class="stat"><b>7</b><span>RAGAS 指标<br />含 GT 依赖与非依赖</span></div>
                <div class="stat"><b>4</b><span>核心流程<br />Build / Eval / Pipeline / Score</span></div>
                <div class="stat"><b>2</b><span>适配模式<br />HTTP / Python Adapter</span></div>
              </div>
            </div>
            <div class="hero-note">
              <b>核心价值：</b>将 PDF 资料转成可评测题库，再以 Siemens 医疗影像场景为中心完成答题、打分、加权汇总与报告产物沉淀，形成完整质量治理闭环。
            </div>
          </div>
        </header>

        <section id="overview">
          <h2 class="section-title">1. 项目概述</h2>
          <p class="section-intro">
            <b>项目名称：</b>Siemens RAGAS RAG 评估平台。<br />
            <b>目标：</b>为西门子医疗影像 CT 知识库 RAG 系统提供自动化质量评估。<br />
            <b>定位：</b>既能作为离线评测框架，也能作为在线评估控制台与 API 服务，为知识库 QA、Prompt 迭代、检索策略优化提供统一基线。
          </p>

          <div class="grid-2">
            <div class="card">
              <h3>业务闭环</h3>
              <p>PDF解析 → 题库生成 → RAGAS评测 → 报告可视化 → 再迭代。项目不仅覆盖评测本身，还覆盖评测数据源建设与运行产物管理。</p>
            </div>
            <div class="card">
              <h3>运行方式</h3>
              <p><code>main.py</code> 负责 CLI 评估与 dataset build，<code>webmain.py</code> 负责启动 FastAPI 控制台，<code>webapp.server</code> 暴露 REST API 与静态前端。</p>
            </div>
          </div>

          <div class="pill-heading">技术亮点</div>
          <div class="badges">
            <span class="badge">统一 CLI / Web / API 三入口</span>
            <span class="badge">阿里云 DocMind 文档解析</span>
            <span class="badge">OpenAI 兼容模型接入</span>
            <span class="badge">RAGAS 0.4.3 指标流水线</span>
            <span class="badge">在线 / 离线双模式评估</span>
            <span class="badge">Python / HTTP Adapter 扩展机制</span>
            <span class="badge">场景 YAML 驱动</span>
            <span class="badge">Pipeline 后台线程编排</span>
            <span class="badge">Dify 实时单题评分接口</span>
            <span class="badge">metric_weights + doc_weights 加权汇总</span>
            <span class="badge">历史 run 资产沉淀</span>
            <span class="badge">Web 报告聚合与分布分析</span>
          </div>
        </section>

        <section id="architecture">
          <h2 class="section-title">2. 系统架构</h2>
          <p class="section-intro">
            平台采用“多入口 + 单评估核心”的结构。CLI 和 Web 控制台都汇入 <code>rag_eval</code> 核心引擎；API 层只负责任务编排、配置管理与结果查询。
          </p>

          <pre class="diagram">┌─────────────────────────────────────────────────────────┐
│                    siemens_ragas 平台                    │
├─────────────┬───────────────────┬───────────────────────┤
│  CLI 入口   │   Web 控制台      │   REST API            │
│  main.py    │   webmain.py      │   FastAPI             │
├─────────────┴───────────────────┴───────────────────────┤
│                    核心评估引擎 (rag_eval)                │
├──────────────┬──────────────────┬────────────────────────┤
│ dataset_     │   execution/     │   metrics/             │
│ builder/     │   evaluator.py   │   pipeline.py          │
│ (PDF→题库)   │   (评估流程)     │   (RAGAS指标)          │
├──────────────┴──────────────────┴────────────────────────┤
│              外部依赖                                     │
│  阿里云DocMind (PDF解析) │ OpenAI兼容API (LLM/Embedding) │
└─────────────────────────────────────────────────────────┘</pre>

          <div class="grid-3" style="margin-top:18px;">
            <div class="card">
              <h3>CLI 编排</h3>
              <p><code>main.py</code> 通过互斥参数在 <code>--scenario</code> 与 <code>--dataset-build-config</code> 之间分派，分别进入评估流程与题库构建流程。</p>
            </div>
            <div class="card">
              <h3>Web 服务</h3>
              <p><code>webmain.py</code> 负责 uvicorn 启动、日志文件轮转与 host/port 配置；<code>webapp.server</code> 注册 runs、scenarios、evaluations、pipeline、score 等 API。</p>
            </div>
            <div class="card">
              <h3>核心执行器</h3>
              <p><code>rag_eval.execution.runner</code> 负责加载 scenario、构建模型与 adapter、调用 <code>Evaluator</code> 执行并写出标准化产物。</p>
            </div>
          </div>
        </section>

        <section id="modules">
          <h2 class="section-title">3. 核心模块说明</h2>
          <p class="section-intro">以下模块覆盖数据准备、评估执行、Web 管理与 Siemens 业务适配的主要职责边界。</p>

          <table>
            <!-- Column headers; scope="col" associates each header with its column for assistive technology. -->
            <thead>
              <tr>
                <th scope="col">模块</th>
                <th scope="col">路径</th>
                <th scope="col">职责</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td><b>dataset_builder</b></td>
                <td><code>rag_eval/dataset_builder/</code></td>
                <td>PDF解析、source chunk 归一化、LLM 题目生成、草稿题库与构建产物写出。</td>
              </tr>
              <tr>
                <td><b>execution</b></td>
                <td><code>rag_eval/execution/</code></td>
                <td>评估编排、在线/离线模式切换、adapter 调用、RAGAS 打分与结果聚合。</td>
              </tr>
              <tr>
                <td><b>metrics</b></td>
                <td><code>rag_eval/metrics/</code></td>
                <td>RAGAS 指标注册、模型构建、评估管道装配、指标权重与文档权重聚合。</td>
              </tr>
              <tr>
                <td><b>reporting</b></td>
                <td><code>rag_eval/reporting/</code></td>
                <td>运行产物写入、summary 生成、metadata 与 scenario 快照沉淀。</td>
              </tr>
              <tr>
                <td><b>adapters</b></td>
                <td><code>rag_eval/adapters/</code></td>
                <td>HTTP/Python 应用适配器封装，把外部应用结果统一为 <code>answer / contexts / raw_response</code>。</td>
              </tr>
              <tr>
                <td><b>webapp</b></td>
                <td><code>webapp/</code></td>
                <td>FastAPI Web 控制台、OpenAPI 文档、任务后台管理、场景扫描、历史报告查询。</td>
              </tr>
              <tr>
                <td><b>apps</b></td>
                <td><code>apps/siemens_pdf_qa/</code></td>
                <td>西门子 CT 知识库问答适配器，基于 source chunk 证据构造 Prompt 并调用 OpenAI 兼容模型生成答案。</td>
              </tr>
            </tbody>
          </table>

          <div class="grid-3" style="margin-top:18px;">
            <div class="card">
              <h3>settings.py</h3>
              <p>集中读取 <code>.env</code>：OpenAI Key/Base URL、RAGAS Judge/Embedding 模型、并发、阿里云 DocMind、<code>SCORE_API_TOKEN</code> 等。</p>
            </div>
            <div class="card">
              <h3>registry.py</h3>
              <p>定义 7 个受支持指标：faithfulness、answer_relevancy、context_recall、context_precision、noise_sensitivity、factual_correctness、semantic_similarity。</p>
            </div>
            <div class="card">
              <h3>inline_scorer.py</h3>
              <p>为 <code>/api/score</code> 提供模块级缓存评分器，按 <code>(judge_model, embedding_model)</code> 复用 LLM 与 embedding 连接。</p>
            </div>
          </div>
        </section>

        <section id="flows">
          <h2 class="section-title">4. 数据流说明</h2>
          <p class="section-intro">项目围绕四条关键流程展开：题库构建、在线评估、API 全链路 Pipeline 与 Dify 实时评分。</p>

          <div class="flow-grid">
            <div class="flow-card">
              <h3>Flow A: 题库生成流程（Dataset Build）</h3>
              <pre>PDF文件 → 阿里云DocMind解析 → 文档切片(source_chunks)
→ LLM生成题目 → CSV题库文件 → 人工审核</pre>
              <p class="small muted" style="margin-top:12px;">对应入口：<code>main.py --dataset-build-config</code>；核心实现：<code>rag_eval.dataset_builder.runner</code>。</p>
            </div>

            <div class="flow-card">
              <h3>Flow B: RAGAS评估流程（Online Evaluation）</h3>
              <pre>CSV题库 → 规范化样本 → 应用适配器(siemens_pdf_qa)
→ LLM答题 → RAGAS指标计算 → 加权得分 → 报告产物</pre>
              <p class="small muted" style="margin-top:12px;">对应 Siemens 场景：<code>scenarios/online/siemens-pdf-question-bank-online.yaml</code>，由 <code>apps.siemens_pdf_qa.adapter:run</code> 提供答案与证据片段。</p>
            </div>

            <div class="flow-card">
              <h3>Flow C: 全链路 Pipeline（API触发）</h3>
              <pre>POST /api/pipeline/jobs → 后台线程 → Flow A → Flow B → 产物路径</pre>
              <p class="small muted" style="margin-top:12px;">由 <code>webapp.services.pipeline_task_manager</code> 在线程池中串行执行 <code>parsing_documents → generating_questions → evaluating</code> 三阶段，并返回 <code>scores.csv / summary.md / dataset.csv</code> 等路径。</p>
            </div>

            <div class="flow-card">
              <h3>Flow D: Dify实时评分（/api/score）</h3>
              <pre>Dify Agent → POST /api/score → InlineScorer → RAGAS metrics → 得分JSON</pre>
              <p class="small muted" style="margin-top:12px;">当 <code>ground_truth</code> 缺失时，会自动跳过依赖参考答案的指标，并在响应中给出 <code>skipped_metrics</code>。</p>
            </div>
          </div>
        </section>

        <section id="metrics">
          <h2 class="section-title">5. RAGAS 评估指标</h2>
          <p class="section-intro">平台当前支持 7 个指标，既覆盖回答忠实度和相关性，也覆盖对参考答案、噪声片段和语义相似度的衡量。</p>

          <div class="grid-4">
            <div class="metric-card">
              <h3>faithfulness</h3>
              <span class="need-gt no">无需 ground_truth</span>
              <p>回答对检索内容的忠实度，用于防止模型脱离证据“幻觉式”作答。</p>
            </div>
            <div class="metric-card">
              <h3>answer_relevancy</h3>
              <span class="need-gt no">无需 ground_truth</span>
              <p>衡量回答与问题本身的相关性，判断是否真正命中用户所问。</p>
            </div>
            <div class="metric-card">
              <h3>context_precision</h3>
              <span class="need-gt no">无需 ground_truth</span>
              <p>衡量检索片段的精准度，关注上下文中与回答真正相关的比例。</p>
            </div>
            <div class="metric-card">
              <h3>context_recall</h3>
              <span class="need-gt yes">需要 ground_truth</span>
              <p>检索内容对标准答案覆盖程度，反映召回是否足够支撑正确作答。</p>
            </div>
            <div class="metric-card">
              <h3>noise_sensitivity</h3>
              <span class="need-gt yes">需要 ground_truth</span>
              <p>系统对噪声检索片段的鲁棒性，越不受无关片段干扰越好。</p>
            </div>
            <div class="metric-card">
              <h3>factual_correctness</h3>
              <span class="need-gt yes">需要 ground_truth</span>
              <p>与标准答案对齐的事实准确性，代码中作为端到端事实正确度指标。</p>
            </div>
            <div class="metric-card">
              <h3>semantic_similarity</h3>
              <span class="need-gt yes">需要 ground_truth</span>
              <p>回答与标准答案的语义相似度，依赖 embedding，不需要额外 LLM 调用。</p>
            </div>
            <div class="metric-card">
              <h3>指标默认集合</h3>
              <span class="need-gt no">在线评分默认</span>
              <p><code>/api/score</code> 的默认指标集合为 faithfulness、answer_relevancy、context_recall、context_precision。</p>
            </div>
          </div>
        </section>

        <section id="apis">
          <h2 class="section-title">6. API 接口文档</h2>
          <p class="section-intro">Web 层基于 FastAPI 提供任务提交、历史查询、LLM 配置管理与 Dify 实时评分接口。</p>

          <table>
            <!-- Column headers; scope="col" associates each header with its column for assistive technology. -->
            <thead>
              <tr>
                <th scope="col">方法</th>
                <th scope="col">路径</th>
                <th scope="col">说明</th>
              </tr>
            </thead>
            <tbody>
              <tr><td><b>POST</b></td><td><code>/api/pipeline/jobs</code></td><td>提交全链路评估任务，后台线程自动执行 PDF 解析、题库生成和在线评估。</td></tr>
              <tr><td><b>GET</b></td><td><code>/api/pipeline/jobs/{id}</code></td><td>查询指定 Pipeline 任务状态、阶段、日志和产物路径。</td></tr>
              <tr><td><b>POST</b></td><td><code>/api/score</code></td><td>Dify 实时评分接口，接收单条问答并返回各指标得分与综合得分。</td></tr>
              <tr><td><b>POST</b></td><td><code>/api/llm-profiles/probe</code></td><td>临时测试 LLM / Embedding 连通性，无需先保存配置。</td></tr>
              <tr><td><b>POST</b></td><td><code>/api/llm-profiles</code></td><td>创建命名的 LLM 配置档案，可供场景或控制台复用。</td></tr>
              <tr><td><b>POST</b></td><td><code>/api/evaluations</code></td><td>基于已有场景 YAML 启动一次后台评估任务。</td></tr>
              <tr><td><b>GET</b></td><td><code>/api/runs</code></td><td>获取历史运行列表，用于控制台报告页与明细页渲染。</td></tr>
            </tbody>
          </table>

          <div class="grid-2" style="margin-top:18px;">
            <div class="card">
              <h3>/api/score 请求示例</h3>
              <pre class="code-block">POST /api/score
Authorization: Bearer &lt;token&gt;
Content-Type: application/json

{
  "question": "双源CT的时间分辨率是多少?",
  "answer": "双源CT的单扇区时间分辨率为75ms。",
  "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
  "ground_truth": "双源CT单扇区时间分辨率为75ms，需旋转135度。",
  "context_separator": " |||| ",
  "metrics": [
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision"
  ],
  "judge_model": "gpt-5",
  "embedding_model": "text-embedding-3-small"
}</pre>
            </div>
            <div class="card">
              <h3>/api/score 响应示例</h3>
              <pre class="code-block">{
  "scores": {
    "faithfulness": 0.875,
    "answer_relevancy": 0.92,
    "context_recall": 0.81,
    "context_precision": 0.85
  },
  "weighted_score": 0.8638,
  "latency_ms": 3420,
  "skipped_metrics": [],
  "error": null
}</pre>
            </div>
          </div>

          <div class="grid-3" style="margin-top:18px;">
            <div class="card">
              <h3>Pipeline API</h3>
              <p>返回 <code>job_id</code> 后即可轮询。阶段枚举包括 <code>queued</code>、<code>running</code>、<code>parsing_documents</code>、<code>generating_questions</code>、<code>evaluating</code> 与 <code>done</code>。</p>
            </div>
            <div class="card">
              <h3>LLM Profiles</h3>
              <p>支持保存 base URL、API Key、model、timeout，并通过 <code>/apply</code> 将配置写回场景 YAML，同时可补充 metric/doc 权重。</p>
            </div>
            <div class="card">
              <h3>Runs API</h3>
              <p>历史 run 会读取 <code>summary.md</code>、<code>scores.csv</code> 与 <code>scenario.snapshot.yaml</code>，聚合指标均值、分布和最低分样本。</p>
            </div>
          </div>
        </section>

        <section id="weights">
          <h2 class="section-title">7. 指标权重配置</h2>
          <p class="section-intro">平台支持两级权重：<code>metric_weights</code> 控制不同指标的重要性，<code>doc_weights</code> 控制不同文档对总体分数的影响。</p>

          <pre class="code-block">metric_weights:
  faithfulness: 0.35
  context_recall: 0.25
  context_precision: 0.20
  answer_relevancy: 0.20

doc_weights:
  "322_双源CT.pdf": 2.0</pre>

          <div class="formula" style="margin-top:18px;">
            <b>weighted_score 计算逻辑：</b><br />
            1. <b>单样本综合分</b> = Σ(有效指标分 × 指标权重) / Σ(有效指标权重)<br />
            2. <b>总体综合分</b> = Σ(单样本综合分 × 文档权重) / Σ(文档权重)<br />
            3. 当权重未配置时，代码会自动退化为默认 1.0，即普通平均。<br />
            4. <code>doc_weights</code> 不仅影响总体综合分，也会影响控制台中按文档聚合后的指标均值。
          </div>

          <div class="grid-2" style="margin-top:18px;">
            <div class="card">
              <h3>代码依据</h3>
              <p><code>rag_eval.metrics.weights.compute_weighted_score()</code> 负责单样本指标加权；<code>compute_overall_weighted_score_mean()</code> 负责跨样本文档加权。</p>
            </div>
            <div class="card">
              <h3>配置入口</h3>
              <p>场景 YAML 可直接定义；Web 控制台也可通过 <code>/api/llm-profiles/apply</code> 将权重补丁写回场景文件。</p>
            </div>
          </div>
        </section>

        <section id="deployment">
          <h2 class="section-title">8. 部署说明</h2>
          <p class="section-intro">仓库自带 <code>deploy.sh</code>，适合 Linux 一键部署 Web 控制台与相关依赖。</p>

          <div class="grid-2">
            <div class="card">
              <h3>Linux 部署步骤</h3>
              <ol class="steps">
                <li>准备 Python 3.12+ 环境，执行 <code>bash deploy.sh</code>。</li>
                <li>脚本会自动创建 <code>.venv</code>、安装 <code>pyproject.toml</code> 依赖，并补装 <code>fastapi / uvicorn / httpx</code>。</li>
                <li>若 <code>.env</code> 不存在，将从 <code>.env.example</code> 复制一份模板。</li>
                <li>脚本初始化 <code>configs / logs / outputs / datasets</code> 目录，并尝试生成 demo 数据。</li>
                <li>最后使用 <code>webmain.py</code> 后台启动服务，默认端口 8800，冲突时回退到 8801。</li>
              </ol>
            </div>
            <div class="card">
              <h3>常用命令（Windows 路径示例）</h3>
              <pre class="code-block"># 依赖安装
uv sync

# CLI 运行在线/离线评估
.\.venv\Scripts\python.exe main.py --scenario scenarios\online\siemens-pdf-question-bank-online.yaml

# CLI 运行题库生成
.\.venv\Scripts\python.exe main.py --dataset-build-config scenarios\siemens_build\siemens-pdf-build.yaml

# 启动 Web 控制台
.\.venv\Scripts\python.exe webmain.py --host 127.0.0.1 --port 8800</pre>
            </div>
          </div>

          <div class="pill-heading">关键 .env 配置</div>
          <table>
            <!-- Column headers; scope="col" associates each header with its column for assistive technology. -->
            <thead>
              <tr>
                <th scope="col">变量</th>
                <th scope="col">用途</th>
                <th scope="col">示例 / 默认</th>
              </tr>
            </thead>
            <tbody>
              <tr><td><code>OPENAI_API_KEY</code></td><td>OpenAI 兼容接口凭据</td><td><code>your-api-key</code></td></tr>
              <tr><td><code>OPENAI_BASE_URL</code></td><td>统一 LLM / Embedding 网关地址</td><td><code>http://6.86.80.4:30080/v1</code></td></tr>
              <tr><td><code>RAGAS_JUDGE_MODEL</code></td><td>RAGAS Judge 默认模型</td><td><code>gpt-5</code></td></tr>
              <tr><td><code>RAGAS_EMBEDDING_MODEL</code></td><td>Embedding 默认模型</td><td><code>text-embedding-3-small</code></td></tr>
              <tr><td><code>ALIBABA_ACCESS_KEY_ID</code> / <code>ALIBABA_ACCESS_KEY_SECRET</code></td><td>阿里云 DocMind 凭据</td><td>dataset build 必填</td></tr>
              <tr><td><code>ALIBABA_ENDPOINT</code></td><td>DocMind 服务域名</td><td><code>docmind-api.cn-hangzhou.aliyuncs.com</code></td></tr>
              <tr><td><code>DATASET_GENERATOR_MODEL</code></td><td>题库生成默认模型</td><td><code>qwen3.6-plus</code></td></tr>
              <tr><td><code>SCORE_API_TOKEN</code></td><td><code>/api/score</code> Bearer 鉴权令牌</td><td>留空则不鉴权</td></tr>
              <tr><td><code>RAGAS_METRIC_TIMEOUT_SECONDS</code></td><td>RAGAS 指标计算超时</td><td><code>300</code>（7指标建议值）</td></tr>
            </tbody>
          </table>
        </section>

        <section id="stack">
          <h2 class="section-title">9. 技术栈</h2>
          <p class="section-intro">从依赖声明、Web 服务实现与测试代码来看，项目的技术栈如下。</p>

          <div class="stack-badges">
            <span class="badge"><b>后端</b> Python 3.12 · FastAPI · RAGAS 0.4.3 · Pydantic v2 · uvicorn</span>
            <span class="badge"><b>AI / ML</b> OpenAI SDK · LangChain · ragas · instructor 风格结构</span>
            <span class="badge"><b>文档解析</b> 阿里云 DocMind</span>
            <span class="badge"><b>前端</b> Vanilla JS · Chart.js 风格报告页</span>
            <span class="badge"><b>测试</b> pytest · FastAPI TestClient</span>
            <span class="badge"><b>工具链</b> uv · pyproject.toml · YAML 场景驱动</span>
          </div>

          <div class="grid-3" style="margin-top:18px;">
            <div class="card">
              <h3>依赖声明</h3>
              <p><code>pyproject.toml</code> 中列出了 <code>ragas==0.4.3</code>、<code>langchain-openai</code>、<code>datasets</code>、<code>pydantic-settings</code>、阿里云 DocMind SDK 等核心依赖。</p>
            </div>
            <div class="card">
              <h3>Web 生态</h3>
              <p>控制台基于 FastAPI + 静态页面，服务器通过 <code>webmain.py</code> 配置日志和 uvicorn，前端报告页结合图表与表格展示历史 run。</p>
            </div>
            <div class="card">
              <h3>测试现状</h3>
              <p>仓库同时存在 <code>pytest</code> 与 <code>fastapi.testclient.TestClient</code> 用例，涵盖 Pipeline、权重聚合、实时评分与 LLM 配置接口。</p>
            </div>
          </div>
        </section>

        <section id="structure">
          <h2 class="section-title">10. 目录结构</h2>
          <p class="section-intro">以下树状图概括了项目中最关键的源码、配置、数据、输出与测试位置。</p>

          <pre class="tree">siemens_ragas/
├── README.md
├── pyproject.toml
├── main.py
├── webmain.py
├── deploy.sh
├── .env.example
├── apps/
│   ├── pdf_question_bank/
│   ├── sample_python/
│   └── siemens_pdf_qa/
├── datasets/
│   ├── raw/
│   └── normalized/
├── docs/
├── outputs/
├── scenarios/
│   ├── online/
│   ├── offline/
│   └── siemens_build/
├── tests/
│   ├── webapp/
│   ├── test_pipeline.py
│   ├── test_weights.py
│   └── test_webapp_report_builder.py
├── rag_eval/
│   ├── adapters/
│   ├── advisor/
│   ├── config/
│   ├── datasets/
│   ├── dataset_builder/
│   │   ├── generator/
│   │   └── parser/
│   ├── execution/
│   ├── metrics/
│   ├── reporting/
│   └── shared/
└── webapp/
    ├── api/
    ├── services/
    └── static/
        ├── css/
        └── js/</pre>

          <div class="grid-2" style="margin-top:18px;">
            <div class="card">
              <h3>源码主路径</h3>
              <p><code>rag_eval/</code> 是平台核心，<code>webapp/</code> 负责服务端 API 与控制台，<code>apps/</code> 放置面向不同业务应用的 adapter。</p>
            </div>
            <div class="card">
              <h3>配置与产物</h3>
              <p><code>scenarios/</code> 维护 YAML 场景，<code>datasets/</code> 存放原始/规范化数据，<code>outputs/</code> 产出评测运行结果与 dataset build 工件。</p>
            </div>
          </div>
        </section>

        <div class="footer">
          Generated for <b>siemens_ragas</b> · Self-contained HTML overview · Siemens teal/blue documentation theme
        </div>
      </div>
    </main>
  </div>

  <script>
    // Behaviour for the documentation page: the off-canvas sidebar drawer used on
    // narrow screens, and a scroll-spy that highlights the link of the section
    // currently being read.
    const sidebar = document.getElementById('sidebar');
    const menuToggle = document.getElementById('menu-toggle');
    const backdrop = document.getElementById('sidebar-backdrop');
    const navLinks = Array.from(document.querySelectorAll('.sidebar nav a'));
    const sections = Array.from(document.querySelectorAll('section[id]'));

    // Reports whether the drawer currently carries the "open" class.
    function isSidebarOpen() {
      return Boolean(sidebar) && sidebar.classList.contains('open');
    }

    // Mirrors the drawer state onto the toggle button for assistive technology.
    function syncToggleState(isOpen) {
      if (menuToggle) menuToggle.setAttribute('aria-expanded', String(isOpen));
    }

    // Hides the drawer and its backdrop. Safe to call when either element is missing.
    function closeSidebar() {
      if (sidebar) sidebar.classList.remove('open');
      if (backdrop) backdrop.classList.remove('show');
      syncToggleState(false);
    }

    // Shows the drawer and its backdrop. Safe to call when either element is missing.
    function openSidebar() {
      if (sidebar) sidebar.classList.add('open');
      if (backdrop) backdrop.classList.add('show');
      syncToggleState(true);
    }

    if (menuToggle) {
      // Announce which element the button controls and its initial (closed) state.
      menuToggle.setAttribute('aria-controls', 'sidebar');
      syncToggleState(false);
      menuToggle.addEventListener('click', () => {
        if (isSidebarOpen()) {
          closeSidebar();
        } else {
          openSidebar();
        }
      });
    }

    if (backdrop) {
      backdrop.addEventListener('click', closeSidebar);
    }

    // Escape closes an open drawer and hands focus back to the toggle button.
    document.addEventListener('keydown', (event) => {
      if (event.key !== 'Escape' || !isSidebarOpen()) return;
      closeSidebar();
      if (menuToggle) menuToggle.focus();
    });

    // On the narrow layout (same 980px breakpoint as the stylesheet) a link click
    // also dismisses the drawer so the target section is visible.
    navLinks.forEach(link => {
      link.addEventListener('click', () => {
        if (window.innerWidth <= 980) closeSidebar();
      });
    });

    // Marks the link whose href is "#<id>" as active and clears all the others.
    function setActiveLink(id) {
      navLinks.forEach(link => {
        const active = link.getAttribute('href') === `#${id}`;
        link.classList.toggle('active', active);
        if (active) {
          link.setAttribute('aria-current', 'location');
        } else {
          link.removeAttribute('aria-current');
        }
      });
    }

    if ('IntersectionObserver' in window) {
      const observer = new IntersectionObserver((entries) => {
        entries.forEach(entry => {
          if (!entry.isIntersecting) return;
          setActiveLink(entry.target.getAttribute('id'));
        });
      }, {
        // Only a band between 20% and 40% of the viewport height counts as "in view".
        rootMargin: '-20% 0px -60% 0px',
        // The band is far shorter than a long section, so a non-zero visibility
        // ratio could never be reached by tall sections; any overlap is enough.
        threshold: 0
      });

      sections.forEach(section => observer.observe(section));
    }
  </script>
</body>
</html>