Files
siemens_ragas/project-overview.html
wangwei 630b70cc2a docs: add project-overview.html — full project documentation
Covers: overview, architecture, modules, data flows (4 flows),
RAGAS metrics (7), API reference, weight config, deployment,
tech stack, directory structure. Self-contained HTML with
Siemens teal theme, sidebar scrollspy, responsive layout.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-24 10:17:08 +08:00

1102 lines
41 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Siemens RAGAS 项目总览</title>
<style>
:root {
--siemens-teal: #009999;
--siemens-blue: #0066CC;
--bg: #f8fafc;
--surface: #ffffff;
--surface-soft: #eef6f8;
--text: #0f172a;
--muted: #475569;
--border: #dbe4ee;
--code-bg: #1e293b;
--code-text: #e2e8f0;
--shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
--radius: 18px;
--sidebar-width: 260px;
--content-max: 1360px;
}
* { box-sizing: border-box; }
/* Smooth-scroll in-page anchor jumps only when the user has not asked the
   OS/browser to reduce motion; otherwise anchors jump instantly. */
@media (prefers-reduced-motion: no-preference) {
  html { scroll-behavior: smooth; }
}
body {
margin: 0;
font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
color: var(--text);
background: linear-gradient(180deg, #f8fafc 0%, #eff7fb 100%);
}
a { color: inherit; text-decoration: none; }
code, pre { font-family: "Cascadia Code", "SFMono-Regular", Consolas, monospace; }
.layout {
display: flex;
min-height: 100vh;
}
.sidebar {
position: fixed;
inset: 0 auto 0 0;
width: var(--sidebar-width);
background: linear-gradient(180deg, #062630 0%, #083947 42%, #0a4a65 100%);
color: #e6fffb;
padding: 26px 18px 24px;
overflow-y: auto;
box-shadow: 4px 0 18px rgba(0, 0, 0, 0.12);
z-index: 20;
}
.brand {
margin-bottom: 22px;
padding-bottom: 18px;
border-bottom: 1px solid rgba(255, 255, 255, 0.14);
}
.brand h1 {
margin: 0;
font-size: 1.22rem;
line-height: 1.35;
}
.brand p {
margin: 10px 0 0;
font-size: 0.92rem;
color: rgba(230, 255, 251, 0.78);
}
.sidebar nav {
display: grid;
gap: 8px;
}
.sidebar nav a {
display: block;
padding: 10px 12px;
border-radius: 12px;
color: rgba(255, 255, 255, 0.82);
font-size: 0.95rem;
transition: 0.2s ease;
border: 1px solid transparent;
}
.sidebar nav a:hover,
.sidebar nav a.active {
background: rgba(255, 255, 255, 0.12);
color: #fff;
border-color: rgba(255, 255, 255, 0.18);
transform: translateX(2px);
}
.sidebar .meta {
margin-top: 22px;
padding-top: 16px;
border-top: 1px solid rgba(255, 255, 255, 0.14);
font-size: 0.84rem;
color: rgba(230, 255, 251, 0.72);
line-height: 1.7;
}
.main {
margin-left: var(--sidebar-width);
width: calc(100% - var(--sidebar-width));
padding: 28px;
}
.container {
max-width: var(--content-max);
margin: 0 auto;
}
.hero {
position: relative;
overflow: hidden;
background: radial-gradient(circle at top right, rgba(0, 102, 204, 0.22), transparent 34%),
linear-gradient(135deg, #ffffff 0%, #effafb 55%, #e6f2ff 100%);
border: 1px solid rgba(0, 153, 153, 0.15);
border-radius: 26px;
padding: 34px;
box-shadow: var(--shadow);
margin-bottom: 28px;
}
.hero::after {
content: "";
position: absolute;
right: -60px;
top: -60px;
width: 220px;
height: 220px;
border-radius: 50%;
background: radial-gradient(circle, rgba(0, 153, 153, 0.18), transparent 70%);
pointer-events: none;
}
.eyebrow {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 6px 12px;
border-radius: 999px;
background: rgba(0, 153, 153, 0.08);
color: var(--siemens-teal);
font-weight: 700;
font-size: 0.86rem;
margin-bottom: 16px;
}
.hero h2 {
margin: 0;
font-size: clamp(2rem, 3vw, 3.1rem);
line-height: 1.12;
letter-spacing: -0.03em;
max-width: 900px;
}
.hero p {
max-width: 900px;
font-size: 1.02rem;
line-height: 1.8;
color: var(--muted);
margin: 16px 0 0;
}
.hero-grid {
display: grid;
grid-template-columns: 1.5fr 1fr;
gap: 24px;
align-items: end;
margin-top: 24px;
}
.hero-stats {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: 14px;
}
.stat {
background: rgba(255, 255, 255, 0.8);
border: 1px solid rgba(0, 102, 204, 0.1);
border-radius: 18px;
padding: 16px;
box-shadow: var(--shadow);
}
.stat b {
display: block;
font-size: 1.55rem;
color: var(--siemens-blue);
margin-bottom: 6px;
}
.stat span {
color: var(--muted);
font-size: 0.92rem;
}
.hero-note {
background: linear-gradient(135deg, rgba(0, 153, 153, 0.12), rgba(0, 102, 204, 0.1));
border: 1px solid rgba(0, 153, 153, 0.14);
border-radius: 20px;
padding: 20px;
line-height: 1.8;
color: #103451;
}
.hero-note b { color: var(--siemens-blue); }
section {
background: var(--surface);
border: 1px solid var(--border);
border-radius: 22px;
padding: 28px;
box-shadow: var(--shadow);
margin-bottom: 24px;
scroll-margin-top: 18px;
}
.section-title {
display: flex;
align-items: center;
gap: 14px;
margin: 0 0 18px;
font-size: 1.5rem;
letter-spacing: -0.02em;
}
.section-title::before {
content: "";
width: 6px;
height: 28px;
border-radius: 999px;
background: linear-gradient(180deg, var(--siemens-teal), var(--siemens-blue));
flex: 0 0 auto;
}
.section-intro {
margin: 0 0 18px;
color: var(--muted);
line-height: 1.85;
}
.badges,
.stack-badges {
display: flex;
flex-wrap: wrap;
gap: 10px;
}
.badge {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 9px 14px;
border-radius: 999px;
background: linear-gradient(135deg, rgba(0, 153, 153, 0.12), rgba(0, 102, 204, 0.1));
color: #0f4060;
border: 1px solid rgba(0, 153, 153, 0.15);
font-size: 0.93rem;
font-weight: 600;
}
.grid-2,
.grid-3,
.grid-4 {
display: grid;
gap: 18px;
}
.grid-2 { grid-template-columns: repeat(2, minmax(0, 1fr)); }
.grid-3 { grid-template-columns: repeat(3, minmax(0, 1fr)); }
.grid-4 { grid-template-columns: repeat(4, minmax(0, 1fr)); }
.card {
background: linear-gradient(180deg, #ffffff 0%, #fbfdff 100%);
border: 1px solid var(--border);
border-radius: 18px;
padding: 18px;
box-shadow: var(--shadow);
}
.card h3,
.card h4 {
margin: 0 0 12px;
font-size: 1.05rem;
}
.card p,
.card li {
color: var(--muted);
line-height: 1.8;
margin: 0;
}
.card ul {
margin: 0;
padding-left: 20px;
}
.card .mini {
font-size: 0.86rem;
color: #64748b;
margin-top: 8px;
}
.diagram,
.code-block,
.tree {
background: var(--code-bg);
color: var(--code-text);
padding: 20px;
border-radius: 18px;
overflow-x: auto;
border: 1px solid rgba(148, 163, 184, 0.16);
box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.04);
}
.diagram { font-size: 0.95rem; line-height: 1.5; }
.code-block { line-height: 1.7; }
.tree { line-height: 1.65; }
.pill-heading {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 6px 12px;
border-radius: 999px;
background: rgba(0, 102, 204, 0.08);
color: var(--siemens-blue);
font-weight: 700;
margin: 14px 0 10px;
font-size: 0.9rem;
}
/* Data tables: full-width with a rounded outer frame.
   border-radius has no effect on a table whose borders are collapsed, so the
   separated-borders model is used with zero spacing. Cells only draw a bottom
   border, so the grid looks the same while the rounded corners now render. */
table {
  width: 100%;
  border-collapse: separate;
  border-spacing: 0;
  overflow: hidden; /* clip the header gradient to the rounded corners */
  border-radius: 16px;
  border: 1px solid var(--border);
}
th,
td {
padding: 14px 14px;
text-align: left;
vertical-align: top;
border-bottom: 1px solid var(--border);
line-height: 1.7;
font-size: 0.95rem;
}
th {
background: linear-gradient(180deg, #edf8f8, #e8f1ff);
color: #103451;
font-size: 0.92rem;
}
tr:last-child td { border-bottom: none; }
.flow-grid {
display: grid;
gap: 16px;
}
.flow-card {
border: 1px solid var(--border);
border-radius: 18px;
padding: 18px;
background: linear-gradient(180deg, #ffffff 0%, #fbfdff 100%);
}
.flow-card h3 {
margin: 0 0 12px;
font-size: 1.06rem;
}
.flow-card pre {
margin: 0;
white-space: pre-wrap;
word-break: break-word;
line-height: 1.85;
}
.metric-card {
border: 1px solid var(--border);
border-radius: 18px;
padding: 18px;
background: linear-gradient(180deg, #ffffff 0%, #fafdff 100%);
box-shadow: var(--shadow);
}
.metric-card h3 {
margin: 0 0 10px;
font-size: 1.02rem;
color: var(--siemens-blue);
}
.metric-card .need-gt {
display: inline-block;
padding: 4px 10px;
border-radius: 999px;
font-size: 0.82rem;
font-weight: 700;
margin-bottom: 12px;
}
.need-gt.yes { background: rgba(0, 102, 204, 0.1); color: var(--siemens-blue); }
.need-gt.no { background: rgba(0, 153, 153, 0.12); color: var(--siemens-teal); }
.formula {
background: linear-gradient(180deg, #f3fbfb 0%, #eef5ff 100%);
border: 1px solid rgba(0, 102, 204, 0.14);
color: #0f4060;
padding: 18px;
border-radius: 18px;
line-height: 1.9;
}
.steps {
margin: 0;
padding-left: 18px;
color: var(--muted);
line-height: 1.9;
}
.muted { color: var(--muted); }
.small { font-size: 0.9rem; }
.footer {
text-align: center;
color: #64748b;
font-size: 0.88rem;
padding: 8px 0 24px;
}
.mobile-topbar {
display: none;
position: sticky;
top: 0;
z-index: 25;
background: rgba(248, 250, 252, 0.92);
backdrop-filter: blur(10px);
border-bottom: 1px solid rgba(15, 23, 42, 0.08);
padding: 12px 16px;
margin: -28px -28px 18px;
}
.mobile-topbar button {
border: none;
background: linear-gradient(135deg, var(--siemens-teal), var(--siemens-blue));
color: #fff;
border-radius: 12px;
padding: 10px 14px;
font-weight: 700;
cursor: pointer;
}
.sidebar-backdrop {
display: none;
position: fixed;
inset: 0;
background: rgba(15, 23, 42, 0.42);
z-index: 15;
}
@media (max-width: 1180px) {
.hero-grid,
.grid-4 {
grid-template-columns: repeat(2, minmax(0, 1fr));
}
}
/* Narrow viewports: the fixed sidebar becomes an off-canvas drawer that is
   toggled with the .open class, and every multi-column grid collapses to one
   column. */
@media (max-width: 980px) {
  .sidebar {
    transform: translateX(-100%);
    /* Translating alone leaves the links focusable and exposed to assistive
       technology while off-screen; visibility removes them. The delay lets
       the slide-out animation finish before the drawer is hidden. */
    visibility: hidden;
    transition: transform 0.24s ease, visibility 0s linear 0.24s;
  }
  .sidebar.open {
    transform: translateX(0);
    visibility: visible;
    /* Become visible immediately so the slide-in animation can be seen. */
    transition: transform 0.24s ease, visibility 0s linear 0s;
  }
  .sidebar-backdrop.show { display: block; }
  .main {
    margin-left: 0;
    width: 100%;
  }
  .mobile-topbar { display: flex; justify-content: space-between; align-items: center; }
  .grid-2,
  .grid-3,
  .grid-4,
  .hero-grid,
  .hero-stats {
    grid-template-columns: 1fr;
  }
}
@media (max-width: 640px) {
.main { padding: 18px; }
.hero, section { padding: 20px; }
.mobile-topbar { margin: -18px -18px 16px; }
th, td { padding: 12px 10px; }
}
</style>
</head>
<body>
<div class="sidebar-backdrop" id="sidebar-backdrop"></div>
<div class="layout">
<aside class="sidebar" id="sidebar">
<div class="brand">
<h1>Siemens RAGAS<br />项目文档</h1>
<p>西门子医疗影像 CT 知识库 RAG 评估平台</p>
</div>
<nav>
<a href="#overview">1. 项目概述</a>
<a href="#architecture">2. 系统架构</a>
<a href="#modules">3. 核心模块说明</a>
<a href="#flows">4. 数据流说明</a>
<a href="#metrics">5. RAGAS 评估指标</a>
<a href="#apis">6. API 接口文档</a>
<a href="#weights">7. 指标权重配置</a>
<a href="#deployment">8. 部署说明</a>
<a href="#stack">9. 技术栈</a>
<a href="#structure">10. 目录结构</a>
</nav>
<div class="meta">
<div><b>生成时间</b><br />2026-06-24</div>
<div style="margin-top:10px;"><b>输出文件</b><br />project-overview.html</div>
<div style="margin-top:10px;"><b>来源</b><br />README / pyproject / main.py / webmain.py / rag_eval / webapp / scenarios / .env.example</div>
</div>
</aside>
<main class="main">
<div class="mobile-topbar">
<strong>Siemens RAGAS</strong>
<button id="menu-toggle" type="button">目录</button>
</div>
<div class="container">
<header class="hero">
<div class="eyebrow">Siemens Healthineers · RAG Evaluation Platform</div>
<h2>Siemens RAGAS RAG 评估平台:面向 CT 知识库的自动化质量评估闭环</h2>
<p>
本项目将 <b>PDF 文档解析</b>、<b>题库生成</b>、<b>在线/离线 RAGAS 评测</b>、<b>报告沉淀与 Web 可视化</b>
统一进一个可复用平台。CLI 与 FastAPI Web 控制台共享同一套 <code>rag_eval</code> 核心引擎,适合批量评估、持续优化与 Dify 实时评分集成。
</p>
<div class="hero-grid">
<div>
<div class="hero-stats">
<div class="stat"><b>3</b><span>入口形态<br />CLI / Web / REST API</span></div>
<div class="stat"><b>7</b><span>RAGAS 指标<br />含 GT 依赖与非依赖</span></div>
<div class="stat"><b>4</b><span>核心流程<br />Build / Eval / Pipeline / Score</span></div>
<div class="stat"><b>2</b><span>适配模式<br />HTTP / Python Adapter</span></div>
</div>
</div>
<div class="hero-note">
<b>核心价值:</b>将 PDF 资料转成可评测题库,再以 Siemens 医疗影像场景为中心完成答题、打分、加权汇总与报告产物沉淀,形成完整质量治理闭环。
</div>
</div>
</header>
<section id="overview">
<h2 class="section-title">1. 项目概述</h2>
<p class="section-intro">
<b>项目名称:</b>Siemens RAGAS RAG 评估平台。<br />
<b>目标:</b>为西门子医疗影像 CT 知识库 RAG 系统提供自动化质量评估。<br />
<b>定位:</b>既能作为离线评测框架,也能作为在线评估控制台与 API 服务,为知识库 QA、Prompt 迭代、检索策略优化提供统一基线。
</p>
<div class="grid-2">
<div class="card">
<h3>业务闭环</h3>
<p>PDF解析 → 题库生成 → RAGAS评测 → 报告可视化 → 再迭代。项目不仅覆盖评测本身,还覆盖评测数据源建设与运行产物管理。</p>
</div>
<div class="card">
<h3>运行方式</h3>
<p><code>main.py</code> 负责 CLI 评估与 dataset build,<code>webmain.py</code> 负责启动 FastAPI 控制台,<code>webapp.server</code> 暴露 REST API 与静态前端。</p>
</div>
</div>
<div class="pill-heading">技术亮点</div>
<div class="badges">
<span class="badge">统一 CLI / Web / API 三入口</span>
<span class="badge">阿里云 DocMind 文档解析</span>
<span class="badge">OpenAI 兼容模型接入</span>
<span class="badge">RAGAS 0.4.3 指标流水线</span>
<span class="badge">在线 / 离线双模式评估</span>
<span class="badge">Python / HTTP Adapter 扩展机制</span>
<span class="badge">场景 YAML 驱动</span>
<span class="badge">Pipeline 后台线程编排</span>
<span class="badge">Dify 实时单题评分接口</span>
<span class="badge">metric_weights + doc_weights 加权汇总</span>
<span class="badge">历史 run 资产沉淀</span>
<span class="badge">Web 报告聚合与分布分析</span>
</div>
</section>
<section id="architecture">
<h2 class="section-title">2. 系统架构</h2>
<p class="section-intro">
平台采用“多入口 + 单评估核心”的结构。CLI 和 Web 控制台都汇入 <code>rag_eval</code> 核心引擎,API 层只负责任务编排、配置管理与结果查询。
</p>
<pre class="diagram">┌─────────────────────────────────────────────────────────┐
│ siemens_ragas 平台 │
├─────────────┬───────────────────┬───────────────────────┤
│ CLI 入口 │ Web 控制台 │ REST API │
│ main.py │ webmain.py │ FastAPI │
├─────────────┴───────────────────┴───────────────────────┤
│ 核心评估引擎 (rag_eval) │
├──────────────┬──────────────────┬────────────────────────┤
│ dataset_ │ execution/ │ metrics/ │
│ builder/ │ evaluator.py │ pipeline.py │
│ (PDF→题库) │ (评估流程) │ (RAGAS指标) │
├──────────────┴──────────────────┴────────────────────────┤
│ 外部依赖 │
│ 阿里云DocMind (PDF解析) │ OpenAI兼容API (LLM/Embedding) │
└─────────────────────────────────────────────────────────┘</pre>
<div class="grid-3" style="margin-top:18px;">
<div class="card">
<h3>CLI 编排</h3>
<p><code>main.py</code> 通过互斥参数在 <code>--scenario</code> 与 <code>--dataset-build-config</code> 之间分派,分别进入评估流程与题库构建流程。</p>
</div>
<div class="card">
<h3>Web 服务</h3>
<p><code>webmain.py</code> 负责 uvicorn 启动、日志文件轮转与 host/port 配置;<code>webapp.server</code> 注册 runs、scenarios、evaluations、pipeline、score 等 API。</p>
</div>
<div class="card">
<h3>核心执行器</h3>
<p><code>rag_eval.execution.runner</code> 负责加载 scenario、构建模型与 adapter、调用 <code>Evaluator</code> 执行并写出标准化产物。</p>
</div>
</div>
</section>
<section id="modules">
<h2 class="section-title">3. 核心模块说明</h2>
<p class="section-intro">以下模块覆盖数据准备、评估执行、Web 管理与 Siemens 业务适配的主要职责边界。</p>
<table>
<thead>
<tr>
<th>模块</th>
<th>路径</th>
<th>职责</th>
</tr>
</thead>
<tbody>
<tr>
<td><b>dataset_builder</b></td>
<td><code>rag_eval/dataset_builder/</code></td>
<td>PDF解析、source chunk 归一化、LLM 题目生成、草稿题库与构建产物写出。</td>
</tr>
<tr>
<td><b>execution</b></td>
<td><code>rag_eval/execution/</code></td>
<td>评估编排、在线/离线模式切换、adapter 调用、RAGAS 打分与结果聚合。</td>
</tr>
<tr>
<td><b>metrics</b></td>
<td><code>rag_eval/metrics/</code></td>
<td>RAGAS 指标注册、模型构建、评估管道装配、指标权重与文档权重聚合。</td>
</tr>
<tr>
<td><b>reporting</b></td>
<td><code>rag_eval/reporting/</code></td>
<td>运行产物写入、summary 生成、metadata 与 scenario 快照沉淀。</td>
</tr>
<tr>
<td><b>adapters</b></td>
<td><code>rag_eval/adapters/</code></td>
<td>HTTP/Python 应用适配器封装,把外部应用结果统一为 <code>answer / contexts / raw_response</code>。</td>
</tr>
<tr>
<td><b>webapp</b></td>
<td><code>webapp/</code></td>
<td>FastAPI Web 控制台、OpenAPI 文档、任务后台管理、场景扫描、历史报告查询。</td>
</tr>
<tr>
<td><b>apps</b></td>
<td><code>apps/siemens_pdf_qa/</code></td>
<td>西门子 CT 知识库问答适配器,基于 source chunk 证据构造 Prompt 并调用 OpenAI 兼容模型生成答案。</td>
</tr>
</tbody>
</table>
<div class="grid-3" style="margin-top:18px;">
<div class="card">
<h3>settings.py</h3>
<p>集中读取 <code>.env</code>:OpenAI Key/Base URL、RAGAS Judge/Embedding 模型、并发、阿里云 DocMind、<code>SCORE_API_TOKEN</code> 等。</p>
</div>
<div class="card">
<h3>registry.py</h3>
<p>定义 7 个受支持指标:faithfulness、answer_relevancy、context_recall、context_precision、noise_sensitivity、factual_correctness、semantic_similarity。</p>
</div>
<div class="card">
<h3>inline_scorer.py</h3>
<p>为 <code>/api/score</code> 提供模块级缓存评分器,按 <code>(judge_model, embedding_model)</code> 复用 LLM 与 embedding 连接。</p>
</div>
</div>
</section>
<section id="flows">
<h2 class="section-title">4. 数据流说明</h2>
<p class="section-intro">项目围绕四条关键流程展开:题库构建、在线评估、API 全链路 Pipeline 与 Dify 实时评分。</p>
<div class="flow-grid">
<div class="flow-card">
<h3>Flow A: 题库生成流程(Dataset Build)</h3>
<pre>PDF文件 → 阿里云DocMind解析 → 文档切片(source_chunks)
→ LLM生成题目 → CSV题库文件 → 人工审核</pre>
<p class="small muted" style="margin-top:12px;">对应入口:<code>main.py --dataset-build-config</code>;核心实现:<code>rag_eval.dataset_builder.runner</code>。</p>
</div>
<div class="flow-card">
<h3>Flow B: RAGAS评估流程(Online Evaluation)</h3>
<pre>CSV题库 → 规范化样本 → 应用适配器(siemens_pdf_qa)
→ LLM答题 → RAGAS指标计算 → 加权得分 → 报告产物</pre>
<p class="small muted" style="margin-top:12px;">对应 Siemens 场景:<code>scenarios/online/siemens-pdf-question-bank-online.yaml</code>,由 <code>apps.siemens_pdf_qa.adapter:run</code> 提供答案与证据片段。</p>
</div>
<div class="flow-card">
<h3>Flow C: 全链路 Pipeline(API触发)</h3>
<pre>POST /api/pipeline/jobs → 后台线程 → Flow A → Flow B → 产物路径</pre>
<p class="small muted" style="margin-top:12px;"><code>webapp.services.pipeline_task_manager</code> 在线程池中串行执行 <code>parsing_documents → generating_questions → evaluating</code> 三阶段,并返回 <code>scores.csv / summary.md / dataset.csv</code> 等路径。</p>
</div>
<div class="flow-card">
<h3>Flow D: Dify实时评分(/api/score)</h3>
<pre>Dify Agent → POST /api/score → InlineScorer → RAGAS metrics → 得分JSON</pre>
<p class="small muted" style="margin-top:12px;">当 <code>ground_truth</code> 缺失时,会自动跳过依赖参考答案的指标,并在响应中给出 <code>skipped_metrics</code>。</p>
</div>
</div>
</section>
<section id="metrics">
<h2 class="section-title">5. RAGAS 评估指标</h2>
<p class="section-intro">平台当前支持 7 个指标,既覆盖回答忠实度和相关性,也覆盖对参考答案、噪声片段和语义相似度的衡量。</p>
<div class="grid-4">
<div class="metric-card">
<h3>faithfulness</h3>
<span class="need-gt no">无需 ground_truth</span>
<p>回答对检索内容的忠实度,用于防止模型脱离证据“幻觉式”作答。</p>
</div>
<div class="metric-card">
<h3>answer_relevancy</h3>
<span class="need-gt no">无需 ground_truth</span>
<p>衡量回答与问题本身的相关性,判断是否真正命中用户所问。</p>
</div>
<div class="metric-card">
<h3>context_precision</h3>
<span class="need-gt no">无需 ground_truth</span>
<p>衡量检索片段的精准度,关注上下文中与回答真正相关的比例。</p>
</div>
<div class="metric-card">
<h3>context_recall</h3>
<span class="need-gt yes">需要 ground_truth</span>
<p>检索内容对标准答案覆盖程度,反映召回是否足够支撑正确作答。</p>
</div>
<div class="metric-card">
<h3>noise_sensitivity</h3>
<span class="need-gt yes">需要 ground_truth</span>
<p>系统对噪声检索片段的鲁棒性,越不受无关片段干扰越好。</p>
</div>
<div class="metric-card">
<h3>factual_correctness</h3>
<span class="need-gt yes">需要 ground_truth</span>
<p>与标准答案对齐的事实准确性,代码中作为端到端事实正确度指标。</p>
</div>
<div class="metric-card">
<h3>semantic_similarity</h3>
<span class="need-gt yes">需要 ground_truth</span>
<p>回答与标准答案的语义相似度,依赖 embedding,不需要额外 LLM 调用。</p>
</div>
<div class="metric-card">
<h3>指标默认集合</h3>
<span class="need-gt no">在线评分默认</span>
<p><code>/api/score</code> 的默认指标集合为 faithfulness、answer_relevancy、context_recall、context_precision。</p>
</div>
</div>
</section>
<section id="apis">
<h2 class="section-title">6. API 接口文档</h2>
<p class="section-intro">Web 层基于 FastAPI 提供任务提交、历史查询、LLM 配置管理与 Dify 实时评分接口。</p>
<table>
<thead>
<tr>
<th>方法</th>
<th>路径</th>
<th>说明</th>
</tr>
</thead>
<tbody>
<tr><td><b>POST</b></td><td><code>/api/pipeline/jobs</code></td><td>提交全链路评估任务,后台线程自动执行 PDF 解析、题库生成和在线评估。</td></tr>
<tr><td><b>GET</b></td><td><code>/api/pipeline/jobs/{id}</code></td><td>查询指定 Pipeline 任务状态、阶段、日志和产物路径。</td></tr>
<tr><td><b>POST</b></td><td><code>/api/score</code></td><td>Dify 实时评分接口,接收单条问答并返回各指标得分与综合得分。</td></tr>
<tr><td><b>POST</b></td><td><code>/api/llm-profiles/probe</code></td><td>临时测试 LLM / Embedding 连通性,无需先保存配置。</td></tr>
<tr><td><b>POST</b></td><td><code>/api/llm-profiles</code></td><td>创建命名的 LLM 配置档案,可供场景或控制台复用。</td></tr>
<tr><td><b>POST</b></td><td><code>/api/evaluations</code></td><td>基于已有场景 YAML 启动一次后台评估任务。</td></tr>
<tr><td><b>GET</b></td><td><code>/api/runs</code></td><td>获取历史运行列表,用于控制台报告页与明细页渲染。</td></tr>
</tbody>
</table>
<div class="grid-2" style="margin-top:18px;">
<div class="card">
<h3>/api/score 请求示例</h3>
<pre class="code-block">POST /api/score
Authorization: Bearer &lt;token&gt;
Content-Type: application/json
{
"question": "双源CT的时间分辨率是多少?",
"answer": "双源CT的单扇区时间分辨率为75ms。",
"contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
"ground_truth": "双源CT单扇区时间分辨率为75ms,需旋转135度。",
"context_separator": " |||| ",
"metrics": [
"faithfulness",
"answer_relevancy",
"context_recall",
"context_precision"
],
"judge_model": "gpt-5",
"embedding_model": "text-embedding-3-small"
}</pre>
</div>
<div class="card">
<h3>/api/score 响应示例</h3>
<pre class="code-block">{
"scores": {
"faithfulness": 0.875,
"answer_relevancy": 0.92,
"context_recall": 0.81,
"context_precision": 0.85
},
"weighted_score": 0.8638,
"latency_ms": 3420,
"skipped_metrics": [],
"error": null
}</pre>
</div>
</div>
<div class="grid-3" style="margin-top:18px;">
<div class="card">
<h3>Pipeline API</h3>
<p>返回 <code>job_id</code> 后即可轮询。阶段枚举包括 <code>queued</code>、<code>running</code>、<code>parsing_documents</code>、<code>evaluating</code>、<code>done</code>。</p>
</div>
<div class="card">
<h3>LLM Profiles</h3>
<p>支持保存 base URL、API Key、model、timeout,并通过 <code>/apply</code> 将配置写回场景 YAML,同时可补充 metric/doc 权重。</p>
</div>
<div class="card">
<h3>Runs API</h3>
<p>历史 run 会读取 <code>summary.md</code>、<code>scores.csv</code>、<code>scenario.snapshot.yaml</code>,聚合指标均值、分布和最低分样本。</p>
</div>
</div>
</section>
<section id="weights">
<h2 class="section-title">7. 指标权重配置</h2>
<p class="section-intro">平台支持两级权重:<code>metric_weights</code> 控制不同指标的重要性,<code>doc_weights</code> 控制不同文档对总体分数的影响。</p>
<pre class="code-block">metric_weights:
faithfulness: 0.35
context_recall: 0.25
context_precision: 0.20
answer_relevancy: 0.20
doc_weights:
"322_双源CT.pdf": 2.0</pre>
<div class="formula" style="margin-top:18px;">
<b>weighted_score 计算逻辑:</b><br />
1. <b>单样本综合分</b> = Σ(有效指标分 × 指标权重) / Σ(有效指标权重)<br />
2. <b>总体综合分</b> = Σ(单样本综合分 × 文档权重) / Σ(文档权重)<br />
3. 当权重未配置时,代码会自动退化为默认 1.0,即普通平均。<br />
4. <code>doc_weights</code> 不仅影响总体综合分,也会影响控制台中按文档聚合后的指标均值。
</div>
<div class="grid-2" style="margin-top:18px;">
<div class="card">
<h3>代码依据</h3>
<p><code>rag_eval.metrics.weights.compute_weighted_score()</code> 负责单样本指标加权;<code>compute_overall_weighted_score_mean()</code> 负责跨样本文档加权。</p>
</div>
<div class="card">
<h3>配置入口</h3>
<p>场景 YAML 可直接定义;Web 控制台也可通过 <code>/api/llm-profiles/apply</code> 将权重补丁写回场景文件。</p>
</div>
</div>
</section>
<section id="deployment">
<h2 class="section-title">8. 部署说明</h2>
<p class="section-intro">仓库自带 <code>deploy.sh</code>,适合 Linux 一键部署 Web 控制台与相关依赖。</p>
<div class="grid-2">
<div class="card">
<h3>Linux 部署步骤</h3>
<ol class="steps">
<li>准备 Python 3.12+ 环境,执行 <code>bash deploy.sh</code>。</li>
<li>脚本会自动创建 <code>.venv</code>、安装 <code>pyproject.toml</code> 依赖,并补装 <code>fastapi / uvicorn / httpx</code>。</li>
<li>若 <code>.env</code> 不存在,将从 <code>.env.example</code> 复制一份模板。</li>
<li>脚本初始化 <code>configs / logs / outputs / datasets</code> 目录,并尝试生成 demo 数据。</li>
<li>最后使用 <code>webmain.py</code> 后台启动服务,默认端口 8800,冲突时回退到 8801。</li>
</ol>
</div>
<div class="card">
<h3>常用命令</h3>
<pre class="code-block"># 依赖安装
uv sync
# CLI 运行在线/离线评估
.\.venv\Scripts\python.exe main.py --scenario scenarios\online\siemens-pdf-question-bank-online.yaml
# CLI 运行题库生成
.\.venv\Scripts\python.exe main.py --dataset-build-config scenarios\siemens_build\siemens-pdf-build.yaml
# 启动 Web 控制台
.\.venv\Scripts\python.exe webmain.py --host 127.0.0.1 --port 8800</pre>
</div>
</div>
<div class="pill-heading">关键 .env 配置</div>
<table>
<thead>
<tr>
<th>变量</th>
<th>用途</th>
<th>示例 / 默认</th>
</tr>
</thead>
<tbody>
<tr><td><code>OPENAI_API_KEY</code></td><td>OpenAI 兼容接口凭据</td><td><code>your-api-key</code></td></tr>
<tr><td><code>OPENAI_BASE_URL</code></td><td>统一 LLM / Embedding 网关地址</td><td><code>http://6.86.80.4:30080/v1</code></td></tr>
<tr><td><code>RAGAS_JUDGE_MODEL</code></td><td>RAGAS Judge 默认模型</td><td><code>gpt-5</code></td></tr>
<tr><td><code>RAGAS_EMBEDDING_MODEL</code></td><td>Embedding 默认模型</td><td><code>text-embedding-3-small</code></td></tr>
<tr><td><code>ALIBABA_ACCESS_KEY_ID</code> / <code>ALIBABA_ACCESS_KEY_SECRET</code></td><td>阿里云 DocMind 凭据</td><td>dataset build 必填</td></tr>
<tr><td><code>ALIBABA_ENDPOINT</code></td><td>DocMind 服务域名</td><td><code>docmind-api.cn-hangzhou.aliyuncs.com</code></td></tr>
<tr><td><code>DATASET_GENERATOR_MODEL</code></td><td>题库生成默认模型</td><td><code>qwen3.6-plus</code></td></tr>
<tr><td><code>SCORE_API_TOKEN</code></td><td><code>/api/score</code> Bearer 鉴权令牌</td><td>留空则不鉴权</td></tr>
<tr><td><code>RAGAS_METRIC_TIMEOUT_SECONDS</code></td><td>RAGAS 指标计算超时</td><td><code>300</code>(7 指标建议值)</td></tr>
</tbody>
</table>
</section>
<section id="stack">
<h2 class="section-title">9. 技术栈</h2>
<p class="section-intro">从依赖声明、Web 服务实现与测试代码来看,项目的技术栈如下。</p>
<div class="stack-badges">
<span class="badge"><b>后端</b> Python 3.12 · FastAPI · RAGAS 0.4.3 · Pydantic v2 · uvicorn</span>
<span class="badge"><b>AI / ML</b> OpenAI SDK · LangChain · ragas · instructor 风格结构</span>
<span class="badge"><b>文档解析</b> 阿里云 DocMind</span>
<span class="badge"><b>前端</b> Vanilla JS · Chart.js 风格报告页</span>
<span class="badge"><b>测试</b> pytest · FastAPI TestClient</span>
<span class="badge"><b>工具链</b> uv · pyproject.toml · YAML 场景驱动</span>
</div>
<div class="grid-3" style="margin-top:18px;">
<div class="card">
<h3>依赖声明</h3>
<p><code>pyproject.toml</code> 中列出了 <code>ragas==0.4.3</code>、<code>langchain-openai</code>、<code>datasets</code>、<code>pydantic-settings</code>、阿里云 DocMind SDK 等核心依赖。</p>
</div>
<div class="card">
<h3>Web 生态</h3>
<p>控制台基于 FastAPI + 静态页面,服务器通过 <code>webmain.py</code> 配置日志和 uvicorn,前端报告页结合图表与表格展示历史 run。</p>
</div>
<div class="card">
<h3>测试现状</h3>
<p>仓库同时存在 <code>pytest</code> 与 <code>fastapi.testclient.TestClient</code> 用例,涵盖 Pipeline、权重聚合、实时评分与 LLM 配置接口。</p>
</div>
</div>
</section>
<section id="structure">
<h2 class="section-title">10. 目录结构</h2>
<p class="section-intro">以下树状图概括了项目中最关键的源码、配置、数据、输出与测试位置。</p>
<pre class="tree">siemens_ragas/
├── README.md
├── pyproject.toml
├── main.py
├── webmain.py
├── deploy.sh
├── .env.example
├── apps/
│ ├── pdf_question_bank/
│ ├── sample_python/
│ └── siemens_pdf_qa/
├── datasets/
│ ├── raw/
│ └── normalized/
├── docs/
├── outputs/
├── scenarios/
│ ├── online/
│ ├── offline/
│ └── siemens_build/
├── tests/
│ ├── webapp/
│ ├── test_pipeline.py
│ ├── test_weights.py
│ └── test_webapp_report_builder.py
├── rag_eval/
│ ├── adapters/
│ ├── advisor/
│ ├── config/
│ ├── datasets/
│ ├── dataset_builder/
│ │ ├── generator/
│ │ └── parser/
│ ├── execution/
│ ├── metrics/
│ ├── reporting/
│ └── shared/
└── webapp/
├── api/
├── services/
└── static/
├── css/
└── js/</pre>
<div class="grid-2" style="margin-top:18px;">
<div class="card">
<h3>源码主路径</h3>
<p><code>rag_eval/</code> 是平台核心,<code>webapp/</code> 负责服务端 API 与控制台,<code>apps/</code> 放置面向不同业务应用的 adapter。</p>
</div>
<div class="card">
<h3>配置与产物</h3>
<p><code>scenarios/</code> 维护 YAML 场景,<code>datasets/</code> 存放原始/规范化数据,<code>outputs/</code> 产出评测运行结果与 dataset build 工件。</p>
</div>
</div>
</section>
<div class="footer">
Generated for <b>siemens_ragas</b> · Self-contained HTML overview · Siemens teal/blue documentation theme
</div>
</div>
</main>
</div>
<script>
// Element handles shared by the drawer and scrollspy code below.
const sidebar = document.querySelector('#sidebar');
const menuToggle = document.querySelector('#menu-toggle');
const backdrop = document.querySelector('#sidebar-backdrop');
// Static snapshots as real arrays: the sidebar links and the sections that carry an id.
const navLinks = [...document.querySelectorAll('.sidebar nav a')];
const sections = [...document.querySelectorAll('section[id]')];
// Mirrors the drawer state onto the toggle button so assistive technology
// can announce whether the navigation is expanded.
function syncToggleState(isOpen) {
  if (menuToggle) menuToggle.setAttribute('aria-expanded', String(isOpen));
}

// Hides the off-canvas sidebar and its backdrop.
function closeSidebar() {
  sidebar.classList.remove('open');
  backdrop.classList.remove('show');
  syncToggleState(false);
}

// Shows the off-canvas sidebar and its backdrop.
function openSidebar() {
  sidebar.classList.add('open');
  backdrop.classList.add('show');
  syncToggleState(true);
}

if (menuToggle) {
  // Declare which element the button controls and publish the initial state.
  menuToggle.setAttribute('aria-controls', 'sidebar');
  syncToggleState(sidebar.classList.contains('open'));
  menuToggle.addEventListener('click', () => {
    if (sidebar.classList.contains('open')) {
      closeSidebar();
    } else {
      openSidebar();
    }
  });
}

if (backdrop) {
  backdrop.addEventListener('click', closeSidebar);
}

// Escape dismisses an open drawer and returns focus to the button that opened it.
document.addEventListener('keydown', (event) => {
  if (event.key !== 'Escape' || !sidebar.classList.contains('open')) return;
  closeSidebar();
  if (menuToggle) menuToggle.focus();
});

// Following a nav link on a narrow viewport (same 980px breakpoint as the
// stylesheet) closes the drawer so the target section is not covered.
navLinks.forEach(link => {
  link.addEventListener('click', () => {
    if (window.innerWidth <= 980) closeSidebar();
  });
});
// Scrollspy: when a section enters the observed band of the viewport
// (top 20% and bottom 60% are cut off by rootMargin), mark its sidebar link
// as the active one and clear the mark from every other link.
const observer = new IntersectionObserver((entries) => {
  entries.forEach(entry => {
    if (!entry.isIntersecting) return;
    const targetHref = `#${entry.target.getAttribute('id')}`;
    navLinks.forEach(link => {
      const isCurrent = link.getAttribute('href') === targetHref;
      link.classList.toggle('active', isCurrent);
      // Expose the highlighted link to assistive technology as well,
      // not only through the visual .active style.
      if (isCurrent) {
        link.setAttribute('aria-current', 'location');
      } else {
        link.removeAttribute('aria-current');
      }
    });
  });
}, {
  rootMargin: '-20% 0px -60% 0px',
  threshold: 0.1
});
sections.forEach(section => observer.observe(section));
</script>
</body>
</html>