feat: Dify score API complete — add SCORE_API_TOKEN to .env.example

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
feat: add POST /api/score endpoint for Dify real-time scoring
2026-06-22 15:28:20 +08:00 · 2026-06-22 15:14:19 +08:00 · 2026-06-22 15:03:43 +08:00 · 2026-06-22 15:00:05 +08:00 · 2026-06-22 14:55:43 +08:00 · 2026-06-22 14:51:52 +08:00
36 changed files with 5729 additions and 65 deletions
--- a/.env.example
+++ b/.env.example
@@ -30,3 +30,8 @@ PARSER_FAILURE_MODE=fail

 # 生成题库时使用的模型（可在 Web 控制台 LLM 配置中按场景覆盖）
 DATASET_GENERATOR_MODEL=qwen3.6-plus
+
+# ===== Dify 集成 — 实时评分 API =====
+# 为 /api/score 端点设置 Bearer Token 鉴权（留空则不鉴权，仅适合内网部署；deploy.sh 默认监听 0.0.0.0，可被外部访问时务必设置）
+# Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
+SCORE_API_TOKEN=
--- a/deploy.sh
+++ b/deploy.sh
@@ -0,0 +1,173 @@
+#!/usr/bin/env bash
+# deploy.sh — Siemens RAGAS 一键部署脚本（Linux）
+# 用法：bash deploy.sh
+# 功能：检查环境 → 安装依赖 → 初始化配置 → 启动后台服务
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+# ── 颜色输出 ──────────────────────────────────────────────────────
+if [ -t 1 ]; then
+    GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
+else
+    GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''
+fi
+
+ok()   { echo -e "${GREEN}[OK]${NC}    $*"; }  # success message, written to stdout
+warn() { echo -e "${YELLOW}[WARN]${NC}  $*"; }  # warning message, written to stdout
+err()  { echo -e "${RED}[ERROR]${NC} $*" >&2; }  # error message, written to stderr
+info() { echo -e "${CYAN}[INFO]${NC}  $*"; }  # progress message, written to stdout
+
+echo ""
+echo -e "${CYAN}============================================================${NC}"
+echo -e "${CYAN}  Siemens RAGAS Console  —  Linux 一键部署${NC}"
+echo -e "${CYAN}============================================================${NC}"
+echo ""
+
+# ── Stage 1: Python version check ────────────────────────────────
+info "阶段 1/7：检查 Python 版本..."
+
+PYTHON_BIN=""
+for candidate in python3.12 python3.13 python3.14 python3; do
+    if command -v "$candidate" &>/dev/null; then
+        # Let the interpreter compare its own version tuple: testing major and minor
+        # separately would reject e.g. 4.0 (minor 0 < 12). The command prints
+        # "MAJOR.MINOR" and exits non-zero when the interpreter is older than 3.12.
+        if version=$("$candidate" -c "import sys; print('%d.%d' % sys.version_info[:2]); sys.exit(0 if sys.version_info >= (3, 12) else 1)" 2>/dev/null); then
+            PYTHON_BIN="$candidate"
+            ok "Python $version ($candidate)"
+            break
+        fi
+    fi
+done
+
+if [ -z "$PYTHON_BIN" ]; then
+    err "未找到 Python 3.12+。请安装后重试。"
+    err "  Ubuntu/Debian: sudo apt install python3.12 python3.12-venv"
+    err "  CentOS/RHEL:   sudo dnf install python3.12"
+    exit 1
+fi
+
+# ── 阶段 2：虚拟环境 ──────────────────────────────────────────────
+info "阶段 2/7：准备虚拟环境..."
+
+if [ -d ".venv" ] && [ -f ".venv/bin/python" ]; then
+    ok ".venv 已存在，跳过创建"
+else
+    info "创建 .venv..."
+    "$PYTHON_BIN" -m venv .venv
+    ok ".venv 创建完成"
+fi
+
+PIP=".venv/bin/pip"
+PYTHON=".venv/bin/python"
+
+# ── 阶段 3：安装依赖 ──────────────────────────────────────────────
+info "阶段 3/7：安装项目依赖（可能需要几分钟）..."
+
+"$PIP" install --upgrade pip -q
+ok "pip 已升级"
+
+"$PIP" install -e . -q
+ok "项目依赖安装完成（pyproject.toml）"
+
+"$PIP" install fastapi uvicorn httpx -q
+ok "Web 服务依赖安装完成（fastapi / uvicorn / httpx）"
+
+# ── 阶段 4：配置文件 ──────────────────────────────────────────────
+info "阶段 4/7：初始化配置文件..."
+
+if [ ! -f ".env" ]; then
+    cp .env.example .env
+    warn ".env 已从 .env.example 复制，请编辑填写实际的 API Key 等配置后再启动："
+    warn "  nano .env   或   vim .env"
+    warn "  关键字段：OPENAI_API_KEY, OPENAI_BASE_URL, ALIBABA_ACCESS_KEY_ID, ALIBABA_ACCESS_KEY_SECRET"
+else
+    ok ".env 已存在，跳过"
+fi
+
+# ── 阶段 5：目录初始化 ────────────────────────────────────────────
+info "阶段 5/7：初始化目录结构..."
+
+mkdir -p configs logs outputs datasets
+ok "目录就绪：configs/ logs/ outputs/ datasets/"
+
+# 确保其他脚本有执行权限
+for script in start.sh stop.sh run_eval.sh; do
+    [ -f "$script" ] && chmod +x "$script"
+done
+ok "辅助脚本已设置执行权限"
+
+# ── 阶段 6：Demo 数据 ─────────────────────────────────────────────
+info "阶段 6/7：初始化演示数据..."
+
+DEMO_DIR="outputs/kba-knowledge-base-offline-baseline"
+if [ -d "$DEMO_DIR" ]; then
+    ok "演示数据已存在，跳过"
+else
+    info "生成演示数据（scripts/seed_sample_run.py）..."
+    if "$PYTHON" scripts/seed_sample_run.py; then
+        ok "演示数据生成完成"
+    else
+        warn "演示数据生成失败，控制台报告页将为空（服务仍可正常启动）"
+    fi
+fi
+
+# ── 阶段 7：启动服务 ──────────────────────────────────────────────
+info "阶段 7/7：启动 Web 服务..."
+
+# 检查 .env 是否包含默认占位符
+if grep -q "your-api-key" .env 2>/dev/null; then
+    warn ".env 中仍包含默认占位符，部分功能（评估执行）将不可用"
+    warn "请编辑 .env 后重新运行 start.sh"
+fi
+
+# Stop a previous instance BEFORE probing ports; otherwise the port it still holds looks occupied
+if [ -f ".server.pid" ]; then
+    OLD_PID=$(cat .server.pid)
+    if kill -0 "$OLD_PID" 2>/dev/null; then
+        warn "检测到已有服务进程 (PID=$OLD_PID)，停止旧进程..."
+        kill "$OLD_PID" 2>/dev/null || true
+        for _ in 1 2 3 4 5; do kill -0 "$OLD_PID" 2>/dev/null || break; sleep 1; done  # wait up to 5 s for it to exit
+    fi
+    rm -f .server.pid
+fi
+
+# Port probe. Plain grep (not -q) reads all input, so ss/netstat cannot die of SIGPIPE and trip pipefail
+PORT=8800
+if ss -tlnp 2>/dev/null | grep ":$PORT " >/dev/null || netstat -tlnp 2>/dev/null | grep ":$PORT " >/dev/null; then
+    warn "端口 $PORT 已被占用，尝试 8801..."
+    PORT=8801
+    if ss -tlnp 2>/dev/null | grep ":$PORT " >/dev/null || netstat -tlnp 2>/dev/null | grep ":$PORT " >/dev/null; then
+        err "端口 8800 和 8801 均被占用。请手动运行："
+        err "  .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
+        exit 1
+    fi
+fi
+
+# 后台启动
+nohup "$PYTHON" webmain.py --host 0.0.0.0 --port "$PORT" >> logs/server.log 2>&1 &
+SERVER_PID=$!
+echo "$SERVER_PID" > .server.pid
+
+# 等待 3 秒验证进程存活
+sleep 3
+if kill -0 "$SERVER_PID" 2>/dev/null; then
+    ok "服务已启动 (PID=$SERVER_PID)"
+    echo ""
+    echo -e "${CYAN}============================================================${NC}"
+    echo -e "${GREEN}  部署成功！${NC}"
+    echo -e "${GREEN}  访问地址: http://$(hostname -I | awk '{print $1}'):${PORT}${NC}"
+    echo -e "${GREEN}  本机访问: http://127.0.0.1:${PORT}${NC}"
+    echo -e "${CYAN}  服务日志: tail -f logs/server.log${NC}"
+    echo -e "${CYAN}  停止服务: bash stop.sh${NC}"
+    echo -e "${CYAN}============================================================${NC}"
+    echo ""
+else
+    err "服务启动失败，请查看日志："
+    err "  tail -20 logs/server.log"
+    rm -f .server.pid
+    exit 1
+fi
--- a/docs/superpowers/plans/2026-06-18-metric-doc-weights.md
+++ b/docs/superpowers/plans/2026-06-18-metric-doc-weights.md
--- a/docs/superpowers/plans/2026-06-22-dify-score-api.md
+++ b/docs/superpowers/plans/2026-06-22-dify-score-api.md
@@ -0,0 +1,974 @@
+# Dify 实时评分 API Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** 新增 `POST /api/score` 端点，供 Dify 外部 Tool 调用，接受单条问答记录并同步返回 RAGAS 各指标得分。
+
+**Architecture:** 新增 `inline_scorer.py` 服务层封装 RAGAS 打分逻辑，以 `(judge_model, embedding_model)` 为 key 缓存 LLM 客户端；新增 `webapp/api/score.py` 路由；`ScoreRequest`/`ScoreResponse` 放入 `webapp/models.py`；`SCORE_API_TOKEN` 加入 `EvaluationSettings`。
+
+**Tech Stack:** Python 3.12, FastAPI, Pydantic v2, RAGAS 0.4.3, pytest
+
+## Global Constraints
+
+- Python 3.12+，PEP 8，4 空格缩进，类型注解必须
+- contexts 用 `context_separator`（默认 `" |||| "`）拆分为 list[str]
+- ground_truth 为可选；缺失时跳过 context_recall / factual_correctness / semantic_similarity / noise_sensitivity
+- SCORE_API_TOKEN 为空时不鉴权（内网部署场景）
+- 所有测试用 pytest，不依赖真实 LLM
+
+---
+
+## 文件清单
+
+| 操作 | 文件 | 职责 |
+|------|------|------|
+| 新建 | `webapp/services/inline_scorer.py` | LLM 客户端缓存 + 单题打分 |
+| 新建 | `webapp/api/score.py` | `/api/score` 路由 |
+| 新建 | `tests/webapp/test_score_api.py` | 端点测试（全 mock） |
+| 修改 | `webapp/models.py` | 新增 ScoreRequest / ScoreResponse |
+| 修改 | `rag_eval/settings.py` | 新增 score_api_token 字段 |
+| 修改 | `webapp/server.py` | 注册 score router，更新 OPENAPI_TAGS 和 description |
+
+---
+
+## Task 1: ScoreRequest / ScoreResponse 模型 + settings 字段
+
+**Files:**
+- Modify: `webapp/models.py`
+- Modify: `rag_eval/settings.py`
+- Test: `tests/webapp/test_score_api.py` (partial — model validation tests)
+
+**Interfaces:**
+- Produces:
+  - `ScoreRequest` Pydantic model（见下方字段）
+  - `ScoreResponse` Pydantic model
+  - `EvaluationSettings.score_api_token: str | None`
+
+- [ ] **Step 1: Write failing model-validation tests**
+
+Create `tests/webapp/test_score_api.py`:
+
+```python
+"""Tests for POST /api/score endpoint."""
+from __future__ import annotations
+
+import math
+import pytest
+from pydantic import ValidationError
+from webapp.models import ScoreRequest, ScoreResponse
+
+
+class TestScoreRequest:
+    def test_minimal_valid_request(self):
+        """Only required fields — question, answer, contexts."""
+        req = ScoreRequest(
+            question="What is CT?",
+            answer="CT is imaging.",
+            contexts="CT uses X-rays.",
+        )
+        assert req.question == "What is CT?"
+        assert req.contexts == "CT uses X-rays."
+        assert req.ground_truth is None
+        assert req.context_separator == " |||| "
+        assert req.metrics == ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]
+
+    def test_contexts_split_by_separator(self):
+        """contexts_as_list() splits on context_separator."""
+        req = ScoreRequest(
+            question="q", answer="a",
+            contexts="ctx1 |||| ctx2 |||| ctx3",
+            context_separator=" |||| ",
+        )
+        assert req.contexts_as_list() == ["ctx1", "ctx2", "ctx3"]
+
+    def test_contexts_split_custom_separator(self):
+        req = ScoreRequest(
+            question="q", answer="a",
+            contexts="a---b---c",
+            context_separator="---",
+        )
+        assert req.contexts_as_list() == ["a", "b", "c"]
+
+    def test_contexts_split_single_item(self):
+        req = ScoreRequest(question="q", answer="a", contexts="only one")
+        assert req.contexts_as_list() == ["only one"]
+
+    def test_missing_question_raises(self):
+        with pytest.raises(ValidationError):
+            ScoreRequest(answer="a", contexts="c")  # type: ignore[call-arg]
+
+    def test_missing_answer_raises(self):
+        with pytest.raises(ValidationError):
+            ScoreRequest(question="q", contexts="c")  # type: ignore[call-arg]
+
+    def test_missing_contexts_raises(self):
+        with pytest.raises(ValidationError):
+            ScoreRequest(question="q", answer="a")  # type: ignore[call-arg]
+
+    def test_custom_metrics_accepted(self):
+        req = ScoreRequest(
+            question="q", answer="a", contexts="c",
+            metrics=["faithfulness"],
+        )
+        assert req.metrics == ["faithfulness"]
+
+    def test_invalid_metric_name_raises(self):
+        with pytest.raises(ValidationError):
+            ScoreRequest(question="q", answer="a", contexts="c", metrics=["not_a_metric"])
+
+    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
+        """Without ground_truth, GT-dependent metrics are excluded."""
+        req = ScoreRequest(
+            question="q", answer="a", contexts="c",
+            metrics=["faithfulness", "context_recall", "factual_correctness", "semantic_similarity", "noise_sensitivity"],
+        )
+        effective = req.effective_metrics()
+        assert "faithfulness" in effective
+        assert "context_recall" not in effective
+        assert "factual_correctness" not in effective
+        assert "semantic_similarity" not in effective
+        assert "noise_sensitivity" not in effective
+
+    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
+        req = ScoreRequest(
+            question="q", answer="a", contexts="c", ground_truth="gt",
+            metrics=["faithfulness", "context_recall", "factual_correctness"],
+        )
+        effective = req.effective_metrics()
+        assert effective == ["faithfulness", "context_recall", "factual_correctness"]
+
+
+class TestScoreResponse:
+    def test_score_response_structure(self):
+        resp = ScoreResponse(
+            scores={"faithfulness": 0.85, "answer_relevancy": None},
+            weighted_score=0.85,
+            latency_ms=1200,
+        )
+        assert resp.scores["faithfulness"] == 0.85
+        assert resp.scores["answer_relevancy"] is None
+        assert resp.latency_ms == 1200
+```
+
+- [ ] **Step 2: Run to verify FAIL**
+
+```
+cd /path/to/siemens_ragas
+python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
+```
+Expected: `ImportError: cannot import name 'ScoreRequest' from 'webapp.models'`
+
+- [ ] **Step 3: Add ScoreRequest and ScoreResponse to `webapp/models.py`**
+
+Append to the end of `webapp/models.py` (after `PipelineJobResponse`):
+
+```python
+# ---------------------------------------------------------------------------
+# Dify 实时评分 API 模型
+# ---------------------------------------------------------------------------
+
+# 需要 ground_truth 才能计算的指标集合
+_GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
+    "context_recall",
+    "factual_correctness",
+    "semantic_similarity",
+    "noise_sensitivity",
+})
+
+# 所有合法指标名称
+_VALID_METRICS: frozenset[str] = frozenset({
+    "faithfulness",
+    "answer_relevancy",
+    "context_recall",
+    "context_precision",
+    "noise_sensitivity",
+    "factual_correctness",
+    "semantic_similarity",
+})
+
+_DEFAULT_SCORE_METRICS: list[str] = [
+    "faithfulness",
+    "answer_relevancy",
+    "context_recall",
+    "context_precision",
+]
+
+
+class ScoreRequest(BaseModel):
+    """Request body for the real-time single-sample scoring endpoint."""
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "examples": [
+                {
+                    "summary": "基础评分请求",
+                    "value": {
+                        "question": "双源CT的时间分辨率是多少?",
+                        "answer": "双源CT的单扇区时间分辨率为75ms。",
+                        "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
+                        "ground_truth": "双源CT单扇区时间分辨率为75ms，需旋转135度。",
+                        "context_separator": " |||| ",
+                        "metrics": ["faithfulness", "answer_relevancy", "context_recall", "context_precision"],
+                        "judge_model": "deepseek-v4-flash",
+                        "embedding_model": "text-embedding-v3",
+                    },
+                }
+            ]
+        }
+    )
+
+    question: str = Field(description="问题文本。")
+    answer: str = Field(description="待评分的回答。")
+    contexts: str = Field(
+        description="检索上下文字符串，多段之间用 context_separator 拼接。"
+    )
+    ground_truth: str | None = Field(
+        default=None,
+        description="标准参考答案（可选）。缺失时自动跳过需要它的指标。",
+    )
+    context_separator: str = Field(
+        default=" |||| ",
+        description="contexts 字段中段落分隔符，默认为四个竖线两侧各一空格。",
+    )
+    metrics: list[str] = Field(
+        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
+        description="需要计算的 RAGAS 指标列表。",
+    )
+    judge_model: str | None = Field(
+        default=None,
+        description="Judge LLM 模型名称；为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
+    )
+    embedding_model: str | None = Field(
+        default=None,
+        description="Embedding 模型名称；为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
+    )
+
+    @field_validator("metrics")
+    @classmethod
+    def validate_metric_names(cls, value: list[str]) -> list[str]:
+        """Reject any metric name not in the supported registry."""
+        invalid = [m for m in value if m not in _VALID_METRICS]
+        if invalid:
+            raise ValueError(
+                f"不支持的指标名称：{invalid}。"
+                f"合法值：{sorted(_VALID_METRICS)}"
+            )
+        if not value:
+            raise ValueError("metrics 不能为空列表。")
+        return value
+
+    def contexts_as_list(self) -> list[str]:
+        """Split the contexts string into a list of non-empty fragments."""
+        sep = self.context_separator or " |||| "
+        return [s.strip() for s in self.contexts.split(sep) if s.strip()]
+
+    def effective_metrics(self) -> list[str]:
+        """Return metrics filtered to exclude GT-dependent ones when ground_truth is absent."""
+        if self.ground_truth is not None:
+            return list(self.metrics)
+        return [m for m in self.metrics if m not in _GT_DEPENDENT_METRICS]
+
+
+class ScoreResponse(BaseModel):
+    """Response payload for the real-time scoring endpoint."""
+
+    scores: dict[str, float | None] = Field(
+        description="各指标得分（NaN 或计算失败时为 null）。"
+    )
+    weighted_score: float | None = Field(
+        default=None,
+        description="等权加权综合得分（仅对非 null 指标求均值）。",
+    )
+    latency_ms: int = Field(description="服务端打分耗时（毫秒）。")
+    skipped_metrics: list[str] = Field(
+        default_factory=list,
+        description="因缺少 ground_truth 而跳过的指标名称列表。",
+    )
+    error: str | None = Field(
+        default=None,
+        description="打分异常时的错误信息（HTTP 200 仍返回，scores 为空）。",
+    )
+```
+
+Also add `field_validator` to the import line at the top of `webapp/models.py`:
+```python
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+```
+
+- [ ] **Step 4: Add `score_api_token` to `rag_eval/settings.py`**
+
+Add after the `dataset_generator_model` field:
+```python
+score_api_token: str | None = Field(
+    default=None,
+    alias="SCORE_API_TOKEN",
+    description="Bearer token for /api/score endpoint. Empty = no auth.",
+)
+```
+
+- [ ] **Step 5: Run to verify PASS**
+
+```
+python -m pytest tests/webapp/test_score_api.py::TestScoreRequest tests/webapp/test_score_api.py::TestScoreResponse -v
+```
+Expected: all 12 tests PASS.
+
+- [ ] **Step 6: Commit**
+
+```
+git add webapp/models.py rag_eval/settings.py tests/webapp/test_score_api.py
+git commit -m "feat: add ScoreRequest/ScoreResponse models and SCORE_API_TOKEN setting"
+```
+
+---
+
+## Task 2: InlineScorer 服务（LLM 缓存 + 打分）
+
+**Files:**
+- Create: `webapp/services/inline_scorer.py`
+
+**Interfaces:**
+- Consumes:
+  - `build_models(judge_model, embedding_model, settings) -> tuple[Any, Any]` from `rag_eval.metrics.factory`
+  - `MetricPipeline(metrics, metric_timeout_seconds)` from `rag_eval.metrics.pipeline`
+  - `NormalizedSample` from `rag_eval.shared.models`
+  - `compute_weighted_score(scores, metric_weights) -> float | None` from `rag_eval.metrics.weights`
+  - `EvaluationSettings` from `rag_eval.settings`
+- Produces:
+  - `inline_scorer: InlineScorer` (module-level singleton)
+  - `InlineScorer.score(question, answer, contexts, ground_truth, metrics, judge_model, embedding_model, settings) -> dict[str, float | None]`
+
+- [ ] **Step 1: Write failing test**
+
+Add to `tests/webapp/test_score_api.py`:
+
+```python
+class TestInlineScorer:
+    def test_score_returns_dict_with_requested_metrics(self):
+        """InlineScorer.score returns a dict keyed by the requested metrics."""
+        from unittest.mock import AsyncMock, MagicMock, patch
+        from webapp.services.inline_scorer import InlineScorer
+        from rag_eval.settings import EvaluationSettings
+
+        mock_score = MagicMock()
+        mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8}
+        mock_score.error = ""
+
+        mock_pipeline = MagicMock()
+        mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
+
+        with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
+            with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
+                with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
+                    scorer = InlineScorer()
+                    result = scorer.score(
+                        question="q", answer="a",
+                        contexts=["ctx1"],
+                        ground_truth=None,
+                        metrics=["faithfulness", "answer_relevancy"],
+                        judge_model="test-model",
+                        embedding_model="test-embed",
+                        settings=EvaluationSettings(_env_file=None),
+                    )
+        assert "faithfulness" in result
+        assert "answer_relevancy" in result
+        assert result["faithfulness"] == pytest.approx(0.9)
+
+    def test_score_converts_nan_to_none(self):
+        """NaN scores are converted to None in the returned dict."""
+        import math
+        from unittest.mock import AsyncMock, MagicMock, patch
+        from webapp.services.inline_scorer import InlineScorer
+        from rag_eval.settings import EvaluationSettings
+
+        mock_score = MagicMock()
+        mock_score.metrics = {"faithfulness": float("nan")}
+        mock_score.error = ""
+
+        mock_pipeline = MagicMock()
+        mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
+
+        with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
+            with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
+                with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
+                    scorer = InlineScorer()
+                    result = scorer.score(
+                        question="q", answer="a", contexts=["c"],
+                        ground_truth=None,
+                        metrics=["faithfulness"],
+                        judge_model="m", embedding_model="e",
+                        settings=EvaluationSettings(_env_file=None),
+                    )
+        assert result["faithfulness"] is None
+```
+
+- [ ] **Step 2: Run to verify FAIL**
+
+```
+python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
+```
+Expected: `ModuleNotFoundError: No module named 'webapp.services.inline_scorer'`
+
+- [ ] **Step 3: Create `webapp/services/inline_scorer.py`**
+
+```python
+"""LLM-cached inline RAGAS scorer for the real-time /api/score endpoint.
+
+A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by
+(judge_model, embedding_model), so repeated Dify Tool calls with the same
+models reuse existing AsyncOpenAI connections instead of creating new ones.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import math
+import threading
+from typing import Any
+
+from rag_eval.compat import ensure_ragas_import_compat
+from rag_eval.metrics.factory import build_models
+from rag_eval.metrics.pipeline import MetricPipeline
+from rag_eval.metrics.weights import compute_weighted_score
+from rag_eval.settings import EvaluationSettings
+from rag_eval.shared.models import NormalizedSample
+
+ensure_ragas_import_compat()
+
+from ragas.metrics.collections import (  # noqa: E402
+    AnswerRelevancy,
+    ContextPrecision,
+    ContextRecall,
+    FactualCorrectness,
+    Faithfulness,
+    NoiseSensitivity,
+    SemanticSimilarity,
+)
+
+
+def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]:
+    """Instantiate only the RAGAS metric objects requested."""
+    registry: dict[str, Any] = {
+        "faithfulness": Faithfulness(llm=llm),
+        "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
+        "context_recall": ContextRecall(llm=llm),
+        "context_precision": ContextPrecision(llm=llm),
+        "noise_sensitivity": NoiseSensitivity(llm=llm),
+        "factual_correctness": FactualCorrectness(llm=llm),
+        "semantic_similarity": SemanticSimilarity(embeddings=embeddings),
+    }
+    return {name: registry[name] for name in metrics if name in registry}
+
+
+class InlineScorer:
+    """Thread-safe single-sample RAGAS scorer with LLM client caching."""
+
+    def __init__(self) -> None:
+        # Cache keyed by (judge_model, embedding_model) -> (llm, embeddings)
+        self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
+        self._lock = threading.Lock()
+
+    def _get_models(
+        self,
+        judge_model: str,
+        embedding_model: str,
+        settings: EvaluationSettings,
+    ) -> tuple[Any, Any]:
+        """Return cached LLM/embedding clients, building them on first use."""
+        cache_key = (judge_model, embedding_model)
+        with self._lock:
+            if cache_key not in self._model_cache:
+                llm, embeddings = build_models(judge_model, embedding_model, settings)
+                self._model_cache[cache_key] = (llm, embeddings)
+            return self._model_cache[cache_key]
+
+    def score(
+        self,
+        question: str,
+        answer: str,
+        contexts: list[str],
+        ground_truth: str | None,
+        metrics: list[str],
+        judge_model: str,
+        embedding_model: str,
+        settings: EvaluationSettings,
+    ) -> dict[str, float | None]:
+        """Score one sample synchronously and return {metric_name: score | None}.
+
+        NaN values from RAGAS are converted to None for clean JSON serialization.
+        """
+        llm, embeddings = self._get_models(judge_model, embedding_model, settings)
+        metric_instances = _build_metric_instances(metrics, llm, embeddings)
+
+        pipeline = MetricPipeline(
+            metrics=metric_instances,
+            metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
+        )
+
+        sample = NormalizedSample(
+            sample_id="inline-score",
+            question=question,
+            answer=answer,
+            contexts=contexts,
+            ground_truth=ground_truth or "",
+        )
+
+        metric_score = asyncio.run(pipeline.score_sample(sample))
+
+        # Convert NaN → None for clean JSON output
+        return {
+            name: (None if math.isnan(v) or math.isinf(v) else round(v, 4))
+            for name, v in metric_score.metrics.items()
+        }
+
+
+# Module-level singleton shared by FastAPI routes.
+inline_scorer = InlineScorer()
+```
+
+- [ ] **Step 4: Run to verify PASS**
+
+```
+python -m pytest tests/webapp/test_score_api.py::TestInlineScorer -v
+```
+Expected: both tests PASS.
+
+- [ ] **Step 5: Commit**
+
+```
+git add webapp/services/inline_scorer.py tests/webapp/test_score_api.py
+git commit -m "feat: add InlineScorer service with LLM client caching"
+```
+
+---
+
+## Task 3: `/api/score` 路由 + 鉴权 + 集成测试
+
+**Files:**
+- Create: `webapp/api/score.py`
+- Modify: `webapp/server.py`
+
+**Interfaces:**
+- Consumes:
+  - `ScoreRequest`, `ScoreResponse` from `webapp.models`
+  - `inline_scorer: InlineScorer` from `webapp.services.inline_scorer`
+  - `EvaluationSettings` from `rag_eval.settings`
+  - `compute_weighted_score(scores, {}) -> float | None` from `rag_eval.metrics.weights`
+- Produces: `POST /api/score` endpoint
+
+- [ ] **Step 1: Write failing endpoint tests**
+
+Add to `tests/webapp/test_score_api.py`:
+
+```python
+# ── Fixtures ─────────────────────────────────────────────────────────────────
+import pytest
+from fastapi.testclient import TestClient
+from unittest.mock import MagicMock, patch
+
+
+@pytest.fixture()
+def client(monkeypatch):
+    """TestClient with mocked InlineScorer."""
+    import webapp.api.score as score_mod
+
+    mock_scorer = MagicMock()
+    mock_scorer.score.return_value = {
+        "faithfulness": 0.85,
+        "answer_relevancy": 0.90,
+    }
+    monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+    from webapp.server import create_app
+    return TestClient(create_app())
+
+
+class TestScoreEndpoint:
+    def test_post_score_returns_200(self, client):
+        resp = client.post("/api/score", json={
+            "question": "What is CT?",
+            "answer": "CT is imaging.",
+            "contexts": "CT uses X-rays.",
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert "scores" in data
+        assert "latency_ms" in data
+        assert data["scores"]["faithfulness"] == pytest.approx(0.85)
+
+    def test_weighted_score_computed(self, client):
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        # weighted_score is the mean of all non-null scores
+        assert data["weighted_score"] is not None
+
+    def test_missing_required_fields_returns_422(self, client):
+        resp = client.post("/api/score", json={"question": "q"})
+        assert resp.status_code == 422
+
+    def test_invalid_metric_name_returns_422(self, client):
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+            "metrics": ["not_a_metric"],
+        })
+        assert resp.status_code == 422
+
+    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+            "metrics": ["faithfulness", "context_recall"],
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert "context_recall" in data["skipped_metrics"]
+
+    def test_contexts_split_on_separator(self, client, monkeypatch):
+        """contexts string is split before passing to scorer."""
+        import webapp.api.score as score_mod
+        calls = []
+        def capture(*args, **kwargs):
+            calls.append(kwargs.get("contexts", []))
+            return {"faithfulness": 0.9}
+        monkeypatch.setattr(score_mod.inline_scorer, "score", capture)
+
+        client.post("/api/score", json={
+            "question": "q", "answer": "a",
+            "contexts": "ctx1 |||| ctx2",
+            "context_separator": " |||| ",
+        })
+        assert calls[0] == ["ctx1", "ctx2"]
+
+    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
+        """When SCORE_API_TOKEN is set, requests without token get 401."""
+        import webapp.api.score as score_mod
+        from rag_eval.settings import EvaluationSettings
+
+        mock_settings = EvaluationSettings(_env_file=None)
+        object.__setattr__(mock_settings, "score_api_token", "secret-token")
+        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
+
+        mock_scorer = MagicMock()
+        mock_scorer.score.return_value = {"faithfulness": 0.9}
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        test_client = TestClient(create_app())
+
+        # No auth header → 401
+        resp = test_client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+        })
+        assert resp.status_code == 401
+
+        # Correct token → 200
+        resp = test_client.post("/api/score",
+            json={"question": "q", "answer": "a", "contexts": "c"},
+            headers={"Authorization": "Bearer secret-token"},
+        )
+        assert resp.status_code == 200
+
+    def test_wrong_bearer_token_returns_401(self, monkeypatch):
+        import webapp.api.score as score_mod
+        from rag_eval.settings import EvaluationSettings
+
+        mock_settings = EvaluationSettings(_env_file=None)
+        object.__setattr__(mock_settings, "score_api_token", "correct-token")
+        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
+
+        mock_scorer = MagicMock()
+        mock_scorer.score.return_value = {}
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        test_client = TestClient(create_app())
+        resp = test_client.post("/api/score",
+            json={"question": "q", "answer": "a", "contexts": "c"},
+            headers={"Authorization": "Bearer wrong-token"},
+        )
+        assert resp.status_code == 401
+```
+
+- [ ] **Step 2: Run to verify FAIL**
+
+```
+python -m pytest tests/webapp/test_score_api.py::TestScoreEndpoint -v
+```
+Expected: `ModuleNotFoundError: No module named 'webapp.api.score'`
+
+- [ ] **Step 3: Create `webapp/api/score.py`**
+
+```python
+"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
+
+from __future__ import annotations
+
+import time
+
+from fastapi import APIRouter, Header, HTTPException
+from typing import Annotated
+
+from rag_eval.metrics.weights import compute_weighted_score
+from rag_eval.settings import EvaluationSettings
+from webapp.models import ScoreRequest, ScoreResponse
+from webapp.services.inline_scorer import inline_scorer
+
+router = APIRouter(prefix="/api/score", tags=["score"])
+
+
+def _get_settings() -> EvaluationSettings:
+    """Construct a new EvaluationSettings on each call; tests replace this hook."""
+    settings = EvaluationSettings()
+    return settings
+
+
+def _check_auth(authorization: str | None, token: str) -> None:
+    """Validate an ``Authorization: Bearer <credential>`` header against *token*.
+
+    Args:
+        authorization: raw value of the Authorization header, or None when the
+            request did not send one.
+        token: the configured secret the presented credential must equal.
+
+    Raises:
+        HTTPException: 401 when the header is missing, is not of the form
+            ``Bearer <credential>`` (scheme matched case-insensitively), or the
+            credential differs from *token*.
+    """
+    import hmac  # local import keeps this block self-contained
+
+    if authorization is None:
+        raise HTTPException(status_code=401, detail="Missing Authorization header.")
+    scheme, separator, credential = authorization.partition(" ")
+    # hmac.compare_digest is designed to resist timing analysis, unlike ``!=``,
+    # which may return as soon as the first differing character is found.
+    # Both sides are encoded to bytes because compare_digest rejects str
+    # arguments that contain non-ASCII characters.
+    token_matches = hmac.compare_digest(
+        credential.encode("utf-8"), token.encode("utf-8")
+    )
+    # ``separator`` is empty when the header has no space at all ("Bearer").
+    if not separator or scheme.lower() != "bearer" or not token_matches:
+        raise HTTPException(status_code=401, detail="Invalid Bearer token.")
+
+
+@router.post(
+    "",
+    response_model=ScoreResponse,
+    summary="单题实时评分（Dify 外部 Tool）",
+    responses={
+        200: {"description": "各指标得分和加权综合得分。"},
+        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
+        422: {"description": "请求参数校验失败。"},
+    },
+)
+def score_sample(
+    request: ScoreRequest,
+    authorization: Annotated[str | None, Header()] = None,
+) -> ScoreResponse:
+    # NOTE: FastAPI uses the handler docstring as the OpenAPI operation
+    # description, so the docstring text below is kept verbatim.
+    """接受单条问答记录，同步运行 RAGAS 指标打分，实时返回各指标得分。
+
+    供 Dify 外部 Tool 调用。将 `contexts` 字段按 `context_separator` 拆分后传入
+    RAGAS 管道；`ground_truth` 缺失时自动跳过依赖它的指标。
+    """
+    # English summary of the docstring: accept one question/answer record, run
+    # the RAGAS metrics synchronously and return the per-metric scores. Intended
+    # to be called as a Dify external Tool. `contexts` is split on
+    # `context_separator` before scoring; metrics that depend on `ground_truth`
+    # are skipped when it is missing.
+    settings = _get_settings()
+
+    # Authentication is enforced only when a token is configured.
+    if settings.score_api_token:
+        _check_auth(authorization, settings.score_api_token)
+
+    # Per-request model names take precedence over the configured defaults.
+    judge_model = request.judge_model or settings.ragas_judge_model
+    embedding_model = request.embedding_model or settings.ragas_embedding_model
+    # `effective` is the list of metrics handed to the scorer; `skipped` is
+    # every requested metric that is not in it, reported back sorted by name.
+    effective = request.effective_metrics()
+    requested = set(request.metrics)
+    skipped = sorted(requested - set(effective))
+
+    if not effective:
+        # All requested metrics require ground_truth which is absent.
+        return ScoreResponse(
+            scores={m: None for m in request.metrics},
+            weighted_score=None,
+            latency_ms=0,
+            skipped_metrics=skipped,
+        )
+
+    # Monotonic clock, so the latency is not affected by wall-clock adjustments.
+    t0 = time.monotonic()
+    try:
+        raw_scores = inline_scorer.score(
+            question=request.question,
+            answer=request.answer,
+            contexts=request.contexts_as_list(),
+            ground_truth=request.ground_truth,
+            metrics=effective,
+            judge_model=judge_model,
+            embedding_model=embedding_model,
+            settings=settings,
+        )
+    except Exception as exc:  # noqa: BLE001
+        # Scoring failures are reported in-band: a regular ScoreResponse with
+        # empty `scores` and the exception text in `error`, not an HTTP error.
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        return ScoreResponse(
+            scores={},
+            weighted_score=None,
+            latency_ms=latency_ms,
+            skipped_metrics=skipped,
+            error=f"{type(exc).__name__}: {exc}",
+        )
+
+    latency_ms = int((time.monotonic() - t0) * 1000)
+
+    # Merge: skipped metrics appear as null in final scores dict.
+    all_scores: dict[str, float | None] = {m: None for m in request.metrics}
+    all_scores.update(raw_scores)
+
+    # Weighted score = equal-weight mean of non-null effective scores.
+    # The empty dict passed as the second argument supplies no per-metric weights.
+    weighted = compute_weighted_score(
+        {k: v for k, v in raw_scores.items() if v is not None},
+        {},
+    )
+
+    return ScoreResponse(
+        scores=all_scores,
+        weighted_score=round(weighted, 4) if weighted is not None else None,
+        latency_ms=latency_ms,
+        skipped_metrics=skipped,
+    )
+```
+
+- [ ] **Step 4: Register router in `webapp/server.py`**
+
+Add `score` to the import line:
+```python
+from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score
+```
+
+Add the router registration after `pipeline.router`:
+```python
+app.include_router(score.router)
+```
+
+Add `"score"` tag to `OPENAPI_TAGS` list (insert before `"meta"`):
+```python
+    {
+        "name": "score",
+        "description": (
+            "**实时评分 API（Dify 外部 Tool）**\n\n"
+            "接受单条问答记录 `(question, answer, contexts, ground_truth)`，\n"
+            "同步运行 RAGAS 指标打分，返回各指标得分和加权综合得分。\n\n"
+            "适用场景：Dify Agent 在回答后即时调用，用于质量监控或自我改进。\n\n"
+            "**鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需携带 "
+            "`Authorization: Bearer <token>` 请求头。"
+        ),
+    },
+```
+
+Also update the `description` field in `FastAPI(...)` to add a bullet:
+```python
+"- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
+```
+
+- [ ] **Step 5: Run to verify PASS**
+
+```
+python -m pytest tests/webapp/test_score_api.py -v
+```
+Expected: all tests PASS.
+
+- [ ] **Step 6: Verify server boots and route appears**
+
+```
+python -c "
+from webapp.server import create_app
+app = create_app()
+routes = [(r.path, list(getattr(r,'methods',[]))) for r in app.routes]
+score_routes = [(p,m) for p,m in routes if 'score' in p]
+print('Score routes:', score_routes)
+"
+```
+Expected output:
+```
+Score routes: [('/api/score', ['POST'])]
+```
+
+- [ ] **Step 7: Commit**
+
+```
+git add webapp/api/score.py webapp/server.py tests/webapp/test_score_api.py
+git commit -m "feat: add POST /api/score endpoint for Dify real-time scoring"
+```
+
+---
+
+## Task 4: 全量回归 + `.env.example` 更新
+
+**Files:**
+- Modify: `.env.example`
+
+- [ ] **Step 1: Add SCORE_API_TOKEN to `.env.example`**
+
+Add this block after `DATASET_GENERATOR_MODEL=qwen3.6-plus`:
+
+```
+# ===== Dify 集成 — 实时评分 API =====
+# 为 /api/score 端点设置 Bearer Token 鉴权（留空则不鉴权，适合内网部署）
+# Dify 外部 Tool 配置 Authorization: Bearer <此处填写相同值>
+SCORE_API_TOKEN=
+```
+
+- [ ] **Step 2: Run full test suite**
+
+```
+python -m pytest tests/ -v --tb=short
+```
+
+Pre-existing failures to ignore:
+- `test_normalize_sample_pdf_offline_smoke_row` — 缺少 CSV fixture
+- `test_evaluator_and_reporting_write_run_assets` — 预存在的断言不匹配
+- `test_question_generator_rejects_invalid_json` — retry 循环吞掉了 ValueError
+- `test_question_generator_rejects_non_list_samples` — 同上
+
+**零新增失败**即为通过。
+
+- [ ] **Step 3: Final commit**
+
+```
+git add .env.example
+git commit -m "feat: Dify score API complete — add SCORE_API_TOKEN to .env.example
+
+- POST /api/score: real-time RAGAS scoring for Dify external Tool
+- ScoreRequest/ScoreResponse Pydantic models with full field docs
+- InlineScorer with (judge_model, embedding_model) client cache
+- Bearer token auth via SCORE_API_TOKEN env var (optional)
+- contexts split by configurable separator (default ' |||| ')
+- GT-dependent metrics auto-skipped when ground_truth absent
+- Full test coverage (22 new tests)
+
+Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>"
+```
+
+---
+
+## Dify 侧配置参考
+
+任务完成后，在 Dify 「工具」→「自定义工具」中填写如下 OpenAPI Schema：
+
+```yaml
+openapi: 3.1.0
+info:
+  title: RAGAS 实时评分
+  version: 1.0.0
+servers:
+  - url: http://<your-server>:8800
+paths:
+  /api/score:
+    post:
+      operationId: scoreQA
+      summary: 对一条问答记录进行 RAGAS 评分
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required: [question, answer, contexts]
+              properties:
+                question:       { type: string }
+                answer:         { type: string }
+                contexts:       { type: string, description: "多段上下文用 ' |||| ' 拼接" }
+                ground_truth:   { type: string }
+                metrics:
+                  type: array
+                  items: { type: string }
+                  default: [faithfulness, answer_relevancy, context_recall, context_precision]
+      responses:
+        '200':
+          description: 评分结果
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  scores:         { type: object }
+                  weighted_score: { type: number }
+                  latency_ms:     { type: integer }
+                  skipped_metrics: { type: array, items: { type: string } }
+```
--- a/docs/superpowers/specs/2026-06-18-metric-doc-weights-design.md
+++ b/docs/superpowers/specs/2026-06-18-metric-doc-weights-design.md
@@ -0,0 +1,240 @@
+# 指标权重 & 文档片段权重功能设计
+
+**日期**: 2026-06-18  
+**状态**: 已批准，待实现  
+**范围**: 在「新建评估」运行评估时，支持为 RAGAS 指标和文档配置权重，计算加权综合得分并在报告中展示。
+
+---
+
+## 1. 目标
+
+1. **指标权重（Metric Weights）**：允许为每个 RAGAS 指标配置浮点权重（如 faithfulness: 0.35），计算每道题的加权综合得分 `weighted_score`。
+2. **文档权重（Doc Weights）**：允许为特定 PDF 文档名称配置权重（如 `"322_双源CT.pdf": 2.0`），该文档的题目在汇总指标均值时按权重放大贡献。
+3. **前端覆盖**：在「新建评估」页面选中场景后，展示可编辑的权重面板，运行前可临时覆盖 YAML 中的权重。
+4. **完全向后兼容**：两个字段均为可选，省略时退化为等权行为，现有场景 YAML 无需修改。
+
+---
+
+## 2. 数据模型
+
+### 2.1 场景 YAML（新增可选字段）
+
+```yaml
+# 可选。缺省时所有指标权重 = 1.0
+metric_weights:
+  faithfulness: 0.35
+  context_recall: 0.25
+  context_precision: 0.20
+  answer_relevancy: 0.20
+
+# 可选。缺省时所有文档权重 = 1.0
+doc_weights:
+  "322_双源CT成像技术.pdf": 2.0
+  "323_单源CT对比.pdf": 1.5
+```
+
+### 2.2 Pydantic Schema（`rag_eval/config/schema.py`）
+
+`ScenarioModel` 新增：
+```python
+metric_weights: dict[str, float] = Field(default_factory=dict)
+doc_weights:    dict[str, float] = Field(default_factory=dict)
+```
+
+`ConfigDict(extra="ignore")` 不变，新字段不影响既有 YAML 的加载。
+
+### 2.3 内部 Scenario dataclass（`rag_eval/shared/models.py`）
+
+`Scenario` 新增：
+```python
+metric_weights: dict[str, float] = field(default_factory=dict)
+doc_weights:    dict[str, float] = field(default_factory=dict)
+```
+
+随 `scenario.snapshot()` 序列化，供 `run_reader` / 报告层读取。
+
+---
+
+## 3. 后端：权重计算逻辑
+
+### 3.1 新模块 `rag_eval/metrics/weights.py`
+
+纯函数模块，无外部依赖，独立可测：
+
+```python
+def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float:
+    """返回 key 对应的权重，缺失时返回 default。"""
+
+def compute_weighted_score(
+    scores: dict[str, float | None],
+    metric_weights: dict[str, float],
+) -> float | None:
+    """
+    给定各指标得分和权重，返回加权综合得分。
+    - 忽略 NaN / None 值
+    - metric_weights 为空时退化为等权均值
+    - 全部 NaN 时返回 None
+    公式: Σ(w_i * s_i) / Σ(w_i)，只对非 NaN 项求和
+    """
+
+def weighted_metric_means(
+    score_rows: list[dict],
+    metrics: list[str],
+    doc_weights: dict[str, float],
+) -> dict[str, float | None]:
+    """
+    对每个指标计算文档加权均值。
+    - sample_weight = doc_weights.get(row["doc_name"], 1.0)
+    - 公式: Σ(sample_weight_j * score_m_j) / Σ(sample_weight_j)
+    - doc_weights 为空时退化为普通算术均值
+    """
+```
+
+### 3.2 评估器（`rag_eval/execution/evaluator.py`）
+
+`_merge_score()` 新增两列：
+```python
+record["weighted_score"] = compute_weighted_score(
+    score.metrics, self.scenario.metric_weights
+)
+record["sample_weight"] = self.scenario.doc_weights.get(
+    sample.metadata.get("doc_name", ""), 1.0
+)
+```
+
+`scores.csv` 新增 `weighted_score`、`sample_weight` 两列。
+
+### 3.3 报告摘要（`rag_eval/reporting/summary.py`）
+
+`build_summary_markdown()` 改用 `weighted_metric_means()` 计算各指标均值；
+新增 `weighted_score` 整体均值行：
+
+```
+## Metric Means（加权）
+- faithfulness:     0.8123  (w=0.35)
+- context_recall:   0.7654  (w=0.25)
+- context_precision: 0.7200  (w=0.20)
+- answer_relevancy: 0.7400  (w=0.20)
+- **weighted_score: 0.7789**
+```
+
+---
+
+## 4. yaml_patcher 扩展（`webapp/services/yaml_patcher.py`）
+
+`apply_profiles_to_scenario()` 扩展签名，新增可选参数：
+
+```python
+def apply_profiles_to_scenario(
+    scenario_path: str,
+    judge_profile: LLMProfile | None,
+    answer_profile: LLMProfile | None,
+    dataset_profile: LLMProfile | None,
+    metric_weights: dict[str, float] | None = None,   # 新增
+    doc_weights: dict[str, float] | None = None,       # 新增
+    _resolve_absolute: bool = False,
+) -> list[str]:
+```
+
+- `metric_weights` 非 None 时写入 `data["metric_weights"]`，追加 `"metric_weights"` 到 patched 列表
+- `doc_weights` 非 None 时写入 `data["doc_weights"]`，追加 `"doc_weights"` 到 patched 列表
+
+---
+
+## 5. Webapp 模型与 API 扩展
+
+### 5.1 `webapp/models.py`
+
+`ProfileApplyRequest` 新增：
+```python
+metric_weights: dict[str, float] | None = None
+doc_weights:    dict[str, float] | None = None
+```
+
+`ProfileApplyResponse` 不变（`patched_fields` 已包含新字段名）。
+
+### 5.2 `webapp/api/llm_profiles.py` — `apply_profiles()`
+
+透传 `metric_weights` / `doc_weights` 给 `apply_profiles_to_scenario()`。
+
+---
+
+## 6. 前端：权重配置面板
+
+### 6.1 HTML（`index.html`）
+
+在 `#llm-assignment-panel` 下方新增 `#weight-config-panel`（选中场景后显示）：
+
+```
+┌─────────────────────────────────────────────┐
+│ 权重配置  （可选，留空使用场景原始配置）         │
+├─────────────────────────────────────────────┤
+│ 指标权重                                     │
+│  faithfulness        [____1.0____]           │
+│  context_recall      [____1.0____]           │
+│  ...（根据选中场景的 metrics 动态生成）         │
+│                                              │
+│ 文档权重（doc_weights）                       │
+│  [doc名称_______________] [权重__] [＋] [✕]  │
+│  [doc名称_______________] [权重__] [＋] [✕]  │
+│  ＋ 添加文档权重规则                          │
+└─────────────────────────────────────────────┘
+```
+
+### 6.2 `runner.js`
+
+- `renderScenarioItem()` 选中后调用 `Runner._renderWeightPanel(sc)` 动态生成指标行
+- `_applyProfilesIfNeeded()` 同时读取权重输入，追加到 `apply` 请求 body
+- `Runner._collectWeights()` 收集 metric_weights / doc_weights，全部为 1.0 时不发送（跳过）
+
+### 6.3 CSS（`app.css`）
+
+新增 `.weight-config-panel`、`.weight-row`、`.weight-input` 样式，与现有 `.llm-role-row` 风格一致。
+
+---
+
+## 7. 报告展示（`webapp/services/report_builder.py`）
+
+- `RunSummary.metric_means` 改用 `weighted_metric_means()` 计算（需从 `scenario.snapshot.yaml` 读取 `doc_weights` / `metric_weights`）
+- `RunSummary` 新增 `weighted_score_mean: float | None` 字段
+- 前端 `report.js` 的指标卡片区新增「综合加权得分」卡片，使用 `good/warn/bad` 配色
+
+---
+
+## 8. 测试计划
+
+| 测试文件 | 覆盖内容 |
+|----------|---------|
+| `tests/test_weights.py` | `compute_weighted_score` / `weighted_metric_means` 纯函数，含 NaN 边界、空权重、全 NaN |
+| `tests/test_dataset_build.py` | 无改动（隔离良好） |
+| `tests/test_offline_eval.py` | `_merge_score` 新增 weighted_score / sample_weight 列断言 |
+| `tests/webapp/test_llm_profiles_api.py` | `apply_profiles` 带 metric_weights / doc_weights 的 patching 测试 |
+
+---
+
+## 9. 改动文件清单
+
+| 文件 | 改动类型 |
+|------|---------|
+| `rag_eval/config/schema.py` | 新增字段 |
+| `rag_eval/shared/models.py` | 新增字段 |
+| `rag_eval/config/loader.py` | 透传新字段到 Scenario |
+| `rag_eval/metrics/weights.py` | **新建** |
+| `rag_eval/execution/evaluator.py` | `_merge_score` 新增两列 |
+| `rag_eval/reporting/summary.py` | 改用加权均值 |
+| `webapp/services/yaml_patcher.py` | 新增 metric_weights / doc_weights 参数 |
+| `webapp/models.py` | ProfileApplyRequest 新增字段；RunSummary 新增 weighted_score_mean |
+| `webapp/api/llm_profiles.py` | 透传新参数 |
+| `webapp/services/report_builder.py` | 加权均值计算 |
+| `webapp/static/index.html` | 新增权重配置面板 |
+| `webapp/static/js/runner.js` | 权重面板逻辑 |
+| `webapp/static/css/app.css` | 新增权重面板样式 |
+| `tests/test_weights.py` | **新建** |
+
+---
+
+## 10. 向后兼容保证
+
+- `metric_weights: {}` + `doc_weights: {}` → 所有权重 = 1.0，行为与当前完全一致
+- 现有场景 YAML 不含这两个字段 → Pydantic `default_factory=dict` 填充空字典
+- `scores.csv` 新增两列不影响现有报告读取（`run_reader` 只读已知列）
--- a/docs/superpowers/specs/2026-06-22-dify-score-api-design.md
+++ b/docs/superpowers/specs/2026-06-22-dify-score-api-design.md
@@ -0,0 +1,138 @@
+# Dify 集成 — 单题实时评分 API 设计
+
+**日期**: 2026-06-22  
+**状态**: 已批准，待实现  
+**范围**: 在现有 FastAPI 服务中新增 `POST /api/score` 端点，供 Dify 外部 Tool 调用，实现单条问答记录的实时 RAGAS 指标评分。
+
+---
+
+## 1. 目标
+
+让 Dify Agent 能在回答完问题后，将 `(question, answer, contexts, ground_truth)` 发给 siemens_ragas 服务，实时获取各 RAGAS 指标得分，用于质量监控或 Agent 自我改进。
+
+---
+
+## 2. API 规范
+
+### `POST /api/score`
+
+**请求体：**
+
+```json
+{
+  "question":          "双源CT的时间分辨率是多少?",
+  "answer":            "双源CT的单扇区时间分辨率为75ms。",
+  "contexts":          "片段1：双源CT采用两套管-探测器系统... |||| 片段2：单扇区采集旋转135度...",
+  "ground_truth":      "双源CT单扇区时间分辨率为75ms，需旋转135度。",
+  "context_separator": " |||| ",
+  "metrics":           ["faithfulness", "answer_relevancy"],
+  "judge_model":       "deepseek-v4-flash",
+  "embedding_model":   "text-embedding-v3"
+}
+```
+
+**字段说明：**
+
+| 字段 | 类型 | 必填 | 说明 |
+|------|------|------|------|
+| `question` | str | ✅ | 问题文本 |
+| `answer` | str | ✅ | 待评分的回答 |
+| `contexts` | str | ✅ | 检索到的上下文，多段用 `context_separator` 拼接 |
+| `ground_truth` | str | ❌ | 标准答案；缺失时跳过依赖它的指标（context_recall、factual_correctness、semantic_similarity、noise_sensitivity） |
+| `context_separator` | str | ❌ | 默认 `" \|\|\|\| "`（四个竖线，两侧各一空格） |
+| `metrics` | list[str] | ❌ | 默认 `["faithfulness", "answer_relevancy", "context_recall", "context_precision"]` |
+| `judge_model` | str | ❌ | 默认读 `.env` 中 `RAGAS_JUDGE_MODEL` |
+| `embedding_model` | str | ❌ | 默认读 `.env` 中 `RAGAS_EMBEDDING_MODEL` |
+
+**响应体（200 OK）：**
+
+```json
+{
+  "scores": {
+    "faithfulness":     0.8750,
+    "answer_relevancy": 0.9200
+  },
+  "weighted_score": 0.8975,
+  "latency_ms": 3420
+}
+```
+
+**错误响应：**
+
+| 状态码 | 场景 |
+|--------|------|
+| 401 | 配置了 `SCORE_API_TOKEN` 但请求未携带有效 Bearer Token |
+| 422 | 必填字段缺失、metrics 名称不合法，或请求体 JSON 格式错误（FastAPI / Pydantic 校验失败统一返回 422，而非 400） |
+| 200（`error` 非空） | RAGAS 内部评分异常：端点捕获异常后仍返回 200，`scores` 为空，异常信息写入 `error` 字段（不返回 500） |
+
+**鉴权（可选）：**  
+若 `.env` 中 `SCORE_API_TOKEN` 非空，则要求请求头携带 `Authorization: Bearer <token>`。为空则不鉴权（内网部署场景）。
+
+---
+
+## 3. 架构与文件改动
+
+### 新文件
+
+| 文件 | 职责 |
+|------|------|
+| `webapp/api/score.py` | 路由定义，请求验证，调用 InlineScorer |
+| `webapp/services/inline_scorer.py` | LLM 客户端缓存 + RAGAS 评分逻辑封装 |
+
+### 修改文件
+
+| 文件 | 改动 |
+|------|------|
+| `webapp/models.py` | 新增 `ScoreRequest`、`ScoreResponse` |
+| `webapp/server.py` | 注册 `score.router`，更新 `openapi_tags` |
+| `rag_eval/settings.py` | 新增 `score_api_token: str | None` 字段 |
+
+---
+
+## 4. `inline_scorer.py` 设计
+
+```python
+class InlineScorer:
+    """同步执行 RAGAS 单题评分，内部缓存 LLM 客户端。"""
+
+    def score(
+        self,
+        question: str,
+        answer: str,
+        contexts: list[str],
+        ground_truth: str | None,
+        metrics: list[str],
+        judge_model: str,
+        embedding_model: str,
+        settings: EvaluationSettings,
+    ) -> dict[str, float | None]:
+        """返回 {metric_name: score} 字典，NaN 记为 None。"""
+```
+
+**客户端缓存策略：**  
+以 `(judge_model, embedding_model)` 为 key，缓存 `(llm, embeddings)` 对象，避免每次请求都重建 AsyncOpenAI 连接。缓存为模块级单例（`_scorer_cache: dict`），线程安全（加 `threading.Lock`）。
+
+**评分执行：**  
+复用 `build_metric_pipeline` 构建 `MetricPipeline`，然后 `asyncio.run(pipeline.score_sample(sample))` 执行。与现有 `evaluator.py` 模式一致。
+
+**ground_truth 为空时的指标跳过逻辑：**  
+`context_recall`、`factual_correctness`、`semantic_similarity`、`noise_sensitivity` 需要 ground_truth；若请求中未提供，自动从 metrics 列表中移除这些指标，并在响应中对应字段返回 `null`。
+
+---
+
+## 5. Dify 侧配置方法
+
+1. 在 Dify 「工具」→「自定义工具」中创建新工具
+2. 填写 OpenAPI Schema（与 `/api/score` 端点对齐）
+3. 鉴权方式：API Key（Bearer）或无鉴权
+4. 在 Agent / Workflow 节点中引用该工具，将 `question`、`answer`、`contexts` 变量映射到工具输入
+
+---
+
+## 6. 不在范围内
+
+- 批量评分接口（异步 job）
+- Dify Workflow 节点插件（需要 Dify 插件开发框架）
+- 评分结果持久化到 scores.csv
+- 与现有 report_builder 集成展示
--- a/docs/superpowers/specs/2026-06-22-linux-deploy-design.md
+++ b/docs/superpowers/specs/2026-06-22-linux-deploy-design.md
@@ -0,0 +1,173 @@
+# Linux 一键部署脚本设计
+
+**日期**: 2026-06-22  
+**状态**: 已批准，待实现  
+**范围**: 为 siemens_ragas 项目提供 Linux 环境的部署与运维脚本（无 Docker，无 systemd）。
+
+---
+
+## 1. 目标
+
+提供四个 Bash 脚本，覆盖 Linux 服务器上的完整生命周期：
+
+| 脚本 | 职责 |
+|------|------|
+| `deploy.sh` | 一键完成环境检查、依赖安装、配置初始化、启动服务 |
+| `start.sh` | 仅启动 Web 服务（已部署后复用，不重装依赖） |
+| `stop.sh` | 停止后台 Web 服务 |
+| `run_eval.sh` | 运行单次评估（对应 Windows 的 `run_eval.ps1`） |
+
+---
+
+## 2. 约束与假设
+
+- Linux 目标环境有 PyPI 网络访问（pip 可直接安装）
+- 代码已通过 `git clone` 或文件拷贝到服务器
+- 使用 `pip + venv`（不使用 uv）
+- Web 服务监听 `0.0.0.0:8800`（内网可达）
+- 后台运行使用 `nohup`，PID 写入 `.server.pid`，日志追加到 `logs/server.log`
+- 所有脚本均放在仓库根目录，路径相对于 `$SCRIPT_DIR`
+
+---
+
+## 3. `deploy.sh` 详细设计
+
+### 3.1 阶段 1：Python 版本检查
+
+```
+require Python >= 3.12
+```
+
+- `python3 --version` 解析 major.minor
+- 不满足则打印错误并 `exit 1`
+- 满足则打印 `[OK] Python X.Y.Z`
+
+### 3.2 阶段 2：虚拟环境
+
+- 目标路径：`$SCRIPT_DIR/.venv`
+- 已存在则跳过创建（打印 `[OK] .venv already exists`）
+- 不存在则 `python3 -m venv .venv`
+
+### 3.3 阶段 3：依赖安装
+
+```bash
+.venv/bin/pip install --upgrade pip -q
+.venv/bin/pip install -e . -q          # 安装 pyproject.toml 中的依赖
+.venv/bin/pip install fastapi uvicorn httpx -q  # Web 服务额外依赖
+```
+
+- 失败则打印错误并 `exit 1`
+- `fastapi`、`uvicorn`、`httpx` 在 `pyproject.toml` 中未列，需单独安装
+
+### 3.4 阶段 4：配置文件
+
+- 若 `.env` 不存在：`cp .env.example .env`，打印警告提示用户编辑后再启动
+- 若 `.env` 已存在：跳过，打印 `[OK] .env found`
+
+### 3.5 阶段 5：目录初始化
+
+创建以下目录（`mkdir -p`，幂等）：
+- `configs/` — LLM Profile 持久化存储
+- `logs/` — 评估日志 + 服务器日志
+- `outputs/` — 评估运行产物
+- `datasets/` — 原始数据集
+
+### 3.6 阶段 6：Demo 数据
+
+- 检查 `outputs/kba-knowledge-base-offline-baseline/` 是否存在
+- 不存在则运行 `.venv/bin/python scripts/seed_sample_run.py`
+- 失败时打印 `[WARN]`（非致命，报告页为空但服务可启动）
+
+### 3.7 阶段 7：端口检测
+
+- 默认端口 `8800`
+- 用 `ss -tlnp` 或 `netstat -tlnp` 检查是否占用
+- 占用则尝试 `8801`，仍占用则报错退出
+
+### 3.8 阶段 8：启动服务
+
+```bash
+nohup .venv/bin/python webmain.py \
+    --host 0.0.0.0 \
+    --port $PORT \
+    >> logs/server.log 2>&1 &
+echo $! > .server.pid
+```
+
+- 等待 2 秒后用 `kill -0 $PID` 检测进程是否存活
+- 存活则打印 URL 和 stop 方法
+- 未存活则打印 `[ERROR] Server failed to start. Check logs/server.log.` 并 `exit 1`
+
+---
+
+## 4. `start.sh` 详细设计
+
+单独负责启动，不做任何环境初始化。
+
+```bash
+#!/usr/bin/env bash
+# 检查 .venv 存在
+# 端口检测（同 deploy.sh 逻辑）
+# 检查 .env 存在（不存在则 warn 但不阻止）
+# nohup 启动 + PID 文件 + 存活验证
+# 打印 URL
+```
+
+---
+
+## 5. `stop.sh` 详细设计
+
+```bash
+#!/usr/bin/env bash
+# 读取 .server.pid
+# 若文件不存在：打印 "No server PID file found." 退出
+# kill $PID
+# 等待 2 秒，若进程仍存活用 kill -9
+# 删除 .server.pid
+# 打印 "Server stopped."
+```
+
+---
+
+## 6. `run_eval.sh` 详细设计
+
+对应 Windows 的 `run_eval.ps1`。
+
+```
+用法:
+  ./run_eval.sh                          # online eval (默认)
+  ./run_eval.sh offline                  # offline smoke
+  ./run_eval.sh scenarios/xxx.yaml       # 自定义场景
+  ./run_eval.sh online DEBUG             # 自定义日志级别
+```
+
+- 参数 1（Scenario）：`online` / `offline` / 文件路径，默认 `online`
+- 参数 2（LogLevel）：`DEBUG` / `INFO` / `WARNING` / `ERROR`，默认 `INFO`
+- 场景别名映射：
+  - `online` → `scenarios/online/siemens-pdf-question-bank-online.yaml`
+  - `offline` → `scenarios/offline/siemens-pdf-offline-smoke.yaml`
+- 时间戳日志文件：`logs/eval_$(date +%Y-%m-%d_%H%M%S).log`
+- 环境变量：`PYTHONIOENCODING=utf-8 PYTHONPATH=.`
+- 调用：`.venv/bin/python main.py --scenario $SCENARIO --log-file $LOG_FILE --log-level $LOG_LEVEL`
+- 非零退出码时打印错误并 `exit 1`
+
+---
+
+## 7. 通用约定
+
+- 所有脚本首行：`#!/usr/bin/env bash`
+- `set -euo pipefail` — 错误立即退出，未定义变量报错，管道错误传播
+- `SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"` — 从任意目录执行均正确
+- `cd "$SCRIPT_DIR"` — 切换到仓库根目录
+- 颜色输出：绿色 `[OK]`、黄色 `[WARN]`、红色 `[ERROR]`（检测 tty，非交互式终端降级为无色）
+- 执行权限：脚本自身需要 `chmod +x`（在 deploy.sh 内对其他脚本自动 chmod）
+
+---
+
+## 8. 不在范围内
+
+- Docker / docker-compose 支持
+- systemd service 配置
+- Nginx 反向代理配置
+- SSL/TLS 配置
+- 离线/内网镜像源配置
--- a/rag_eval/config/loader.py
+++ b/rag_eval/config/loader.py
@@ -62,6 +62,8 @@ def load_scenario(path: str | Path) -> Scenario:
        ),
        source_path=scenario_path,
        optimization_advisor=model.optimization_advisor,
+        metric_weights=dict(model.metric_weights),
+        doc_weights=dict(model.doc_weights),
    )
    # Run cross-field checks after all relative paths have been resolved.
    validate_scenario(scenario)
--- a/rag_eval/config/schema.py
+++ b/rag_eval/config/schema.py
@@ -55,6 +55,8 @@ class ScenarioModel(BaseModel):
    output_dir: str
    runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
    optimization_advisor: bool = False
+    metric_weights: dict[str, float] = Field(default_factory=dict)
+    doc_weights: dict[str, float] = Field(default_factory=dict)

    @field_validator("metrics")
    @classmethod
--- a/rag_eval/execution/evaluator.py
+++ b/rag_eval/execution/evaluator.py
@@ -12,6 +12,7 @@ from rag_eval.datasets.loader import load_dataset_records
 from rag_eval.datasets.normalizers import normalize_records
 from rag_eval.execution.concurrency import gather_with_limit
 from rag_eval.metrics.pipeline import MetricPipeline
+from rag_eval.metrics.weights import compute_weighted_score, resolve_weight
 from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
 from rag_eval.shared.utils import utc_now_iso

@@ -171,7 +172,7 @@ class Evaluator:
        return valid, invalid

    def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
-        """Combine sample data, metric results, and run metadata into one output row."""
+        """Combine sample data, metric results, run metadata, and weight columns."""
        record = sample.to_record()
        record["contexts"] = sample.contexts
        record.update(score.metrics)
@@ -179,4 +180,12 @@ class Evaluator:
        record["judge_model"] = self.scenario.judge_model
        record["embedding_model"] = self.scenario.embedding_model
        record["run_id"] = self.scenario.scenario_name
+        # Weighted score columns — enable post-hoc weighted aggregation in reporting.
+        record["weighted_score"] = compute_weighted_score(
+            score.metrics, self.scenario.metric_weights
+        )
+        doc_name = str(sample.metadata.get("doc_name", "") or "")
+        record["sample_weight"] = resolve_weight(
+            self.scenario.doc_weights, doc_name, default=1.0
+        )
        return record
--- a/rag_eval/metrics/weights.py
+++ b/rag_eval/metrics/weights.py
@@ -0,0 +1,152 @@
+"""Utility functions for weighted metric aggregation.
+
+All functions are pure (no side effects, no I/O) and operate on plain dicts/lists.
+Weights do not need to be pre-normalised — normalisation is done internally.
+"""
+
+from __future__ import annotations
+
+import math
+
+
+def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float:
+    """Look up *key* in *weights* as a float, falling back to *default* if missing."""
+    return float(weights[key]) if key in weights else float(default)
+
+
+def compute_weighted_score(
+    scores: dict[str, float | None],
+    metric_weights: dict[str, float],
+) -> float | None:
+    """Average the usable metric scores, weighting each by its metric weight.
+
+    A score is usable when it converts to a finite float; None, NaN, infinite
+    and non-numeric entries are left out of both sums.
+
+    Args:
+        scores: metric_name -> raw score (None / NaN entries are tolerated).
+        metric_weights: per-metric weights; a metric without an entry weighs 1.0.
+
+    Returns:
+        sum(w_i * s_i) / sum(w_i) over the usable scores, or None when the
+        accumulated weight is exactly zero (for instance, no usable score).
+    """
+    weight_sum = 0.0
+    weighted_sum = 0.0
+    for name, raw in scores.items():
+        if raw is None:
+            continue
+        try:
+            number = float(raw)
+        except (TypeError, ValueError):
+            continue
+        if not math.isfinite(number):
+            continue
+        factor = resolve_weight(metric_weights, name, default=1.0)
+        weight_sum += factor
+        weighted_sum += factor * number
+    return None if weight_sum == 0.0 else weighted_sum / weight_sum
+
+
+def weighted_metric_means(
+    score_rows: list[dict],
+    metrics: list[str],
+    doc_weights: dict[str, float],
+) -> dict[str, float | None]:
+    """Aggregate each metric into a document-weighted mean over all rows.
+
+    A row contributes to a metric with the weight configured for its
+    ``doc_name``. A cell that is None, non-numeric, NaN or infinite is left
+    out of that metric's mean only; the row still counts for other metrics.
+
+    Args:
+        score_rows: score record dicts (from scores.csv).
+        metrics: names of the metrics to aggregate, in output order.
+        doc_weights: doc_name -> weight multiplier; missing names weigh 1.0.
+
+    Returns:
+        metric_name -> weighted mean, or None for a metric whose accumulated
+        weight is not positive (e.g. it had no usable value).
+    """
+    numerators = dict.fromkeys(metrics, 0.0)
+    denominators = dict.fromkeys(metrics, 0.0)
+
+    for record in score_rows:
+        # A missing or falsy doc_name is looked up as the empty string.
+        row_doc = str(record.get("doc_name", "") or "")
+        row_weight = resolve_weight(doc_weights, row_doc, default=1.0)
+        for name in metrics:
+            cell = record.get(name)
+            if cell is None:
+                continue
+            try:
+                number = float(cell)
+            except (TypeError, ValueError):
+                continue
+            if math.isfinite(number):
+                numerators[name] += row_weight * number
+                denominators[name] += row_weight
+
+    means: dict[str, float | None] = {}
+    for name in metrics:
+        divisor = denominators[name]
+        means[name] = numerators[name] / divisor if divisor > 0 else None
+    return means
+
+
+def compute_overall_weighted_score_mean(
+    score_rows: list[dict],
+    metric_weights: dict[str, float],
+    doc_weights: dict[str, float],
+) -> float | None:
+    """Return the document-weighted mean of the per-sample weighted scores.
+
+    For every row, the columns not listed in ``_META_COLUMNS`` are passed to
+    ``compute_weighted_score`` together with *metric_weights*. Rows for which
+    that yields None are ignored; the remaining per-sample scores are averaged
+    using the doc weight of each row's ``doc_name`` (1.0 when not configured).
+
+    Returns None when no row contributes a positive total weight.
+    """
+    # NOTE(review): any column outside _META_COLUMNS is treated as a metric, so
+    # an extra numeric column in scores.csv would be averaged in — confirm that
+    # _META_COLUMNS stays in sync with the columns the evaluator writes.
+    weight_sum = 0.0
+    score_sum = 0.0
+    for record in score_rows:
+        candidate_scores: dict[str, float | None] = {
+            column: cell
+            for column, cell in record.items()
+            if column not in _META_COLUMNS
+        }
+        row_score = compute_weighted_score(candidate_scores, metric_weights)
+        if row_score is None:
+            continue
+        row_doc = str(record.get("doc_name", "") or "")
+        row_weight = resolve_weight(doc_weights, row_doc, default=1.0)
+        weight_sum += row_weight
+        score_sum += row_weight * row_score
+
+    if weight_sum > 0:
+        return score_sum / weight_sum
+    return None
+
+
+# Columns in scores.csv that are sample metadata, not metric scores.
+# compute_overall_weighted_score_mean skips these keys when collecting a row's
+# metric values. The derived columns "weighted_score" and "sample_weight" are
+# listed as well, so they are not fed back in as if they were metrics.
+# The constant is defined after the function that reads it; that works because
+# the name is only looked up when the function is called.
+_META_COLUMNS = frozenset(
+    {
+        "sample_id",
+        "question",
+        "contexts",
+        "answer",
+        "ground_truth",
+        "scenario",
+        "language",
+        "retrieval_config",
+        "error",
+        "judge_model",
+        "embedding_model",
+        "run_id",
+        "difficulty",
+        "question_type",
+        "doc_id",
+        "doc_name",
+        "section_path",
+        "page_start",
+        "page_end",
+        "source_chunk_ids",
+        "review_status",
+        "review_notes",
+        "weighted_score",
+        "sample_weight",
+    }
+)
--- a/rag_eval/reporting/summary.py
+++ b/rag_eval/reporting/summary.py
@@ -6,6 +6,10 @@ import math

 import pandas as pd

+from rag_eval.metrics.weights import (
+    compute_overall_weighted_score_mean,
+    weighted_metric_means,
+)
 from rag_eval.shared.models import EvaluationResult


@@ -55,24 +59,41 @@ def build_summary_markdown(result: EvaluationResult) -> str:
        lines.append("No valid samples were scored.")
        return "\n".join(lines) + "\n"

-    for metric in result.scenario.metrics:
-        mean_value = scores[metric].mean(numeric_only=True)
-        if isinstance(mean_value, float) and not math.isnan(mean_value):
-            lines.append(f"- {metric}: `{mean_value:.4f}`")
-        else:
-            lines.append(f"- {metric}: `n/a`")
-
-    # Keep the summary self-sufficient by including every scored sample and its errors.
-    detail_columns = ["sample_id", *result.scenario.metrics, "error"]
-    detail = scores[detail_columns]
-    lines.extend(
-        [
-            "",
-            "## Per-sample Scores",
-            "",
-            "```text",
-            _table_from_frame(detail),
-            "```",
-        ]
+    score_rows_list = scores.to_dict(orient="records")
+    w_means = weighted_metric_means(
+        score_rows_list, result.scenario.metrics, result.scenario.doc_weights
    )
+
+    has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights)
+
+    for metric in result.scenario.metrics:
+        mean_value = w_means.get(metric)
+        w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0
+        weight_note = f"  (w={w:.2f})" if result.scenario.metric_weights else ""
+        if mean_value is not None and not math.isnan(mean_value):
+            lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}")
+        else:
+            lines.append(f"- {metric}: `n/a`{weight_note}")
+
+    if has_weights:
+        overall_ws = compute_overall_weighted_score_mean(
+            score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights
+        )
+        weight_suffix = " (加权)"
+        if overall_ws is not None and not math.isnan(overall_ws):
+            lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**")
+        else:
+            lines.append(f"- **weighted_score{weight_suffix}: `n/a`**")
+
+    detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"]
+    existing_columns = [c for c in detail_columns if c in scores.columns]
+    detail = scores[existing_columns]
+    lines.extend([
+        "",
+        "## Per-sample Scores",
+        "",
+        "```text",
+        _table_from_frame(detail),
+        "```",
+    ])
    return "\n".join(lines) + "\n"
--- a/rag_eval/settings.py
+++ b/rag_eval/settings.py
@@ -52,6 +52,11 @@ class EvaluationSettings(BaseSettings):
    )
    parser_failure_mode: str = Field(default="fail", alias="PARSER_FAILURE_MODE")
    dataset_generator_model: str | None = Field(default=None, alias="DATASET_GENERATOR_MODEL")
+    score_api_token: str | None = Field(
+        default=None,
+        alias="SCORE_API_TOKEN",
+        description="Bearer token for /api/score endpoint. Empty = no auth.",
+    )

    @property
    def openai_client_kwargs(self) -> dict[str, str | float]:
--- a/rag_eval/shared/models.py
+++ b/rag_eval/shared/models.py
@@ -77,6 +77,8 @@ class Scenario:
    app_adapter: AppAdapterConfig | None = None
    source_path: Path | None = None
    optimization_advisor: bool = False
+    metric_weights: dict[str, float] = field(default_factory=dict)
+    doc_weights: dict[str, float] = field(default_factory=dict)

    def snapshot(self) -> dict[str, Any]:
        """Serialize the scenario into a reporting-friendly dictionary snapshot."""
--- a/run_eval.sh
+++ b/run_eval.sh
@@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+# run_eval.sh — Siemens RAGAS 评估运行脚本（Linux）
+# 对应 Windows 的 run_eval.ps1
+#
+# 用法:
+#   bash run_eval.sh                          # online 评估（默认）
+#   bash run_eval.sh offline                  # offline 冒烟测试
+#   bash run_eval.sh scenarios/xxx.yaml       # 自定义场景
+#   bash run_eval.sh online DEBUG             # 指定日志级别
+#   bash run_eval.sh build scenarios/siemens_build/siemens-pdf-build.yaml
+#                                             # 题库生成
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+# ── Colored output (ANSI codes only when stdout is a terminal) ────
+if [ -t 1 ]; then
+    GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
+else
+    GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''  # not a terminal: no escape codes
+fi
+
+ok()   { echo -e "${GREEN}[OK]${NC}    $*"; }  # success line, stdout
+warn() { echo -e "${YELLOW}[WARN]${NC}  $*"; }  # warning line, stdout
+err()  { echo -e "${RED}[ERROR]${NC} $*" >&2; }  # error line, stderr
+info() { echo -e "${CYAN}[INFO]${NC}  $*"; }  # informational line, stdout
+
+# ── Argument parsing ──────────────────────────────────────────────
+SCENARIO="${1:-online}"
+LOG_LEVEL="${2:-INFO}"
+
+# Short aliases for the bundled scenario files.
+declare -A SCENARIO_MAP=(
+    ["online"]="scenarios/online/siemens-pdf-question-bank-online.yaml"
+    ["offline"]="scenarios/offline/siemens-pdf-offline-smoke.yaml"
+)
+
+# "build" selects dataset-build mode: $2 is the build config, $3 the log level.
+BUILD_MODE=false
+BUILD_CONFIG=""
+if [ "$SCENARIO" = "build" ]; then
+    BUILD_MODE=true
+    BUILD_CONFIG="${2:-scenarios/siemens_build/siemens-pdf-build.yaml}"
+    LOG_LEVEL="${3:-INFO}"
+elif [ -n "${SCENARIO_MAP[$SCENARIO]:-}" ]; then  # bash 4.0+ safe; `-v arr[key]` needs 4.3
+    SCENARIO="${SCENARIO_MAP[$SCENARIO]}"
+fi
+
+# ── 验证 ──────────────────────────────────────────────────────────
+echo ""
+echo -e "${CYAN}============================================================${NC}"
+echo -e "${CYAN}  Siemens RAGAS  —  评估运行${NC}"
+echo -e "${CYAN}============================================================${NC}"
+echo ""
+
+# 检查虚拟环境
+if [ ! -f ".venv/bin/python" ]; then
+    err "未找到 .venv，请先执行部署：bash deploy.sh"
+    exit 1
+fi
+
+PYTHON=".venv/bin/python"
+
+# Build 模式校验
+if [ "$BUILD_MODE" = true ]; then
+    if [ ! -f "$BUILD_CONFIG" ]; then
+        err "题库生成配置文件不存在：$BUILD_CONFIG"
+        echo ""
+        echo "可用配置："
+        find scenarios/ -name "*.yaml" 2>/dev/null | head -20 | sed 's/^/  /'
+        exit 1
+    fi
+    ok "模式      : 题库生成 (dataset build)"
+    ok "配置文件  : $BUILD_CONFIG"
+else
+    # 场景文件校验
+    if [ ! -f "$SCENARIO" ]; then
+        err "场景文件不存在：$SCENARIO"
+        echo ""
+        echo "用法示例："
+        echo "  bash run_eval.sh                          # online 评估"
+        echo "  bash run_eval.sh offline                  # offline 冒烟"
+        echo "  bash run_eval.sh scenarios/xxx.yaml       # 自定义场景"
+        echo "  bash run_eval.sh build [config.yaml]      # 题库生成"
+        exit 1
+    fi
+    ok "场景文件  : $SCENARIO"
+fi
+
+# Validate the log level: upper-case it and fall back to INFO when unrecognized.
+LOG_LEVEL_UPPER="${LOG_LEVEL^^}"  # ^^ upper-cases the value (bash 4+)
+case "$LOG_LEVEL_UPPER" in
+    DEBUG|INFO|WARNING|ERROR) ;;
+    *)
+        warn "未知日志级别 '$LOG_LEVEL'，使用默认值 INFO"
+        LOG_LEVEL_UPPER="INFO"
+        ;;
+esac
+ok "日志级别  : $LOG_LEVEL_UPPER"
+
+# Create the log directory and derive a timestamped log file name.
+mkdir -p logs
+TIMESTAMP=$(date +%Y-%m-%d_%H%M%S)
+LOG_FILE="logs/eval_${TIMESTAMP}.log"
+ok "日志文件  : $LOG_FILE"  # NOTE(review): also printed in build mode, where LOG_FILE is not passed to main.py
+
+echo ""
+echo -e "${CYAN}============================================================${NC}"
+echo -e "${CYAN}  开始运行，按 Ctrl+C 中止${NC}"
+echo -e "${CYAN}============================================================${NC}"
+echo ""
+
+# ── Run ───────────────────────────────────────────────────────────
+export PYTHONIOENCODING="utf-8"
+export PYTHONPATH="."
+
+# Under `set -e` a bare failing command would abort the script right here, so
+# the status is captured with `|| EXIT_CODE=$?` to keep the failure report reachable.
+EXIT_CODE=0
+if [ "$BUILD_MODE" = true ]; then
+    "$PYTHON" main.py --dataset-build-config "$BUILD_CONFIG" || EXIT_CODE=$?
+else
+    "$PYTHON" main.py \
+        --scenario "$SCENARIO" \
+        --log-file "$LOG_FILE" \
+        --log-level "$LOG_LEVEL_UPPER" || EXIT_CODE=$?
+fi
+
+echo ""
+if [ $EXIT_CODE -eq 0 ]; then
+    echo -e "${GREEN}============================================================${NC}"
+    echo -e "${GREEN}  运行完成！${NC}"
+    if [ "$BUILD_MODE" = false ]; then
+        echo -e "${GREEN}  日志已保存到：$LOG_FILE${NC}"
+    fi
+    echo -e "${CYAN}  在 Web 控制台查看报告：bash start.sh${NC}"
+    echo -e "${GREEN}============================================================${NC}"
+else
+    err "运行失败（exit code=$EXIT_CODE）"
+    if [ "$BUILD_MODE" = false ]; then
+        err "查看日志：cat $LOG_FILE"
+    fi
+    exit $EXIT_CODE
+fi
+echo ""
--- a/start.sh
+++ b/start.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+# start.sh — 启动 Siemens RAGAS Web 服务（后台运行）
+# 前提：已执行过 deploy.sh（.venv 和依赖均已就绪）
+# 用法：bash start.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+# ── Colored output (ANSI codes only when stdout is a terminal) ────
+if [ -t 1 ]; then
+    GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
+else
+    GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''  # not a terminal: no escape codes
+fi
+
+ok()   { echo -e "${GREEN}[OK]${NC}    $*"; }  # success line, stdout
+warn() { echo -e "${YELLOW}[WARN]${NC}  $*"; }  # warning line, stdout
+err()  { echo -e "${RED}[ERROR]${NC} $*" >&2; }  # error line, stderr
+
+echo ""
+echo -e "${CYAN}============================================================${NC}"
+echo -e "${CYAN}  Siemens RAGAS Console  —  启动服务${NC}"
+echo -e "${CYAN}============================================================${NC}"
+echo ""
+
+# 检查虚拟环境
+if [ ! -f ".venv/bin/python" ]; then
+    err "未找到 .venv，请先执行部署：bash deploy.sh"
+    exit 1
+fi
+
+PYTHON=".venv/bin/python"
+
+# 检查 .env
+if [ ! -f ".env" ]; then
+    warn ".env 不存在，请先复制并编辑配置："
+    warn "  cp .env.example .env && nano .env"
+fi
+
+if grep -q "your-api-key" .env 2>/dev/null; then
+    warn ".env 中仍包含默认占位符，部分功能（评估执行）将不可用"
+fi
+
+# Refuse to start a second instance when the recorded PID is still alive.
+if [ -f ".server.pid" ]; then
+    EXISTING_PID=$(cat .server.pid)
+    if kill -0 "$EXISTING_PID" 2>/dev/null; then  # signal 0: existence check only
+        warn "服务已在运行 (PID=$EXISTING_PID)，无需重复启动"
+        warn "如需重启请先执行：bash stop.sh"
+        exit 0
+    else
+        # Stale PID file left behind by a dead process: clean it up.
+        rm -f .server.pid
+    fi
+fi
+
+# 创建必要目录
+mkdir -p logs
+
+# 端口检测
+PORT=8800
+if ss -tlnp 2>/dev/null | grep -q ":$PORT " || netstat -tlnp 2>/dev/null | grep -q ":$PORT "; then
+    warn "端口 $PORT 已被占用，尝试 8801..."
+    PORT=8801
+    if ss -tlnp 2>/dev/null | grep -q ":$PORT " || netstat -tlnp 2>/dev/null | grep -q ":$PORT "; then
+        err "端口 8800 和 8801 均被占用，请手动指定端口："
+        err "  .venv/bin/python webmain.py --host 0.0.0.0 --port <PORT>"
+        exit 1
+    fi
+fi
+
+# Launch in the background; stdout and stderr are appended to logs/server.log.
+nohup "$PYTHON" webmain.py --host 0.0.0.0 --port "$PORT" >> logs/server.log 2>&1 &
+SERVER_PID=$!
+echo "$SERVER_PID" > .server.pid
+
+# Wait 3 seconds, then verify the process is still alive.
+sleep 3
+if kill -0 "$SERVER_PID" 2>/dev/null; then
+    ok "服务已启动 (PID=$SERVER_PID)"
+    echo ""
+    echo -e "${CYAN}  访问地址: http://$(hostname -I | awk '{print $1}'):${PORT}${NC}"
+    echo -e "${CYAN}  本机访问: http://127.0.0.1:${PORT}${NC}"
+    echo -e "${CYAN}  查看日志: tail -f logs/server.log${NC}"
+    echo -e "${CYAN}  停止服务: bash stop.sh${NC}"
+    echo ""
+else
+    err "服务启动失败，请查看日志："
+    err "  tail -20 logs/server.log"
+    rm -f .server.pid  # the process died, so the PID file would be stale
+    exit 1
+fi
--- a/stop.sh
+++ b/stop.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# stop.sh — 停止 Siemens RAGAS 后台 Web 服务
+# 用法：bash stop.sh
+
+set -uo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+# ── Colored output (ANSI codes only when stdout is a terminal) ────
+if [ -t 1 ]; then
+    GREEN='\033[0;32m'; YELLOW='\033[1;33m'; RED='\033[0;31m'; CYAN='\033[0;36m'; NC='\033[0m'
+else
+    GREEN=''; YELLOW=''; RED=''; CYAN=''; NC=''  # not a terminal: no escape codes
+fi
+
+ok()   { echo -e "${GREEN}[OK]${NC}    $*"; }  # success line, stdout
+warn() { echo -e "${YELLOW}[WARN]${NC}  $*"; }  # warning line, stdout
+err()  { echo -e "${RED}[ERROR]${NC} $*" >&2; }  # error line, stderr
+
+echo ""
+echo -e "${CYAN}  Siemens RAGAS Console  —  停止服务${NC}"
+echo ""
+
+PID_FILE="$SCRIPT_DIR/.server.pid"
+
+if [ ! -f "$PID_FILE" ]; then
+    warn "未找到 .server.pid，服务可能未启动或已停止"
+    exit 0
+fi
+
+PID=$(cat "$PID_FILE")
+
+if ! kill -0 "$PID" 2>/dev/null; then
+    warn "进程 $PID 已不存在，清理 PID 文件"
+    rm -f "$PID_FILE"
+    exit 0
+fi
+
+# Graceful stop first: plain `kill` sends SIGTERM.
+echo -e "  正在停止进程 (PID=$PID)..."
+kill "$PID" 2>/dev/null || true
+
+# Poll once per second, for at most 5 seconds, until the process is gone.
+for i in 1 2 3 4 5; do
+    sleep 1
+    if ! kill -0 "$PID" 2>/dev/null; then
+        break
+    fi
+    echo -e "  等待进程退出... ($i/5)"
+done
+
+# Still alive after the grace period: force termination with SIGKILL.
+if kill -0 "$PID" 2>/dev/null; then
+    warn "进程未响应，强制终止 (SIGKILL)..."
+    kill -9 "$PID" 2>/dev/null || true
+    sleep 1
+fi
+
+rm -f "$PID_FILE"
+
+if kill -0 "$PID" 2>/dev/null; then
+    err "无法停止进程 $PID，请手动执行：kill -9 $PID"
+    exit 1
+else
+    ok "服务已停止"
+    echo ""
+fi
--- a/tests/test_offline_eval.py
+++ b/tests/test_offline_eval.py
@@ -80,6 +80,64 @@ class ScenarioAndDatasetTests(unittest.TestCase):
        self.assertTrue(scenario.dataset.path.name.endswith(".csv"))
        self.assertTrue(scenario.output_dir.name == "sample-offline-baseline")

+    def test_load_scenario_metric_and_doc_weights(self) -> None:
+        """load_scenario passes metric_weights and doc_weights into Scenario."""
+        import os
+        import tempfile
+
+        import yaml
+
+        from rag_eval.config.loader import load_scenario
+
+        payload = {
+            "scenario_name": "w-test",
+            "mode": "offline",
+            "dataset": "nonexistent.csv",
+            "judge_model": "m",
+            "embedding_model": "e",
+            "metrics": ["faithfulness"],
+            "output_dir": "out",
+            "metric_weights": {"faithfulness": 0.7},
+            "doc_weights": {"doc.pdf": 2.0},
+        }
+        with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", encoding="utf-8", delete=False) as f:
+            yaml.dump(payload, f, allow_unicode=True)
+            tmp_path = f.name
+        try:
+            scenario = load_scenario(tmp_path)
+            assert scenario.metric_weights == {"faithfulness": 0.7}
+            assert scenario.doc_weights == {"doc.pdf": 2.0}
+        finally:
+            os.unlink(tmp_path)
+
+    def test_load_scenario_defaults_to_empty_weights(self) -> None:
+        """load_scenario defaults metric_weights and doc_weights to empty dicts."""
+        import os
+        import tempfile
+
+        import yaml
+
+        from rag_eval.config.loader import load_scenario
+
+        payload = {
+            "scenario_name": "no-w",
+            "mode": "offline",
+            "dataset": "nonexistent.csv",
+            "judge_model": "m",
+            "embedding_model": "e",
+            "metrics": ["faithfulness"],
+            "output_dir": "out",
+        }
+        with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", encoding="utf-8", delete=False) as f:
+            yaml.dump(payload, f, allow_unicode=True)
+            tmp_path = f.name
+        try:
+            scenario = load_scenario(tmp_path)
+            assert scenario.metric_weights == {}
+            assert scenario.doc_weights == {}
+        finally:
+            os.unlink(tmp_path)
+
    def test_scenario_snapshot_serializes_path_static_kwargs(self) -> None:
        scenario = load_scenario("scenarios/online/sample-pdf-question-bank-online.yaml")
        snapshot = scenario.snapshot()
@@ -125,6 +183,117 @@ class ScenarioAndDatasetTests(unittest.TestCase):


 class EvaluatorAndReportingTests(unittest.TestCase):
+    def test_merge_score_includes_weighted_score_and_sample_weight(self):
+        """_merge_score adds weighted_score and sample_weight columns."""
+        from unittest.mock import MagicMock
+        from rag_eval.execution.evaluator import Evaluator
+        from rag_eval.shared.models import (
+            MetricScore, NormalizedSample, RuntimeConfig, Scenario, DatasetConfig,
+        )
+
+        scenario = Scenario(
+            scenario_name="w-test", mode="offline",
+            dataset=DatasetConfig(path=Path("d.csv")),
+            judge_model="m", embedding_model="e",
+            metrics=["faithfulness", "context_recall"],
+            output_dir=Path("out"),
+            metric_weights={"faithfulness": 3.0, "context_recall": 1.0},
+            doc_weights={"doc.pdf": 2.0},
+        )
+        evaluator = Evaluator(
+            scenario=scenario,
+            metric_pipeline=MagicMock(),
+            app_adapter=None,
+        )
+        sample = NormalizedSample(
+            sample_id="s1", question="q", contexts=["ctx"],
+            answer="a", ground_truth="gt",
+            metadata={"doc_name": "doc.pdf"},
+        )
+        score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0})
+        row = evaluator._merge_score(sample, score)
+        # (3*1.0 + 1*0.0) / (3+1) = 0.75
+        assert abs(row["weighted_score"] - 0.75) < 1e-4
+        assert row["sample_weight"] == 2.0
+
+    def test_summary_markdown_shows_weighted_score(self):
+        """build_summary_markdown includes weighted_score when metric_weights set."""
+        import math
+        from rag_eval.reporting.summary import build_summary_markdown
+        from rag_eval.shared.models import (
+            EvaluationResult, NormalizedSample, DatasetConfig, Scenario,
+        )
+        from pathlib import Path
+        scenario = Scenario(
+            scenario_name="ws-test", mode="offline",
+            dataset=DatasetConfig(path=Path("d.csv")),
+            judge_model="m", embedding_model="e",
+            metrics=["faithfulness"],
+            output_dir=Path("out"),
+            metric_weights={"faithfulness": 1.0},
+            doc_weights={},
+        )
+        sample = NormalizedSample(
+            sample_id="s1", question="q", contexts=["c"],
+            answer="a", ground_truth="gt",
+        )
+        result = EvaluationResult(
+            scenario=scenario, run_id="r1",
+            started_at="2026-01-01T00:00:00", finished_at="2026-01-01T00:01:00",
+            valid_samples=[sample], invalid_samples=[],
+            score_rows=[{
+                "sample_id": "s1", "faithfulness": 0.8,
+                "weighted_score": 0.8, "sample_weight": 1.0,
+                "doc_name": "", "error": "",
+            }],
+        )
+        md = build_summary_markdown(result)
+        assert "weighted_score" in md
+        assert "0.8000" in md
+
+    def test_summary_markdown_hides_weighted_score_without_weights(self):
+        """build_summary_markdown preserves unweighted summaries when no weights set."""
+        from rag_eval.shared.models import DatasetConfig, EvaluationResult, NormalizedSample, Scenario
+
+        scenario = Scenario(
+            scenario_name="plain-test",
+            mode="offline",
+            dataset=DatasetConfig(path=Path("d.csv")),
+            judge_model="m",
+            embedding_model="e",
+            metrics=["faithfulness"],
+            output_dir=Path("out"),
+            metric_weights={},
+            doc_weights={},
+        )
+        sample = NormalizedSample(
+            sample_id="s1",
+            question="q",
+            contexts=["c"],
+            answer="a",
+            ground_truth="gt",
+        )
+        result = EvaluationResult(
+            scenario=scenario,
+            run_id="r1",
+            started_at="2026-01-01T00:00:00",
+            finished_at="2026-01-01T00:01:00",
+            valid_samples=[sample],
+            invalid_samples=[],
+            score_rows=[{
+                "sample_id": "s1",
+                "faithfulness": 0.8,
+                "weighted_score": 0.8,
+                "sample_weight": 1.0,
+                "doc_name": "",
+                "error": "",
+            }],
+        )
+
+        md = build_summary_markdown(result)
+
+        assert "- **weighted_score" not in md
+
    def test_metric_pipeline_scores_sample(self) -> None:
        pipeline = MetricPipeline(
            metrics={
--- a/tests/test_webapp_report_builder.py
+++ b/tests/test_webapp_report_builder.py
@@ -0,0 +1,89 @@
+"""Regression tests for weighted webapp report aggregation."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from webapp.services.report_builder import build_report
+from webapp.services.run_reader import _infer_metrics_from_scores, _read_weights_from_snapshot
+
+
+def _write_run_artifacts(run_dir: Path) -> None:
+    """Create a minimal run directory with weighted scores and a snapshot."""
+    run_dir.mkdir(parents=True, exist_ok=True)
+    (run_dir / "scores.csv").write_text(
+        "\n".join(
+            [
+                "sample_id,doc_name,faithfulness,context_recall,weighted_score,sample_weight",
+                "s1,a.pdf,1.0,0.5,0.8333,3.0",
+                "s2,b.pdf,0.0,0.5,0.1667,1.0",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    (run_dir / "summary.md").write_text("summary", encoding="utf-8")
+    (run_dir / "optimization_advice.md").write_text("advice", encoding="utf-8")
+    (run_dir / "scenario.snapshot.yaml").write_text(
+        "\n".join(
+            [
+                "metrics:",
+                "  - faithfulness",
+                "  - context_recall",
+                "metric_weights:",
+                "  faithfulness: 2.0",
+                "  context_recall: 1.0",
+                "doc_weights:",
+                "  a.pdf: 3.0",
+                "  b.pdf: 1.0",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+
+def test_read_weights_from_snapshot_returns_metric_and_doc_weights(tmp_path: Path) -> None:
+    """Snapshot weight reader returns both weight maps as plain float dicts."""
+    run_dir = tmp_path / "run"
+    _write_run_artifacts(run_dir)
+
+    metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)
+
+    assert metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}
+    assert doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}
+
+
+def test_build_report_uses_weighted_means_and_exposes_snapshot_weights(tmp_path: Path) -> None:
+    """Report aggregation uses weighted means and surfaces snapshot weights."""
+    run_dir = tmp_path / "run"
+    _write_run_artifacts(run_dir)  # two rows: s1 (a.pdf, weight 3.0) and s2 (b.pdf, weight 1.0)
+
+    report = build_report(run_dir, ["faithfulness", "context_recall"])
+
+    assert report.metric_means == {
+        "faithfulness": pytest.approx(0.75, rel=1e-4),  # (3.0*1.0 + 1.0*0.0) / 4.0
+        "context_recall": pytest.approx(0.5, rel=1e-4),  # both rows score 0.5
+    }
+    assert report.weighted_score_mean == pytest.approx(0.6667, rel=1e-4)  # consistent with (3.0*0.8333 + 1.0*0.1667) / 4.0
+    assert report.metric_weights == {"faithfulness": 2.0, "context_recall": 1.0}  # values from scenario.snapshot.yaml
+    assert report.doc_weights == {"a.pdf": 3.0, "b.pdf": 1.0}  # values from scenario.snapshot.yaml
+    assert report.summary_markdown == "summary"  # content written to summary.md by the fixture
+    assert report.advice_markdown == "advice"  # content written to optimization_advice.md by the fixture
+
+
+def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) -> None:
+    """Metric inference excludes weighted helper columns from scores.csv."""
+    run_dir = tmp_path / "run"
+    run_dir.mkdir(parents=True, exist_ok=True)
+    (run_dir / "scores.csv").write_text(
+        "\n".join(
+            [
+                "sample_id,doc_name,faithfulness,weighted_score,sample_weight",
+                "s1,a.pdf,0.8,0.8,2.0",
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    assert _infer_metrics_from_scores(run_dir) == ["faithfulness"]
--- a/tests/test_weights.py
+++ b/tests/test_weights.py
@@ -0,0 +1,124 @@
+"""Unit tests for rag_eval/metrics/weights.py"""
+import math
+
+import pytest
+
+from rag_eval.metrics.weights import (
+    compute_overall_weighted_score_mean,
+    compute_weighted_score,
+    resolve_weight,
+    weighted_metric_means,
+)
+
+
+class TestResolveWeight:
+    def test_returns_value_when_key_present(self):
+        assert resolve_weight({"faith": 0.5}, "faith") == 0.5
+
+    def test_returns_default_when_key_missing(self):
+        assert resolve_weight({}, "faith") == 1.0
+
+    def test_returns_custom_default_when_key_missing(self):
+        assert resolve_weight({}, "faith", default=2.0) == 2.0
+
+    def test_empty_dict_returns_default(self):
+        assert resolve_weight({}, "anything") == 1.0
+
+
+class TestComputeWeightedScore:
+    def test_equal_weights_is_simple_mean(self):
+        scores = {"faithfulness": 0.8, "context_recall": 0.6}
+        result = compute_weighted_score(scores, {})  # empty weights mapping
+        assert result == pytest.approx(0.7, rel=1e-4)  # (0.8 + 0.6) / 2
+
+    def test_explicit_weights(self):
+        scores = {"faithfulness": 1.0, "context_recall": 0.0}
+        weights = {"faithfulness": 3.0, "context_recall": 1.0}
+        result = compute_weighted_score(scores, weights)
+        assert result == pytest.approx(0.75, rel=1e-4)  # (3*1.0 + 1*0.0) / (3 + 1)
+
+    def test_nan_values_excluded(self):
+        scores = {"faithfulness": float("nan"), "context_recall": 0.8}
+        result = compute_weighted_score(scores, {})
+        assert result == pytest.approx(0.8, rel=1e-4)  # only the non-NaN metric counts
+
+    def test_none_values_excluded(self):
+        scores = {"faithfulness": None, "context_recall": 0.6}
+        result = compute_weighted_score(scores, {})
+        assert result == pytest.approx(0.6, rel=1e-4)  # only the non-None metric counts
+
+    def test_all_nan_returns_none(self):
+        scores = {"faithfulness": float("nan"), "context_recall": float("nan")}
+        assert compute_weighted_score(scores, {}) is None  # nothing usable to average
+
+    def test_empty_scores_returns_none(self):
+        assert compute_weighted_score({}, {}) is None  # no metrics at all
+
+    def test_missing_metric_in_weights_uses_default_1(self):
+        scores = {"faithfulness": 0.8, "context_recall": 0.4}
+        weights = {"faithfulness": 2.0}  # context_recall has no explicit weight
+        result = compute_weighted_score(scores, weights)
+        assert result == pytest.approx(2.0 / 3, rel=1e-4)  # (2*0.8 + 1*0.4) / (2 + 1)
+
+
+class TestWeightedMetricMeans:
+    def _rows(self):
+        return [  # two score rows, one per document
+            {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.5},
+            {"doc_name": "b.pdf", "faithfulness": 0.6, "context_recall": 0.8},
+        ]
+
+    def test_equal_weights_gives_arithmetic_mean(self):
+        rows = self._rows()
+        result = weighted_metric_means(rows, ["faithfulness", "context_recall"], {})
+        assert result["faithfulness"] == pytest.approx(0.8, rel=1e-4)  # (1.0 + 0.6) / 2
+        assert result["context_recall"] == pytest.approx(0.65, rel=1e-4)  # (0.5 + 0.8) / 2
+
+    def test_doc_weight_amplifies_contribution(self):
+        rows = self._rows()
+        doc_weights = {"a.pdf": 3.0, "b.pdf": 1.0}
+        result = weighted_metric_means(rows, ["faithfulness"], doc_weights)
+        assert result["faithfulness"] == pytest.approx(0.9, rel=1e-4)  # (3*1.0 + 1*0.6) / (3 + 1)
+
+    def test_nan_rows_skipped_per_metric(self):
+        rows = [
+            {"doc_name": "a.pdf", "faithfulness": float("nan"), "context_recall": 0.5},
+            {"doc_name": "b.pdf", "faithfulness": 0.8, "context_recall": 0.9},
+        ]
+        result = weighted_metric_means(rows, ["faithfulness", "context_recall"], {})
+        assert result["faithfulness"] == pytest.approx(0.8, rel=1e-4)  # NaN row ignored for this metric
+        assert result["context_recall"] == pytest.approx(0.7, rel=1e-4)  # (0.5 + 0.9) / 2, both rows used
+
+    def test_missing_metric_column_returns_none(self):
+        rows = [{"doc_name": "a.pdf", "faithfulness": 0.8}]
+        result = weighted_metric_means(rows, ["faithfulness", "unknown_metric"], {})
+        assert result["faithfulness"] == pytest.approx(0.8, rel=1e-4)
+        assert result["unknown_metric"] is None  # no row carries this column
+
+    def test_empty_rows_returns_none_for_all(self):
+        result = weighted_metric_means([], ["faithfulness"], {})
+        assert result["faithfulness"] is None  # no rows to average
+
+
+class TestComputeOverallWeightedScoreMean:
+    def test_basic_weighted_mean_of_weighted_scores(self):
+        rows = [
+            {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.0},  # per-row mean 0.5
+            {"doc_name": "b.pdf", "faithfulness": 0.5, "context_recall": 0.5},  # per-row mean 0.5
+        ]
+        metric_weights = {"faithfulness": 1.0, "context_recall": 1.0}
+        result = compute_overall_weighted_score_mean(rows, metric_weights, {})
+        assert result == pytest.approx(0.5, rel=1e-4)  # (0.5 + 0.5) / 2
+
+    def test_doc_weight_amplifies_sample(self):
+        rows = [
+            {"doc_name": "important.pdf", "faithfulness": 1.0},
+            {"doc_name": "other.pdf", "faithfulness": 0.0},
+        ]
+        doc_weights = {"important.pdf": 9.0, "other.pdf": 1.0}
+        result = compute_overall_weighted_score_mean(rows, {}, doc_weights)
+        assert result == pytest.approx(0.9, rel=1e-4)  # (9*1.0 + 1*0.0) / (9 + 1)
+
+    def test_all_nan_returns_none(self):
+        rows = [{"doc_name": "a.pdf", "faithfulness": float("nan")}]
+        assert compute_overall_weighted_score_mean(rows, {}, {}) is None  # no usable score in any row
--- a/tests/webapp/test_llm_profiles_api.py
+++ b/tests/webapp/test_llm_profiles_api.py
@@ -137,3 +137,104 @@ def test_apply_no_profiles_returns_empty(tmp_path):
        _resolve_absolute=True,
    )
    assert patched == []
+
+
+def test_apply_metric_weights_patches_yaml(tmp_path):
+    """Applying metric_weights writes them into the YAML."""
+    import yaml as yaml_lib
+    import pytest
+    scenario_file = tmp_path / "w-scenario.yaml"
+    scenario_file.write_text(
+        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
+        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
+        encoding="utf-8",
+    )
+    from webapp.services.yaml_patcher import apply_profiles_to_scenario
+    patched = apply_profiles_to_scenario(
+        scenario_path=str(scenario_file),
+        judge_profile=None, answer_profile=None, dataset_profile=None,
+        metric_weights={"faithfulness": 0.7, "context_recall": 0.3},
+        _resolve_absolute=True,
+    )
+    assert "metric_weights" in patched
+    data = yaml_lib.safe_load(scenario_file.read_text())
+    assert abs(data["metric_weights"]["faithfulness"] - 0.7) < 1e-9
+
+
+def test_apply_doc_weights_patches_yaml(tmp_path):
+    """Applying doc_weights writes them into the YAML."""
+    import yaml as yaml_lib
+    scenario_file = tmp_path / "dw-scenario.yaml"
+    scenario_file.write_text(
+        "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n"
+        "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n",
+        encoding="utf-8",
+    )
+    from webapp.services.yaml_patcher import apply_profiles_to_scenario
+    patched = apply_profiles_to_scenario(
+        scenario_path=str(scenario_file),
+        judge_profile=None, answer_profile=None, dataset_profile=None,
+        doc_weights={"doc.pdf": 2.0},
+        _resolve_absolute=True,
+    )
+    assert "doc_weights" in patched
+    data = yaml_lib.safe_load(scenario_file.read_text())
+    assert abs(data["doc_weights"]["doc.pdf"] - 2.0) < 1e-9
+
+
+# ---------------------------------------------------------------------------
+# Connectivity test endpoint tests
+# ---------------------------------------------------------------------------
+from unittest.mock import MagicMock, patch
+
+
+def test_probe_connectivity_success(client):
+    """POST /api/llm-profiles/probe returns ok=True on successful completion."""
+    mock_response = MagicMock()
+    mock_response.choices = [MagicMock()]
+    with patch("webapp.api.llm_profiles.OpenAI") as MockOpenAI:
+        MockOpenAI.return_value.chat.completions.create.return_value = mock_response
+        resp = client.post("/api/llm-profiles/probe", json={
+            "model": "test-model",
+            "base_url": "http://x/v1",
+            "api_key": "sk-test",
+        })
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["ok"] is True
+    assert data["latency_ms"] is not None
+
+
+def test_probe_connectivity_failure(client):
+    """POST /api/llm-profiles/probe returns ok=False when the LLM call raises."""
+    with patch("webapp.api.llm_profiles.OpenAI") as MockOpenAI:
+        MockOpenAI.return_value.chat.completions.create.side_effect = Exception("connection refused")
+        resp = client.post("/api/llm-profiles/probe", json={
+            "model": "test-model",
+            "base_url": "http://x/v1",
+            "api_key": "sk-test",
+        })
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["ok"] is False
+    assert "connection refused" in data["message"]
+
+
+def test_test_saved_profile_success(client):
+    """POST /api/llm-profiles/{id}/test returns ok=True for a saved profile."""
+    body = {"name": "T", "model": "m1", "base_url": "http://x/v1", "api_key": "k"}
+    pid = client.post("/api/llm-profiles", json=body).json()["profile_id"]
+
+    mock_response = MagicMock()
+    mock_response.choices = [MagicMock()]
+    with patch("webapp.api.llm_profiles.OpenAI") as MockOpenAI:
+        MockOpenAI.return_value.chat.completions.create.return_value = mock_response
+        resp = client.post(f"/api/llm-profiles/{pid}/test")
+    assert resp.status_code == 200
+    assert resp.json()["ok"] is True
+
+
+def test_test_nonexistent_profile_returns_404(client):
+    """POST /api/llm-profiles/{id}/test returns 404 for unknown profile id."""
+    resp = client.post("/api/llm-profiles/nonexistent/test")
+    assert resp.status_code == 404
--- a/tests/webapp/test_score_api.py
+++ b/tests/webapp/test_score_api.py
@@ -0,0 +1,327 @@
+"""Tests for POST /api/score endpoint."""
+from __future__ import annotations
+
+import pytest
+from pydantic import ValidationError
+
+from webapp.models import ScoreRequest, ScoreResponse
+
+
+class TestScoreRequest:
+    """Unit tests for ``ScoreRequest`` validation and its helper methods."""
+
+    def test_minimal_valid_request(self):
+        """Only required fields — question, answer, contexts."""
+        req = ScoreRequest(
+            question="What is CT?",
+            answer="CT is imaging.",
+            contexts="CT uses X-rays.",
+        )
+        assert req.question == "What is CT?"
+        assert req.contexts == "CT uses X-rays."
+        # Everything that was not supplied must fall back to the model defaults.
+        assert req.ground_truth is None
+        assert req.context_separator == " |||| "
+        assert req.metrics == [
+            "faithfulness",
+            "answer_relevancy",
+            "context_recall",
+            "context_precision",
+        ]
+
+    def test_contexts_split_by_separator(self):
+        """contexts_as_list() splits on context_separator."""
+        req = ScoreRequest(
+            question="q",
+            answer="a",
+            contexts="ctx1 |||| ctx2 |||| ctx3",
+            context_separator=" |||| ",
+        )
+        assert req.contexts_as_list() == ["ctx1", "ctx2", "ctx3"]
+
+    def test_contexts_split_custom_separator(self):
+        """contexts_as_list() honours a caller-supplied separator."""
+        req = ScoreRequest(
+            question="q",
+            answer="a",
+            contexts="a---b---c",
+            context_separator="---",
+        )
+        assert req.contexts_as_list() == ["a", "b", "c"]
+
+    def test_contexts_split_single_item(self):
+        """A contexts string without any separator yields a one-element list."""
+        req = ScoreRequest(question="q", answer="a", contexts="only one")
+        assert req.contexts_as_list() == ["only one"]
+
+    def test_missing_question_raises(self):
+        """Omitting ``question`` fails validation."""
+        with pytest.raises(ValidationError):
+            ScoreRequest(answer="a", contexts="c")  # type: ignore[call-arg]
+
+    def test_missing_answer_raises(self):
+        """Omitting ``answer`` fails validation."""
+        with pytest.raises(ValidationError):
+            ScoreRequest(question="q", contexts="c")  # type: ignore[call-arg]
+
+    def test_missing_contexts_raises(self):
+        """Omitting ``contexts`` fails validation."""
+        with pytest.raises(ValidationError):
+            ScoreRequest(question="q", answer="a")  # type: ignore[call-arg]
+
+    def test_custom_metrics_accepted(self):
+        """A caller-supplied list of valid metric names replaces the default list."""
+        req = ScoreRequest(
+            question="q",
+            answer="a",
+            contexts="c",
+            metrics=["faithfulness"],
+        )
+        assert req.metrics == ["faithfulness"]
+
+    def test_invalid_metric_name_raises(self):
+        """A metric name outside the supported set fails validation."""
+        with pytest.raises(ValidationError):
+            ScoreRequest(
+                question="q",
+                answer="a",
+                contexts="c",
+                metrics=["not_a_metric"],
+            )
+
+    def test_effective_metrics_drops_ground_truth_dependent_when_missing(self):
+        """Without ground_truth, GT-dependent metrics are excluded."""
+        req = ScoreRequest(
+            question="q",
+            answer="a",
+            contexts="c",
+            metrics=[
+                "faithfulness",
+                "context_recall",
+                "factual_correctness",
+                "semantic_similarity",
+                "noise_sensitivity",
+            ],
+        )
+        effective = req.effective_metrics()
+        assert "faithfulness" in effective
+        assert "context_recall" not in effective
+        assert "factual_correctness" not in effective
+        assert "semantic_similarity" not in effective
+        assert "noise_sensitivity" not in effective
+
+    def test_effective_metrics_keeps_all_when_ground_truth_present(self):
+        """With ground_truth supplied, the requested metrics are returned unchanged and in order."""
+        req = ScoreRequest(
+            question="q",
+            answer="a",
+            contexts="c",
+            ground_truth="gt",
+            metrics=["faithfulness", "context_recall", "factual_correctness"],
+        )
+        effective = req.effective_metrics()
+        assert effective == [
+            "faithfulness",
+            "context_recall",
+            "factual_correctness",
+        ]
+
+
+class TestScoreResponse:
+    """Unit tests for the ``ScoreResponse`` payload model."""
+
+    def test_score_response_structure(self):
+        """Supplied fields round-trip and omitted optional fields take their defaults."""
+        resp = ScoreResponse(
+            scores={"faithfulness": 0.85, "answer_relevancy": None},
+            weighted_score=0.85,
+            latency_ms=1200,
+        )
+        assert resp.scores["faithfulness"] == 0.85
+        assert resp.scores["answer_relevancy"] is None
+        # weighted_score was passed in above, so pin it as well.
+        assert resp.weighted_score == 0.85
+        assert resp.latency_ms == 1200
+        # Optional fields that were not passed must fall back to their defaults.
+        assert resp.skipped_metrics == []
+        assert resp.error is None
+
+
+class TestInlineScorer:
+    """Tests for ``InlineScorer.score`` with model building and the metric pipeline mocked out."""
+
+    def test_score_returns_dict_with_requested_metrics(self):
+        """InlineScorer.score returns a dict keyed by the requested metrics."""
+        from unittest.mock import AsyncMock, MagicMock, patch
+        from webapp.services.inline_scorer import InlineScorer
+        from rag_eval.settings import EvaluationSettings
+
+        # Stand-in for the object the pipeline's score_sample coroutine resolves to.
+        mock_score = MagicMock()
+        mock_score.metrics = {"faithfulness": 0.9, "answer_relevancy": 0.8}
+        mock_score.error = ""
+
+        mock_pipeline = MagicMock()
+        mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
+
+        with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
+            with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
+                with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
+                    scorer = InlineScorer()
+                    result = scorer.score(
+                        question="q", answer="a",
+                        contexts=["ctx1"],
+                        ground_truth=None,
+                        metrics=["faithfulness", "answer_relevancy"],
+                        judge_model="test-model",
+                        embedding_model="test-embed",
+                        settings=EvaluationSettings(_env_file=None),
+                    )
+        assert "faithfulness" in result
+        assert "answer_relevancy" in result
+        assert result["faithfulness"] == pytest.approx(0.9)
+
+    def test_score_converts_nan_to_none(self):
+        """NaN scores are converted to None in the returned dict."""
+        # float("nan") below needs no import; the former unused ``import math`` was dropped.
+        from unittest.mock import AsyncMock, MagicMock, patch
+        from webapp.services.inline_scorer import InlineScorer
+        from rag_eval.settings import EvaluationSettings
+
+        mock_score = MagicMock()
+        mock_score.metrics = {"faithfulness": float("nan")}
+        mock_score.error = ""
+
+        mock_pipeline = MagicMock()
+        mock_pipeline.score_sample = AsyncMock(return_value=mock_score)
+
+        with patch("webapp.services.inline_scorer.build_models", return_value=(MagicMock(), MagicMock())):
+            with patch("webapp.services.inline_scorer.MetricPipeline", return_value=mock_pipeline):
+                with patch("webapp.services.inline_scorer._build_metric_instances", return_value={}):
+                    scorer = InlineScorer()
+                    result = scorer.score(
+                        question="q", answer="a", contexts=["c"],
+                        ground_truth=None,
+                        metrics=["faithfulness"],
+                        judge_model="m", embedding_model="e",
+                        settings=EvaluationSettings(_env_file=None),
+                    )
+        assert result["faithfulness"] is None
+
+
+# ── Endpoint integration tests ────────────────────────────────────────────────
+
+from fastapi.testclient import TestClient  # placed ahead of the fixture that uses it
+
+
+@pytest.fixture()
+def client(monkeypatch):
+    """Return a TestClient for the app with ``webapp.api.score.inline_scorer`` stubbed.
+
+    The stub's ``score`` always returns fixed faithfulness / answer_relevancy
+    values, so the endpoint tests never invoke a real scorer.
+    """
+    import webapp.api.score as score_mod
+    from unittest.mock import MagicMock
+    from webapp.server import create_app
+
+    mock_scorer = MagicMock()
+    mock_scorer.score.return_value = {"faithfulness": 0.85, "answer_relevancy": 0.90}
+    monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+    return TestClient(create_app())
+
+
+class TestScoreEndpoint:
+    """HTTP-level tests for ``POST /api/score`` with the scorer replaced by a mock."""
+
+    def test_post_score_returns_200(self, client):
+        """A minimal valid request succeeds and echoes the mocked scores."""
+        resp = client.post("/api/score", json={
+            "question": "What is CT?",
+            "answer": "CT is imaging.",
+            "contexts": "CT uses X-rays.",
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert "scores" in data
+        assert "latency_ms" in data
+        assert data["scores"]["faithfulness"] == pytest.approx(0.85)
+
+    def test_weighted_score_computed(self, client):
+        """The response carries a non-null weighted_score when scores are available."""
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["weighted_score"] is not None
+
+    def test_missing_required_fields_returns_422(self, client):
+        """A body lacking required fields is rejected with 422."""
+        resp = client.post("/api/score", json={"question": "q"})
+        assert resp.status_code == 422
+
+    def test_invalid_metric_name_returns_422(self, client):
+        """An unsupported metric name is rejected with 422."""
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+            "metrics": ["not_a_metric"],
+        })
+        assert resp.status_code == 422
+
+    def test_skipped_metrics_returned_when_no_ground_truth(self, client):
+        """GT-dependent metrics requested without ground_truth are listed as skipped."""
+        resp = client.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+            "metrics": ["faithfulness", "context_recall"],
+        })
+        assert resp.status_code == 200
+        data = resp.json()
+        assert "context_recall" in data["skipped_metrics"]
+
+    def test_contexts_split_on_separator(self, monkeypatch):
+        """contexts string is split before passing to scorer."""
+        import webapp.api.score as score_mod
+        from unittest.mock import MagicMock
+        calls = []
+        def capture(**kwargs):
+            calls.append(kwargs.get("contexts", []))
+            return {"faithfulness": 0.9}
+        mock_scorer = MagicMock()
+        # capture already accepts **kwargs, so it can be the side effect directly.
+        mock_scorer.score.side_effect = capture
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        from fastapi.testclient import TestClient
+        tc = TestClient(create_app())
+        resp = tc.post("/api/score", json={
+            "question": "q", "answer": "a",
+            "contexts": "ctx1 |||| ctx2",
+            "context_separator": " |||| ",
+        })
+        # Fail on the HTTP status first so a rejected request is not
+        # misreported as "scorer was never called".
+        assert resp.status_code == 200
+        assert len(calls) == 1
+        assert calls[0] == ["ctx1", "ctx2"]
+
+    def test_bearer_token_auth_required_when_configured(self, monkeypatch):
+        """When SCORE_API_TOKEN is set, requests without token get 401."""
+        import webapp.api.score as score_mod
+        from rag_eval.settings import EvaluationSettings
+        from unittest.mock import MagicMock
+
+        mock_settings = EvaluationSettings(_env_file=None)
+        object.__setattr__(mock_settings, "score_api_token", "secret-token")
+        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
+
+        mock_scorer = MagicMock()
+        mock_scorer.score.return_value = {"faithfulness": 0.9}
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        from fastapi.testclient import TestClient
+        tc = TestClient(create_app())
+
+        # No auth header -> 401
+        resp = tc.post("/api/score", json={
+            "question": "q", "answer": "a", "contexts": "c",
+        })
+        assert resp.status_code == 401
+
+        # Correct token -> 200
+        resp = tc.post("/api/score",
+            json={"question": "q", "answer": "a", "contexts": "c"},
+            headers={"Authorization": "Bearer secret-token"},
+        )
+        assert resp.status_code == 200
+
+    def test_wrong_bearer_token_returns_401(self, monkeypatch):
+        """A Bearer token that differs from the configured one is rejected with 401."""
+        import webapp.api.score as score_mod
+        from rag_eval.settings import EvaluationSettings
+        from unittest.mock import MagicMock
+
+        mock_settings = EvaluationSettings(_env_file=None)
+        object.__setattr__(mock_settings, "score_api_token", "correct-token")
+        monkeypatch.setattr(score_mod, "_get_settings", lambda: mock_settings)
+
+        mock_scorer = MagicMock()
+        mock_scorer.score.return_value = {}
+        monkeypatch.setattr(score_mod, "inline_scorer", mock_scorer)
+
+        from webapp.server import create_app
+        from fastapi.testclient import TestClient
+        tc = TestClient(create_app())
+        resp = tc.post("/api/score",
+            json={"question": "q", "answer": "a", "contexts": "c"},
+            headers={"Authorization": "Bearer wrong-token"},
+        )
+        assert resp.status_code == 401
--- a/webapp/api/llm_profiles.py
+++ b/webapp/api/llm_profiles.py
@@ -2,13 +2,18 @@

 from __future__ import annotations

+import time
+
 from fastapi import APIRouter, HTTPException
+from openai import OpenAI

 from webapp.models import (
    CreateProfileRequest,
    LLMProfile,
    ProfileApplyRequest,
    ProfileApplyResponse,
+    ProfileProbeRequest,
+    ProfileTestResponse,
 )
 from webapp.services.profile_manager import profile_manager
 from webapp.services.yaml_patcher import apply_profiles_to_scenario
@@ -16,6 +21,43 @@ from webapp.services.yaml_patcher import apply_profiles_to_scenario
 router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"])


+def _do_connectivity_test(
+    model: str,
+    base_url: str,
+    api_key: str,
+    timeout_seconds: int,
+) -> ProfileTestResponse:
+    """Send a minimal chat completion request and return the test result.
+
+    Args:
+        model: Model name passed to the chat completion call.
+        base_url: API base URL; trailing slashes are stripped before use.
+        api_key: API key handed to the client.
+        timeout_seconds: Client timeout, converted to ``float``.
+
+    Returns:
+        ``ProfileTestResponse`` with ``ok=True`` on success, or ``ok=False``
+        and ``message=str(exc)`` on any exception. ``latency_ms`` is the
+        elapsed wall time in both cases. Exceptions are never propagated.
+    """
+    t0 = time.monotonic()
+    try:
+        # The client is built inside the try block so that a failure while
+        # constructing it is reported as ok=False instead of escaping as a 500.
+        client = OpenAI(
+            api_key=api_key,
+            base_url=base_url.rstrip("/"),
+            timeout=float(timeout_seconds),
+        )
+        # One-token reply keeps the probe as cheap as possible.
+        client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": "hi"}],
+            max_tokens=1,
+        )
+    except Exception as exc:  # noqa: BLE001
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        return ProfileTestResponse(ok=False, message=str(exc), latency_ms=latency_ms)
+    latency_ms = int((time.monotonic() - t0) * 1000)
+    return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms)
+
+
+@router.post("/probe", response_model=ProfileTestResponse)
+def probe_connectivity(request: ProfileProbeRequest) -> ProfileTestResponse:
+    """Test LLM connectivity with inline credentials (no saved profile required).
+
+    The ``llm-profiles`` tag is already set on ``router``, so it is not
+    repeated on this route (matching the other routes in this module).
+
+    Args:
+        request: Model name, base URL, API key and timeout to probe with.
+
+    Returns:
+        The ``ProfileTestResponse`` produced by ``_do_connectivity_test``.
+    """
+    return _do_connectivity_test(
+        model=request.model,
+        base_url=request.base_url,
+        api_key=request.api_key,
+        timeout_seconds=request.timeout_seconds,
+    )
+
+
@router.get("", response_model=dict)
 def list_profiles() -> dict:
    """Return all saved LLM profiles."""
@@ -59,6 +101,20 @@ def delete_profile(profile_id: str) -> dict:
    return {"deleted": True}


+@router.post("/{profile_id}/test", response_model=ProfileTestResponse)
+def test_profile(profile_id: str) -> ProfileTestResponse:
+    """Run the connectivity check using the credentials of a stored profile.
+
+    Raises:
+        HTTPException: 404 when ``profile_manager`` has no profile with this id.
+    """
+    saved = profile_manager.get(profile_id)
+    if saved is not None:
+        return _do_connectivity_test(
+            model=saved.model,
+            base_url=saved.base_url,
+            api_key=saved.api_key,
+            timeout_seconds=saved.timeout_seconds,
+        )
+    raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}")
+
+
@router.post("/apply", response_model=ProfileApplyResponse)
 def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
    """Patch selected LLM profiles into the target scenario YAML file."""
@@ -89,6 +145,8 @@ def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse:
        judge_profile=role_profiles["judge"],
        answer_profile=role_profiles["answer"],
        dataset_profile=role_profiles["dataset"],
+        metric_weights=request.metric_weights,
+        doc_weights=request.doc_weights,
    )
    return ProfileApplyResponse(
        scenario_path=request.scenario_path,
--- a/webapp/api/score.py
+++ b/webapp/api/score.py
@@ -0,0 +1,105 @@
+"""Route for real-time single-sample RAGAS scoring (Dify external Tool endpoint)."""
+
+from __future__ import annotations
+
+import time
+from typing import Annotated
+
+from fastapi import APIRouter, Header, HTTPException
+
+from rag_eval.metrics.weights import compute_weighted_score
+from rag_eval.settings import EvaluationSettings
+from webapp.models import ScoreRequest, ScoreResponse
+from webapp.services.inline_scorer import inline_scorer
+
+router = APIRouter(prefix="/api/score", tags=["score"])
+
+
+def _get_settings() -> EvaluationSettings:
+    """Return a fresh EvaluationSettings instance (overridable in tests).
+
+    A new instance is constructed on every call. The function exists as a
+    module-level seam so tests can monkeypatch it with fixed settings.
+    """
+    return EvaluationSettings()
+
+
+def _check_auth(authorization: str | None, token: str) -> None:
+    """Validate an ``Authorization: Bearer <token>`` header against ``token``.
+
+    Args:
+        authorization: Raw ``Authorization`` header value, or ``None`` if absent.
+        token: The configured shared secret the caller must present.
+
+    Raises:
+        HTTPException: 401 when the header is missing, is not of the form
+            ``Bearer <value>`` (scheme matched case-insensitively), or carries
+            a value different from ``token``.
+    """
+    # Function-scope import keeps this change local to the block.
+    import hmac
+
+    if authorization is None:
+        raise HTTPException(status_code=401, detail="Missing Authorization header.")
+    parts = authorization.split(" ", 1)
+    if len(parts) != 2 or parts[0].lower() != "bearer":
+        raise HTTPException(status_code=401, detail="Invalid Bearer token.")
+    # Constant-time comparison so response timing does not reveal how much of
+    # the secret matched. Operands are encoded to bytes because compare_digest
+    # rejects str values containing non-ASCII characters.
+    if not hmac.compare_digest(parts[1].encode("utf-8"), token.encode("utf-8")):
+        raise HTTPException(status_code=401, detail="Invalid Bearer token.")
+
+
+@router.post(
+    "",
+    response_model=ScoreResponse,
+    summary="单题实时评分（Dify 外部 Tool）",
+    responses={
+        200: {"description": "各指标得分和加权综合得分。"},
+        401: {"description": "配置了 SCORE_API_TOKEN 但未提供有效 Bearer token。"},
+        422: {"description": "请求参数校验失败。"},
+    },
+)
+def score_sample(
+    request: ScoreRequest,
+    authorization: Annotated[str | None, Header()] = None,
+) -> ScoreResponse:
+    """Accept one QA sample, run RAGAS metrics synchronously, and return scores.
+
+    Args:
+        request: Validated scoring payload.
+        authorization: Raw ``Authorization`` header, if the client sent one.
+
+    Returns:
+        A ``ScoreResponse``. Exceptions raised by the scorer are caught and
+        reported through its ``error`` field (with empty ``scores``) instead
+        of being propagated.
+
+    Raises:
+        HTTPException: 401 from ``_check_auth`` when ``settings.score_api_token``
+            is set and the header is missing or does not match.
+    """
+    settings = _get_settings()
+
+    # Require Bearer auth only when the deployment configured a shared token.
+    if settings.score_api_token:
+        _check_auth(authorization, settings.score_api_token)
+
+    # Per-request model names win; otherwise fall back to the settings values.
+    judge_model = request.judge_model or settings.ragas_judge_model
+    embedding_model = request.embedding_model or settings.ragas_embedding_model
+    effective = request.effective_metrics()
+    requested = set(request.metrics)
+    # Metrics the caller asked for that effective_metrics() filtered out.
+    skipped = sorted(requested - set(effective))
+
+    # Nothing left to compute: answer immediately without calling the scorer.
+    if not effective:
+        return ScoreResponse(
+            scores={metric_name: None for metric_name in request.metrics},
+            weighted_score=None,
+            latency_ms=0,
+            skipped_metrics=skipped,
+        )
+
+    t0 = time.monotonic()
+    try:
+        raw_scores = inline_scorer.score(
+            question=request.question,
+            answer=request.answer,
+            contexts=request.contexts_as_list(),
+            ground_truth=request.ground_truth,
+            metrics=effective,
+            judge_model=judge_model,
+            embedding_model=embedding_model,
+            settings=settings,
+        )
+    except Exception as exc:  # noqa: BLE001
+        # Scoring failures are returned in the body (no exception is raised).
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        return ScoreResponse(
+            scores={},
+            weighted_score=None,
+            latency_ms=latency_ms,
+            skipped_metrics=skipped,
+            error=f"{type(exc).__name__}: {exc}",
+        )
+
+    latency_ms = int((time.monotonic() - t0) * 1000)
+
+    # Keep skipped metrics visible to callers by emitting them as null scores.
+    all_scores: dict[str, float | None] = {metric_name: None for metric_name in request.metrics}
+    all_scores.update(raw_scores)
+
+    # Only non-null scores are passed on; the second argument is an empty mapping.
+    weighted = compute_weighted_score(
+        {key: value for key, value in raw_scores.items() if value is not None},
+        {},
+    )
+
+    return ScoreResponse(
+        scores=all_scores,
+        weighted_score=round(weighted, 4) if weighted is not None else None,
+        latency_ms=latency_ms,
+        skipped_metrics=skipped,
+    )
--- a/webapp/models.py
+++ b/webapp/models.py
@@ -5,7 +5,7 @@ from __future__ import annotations
 from datetime import datetime, timezone
 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator


 def _utcnow_iso() -> str:
@@ -74,6 +74,18 @@ class ReportData(BaseModel):
    lowest_samples: list[SampleScore] = Field(default_factory=list)
    summary_markdown: str = ""
    advice_markdown: str = ""  # optimization_advice.md content (empty if not generated)
+    weighted_score_mean: float | None = Field(
+        default=None,
+        description="加权综合得分均值（metric_weights × doc_weights 共同作用）。",
+    )
+    metric_weights: dict[str, float] = Field(
+        default_factory=dict,
+        description="该次运行使用的指标权重配置（来自 scenario.snapshot.yaml）。",
+    )
+    doc_weights: dict[str, float] = Field(
+        default_factory=dict,
+        description="该次运行使用的文档权重配置（来自 scenario.snapshot.yaml）。",
+    )


 class RunDetail(BaseModel):
@@ -93,6 +105,14 @@ class ScenarioInfo(BaseModel):
    judge_model: str = ""
    metrics: list[str] = Field(default_factory=list)
    error: str = ""
+    metric_weights: dict[str, float] = Field(
+        default_factory=dict,
+        description="从场景 YAML 读取的指标权重配置，供前端权重面板预填。",
+    )
+    doc_weights: dict[str, float] = Field(
+        default_factory=dict,
+        description="从场景 YAML 读取的文档权重配置，供前端权重面板预填。",
+    )


 class TaskStatus(BaseModel):
@@ -150,6 +170,14 @@ class ProfileApplyRequest(BaseModel):
    judge_profile_id: str | None = None
    answer_profile_id: str | None = None
    dataset_profile_id: str | None = None
+    metric_weights: dict[str, float] | None = Field(
+        default=None,
+        description="指标权重映射，如 {\"faithfulness\": 0.35}。为 null 时不修改 YAML。",
+    )
+    doc_weights: dict[str, float] | None = Field(
+        default=None,
+        description="文档权重映射，如 {\"doc.pdf\": 2.0}。为 null 时不修改 YAML。",
+    )


 class ProfileApplyResponse(BaseModel):
@@ -159,6 +187,23 @@ class ProfileApplyResponse(BaseModel):
    patched_fields: list[str] = Field(default_factory=list)


+class ProfileProbeRequest(BaseModel):
+    """Inline credentials for testing LLM connectivity without saving a profile."""
+
+    # Model name sent in the probe request.
+    model: str
+    # Base URL of the API to probe.
+    base_url: str
+    # API key used for the probe.
+    api_key: str
+    # Timeout in seconds; must be positive, like the other numeric limits in
+    # this module, so that zero or negative values are rejected at validation.
+    timeout_seconds: int = Field(default=30, gt=0)
+
+
+class ProfileTestResponse(BaseModel):
+    """Result of a LLM connectivity test."""
+
+    # True when the test request completed without raising.
+    ok: bool
+    # Human-readable outcome: a success message, or the exception text on failure.
+    message: str
+    # Elapsed time of the test in milliseconds; optional, defaults to None.
+    latency_ms: int | None = None
+
+
 def jsonable(value: Any) -> Any:
    """Convert NaN/inf floats into None so the payload stays valid JSON."""
    import math
@@ -172,3 +217,288 @@ def jsonable(value: Any) -> Any:
    if isinstance(value, list):
        return [jsonable(item) for item in value]
    return value
+
+
+# ---------------------------------------------------------------------------
+# Full pipeline (build + eval) job models
+# ---------------------------------------------------------------------------
+
+class PipelineJobRequest(BaseModel):
+    """Request body for launching an end-to-end build + evaluation pipeline job.
+
+    Only ``docs_path`` is required; every other field has a default. The four
+    ``max_*`` limits must be positive (``gt=0``), and ``max_documents`` /
+    ``max_samples`` may also be ``None``. Two example payloads are attached to
+    the JSON schema through ``json_schema_extra``.
+    """
+
+    # Example payloads published with the JSON schema: a full run and a smoke test.
+    model_config = ConfigDict(
+        json_schema_extra={
+            "examples": [
+                {
+                    "summary": "西门子 CT 文档评估（完整参数）",
+                    "value": {
+                        "docs_path": "datasets/siemens-pdfs",
+                        "job_name": "siemens-ct-eval-2026",
+                        "generation_model": "qwen3.6-plus",
+                        "answer_model": "deepseek-v4-flash",
+                        "judge_model": "deepseek-v4-flash",
+                        "embedding_model": "text-embedding-v3",
+                        "max_questions_per_document": 10,
+                        "max_source_chunks_per_question": 3,
+                        "max_documents": None,
+                        "max_samples": None,
+                        "metrics": [
+                            "faithfulness",
+                            "answer_relevancy",
+                            "context_recall",
+                            "context_precision",
+                        ],
+                        "optimization_advisor": False,
+                        "failure_mode": "skip",
+                    },
+                },
+                {
+                    "summary": "快速冒烟测试（仅 2 份文档、5 道题）",
+                    "value": {
+                        "docs_path": "datasets/siemens-pdfs",
+                        "job_name": "smoke-test",
+                        "generation_model": "qwen3.6-plus",
+                        "answer_model": "deepseek-v4-flash",
+                        "judge_model": "deepseek-v4-flash",
+                        "embedding_model": "text-embedding-v3",
+                        "max_questions_per_document": 5,
+                        "max_source_chunks_per_question": 3,
+                        "max_documents": 2,
+                        "max_samples": 10,
+                        "metrics": ["faithfulness", "answer_relevancy"],
+                        "optimization_advisor": False,
+                        "failure_mode": "skip",
+                    },
+                },
+            ]
+        }
+    )
+
+    docs_path: str = Field(
+        description="PDF 文档所在文件夹的绝对路径或相对于仓库根目录的相对路径。"
+    )
+    job_name: str = Field(
+        default="",
+        description="任务显示名称；留空时系统自动生成唯一标识。",
+    )
+    generation_model: str = Field(
+        default="qwen3.6-plus",
+        description="用于从文档片段生成草稿题库的 LLM 模型名称。",
+    )
+    answer_model: str = Field(
+        default="deepseek-v4-flash",
+        description="在线评估时调用的答题 LLM 模型名称（siemens_pdf_qa adapter）。",
+    )
+    judge_model: str = Field(
+        default="deepseek-v4-flash",
+        description="RAGAS 指标评分时使用的 Judge LLM 模型名称。",
+    )
+    embedding_model: str = Field(
+        default="text-embedding-v3",
+        description="RAGAS context-recall / context-precision 使用的 Embedding 模型名称。",
+    )
+    max_questions_per_document: int = Field(
+        default=10, gt=0,
+        description="每份 PDF 文档最多生成的草稿题目数量。",
+    )
+    max_source_chunks_per_question: int = Field(
+        default=3, gt=0,
+        description="每道题目最多引用的文档片段（source chunk）数量。",
+    )
+    max_documents: int | None = Field(
+        default=None, gt=0,
+        description="限制处理的 PDF 文件数量上限（冒烟测试时使用）。",
+    )
+    max_samples: int | None = Field(
+        default=None, gt=0,
+        description="限制评估的题目数量上限（冒烟测试时使用）。",
+    )
+    # NOTE(review): metric names are not validated on this model — confirm whether that is intended.
+    metrics: list[str] = Field(
+        default_factory=lambda: [
+            "faithfulness",
+            "answer_relevancy",
+            "context_recall",
+            "context_precision",
+        ],
+        description=(
+            "需要计算的 RAGAS 指标列表。"
+            "可选值：faithfulness, answer_relevancy, context_recall, "
+            "context_precision, noise_sensitivity, factual_correctness, semantic_similarity。"
+        ),
+    )
+    optimization_advisor: bool = Field(
+        default=False,
+        description="为 True 时启用 RAGAS 优化建议模块，生成 optimization_advice.md。",
+    )
+    # NOTE(review): the description names "skip" and "fail", but any string is accepted here.
+    failure_mode: str = Field(
+        default="skip",
+        description="PDF 解析失败时的处理策略：skip（跳过继续）或 fail（立即中止）。",
+    )
+
+
+class PipelineResult(BaseModel):
+    """Where a finished pipeline run left its artifacts, plus headline counts."""
+
+    # --- question-bank build stage ---
+    build_artifact_dir: str = Field(
+        description="题库生成阶段的产物根目录路径。",
+    )
+    dataset_csv: str = Field(
+        description="生成的草稿题库 CSV 文件路径（评估输入）。",
+    )
+    source_chunks_jsonl: str = Field(
+        description="文档片段索引文件路径（在线评估 adapter 使用）。",
+    )
+    total_questions: int = Field(
+        description="成功生成的有效题目总数。",
+    )
+    parse_failures: int = Field(
+        description="文档解析失败的 PDF 数量。",
+    )
+    # --- evaluation stage ---
+    eval_run_id: str = Field(
+        description="RAGAS 评估运行 ID。",
+    )
+    eval_output_dir: str = Field(
+        description="RAGAS 评估产物根目录路径。",
+    )
+    scores_csv: str = Field(
+        description="每道题目逐项评分的 CSV 文件路径。",
+    )
+    summary_md: str = Field(
+        description="评估结果摘要 Markdown 文件路径。",
+    )
+
+
+class PipelineJobStatus(BaseModel):
+    """Snapshot of a single end-to-end pipeline job: identity, progress, outcome."""
+
+    job_id: str = Field(
+        description="任务唯一标识符。",
+    )
+    job_name: str = Field(
+        description="任务显示名称。",
+    )
+    status: str = Field(
+        description="任务状态：queued | running | completed | failed。",
+    )
+    phase: str = Field(
+        default="idle",
+        description="当前执行阶段：idle | parsing_documents | generating_questions | evaluating | done。",
+    )
+    logs: list[str] = Field(
+        default_factory=list,
+        description="实时日志行列表。",
+    )
+    result: PipelineResult | None = Field(
+        default=None,
+        description="任务完成后填充的产物路径与统计信息。",
+    )
+    error: str | None = Field(
+        default=None,
+        description="失败时的错误信息。",
+    )
+    created_at: str = Field(
+        default="",
+        description="任务创建时间（ISO 8601 UTC）。",
+    )
+    finished_at: str = Field(
+        default="",
+        description="任务结束时间（ISO 8601 UTC）。",
+    )
+
+
+class PipelineJobResponse(BaseModel):
+    """Acknowledgement sent back as soon as a pipeline job has been queued."""
+
+    job_id: str = Field(
+        description="任务唯一标识符，用于后续轮询状态。",
+    )
+    job_name: str = Field(
+        description="任务显示名称。",
+    )
+    status: str = Field(
+        default="queued",
+        description="初始状态，通常为 queued。",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Dify 实时评分 API 模型
+# ---------------------------------------------------------------------------
+
+# Metrics that can only be computed when a ground_truth is supplied.
+_GT_DEPENDENT_METRICS: frozenset[str] = frozenset({
+    "context_recall",
+    "factual_correctness",
+    "semantic_similarity",
+    "noise_sensitivity",
+})
+
+# All valid metric names.
+_VALID_METRICS: frozenset[str] = frozenset({
+    "faithfulness",
+    "answer_relevancy",
+    "context_recall",
+    "context_precision",
+    "noise_sensitivity",
+    "factual_correctness",
+    "semantic_similarity",
+})
+
+# Metric list used by ScoreRequest when the caller does not pass ``metrics``.
+_DEFAULT_SCORE_METRICS: list[str] = [
+    "faithfulness",
+    "answer_relevancy",
+    "context_recall",
+    "context_precision",
+]
+
+
+class ScoreRequest(BaseModel):
+    """Request body for the real-time single-sample scoring endpoint.
+
+    ``question``, ``answer`` and ``contexts`` are required. ``contexts`` is a
+    single string that ``contexts_as_list()`` splits on ``context_separator``.
+    ``metrics`` must be a non-empty list of supported metric names. An empty or
+    whitespace-only ``ground_truth`` is normalized to ``None`` so that it is
+    handled exactly like an omitted one.
+    """
+
+    model_config = ConfigDict(
+        json_schema_extra={
+            "examples": [
+                {
+                    "summary": "基础评分请求",
+                    "value": {
+                        "question": "双源CT的时间分辨率是多少?",
+                        "answer": "双源CT的单扇区时间分辨率为75ms。",
+                        "contexts": "双源CT采用两套管-探测器系统 |||| 单扇区采集旋转135度",
+                        "ground_truth": "双源CT单扇区时间分辨率为75ms，需旋转135度。",
+                        "context_separator": " |||| ",
+                        "metrics": [
+                            "faithfulness",
+                            "answer_relevancy",
+                            "context_recall",
+                            "context_precision",
+                        ],
+                        "judge_model": "deepseek-v4-flash",
+                        "embedding_model": "text-embedding-v3",
+                    },
+                }
+            ]
+        }
+    )
+
+    question: str = Field(description="问题文本。")
+    answer: str = Field(description="待评分的回答。")
+    contexts: str = Field(
+        description="检索上下文字符串，多段之间用 context_separator 拼接。"
+    )
+    ground_truth: str | None = Field(
+        default=None,
+        description="标准参考答案（可选）。缺失时自动跳过需要它的指标。",
+    )
+    context_separator: str = Field(
+        default=" |||| ",
+        description="contexts 字段中段落分隔符，默认为四个竖线两侧各一空格。",
+    )
+    metrics: list[str] = Field(
+        default_factory=lambda: list(_DEFAULT_SCORE_METRICS),
+        description="需要计算的 RAGAS 指标列表。",
+    )
+    judge_model: str | None = Field(
+        default=None,
+        description="Judge LLM 模型名称；为 null 时使用 .env 中的 RAGAS_JUDGE_MODEL。",
+    )
+    embedding_model: str | None = Field(
+        default=None,
+        description="Embedding 模型名称；为 null 时使用 .env 中的 RAGAS_EMBEDDING_MODEL。",
+    )
+
+    @field_validator("ground_truth")
+    @classmethod
+    def blank_ground_truth_is_missing(cls, value: str | None) -> str | None:
+        """Normalize an empty or whitespace-only ground_truth to ``None``.
+
+        Without this, a client that sends ``""`` for an unset reference answer
+        would have ground-truth-dependent metrics computed against an empty
+        reference instead of having them skipped. Non-blank values are
+        returned unchanged.
+        """
+        if value is not None and not value.strip():
+            return None
+        return value
+
+    @field_validator("metrics")
+    @classmethod
+    def validate_metric_names(cls, value: list[str]) -> list[str]:
+        """Reject any metric name not in the supported registry.
+
+        Raises:
+            ValueError: when a name is not in ``_VALID_METRICS`` or the list is empty.
+        """
+        invalid = [metric_name for metric_name in value if metric_name not in _VALID_METRICS]
+        if invalid:
+            raise ValueError(
+                f"不支持的指标名称：{invalid}。"
+                f"合法值：{sorted(_VALID_METRICS)}"
+            )
+        if not value:
+            raise ValueError("metrics 不能为空列表。")
+        return value
+
+    def contexts_as_list(self) -> list[str]:
+        """Split the contexts string into a list of non-empty fragments.
+
+        Each fragment is stripped of surrounding whitespace, and fragments that
+        are empty after stripping are dropped. An empty ``context_separator``
+        falls back to the default ``" |||| "``.
+        """
+        separator = self.context_separator or " |||| "
+        return [part.strip() for part in self.contexts.split(separator) if part.strip()]
+
+    def effective_metrics(self) -> list[str]:
+        """Return metrics filtered to exclude GT-dependent ones when ground_truth is absent.
+
+        The requested order is preserved; a new list is returned in both cases.
+        """
+        if self.ground_truth is not None:
+            return list(self.metrics)
+        return [metric_name for metric_name in self.metrics if metric_name not in _GT_DEPENDENT_METRICS]
+
+
+class ScoreResponse(BaseModel):
+    """Response payload for the real-time scoring endpoint.
+
+    ``scores`` and ``latency_ms`` are required; ``weighted_score`` and
+    ``error`` default to ``None`` and ``skipped_metrics`` to an empty list.
+    """
+
+    # Per-metric score; a value may be None.
+    scores: dict[str, float | None] = Field(
+        description="各指标得分（NaN 或计算失败时为 null）。"
+    )
+    # Aggregate score across the metrics, or None when none is available.
+    weighted_score: float | None = Field(
+        default=None,
+        description="等权加权综合得分（仅对非 null 指标求均值）。",
+    )
+    # Server-side scoring time in milliseconds.
+    latency_ms: int = Field(description="服务端打分耗时（毫秒）。")
+    # Names of requested metrics that were not computed.
+    skipped_metrics: list[str] = Field(
+        default_factory=list,
+        description="因缺少 ground_truth 而跳过的指标名称列表。",
+    )
+    # Error text when scoring failed; None otherwise.
+    error: str | None = Field(
+        default=None,
+        description="打分异常时的错误信息（HTTP 200 仍返回，scores 为空）。",
+    )
--- a/webapp/server.py
+++ b/webapp/server.py
@@ -13,23 +13,95 @@ from fastapi import FastAPI
 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles

-from webapp.api import evaluations, llm_profiles, runs, scenarios
+from webapp.api import evaluations, llm_profiles, pipeline, runs, scenarios, score

 STATIC_DIR = Path(__file__).resolve().parent / "static"

+# OpenAPI tag metadata — controls the grouping and descriptions in /docs.
+OPENAPI_TAGS = [
+    {
+        "name": "pipeline",
+        "description": (
+            "**全链路评估 Pipeline API**\n\n"
+            "一次调用完成「解析文档 → 生成题库 → RAGAS 评估 → 输出报告」全流程。\n\n"
+            "**使用流程**\n"
+            "1. `POST /api/pipeline/jobs` 提交任务，立即拿到 `job_id`。\n"
+            "2. `GET /api/pipeline/jobs/{job_id}` 轮询 `status` / `phase` / `logs`。\n"
+            "3. 当 `status=completed` 时，`result` 字段包含所有产物路径。\n\n"
+            "**Pipeline 阶段**\n"
+            "| phase | 说明 |\n"
+            "|-------|------|\n"
+            "| `parsing_documents` | 调用阿里云 DocMind 解析每份 PDF |\n"
+            "| `generating_questions` | LLM 从文档片段生成草稿题库 |\n"
+            "| `evaluating` | RAGAS 在线评测打分 |\n"
+            "| `done` | 所有产物写入磁盘，任务完成 |"
+        ),
+    },
+    {
+        "name": "evaluations",
+        "description": (
+            "**单场景评估 API**\n\n"
+            "基于已有 YAML 场景文件触发评估任务，并查询任务状态与日志。"
+        ),
+    },
+    {
+        "name": "llm-profiles",
+        "description": (
+            "**LLM 配置管理 API**\n\n"
+            "增删改查已保存的 LLM 连接配置（模型名称、Base URL、API Key）；"
+            "支持连通性测试；可将配置一键写入场景 YAML 文件。"
+        ),
+    },
+    {
+        "name": "runs",
+        "description": "**评估运行列表 API**\n\n查询历史评估运行记录及详细报告数据。",
+    },
+    {
+        "name": "scenarios",
+        "description": "**场景文件 API**\n\n扫描并列出 `scenarios/` 目录下所有可用的 YAML 场景文件。",
+    },
+    {
+        "name": "score",
+        "description": (
+            "**实时评分 API（Dify 外部 Tool）**\n\n"
+            "接受单条问答记录 `(question, answer, contexts, ground_truth)`，\n"
+            "同步运行 RAGAS 指标打分，返回各指标得分和加权综合得分。\n\n"
+            "适用场景：Dify Agent 在回答后即时调用，用于质量监控或自我改进。\n\n"
+            "**鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需携带 "
+            "`Authorization: Bearer <token>` 请求头。"
+        ),
+    },
+    {
+        "name": "meta",
+        "description": "**系统 API**\n\n健康检查等基础接口。",
+    },
+]
+

 def create_app() -> FastAPI:
    """Build and configure the FastAPI application instance."""
    app = FastAPI(
-        title="Siemens RAGAS 评估控制台",
-        description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
-        version="0.1.0",
+        title="RAGAS 评估系统",
+        description=(
+            "西门子医疗影像 RAG 评估平台 API 文档。\n\n"
+            "提供以下能力：\n"
+            "- **Pipeline API** — 一键完成「解析文档 → 生成题库 → RAGAS 评估」全链路\n"
+            "- **实时评分 API** — 供 Dify 外部 Tool 调用的单题 RAGAS 评分接口\n"
+            "- **评估 API** — 基于 YAML 场景文件触发单次评估\n"
+            "- **LLM 配置 API** — 管理多个 LLM 连接配置，支持连通性测试\n"
+            "- **报告 API** — 查询历史运行记录与评估报告\n\n"
+            "> **快速开始**：调用 `POST /api/pipeline/jobs` 传入 PDF 文件夹路径即可启动完整评估流程。"
+        ),
+        version="0.2.0",
+        openapi_tags=OPENAPI_TAGS,
    )

    app.include_router(runs.router)
    app.include_router(scenarios.router)
    app.include_router(evaluations.router)
    app.include_router(llm_profiles.router)
+    app.include_router(pipeline.router)
+    app.include_router(score.router)

    @app.get("/api/health", tags=["meta"])
    def health() -> dict[str, str]:
--- a/webapp/services/inline_scorer.py
+++ b/webapp/services/inline_scorer.py
@@ -0,0 +1,109 @@
+"""LLM-cached inline RAGAS scorer for the real-time /api/score endpoint.
+
+A module-level InlineScorer singleton caches (llm, embeddings) pairs keyed by
+(judge_model, embedding_model), so repeated Dify Tool calls with the same
+models reuse existing AsyncOpenAI connections instead of creating new ones.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import math
+import threading
+from typing import Any
+
+from rag_eval.compat import ensure_ragas_import_compat
+from rag_eval.metrics.factory import build_models
+from rag_eval.metrics.pipeline import MetricPipeline
+from rag_eval.settings import EvaluationSettings
+from rag_eval.shared.models import NormalizedSample
+
+ensure_ragas_import_compat()
+
+from ragas.metrics.collections import (  # noqa: E402
+    AnswerRelevancy,
+    ContextPrecision,
+    ContextRecall,
+    FactualCorrectness,
+    Faithfulness,
+    NoiseSensitivity,
+    SemanticSimilarity,
+)
+
+
+def _build_metric_instances(metrics: list[str], llm: Any, embeddings: Any) -> dict[str, Any]:
+    """Build only the requested RAGAS metrics (lazily); names not in the registry are skipped."""
+    factories: dict[str, Any] = {
+        "faithfulness": lambda: Faithfulness(llm=llm),
+        "answer_relevancy": lambda: AnswerRelevancy(llm=llm, embeddings=embeddings),
+        "context_recall": lambda: ContextRecall(llm=llm),
+        "context_precision": lambda: ContextPrecision(llm=llm),
+        "noise_sensitivity": lambda: NoiseSensitivity(llm=llm),
+        "factual_correctness": lambda: FactualCorrectness(llm=llm),
+        "semantic_similarity": lambda: SemanticSimilarity(embeddings=embeddings),
+    }
+    return {name: factories[name]() for name in metrics if name in factories}
+
+
+class InlineScorer:
+    """Thread-safe single-sample RAGAS scorer with LLM client caching."""
+
+    def __init__(self) -> None:
+        """Initialize the scorer cache and synchronization primitives."""
+        # Cache keyed by (judge_model, embedding_model) -> (llm, embeddings)
+        self._model_cache: dict[tuple[str, str], tuple[Any, Any]] = {}
+        self._lock = threading.Lock()
+
+    def _get_models(
+        self,
+        judge_model: str,
+        embedding_model: str,
+        settings: EvaluationSettings,
+    ) -> tuple[Any, Any]:
+        """Return cached clients, building them on first use (``settings`` is not part of the key)."""
+        cache_key = (judge_model, embedding_model)
+        with self._lock:
+            if cache_key not in self._model_cache:
+                llm, embeddings = build_models(judge_model, embedding_model, settings)
+                self._model_cache[cache_key] = (llm, embeddings)
+            return self._model_cache[cache_key]
+
+    def score(
+        self,
+        question: str,
+        answer: str,
+        contexts: list[str],
+        ground_truth: str | None,
+        metrics: list[str],
+        judge_model: str,
+        embedding_model: str,
+        settings: EvaluationSettings,
+    ) -> dict[str, float | None]:
+        """Score one sample synchronously and return {metric_name: score rounded to 4 places | None}."""
+        llm, embeddings = self._get_models(judge_model, embedding_model, settings)
+        metric_instances = _build_metric_instances(metrics, llm, embeddings)
+
+        pipeline = MetricPipeline(
+            metrics=metric_instances,
+            metric_timeout_seconds=settings.ragas_metric_timeout_seconds,
+        )
+
+        sample = NormalizedSample(
+            sample_id="inline-score",
+            question=question,
+            answer=answer,
+            contexts=contexts,
+            ground_truth=ground_truth or "",
+        )
+
+        # asyncio.run() creates a new event loop; call this only from a thread with no running loop.
+        metric_score = asyncio.run(pipeline.score_sample(sample))
+
+        # Missing (None) and non-finite (NaN/Inf) values all become None so the result is JSON-safe.
+        return {
+            name: (round(value, 4) if value is not None and math.isfinite(value) else None)
+            for name, value in metric_score.metrics.items()
+        }
+
+
+# Module-level singleton shared by FastAPI routes.
+inline_scorer = InlineScorer()
--- a/webapp/services/report_builder.py
+++ b/webapp/services/report_builder.py
@@ -13,6 +13,11 @@ from pathlib import Path

 import pandas as pd

+from rag_eval.metrics.weights import (
+    compute_overall_weighted_score_mean,
+    weighted_metric_means as _weighted_metric_means,
+)
+from webapp.services.run_reader import _read_weights_from_snapshot
 from webapp.services.text_utils import parse_contexts
 from webapp.models import (
    DistributionBin,
@@ -42,17 +47,6 @@ def _round_or_none(value: float | None) -> float | None:
    return round(float(value), 4)


-def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
-    """Compute the mean of each metric column across all scored samples."""
-    means: dict[str, float | None] = {}
-    for metric in metrics:
-        if metric in frame.columns:
-            means[metric] = _round_or_none(frame[metric].mean(numeric_only=True))
-        else:
-            means[metric] = None
-    return means
-
-
 def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
    """Bucket one metric's scores into fixed-width [0,1] histogram bins."""
    bins: list[DistributionBin] = []
@@ -165,6 +159,7 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    frame = run_reader.read_scores_frame(run_dir)
    summary_markdown = run_reader.read_summary_markdown(run_dir)
    advice_markdown = run_reader.read_advice_markdown(run_dir)
+    metric_weights, doc_weights = _read_weights_from_snapshot(run_dir)

    if frame.empty or not metrics:
        return ReportData(
@@ -172,8 +167,20 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
            metric_means={metric: None for metric in metrics},
            summary_markdown=summary_markdown,
            advice_markdown=advice_markdown,
+            metric_weights=metric_weights,
+            doc_weights=doc_weights,
        )

+    score_rows_list = frame.to_dict(orient="records")
+
+    # Use weighted metric means (degrades to arithmetic mean when weights are empty).
+    w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights)
+    rounded_means = {metric: _round_or_none(value) for metric, value in w_means.items()}
+
+    overall_ws = compute_overall_weighted_score_mean(
+        score_rows_list, metric_weights, doc_weights
+    )
+
    distributions = {
        metric: _distribution(frame, metric)
        for metric in metrics
@@ -182,10 +189,13 @@ def build_report(run_dir: Path, metrics: list[str]) -> ReportData:

    return ReportData(
        metrics=metrics,
-        metric_means=_metric_means(frame, metrics),
+        metric_means=rounded_means,
        distributions=distributions,
        groupings=_groupings(frame, metrics),
        lowest_samples=_lowest_samples(frame, metrics),
        summary_markdown=summary_markdown,
        advice_markdown=advice_markdown,
+        weighted_score_mean=_round_or_none(overall_ws),
+        metric_weights=metric_weights,
+        doc_weights=doc_weights,
    )
--- a/webapp/services/run_reader.py
+++ b/webapp/services/run_reader.py
@@ -64,6 +64,27 @@ def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
    return []


+def _read_weights_from_snapshot(run_dir: Path) -> tuple[dict[str, float], dict[str, float]]:
+    """Read metric_weights and doc_weights from a scenario snapshot if present.
+
+    Returns (metric_weights, doc_weights); each is {} when absent, unreadable, or not a mapping.
+    """
+    snapshot = run_dir / "scenario.snapshot.yaml"
+    if not snapshot.is_file():
+        return {}, {}
+    try:
+        payload = yaml.safe_load(snapshot.read_text(encoding="utf-8")) or {}
+    except (OSError, yaml.YAMLError):
+        return {}, {}
+    root = payload if isinstance(payload, dict) else {}  # top-level list/scalar carries no weights
+    sections = [root.get(key) for key in ("metric_weights", "doc_weights")]
+    mw, dw = (section if isinstance(section, dict) else {} for section in sections)
+    return (
+        {str(k): float(v) for k, v in mw.items() if isinstance(v, (int, float))},
+        {str(k): float(v) for k, v in dw.items() if isinstance(v, (int, float))},
+    )
+
+
 def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
    """Find every run directory (one that contains metadata.json) under the roots."""
    run_dirs: list[Path] = []
@@ -159,6 +180,8 @@ NON_METRIC_COLUMNS = {
    "source_chunk_ids",
    "review_status",
    "review_notes",
+    "weighted_score",
+    "sample_weight",
 }


--- a/webapp/services/scenario_scanner.py
+++ b/webapp/services/scenario_scanner.py
@@ -37,6 +37,16 @@ def _summarize_scenario(path: Path) -> ScenarioInfo:

    metrics = payload.get("metrics")
    metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []
+    raw_metric_weights = payload.get("metric_weights") or {}
+    raw_doc_weights = payload.get("doc_weights") or {}
+    metric_weights = {
+        str(k): float(v) for k, v in raw_metric_weights.items()
+        if isinstance(v, (int, float))
+    }
+    doc_weights = {
+        str(k): float(v) for k, v in raw_doc_weights.items()
+        if isinstance(v, (int, float))
+    }

    return ScenarioInfo(
        path=relative,
@@ -45,6 +55,8 @@ def _summarize_scenario(path: Path) -> ScenarioInfo:
        dataset=str(payload.get("dataset", "")),
        judge_model=str(payload.get("judge_model", "")),
        metrics=metric_list,
+        metric_weights=metric_weights,
+        doc_weights=doc_weights,
    )


--- a/webapp/services/yaml_patcher.py
+++ b/webapp/services/yaml_patcher.py
@@ -32,9 +32,11 @@ def apply_profiles_to_scenario(
    judge_profile: LLMProfile | None,
    answer_profile: LLMProfile | None,
    dataset_profile: LLMProfile | None,
+    metric_weights: dict[str, float] | None = None,
+    doc_weights: dict[str, float] | None = None,
    _resolve_absolute: bool = False,
 ) -> list[str]:
-    """Patch the YAML file at *scenario_path* with the supplied profiles.
+    """Patch the YAML file at *scenario_path* with the supplied profiles and weights.

    Returns a list of dotted field names that were actually patched.
    Setting *_resolve_absolute=True* skips repo-root resolution (used in tests).
@@ -67,6 +69,14 @@ def apply_profiles_to_scenario(
            generation["model"] = dataset_profile.model
            patched.append("generation.model")

+    if metric_weights is not None:
+        data["metric_weights"] = dict(metric_weights)
+        patched.append("metric_weights")
+
+    if doc_weights is not None:
+        data["doc_weights"] = dict(doc_weights)
+        patched.append("doc_weights")
+
    resolved.write_text(
        yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False),
        encoding="utf-8",
--- a/webapp/static/css/app.css
+++ b/webapp/static/css/app.css
@@ -308,6 +308,203 @@ table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: t
 .llm-role-label { font-size: 13px; font-weight: 600; min-width: 180px; color: var(--ink); }
 .llm-role-select { min-width: 240px; }

+/* ---------- API docs iframe: flex layout only while shown, so an id-level display cannot override [hidden] ---------- */
+#view-apidocs:not([hidden]) { padding: 0; display: flex; flex-direction: column; flex: 1; }
+.apidocs-frame {
+  flex: 1;
+  width: 100%;
+  height: calc(100vh - 64px);
+  border: none;
+}
+
+.report-actions {
+  display: flex; justify-content: flex-end; margin: 0 0 12px;
+}
+.btn-export-pdf {
+  font-size: 13px; display: flex; align-items: center; gap: 6px;
+}
+
+/* ---------- 报告历史切换下拉 ---------- */
+.report-switcher {
+  display: flex; align-items: center; gap: 10px;
+  background: var(--surface); border: 1px solid var(--line);
+  border-radius: var(--radius); padding: 10px 16px;
+  margin-bottom: 14px; box-shadow: var(--shadow);
+}
+.report-switcher-label {
+  font-size: 13px; font-weight: 600; color: var(--slate); white-space: nowrap;
+}
+.report-switcher-select {
+  flex: 1; min-width: 0;
+  border: 1px solid var(--line); border-radius: 6px; padding: 6px 10px;
+  font-size: 13px; font-family: inherit; background: var(--bg); color: var(--ink);
+  cursor: pointer;
+}
+.report-switcher-select:focus { outline: none; border-color: var(--petrol); }
+
+/* ---------- Weight configuration panel ---------- */
+/* --text / --border fall back to --ink / --line (the tokens used by the rest of this sheet) if undefined. */
+.weight-config-panel { margin-top: 12px; }
+.weight-section-title { font-size: 13px; font-weight: 600; color: var(--text, var(--ink)); margin-bottom: 8px; }
+.weight-rows { display: flex; flex-direction: column; gap: 6px; }
+.weight-row {
+  display: flex; align-items: center; gap: 10px; font-size: 13px;
+}
+.weight-row-label { min-width: 180px; color: var(--slate); font-family: monospace; }
+.weight-row-input {
+  width: 80px; padding: 4px 8px; border: 1px solid var(--border, var(--line));
+  border-radius: 6px; font-size: 13px; text-align: right;
+}
+.weight-row-input:focus { outline: none; border-color: #6366f1; }
+.doc-weight-name {
+  flex: 1; padding: 4px 8px; border: 1px solid var(--border, var(--line));
+  border-radius: 6px; font-size: 13px; min-width: 0;
+}
+.weight-row-remove { color: var(--bad); cursor: pointer; font-size: 14px; background: none; border: none; padding: 2px 6px; }
+.weight-row-remove:hover { background: #fee2e2; border-radius: 4px; }
+
+/* Highlighted card for the overall weighted_score */
+.metric-card.weighted-score-card {
+  border: 2px solid #6366f1;
+  background: #f5f3ff;
+}
+.metric-card.weighted-score-card .metric-name { color: #4f46e5; font-weight: 700; }
+
+/* ================================================================
+   打印样式（导出 PDF 用）
+   浏览器打印时隐藏 UI chrome，保留报告内容，图表 canvas 原样输出
+   ================================================================ */
+@media print {
+  /* ── 页面尺寸与边距 ── */
+  @page {
+    size: A4 portrait;
+    margin: 18mm 16mm 18mm 16mm;
+  }
+
+  /* ── Hide everything that is not report content, including every non-report view ── */
+  .sidebar,
+  .topbar,
+  .report-actions,
+  .no-print,
+  #dist-metric-select,
+  .grouping-tabs,
+  #view-runs, #view-new,
+  #view-profiles,
+  #view-apidocs { display: none !important; }
+
+  /* ── 全局基础 ── */
+  body {
+    font-size: 11pt;
+    line-height: 1.5;
+    color: #0f1b2d;
+    background: #fff;
+  }
+
+  /* ── 布局重置：main 全宽 ── */
+  .app { display: block; }
+  .main { display: block; width: 100%; }
+  .view { padding: 0; display: block !important; }
+  #view-report { display: block !important; }
+
+  /* ── 报告内容 ── */
+  #report-content { display: block !important; }
+  #report-empty { display: none !important; }
+
+  /* ── 元信息条 ── */
+  .report-meta {
+    display: flex;
+    justify-content: space-between;
+    border-bottom: 2px solid #009999;
+    padding-bottom: 8pt;
+    margin-bottom: 14pt;
+  }
+  .report-meta-title { font-size: 14pt; font-weight: 700; }
+  .report-meta-info { font-size: 9pt; color: #64748b; }
+
+  /* ── Section 标签 ── */
+  .section-label {
+    font-size: 9pt;
+    font-weight: 700;
+    letter-spacing: 0.5px;
+    color: #64748b;
+    text-transform: uppercase;
+    margin: 14pt 0 6pt;
+    border-bottom: 1px solid #e2e8f0;
+    padding-bottom: 3pt;
+    break-after: avoid;
+  }
+
+  /* ── ① 指标均值卡片 ── */
+  .metric-cards {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(90pt, 1fr));
+    gap: 8pt;
+    margin-bottom: 12pt;
+  }
+  .metric-card {
+    border: 1px solid #e2e8f0;
+    border-radius: 6pt;
+    padding: 10pt 8pt;
+    text-align: center;
+    break-inside: avoid;
+  }
+  .metric-value { font-size: 20pt; font-weight: 700; }
+  .metric-name  { font-size: 8pt; color: #64748b; margin-top: 2pt; }
+
+  /* ── ② 分布 + ③ 分组：打印时改为纵向排列 ── */
+  .report-row {
+    display: block;
+  }
+  .report-half {
+    margin-bottom: 12pt;
+    break-inside: avoid;
+  }
+  #dist-chart {
+    max-height: 160pt;
+    width: 100% !important;
+  }
+
+  /* ── 面板统一 ── */
+  .panel {
+    border: 1px solid #e2e8f0;
+    border-radius: 6pt;
+    padding: 10pt 12pt;
+    margin-bottom: 10pt;
+    break-inside: avoid;
+    box-shadow: none;
+  }
+  .panel h2 { font-size: 12pt; margin-bottom: 4pt; }
+
+  /* ── ④ Lowest-scoring samples: force every detail row open when printing ── */
+  .lowest-detail { display: block !important; }
+  .lowest-row { break-inside: avoid; }
+  .lowest-detail-inner { padding: 8pt 0; font-size: 10pt; }
+  .detail-label { font-size: 8pt; font-weight: 700; color: #64748b; margin-bottom: 2pt; }
+  .detail-context .ctx-item { border-bottom: 1px dashed #e2e8f0; padding: 2pt 0; font-size: 9pt; }
+
+  /* ── ⑤ 优化建议 ── */
+  #advice-section { display: block !important; }
+  .advice-panel { border: 1px solid #e2e8f0; border-radius: 6pt; padding: 10pt 12pt; }
+  .advice-md h2 { font-size: 12pt; margin-top: 10pt; }
+  .advice-md h3 { font-size: 11pt; }
+  .advice-md ul { margin: 4pt 0 4pt 16pt; }
+  .advice-md li { margin-bottom: 3pt; }
+
+  /* ── 分组表 ── */
+  table.group-table { width: 100%; font-size: 9pt; border-collapse: collapse; }
+  table.group-table th,
+  table.group-table td { padding: 4pt 6pt; border-bottom: 1px solid #e2e8f0; }
+  table.group-table th { font-weight: 700; color: #64748b; }
+
+  /* ── 颜色保留（部分浏览器打印默认去色） ── */
+  .good  { color: #16a34a !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
+  .warn  { color: #eab308 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
+  .bad   { color: #dc2626 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
+  .score-badge.good { background: #dcfce7 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
+  .score-badge.warn { background: #fef9c3 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
+  .score-badge.bad  { background: #fee2e2 !important; -webkit-print-color-adjust: exact; print-color-adjust: exact; }
+}
+
 /* ---------- ⑤ 优化建议面板 ---------- */
 .advice-panel { border-left: 3px solid #7c3aed; }
 .advice-header {
--- a/webapp/static/index.html
+++ b/webapp/static/index.html
@@ -3,7 +3,7 @@
 <head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-  <title>Siemens RAGAS 评估控制台</title>
+  <title>RAGAS 评估控制台</title>
  <link rel="stylesheet" href="/static/css/app.css" />
  <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
 </head>
@@ -28,6 +28,9 @@
        <button class="nav-item" data-view="profiles">
          <span class="nav-ico">⚙</span><span>LLM 配置</span>
        </button>
+        <button class="nav-item" data-view="apidocs">
+          <span class="nav-ico">⎔</span><span>API 文档</span>
+        </button>
      </nav>
      <div class="sidebar-foot">
        <span class="dot" id="health-dot"></span>
@@ -89,6 +92,22 @@
          </div>
        </div>

+        <!-- Weight configuration panel (revealed by Runner once a scenario is selected) -->
+        <div class="panel weight-config-panel" id="weight-config-panel" hidden>
+          <h2>权重配置 <span class="muted" style="font-size:13px;font-weight:400">用于计算综合加权得分</span></h2>
+
+          <div class="weight-section">
+            <div class="weight-section-title">指标权重 <span class="muted" style="font-size:12px">各指标默认权重为 1.0</span></div>
+            <div id="metric-weight-rows" class="weight-rows"></div>
+          </div>
+
+          <div class="weight-section" style="margin-top:16px">
+            <div class="weight-section-title">文档权重 <span class="muted" style="font-size:12px">输入 PDF 文档名称并设置对应权重</span></div>
+            <div id="doc-weight-rows" class="weight-rows"></div>
+            <button class="btn btn-sm" id="add-doc-weight-btn" style="margin-top:8px">＋ 添加文档权重</button>
+          </div>
+        </div>
+
        <div class="panel" id="task-panel" hidden>
          <div class="task-head">
            <h2>评估进度</h2>
@@ -103,12 +122,25 @@

      <!-- 报告详情视图 -->
      <section class="view" id="view-report" hidden>
+        <!-- 历史报告切换下拉（顶部，始终可见） -->
+        <div class="report-switcher no-print" id="report-switcher">
+          <label class="report-switcher-label">切换报告</label>
+          <select class="select report-switcher-select" id="report-switcher-select">
+            <option value="">— 加载中… —</option>
+          </select>
+        </div>
+
        <div class="empty" id="report-empty">
          <p>请先从「运行列表」选择一次运行。</p>
        </div>
        <div id="report-content" hidden>
          <!-- 顶部元信息条 -->
          <div class="report-meta" id="report-meta"></div>
+          <div class="report-actions no-print">
+            <button class="btn btn-ghost btn-export-pdf" id="export-pdf-btn" onclick="Report.exportPdf()">
+              📄 导出 PDF
+            </button>
+          </div>

          <!-- ① 指标均值卡片 -->
          <div class="section-label">① 指标均值 OVERVIEW</div>
@@ -199,6 +231,17 @@
          <p class="muted">点击「新建配置」添加第一个。</p>
        </div>
      </section>
+
+      <!-- API 文档视图 -->
+      <section class="view" id="view-apidocs" hidden>
+        <iframe
+          id="apidocs-frame"
+          src="/docs"
+          class="apidocs-frame"
+          title="API 文档"
+          allowfullscreen>
+        </iframe>
+      </section>
    </main>
  </div>

--- a/webapp/static/js/app.js
+++ b/webapp/static/js/app.js
@@ -5,8 +5,8 @@
 const App = {
  currentRunId: null,
  activeView: null,
-  views: ["runs", "new", "report", "profiles"],
-  titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置" },
+  views: ["runs", "new", "report", "profiles", "apidocs"],
+  titles: { runs: "运行列表", new: "新建评估", report: "报告详情", profiles: "LLM 配置", apidocs: "API 文档" },

  // 初始化：绑定导航、从 URL/sessionStorage 恢复上次位置、启动健康检查。
  init() {
--- a/webapp/static/js/report.js
+++ b/webapp/static/js/report.js
@@ -4,11 +4,16 @@ const Report = {
  distChart: null,
  currentDetail: null,
  activeGrouping: null,
+  _switcherLoaded: false,

  // 加载并渲染指定运行的完整报告。
  async render(runId) {
    const empty = document.getElementById("report-empty");
    const content = document.getElementById("report-content");
+
+    // 加载历史报告下拉（仅首次）
+    Report._loadSwitcher(runId);
+
    if (!runId) {
      empty.hidden = false;
      content.hidden = true;
@@ -28,6 +33,10 @@ const Report = {
      Report.renderLowest(detail.report);
      Report.renderAdvice(detail.summary, detail.report);
      content.style.opacity = "1";
+
+      // 同步下拉选中项
+      const sel = document.getElementById("report-switcher-select");
+      if (sel) sel.value = runId;
    } catch (err) {
      empty.hidden = false;
      content.hidden = true;
@@ -35,6 +44,55 @@ const Report = {
    }
  },

+  // Populate the report-history <select>; the run list is requested until one load succeeds.
+  async _loadSwitcher(currentRunId) {
+    const sel = document.getElementById("report-switcher-select");
+    if (!sel) return;
+
+    // Already populated: only sync the selected value, do not request again.
+    if (Report._switcherLoaded) {
+      if (currentRunId) sel.value = currentRunId;
+      return;
+    }
+
+    try {
+      const data = await API.runs();
+      const runs = data.runs || [];
+      sel.innerHTML = "";
+      if (runs.length === 0) {
+        sel.innerHTML = '<option value="">（无历史运行）</option>';
+        return;
+      }
+      runs.forEach((run) => {
+        const opt = document.createElement("option");
+        opt.value = run.run_id;
+        const timeStr = App.shortTime(run.finished_at);
+        const meanText = run.metric_means
+          ? Object.entries(run.metric_means)
+              .filter(([, v]) => v !== null && v !== undefined)
+              .slice(0, 2)
+              .map(([k, v]) => `${App.shortMetric(k)}=${v.toFixed(2)}`)
+              .join(" ")
+          : "";
+        opt.textContent = `${run.scenario_name || run.run_id}  ${timeStr}${meanText ? "  [" + meanText + "]" : ""}`;
+        sel.appendChild(opt);
+      });
+      Report._switcherLoaded = true;
+      if (currentRunId) sel.value = currentRunId;
+    } catch (_e) {
+      sel.innerHTML = '<option value="">（加载失败）</option>';
+    }
+
+    // Assign onchange (idempotent): a retry after a failed load must not stack duplicate handlers.
+    sel.onchange = () => {
+      const rid = sel.value;
+      if (!rid) return;
+      App.currentRunId = rid;
+      App.enableReportNav();
+      Report.render(rid);
+    };
+  },
+
  // 顶部元信息条。
  renderMeta(summary) {
    const el = document.getElementById("report-meta");
@@ -69,6 +127,18 @@ const Report = {
      `;
      wrap.appendChild(card);
    });
+
+    // 综合加权得分卡片
+    const wsValue = (report && report.weighted_score_mean !== undefined) ? report.weighted_score_mean : null;
+    const wsCard = document.createElement("div");
+    wsCard.className = "metric-card weighted-score-card";
+    const wsCls = App.scoreClass(wsValue);
+    const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2);
+    wsCard.innerHTML = `
+      <div class="metric-value ${wsCls}">${wsText}</div>
+      <div class="metric-name">综合加权得分</div>
+    `;
+    wrap.appendChild(wsCard);
  },

  // ② 分数分布直方图（可切换指标）。
@@ -286,4 +356,22 @@ const Report = {

    body.innerHTML = `<div class="advice-md">${html}</div>`;
  },
+
+  // Export to PDF: reveal every lowest-sample detail row, print, then restore the rows.
+  exportPdf() {
+    // 1. Snapshot each detail row's hidden flag, then un-hide all of them.
+    const rows = Array.from(document.querySelectorAll("#lowest-table .lowest-detail"));
+    const hiddenBefore = rows.map((row) => row.hidden);
+    for (const row of rows) row.hidden = false;
+
+    // 2. After printing, put each row back the way it was.
+    //    { once: true } drops the listener after its first call.
+    const putBack = () => {
+      rows.forEach((row, idx) => { row.hidden = hiddenBefore[idx]; });
+    };
+    window.addEventListener("afterprint", putBack, { once: true });
+
+    // 3. Open the browser print dialog (the user picks "Save as PDF" there).
+    window.print();
+  },
 };
--- a/webapp/static/js/runner.js
+++ b/webapp/static/js/runner.js
@@ -1,11 +1,11 @@
-// runner.js — 新建评估视图：列出场景、LLM角色配置、触发评估、轮询任务状态与日志。
+// runner.js — 新建评估视图：列出场景、LLM角色配置、权重配置、触发评估、轮询任务状态。

 const Runner = {
  selectedScenario: null,
+  selectedScenarioInfo: null,
  pollTimer: null,
  lastRunId: null,

-  // 绑定运行按钮。
  init() {
    document.getElementById("run-btn").addEventListener("click", () => Runner.trigger());
    document.getElementById("view-report-btn").addEventListener("click", () => {
@@ -14,9 +14,9 @@ const Runner = {
        App.navigate("report", Runner.lastRunId);
      }
    });
+    document.getElementById("add-doc-weight-btn").addEventListener("click", () => Runner._addDocWeightRow());
  },

-  // 加载并渲染可触发的场景列表。
  async loadScenarios() {
    const list = document.getElementById("scenario-list");
    list.innerHTML = '<p class="muted">加载中…</p>';
@@ -32,17 +32,14 @@ const Runner = {
    } catch (err) {
      list.innerHTML = `<p class="muted">加载失败：${App.escape(err.message)}</p>`;
    }
-    // 同时加载 profiles 供角色选择
    Runner._populateProfileSelects();
  },

-  // 填充三个角色下拉框
  async _populateProfileSelects() {
    const cached = Profiles.getAll();
    const profiles = cached.length > 0
      ? cached
      : (await API.profiles().catch(() => ({ profiles: [] }))).profiles;
-
    ["role-judge", "role-answer", "role-dataset"].forEach(id => {
      const sel = document.getElementById(id);
      sel.innerHTML = '<option value="">— 使用场景原始配置 —</option>';
@@ -55,17 +52,14 @@ const Runner = {
    });
  },

-  // 构造单个场景条目。
  renderScenarioItem(sc) {
    const item = document.createElement("div");
    const invalid = !!sc.error;
    item.className = "scenario-item" + (invalid ? " invalid" : "");
-
    const modeTag = sc.mode
      ? `<span class="tag mode-${App.escape(sc.mode)}">${App.escape(sc.mode)}</span>`
      : "";
    const metricCount = (sc.metrics || []).length;
-
    item.innerHTML = `
      <div>
        <div class="scenario-name">${App.escape(sc.scenario_name || sc.path)}</div>
@@ -77,27 +71,94 @@ const Runner = {
        <span class="tag">${metricCount} 指标</span>
      </div>
    `;
-
    if (!invalid) {
      item.addEventListener("click", () => {
        document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected"));
        item.classList.add("selected");
        Runner.selectedScenario = sc.path;
+        Runner.selectedScenarioInfo = sc;
        document.getElementById("selected-scenario").textContent = sc.path;
        document.getElementById("run-btn").disabled = false;
-        // 显示 LLM 角色面板
        document.getElementById("llm-assignment-panel").hidden = false;
+        Runner._renderWeightPanel(sc);
+        document.getElementById("weight-config-panel").hidden = false;
      });
    }
    return item;
  },

-  // 触发评估：先 apply profiles（若选了），再触发任务。
+  // Rebuild the weight panel for scenario `sc`: one numeric input per entry of sc.metrics (pre-filled from sc.metric_weights, default 1.0), then one row per entry of sc.doc_weights.
+  _renderWeightPanel(sc) {
+    const metricRows = document.getElementById("metric-weight-rows");
+    metricRows.innerHTML = "";
+    const metrics = sc.metrics || [];
+    const existingWeights = sc.metric_weights || {};
+    metrics.forEach(metric => {
+      const row = document.createElement("div");
+      row.className = "weight-row";
+      const currentVal = existingWeights[metric] != null ? existingWeights[metric] : 1.0; // escaped below: it is scenario data placed into innerHTML
+      row.innerHTML = `
+        <span class="weight-row-label">${App.escape(metric)}</span>
+        <input class="weight-row-input" type="number" min="0" step="0.1"
+               data-metric="${App.escape(metric)}" value="${App.escape(String(currentVal))}" />
+      `;
+      metricRows.appendChild(row);
+    });
+
+    // Clear the document-weight rows and re-create one row per weight already present on the scenario.
+    const docRows = document.getElementById("doc-weight-rows");
+    docRows.innerHTML = "";
+    const existingDocWeights = sc.doc_weights || {};
+    Object.entries(existingDocWeights).forEach(([docName, w]) => {
+      Runner._addDocWeightRow(docName, w);
+    });
+  },
+
+  // Append one document-weight row (file-name input, weight input, remove button) to #doc-weight-rows; a missing or null docName/weight falls back to "" / 1.0.
+  _addDocWeightRow(docName, weight) {
+    const name = docName != null ? docName : ""; // `!= null` also covers null, matching _renderWeightPanel's default handling
+    const w = weight != null ? weight : 1.0;
+    const container = document.getElementById("doc-weight-rows");
+    const row = document.createElement("div");
+    row.className = "weight-row";
+    row.innerHTML = `
+      <input class="doc-weight-name" type="text" placeholder="PDF 文件名（如 322_双源CT.pdf）" value="${App.escape(String(name))}" />
+      <input class="weight-row-input" type="number" min="0" step="0.1" value="${App.escape(String(w))}" />
+      <button type="button" class="weight-row-remove" title="删除">✕</button>
+    `;
+    row.querySelector(".weight-row-remove").addEventListener("click", () => row.remove()); // the remove button deletes only its own row
+    container.appendChild(row);
+  },
+
+  // Read the weight panel and return { metricWeights, docWeights }; both fields are null (nothing to send) when every metric weight equals 1.0 and no document weight is entered.
+  _collectWeights() {
+    const metricWeights = {};
+    for (const field of document.querySelectorAll("#metric-weight-rows .weight-row-input")) {
+      const key = field.dataset.metric;
+      const parsed = parseFloat(field.value);
+      if (!key || Number.isNaN(parsed)) continue; // skip inputs with no metric key or no parseable number
+      metricWeights[key] = parsed;
+    }
+
+    const docWeights = {};
+    for (const row of document.querySelectorAll("#doc-weight-rows .weight-row")) {
+      const nameField = row.querySelector(".doc-weight-name");
+      const weightField = row.querySelector(".weight-row-input");
+      if (!nameField || !weightField) continue;
+      const docName = nameField.value.trim();
+      const parsed = parseFloat(weightField.value);
+      if (docName && !Number.isNaN(parsed)) docWeights[docName] = parsed;
+    }
+    // NOTE(review): the all-default case yields nulls, so resetting weights to 1.0 or deleting every doc row presumably leaves previously stored weights untouched — confirm this is intended.
+    const isUnitWeight = v => Math.abs(v - 1.0) < 1e-9;
+    const hasOverrides = Object.keys(docWeights).length > 0 || !Object.values(metricWeights).every(isUnitWeight);
+    return hasOverrides ? { metricWeights, docWeights } : { metricWeights: null, docWeights: null };
+  },
+
  async trigger() {
    if (!Runner.selectedScenario) return;
    const runBtn = document.getElementById("run-btn");
    runBtn.disabled = true;
-
    const panel = document.getElementById("task-panel");
    const logBox = document.getElementById("task-log");
    const statusBadge = document.getElementById("task-status");
@@ -106,12 +167,8 @@ const Runner = {
    reportBtn.hidden = true;
    logBox.textContent = "";
    Runner._setStatus(statusBadge, "queued");
-
    try {
-      // Step 1: apply LLM profiles to YAML if any selected
      await Runner._applyProfilesIfNeeded(logBox);
-
-      // Step 2: trigger evaluation
      const resp = await API.triggerEvaluation(Runner.selectedScenario);
      Runner.poll(resp.task_id);
    } catch (err) {
@@ -121,20 +178,22 @@ const Runner = {
    }
  },

-  // 如果用户选了 profile，就先 apply 写回 YAML
  async _applyProfilesIfNeeded(logBox) {
    const judgeId = document.getElementById("role-judge").value;
    const answerId = document.getElementById("role-answer").value;
    const datasetId = document.getElementById("role-dataset").value;
+    const { metricWeights, docWeights } = Runner._collectWeights();

-    if (!judgeId && !answerId && !datasetId) return; // 全空，跳过
+    if (!judgeId && !answerId && !datasetId && !metricWeights && !docWeights) return;

-    logBox.textContent = "正在将 LLM 配置写入场景文件…\n";
+    logBox.textContent = "正在将 LLM 配置和权重写入场景文件…\n";
    const body = {
      scenario_path: Runner.selectedScenario,
      judge_profile_id: judgeId || null,
      answer_profile_id: answerId || null,
      dataset_profile_id: datasetId || null,
+      metric_weights: metricWeights,
+      doc_weights: docWeights,
    };
    const result = await API.applyProfiles(body);
    const fields = (result.patched_fields || []).join(", ");
@@ -143,13 +202,11 @@ const Runner = {
      : "（未找到可更新的字段，继续运行）\n";
  },

-  // 周期性轮询任务状态，刷新日志与徽标。
  poll(taskId) {
    const logBox = document.getElementById("task-log");
    const statusBadge = document.getElementById("task-status");
    const reportBtn = document.getElementById("view-report-btn");
    const runBtn = document.getElementById("run-btn");
-
    if (Runner.pollTimer) clearInterval(Runner.pollTimer);
    Runner.pollTimer = setInterval(async () => {
      try {
@@ -157,7 +214,6 @@ const Runner = {
        logBox.textContent = (status.logs || []).join("\n");
        logBox.scrollTop = logBox.scrollHeight;
        Runner._setStatus(statusBadge, status.status);
-
        if (status.status === "completed" || status.status === "failed") {
          clearInterval(Runner.pollTimer);
          runBtn.disabled = false;
@@ -175,7 +231,6 @@ const Runner = {
    }, 1200);
  },

-  // 更新状态徽标的文本与配色类。
  _setStatus(badge, status) {
    badge.textContent = status;
    badge.className = "badge " + status;
Author	SHA1	Message	Date
wangwei	1bcb208f92	feat: Dify score API complete — add SCORE_API_TOKEN to .env.example Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-22 15:28:20 +08:00
wangwei	a03a24be4e	feat: add POST /api/score endpoint for Dify real-time scoring Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-22 15:14:19 +08:00
wangwei	e4d4e4968b	feat: add InlineScorer service with LLM client caching Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-22 15:03:43 +08:00
wangwei	761faf9c42	feat: add ScoreRequest/ScoreResponse models and SCORE_API_TOKEN setting Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-22 15:00:05 +08:00
wangwei	9ad6ad4ebc	docs: add Dify score API implementation plan	2026-06-22 14:55:43 +08:00
wangwei	eee96eb158	docs: add Dify score API integration design spec Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-22 14:51:52 +08:00
wangwei	ccf25eb1f9	feat: add Linux deployment scripts (deploy/start/stop/run_eval) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-22 14:28:44 +08:00
wangwei	199b3af611	docs: add Linux deploy script design spec Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-22 14:18:14 +08:00
wangwei	f9e3ba0f64	feat: add weight config panel to 新建评估 and weighted_score card to report Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-18 17:28:15 +08:00
wangwei	36e5506e2a	feat: report_builder uses weighted means; ReportData gains weighted_score_mean Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-18 17:16:09 +08:00
wangwei	835614189e	feat: ScenarioInfo exposes metric_weights and doc_weights from YAML Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-18 17:05:26 +08:00
wangwei	ce0d2291b0	feat: yaml_patcher and ProfileApplyRequest support metric_weights and doc_weights Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-18 17:02:21 +08:00
wangwei	480f6d66ea	feat: use weighted metric means and add weighted_score row to summary.md Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-18 16:59:56 +08:00
wangwei	d371ef7d24	feat: add weighted_score and sample_weight columns to score rows Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-18 16:53:45 +08:00
wangwei	8617eaa5aa	feat: add metric_weights and doc_weights to Scenario schema and dataclass Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-18 16:50:33 +08:00
wangwei	e0b064587f	feat: add metric/doc weight computation module (weights.py) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-18 16:47:47 +08:00
wangwei	078097af00	docs: add metric/doc weights implementation plan	2026-06-18 16:43:08 +08:00
wangwei	ca586bf9bb	docs: add metric and doc weights feature design spec Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>	2026-06-18 16:37:18 +08:00
wangwei	9ad2daff73	feat: restore API文档 nav item (iframe /docs) without touching other 4 modules	2026-06-17 11:24:16 +08:00
wangwei	e8af5b906c	chore: remove API docs iframe nav item, rename title to RAGAS 评估控制台	2026-06-17 11:18:01 +08:00
wangwei	8ea2b9c7d2	feat: add API文档 nav item with embedded Swagger UI iframe	2026-06-17 11:09:55 +08:00
wangwei	074800b741	feat: add history report switcher dropdown in report detail view	2026-06-17 10:35:56 +08:00
wangwei	3019390592	feat: add export-to-PDF via browser print with @media print CSS	2026-06-17 10:28:01 +08:00