From 078097af0001b52dd890910fcb5b70fd63a16970 Mon Sep 17 00:00:00 2001 From: wangwei Date: Thu, 18 Jun 2026 16:43:08 +0800 Subject: [PATCH] docs: add metric/doc weights implementation plan --- .../plans/2026-06-18-metric-doc-weights.md | 1537 +++++++++++++++++ 1 file changed, 1537 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-18-metric-doc-weights.md diff --git a/docs/superpowers/plans/2026-06-18-metric-doc-weights.md b/docs/superpowers/plans/2026-06-18-metric-doc-weights.md new file mode 100644 index 0000000..9fb14ba --- /dev/null +++ b/docs/superpowers/plans/2026-06-18-metric-doc-weights.md @@ -0,0 +1,1537 @@ +# 指标权重 & 文档片段权重 Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** 在场景 YAML 中支持 `metric_weights` 和 `doc_weights` 两个可选字段,计算加权综合得分并在报告页和「新建评估」页的权重配置面板中展示。 + +**Architecture:** 新增纯函数模块 `rag_eval/metrics/weights.py` 承载所有计算逻辑;评估器写入两列新字段 (`weighted_score`, `sample_weight`) 到 `scores.csv`;`yaml_patcher` 扩展支持写入权重字段;前端在 LLM 角色配置面板下方动态渲染权重面板,报告页新增「加权综合得分」卡片。 + +**Tech Stack:** Python 3.12, Pydantic v2, FastAPI, Vanilla JS (无框架), pytest + +## Global Constraints + +- Python 3.12+,PEP 8,4 空格缩进,类型注解必须 +- 所有新字段均为可选,缺省行为与现有完全一致(向后兼容) +- 测试用 pytest,不依赖真实 LLM 或网络 +- JS 不引入任何新依赖(原生 DOM API) +- 权重值无需归一化,计算时内部 `w / Σw` + +--- + +## 文件清单 + +| 操作 | 文件 | 职责 | +|------|------|------| +| 新建 | `rag_eval/metrics/weights.py` | 权重计算纯函数 | +| 新建 | `tests/test_weights.py` | weights 模块单元测试 | +| 修改 | `rag_eval/config/schema.py` | ScenarioModel 新增两字段 | +| 修改 | `rag_eval/shared/models.py` | Scenario dataclass 新增两字段 | +| 修改 | `rag_eval/config/loader.py` | load_scenario 透传新字段 | +| 修改 | `rag_eval/execution/evaluator.py` | `_merge_score` 新增两列 | +| 修改 | `rag_eval/reporting/summary.py` | 改用加权均值,新增 weighted_score 行 | +| 修改 | `webapp/services/yaml_patcher.py` | 新增 
metric_weights/doc_weights 参数 | +| 修改 | `webapp/models.py` | ProfileApplyRequest 新增字段;ReportData 新增 weighted_score_mean | +| 修改 | `webapp/api/llm_profiles.py` | apply_profiles 透传新参数 | +| 修改 | `webapp/services/report_builder.py` | 读取权重,计算加权均值和 weighted_score_mean | +| 修改 | `webapp/services/run_reader.py` | 从 snapshot.yaml 读取 metric_weights/doc_weights | +| 修改 | `webapp/static/index.html` | 新增权重配置面板 HTML | +| 修改 | `webapp/static/js/runner.js` | 权重面板逻辑 + apply 时传权重 | +| 修改 | `webapp/static/css/app.css` | 权重面板样式 | +| 修改 | `webapp/static/js/report.js` | renderMetricCards 中渲染 weighted_score 卡片 | +| 修改 | `webapp/api/scenarios.py` | ScenarioInfo 新增 metric_weights/doc_weights 字段 | +| 修改 | `webapp/services/scenario_scanner.py` | 扫描时读取权重字段 | +| 修改 | `tests/test_offline_eval.py` | 断言 scores.csv 包含 weighted_score/sample_weight | +| 修改 | `tests/webapp/test_llm_profiles_api.py` | apply_profiles 权重写入测试 | + +--- + +## Task 1: 新建权重计算核心模块(TDD) + +**Files:** +- Create: `rag_eval/metrics/weights.py` +- Create: `tests/test_weights.py` + +**Interfaces:** +- Produces: + - `resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float` + - `compute_weighted_score(scores: dict[str, float | None], metric_weights: dict[str, float]) -> float | None` + - `weighted_metric_means(score_rows: list[dict], metrics: list[str], doc_weights: dict[str, float]) -> dict[str, float | None]` + - `compute_overall_weighted_score_mean(score_rows: list[dict], metric_weights: dict[str, float], doc_weights: dict[str, float]) -> float | None` + +- [ ] **Step 1: Write failing tests** + +Create `tests/test_weights.py`: + +```python +"""Unit tests for rag_eval/metrics/weights.py""" +import math +import pytest +from rag_eval.metrics.weights import ( + resolve_weight, + compute_weighted_score, + weighted_metric_means, + compute_overall_weighted_score_mean, +) + + +class TestResolveWeight: + def test_returns_value_when_key_present(self): + assert resolve_weight({"faith": 0.5}, "faith") == 0.5 + + def 
test_returns_default_when_key_missing(self): + assert resolve_weight({}, "faith") == 1.0 + + def test_returns_custom_default_when_key_missing(self): + assert resolve_weight({}, "faith", default=2.0) == 2.0 + + def test_empty_dict_returns_default(self): + assert resolve_weight({}, "anything") == 1.0 + + +class TestComputeWeightedScore: + def test_equal_weights_is_simple_mean(self): + scores = {"faithfulness": 0.8, "context_recall": 0.6} + result = compute_weighted_score(scores, {}) + assert result == pytest.approx(0.7, rel=1e-4) + + def test_explicit_weights(self): + scores = {"faithfulness": 1.0, "context_recall": 0.0} + weights = {"faithfulness": 3.0, "context_recall": 1.0} + # (3*1.0 + 1*0.0) / (3+1) = 0.75 + result = compute_weighted_score(scores, weights) + assert result == pytest.approx(0.75, rel=1e-4) + + def test_nan_values_excluded(self): + scores = {"faithfulness": float("nan"), "context_recall": 0.8} + result = compute_weighted_score(scores, {}) + assert result == pytest.approx(0.8, rel=1e-4) + + def test_none_values_excluded(self): + scores = {"faithfulness": None, "context_recall": 0.6} + result = compute_weighted_score(scores, {}) + assert result == pytest.approx(0.6, rel=1e-4) + + def test_all_nan_returns_none(self): + scores = {"faithfulness": float("nan"), "context_recall": float("nan")} + assert compute_weighted_score(scores, {}) is None + + def test_empty_scores_returns_none(self): + assert compute_weighted_score({}, {}) is None + + def test_missing_metric_in_weights_uses_default_1(self): + scores = {"faithfulness": 0.8, "context_recall": 0.4} + weights = {"faithfulness": 2.0} # context_recall defaults to 1.0 + # (2*0.8 + 1*0.4) / (2+1) = 2.0/3 ≈ 0.6667 + result = compute_weighted_score(scores, weights) + assert result == pytest.approx(2.0 / 3, rel=1e-4) + + +class TestWeightedMetricMeans: + def _rows(self): + return [ + {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.5}, + {"doc_name": "b.pdf", "faithfulness": 0.6, 
"context_recall": 0.8}, + ] + + def test_equal_weights_gives_arithmetic_mean(self): + rows = self._rows() + result = weighted_metric_means(rows, ["faithfulness", "context_recall"], {}) + assert result["faithfulness"] == pytest.approx(0.8, rel=1e-4) + assert result["context_recall"] == pytest.approx(0.65, rel=1e-4) + + def test_doc_weight_amplifies_contribution(self): + rows = self._rows() + # doc a.pdf gets weight 3, b.pdf gets 1 + doc_weights = {"a.pdf": 3.0, "b.pdf": 1.0} + result = weighted_metric_means(rows, ["faithfulness"], doc_weights) + # (3*1.0 + 1*0.6) / (3+1) = 3.6/4 = 0.9 + assert result["faithfulness"] == pytest.approx(0.9, rel=1e-4) + + def test_nan_rows_skipped_per_metric(self): + rows = [ + {"doc_name": "a.pdf", "faithfulness": float("nan"), "context_recall": 0.5}, + {"doc_name": "b.pdf", "faithfulness": 0.8, "context_recall": 0.9}, + ] + result = weighted_metric_means(rows, ["faithfulness", "context_recall"], {}) + assert result["faithfulness"] == pytest.approx(0.8, rel=1e-4) + assert result["context_recall"] == pytest.approx(0.7, rel=1e-4) + + def test_missing_metric_column_returns_none(self): + rows = [{"doc_name": "a.pdf", "faithfulness": 0.8}] + result = weighted_metric_means(rows, ["faithfulness", "unknown_metric"], {}) + assert result["faithfulness"] == pytest.approx(0.8, rel=1e-4) + assert result["unknown_metric"] is None + + def test_empty_rows_returns_none_for_all(self): + result = weighted_metric_means([], ["faithfulness"], {}) + assert result["faithfulness"] is None + + +class TestComputeOverallWeightedScoreMean: + def test_basic_weighted_mean_of_weighted_scores(self): + rows = [ + {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.0}, + {"doc_name": "b.pdf", "faithfulness": 0.5, "context_recall": 0.5}, + ] + metric_weights = {"faithfulness": 1.0, "context_recall": 1.0} + result = compute_overall_weighted_score_mean(rows, metric_weights, {}) + # sample 1 ws = 0.5, sample 2 ws = 0.5 → mean = 0.5 + assert result == 
pytest.approx(0.5, rel=1e-4) + + def test_doc_weight_amplifies_sample(self): + rows = [ + {"doc_name": "important.pdf", "faithfulness": 1.0}, + {"doc_name": "other.pdf", "faithfulness": 0.0}, + ] + doc_weights = {"important.pdf": 9.0, "other.pdf": 1.0} + result = compute_overall_weighted_score_mean(rows, {}, doc_weights) + # ws_1=1.0 w=9, ws_2=0.0 w=1 → (9*1 + 1*0)/(9+1) = 0.9 + assert result == pytest.approx(0.9, rel=1e-4) + + def test_all_nan_returns_none(self): + rows = [{"doc_name": "a.pdf", "faithfulness": float("nan")}] + assert compute_overall_weighted_score_mean(rows, {}, {}) is None +``` + +- [ ] **Step 2: Run tests to verify they fail** + +``` +python -m pytest tests/test_weights.py -v +``` +Expected: `ModuleNotFoundError: No module named 'rag_eval.metrics.weights'` + +- [ ] **Step 3: Implement `rag_eval/metrics/weights.py`** + +```python +"""Utility functions for weighted metric aggregation. + +All functions are pure (no side effects, no I/O) and operate on plain dicts/lists. +Weights do not need to be pre-normalised — normalisation is done internally. +""" + +from __future__ import annotations + +import math + + +def resolve_weight(weights: dict[str, float], key: str, default: float = 1.0) -> float: + """Return the weight for *key*, or *default* when absent.""" + return float(weights.get(key, default)) + + +def compute_weighted_score( + scores: dict[str, float | None], + metric_weights: dict[str, float], +) -> float | None: + """Return the weighted mean of valid (non-NaN, non-None) metric scores. + + Args: + scores: mapping of metric_name -> raw score (may be NaN or None). + metric_weights: optional per-metric weights; absent keys default to 1.0. + + Returns: + Weighted mean as a float, or None when no valid score exists. 
+ """ + total_weight = 0.0 + total_score = 0.0 + for metric, score in scores.items(): + if score is None: + continue + try: + v = float(score) + except (TypeError, ValueError): + continue + if math.isnan(v) or math.isinf(v): + continue + w = resolve_weight(metric_weights, metric, default=1.0) + total_weight += w + total_score += w * v + if total_weight == 0.0: + return None + return total_score / total_weight + + +def weighted_metric_means( + score_rows: list[dict], + metrics: list[str], + doc_weights: dict[str, float], +) -> dict[str, float | None]: + """Compute per-metric weighted means across all score rows. + + Each row's contribution is scaled by the doc_weight for its ``doc_name``. + Rows with NaN/None for a given metric are excluded from that metric's mean. + + Args: + score_rows: list of score record dicts (from scores.csv). + metrics: ordered list of metric names to aggregate. + doc_weights: mapping doc_name -> weight multiplier; absent keys default to 1.0. + + Returns: + Dict mapping metric_name -> weighted mean (or None if no valid data). + """ + totals: dict[str, float] = {m: 0.0 for m in metrics} + weights_sum: dict[str, float] = {m: 0.0 for m in metrics} + + for row in score_rows: + doc_name = str(row.get("doc_name", "") or "") + sample_w = resolve_weight(doc_weights, doc_name, default=1.0) + for metric in metrics: + raw = row.get(metric) + if raw is None: + continue + try: + v = float(raw) + except (TypeError, ValueError): + continue + if math.isnan(v) or math.isinf(v): + continue + totals[metric] += sample_w * v + weights_sum[metric] += sample_w + + return { + metric: (totals[metric] / weights_sum[metric] if weights_sum[metric] > 0 else None) + for metric in metrics + } + + +def compute_overall_weighted_score_mean( + score_rows: list[dict], + metric_weights: dict[str, float], + doc_weights: dict[str, float], +) -> float | None: + """Compute the overall weighted-score mean across all samples. + + For each sample: + 1. 
Compute per-sample weighted_score via ``compute_weighted_score``. + 2. Scale by the doc weight for that sample's ``doc_name``. + Then return the weighted mean of all per-sample weighted_scores. + + Args: + score_rows: list of score record dicts. + metric_weights: per-metric weight multipliers. + doc_weights: per-doc weight multipliers. + + Returns: + Float mean, or None when no sample has a valid weighted_score. + """ + total_weight = 0.0 + total_score = 0.0 + for row in score_rows: + # Collect only numeric metric columns (exclude meta-columns) + metric_scores: dict[str, float | None] = {} + for k, v in row.items(): + if k in _META_COLUMNS: + continue + metric_scores[k] = v # type: ignore[assignment] + + ws = compute_weighted_score(metric_scores, metric_weights) + if ws is None: + continue + doc_name = str(row.get("doc_name", "") or "") + sample_w = resolve_weight(doc_weights, doc_name, default=1.0) + total_weight += sample_w + total_score += sample_w * ws + + return total_score / total_weight if total_weight > 0 else None + + +# Columns in scores.csv that are sample metadata, not metric scores. 
+_META_COLUMNS = frozenset({ + "sample_id", "question", "contexts", "answer", "ground_truth", + "scenario", "language", "retrieval_config", "error", + "judge_model", "embedding_model", "run_id", + "difficulty", "question_type", "doc_id", "doc_name", + "section_path", "page_start", "page_end", + "source_chunk_ids", "review_status", "review_notes", + "weighted_score", "sample_weight", +}) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +``` +python -m pytest tests/test_weights.py -v +``` +Expected: all 19 tests PASS + +- [ ] **Step 5: Commit** + +``` +git add rag_eval/metrics/weights.py tests/test_weights.py +git commit -m "feat: add metric/doc weight computation module (weights.py)" +``` + +--- + +## Task 2: 扩展 Schema、Dataclass 和 Loader + +**Files:** +- Modify: `rag_eval/config/schema.py` +- Modify: `rag_eval/shared/models.py` +- Modify: `rag_eval/config/loader.py` + +**Interfaces:** +- Consumes: 无新依赖 +- Produces: + - `ScenarioModel.metric_weights: dict[str, float]` (default `{}`) + - `ScenarioModel.doc_weights: dict[str, float]` (default `{}`) + - `Scenario.metric_weights: dict[str, float]` (default `{}`) + - `Scenario.doc_weights: dict[str, float]` (default `{}`) + +- [ ] **Step 1: Write failing test** + +Add to `tests/test_offline_eval.py` inside `ScenarioAndDatasetTests`: + +```python +def test_load_scenario_metric_and_doc_weights(self): + """load_scenario passes metric_weights and doc_weights into Scenario.""" + import tempfile, yaml + from rag_eval.config.loader import load_scenario + payload = { + "scenario_name": "w-test", "mode": "offline", + "dataset": "nonexistent.csv", "judge_model": "m", + "embedding_model": "e", "metrics": ["faithfulness"], + "output_dir": "out", + "metric_weights": {"faithfulness": 0.7}, + "doc_weights": {"doc.pdf": 2.0}, + } + with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", + encoding="utf-8", delete=False) as f: + yaml.dump(payload, f, allow_unicode=True) + tmp_path = f.name + scenario = load_scenario(tmp_path) 
+ assert scenario.metric_weights == {"faithfulness": 0.7} + assert scenario.doc_weights == {"doc.pdf": 2.0} + +def test_load_scenario_defaults_to_empty_weights(self): + """load_scenario defaults metric_weights and doc_weights to empty dicts.""" + import tempfile, yaml + from rag_eval.config.loader import load_scenario + payload = { + "scenario_name": "no-w", "mode": "offline", + "dataset": "nonexistent.csv", "judge_model": "m", + "embedding_model": "e", "metrics": ["faithfulness"], + "output_dir": "out", + } + with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w", + encoding="utf-8", delete=False) as f: + yaml.dump(payload, f, allow_unicode=True) + tmp_path = f.name + scenario = load_scenario(tmp_path) + assert scenario.metric_weights == {} + assert scenario.doc_weights == {} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +``` +python -m pytest tests/test_offline_eval.py::ScenarioAndDatasetTests::test_load_scenario_metric_and_doc_weights tests/test_offline_eval.py::ScenarioAndDatasetTests::test_load_scenario_defaults_to_empty_weights -v +``` +Expected: FAIL — `Scenario has no attribute 'metric_weights'` + +- [ ] **Step 3: Add fields to `rag_eval/config/schema.py`** + +In `ScenarioModel`, add after `optimization_advisor`: +```python +metric_weights: dict[str, float] = Field(default_factory=dict) +doc_weights: dict[str, float] = Field(default_factory=dict) +``` + +- [ ] **Step 4: Add fields to `rag_eval/shared/models.py`** + +In `Scenario` dataclass, add after `optimization_advisor: bool = False`: +```python +metric_weights: dict[str, float] = field(default_factory=dict) +doc_weights: dict[str, float] = field(default_factory=dict) +``` + +- [ ] **Step 5: Update `rag_eval/config/loader.py`** + +In `load_scenario()`, in the `Scenario(...)` constructor call, add after `optimization_advisor=model.optimization_advisor,`: +```python +metric_weights=dict(model.metric_weights), +doc_weights=dict(model.doc_weights), +``` + +- [ ] **Step 6: Run tests to verify 
they pass** + +``` +python -m pytest tests/test_offline_eval.py::ScenarioAndDatasetTests::test_load_scenario_metric_and_doc_weights tests/test_offline_eval.py::ScenarioAndDatasetTests::test_load_scenario_defaults_to_empty_weights -v +``` +Expected: both PASS + +- [ ] **Step 7: Commit** + +``` +git add rag_eval/config/schema.py rag_eval/shared/models.py rag_eval/config/loader.py tests/test_offline_eval.py +git commit -m "feat: add metric_weights and doc_weights to Scenario schema and dataclass" +``` + +--- + +## Task 3: 评估器 — _merge_score 新增两列 + +**Files:** +- Modify: `rag_eval/execution/evaluator.py` + +**Interfaces:** +- Consumes: `compute_weighted_score(scores, metric_weights) -> float | None` and `resolve_weight(weights, key, default=1.0) -> float` from `rag_eval.metrics.weights` +- Produces: `scores.csv` 新增列 `weighted_score: float | NaN`, `sample_weight: float` + +- [ ] **Step 1: Write failing test** + +Add to `tests/test_offline_eval.py` inside `EvaluatorAndReportingTests`: + +```python +def test_merge_score_includes_weighted_score_and_sample_weight(self): + """_merge_score adds weighted_score and sample_weight columns.""" + from unittest.mock import MagicMock + from rag_eval.execution.evaluator import Evaluator + from rag_eval.shared.models import ( + MetricScore, NormalizedSample, RuntimeConfig, Scenario, DatasetConfig, + ) + from pathlib import Path + scenario = Scenario( + scenario_name="w-test", mode="offline", + dataset=DatasetConfig(path=Path("d.csv")), + judge_model="m", embedding_model="e", + metrics=["faithfulness", "context_recall"], + output_dir=Path("out"), + metric_weights={"faithfulness": 3.0, "context_recall": 1.0}, + doc_weights={"doc.pdf": 2.0}, + ) + evaluator = Evaluator( + scenario=scenario, + metric_pipeline=MagicMock(), + app_adapter=None, + ) + sample = NormalizedSample( + sample_id="s1", question="q", contexts=["ctx"], + answer="a", ground_truth="gt", + metadata={"doc_name": "doc.pdf"}, + ) + score = MetricScore(metrics={"faithfulness": 1.0, "context_recall": 0.0}) + row = 
evaluator._merge_score(sample, score) + # (3*1.0 + 1*0.0) / 4 = 0.75 + assert abs(row["weighted_score"] - 0.75) < 1e-4 + assert row["sample_weight"] == 2.0 +``` + +- [ ] **Step 2: Run test to verify it fails** + +``` +python -m pytest tests/test_offline_eval.py::EvaluatorAndReportingTests::test_merge_score_includes_weighted_score_and_sample_weight -v +``` +Expected: FAIL — `KeyError: 'weighted_score'` + +- [ ] **Step 3: Update `rag_eval/execution/evaluator.py`** + +Add import at top of file (after existing imports): +```python +from rag_eval.metrics.weights import compute_weighted_score, resolve_weight +``` + +Replace `_merge_score` method: +```python +def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]: + """Combine sample data, metric results, run metadata, and weight columns.""" + record = sample.to_record() + record["contexts"] = sample.contexts + record.update(score.metrics) + record["error"] = score.error + record["judge_model"] = self.scenario.judge_model + record["embedding_model"] = self.scenario.embedding_model + record["run_id"] = self.scenario.scenario_name + # Weighted score columns — enable post-hoc weighted aggregation in reporting. 
+ record["weighted_score"] = compute_weighted_score( + score.metrics, self.scenario.metric_weights + ) + doc_name = str(sample.metadata.get("doc_name", "") or "") + record["sample_weight"] = resolve_weight( + self.scenario.doc_weights, doc_name, default=1.0 + ) + return record +``` + +- [ ] **Step 4: Run test to verify it passes** + +``` +python -m pytest tests/test_offline_eval.py::EvaluatorAndReportingTests::test_merge_score_includes_weighted_score_and_sample_weight -v +``` +Expected: PASS + +- [ ] **Step 5: Commit** + +``` +git add rag_eval/execution/evaluator.py tests/test_offline_eval.py +git commit -m "feat: add weighted_score and sample_weight columns to score rows" +``` + +--- + +## Task 4: 报告摘要 — 改用加权均值 + +**Files:** +- Modify: `rag_eval/reporting/summary.py` + +**Interfaces:** +- Consumes: + - `weighted_metric_means(score_rows, metrics, doc_weights) -> dict[str, float | None]` + - `compute_overall_weighted_score_mean(score_rows, metric_weights, doc_weights) -> float | None` + - Both from `rag_eval.metrics.weights` + +- [ ] **Step 1: Write failing test** + +Add to `tests/test_offline_eval.py` inside `EvaluatorAndReportingTests`: + +```python +def test_summary_markdown_shows_weighted_score(self): + """build_summary_markdown includes weighted_score when metric_weights set.""" + import math + from rag_eval.reporting.summary import build_summary_markdown + from rag_eval.shared.models import ( + EvaluationResult, NormalizedSample, DatasetConfig, Scenario, RuntimeConfig, + ) + from pathlib import Path + scenario = Scenario( + scenario_name="ws-test", mode="offline", + dataset=DatasetConfig(path=Path("d.csv")), + judge_model="m", embedding_model="e", + metrics=["faithfulness"], + output_dir=Path("out"), + metric_weights={"faithfulness": 1.0}, + doc_weights={}, + ) + sample = NormalizedSample( + sample_id="s1", question="q", contexts=["c"], + answer="a", ground_truth="gt", + ) + result = EvaluationResult( + scenario=scenario, run_id="r1", + 
started_at="2026-01-01T00:00:00", finished_at="2026-01-01T00:01:00", + valid_samples=[sample], invalid_samples=[], + score_rows=[{ + "sample_id": "s1", "faithfulness": 0.8, + "weighted_score": 0.8, "sample_weight": 1.0, + "doc_name": "", "error": "", + }], + ) + md = build_summary_markdown(result) + assert "weighted_score" in md + assert "0.8000" in md +``` + +- [ ] **Step 2: Run test to verify it fails** + +``` +python -m pytest tests/test_offline_eval.py::EvaluatorAndReportingTests::test_summary_markdown_shows_weighted_score -v +``` +Expected: FAIL — `"weighted_score" not in md` + +- [ ] **Step 3: Update `rag_eval/reporting/summary.py`** + +Replace the entire file: + +```python +"""Markdown summary generation for completed evaluation runs.""" + +from __future__ import annotations + +import math + +import pandas as pd + +from rag_eval.metrics.weights import ( + compute_overall_weighted_score_mean, + weighted_metric_means, +) +from rag_eval.shared.models import EvaluationResult + + +def _table_from_frame(frame: pd.DataFrame) -> str: + """Render a small dataframe as a fixed-width markdown-friendly text table.""" + if frame.empty: + return "No rows." 
+ + columns = list(frame.columns) + rows = [[str(value) for value in row] for row in frame.astype(object).values.tolist()] + widths = [] + for index, column in enumerate(columns): + column_width = len(str(column)) + row_width = max((len(row[index]) for row in rows), default=0) + widths.append(max(column_width, row_width)) + + header = " | ".join(str(column).ljust(widths[idx]) for idx, column in enumerate(columns)) + separator = "-|-".join("-" * widths[idx] for idx in range(len(columns))) + body = [ + " | ".join(row[idx].ljust(widths[idx]) for idx in range(len(columns))) + for row in rows + ] + return "\n".join([header, separator, *body]) + + +def build_summary_markdown(result: EvaluationResult) -> str: + """Build the human-readable markdown summary written for each evaluation run.""" + total = len(result.valid_samples) + len(result.invalid_samples) + scores = pd.DataFrame(result.score_rows) + + lines = [ + f"# {result.scenario.scenario_name}", + "", + f"- run_id: `{result.run_id}`", + f"- mode: `{result.scenario.mode}`", + f"- total_samples: `{total}`", + f"- valid_samples: `{len(result.valid_samples)}`", + f"- invalid_samples: `{len(result.invalid_samples)}`", + f"- judge_model: `{result.scenario.judge_model}`", + f"- embedding_model: `{result.scenario.embedding_model}`", + "", + "## Metric Means", + "", + ] + + if scores.empty: + lines.append("No valid samples were scored.") + return "\n".join(lines) + "\n" + + score_rows_list = scores.to_dict(orient="records") + w_means = weighted_metric_means( + score_rows_list, result.scenario.metrics, result.scenario.doc_weights + ) + + has_weights = bool(result.scenario.metric_weights or result.scenario.doc_weights) + weight_suffix = " (加权)" if has_weights else "" + + for metric in result.scenario.metrics: + mean_value = w_means.get(metric) + w = result.scenario.metric_weights.get(metric, 1.0) if result.scenario.metric_weights else 1.0 + weight_note = f" (w={w:.2f})" if result.scenario.metric_weights else "" + if mean_value 
is not None and not math.isnan(mean_value): + lines.append(f"- {metric}: `{mean_value:.4f}`{weight_note}") + else: + lines.append(f"- {metric}: `n/a`{weight_note}") + + overall_ws = compute_overall_weighted_score_mean( + score_rows_list, result.scenario.metric_weights, result.scenario.doc_weights + ) + if overall_ws is not None and not math.isnan(overall_ws): + lines.append(f"- **weighted_score{weight_suffix}: `{overall_ws:.4f}`**") + else: + lines.append(f"- **weighted_score{weight_suffix}: `n/a`**") + + detail_columns = ["sample_id", *result.scenario.metrics, "weighted_score", "error"] + existing_columns = [c for c in detail_columns if c in scores.columns] + detail = scores[existing_columns] + lines.extend([ + "", + "## Per-sample Scores", + "", + "```text", + _table_from_frame(detail), + "```", + ]) + return "\n".join(lines) + "\n" +``` + +- [ ] **Step 4: Run test to verify it passes** + +``` +python -m pytest tests/test_offline_eval.py::EvaluatorAndReportingTests::test_summary_markdown_shows_weighted_score -v +``` +Expected: PASS + +- [ ] **Step 5: Run full offline test suite to check no regressions** + +``` +python -m pytest tests/test_offline_eval.py tests/test_weights.py -v +``` +Expected: all PASS + +- [ ] **Step 6: Commit** + +``` +git add rag_eval/reporting/summary.py tests/test_offline_eval.py +git commit -m "feat: use weighted metric means and add weighted_score row to summary.md" +``` + +--- + +## Task 5: yaml_patcher 扩展 + +**Files:** +- Modify: `webapp/services/yaml_patcher.py` +- Modify: `webapp/models.py` +- Modify: `webapp/api/llm_profiles.py` + +**Interfaces:** +- Produces: + - `apply_profiles_to_scenario(..., metric_weights=None, doc_weights=None)` — new optional params + - `ProfileApplyRequest.metric_weights: dict[str, float] | None` + - `ProfileApplyRequest.doc_weights: dict[str, float] | None` + +- [ ] **Step 1: Write failing test** + +Add to `tests/webapp/test_llm_profiles_api.py`: + +```python +def 
test_apply_metric_weights_patches_yaml(tmp_path): + """Applying metric_weights writes them into the YAML.""" + scenario_file = tmp_path / "w-scenario.yaml" + scenario_file.write_text( + "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n" + "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n", + encoding="utf-8", + ) + from webapp.services.yaml_patcher import apply_profiles_to_scenario + patched = apply_profiles_to_scenario( + scenario_path=str(scenario_file), + judge_profile=None, answer_profile=None, dataset_profile=None, + metric_weights={"faithfulness": 0.7, "context_recall": 0.3}, + _resolve_absolute=True, + ) + assert "metric_weights" in patched + data = yaml_lib.safe_load(scenario_file.read_text()) + assert data["metric_weights"]["faithfulness"] == pytest.approx(0.7) + + +def test_apply_doc_weights_patches_yaml(tmp_path): + """Applying doc_weights writes them into the YAML.""" + scenario_file = tmp_path / "dw-scenario.yaml" + scenario_file.write_text( + "scenario_name: test\nmode: offline\njudge_model: m\nembedding_model: e\n" + "dataset: d.csv\nmetrics:\n- faithfulness\noutput_dir: out\n", + encoding="utf-8", + ) + from webapp.services.yaml_patcher import apply_profiles_to_scenario + patched = apply_profiles_to_scenario( + scenario_path=str(scenario_file), + judge_profile=None, answer_profile=None, dataset_profile=None, + doc_weights={"doc.pdf": 2.0}, + _resolve_absolute=True, + ) + assert "doc_weights" in patched + data = yaml_lib.safe_load(scenario_file.read_text()) + assert data["doc_weights"]["doc.pdf"] == pytest.approx(2.0) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +``` +python -m pytest tests/webapp/test_llm_profiles_api.py::test_apply_metric_weights_patches_yaml tests/webapp/test_llm_profiles_api.py::test_apply_doc_weights_patches_yaml -v +``` +Expected: FAIL — `unexpected keyword argument 'metric_weights'` + +- [ ] **Step 3: Update `webapp/services/yaml_patcher.py`** + +Replace 
`apply_profiles_to_scenario` signature and body: + +```python +def apply_profiles_to_scenario( + scenario_path: str, + judge_profile: LLMProfile | None, + answer_profile: LLMProfile | None, + dataset_profile: LLMProfile | None, + metric_weights: dict[str, float] | None = None, + doc_weights: dict[str, float] | None = None, + _resolve_absolute: bool = False, +) -> list[str]: + """Patch the YAML file at *scenario_path* with the supplied profiles and weights. + + Returns a list of dotted field names that were actually patched. + """ + if _resolve_absolute: + resolved = Path(scenario_path) + else: + resolved = _resolve_scenario_path(scenario_path) + + if not resolved.exists(): + raise FileNotFoundError(f"Scenario file not found: {resolved}") + + data: dict[str, Any] = yaml.safe_load(resolved.read_text(encoding="utf-8")) or {} + patched: list[str] = [] + + if judge_profile is not None: + data["judge_model"] = judge_profile.model + patched.append("judge_model") + + if answer_profile is not None: + adapter = data.get("app_adapter") + if isinstance(adapter, dict): + static_kwargs = adapter.setdefault("static_kwargs", {}) + static_kwargs["model"] = answer_profile.model + patched.append("app_adapter.static_kwargs.model") + + if dataset_profile is not None: + generation = data.get("generation") + if isinstance(generation, dict): + generation["model"] = dataset_profile.model + patched.append("generation.model") + + if metric_weights is not None: + data["metric_weights"] = dict(metric_weights) + patched.append("metric_weights") + + if doc_weights is not None: + data["doc_weights"] = dict(doc_weights) + patched.append("doc_weights") + + resolved.write_text( + yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False), + encoding="utf-8", + ) + return patched +``` + +- [ ] **Step 4: Update `webapp/models.py` — ProfileApplyRequest** + +Add two fields to `ProfileApplyRequest`: +```python +class ProfileApplyRequest(BaseModel): + """Request body to patch LLM 
profile selections into a scenario YAML.""" + + scenario_path: str + judge_profile_id: str | None = None + answer_profile_id: str | None = None + dataset_profile_id: str | None = None + metric_weights: dict[str, float] | None = Field( + default=None, + description="指标权重映射,如 {\"faithfulness\": 0.35}。为 null 时不修改 YAML。", + ) + doc_weights: dict[str, float] | None = Field( + default=None, + description="文档权重映射,如 {\"doc.pdf\": 2.0}。为 null 时不修改 YAML。", + ) +``` + +- [ ] **Step 5: Update `webapp/api/llm_profiles.py` — apply_profiles endpoint** + +In `apply_profiles()`, update the call to `apply_profiles_to_scenario`: +```python +patched = apply_profiles_to_scenario( + scenario_path=request.scenario_path, + judge_profile=role_profiles["judge"], + answer_profile=role_profiles["answer"], + dataset_profile=role_profiles["dataset"], + metric_weights=request.metric_weights, + doc_weights=request.doc_weights, +) +``` + +- [ ] **Step 6: Run tests to verify they pass** + +``` +python -m pytest tests/webapp/test_llm_profiles_api.py -v +``` +Expected: all 15 tests PASS + +- [ ] **Step 7: Commit** + +``` +git add webapp/services/yaml_patcher.py webapp/models.py webapp/api/llm_profiles.py tests/webapp/test_llm_profiles_api.py +git commit -m "feat: yaml_patcher and ProfileApplyRequest support metric_weights and doc_weights" +``` + +--- + +## Task 6: report_builder 和 run_reader 加权支持 + +**Files:** +- Modify: `webapp/services/run_reader.py` +- Modify: `webapp/services/report_builder.py` +- Modify: `webapp/models.py` (ReportData 新增 weighted_score_mean、metric_weights、doc_weights) + +**Interfaces:** +- Consumes: + - `weighted_metric_means(score_rows, metrics, doc_weights)` from `rag_eval.metrics.weights` + - `compute_overall_weighted_score_mean(score_rows, metric_weights, doc_weights)` from `rag_eval.metrics.weights` +- Produces: + - `ReportData.weighted_score_mean: float | None`,以及 `ReportData.metric_weights` / `ReportData.doc_weights: dict[str, float]` + - `_read_weights_from_snapshot(run_dir) -> tuple[dict[str, float], dict[str, float]]` in run_reader + +- [ ] **Step 1: Update `webapp/models.py` — ReportData** + 
+Add `weighted_score_mean` field: +```python +class ReportData(BaseModel): + """Aggregated report payload rendered by the report detail page.""" + + metrics: list[str] = Field(default_factory=list) + metric_means: dict[str, float | None] = Field(default_factory=dict) + distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict) + groupings: dict[str, list[GroupStat]] = Field(default_factory=dict) + lowest_samples: list[SampleScore] = Field(default_factory=list) + summary_markdown: str = "" + advice_markdown: str = "" + weighted_score_mean: float | None = Field( + default=None, + description="加权综合得分均值(metric_weights × doc_weights 共同作用)。等权时等于各指标均值的均值。", + ) + metric_weights: dict[str, float] = Field( + default_factory=dict, + description="该次运行使用的指标权重配置(来自 scenario.snapshot.yaml)。", + ) + doc_weights: dict[str, float] = Field( + default_factory=dict, + description="该次运行使用的文档权重配置(来自 scenario.snapshot.yaml)。", + ) +``` + +- [ ] **Step 2: Add `_read_weights_from_snapshot` to `webapp/services/run_reader.py`** + +Add after `_read_metrics_from_snapshot`: +```python +def _read_weights_from_snapshot(run_dir: Path) -> tuple[dict[str, float], dict[str, float]]: + """Read metric_weights and doc_weights from a scenario snapshot if present. + + Returns a (metric_weights, doc_weights) tuple of plain dicts. + Both default to empty dicts when the snapshot is absent or lacks the fields. 
+ """ + snapshot = run_dir / "scenario.snapshot.yaml" + if not snapshot.is_file(): + return {}, {} + try: + payload = yaml.safe_load(snapshot.read_text(encoding="utf-8")) or {} + except (OSError, yaml.YAMLError): + return {}, {} + mw = payload.get("metric_weights") or {} + dw = payload.get("doc_weights") or {} + return ( + {str(k): float(v) for k, v in mw.items() if isinstance(v, (int, float))}, + {str(k): float(v) for k, v in dw.items() if isinstance(v, (int, float))}, + ) +``` + +- [ ] **Step 3: Update `webapp/services/report_builder.py`** + +Replace `_metric_means` call and `build_report` to use weighted versions: + +```python +# Add imports at top: +from rag_eval.metrics.weights import ( + compute_overall_weighted_score_mean, + weighted_metric_means as _weighted_metric_means, +) +from webapp.services.run_reader import _read_weights_from_snapshot +``` + +Replace `build_report`: +```python +def build_report(run_dir: Path, metrics: list[str]) -> ReportData: + """Build the full aggregated report payload for one run directory.""" + frame = run_reader.read_scores_frame(run_dir) + summary_markdown = run_reader.read_summary_markdown(run_dir) + advice_markdown = run_reader.read_advice_markdown(run_dir) + metric_weights, doc_weights = _read_weights_from_snapshot(run_dir) + + if frame.empty or not metrics: + return ReportData( + metrics=metrics, + metric_means={metric: None for metric in metrics}, + summary_markdown=summary_markdown, + advice_markdown=advice_markdown, + metric_weights=metric_weights, + doc_weights=doc_weights, + ) + + score_rows_list = frame.to_dict(orient="records") + + # Use weighted metric means (degrades to arithmetic mean when weights are empty). 
+ w_means = _weighted_metric_means(score_rows_list, metrics, doc_weights) + rounded_means = {m: _round_or_none(v) for m, v in w_means.items()} + + overall_ws = compute_overall_weighted_score_mean( + score_rows_list, metric_weights, doc_weights + ) + + distributions = { + metric: _distribution(frame, metric) + for metric in metrics + if metric in frame.columns + } + + return ReportData( + metrics=metrics, + metric_means=rounded_means, + distributions=distributions, + groupings=_groupings(frame, metrics), + lowest_samples=_lowest_samples(frame, metrics), + summary_markdown=summary_markdown, + advice_markdown=advice_markdown, + weighted_score_mean=_round_or_none(overall_ws), + metric_weights=metric_weights, + doc_weights=doc_weights, + ) +``` + +Also delete the old `_metric_means` function (it is replaced by the weighted version). + +- [ ] **Step 4: Run existing webapp tests to check no regressions** + +``` +python -m pytest tests/webapp/ -v +``` +Expected: all PASS + +- [ ] **Step 5: Commit** + +``` +git add webapp/models.py webapp/services/run_reader.py webapp/services/report_builder.py +git commit -m "feat: report_builder uses weighted metric means; ReportData gains weighted_score_mean" +``` + +--- + +## Task 7: scenario_scanner — 读取权重字段供前端使用 + +**Files:** +- Modify: `webapp/models.py` (ScenarioInfo 新增字段) +- Modify: `webapp/services/scenario_scanner.py` + +**Interfaces:** +- Produces: + - `ScenarioInfo.metric_weights: dict[str, float]` + - `ScenarioInfo.doc_weights: dict[str, float]` + +- [ ] **Step 1: Add fields to `ScenarioInfo` in `webapp/models.py`** + +```python +class ScenarioInfo(BaseModel): + """One discoverable scenario YAML file that can be evaluated from the UI.""" + + path: str + scenario_name: str = "" + mode: str = "" + dataset: str = "" + judge_model: str = "" + metrics: list[str] = Field(default_factory=list) + error: str = "" + metric_weights: dict[str, float] = Field(default_factory=dict) + doc_weights: dict[str, float] = 
Field(default_factory=dict) +``` + +- [ ] **Step 2: Update `webapp/services/scenario_scanner.py` — `_summarize_scenario`** + +After the `metric_list` line, add weight extraction: +```python +raw_metric_weights = payload.get("metric_weights") or {} +raw_doc_weights = payload.get("doc_weights") or {} +metric_weights = {str(k): float(v) for k, v in raw_metric_weights.items() + if isinstance(v, (int, float))} +doc_weights = {str(k): float(v) for k, v in raw_doc_weights.items() + if isinstance(v, (int, float))} + +return ScenarioInfo( + path=relative, + scenario_name=str(payload.get("scenario_name", "")), + mode=str(payload.get("mode", "")), + dataset=str(payload.get("dataset", "")), + judge_model=str(payload.get("judge_model", "")), + metrics=metric_list, + metric_weights=metric_weights, + doc_weights=doc_weights, +) +``` + +- [ ] **Step 3: Run existing tests** + +``` +python -m pytest tests/webapp/ tests/test_offline_eval.py -v +``` +Expected: all PASS + +- [ ] **Step 4: Commit** + +``` +git add webapp/models.py webapp/services/scenario_scanner.py +git commit -m "feat: ScenarioInfo exposes metric_weights and doc_weights from YAML" +``` + +--- + +## Task 8: 前端 — 权重配置面板 + 报告卡片 + +**Files:** +- Modify: `webapp/static/index.html` +- Modify: `webapp/static/js/runner.js` +- Modify: `webapp/static/css/app.css` +- Modify: `webapp/static/js/report.js` + +**Interfaces:** +- Consumes: `ScenarioInfo.metric_weights`, `ScenarioInfo.doc_weights` (from `/api/scenarios`) +- Consumes: `ReportData.weighted_score_mean`, `ReportData.metric_weights` (from `/api/runs/{id}`) +- Produces: weight panel HTML in 新建评估; weighted_score card in 报告页 + +- [ ] **Step 1: Add weight panel HTML to `index.html`** + +Add this block immediately after the closing tag of `#llm-assignment-panel` (before `
+ +``` + +- [ ] **Step 2: Add CSS to `app.css`** + +Add at the end of the file, before any `@media print` block: + +```css +/* ── 权重配置面板 ─────────────────────────────────── */ +.weight-config-panel { margin-top: 12px; } +.weight-section-title { font-size: 13px; font-weight: 600; color: var(--text); margin-bottom: 8px; } +.weight-rows { display: flex; flex-direction: column; gap: 6px; } +.weight-row { + display: flex; align-items: center; gap: 10px; + font-size: 13px; +} +.weight-row-label { min-width: 180px; color: var(--slate); font-family: monospace; } +.weight-row-input { + width: 80px; padding: 4px 8px; border: 1px solid var(--border); + border-radius: 6px; font-size: 13px; text-align: right; +} +.weight-row-input:focus { outline: none; border-color: #6366f1; } +.doc-weight-name { + flex: 1; padding: 4px 8px; border: 1px solid var(--border); + border-radius: 6px; font-size: 13px; min-width: 0; +} +.weight-row-remove { color: var(--bad); cursor: pointer; font-size: 14px; background: none; border: none; padding: 2px 6px; } +.weight-row-remove:hover { background: #fee2e2; border-radius: 4px; } + +/* weighted_score 指标卡片突出显示 */ +.metric-card.weighted-score-card { + border: 2px solid #6366f1; + background: #f5f3ff; +} +.metric-card.weighted-score-card .metric-name { color: #4f46e5; font-weight: 700; } +``` + +- [ ] **Step 3: Update `runner.js`** + +Replace the entire `runner.js` with: + +```javascript +// runner.js — 新建评估视图:列出场景、LLM角色配置、权重配置、触发评估、轮询任务状态。 + +const Runner = { + selectedScenario: null, + selectedScenarioInfo: null, + pollTimer: null, + lastRunId: null, + + init() { + document.getElementById("run-btn").addEventListener("click", () => Runner.trigger()); + document.getElementById("view-report-btn").addEventListener("click", () => { + if (Runner.lastRunId) { + App.enableReportNav(); + App.navigate("report", Runner.lastRunId); + } + }); + document.getElementById("add-doc-weight-btn").addEventListener("click", () => Runner._addDocWeightRow()); + }, + + async 
loadScenarios() { + const list = document.getElementById("scenario-list"); + list.innerHTML = '

加载中…

'; + try { + const data = await API.scenarios(); + const scenarios = data.scenarios || []; + if (scenarios.length === 0) { + list.innerHTML = '

未在 scenarios/ 下找到场景文件。

'; + return; + } + list.innerHTML = ""; + scenarios.forEach((sc) => list.appendChild(Runner.renderScenarioItem(sc))); + } catch (err) { + list.innerHTML = `

加载失败:${App.escape(err.message)}

`; + } + Runner._populateProfileSelects(); + }, + + async _populateProfileSelects() { + const cached = Profiles.getAll(); + const profiles = cached.length > 0 + ? cached + : (await API.profiles().catch(() => ({ profiles: [] }))).profiles; + ["role-judge", "role-answer", "role-dataset"].forEach(id => { + const sel = document.getElementById(id); + sel.innerHTML = ''; + profiles.forEach(p => { + const opt = document.createElement("option"); + opt.value = p.profile_id; + opt.textContent = `${p.name} (${p.model})`; + sel.appendChild(opt); + }); + }); + }, + + renderScenarioItem(sc) { + const item = document.createElement("div"); + const invalid = !!sc.error; + item.className = "scenario-item" + (invalid ? " invalid" : ""); + const modeTag = sc.mode + ? `${App.escape(sc.mode)}` + : ""; + const metricCount = (sc.metrics || []).length; + item.innerHTML = ` +
+
${App.escape(sc.scenario_name || sc.path)}
+
${App.escape(sc.path)}
+ ${sc.error ? `
${App.escape(sc.error)}
` : ""} +
+
+ ${modeTag} + ${metricCount} 指标 +
+ `; + if (!invalid) { + item.addEventListener("click", () => { + document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected")); + item.classList.add("selected"); + Runner.selectedScenario = sc.path; + Runner.selectedScenarioInfo = sc; + document.getElementById("selected-scenario").textContent = sc.path; + document.getElementById("run-btn").disabled = false; + document.getElementById("llm-assignment-panel").hidden = false; + Runner._renderWeightPanel(sc); + document.getElementById("weight-config-panel").hidden = false; + }); + } + return item; + }, + + // 根据选中场景渲染指标权重行(动态) + _renderWeightPanel(sc) { + const metricRows = document.getElementById("metric-weight-rows"); + metricRows.innerHTML = ""; + const metrics = sc.metrics || []; + const existingWeights = sc.metric_weights || {}; + metrics.forEach(metric => { + const row = document.createElement("div"); + row.className = "weight-row"; + const currentVal = existingWeights[metric] != null ? existingWeights[metric] : 1.0; + row.innerHTML = ` + ${App.escape(metric)} + + `; + metricRows.appendChild(row); + }); + + // 填充已有文档权重 + const docRows = document.getElementById("doc-weight-rows"); + docRows.innerHTML = ""; + const existingDocWeights = sc.doc_weights || {}; + Object.entries(existingDocWeights).forEach(([docName, w]) => { + Runner._addDocWeightRow(docName, w); + }); + }, + + // 添加一行文档权重输入 + _addDocWeightRow(docName = "", weight = 1.0) { + const container = document.getElementById("doc-weight-rows"); + const row = document.createElement("div"); + row.className = "weight-row"; + row.innerHTML = ` + + + + `; + row.querySelector(".weight-row-remove").addEventListener("click", () => row.remove()); + container.appendChild(row); + }, + + // 收集权重面板当前值 + _collectWeights() { + const metricWeights = {}; + document.querySelectorAll("#metric-weight-rows .weight-row-input").forEach(input => { + const metric = input.dataset.metric; + const val = parseFloat(input.value); + if (metric && !isNaN(val)) 
metricWeights[metric] = val; + }); + + const docWeights = {}; + document.querySelectorAll("#doc-weight-rows .weight-row").forEach(row => { + const nameInput = row.querySelector(".doc-weight-name"); + const valInput = row.querySelector(".weight-row-input"); + if (!nameInput || !valInput) return; + const name = nameInput.value.trim(); + const val = parseFloat(valInput.value); + if (name && !isNaN(val)) docWeights[name] = val; + }); + + // 如果全部指标权重均为 1.0 且无文档权重,不发送(等权,跳过) + const allDefault = Object.values(metricWeights).every(v => Math.abs(v - 1.0) < 1e-9) + && Object.keys(docWeights).length === 0; + if (allDefault) return { metricWeights: null, docWeights: null }; + return { metricWeights, docWeights }; + }, + + async trigger() { + if (!Runner.selectedScenario) return; + const runBtn = document.getElementById("run-btn"); + runBtn.disabled = true; + const panel = document.getElementById("task-panel"); + const logBox = document.getElementById("task-log"); + const statusBadge = document.getElementById("task-status"); + const reportBtn = document.getElementById("view-report-btn"); + panel.hidden = false; + reportBtn.hidden = true; + logBox.textContent = ""; + Runner._setStatus(statusBadge, "queued"); + try { + await Runner._applyProfilesIfNeeded(logBox); + const resp = await API.triggerEvaluation(Runner.selectedScenario); + Runner.poll(resp.task_id); + } catch (err) { + Runner._setStatus(statusBadge, "failed"); + logBox.textContent = (logBox.textContent ? 
logBox.textContent + "\n" : "") + `触发失败:${err.message}`; + runBtn.disabled = false; + } + }, + + async _applyProfilesIfNeeded(logBox) { + const judgeId = document.getElementById("role-judge").value; + const answerId = document.getElementById("role-answer").value; + const datasetId = document.getElementById("role-dataset").value; + const { metricWeights, docWeights } = Runner._collectWeights(); + + if (!judgeId && !answerId && !datasetId && !metricWeights && !docWeights) return; + + logBox.textContent = "正在将 LLM 配置和权重写入场景文件…\n"; + const body = { + scenario_path: Runner.selectedScenario, + judge_profile_id: judgeId || null, + answer_profile_id: answerId || null, + dataset_profile_id: datasetId || null, + metric_weights: metricWeights, + doc_weights: docWeights, + }; + const result = await API.applyProfiles(body); + const fields = (result.patched_fields || []).join(", "); + logBox.textContent += fields + ? `✓ 已更新字段:${fields}\n` + : "(未找到可更新的字段,继续运行)\n"; + }, + + poll(taskId) { + const logBox = document.getElementById("task-log"); + const statusBadge = document.getElementById("task-status"); + const reportBtn = document.getElementById("view-report-btn"); + const runBtn = document.getElementById("run-btn"); + if (Runner.pollTimer) clearInterval(Runner.pollTimer); + Runner.pollTimer = setInterval(async () => { + try { + const status = await API.taskStatus(taskId); + logBox.textContent = (status.logs || []).join("\n"); + logBox.scrollTop = logBox.scrollHeight; + Runner._setStatus(statusBadge, status.status); + if (status.status === "completed" || status.status === "failed") { + clearInterval(Runner.pollTimer); + runBtn.disabled = false; + if (status.status === "completed" && status.run_id) { + Runner.lastRunId = status.run_id; + sessionStorage.setItem("rag_run_id", status.run_id); + reportBtn.hidden = false; + } + } + } catch (err) { + clearInterval(Runner.pollTimer); + logBox.textContent += `\n轮询失败:${err.message}`; + runBtn.disabled = false; + } + }, 1200); + }, + + 
_setStatus(badge, status) { + badge.textContent = status; + badge.className = "badge " + status; + }, +}; +``` + +- [ ] **Step 4: Update `report.js` — renderMetricCards** + +In `renderMetricCards`, after the `metrics.forEach` loop that renders individual cards, append this block to show the weighted_score card: + +```javascript +// 在 renderMetricCards 方法末尾,metrics.forEach 之后追加: +const wsValue = report.weighted_score_mean; +const wsCard = document.createElement("div"); +wsCard.className = "metric-card weighted-score-card"; +const wsCls = App.scoreClass(wsValue); +const wsText = wsValue === null || wsValue === undefined ? "n/a" : wsValue.toFixed(2); +wsCard.innerHTML = ` +
${wsText}
+
综合加权得分
+`; +wrap.appendChild(wsCard); +``` + +- [ ] **Step 5: Verify app loads without JS errors** + +Start the webapp: +``` +python webmain.py +``` +Open http://localhost:8000, navigate to「新建评估」, click a scenario and verify: +- Weight panel appears below LLM 角色配置 +- Each metric is listed with its configured weight (1.0 when the scenario defines none) +- 「添加文档权重」button adds a new row +- Navigate to any report and verify「综合加权得分」card appears + +- [ ] **Step 6: Commit** + +``` +git add webapp/static/index.html webapp/static/js/runner.js webapp/static/css/app.css webapp/static/js/report.js +git commit -m "feat: add weight config panel to 新建评估 and weighted_score card to report" +``` + +--- + +## Task 9: 全量回归测试 + +- [ ] **Step 1: Run all tests** + +``` +python -m pytest tests/ -v --tb=short +``` +Expected: all previously-passing tests still PASS, new tests PASS. + +Note: failures in `webapp.test_*` (module import path issues) and `test_offline_eval::test_normalize_sample_pdf_offline_smoke_row` (missing CSV fixture) are known pre-existing issues, not regressions from this feature. + +- [ ] **Step 2: Run pipeline and llm-profiles tests explicitly** + +``` +python -m pytest tests/test_pipeline.py tests/webapp/test_llm_profiles_api.py tests/test_weights.py -v +``` +Expected: all PASS + +- [ ] **Step 3: Final commit** + +``` +git add .
+git commit -m "feat: metric & doc weights — full implementation complete + +- New rag_eval/metrics/weights.py with pure-function weight computation +- Scenario YAML supports metric_weights and doc_weights (optional, backward-compatible) +- scores.csv gains weighted_score and sample_weight columns +- summary.md shows weighted metric means and overall weighted_score +- yaml_patcher writes metric_weights/doc_weights on apply +- report_builder uses weighted means; ReportData gains weighted_score_mean +- 新建评估 page: weight config panel with metric weight inputs and doc weight rows +- 报告详情 page: 综合加权得分 card + +Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +```