feat(advisor): add optimization advisor module

- rag_eval/advisor/: new package with rules engine, LLM analyzer, writer
- rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples)
- llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback
- writer.py: writes optimization_advice.md + log summary
- __init__.py: run_advisor() entry point (no-op when optimization_advisor=False)
- Scenario.optimization_advisor: new bool field (default False)
- ScenarioModel: same field added; loader.py passes it through (透传)
- RunArtifactPaths.advice_md: new path field
- factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings
- runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end
- siemens online YAML: optimization_advisor: true enabled
- tests: 9 rules tests + 6 writer tests, all pass
- docs: advisor section added to engine-flow.md and architecture.md

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 17:06:19 +08:00
parent d68399d39b
commit f5c2dce64a
17 changed files with 2381 additions and 9 deletions
--- a/rag_eval/advisor/__init__.py
+++ b/rag_eval/advisor/__init__.py
@@ -0,0 +1,67 @@
+"""Optimization advisor: rule-based diagnosis + LLM-powered recommendations."""
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any
+
+from rag_eval.reporting.artifacts import build_artifact_paths
+from rag_eval.shared.models import EvaluationResult, Scenario
+
+from .llm_analyzer import analyze
+from .rules import Diagnosis, diagnose
+from .writer import write_advice
+
+logger = logging.getLogger("rag_eval.advisor")
+
+__all__ = ["run_advisor", "Diagnosis", "diagnose"]
+
+
+def run_advisor(
+    result: EvaluationResult,
+    scenario: Scenario,
+    llm: Any,
+) -> None:
+    """Run the optimization advisor pipeline after an evaluation completes.
+
+    Returns immediately when ``scenario.optimization_advisor`` is falsy.
+    Never raises: every failure is logged as a warning so the evaluation
+    result already produced by the caller is unaffected.
+
+    A failure of the LLM step alone does not abort the advisor; the advice
+    file is still written from the rule diagnoses (empty ``llm_markdown``
+    makes the writer use its rules-only report).
+
+    Args:
+        result: Completed evaluation result; ``run_id`` and ``score_rows``
+            are read from it.
+        scenario: Resolved scenario; ``optimization_advisor``,
+            ``scenario_name``, ``output_dir``, ``metrics`` and
+            ``judge_model`` are read from it.
+        llm: LLM instance forwarded unchanged to ``analyze()``.
+    """
+    if not scenario.optimization_advisor:
+        return
+
+    logger.info("[advisor] starting optimization analysis  scenario=%s", scenario.scenario_name)
+
+    try:
+        artifact_paths = build_artifact_paths(scenario.output_dir, result.run_id)
+        if artifact_paths.advice_md is None:
+            logger.warning("[advisor] advice_md path not set in RunArtifactPaths — skipping")
+            return
+
+        diagnoses = diagnose(result.score_rows, scenario.metrics)
+        logger.info("[advisor] rule diagnosis complete: %d metric(s) triggered", len(diagnoses))
+
+        llm_markdown = ""
+        if diagnoses:
+            coro = analyze(diagnoses, llm, scenario.scenario_name)
+            try:
+                llm_markdown = asyncio.run(coro)
+            except Exception as exc:
+                # asyncio.run() raises RuntimeError when this thread already
+                # runs an event loop. Close the coroutine so it is not
+                # reported as "never awaited", then continue with the
+                # rules-only report instead of dropping the advice file.
+                coro.close()
+                logger.warning(
+                    "[advisor] LLM analysis could not run (%s: %s) — falling back to rule report",
+                    type(exc).__name__, exc,
+                )
+
+        write_advice(
+            diagnoses=diagnoses,
+            llm_markdown=llm_markdown,
+            advice_path=artifact_paths.advice_md,
+            scenario_name=scenario.scenario_name,
+            run_id=result.run_id,
+            judge_model=scenario.judge_model,
+        )
+
+    except Exception as exc:
+        # Top-level boundary: the advisor is best-effort by design.
+        logger.warning(
+            "[advisor] advisor failed (%s: %s) — evaluation result is unaffected",
+            type(exc).__name__, exc,
+        )
--- a/rag_eval/advisor/llm_analyzer.py
+++ b/rag_eval/advisor/llm_analyzer.py
@@ -0,0 +1,99 @@
+"""LLM-powered analysis of rule diagnostics and low-score samples."""
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from .rules import Diagnosis
+
+logger = logging.getLogger("rag_eval.advisor")
+
+# Prompt sent to the LLM by analyze(). It is filled with str.format(), so the
+# only placeholders are {diagnosis_summary} and {low_sample_text}; any literal
+# brace added to this text would have to be doubled.
+# NOTE(review): the first sentence hard-codes a Siemens Healthineers CT
+# document-QA context although nothing else in this module is
+# scenario-specific — confirm this is intended for every scenario.
+_PROMPT_TEMPLATE = """\
+你是一个 RAG 系统优化专家，正在分析西门子医疗 CT 文档问答系统的评测结果。
+请用中文撰写一份优化建议报告，格式为 Markdown。
+
+## 评测诊断摘要
+
+{diagnosis_summary}
+
+## 低分样本示例
+
+{low_sample_text}
+
+## 报告要求
+
+1. 按指标分节（## 指标名  [severity]），先解释"为什么低"（结合低分样本具体分析），再给出"具体怎么改"
+2. "具体怎么改"要结合低分样本的实际内容，而不只是泛泛建议
+3. 最后写一节 **## 优先优化次序**，按性价比排序（不增加 LLM 调用次数的优化优先）
+4. 语言简洁，面向工程师，不要废话，不要重复列表内容
+
+只输出 Markdown 报告正文，不要任何前置说明。
+"""
+
+
+def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
+    """Render every diagnosis as a three-line Markdown bullet group.
+
+    Each diagnosis yields one headline bullet (metric, mean, threshold,
+    severity) followed by two nested bullets listing root causes and
+    suggested actions, each joined with "; ". The groups are joined with
+    newlines; an empty input gives an empty string.
+    """
+    chunks: list[str] = []
+    for diag in diagnoses:
+        # noise_sensitivity is the one metric flagged as "lower is better".
+        if diag.metric == "noise_sensitivity":
+            note = "（越低越好）"
+        else:
+            note = ""
+        causes = "; ".join(diag.root_causes)
+        actions = "; ".join(diag.suggested_actions)
+        chunks.extend((
+            f"- **{diag.metric}** {note} 均值={diag.mean_score:.4f}，阈值={diag.threshold}，严重程度={diag.severity}",
+            f"  - 可能原因：{causes}",
+            f"  - 建议动作：{actions}",
+        ))
+    return "\n".join(chunks)
+
+
+def _clip(value: Any, limit: int) -> str:
+    """Return ``value`` as text cut to ``limit`` characters; ``None`` becomes ""."""
+    return "" if value is None else str(value)[:limit]
+
+
+def _build_low_sample_text(diagnoses: list[Diagnosis]) -> str:
+    """Render the low-score samples of each diagnosis as Markdown.
+
+    Diagnoses without low samples are skipped. For every sample the score,
+    the question, the answer (first 300 characters) and the ground truth
+    (first 200 characters) are listed.
+
+    A sample whose ``answer`` or ``ground_truth`` is present but ``None``
+    or not a string is rendered as text instead of raising, so that one
+    malformed row cannot abort prompt construction.
+
+    Returns:
+        The sections joined with newlines, or "" when nothing is rendered.
+    """
+    lines = []
+    for d in diagnoses:
+        if not d.low_samples:
+            continue
+        lines.append(f"### {d.metric} 低分样本（最多 3 条）")
+        for i, s in enumerate(d.low_samples, 1):
+            score = s.get(d.metric, "N/A")
+            lines.append(f"\n**样本 {i}**（分数={score}）")
+            lines.append(f"- 问题：{s.get('question', '')}")
+            lines.append(f"- 回答：{_clip(s.get('answer'), 300)}")
+            lines.append(f"- 标准答案：{_clip(s.get('ground_truth'), 200)}")
+    return "\n".join(lines)
+
+
+async def analyze(
+    diagnoses: list[Diagnosis],
+    llm: Any,
+    scenario_name: str,
+) -> str:
+    """Ask the LLM for a Chinese Markdown optimization report.
+
+    The prompt is built from the diagnosis summary and the low-score
+    samples, sent as a single human message, and the stripped text of the
+    first generation is returned.
+
+    Args:
+        diagnoses: Diagnoses from rules.diagnose(); an empty list
+            short-circuits to "".
+        llm: Object exposing an awaitable ``agenerate`` method.
+        scenario_name: Used only for logging.
+
+    Returns:
+        LLM-generated Markdown string, or "" on any failure (prompt
+        construction, import, LLM call or response parsing), which makes
+        the writer fall back to its rules-only report.
+    """
+    if not diagnoses:
+        return ""
+
+    try:
+        # Prompt assembly is inside the try on purpose: a malformed sample
+        # must lead to the rules-only fallback, not to an exception.
+        prompt = _PROMPT_TEMPLATE.format(
+            diagnosis_summary=_build_diagnosis_summary(diagnoses),
+            low_sample_text=_build_low_sample_text(diagnoses),
+        )
+        logger.info("[advisor] calling LLM for optimization analysis  scenario=%s", scenario_name)
+        # Imported lazily so a missing langchain_core also ends in the fallback.
+        from langchain_core.messages import HumanMessage
+        # NOTE(review): assumes llm.agenerate() accepts a `texts` keyword and
+        # returns an object with `.generations[0][0].text` — confirm against
+        # the wrapper type actually returned by build_models().
+        result = await llm.agenerate(texts=[[HumanMessage(content=prompt)]])
+        text = result.generations[0][0].text.strip()
+        logger.info("[advisor] LLM analysis complete  chars=%d", len(text))
+        return text
+    except Exception as exc:
+        logger.warning(
+            "[advisor] LLM analysis failed (%s: %s) — falling back to rule report",
+            type(exc).__name__, exc,
+        )
+        return ""
--- a/rag_eval/advisor/rules.py
+++ b/rag_eval/advisor/rules.py
@@ -0,0 +1,236 @@
+"""Rule-based diagnostic engine for RAG evaluation metric scores."""
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class MetricRule:
+    """Threshold configuration and diagnostic text for one metric."""
+    # Mean-score boundary for a "warning" diagnosis (strict comparison in diagnose()).
+    warning_threshold: float
+    # Mean-score boundary for a "critical" diagnosis; checked before the warning one.
+    critical_threshold: float
+    # True: a mean below a threshold triggers; False (noise_sensitivity): a mean above it triggers.
+    higher_is_better: bool
+    # Candidate explanations copied into every Diagnosis for this metric.
+    root_causes: list[str]
+    # Recommended remediation steps copied into every Diagnosis for this metric.
+    suggested_actions: list[str]
+
+
+# Rule table keyed by metric name. diagnose() looks metrics up here and skips
+# any name that has no entry. For higher_is_better metrics the critical
+# threshold is the lower of the two values; for noise_sensitivity it is the
+# higher one, because a larger mean is worse there.
+METRIC_RULES: dict[str, MetricRule] = {
+    "faithfulness": MetricRule(
+        warning_threshold=0.7,
+        critical_threshold=0.5,
+        higher_is_better=True,
+        root_causes=[
+            "生成回答包含检索片段中不支持的陈述（幻觉）",
+            "生成阶段未严格遵循 grounding 约束",
+            "校验阶段未开启或未生效",
+        ],
+        suggested_actions=[
+            "强化生成 prompt 的 grounding 约束（'只依据参考资料作答'）",
+            "开启校验阶段（validation: by_scenario）",
+            "检查低分样本中模型是否引用了片段外的知识",
+        ],
+    ),
+    "answer_relevancy": MetricRule(
+        warning_threshold=0.7,
+        critical_threshold=0.5,
+        higher_is_better=True,
+        root_causes=[
+            "回答偏离问题主旨或包含大量冗余内容",
+            "查询改写后问题语义漂移",
+            "生成 prompt 格式约束不足",
+        ],
+        suggested_actions=[
+            "优化查询改写 prompt，确保改写后语义不偏移",
+            "在生成 prompt 中加入'简洁准确、直接回答问题'的约束",
+            "检查低分样本的回答是否存在格式冗余或话题偏移",
+        ],
+    ),
+    "context_recall": MetricRule(
+        warning_threshold=0.7,
+        critical_threshold=0.5,
+        higher_is_better=True,
+        root_causes=[
+            "检索未能召回标准答案所涉及的关键信息",
+            "单一查询未能覆盖问题的多个角度",
+            "过召回数量不足，关键片段被截断",
+        ],
+        suggested_actions=[
+            "启用多查询扩展（use_multi_query）覆盖不同措辞",
+            "对多跳问题启用问题分解（sub_questions）",
+            "加大过召回宽度（recall_top_k）",
+            "对颗粒度细的问题尝试 Step-back 双路检索",
+        ],
+    ),
+    "context_precision": MetricRule(
+        warning_threshold=0.6,
+        critical_threshold=0.4,
+        higher_is_better=True,
+        root_causes=[
+            "检索引入过多与问题无关的片段",
+            "重排未能将相关片段排在前列",
+            "缺少相关性过滤，噪声片段进入上下文",
+        ],
+        suggested_actions=[
+            "启用或优化 listwise 重排，将相关片段排在前列",
+            "启用上下文压缩（compression）过滤无关句子",
+            "启用相关性过滤（relevance_filter）丢弃明确无关片段",
+            "缩小 rerank_keep_k（如从 8 降到 5）",
+        ],
+    ),
+    "noise_sensitivity": MetricRule(
+        warning_threshold=0.3,   # higher is worse; trigger when mean > threshold
+        critical_threshold=0.5,
+        higher_is_better=False,
+        root_causes=[
+            "回答中包含检索到的噪声片段所引入的错误陈述",
+            "相关性过滤未能拦截干扰性片段",
+            "生成阶段对噪声片段未加区分地引用",
+        ],
+        suggested_actions=[
+            "启用相关性过滤（relevance_filter）拦截噪声",
+            "优化重排，将不相关片段排到截断点之后",
+            "在生成 prompt 中强调'来源冲突时并列陈述，不擅自下定论'",
+        ],
+    ),
+    "factual_correctness": MetricRule(
+        warning_threshold=0.6,
+        critical_threshold=0.4,
+        higher_is_better=True,
+        root_causes=[
+            "回答的事实陈述与标准答案存在偏差",
+            "检索未能命中标准答案所依据的关键片段",
+            "生成阶段对多个来源综合时产生事实错误",
+        ],
+        suggested_actions=[
+            "重点检查低分样本，确认是检索遗漏还是生成错误",
+            "提升 context_recall 以确保关键信息被检索到",
+            "对事实型问题将 temperature 降至 0",
+        ],
+    ),
+    "semantic_similarity": MetricRule(
+        warning_threshold=0.7,
+        critical_threshold=0.5,
+        higher_is_better=True,
+        root_causes=[
+            "回答语义与标准答案差距较大",
+            "回答过于简短或过于冗长，语义偏移",
+            "检索到的片段质量不足，导致生成内容偏离",
+        ],
+        suggested_actions=[
+            "检查低分样本的回答与标准答案的表述差异",
+            "优化生成 prompt 使回答更贴近标准表述风格",
+            "提升检索质量（context_recall / context_precision）",
+        ],
+    ),
+}
+
+
+@dataclass
+class Diagnosis:
+    """Diagnostic result for one metric that triggered a threshold."""
+    # Metric name, i.e. the key used in METRIC_RULES and in the score rows.
+    metric: str
+    # NaN-excluding mean over all rows; diagnose() rounds it to 4 decimals.
+    mean_score: float
+    threshold: float          # the triggered threshold
+    severity: str             # "warning" | "critical"
+    # Copies of the rule's diagnostic texts, so the rule table is never shared.
+    root_causes: list[str] = field(default_factory=list)
+    suggested_actions: list[str] = field(default_factory=list)
+    # Worst-scoring rows, trimmed to sample_id/question/answer/ground_truth plus the metric column.
+    low_samples: list[dict[str, Any]] = field(default_factory=list)
+
+
+def _mean_ignoring_nan(values: list[float]) -> float | None:
+    """Average the non-NaN entries of ``values``.
+
+    Returns ``None`` when the list is empty or holds only NaN.
+    """
+    kept = list(filter(lambda number: not math.isnan(number), values))
+    return sum(kept) / len(kept) if kept else None
+
+
+def _select_low_samples(
+    rows: list[dict[str, Any]],
+    metric: str,
+    top_n: int,
+    higher_is_better: bool,
+) -> list[dict[str, Any]]:
+    """Return the ``top_n`` worst-scoring rows for ``metric``.
+
+    Rows are ignored when the metric key is missing or its value is NaN,
+    ``None`` or otherwise not convertible to float, which is the same set
+    of rows that diagnose() leaves out of the mean. "Worst" means the
+    lowest scores when ``higher_is_better`` is true and the highest scores
+    otherwise; ties keep their input order.
+
+    Returns:
+        Copies of the selected rows restricted to the keys ``sample_id``,
+        ``question``, ``answer``, ``ground_truth`` and ``metric``.
+    """
+    scored: list[tuple[float, dict[str, Any]]] = []
+    for row in rows:
+        try:
+            score = float(row[metric])
+        except (KeyError, TypeError, ValueError):
+            # Missing, None or non-numeric score: not a usable sample.
+            continue
+        if not math.isnan(score):
+            scored.append((score, row))
+
+    # Ascending puts the lowest scores first; reversed for lower-is-better metrics.
+    scored.sort(key=lambda pair: pair[0], reverse=not higher_is_better)
+    keep_keys = {"sample_id", "question", "answer", "ground_truth", metric}
+    return [
+        {k: v for k, v in row.items() if k in keep_keys}
+        for _, row in scored[:top_n]
+    ]
+
+
+def diagnose(
+    score_rows: list[dict[str, Any]],
+    metrics: list[str],
+    top_low_samples: int = 3,
+) -> list[Diagnosis]:
+    """Compare each metric's mean score with its rule thresholds.
+
+    Metrics without an entry in METRIC_RULES, without any numeric value, or
+    whose values are all NaN are skipped. A threshold is crossed only by a
+    strict comparison, so a mean exactly equal to it does not trigger.
+
+    Args:
+        score_rows: Per-sample score dicts (from EvaluationResult.score_rows).
+        metrics: Metric names to check (from Scenario.metrics).
+        top_low_samples: Number of worst-scoring samples attached to each
+            diagnosis.
+
+    Returns:
+        One Diagnosis per triggered metric, in the order of ``metrics``;
+        an empty list when nothing triggered.
+    """
+    triggered: list[Diagnosis] = []
+
+    for name in metrics:
+        rule = METRIC_RULES.get(name)
+        if rule is None:
+            # No rule registered for this metric name.
+            continue
+
+        # Gather every value that converts to float; None and junk are dropped.
+        scores: list[float] = []
+        for row in score_rows:
+            cell = row.get(name)
+            if cell is None:
+                continue
+            try:
+                scores.append(float(cell))
+            except (TypeError, ValueError):
+                continue
+
+        average = _mean_ignoring_nan(scores) if scores else None
+        if average is None:
+            continue
+
+        # Try the stricter level first; the comparison direction depends on
+        # whether larger scores are good or bad for this metric.
+        for level, limit in (
+            ("critical", rule.critical_threshold),
+            ("warning", rule.warning_threshold),
+        ):
+            breached = average < limit if rule.higher_is_better else average > limit
+            if breached:
+                break
+        else:
+            # Neither threshold crossed: this metric is healthy.
+            continue
+
+        triggered.append(
+            Diagnosis(
+                metric=name,
+                mean_score=round(average, 4),
+                threshold=limit,
+                severity=level,
+                root_causes=list(rule.root_causes),
+                suggested_actions=list(rule.suggested_actions),
+                low_samples=_select_low_samples(
+                    score_rows, name, top_low_samples, rule.higher_is_better
+                ),
+            )
+        )
+
+    return triggered
--- a/rag_eval/advisor/writer.py
+++ b/rag_eval/advisor/writer.py
@@ -0,0 +1,82 @@
+"""Write optimization advice to markdown file and emit log summary."""
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+from .rules import Diagnosis
+
+logger = logging.getLogger("rag_eval.advisor")
+
+
+def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
+    """Build the one-line log message summarising the triggered diagnoses.
+
+    With no diagnoses a fixed "all metrics normal" message is returned;
+    otherwise each diagnosis is shown as ``metric(mean, severity)``,
+    followed by the advice file path.
+    """
+    if diagnoses:
+        triggered = " ".join(
+            f"{diag.metric}({diag.mean_score:.2f}, {diag.severity})" for diag in diagnoses
+        )
+        return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered}  →  {advice_path}"
+    return "[advisor] 所有指标正常，无需优化建议。"
+
+
+def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
+    """Compose the rules-only Markdown report used when no LLM text exists.
+
+    Returns "" for an empty input. Otherwise a title is followed, per
+    diagnosis, by a heading (metric, severity, mean) and two bullet lists:
+    root causes and suggested actions.
+    """
+    if not diagnoses:
+        return ""
+
+    out = ["## 规则诊断（LLM 分析不可用）\n"]
+    for diag in diagnoses:
+        out.append(f"### {diag.metric}  [{diag.severity}]  均值={diag.mean_score:.4f}")
+        out.append("\n**可能原因：**")
+        out.extend(f"- {cause}" for cause in diag.root_causes)
+        out.append("\n**建议动作：**")
+        out.extend(f"- {step}" for step in diag.suggested_actions)
+        # Blank entry separates this section from the next one.
+        out.append("")
+    return "\n".join(out)
+
+
+def write_advice(
+    diagnoses: list[Diagnosis],
+    llm_markdown: str,
+    advice_path: Path,
+    scenario_name: str,
+    run_id: str,
+    judge_model: str,
+) -> None:
+    """Write the advice Markdown file and log a summary of it.
+
+    The file consists of a header (scenario, run id, generation time, judge
+    model) followed by one of three bodies: a fixed "no anomalies" notice
+    when there are no diagnoses, the LLM text when it is non-empty, or the
+    rules-only fallback report otherwise.
+
+    Args:
+        diagnoses: Diagnoses from rules.diagnose().
+        llm_markdown: LLM-generated Markdown body; empty selects the fallback.
+        advice_path: Destination file; missing parent directories are created.
+        scenario_name: Scenario identifier shown in the report title.
+        run_id: Run identifier shown in the header.
+        judge_model: Model name shown in the header.
+    """
+    advice_path.parent.mkdir(parents=True, exist_ok=True)
+
+    from rag_eval.shared.utils import utc_now_iso
+    header = "\n".join((
+        f"# 优化建议报告 — {scenario_name}",
+        "",
+        f"- run_id: `{run_id}`",
+        f"- 生成时间: `{utc_now_iso()}`",
+        f"- judge_model: `{judge_model}`",
+        "",
+        "---",
+        "",
+    ))
+
+    if diagnoses:
+        # Prefer the LLM text; an empty one means analysis was unavailable.
+        body = llm_markdown or _build_fallback_report(diagnoses)
+    else:
+        body = "## ✅ 未发现明显指标异常\n\n所有指标均在正常范围内，当前 RAG 链路表现良好。\n"
+
+    advice_path.write_text(header + body, encoding="utf-8")
+
+    logger.info(_format_log_summary(diagnoses, advice_path))
+    logger.info("[advisor] 优化建议已写出: %s", advice_path)
--- a/rag_eval/config/loader.py
+++ b/rag_eval/config/loader.py
@@ -61,6 +61,7 @@ def load_scenario(path: str | Path) -> Scenario:
            max_samples=model.runtime.max_samples,
        ),
        source_path=scenario_path,
+        optimization_advisor=model.optimization_advisor,
    )
    # Run cross-field checks after all relative paths have been resolved.
    validate_scenario(scenario)
--- a/rag_eval/config/schema.py
+++ b/rag_eval/config/schema.py
@@ -54,6 +54,7 @@ class ScenarioModel(BaseModel):
    metrics: list[str]
    output_dir: str
    runtime: RuntimeConfigModel = Field(default_factory=RuntimeConfigModel)
+    optimization_advisor: bool = False

    @field_validator("metrics")
    @classmethod
--- a/rag_eval/execution/runner.py
+++ b/rag_eval/execution/runner.py
@@ -8,8 +8,9 @@ from pathlib import Path

 from rag_eval.adapters.http import HttpAppAdapter
 from rag_eval.adapters.python import PythonFunctionAdapter
+from rag_eval.advisor import run_advisor
 from rag_eval.config.loader import load_scenario
-from rag_eval.metrics.factory import build_metric_pipeline
+from rag_eval.metrics.factory import build_models, build_metric_pipeline
 from rag_eval.reporting.writers import write_run_artifacts
 from rag_eval.settings import EvaluationSettings
 from rag_eval.shared.models import Scenario
@@ -67,10 +68,17 @@ def run_scenario(
    logger.info("[runner] scenario loaded: name=%s  mode=%s  max_samples=%s",
                scenario.scenario_name, scenario.mode, scenario.runtime.max_samples)

+    # Build models once; reuse llm in both MetricPipeline and advisor.
+    llm, embeddings = build_models(scenario.judge_model, scenario.embedding_model, settings)
+
    adapter = build_adapter(scenario)
-    pipeline = build_metric_pipeline(scenario, settings)
+    pipeline = build_metric_pipeline(scenario, settings, llm=llm, embeddings=embeddings)
    evaluator = Evaluator(scenario=scenario, metric_pipeline=pipeline, app_adapter=adapter)
    result = evaluator.evaluate()
    write_run_artifacts(result)
    logger.info("[runner] artifacts written for run_id=%s", result.run_id)
+
+    # Optimization advisor — runs only if scenario.optimization_advisor is True.
+    run_advisor(result, scenario, llm)
+
    return result
--- a/rag_eval/metrics/factory.py
+++ b/rag_eval/metrics/factory.py
@@ -18,7 +18,10 @@ from ragas.metrics.collections import (
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
+    FactualCorrectness,
    Faithfulness,
+    NoiseSensitivity,
+    SemanticSimilarity,
 )

 from .pipeline import MetricPipeline
@@ -39,19 +42,34 @@ def build_models(
 def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
+    llm: Any | None = None,
+    embeddings: Any | None = None,
 ) -> MetricPipeline:
-    """Build a metric pipeline containing only the metrics requested by the scenario."""
-    llm, embeddings = build_models(
-        scenario.judge_model,
-        scenario.embedding_model,
-        settings,
-    )
+    """Build a metric pipeline containing only the metrics requested by the scenario.
+
+    If llm and embeddings are provided (pre-built by the caller), they are reused.
+    Otherwise, new instances are created from scenario + settings.
+    """
+    if llm is None or embeddings is None:
+        llm, embeddings = build_models(
+            scenario.judge_model,
+            scenario.embedding_model,
+            settings,
+        )
+
    # Build the full registry once, then slice it by configured metric names.
    registry: dict[str, Any] = {
        "faithfulness": Faithfulness(llm=llm),
        "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": ContextRecall(llm=llm),
        "context_precision": ContextPrecision(llm=llm),
+        # Robustness / end-to-end metrics (架构设计 §10.2).
+        # NoiseSensitivity mode='relevant': sensitivity to noise from relevant contexts.
+        "noise_sensitivity": NoiseSensitivity(llm=llm),
+        # FactualCorrectness mode='f1': balances claim precision and recall vs. ground truth.
+        "factual_correctness": FactualCorrectness(llm=llm),
+        # SemanticSimilarity: embedding cosine between answer and ground truth (no LLM call).
+        "semantic_similarity": SemanticSimilarity(embeddings=embeddings),
    }
    return MetricPipeline(
        metrics={name: registry[name] for name in scenario.metrics},
--- a/rag_eval/reporting/artifacts.py
+++ b/rag_eval/reporting/artifacts.py
@@ -17,4 +17,5 @@ def build_artifact_paths(output_dir: Path, run_id: str) -> RunArtifactPaths:
        invalid_csv=run_dir / "invalid.csv",
        summary_md=run_dir / "summary.md",
        metadata_json=run_dir / "metadata.json",
+        advice_md=run_dir / "optimization_advice.md",
    )
--- a/rag_eval/shared/models.py
+++ b/rag_eval/shared/models.py
@@ -76,6 +76,7 @@ class Scenario:
    runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
    app_adapter: AppAdapterConfig | None = None
    source_path: Path | None = None
+    optimization_advisor: bool = False

    def snapshot(self) -> dict[str, Any]:
        """Serialize the scenario into a reporting-friendly dictionary snapshot."""
@@ -159,3 +160,4 @@ class RunArtifactPaths:
    invalid_csv: Path
    summary_md: Path
    metadata_json: Path
+    advice_md: Path | None = None