Files
siemens_ragas/rag_eval/advisor/llm_analyzer.py
wangwei f5c2dce64a feat(advisor): add optimization advisor module
- rag_eval/advisor/: new package with rules engine, LLM analyzer, writer
  - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples)
  - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback
  - writer.py: writes optimization_advice.md + log summary
  - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False)
- Scenario.optimization_advisor: new bool field (default False)
- ScenarioModel: same field added; loader.py passes it through
- RunArtifactPaths.advice_md: new path field
- factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings
- runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end
- siemens online YAML: optimization_advisor: true enabled
- tests: 9 rules tests + 6 writer tests, all pass
- docs: advisor section added to engine-flow.md and architecture.md

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 17:06:19 +08:00

100 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""LLM-powered analysis of rule diagnostics and low-score samples."""
from __future__ import annotations
import logging
from typing import Any
from .rules import Diagnosis
# Logger registered under the advisor package name ("rag_eval.advisor")
# rather than under this module's own name.
logger = logging.getLogger("rag_eval.advisor")
# Prompt sent to the judge LLM. The text is Chinese and asks for a Chinese
# Markdown report. It has two placeholders, {diagnosis_summary} and
# {low_sample_text}, which analyze() fills in through str.format(); any
# literal brace added to this text therefore has to be doubled.
_PROMPT_TEMPLATE = """\
你是一个 RAG 系统优化专家,正在分析西门子医疗 CT 文档问答系统的评测结果。
请用中文撰写一份优化建议报告,格式为 Markdown。
## 评测诊断摘要
{diagnosis_summary}
## 低分样本示例
{low_sample_text}
## 报告要求
1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先)
4. 语言简洁,面向工程师,不要废话,不要重复列表内容
只输出 Markdown 报告正文,不要任何前置说明。
"""
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
    """Render the rule diagnostics as a Markdown bullet list for the prompt.

    Every diagnosis contributes three lines: a headline carrying the metric
    name, the mean score (four decimals), the threshold and the severity,
    then one line with its root causes and one with its suggested actions.

    Args:
        diagnoses: Diagnosis objects; the attributes read from each are
            ``metric``, ``mean_score``, ``threshold``, ``severity``,
            ``root_causes`` and ``suggested_actions``.

    Returns:
        The generated lines joined by newlines, or "" for an empty list.
    """
    lines = []
    for d in diagnoses:
        # noise_sensitivity is the only metric tagged as lower-is-better, so
        # the headline says so explicitly for the LLM.
        direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
        # The comma after the mean keeps the number from running straight
        # into the threshold label (previously "均值=0.5000阈值=0.7").
        lines.append(
            f"- **{d.metric}** {direction} 均值={d.mean_score:.4f},"
            f"阈值={d.threshold},严重程度={d.severity}"
        )
        lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
        lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
    return "\n".join(lines)
def _clip_text(value: object, limit: int) -> str:
    """Return *value* as text cut to at most *limit* characters ("" for None)."""
    if value is None:
        return ""
    return str(value)[:limit]


def _build_low_sample_text(diagnoses: list[Diagnosis]) -> str:
    """Render each diagnosis' low-score samples as Markdown for the prompt.

    Diagnoses without low samples are skipped. For the others a ``###``
    heading is emitted, followed by one entry per sample showing its score,
    question, answer (first 300 characters) and ground truth (first 200
    characters).

    Args:
        diagnoses: Diagnosis objects; ``metric`` and ``low_samples`` are read
            from each. Every sample must expose a mapping-style ``.get``.

    Returns:
        The generated lines joined by newlines, or "" when no diagnosis has
        any low sample.
    """
    lines = []
    for d in diagnoses:
        if not d.low_samples:
            continue
        lines.append(f"### {d.metric} 低分样本(最多 3 条)")
        for i, s in enumerate(d.low_samples, 1):
            # The sample's score is stored under the metric's own name.
            score = s.get(d.metric, "N/A")
            # Leading "\n" leaves a blank line before each sample entry; the
            # closing parenthesis balances the one opened before "分数".
            lines.append(f"\n**样本 {i}**(分数={score})")
            lines.append(f"- 问题:{s.get('question', '')}")
            # _clip_text tolerates None / non-string values, which a bare
            # slice on the .get() result would not.
            lines.append(f"- 回答:{_clip_text(s.get('answer'), 300)}")
            lines.append(f"- 标准答案:{_clip_text(s.get('ground_truth'), 200)}")
    return "\n".join(lines)
async def analyze(
    diagnoses: list[Diagnosis],
    llm: Any,
    scenario_name: str,
) -> str:
    """Call the judge LLM to generate a Chinese optimization report.

    The diagnoses are rendered into the module's prompt template, sent to the
    LLM as a single human message, and the first generation's text is
    returned with surrounding whitespace stripped.

    Args:
        diagnoses: List of Diagnosis from rules.diagnose(); an empty list
            short-circuits to "" without calling the LLM.
        llm: RAGAS LLM wrapper (has .agenerate() method).
        scenario_name: Used only for logging.

    Returns:
        LLM-generated Markdown string, or "" on failure (triggers writer
        fallback). Any ``Exception`` raised while building the prompt,
        importing langchain_core, calling the LLM or reading its result is
        logged as a warning and converted into "".
    """
    if not diagnoses:
        return ""

    try:
        # Prompt assembly is part of the guarded region: a malformed
        # diagnosis or sample must lead to the "" fallback as well, instead
        # of escaping to the caller.
        prompt = _PROMPT_TEMPLATE.format(
            diagnosis_summary=_build_diagnosis_summary(diagnoses),
            low_sample_text=_build_low_sample_text(diagnoses),
        )
        logger.info("[advisor] calling LLM for optimization analysis scenario=%s", scenario_name)
        # Imported here, inside the try, so a missing langchain_core also
        # degrades to the fallback rather than breaking module import.
        from langchain_core.messages import HumanMessage

        # NOTE(review): the keyword name `texts` has to match the wrapper's
        # agenerate() signature. LangChain chat models call this parameter
        # `messages` - confirm against the wrapper actually passed in, since
        # a mismatch raises TypeError and silently forces the fallback.
        result = await llm.agenerate(texts=[[HumanMessage(content=prompt)]])
        text = result.generations[0][0].text.strip()
        logger.info("[advisor] LLM analysis complete chars=%d", len(text))
        return text
    except Exception as exc:
        # Deliberate best-effort: the advisor must never fail the run.
        logger.warning(
            "[advisor] LLM analysis failed (%s: %s) — falling back to rule report",
            type(exc).__name__, exc,
        )
        return ""