Files
siemens_ragas/rag_eval/advisor/llm_analyzer.py

110 lines
3.9 KiB
Python
Raw Permalink Normal View History

"""LLM-powered analysis of rule diagnostics and low-score samples."""
from __future__ import annotations
import logging
from typing import Any
from .rules import Diagnosis
logger = logging.getLogger("rag_eval.advisor")
_PROMPT_TEMPLATE = """\
你是一个 RAG 系统优化专家正在分析西门子医疗 CT 文档问答系统的评测结果
请用中文撰写一份优化建议报告格式为 Markdown
## 评测诊断摘要
{diagnosis_summary}
## 低分样本示例
{low_sample_text}
## 报告要求
1. 按指标分节## 指标名 [严重程度]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
2. 严重程度说明critical=严重<阈值50%warning=警告<阈值70%low=待优化低于0.85有提升空间
3. "具体怎么改"要结合低分样本的实际内容而不只是泛泛建议
4. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先critical 和 warning 项优先于 low 项
5. 语言简洁面向工程师不要废话不要重复列表内容
只输出 Markdown 报告正文不要任何前置说明
"""
_SEVERITY_LABEL_ZH: dict[str, str] = {
"critical": "严重",
"warning": "警告",
"low": "待优化",
}
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
lines = []
for d in diagnoses:
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
label = _SEVERITY_LABEL_ZH.get(d.severity, d.severity)
lines.append(
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f}"
f"阈值={d.threshold},严重程度={label}"
)
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
return "\n".join(lines)
def _build_low_sample_text(diagnoses: list[Diagnosis]) -> str:
lines = []
for d in diagnoses:
if not d.low_samples:
continue
lines.append(f"### {d.metric} 低分样本(最多 3 条)")
for i, s in enumerate(d.low_samples, 1):
score = s.get(d.metric, "N/A")
lines.append(f"\n**样本 {i}**(分数={score}")
lines.append(f"- 问题:{s.get('question', '')}")
lines.append(f"- 回答:{s.get('answer', '')[:300]}")
lines.append(f"- 标准答案:{s.get('ground_truth', '')[:200]}")
return "\n".join(lines)
async def analyze(
diagnoses: list[Diagnosis],
llm: Any,
scenario_name: str,
) -> str:
"""Call the judge LLM to generate a Chinese optimization report.
Args:
diagnoses: Non-empty list of Diagnosis from rules.diagnose().
llm: RAGAS LLM wrapper (has .agenerate() method).
scenario_name: Used only for logging.
Returns:
LLM-generated Markdown string, or "" on failure (triggers writer fallback).
"""
if not diagnoses:
return ""
diagnosis_summary = _build_diagnosis_summary(diagnoses)
low_sample_text = _build_low_sample_text(diagnoses)
prompt = _PROMPT_TEMPLATE.format(
diagnosis_summary=diagnosis_summary,
low_sample_text=low_sample_text,
)
try:
logger.info("[advisor] calling LLM for optimization analysis scenario=%s", scenario_name)
from langchain_core.messages import HumanMessage
# Use the underlying langchain chat model directly (RAGAS LangchainLLMWrapper wraps BaseChatModel)
response = await llm.langchain_llm.ainvoke([HumanMessage(content=prompt)])
text = response.content.strip()
logger.info("[advisor] LLM analysis complete chars=%d", len(text))
return text
except Exception as exc:
logger.warning(
"[advisor] LLM analysis failed (%s: %s) — falling back to rule report",
type(exc).__name__, exc,
)
return ""