"""LLM-powered analysis of rule diagnostics and low-score samples.""" from __future__ import annotations import logging from typing import Any from .rules import Diagnosis logger = logging.getLogger("rag_eval.advisor") _PROMPT_TEMPLATE = """\ 你是一个 RAG 系统优化专家,正在分析西门子医疗 CT 文档问答系统的评测结果。 请用中文撰写一份优化建议报告,格式为 Markdown。 ## 评测诊断摘要 {diagnosis_summary} ## 低分样本示例 {low_sample_text} ## 报告要求 1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改" 2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议 3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先) 4. 语言简洁,面向工程师,不要废话,不要重复列表内容 只输出 Markdown 报告正文,不要任何前置说明。 """ def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str: lines = [] for d in diagnoses: direction = "(越低越好)" if d.metric == "noise_sensitivity" else "" lines.append( f"- **{d.metric}** {direction} 均值={d.mean_score:.4f}," f"阈值={d.threshold},严重程度={d.severity}" ) lines.append(f" - 可能原因:{'; '.join(d.root_causes)}") lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}") return "\n".join(lines) def _build_low_sample_text(diagnoses: list[Diagnosis]) -> str: lines = [] for d in diagnoses: if not d.low_samples: continue lines.append(f"### {d.metric} 低分样本(最多 3 条)") for i, s in enumerate(d.low_samples, 1): score = s.get(d.metric, "N/A") lines.append(f"\n**样本 {i}**(分数={score})") lines.append(f"- 问题:{s.get('question', '')}") lines.append(f"- 回答:{s.get('answer', '')[:300]}") lines.append(f"- 标准答案:{s.get('ground_truth', '')[:200]}") return "\n".join(lines) async def analyze( diagnoses: list[Diagnosis], llm: Any, scenario_name: str, ) -> str: """Call the judge LLM to generate a Chinese optimization report. Args: diagnoses: Non-empty list of Diagnosis from rules.diagnose(). llm: RAGAS LLM wrapper (has .agenerate() method). scenario_name: Used only for logging. Returns: LLM-generated Markdown string, or "" on failure (triggers writer fallback). """ if not diagnoses: return "" diagnosis_summary = _build_diagnosis_summary(diagnoses) low_sample_text = _build_low_sample_text(diagnoses) prompt = _PROMPT_TEMPLATE.format( diagnosis_summary=diagnosis_summary, low_sample_text=low_sample_text, ) try: logger.info("[advisor] calling LLM for optimization analysis scenario=%s", scenario_name) from langchain_core.messages import HumanMessage # Use the underlying langchain chat model directly (RAGAS LangchainLLMWrapper wraps BaseChatModel) response = await llm.langchain_llm.ainvoke([HumanMessage(content=prompt)]) text = response.content.strip() logger.info("[advisor] LLM analysis complete chars=%d", len(text)) return text except Exception as exc: logger.warning( "[advisor] LLM analysis failed (%s: %s) — falling back to rule report", type(exc).__name__, exc, ) return ""