diff --git a/rag_eval/advisor/llm_analyzer.py b/rag_eval/advisor/llm_analyzer.py index 97ffc9f..99d9997 100644 --- a/rag_eval/advisor/llm_analyzer.py +++ b/rag_eval/advisor/llm_analyzer.py @@ -22,22 +22,31 @@ _PROMPT_TEMPLATE = """\ ## 报告要求 -1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改" -2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议 -3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先) -4. 语言简洁,面向工程师,不要废话,不要重复列表内容 +1. 按指标分节(## 指标名 [严重程度]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改" +2. 严重程度说明:critical=严重(<阈值50%),warning=警告(<阈值70%),low=待优化(低于0.85,有提升空间) +3. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议 +4. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先),critical 和 warning 项优先于 low 项 +5. 语言简洁,面向工程师,不要废话,不要重复列表内容 只输出 Markdown 报告正文,不要任何前置说明。 """ +_SEVERITY_LABEL_ZH: dict[str, str] = { + "critical": "严重", + "warning": "警告", + "low": "待优化", +} + + def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str: lines = [] for d in diagnoses: direction = "(越低越好)" if d.metric == "noise_sensitivity" else "" + label = _SEVERITY_LABEL_ZH.get(d.severity, d.severity) lines.append( f"- **{d.metric}** {direction} 均值={d.mean_score:.4f}," - f"阈值={d.threshold},严重程度={d.severity}" + f"阈值={d.threshold},严重程度={label}" ) lines.append(f" - 可能原因:{'; '.join(d.root_causes)}") lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}") diff --git a/rag_eval/advisor/rules.py b/rag_eval/advisor/rules.py index 8de7dc1..eee1829 100644 --- a/rag_eval/advisor/rules.py +++ b/rag_eval/advisor/rules.py @@ -14,6 +14,9 @@ class MetricRule: higher_is_better: bool # False for noise_sensitivity root_causes: list[str] suggested_actions: list[str] + # Scores below this threshold trigger a "low" advisory (LLM suggestion requested). + # Only applies to higher_is_better metrics; noise_sensitivity uses existing thresholds. 
+ advisory_threshold: float = 0.85 METRIC_RULES: dict[str, MetricRule] = { @@ -208,10 +211,14 @@ def diagnose( elif mean < rule.warning_threshold: severity = "warning" threshold = rule.warning_threshold + elif mean < rule.advisory_threshold: + # Score is acceptable but below the rule's advisory threshold (default 0.85) — request LLM optimization advice. + severity = "low" + threshold = rule.advisory_threshold else: - continue # above warning threshold → no diagnosis + continue # >= advisory_threshold → no diagnosis needed else: - # lower is better (noise_sensitivity) + # lower is better (noise_sensitivity): keep existing two-tier logic if mean > rule.critical_threshold: severity = "critical" threshold = rule.critical_threshold diff --git a/rag_eval/advisor/writer.py b/rag_eval/advisor/writer.py index e46c919..b60e6dd 100644 --- a/rag_eval/advisor/writer.py +++ b/rag_eval/advisor/writer.py @@ -8,12 +8,22 @@ from .rules import Diagnosis logger = logging.getLogger("rag_eval.advisor") +# Chinese display labels for each severity tier. 
+_SEVERITY_LABEL: dict[str, str] = { + "critical": "严重", + "warning": "警告", + "low": "待优化", +} + def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str: """Return a single-line log summary of triggered diagnoses.""" if not diagnoses: return "[advisor] 所有指标正常,无需优化建议。" - parts = [f"{d.metric}({d.mean_score:.2f}, {d.severity})" for d in diagnoses] + parts = [ + f"{d.metric}({d.mean_score:.2f},{_SEVERITY_LABEL.get(d.severity, d.severity)})" + for d in diagnoses + ] triggered = " ".join(parts) return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered} → {advice_path}" @@ -24,7 +34,8 @@ def _build_fallback_report(diagnoses: list[Diagnosis]) -> str: return "" lines = ["## 规则诊断(LLM 分析不可用)\n"] for d in diagnoses: - lines.append(f"### {d.metric} [{d.severity}] 均值={d.mean_score:.4f}") + label = _SEVERITY_LABEL.get(d.severity, d.severity) + lines.append(f"### {d.metric} [{label}] 均值={d.mean_score:.4f}") lines.append("\n**可能原因:**") for cause in d.root_causes: lines.append(f"- {cause}") diff --git a/tests/test_advisor_rules.py b/tests/test_advisor_rules.py index 1aa86da..a94b66d 100644 --- a/tests/test_advisor_rules.py +++ b/tests/test_advisor_rules.py @@ -10,10 +10,38 @@ class TestDiagnosis(unittest.TestCase): for i, s in enumerate(scores)] def test_no_diagnosis_when_all_scores_above_threshold(self): + # Mean exactly 0.85 should NOT trigger any diagnosis (< 0.85 is the condition). 
rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85]) result = diagnose(rows, metrics=["faithfulness"]) self.assertEqual(result, []) + def test_no_diagnosis_when_mean_above_advisory_threshold(self): + rows = self._make_rows("answer_relevancy", [0.9, 0.92, 0.88]) + result = diagnose(rows, metrics=["answer_relevancy"]) + self.assertEqual(result, []) + + def test_low_severity_when_mean_below_advisory_threshold(self): + # Score between warning_threshold (0.7) and advisory_threshold (0.85) → "low" + rows = self._make_rows("faithfulness", [0.78, 0.80, 0.82]) + result = diagnose(rows, metrics=["faithfulness"]) + self.assertEqual(len(result), 1) + self.assertEqual(result[0].severity, "low") + self.assertAlmostEqual(result[0].threshold, 0.85, places=2) + + def test_low_severity_answer_relevancy_at_0_84(self): + rows = self._make_rows("answer_relevancy", [0.84, 0.84, 0.84]) + result = diagnose(rows, metrics=["answer_relevancy"]) + self.assertEqual(len(result), 1) + self.assertEqual(result[0].severity, "low") + + def test_low_severity_has_root_causes_and_actions(self): + rows = self._make_rows("context_precision", [0.75, 0.76, 0.77]) + result = diagnose(rows, metrics=["context_precision"]) + self.assertEqual(len(result), 1) + self.assertEqual(result[0].severity, "low") + self.assertTrue(len(result[0].root_causes) > 0) + self.assertTrue(len(result[0].suggested_actions) > 0) + def test_warning_when_mean_below_warning_threshold(self): rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68]) result = diagnose(rows, metrics=["faithfulness"]) diff --git a/tests/test_advisor_writer.py b/tests/test_advisor_writer.py index e2dd190..ea11a12 100644 --- a/tests/test_advisor_writer.py +++ b/tests/test_advisor_writer.py @@ -91,9 +91,9 @@ class TestWriteAdvice(unittest.TestCase): ] summary = _format_log_summary(diags, self.advice_path) self.assertIn("faithfulness", summary) - self.assertIn("critical", summary) + self.assertIn("严重", summary) # "critical" maps to Chinese label 
self.assertIn("context_recall", summary) - self.assertIn("warning", summary) + self.assertIn("警告", summary) # "warning" maps to Chinese label def test_write_empty_diagnoses_still_creates_file(self): write_advice(