feat(advisor): add 0.85 advisory threshold triggering LLM suggestions

- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for scores in [warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-06-25 11:35:49 +08:00
parent 4fd515d2d9
commit e1751447df
5 changed files with 66 additions and 11 deletions

View File

@@ -22,22 +22,31 @@ _PROMPT_TEMPLATE = """\
## 报告要求 ## 报告要求
1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改" 1. 按指标分节(## 指标名 [严重程度]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议 2. 严重程度说明:critical=严重(<阈值50%),warning=警告(<阈值70%),low=待优化(低于0.85,有提升空间)
3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先) 3. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
4. 语言简洁,面向工程师,不要废话,不要重复列表内容 4. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先;critical 和 warning 项优先于 low 项)
5. 语言简洁,面向工程师,不要废话,不要重复列表内容
只输出 Markdown 报告正文,不要任何前置说明。 只输出 Markdown 报告正文,不要任何前置说明。
""" """
_SEVERITY_LABEL_ZH: dict[str, str] = {
"critical": "严重",
"warning": "警告",
"low": "待优化",
}
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str: def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
lines = [] lines = []
for d in diagnoses: for d in diagnoses:
direction = "(越低越好)" if d.metric == "noise_sensitivity" else "" direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
label = _SEVERITY_LABEL_ZH.get(d.severity, d.severity)
lines.append( lines.append(
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f}" f"- **{d.metric}** {direction} 均值={d.mean_score:.4f}"
f"阈值={d.threshold},严重程度={d.severity}" f"阈值={d.threshold},严重程度={label}"
) )
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}") lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}") lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")

View File

@@ -14,6 +14,9 @@ class MetricRule:
higher_is_better: bool # False for noise_sensitivity higher_is_better: bool # False for noise_sensitivity
root_causes: list[str] root_causes: list[str]
suggested_actions: list[str] suggested_actions: list[str]
# Scores below this threshold trigger a "low" advisory (LLM suggestion requested).
# Only applies to higher_is_better metrics; noise_sensitivity uses existing thresholds.
advisory_threshold: float = 0.85
METRIC_RULES: dict[str, MetricRule] = { METRIC_RULES: dict[str, MetricRule] = {
@@ -208,10 +211,14 @@ def diagnose(
elif mean < rule.warning_threshold: elif mean < rule.warning_threshold:
severity = "warning" severity = "warning"
threshold = rule.warning_threshold threshold = rule.warning_threshold
elif mean < rule.advisory_threshold:
# Score is acceptable but below 0.85 — request LLM optimization advice.
severity = "low"
threshold = rule.advisory_threshold
else: else:
continue # above warning threshold → no diagnosis continue # >= advisory_threshold → no diagnosis needed
else: else:
# lower is better (noise_sensitivity) # lower is better (noise_sensitivity): keep existing two-tier logic
if mean > rule.critical_threshold: if mean > rule.critical_threshold:
severity = "critical" severity = "critical"
threshold = rule.critical_threshold threshold = rule.critical_threshold

View File

@@ -8,12 +8,22 @@ from .rules import Diagnosis
logger = logging.getLogger("rag_eval.advisor") logger = logging.getLogger("rag_eval.advisor")
# Chinese display labels for each severity tier.
_SEVERITY_LABEL: dict[str, str] = {
"critical": "严重",
"warning": "警告",
"low": "待优化",
}
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str: def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
"""Return a single-line log summary of triggered diagnoses.""" """Return a single-line log summary of triggered diagnoses."""
if not diagnoses: if not diagnoses:
return "[advisor] 所有指标正常,无需优化建议。" return "[advisor] 所有指标正常,无需优化建议。"
parts = [f"{d.metric}({d.mean_score:.2f}, {d.severity})" for d in diagnoses] parts = [
f"{d.metric}({d.mean_score:.2f},{_SEVERITY_LABEL.get(d.severity, d.severity)})"
for d in diagnoses
]
triggered = " ".join(parts) triggered = " ".join(parts)
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered}{advice_path}" return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered}{advice_path}"
@@ -24,7 +34,8 @@ def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
return "" return ""
lines = ["## 规则诊断(LLM 分析不可用)\n"] lines = ["## 规则诊断(LLM 分析不可用)\n"]
for d in diagnoses: for d in diagnoses:
lines.append(f"### {d.metric} [{d.severity}] 均值={d.mean_score:.4f}") label = _SEVERITY_LABEL.get(d.severity, d.severity)
lines.append(f"### {d.metric} [{label}] 均值={d.mean_score:.4f}")
lines.append("\n**可能原因:**") lines.append("\n**可能原因:**")
for cause in d.root_causes: for cause in d.root_causes:
lines.append(f"- {cause}") lines.append(f"- {cause}")

View File

@@ -10,10 +10,38 @@ class TestDiagnosis(unittest.TestCase):
for i, s in enumerate(scores)] for i, s in enumerate(scores)]
def test_no_diagnosis_when_all_scores_above_threshold(self): def test_no_diagnosis_when_all_scores_above_threshold(self):
# Mean exactly 0.85 should NOT trigger any diagnosis (< 0.85 is the condition).
rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85]) rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
result = diagnose(rows, metrics=["faithfulness"]) result = diagnose(rows, metrics=["faithfulness"])
self.assertEqual(result, []) self.assertEqual(result, [])
def test_no_diagnosis_when_mean_above_advisory_threshold(self):
rows = self._make_rows("answer_relevancy", [0.9, 0.92, 0.88])
result = diagnose(rows, metrics=["answer_relevancy"])
self.assertEqual(result, [])
def test_low_severity_when_mean_below_advisory_threshold(self):
# Score between warning_threshold (0.7) and advisory_threshold (0.85) → "low"
rows = self._make_rows("faithfulness", [0.78, 0.80, 0.82])
result = diagnose(rows, metrics=["faithfulness"])
self.assertEqual(len(result), 1)
self.assertEqual(result[0].severity, "low")
self.assertAlmostEqual(result[0].threshold, 0.85, places=2)
def test_low_severity_answer_relevancy_at_0_84(self):
rows = self._make_rows("answer_relevancy", [0.84, 0.84, 0.84])
result = diagnose(rows, metrics=["answer_relevancy"])
self.assertEqual(len(result), 1)
self.assertEqual(result[0].severity, "low")
def test_low_severity_has_root_causes_and_actions(self):
rows = self._make_rows("context_precision", [0.75, 0.76, 0.77])
result = diagnose(rows, metrics=["context_precision"])
self.assertEqual(len(result), 1)
self.assertEqual(result[0].severity, "low")
self.assertTrue(len(result[0].root_causes) > 0)
self.assertTrue(len(result[0].suggested_actions) > 0)
def test_warning_when_mean_below_warning_threshold(self): def test_warning_when_mean_below_warning_threshold(self):
rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68]) rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
result = diagnose(rows, metrics=["faithfulness"]) result = diagnose(rows, metrics=["faithfulness"])

View File

@@ -91,9 +91,9 @@ class TestWriteAdvice(unittest.TestCase):
] ]
summary = _format_log_summary(diags, self.advice_path) summary = _format_log_summary(diags, self.advice_path)
self.assertIn("faithfulness", summary) self.assertIn("faithfulness", summary)
self.assertIn("critical", summary) self.assertIn("严重", summary) # "critical" maps to Chinese label
self.assertIn("context_recall", summary) self.assertIn("context_recall", summary)
self.assertIn("warning", summary) self.assertIn("警告", summary) # "warning" maps to Chinese label
def test_write_empty_diagnoses_still_creates_file(self): def test_write_empty_diagnoses_still_creates_file(self):
write_advice( write_advice(