feat(advisor): add 0.85 advisory threshold triggering LLM suggestions
- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for mean scores in [warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -22,22 +22,31 @@ _PROMPT_TEMPLATE = """\
|
||||
|
||||
## 报告要求
|
||||
|
||||
1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
|
||||
2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
|
||||
3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先)
|
||||
4. 语言简洁,面向工程师,不要废话,不要重复列表内容
|
||||
1. 按指标分节(## 指标名 [严重程度]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
|
||||
2. 严重程度说明:critical=严重(<阈值50%),warning=警告(<阈值70%),low=待优化(低于0.85,有提升空间)
|
||||
3. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
|
||||
4. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先),critical 和 warning 项优先于 low 项
|
||||
5. 语言简洁,面向工程师,不要废话,不要重复列表内容
|
||||
|
||||
只输出 Markdown 报告正文,不要任何前置说明。
|
||||
"""
|
||||
|
||||
|
||||
_SEVERITY_LABEL_ZH: dict[str, str] = {
|
||||
"critical": "严重",
|
||||
"warning": "警告",
|
||||
"low": "待优化",
|
||||
}
|
||||
|
||||
|
||||
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
|
||||
lines = []
|
||||
for d in diagnoses:
|
||||
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
|
||||
label = _SEVERITY_LABEL_ZH.get(d.severity, d.severity)
|
||||
lines.append(
|
||||
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f},"
|
||||
f"阈值={d.threshold},严重程度={d.severity}"
|
||||
f"阈值={d.threshold},严重程度={label}"
|
||||
)
|
||||
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
|
||||
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
|
||||
|
||||
@@ -14,6 +14,9 @@ class MetricRule:
|
||||
higher_is_better: bool # False for noise_sensitivity
|
||||
root_causes: list[str]
|
||||
suggested_actions: list[str]
|
||||
# Scores below this threshold trigger a "low" advisory (LLM suggestion requested).
|
||||
# Only applies to higher_is_better metrics; noise_sensitivity uses existing thresholds.
|
||||
advisory_threshold: float = 0.85
|
||||
|
||||
|
||||
METRIC_RULES: dict[str, MetricRule] = {
|
||||
@@ -208,10 +211,14 @@ def diagnose(
|
||||
elif mean < rule.warning_threshold:
|
||||
severity = "warning"
|
||||
threshold = rule.warning_threshold
|
||||
elif mean < rule.advisory_threshold:
|
||||
# Score is acceptable but below 0.85 — request LLM optimization advice.
|
||||
severity = "low"
|
||||
threshold = rule.advisory_threshold
|
||||
else:
|
||||
continue # above warning threshold → no diagnosis
|
||||
continue # >= advisory_threshold → no diagnosis needed
|
||||
else:
|
||||
# lower is better (noise_sensitivity)
|
||||
# lower is better (noise_sensitivity): keep existing two-tier logic
|
||||
if mean > rule.critical_threshold:
|
||||
severity = "critical"
|
||||
threshold = rule.critical_threshold
|
||||
|
||||
@@ -8,12 +8,22 @@ from .rules import Diagnosis
|
||||
|
||||
logger = logging.getLogger("rag_eval.advisor")
|
||||
|
||||
# Chinese display labels for each severity tier.
|
||||
_SEVERITY_LABEL: dict[str, str] = {
|
||||
"critical": "严重",
|
||||
"warning": "警告",
|
||||
"low": "待优化",
|
||||
}
|
||||
|
||||
|
||||
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
|
||||
"""Return a single-line log summary of triggered diagnoses."""
|
||||
if not diagnoses:
|
||||
return "[advisor] 所有指标正常,无需优化建议。"
|
||||
parts = [f"{d.metric}({d.mean_score:.2f}, {d.severity})" for d in diagnoses]
|
||||
parts = [
|
||||
f"{d.metric}({d.mean_score:.2f},{_SEVERITY_LABEL.get(d.severity, d.severity)})"
|
||||
for d in diagnoses
|
||||
]
|
||||
triggered = " ".join(parts)
|
||||
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered} → {advice_path}"
|
||||
|
||||
@@ -24,7 +34,8 @@ def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
|
||||
return ""
|
||||
lines = ["## 规则诊断(LLM 分析不可用)\n"]
|
||||
for d in diagnoses:
|
||||
lines.append(f"### {d.metric} [{d.severity}] 均值={d.mean_score:.4f}")
|
||||
label = _SEVERITY_LABEL.get(d.severity, d.severity)
|
||||
lines.append(f"### {d.metric} [{label}] 均值={d.mean_score:.4f}")
|
||||
lines.append("\n**可能原因:**")
|
||||
for cause in d.root_causes:
|
||||
lines.append(f"- {cause}")
|
||||
|
||||
@@ -10,10 +10,38 @@ class TestDiagnosis(unittest.TestCase):
|
||||
for i, s in enumerate(scores)]
|
||||
|
||||
def test_no_diagnosis_when_all_scores_above_threshold(self):
|
||||
# Mean exactly 0.85 should NOT trigger any diagnosis (< 0.85 is the condition).
|
||||
rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
|
||||
result = diagnose(rows, metrics=["faithfulness"])
|
||||
self.assertEqual(result, [])
|
||||
|
||||
def test_no_diagnosis_when_mean_above_advisory_threshold(self):
|
||||
rows = self._make_rows("answer_relevancy", [0.9, 0.92, 0.88])
|
||||
result = diagnose(rows, metrics=["answer_relevancy"])
|
||||
self.assertEqual(result, [])
|
||||
|
||||
def test_low_severity_when_mean_below_advisory_threshold(self):
|
||||
# Score between warning_threshold (0.7) and advisory_threshold (0.85) → "low"
|
||||
rows = self._make_rows("faithfulness", [0.78, 0.80, 0.82])
|
||||
result = diagnose(rows, metrics=["faithfulness"])
|
||||
self.assertEqual(len(result), 1)
|
||||
self.assertEqual(result[0].severity, "low")
|
||||
self.assertAlmostEqual(result[0].threshold, 0.85, places=2)
|
||||
|
||||
def test_low_severity_answer_relevancy_at_0_84(self):
|
||||
rows = self._make_rows("answer_relevancy", [0.84, 0.84, 0.84])
|
||||
result = diagnose(rows, metrics=["answer_relevancy"])
|
||||
self.assertEqual(len(result), 1)
|
||||
self.assertEqual(result[0].severity, "low")
|
||||
|
||||
def test_low_severity_has_root_causes_and_actions(self):
|
||||
rows = self._make_rows("context_precision", [0.75, 0.76, 0.77])
|
||||
result = diagnose(rows, metrics=["context_precision"])
|
||||
self.assertEqual(len(result), 1)
|
||||
self.assertEqual(result[0].severity, "low")
|
||||
self.assertTrue(len(result[0].root_causes) > 0)
|
||||
self.assertTrue(len(result[0].suggested_actions) > 0)
|
||||
|
||||
def test_warning_when_mean_below_warning_threshold(self):
|
||||
rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
|
||||
result = diagnose(rows, metrics=["faithfulness"])
|
||||
|
||||
@@ -91,9 +91,9 @@ class TestWriteAdvice(unittest.TestCase):
|
||||
]
|
||||
summary = _format_log_summary(diags, self.advice_path)
|
||||
self.assertIn("faithfulness", summary)
|
||||
self.assertIn("critical", summary)
|
||||
self.assertIn("严重", summary) # "critical" maps to Chinese label
|
||||
self.assertIn("context_recall", summary)
|
||||
self.assertIn("warning", summary)
|
||||
self.assertIn("警告", summary) # "warning" maps to Chinese label
|
||||
|
||||
def test_write_empty_diagnoses_still_creates_file(self):
|
||||
write_advice(
|
||||
|
||||
Reference in New Issue
Block a user