feat(advisor): add 0.85 advisory threshold triggering LLM suggestions
- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for mean scores in [warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -22,22 +22,31 @@ _PROMPT_TEMPLATE = """\
|
|||||||
|
|
||||||
## 报告要求
|
## 报告要求
|
||||||
|
|
||||||
1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
|
1. 按指标分节(## 指标名 [严重程度]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
|
||||||
2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
|
2. 严重程度说明:critical=严重(<阈值50%),warning=警告(<阈值70%),low=待优化(低于0.85,有提升空间)
|
||||||
3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先)
|
3. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
|
||||||
4. 语言简洁,面向工程师,不要废话,不要重复列表内容
|
4. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先),critical 和 warning 项优先于 low 项
|
||||||
|
5. 语言简洁,面向工程师,不要废话,不要重复列表内容
|
||||||
|
|
||||||
只输出 Markdown 报告正文,不要任何前置说明。
|
只输出 Markdown 报告正文,不要任何前置说明。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
_SEVERITY_LABEL_ZH: dict[str, str] = {
|
||||||
|
"critical": "严重",
|
||||||
|
"warning": "警告",
|
||||||
|
"low": "待优化",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
|
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
|
||||||
lines = []
|
lines = []
|
||||||
for d in diagnoses:
|
for d in diagnoses:
|
||||||
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
|
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
|
||||||
|
label = _SEVERITY_LABEL_ZH.get(d.severity, d.severity)
|
||||||
lines.append(
|
lines.append(
|
||||||
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f},"
|
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f},"
|
||||||
f"阈值={d.threshold},严重程度={d.severity}"
|
f"阈值={d.threshold},严重程度={label}"
|
||||||
)
|
)
|
||||||
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
|
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
|
||||||
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
|
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ class MetricRule:
|
|||||||
higher_is_better: bool # False for noise_sensitivity
|
higher_is_better: bool # False for noise_sensitivity
|
||||||
root_causes: list[str]
|
root_causes: list[str]
|
||||||
suggested_actions: list[str]
|
suggested_actions: list[str]
|
||||||
|
# Scores below this threshold trigger a "low" advisory (LLM suggestion requested).
|
||||||
|
# Only applies to higher_is_better metrics; noise_sensitivity uses existing thresholds.
|
||||||
|
advisory_threshold: float = 0.85
|
||||||
|
|
||||||
|
|
||||||
METRIC_RULES: dict[str, MetricRule] = {
|
METRIC_RULES: dict[str, MetricRule] = {
|
||||||
@@ -208,10 +211,14 @@ def diagnose(
|
|||||||
elif mean < rule.warning_threshold:
|
elif mean < rule.warning_threshold:
|
||||||
severity = "warning"
|
severity = "warning"
|
||||||
threshold = rule.warning_threshold
|
threshold = rule.warning_threshold
|
||||||
|
elif mean < rule.advisory_threshold:
|
||||||
|
# Score is acceptable but below the advisory threshold (default 0.85) — request LLM optimization advice.
|
||||||
|
severity = "low"
|
||||||
|
threshold = rule.advisory_threshold
|
||||||
else:
|
else:
|
||||||
continue # above warning threshold → no diagnosis
|
continue # >= advisory_threshold → no diagnosis needed
|
||||||
else:
|
else:
|
||||||
# lower is better (noise_sensitivity)
|
# lower is better (noise_sensitivity): keep existing two-tier logic
|
||||||
if mean > rule.critical_threshold:
|
if mean > rule.critical_threshold:
|
||||||
severity = "critical"
|
severity = "critical"
|
||||||
threshold = rule.critical_threshold
|
threshold = rule.critical_threshold
|
||||||
|
|||||||
@@ -8,12 +8,22 @@ from .rules import Diagnosis
|
|||||||
|
|
||||||
logger = logging.getLogger("rag_eval.advisor")
|
logger = logging.getLogger("rag_eval.advisor")
|
||||||
|
|
||||||
|
# Chinese display labels for each severity tier.
|
||||||
|
_SEVERITY_LABEL: dict[str, str] = {
|
||||||
|
"critical": "严重",
|
||||||
|
"warning": "警告",
|
||||||
|
"low": "待优化",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
|
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
|
||||||
"""Return a single-line log summary of triggered diagnoses."""
|
"""Return a single-line log summary of triggered diagnoses."""
|
||||||
if not diagnoses:
|
if not diagnoses:
|
||||||
return "[advisor] 所有指标正常,无需优化建议。"
|
return "[advisor] 所有指标正常,无需优化建议。"
|
||||||
parts = [f"{d.metric}({d.mean_score:.2f}, {d.severity})" for d in diagnoses]
|
parts = [
|
||||||
|
f"{d.metric}({d.mean_score:.2f},{_SEVERITY_LABEL.get(d.severity, d.severity)})"
|
||||||
|
for d in diagnoses
|
||||||
|
]
|
||||||
triggered = " ".join(parts)
|
triggered = " ".join(parts)
|
||||||
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered} → {advice_path}"
|
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered} → {advice_path}"
|
||||||
|
|
||||||
@@ -24,7 +34,8 @@ def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
|
|||||||
return ""
|
return ""
|
||||||
lines = ["## 规则诊断(LLM 分析不可用)\n"]
|
lines = ["## 规则诊断(LLM 分析不可用)\n"]
|
||||||
for d in diagnoses:
|
for d in diagnoses:
|
||||||
lines.append(f"### {d.metric} [{d.severity}] 均值={d.mean_score:.4f}")
|
label = _SEVERITY_LABEL.get(d.severity, d.severity)
|
||||||
|
lines.append(f"### {d.metric} [{label}] 均值={d.mean_score:.4f}")
|
||||||
lines.append("\n**可能原因:**")
|
lines.append("\n**可能原因:**")
|
||||||
for cause in d.root_causes:
|
for cause in d.root_causes:
|
||||||
lines.append(f"- {cause}")
|
lines.append(f"- {cause}")
|
||||||
|
|||||||
@@ -10,10 +10,38 @@ class TestDiagnosis(unittest.TestCase):
|
|||||||
for i, s in enumerate(scores)]
|
for i, s in enumerate(scores)]
|
||||||
|
|
||||||
def test_no_diagnosis_when_all_scores_above_threshold(self):
|
def test_no_diagnosis_when_all_scores_above_threshold(self):
|
||||||
|
# Mean exactly 0.85 should NOT trigger any diagnosis (< 0.85 is the condition).
|
||||||
rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
|
rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
|
||||||
result = diagnose(rows, metrics=["faithfulness"])
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
self.assertEqual(result, [])
|
self.assertEqual(result, [])
|
||||||
|
|
||||||
|
def test_no_diagnosis_when_mean_above_advisory_threshold(self):
|
||||||
|
rows = self._make_rows("answer_relevancy", [0.9, 0.92, 0.88])
|
||||||
|
result = diagnose(rows, metrics=["answer_relevancy"])
|
||||||
|
self.assertEqual(result, [])
|
||||||
|
|
||||||
|
def test_low_severity_when_mean_below_advisory_threshold(self):
|
||||||
|
# Score between warning_threshold (0.7) and advisory_threshold (0.85) → "low"
|
||||||
|
rows = self._make_rows("faithfulness", [0.78, 0.80, 0.82])
|
||||||
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0].severity, "low")
|
||||||
|
self.assertAlmostEqual(result[0].threshold, 0.85, places=2)
|
||||||
|
|
||||||
|
def test_low_severity_answer_relevancy_at_0_84(self):
|
||||||
|
rows = self._make_rows("answer_relevancy", [0.84, 0.84, 0.84])
|
||||||
|
result = diagnose(rows, metrics=["answer_relevancy"])
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0].severity, "low")
|
||||||
|
|
||||||
|
def test_low_severity_has_root_causes_and_actions(self):
|
||||||
|
rows = self._make_rows("context_precision", [0.75, 0.76, 0.77])
|
||||||
|
result = diagnose(rows, metrics=["context_precision"])
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0].severity, "low")
|
||||||
|
self.assertTrue(len(result[0].root_causes) > 0)
|
||||||
|
self.assertTrue(len(result[0].suggested_actions) > 0)
|
||||||
|
|
||||||
def test_warning_when_mean_below_warning_threshold(self):
|
def test_warning_when_mean_below_warning_threshold(self):
|
||||||
rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
|
rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
|
||||||
result = diagnose(rows, metrics=["faithfulness"])
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
|
|||||||
@@ -91,9 +91,9 @@ class TestWriteAdvice(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
summary = _format_log_summary(diags, self.advice_path)
|
summary = _format_log_summary(diags, self.advice_path)
|
||||||
self.assertIn("faithfulness", summary)
|
self.assertIn("faithfulness", summary)
|
||||||
self.assertIn("critical", summary)
|
self.assertIn("严重", summary) # "critical" maps to Chinese label
|
||||||
self.assertIn("context_recall", summary)
|
self.assertIn("context_recall", summary)
|
||||||
self.assertIn("warning", summary)
|
self.assertIn("警告", summary) # "warning" maps to Chinese label
|
||||||
|
|
||||||
def test_write_empty_diagnoses_still_creates_file(self):
|
def test_write_empty_diagnoses_still_creates_file(self):
|
||||||
write_advice(
|
write_advice(
|
||||||
|
|||||||
Reference in New Issue
Block a user