feat(advisor): add 0.85 advisory threshold triggering LLM suggestions

- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for scores in (warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-06-25 11:35:49 +08:00
parent 4fd515d2d9
commit e1751447df
5 changed files with 66 additions and 11 deletions

View File

@@ -10,10 +10,38 @@ class TestDiagnosis(unittest.TestCase):
for i, s in enumerate(scores)]
def test_no_diagnosis_when_all_scores_above_threshold(self):
    """A mean sitting exactly on 0.85 must not produce any diagnosis."""
    # The three scores average to 0.85. Only a mean strictly below 0.85
    # is supposed to be flagged, so the boundary value itself stays clean.
    metric = "faithfulness"
    sample_rows = self._make_rows(metric, [0.8, 0.9, 0.85])
    self.assertEqual(diagnose(sample_rows, metrics=[metric]), [])
def test_no_diagnosis_when_mean_above_advisory_threshold(self):
    """Scores whose mean is above 0.85 yield an empty diagnosis list."""
    metric = "answer_relevancy"
    findings = diagnose(
        self._make_rows(metric, [0.9, 0.92, 0.88]),
        metrics=[metric],
    )
    self.assertEqual(findings, [])
def test_low_severity_when_mean_below_advisory_threshold(self):
    """A mean between the warning and advisory thresholds is rated "low"."""
    # The mean lies between warning_threshold (0.7) and
    # advisory_threshold (0.85), so exactly one "low" diagnosis is expected
    # and it should report the advisory threshold it was measured against.
    metric = "faithfulness"
    findings = diagnose(
        self._make_rows(metric, [0.78, 0.80, 0.82]),
        metrics=[metric],
    )
    self.assertEqual(len(findings), 1)
    finding = findings[0]
    self.assertEqual(finding.severity, "low")
    self.assertAlmostEqual(finding.threshold, 0.85, places=2)
def test_low_severity_answer_relevancy_at_0_84(self):
    """A uniform answer_relevancy score of 0.84 is reported as "low"."""
    metric = "answer_relevancy"
    uniform_scores = [0.84, 0.84, 0.84]
    findings = diagnose(
        self._make_rows(metric, uniform_scores),
        metrics=[metric],
    )
    self.assertEqual(len(findings), 1)
    self.assertEqual(findings[0].severity, "low")
def test_low_severity_has_root_causes_and_actions(self):
    """A "low" diagnosis still carries root causes and suggested actions."""
    metric = "context_precision"
    findings = diagnose(
        self._make_rows(metric, [0.75, 0.76, 0.77]),
        metrics=[metric],
    )
    self.assertEqual(len(findings), 1)
    finding = findings[0]
    self.assertEqual(finding.severity, "low")
    # Both advisory fields must be populated, not merely present.
    self.assertTrue(len(finding.root_causes) > 0)
    self.assertTrue(len(finding.suggested_actions) > 0)
def test_warning_when_mean_below_warning_threshold(self):
rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
result = diagnose(rows, metrics=["faithfulness"])

View File

@@ -91,9 +91,9 @@ class TestWriteAdvice(unittest.TestCase):
]
summary = _format_log_summary(diags, self.advice_path)
self.assertIn("faithfulness", summary)
self.assertIn("critical", summary)
self.assertIn("严重", summary) # "critical" maps to Chinese label
self.assertIn("context_recall", summary)
self.assertIn("warning", summary)
self.assertIn("警告", summary) # "warning" maps to Chinese label
def test_write_empty_diagnoses_still_creates_file(self):
write_advice(