feat(advisor): add 0.85 advisory threshold triggering LLM suggestions

- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for scores in (warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-25 11:35:49 +08:00
parent 4fd515d2d9
commit e1751447df
5 changed files with 66 additions and 11 deletions
--- a/tests/test_advisor_rules.py
+++ b/tests/test_advisor_rules.py
@@ -10,10 +10,38 @@ class TestDiagnosis(unittest.TestCase):
                for i, s in enumerate(scores)]

    def test_no_diagnosis_when_all_scores_above_threshold(self):
+        # A mean of exactly 0.85 must NOT trigger a diagnosis: the advisory check is strict (mean < 0.85).
        rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
        result = diagnose(rows, metrics=["faithfulness"])
        self.assertEqual(result, [])

+    def test_no_diagnosis_when_mean_above_advisory_threshold(self):
+        rows = self._make_rows("answer_relevancy", [0.9, 0.92, 0.88])
+        result = diagnose(rows, metrics=["answer_relevancy"])
+        self.assertEqual(result, [])
+
+    def test_low_severity_when_mean_below_advisory_threshold(self):
+        # Score between warning_threshold (0.7) and advisory_threshold (0.85) → "low"
+        rows = self._make_rows("faithfulness", [0.78, 0.80, 0.82])
+        result = diagnose(rows, metrics=["faithfulness"])
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0].severity, "low")
+        self.assertAlmostEqual(result[0].threshold, 0.85, places=2)
+
+    def test_low_severity_answer_relevancy_at_0_84(self):
+        rows = self._make_rows("answer_relevancy", [0.84, 0.84, 0.84])
+        result = diagnose(rows, metrics=["answer_relevancy"])
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0].severity, "low")
+
+    def test_low_severity_has_root_causes_and_actions(self):
+        rows = self._make_rows("context_precision", [0.75, 0.76, 0.77])
+        result = diagnose(rows, metrics=["context_precision"])
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0].severity, "low")
+        self.assertTrue(len(result[0].root_causes) > 0)
+        self.assertTrue(len(result[0].suggested_actions) > 0)
+
    def test_warning_when_mean_below_warning_threshold(self):
        rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
        result = diagnose(rows, metrics=["faithfulness"])
--- a/tests/test_advisor_writer.py
+++ b/tests/test_advisor_writer.py
@@ -91,9 +91,9 @@ class TestWriteAdvice(unittest.TestCase):
        ]
        summary = _format_log_summary(diags, self.advice_path)
        self.assertIn("faithfulness", summary)
-        self.assertIn("critical", summary)
+        self.assertIn("严重", summary)   # "critical" maps to Chinese label
        self.assertIn("context_recall", summary)
-        self.assertIn("warning", summary)
+        self.assertIn("警告", summary)   # "warning" maps to Chinese label

    def test_write_empty_diagnoses_still_creates_file(self):
        write_advice(