test: ensure test package structure and all webapp tests pass
This commit is contained in:
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
72
tests/test_advisor_rules.py
Normal file
72
tests/test_advisor_rules.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
import math
|
||||||
|
import unittest
|
||||||
|
from rag_eval.advisor.rules import Diagnosis, diagnose, METRIC_RULES
|
||||||
|
|
||||||
|
|
||||||
|
class TestDiagnosis(unittest.TestCase):
|
||||||
|
def _make_rows(self, metric: str, scores: list[float]) -> list[dict]:
|
||||||
|
return [{metric: s, "question": f"q{i}", "answer": f"a{i}",
|
||||||
|
"ground_truth": f"gt{i}", "sample_id": f"s{i}"}
|
||||||
|
for i, s in enumerate(scores)]
|
||||||
|
|
||||||
|
def test_no_diagnosis_when_all_scores_above_threshold(self):
|
||||||
|
rows = self._make_rows("faithfulness", [0.8, 0.9, 0.85])
|
||||||
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
|
self.assertEqual(result, [])
|
||||||
|
|
||||||
|
def test_warning_when_mean_below_warning_threshold(self):
|
||||||
|
rows = self._make_rows("faithfulness", [0.65, 0.62, 0.68])
|
||||||
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0].metric, "faithfulness")
|
||||||
|
self.assertEqual(result[0].severity, "warning")
|
||||||
|
self.assertAlmostEqual(result[0].mean_score, 0.65, places=2)
|
||||||
|
|
||||||
|
def test_critical_when_mean_below_critical_threshold(self):
|
||||||
|
rows = self._make_rows("faithfulness", [0.3, 0.4, 0.45])
|
||||||
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
|
self.assertEqual(result[0].severity, "critical")
|
||||||
|
|
||||||
|
def test_low_samples_selected_are_bottom_three(self):
|
||||||
|
rows = self._make_rows("faithfulness", [0.1, 0.2, 0.3, 0.8, 0.9])
|
||||||
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
|
self.assertEqual(len(result[0].low_samples), 3)
|
||||||
|
scores = [s["faithfulness"] for s in result[0].low_samples]
|
||||||
|
self.assertEqual(sorted(scores), [0.1, 0.2, 0.3])
|
||||||
|
|
||||||
|
def test_nan_scores_excluded_from_mean_and_low_samples(self):
|
||||||
|
rows = self._make_rows("faithfulness", [0.3, float("nan"), 0.4])
|
||||||
|
result = diagnose(rows, metrics=["faithfulness"])
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
for s in result[0].low_samples:
|
||||||
|
self.assertFalse(math.isnan(s["faithfulness"]))
|
||||||
|
|
||||||
|
def test_noise_sensitivity_direction_inverted(self):
|
||||||
|
# noise_sensitivity: higher is worse; threshold > 0.3 is warning
|
||||||
|
rows = self._make_rows("noise_sensitivity", [0.4, 0.45, 0.5])
|
||||||
|
result = diagnose(rows, metrics=["noise_sensitivity"])
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0].metric, "noise_sensitivity")
|
||||||
|
|
||||||
|
def test_noise_sensitivity_no_diagnosis_when_low(self):
|
||||||
|
rows = self._make_rows("noise_sensitivity", [0.1, 0.15, 0.2])
|
||||||
|
result = diagnose(rows, metrics=["noise_sensitivity"])
|
||||||
|
self.assertEqual(result, [])
|
||||||
|
|
||||||
|
def test_skips_metric_not_in_rows(self):
|
||||||
|
rows = [{"faithfulness": 0.3, "question": "q", "answer": "a",
|
||||||
|
"ground_truth": "gt", "sample_id": "s1"}]
|
||||||
|
result = diagnose(rows, metrics=["faithfulness", "context_recall"])
|
||||||
|
metrics_found = [d.metric for d in result]
|
||||||
|
self.assertIn("faithfulness", metrics_found)
|
||||||
|
self.assertNotIn("context_recall", metrics_found)
|
||||||
|
|
||||||
|
def test_all_seven_metrics_have_rules(self):
|
||||||
|
expected = {"faithfulness", "answer_relevancy", "context_recall",
|
||||||
|
"context_precision", "noise_sensitivity",
|
||||||
|
"factual_correctness", "semantic_similarity"}
|
||||||
|
self.assertEqual(set(METRIC_RULES.keys()), expected)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
0
tests/webapp/__init__.py
Normal file
0
tests/webapp/__init__.py
Normal file
Reference in New Issue
Block a user