125 lines
5.0 KiB
Python
125 lines
5.0 KiB
Python
|
|
"""Unit tests for rag_eval/metrics/weights.py"""
|
||
|
|
import math
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from rag_eval.metrics.weights import (
|
||
|
|
compute_overall_weighted_score_mean,
|
||
|
|
compute_weighted_score,
|
||
|
|
resolve_weight,
|
||
|
|
weighted_metric_means,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
class TestResolveWeight:
    """Lookup behaviour of ``resolve_weight`` with and without a fallback."""

    def test_returns_value_when_key_present(self):
        """A key that exists in the mapping yields its stored weight."""
        table = {"faith": 0.5}
        looked_up = resolve_weight(table, "faith")
        assert looked_up == 0.5

    def test_returns_default_when_key_missing(self):
        """Without an entry or an explicit default, 1.0 is expected."""
        looked_up = resolve_weight({}, "faith")
        assert looked_up == 1.0

    def test_returns_custom_default_when_key_missing(self):
        """A caller-supplied ``default`` is returned for an absent key."""
        looked_up = resolve_weight({}, "faith", default=2.0)
        assert looked_up == 2.0

    def test_empty_dict_returns_default(self):
        """An empty mapping falls back to 1.0 for an arbitrary key."""
        no_weights = {}
        looked_up = resolve_weight(no_weights, "anything")
        assert looked_up == 1.0
|
||
|
|
|
||
|
|
|
||
|
|
class TestComputeWeightedScore:
    """Expectations for ``compute_weighted_score`` over one set of metric scores."""

    def test_equal_weights_is_simple_mean(self):
        """With no weights given, 0.8 and 0.6 are expected to average to 0.7."""
        per_metric = {"faithfulness": 0.8, "context_recall": 0.6}
        combined = compute_weighted_score(per_metric, {})
        expected = pytest.approx(0.7, rel=1e-4)
        assert combined == expected

    def test_explicit_weights(self):
        """A 3:1 weighting of scores 1.0 and 0.0 is expected to give 0.75."""
        per_metric = {"faithfulness": 1.0, "context_recall": 0.0}
        metric_weights = {"faithfulness": 3.0, "context_recall": 1.0}
        combined = compute_weighted_score(per_metric, metric_weights)
        expected = pytest.approx(0.75, rel=1e-4)
        assert combined == expected

    def test_nan_values_excluded(self):
        """A NaN score is left out, so only the 0.8 score remains."""
        per_metric = {"faithfulness": float("nan"), "context_recall": 0.8}
        combined = compute_weighted_score(per_metric, {})
        expected = pytest.approx(0.8, rel=1e-4)
        assert combined == expected

    def test_none_values_excluded(self):
        """A ``None`` score is left out, so only the 0.6 score remains."""
        per_metric = {"faithfulness": None, "context_recall": 0.6}
        combined = compute_weighted_score(per_metric, {})
        expected = pytest.approx(0.6, rel=1e-4)
        assert combined == expected

    def test_all_nan_returns_none(self):
        """When every score is NaN the result is expected to be ``None``."""
        per_metric = {"faithfulness": float("nan"), "context_recall": float("nan")}
        combined = compute_weighted_score(per_metric, {})
        assert combined is None

    def test_empty_scores_returns_none(self):
        """An empty score mapping is expected to produce ``None``."""
        combined = compute_weighted_score({}, {})
        assert combined is None

    def test_missing_metric_in_weights_uses_default_1(self):
        """A metric absent from the weights counts with weight 1.

        Expected value: (0.8 * 2.0 + 0.4 * 1.0) / 3.0 == 2.0 / 3.
        """
        per_metric = {"faithfulness": 0.8, "context_recall": 0.4}
        metric_weights = {"faithfulness": 2.0}
        combined = compute_weighted_score(per_metric, metric_weights)
        expected = pytest.approx(2.0 / 3, rel=1e-4)
        assert combined == expected
|
||
|
|
|
||
|
|
|
||
|
|
class TestWeightedMetricMeans:
    """Expectations for ``weighted_metric_means`` across per-document rows."""

    def _rows(self):
        """Return the two-document fixture shared by several tests."""
        first = {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.5}
        second = {"doc_name": "b.pdf", "faithfulness": 0.6, "context_recall": 0.8}
        return [first, second]

    def test_equal_weights_gives_arithmetic_mean(self):
        """With no document weights, each metric is a plain average of the rows."""
        samples = self._rows()
        metric_names = ["faithfulness", "context_recall"]
        means = weighted_metric_means(samples, metric_names, {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.65, rel=1e-4)

    def test_doc_weight_amplifies_contribution(self):
        """Weighting a.pdf 3:1 over b.pdf is expected to give (3*1.0 + 0.6) / 4 == 0.9."""
        samples = self._rows()
        per_doc = {"a.pdf": 3.0, "b.pdf": 1.0}
        means = weighted_metric_means(samples, ["faithfulness"], per_doc)
        assert means["faithfulness"] == pytest.approx(0.9, rel=1e-4)

    def test_nan_rows_skipped_per_metric(self):
        """A NaN in one metric drops that row for that metric only."""
        samples = [
            {"doc_name": "a.pdf", "faithfulness": float("nan"), "context_recall": 0.5},
            {"doc_name": "b.pdf", "faithfulness": 0.8, "context_recall": 0.9},
        ]
        metric_names = ["faithfulness", "context_recall"]
        means = weighted_metric_means(samples, metric_names, {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["context_recall"] == pytest.approx(0.7, rel=1e-4)

    def test_missing_metric_column_returns_none(self):
        """A requested metric that no row carries is expected to map to ``None``."""
        samples = [{"doc_name": "a.pdf", "faithfulness": 0.8}]
        metric_names = ["faithfulness", "unknown_metric"]
        means = weighted_metric_means(samples, metric_names, {})
        assert means["faithfulness"] == pytest.approx(0.8, rel=1e-4)
        assert means["unknown_metric"] is None

    def test_empty_rows_returns_none_for_all(self):
        """With no rows at all, the requested metric is expected to be ``None``."""
        means = weighted_metric_means([], ["faithfulness"], {})
        assert means["faithfulness"] is None
|
||
|
|
|
||
|
|
|
||
|
|
class TestComputeOverallWeightedScoreMean:
    """Expectations for ``compute_overall_weighted_score_mean`` over rows."""

    def test_basic_weighted_mean_of_weighted_scores(self):
        """Two rows whose equally weighted scores are both 0.5 are expected to give 0.5."""
        samples = [
            {"doc_name": "a.pdf", "faithfulness": 1.0, "context_recall": 0.0},
            {"doc_name": "b.pdf", "faithfulness": 0.5, "context_recall": 0.5},
        ]
        per_metric = {"faithfulness": 1.0, "context_recall": 1.0}
        overall = compute_overall_weighted_score_mean(samples, per_metric, {})
        expected = pytest.approx(0.5, rel=1e-4)
        assert overall == expected

    def test_doc_weight_amplifies_sample(self):
        """A 9:1 document weighting of scores 1.0 and 0.0 is expected to give 0.9."""
        samples = [
            {"doc_name": "important.pdf", "faithfulness": 1.0},
            {"doc_name": "other.pdf", "faithfulness": 0.0},
        ]
        per_doc = {"important.pdf": 9.0, "other.pdf": 1.0}
        overall = compute_overall_weighted_score_mean(samples, {}, per_doc)
        expected = pytest.approx(0.9, rel=1e-4)
        assert overall == expected

    def test_all_nan_returns_none(self):
        """When the only score present is NaN the result is expected to be ``None``."""
        samples = [{"doc_name": "a.pdf", "faithfulness": float("nan")}]
        overall = compute_overall_weighted_score_mean(samples, {}, {})
        assert overall is None
|