update
This commit is contained in:
68
tests/test_metric_presenter.py
Normal file
68
tests/test_metric_presenter.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def _run_node(script: str) -> str:
    """Run an inline Node.js snippet from the repository root.

    The snippet is passed to ``node -e``. Because ``check=True`` is set, a
    non-zero exit status raises ``subprocess.CalledProcessError``.

    Args:
        script: JavaScript source to execute.

    Returns:
        The captured stdout, decoded as UTF-8, with surrounding whitespace
        stripped.
    """
    command = ["node", "-e", script]
    node_process = subprocess.run(
        command,
        cwd=REPO_ROOT,
        capture_output=True,
        text=True,
        encoding="utf-8",
        check=True,
    )
    stdout_text = node_process.stdout
    return stdout_text.strip()
|
||||
|
||||
|
||||
def test_metric_presenter_applies_thresholds_and_noise_direction() -> None:
    """MetricPresenter should centralize thresholds and inverse noise semantics.

    Loads ``metric_presenter.js`` inside a Node ``vm`` sandbox that exposes a
    stub ``window`` object, calls the helper for a handful of scores, and
    checks the JSON payload the script prints.
    """
    import json  # local import keeps this test self-contained

    metric_js = (REPO_ROOT / "webapp" / "static" / "js" / "metric_presenter.js").as_posix()
    # json.dumps emits a fully escaped string literal that is also valid
    # JavaScript, so a checkout path containing quotes or other special
    # characters cannot break the generated script.
    metric_js_literal = json.dumps(metric_js)
    script = f"""
    const fs = require("fs");
    const vm = require("vm");
    const code = fs.readFileSync({metric_js_literal}, "utf8");
    const sandbox = {{ window: {{}}, console }};
    vm.runInNewContext(code, sandbox);
    const p = sandbox.window.MetricPresenter;
    const result = {{
      faith085: p.scoreClass("faithfulness", 0.85),
      faith070: p.scoreClass("faithfulness", 0.70),
      faith064: p.scoreClass("faithfulness", 0.64),
      noise010: p.scoreClass("noise_sensitivity", 0.10),
      noise030: p.scoreClass("noise_sensitivity", 0.30),
      noise050: p.scoreClass("noise_sensitivity", 0.50),
      desc: p.describeMetric("faithfulness"),
      noiseDesc: p.describeMetric("noise_sensitivity"),
      noiseBin: p.binColor("noise_sensitivity", 0.0),
      faithBin: p.binColor("faithfulness", 0.8)
    }};
    console.log(JSON.stringify(result));
    """
    output = _run_node(script)
    assert output, "node script produced no output"
    # The script's final statement prints the JSON payload, so parse the last
    # stdout line; anything the loaded module logged earlier is ignored.
    result = json.loads(output.splitlines()[-1])

    expected = {
        # Faithfulness: higher scores are better.
        "faith085": "good",
        "faith070": "warn",
        "faith064": "bad",
        # Noise sensitivity: lower scores are better.
        "noise010": "good",
        "noise030": "warn",
        "noise050": "bad",
        "noiseBin": "#16a34a",
        "faithBin": "#16a34a",
    }
    for key, value in expected.items():
        actual = result.get(key)
        assert actual == value, f"{key}: expected {value!r}, got {actual!r}"

    # JSON.stringify drops undefined values, so a missing description shows up
    # here as an absent key rather than a string.
    assert isinstance(result.get("desc"), str)
    assert isinstance(result.get("noiseDesc"), str)
|
||||
|
||||
|
||||
def test_report_and_index_load_metric_presenter_helper() -> None:
    """The index page and front-end scripts should reference the shared helper.

    Checks, by plain substring search of the static assets, that
    ``index.html`` mentions the helper script and that ``report.js`` and
    ``app.js`` call into ``MetricPresenter``.
    """
    static_dir = REPO_ROOT / "webapp" / "static"
    js_dir = static_dir / "js"

    # Read every asset up front so a missing file fails before any assertion.
    index_markup = (static_dir / "index.html").read_text(encoding="utf-8")
    report_source = (js_dir / "report.js").read_text(encoding="utf-8")
    app_source = (js_dir / "app.js").read_text(encoding="utf-8")

    assert "js/metric_presenter.js" in index_markup
    assert "MetricPresenter.describeMetric" in report_source
    assert "MetricPresenter.scoreClass" in app_source
|
||||
@@ -88,3 +88,30 @@ def test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path)
|
||||
)
|
||||
|
||||
assert _infer_metrics_from_scores(run_dir) == ["faithfulness"]
|
||||
|
||||
|
||||
def test_build_report_ranks_noise_sensitivity_with_lower_values_as_better(tmp_path: Path) -> None:
    """Lowest-sample review should treat higher noise sensitivity as worse.

    Writes a three-row ``scores.csv`` plus the accompanying markdown files
    into a temporary run directory, then expects ``lowest_samples`` to list
    the samples from highest to lowest noise sensitivity.
    """
    run_dir = tmp_path / "run"
    run_dir.mkdir(parents=True, exist_ok=True)

    csv_rows = [
        "sample_id,question,noise_sensitivity",
        "s-good,q1,0.10",
        "s-warn,q2,0.30",
        "s-bad,q3,0.90",
    ]
    # Dict order matches the order the files are written to disk.
    fixture_files = {
        "scores.csv": "\n".join(csv_rows),
        "summary.md": "summary",
        "optimization_advice.md": "",
    }
    for file_name, content in fixture_files.items():
        (run_dir / file_name).write_text(content, encoding="utf-8")

    report = build_report(run_dir, ["noise_sensitivity"])

    ranked_ids = [sample.sample_id for sample in report.lowest_samples[:3]]
    assert ranked_ids == ["s-bad", "s-warn", "s-good"]
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import pytest
|
||||
from unittest.mock import sentinel
|
||||
|
||||
from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse
|
||||
|
||||
def test_llm_profile_defaults():
|
||||
@@ -147,3 +149,57 @@ def test_resolve_openai_client_kwargs_falls_back_to_env(tmp_path, monkeypatch):
|
||||
assert kwargs["api_key"] == "sk-env"
|
||||
assert kwargs["base_url"] == "http://env-base/v1"
|
||||
assert kwargs["timeout"] == 45.0
|
||||
|
||||
|
||||
def test_build_models_uses_high_default_max_tokens_for_structured_judge(monkeypatch):
    """Structured RAGAS judge calls should use a larger completion budget by default.

    Replaces the OpenAI client and both model factories with stubs, then
    checks that ``build_models`` forwards the model name, the stub client,
    and ``max_tokens=4096`` to ``llm_factory`` when default settings are used.
    """
    import rag_eval.metrics.factory as factory
    from rag_eval.settings import EvaluationSettings

    recorded: dict[str, object] = {}

    def fake_llm_factory(model, client=None, **kwargs):
        # Record everything the factory was called with for later assertions.
        recorded.update(model=model, client=client, kwargs=kwargs)
        return sentinel.llm

    stubs = {
        "AsyncOpenAI": lambda **kwargs: sentinel.client,
        "llm_factory": fake_llm_factory,
        "embedding_factory": lambda **kwargs: sentinel.embeddings,
    }
    for attribute, stub in stubs.items():
        monkeypatch.setattr(factory, attribute, stub)

    settings = EvaluationSettings()
    llm, embeddings = factory.build_models("gpt-5", "text-embedding-3-small", settings)

    assert llm is sentinel.llm
    assert embeddings is sentinel.embeddings
    assert recorded["model"] == "gpt-5"
    assert recorded["client"] is sentinel.client
    assert recorded["kwargs"] == {"max_tokens": 4096}
|
||||
|
||||
|
||||
def test_build_models_allows_env_override_for_judge_max_tokens(monkeypatch):
    """Operators should be able to raise the judge completion budget via settings.

    Stubs the OpenAI client and both model factories, builds the models with
    ``RAGAS_LLM_MAX_TOKENS=8192``, and checks that exactly that value reaches
    ``llm_factory`` as ``max_tokens``.
    """
    import rag_eval.metrics.factory as factory
    from rag_eval.settings import EvaluationSettings

    recorded: dict[str, object] = {}

    def fake_llm_factory(model, client=None, **kwargs):
        # Only the extra keyword arguments matter for this test.
        recorded["kwargs"] = kwargs
        return sentinel.llm

    stubs = {
        "AsyncOpenAI": lambda **kwargs: sentinel.client,
        "llm_factory": fake_llm_factory,
        "embedding_factory": lambda **kwargs: sentinel.embeddings,
    }
    for attribute, stub in stubs.items():
        monkeypatch.setattr(factory, attribute, stub)

    settings = EvaluationSettings(RAGAS_LLM_MAX_TOKENS=8192)
    factory.build_models("gpt-5", "text-embedding-3-small", settings)

    assert recorded["kwargs"] == {"max_tokens": 8192}
|
||||
|
||||
Reference in New Issue
Block a user