first commit
This commit is contained in:
5
rag_eval/reporting/__init__.py
Normal file
5
rag_eval/reporting/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Reporting helpers that write evaluation outputs to disk."""
|
||||
|
||||
from .writers import write_run_artifacts
|
||||
|
||||
__all__ = ["write_run_artifacts"]
|
||||
20
rag_eval/reporting/artifacts.py
Normal file
20
rag_eval/reporting/artifacts.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Helpers for deriving file-system paths for run artifacts."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from rag_eval.shared.models import RunArtifactPaths
|
||||
|
||||
|
||||
def build_artifact_paths(output_dir: Path, run_id: str) -> RunArtifactPaths:
    """Derive the artifact locations for one evaluation run.

    Every artifact lives inside a per-run directory, ``output_dir / run_id``.

    Args:
        output_dir: Base directory under which per-run directories are placed.
        run_id: Identifier of the run; used as the run directory name.

    Returns:
        A ``RunArtifactPaths`` whose ``root_dir`` is the run directory and
        whose remaining fields point at fixed file names inside it.
    """
    root = output_dir / run_id
    # Field name on RunArtifactPaths -> file name inside the run directory.
    file_names = {
        "scenario_snapshot": "scenario.snapshot.yaml",
        "scores_csv": "scores.csv",
        "invalid_csv": "invalid.csv",
        "summary_md": "summary.md",
        "metadata_json": "metadata.json",
    }
    artifact_files = {field: root / name for field, name in file_names.items()}
    return RunArtifactPaths(root_dir=root, **artifact_files)
|
||||
78
rag_eval/reporting/summary.py
Normal file
78
rag_eval/reporting/summary.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Markdown summary generation for completed evaluation runs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from rag_eval.shared.models import EvaluationResult
|
||||
|
||||
|
||||
def _table_from_frame(frame: pd.DataFrame) -> str:
|
||||
"""Render a small dataframe as a fixed-width markdown-friendly text table."""
|
||||
if frame.empty:
|
||||
return "No rows."
|
||||
|
||||
columns = list(frame.columns)
|
||||
rows = [[str(value) for value in row] for row in frame.astype(object).values.tolist()]
|
||||
widths = []
|
||||
for index, column in enumerate(columns):
|
||||
column_width = len(str(column))
|
||||
row_width = max((len(row[index]) for row in rows), default=0)
|
||||
widths.append(max(column_width, row_width))
|
||||
|
||||
header = " | ".join(str(column).ljust(widths[idx]) for idx, column in enumerate(columns))
|
||||
separator = "-|-".join("-" * widths[idx] for idx in range(len(columns)))
|
||||
body = [
|
||||
" | ".join(row[idx].ljust(widths[idx]) for idx in range(len(columns)))
|
||||
for row in rows
|
||||
]
|
||||
return "\n".join([header, separator, *body])
|
||||
|
||||
|
||||
def build_summary_markdown(result: EvaluationResult) -> str:
    """Build the human-readable markdown summary written for each evaluation run.

    The summary has a title, a bullet list of run facts, a "Metric Means"
    section and, when any score rows exist, a "Per-sample Scores" table.

    Args:
        result: The completed evaluation. This function reads ``run_id``,
            ``valid_samples``, ``invalid_samples``, ``score_rows`` and, from
            ``scenario``: ``scenario_name``, ``mode``, ``judge_model``,
            ``embedding_model`` and ``metrics``.

    Returns:
        The markdown document as a single string ending with a newline.
    """
    scenario = result.scenario
    valid_count = len(result.valid_samples)
    invalid_count = len(result.invalid_samples)
    scores = pd.DataFrame(result.score_rows)

    lines = [
        f"# {scenario.scenario_name}",
        "",
        f"- run_id: `{result.run_id}`",
        f"- mode: `{scenario.mode}`",
        f"- total_samples: `{valid_count + invalid_count}`",
        f"- valid_samples: `{valid_count}`",
        f"- invalid_samples: `{invalid_count}`",
        f"- judge_model: `{scenario.judge_model}`",
        f"- embedding_model: `{scenario.embedding_model}`",
        "",
        "## Metric Means",
        "",
    ]

    if scores.empty:
        lines.append("No valid samples were scored.")
        return "\n".join(lines) + "\n"

    metrics = list(scenario.metrics)
    for metric in metrics:
        mean_value = None
        if metric in scores.columns:
            # Coerce before averaging: a metric whose every score is None
            # yields an object-dtype column, on which
            # ``mean(numeric_only=True)`` raises instead of returning NaN.
            mean_value = pd.to_numeric(scores[metric], errors="coerce").mean()
        if isinstance(mean_value, float) and not math.isnan(mean_value):
            lines.append(f"- {metric}: `{mean_value:.4f}`")
        else:
            # Metric column missing, or no numeric score for any sample.
            lines.append(f"- {metric}: `n/a`")

    # Keep the summary self-sufficient by including every scored sample and its errors.
    # ``reindex`` fills columns absent from the score rows with NaN rather than
    # raising KeyError, so a partially populated run still gets a summary.
    detail = scores.reindex(columns=["sample_id", *metrics, "error"])
    lines.extend(
        [
            "",
            "## Per-sample Scores",
            "",
            "```text",
            _table_from_frame(detail),
            "```",
        ]
    )
    return "\n".join(lines) + "\n"
|
||||
52
rag_eval/reporting/writers.py
Normal file
52
rag_eval/reporting/writers.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""Writers that persist evaluation outputs as local run artifacts."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import pandas as pd
|
||||
import yaml
|
||||
|
||||
from rag_eval.reporting.artifacts import build_artifact_paths
|
||||
from rag_eval.reporting.summary import build_summary_markdown
|
||||
from rag_eval.shared.models import EvaluationResult
|
||||
from rag_eval.shared.utils import ensure_directory
|
||||
|
||||
|
||||
def write_run_artifacts(result: EvaluationResult) -> None:
    """Persist the standard set of artifacts for a finished evaluation.

    Files are written, in this order, into the run directory derived from
    ``result.scenario.output_dir`` and ``result.run_id``:

    1. the scenario snapshot as YAML,
    2. the score rows as CSV,
    3. the invalid samples as CSV,
    4. the markdown summary,
    5. a compact metadata JSON document.

    Args:
        result: The completed evaluation whose outputs are written to disk.
    """
    scenario = result.scenario
    paths = build_artifact_paths(scenario.output_dir, result.run_id)
    ensure_directory(paths.root_dir)

    snapshot_text = yaml.safe_dump(
        scenario.snapshot(),
        sort_keys=False,
        allow_unicode=True,
    )
    paths.scenario_snapshot.write_text(snapshot_text, encoding="utf-8")

    scores_frame = pd.DataFrame(result.score_rows)
    scores_frame.to_csv(paths.scores_csv, index=False)

    invalid_records = [sample.to_record() for sample in result.invalid_samples]
    invalid_frame = pd.DataFrame(invalid_records)
    invalid_frame.to_csv(paths.invalid_csv, index=False)

    summary_text = build_summary_markdown(result)
    paths.summary_md.write_text(summary_text, encoding="utf-8")

    # Keep a compact machine-readable summary alongside the larger CSV and markdown outputs.
    # NOTE(review): assumes started_at/finished_at and mode are JSON-serializable — confirm.
    metadata = {
        "run_id": result.run_id,
        "scenario_name": scenario.scenario_name,
        "mode": scenario.mode,
        "judge_model": scenario.judge_model,
        "embedding_model": scenario.embedding_model,
        "started_at": result.started_at,
        "finished_at": result.finished_at,
        "dataset": scenario.dataset.path.as_posix(),
        "valid_samples": len(result.valid_samples),
        "invalid_samples": len(result.invalid_samples),
    }
    metadata_text = json.dumps(metadata, ensure_ascii=False, indent=2)
    paths.metadata_json.write_text(metadata_text, encoding="utf-8")
|
||||
Reference in New Issue
Block a user