2026-06-12 14:02:15 +08:00
|
|
|
"""Core evaluation workflow for offline and online scenarios."""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import asyncio
|
2026-06-16 10:48:41 +08:00
|
|
|
import logging
|
|
|
|
|
import time
|
2026-06-12 14:02:15 +08:00
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
from rag_eval.adapters.base import AppAdapter
|
|
|
|
|
from rag_eval.datasets.loader import load_dataset_records
|
|
|
|
|
from rag_eval.datasets.normalizers import normalize_records
|
|
|
|
|
from rag_eval.execution.concurrency import gather_with_limit
|
|
|
|
|
from rag_eval.metrics.pipeline import MetricPipeline
|
2026-06-18 16:53:45 +08:00
|
|
|
from rag_eval.metrics.weights import compute_weighted_score, resolve_weight
|
2026-06-12 14:02:15 +08:00
|
|
|
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
|
|
|
|
|
from rag_eval.shared.utils import utc_now_iso
|
|
|
|
|
|
2026-06-16 10:48:41 +08:00
|
|
|
# Module-level logger shared by everything in this file. The logger name is
# spelled out as a literal rather than derived from ``__name__``.
logger = logging.getLogger("rag_eval.execution.evaluator")
|
|
|
|
|
|
2026-06-12 14:02:15 +08:00
|
|
|
|
|
|
|
|
class Evaluator:
    """Coordinate dataset loading, optional app execution, and metric scoring."""

    def __init__(
        self,
        scenario: Scenario,
        metric_pipeline: MetricPipeline,
        app_adapter: AppAdapter | None = None,
    ):
        """Create an evaluator for one resolved scenario.

        Args:
            scenario: Resolved scenario; this class reads its dataset path,
                mode, metrics, runtime limits, model names and weights.
            metric_pipeline: Pipeline whose ``score_samples`` coroutine scores
                the valid samples.
            app_adapter: Adapter used to enrich samples in ``"online"`` mode.
                May be ``None`` for other modes.
        """
        self.scenario = scenario
        self.metric_pipeline = metric_pipeline
        self.app_adapter = app_adapter

    def evaluate(self) -> EvaluationResult:
        """Execute the full evaluation flow and return the collected results.

        Steps: load raw records, normalize them, optionally enrich them through
        the app adapter (``"online"`` mode only), score them with the metric
        pipeline, and merge each score with its sample into a flat row.

        This method drives its coroutines with ``asyncio.run`` and therefore
        must be called from synchronous code (``asyncio.run`` raises
        ``RuntimeError`` when an event loop is already running in the thread).

        Returns:
            An ``EvaluationResult`` carrying the scenario, run id, timestamps,
            valid samples, invalid samples, and per-sample score rows.

        Raises:
            ValueError: In ``"online"`` mode when no app adapter was supplied.
        """
        started_at = utc_now_iso()
        scenario_name = self.scenario.scenario_name
        mode = self.scenario.mode
        logger.info("=" * 60)
        logger.info("[eval] START scenario=%s mode=%s", scenario_name, mode)
        logger.info("[eval] dataset=%s", self.scenario.dataset.path)
        logger.info("[eval] metrics=%s", list(self.scenario.metrics))
        logger.info("[eval] judge=%s embed=%s", self.scenario.judge_model, self.scenario.embedding_model)

        raw_records = load_dataset_records(self.scenario.dataset.path)
        logger.info("[eval] raw_records=%d", len(raw_records))

        samples, invalid_samples = normalize_records(
            raw_records,
            mode=self.scenario.mode,
            max_samples=self.scenario.runtime.max_samples,
        )
        logger.info("[eval] normalized: valid=%d invalid=%d", len(samples), len(invalid_samples))

        if self.scenario.mode == "online":
            logger.info("[eval] online mode: calling app adapter for %d samples ...", len(samples))
            t0 = time.monotonic()
            samples, online_invalids = asyncio.run(self._enrich_online_samples(samples))
            elapsed = time.monotonic() - t0
            invalid_samples.extend(online_invalids)
            logger.info(
                "[eval] adapter done: enriched=%d adapter_invalids=%d elapsed=%.1fs",
                len(samples), len(online_invalids), elapsed,
            )

        logger.info("[eval] scoring %d samples with metric pipeline ...", len(samples))
        t0 = time.monotonic()
        metric_scores = asyncio.run(
            self.metric_pipeline.score_samples(
                samples,
                max_concurrency=self.scenario.runtime.metric_limit(),
            )
        )
        elapsed = time.monotonic() - t0
        logger.info("[eval] metric scoring done elapsed=%.1fs", elapsed)

        finished_at = utc_now_iso()
        # NOTE(review): zip() stops at the shorter input, so a pipeline that
        # returns fewer scores than samples would silently drop rows here.
        score_rows = [self._merge_score(sample, score) for sample, score in zip(samples, metric_scores)]

        # Summary of NaN rates per metric
        self._log_nan_summary(score_rows)

        # ':' is replaced so the id no longer contains colons.
        run_id = finished_at.replace(":", "-")
        logger.info("[eval] DONE run_id=%s total_valid=%d total_invalid=%d",
                    run_id, len(samples), len(invalid_samples))
        logger.info("=" * 60)
        return EvaluationResult(
            scenario=self.scenario,
            run_id=run_id,
            started_at=started_at,
            finished_at=finished_at,
            valid_samples=samples,
            invalid_samples=invalid_samples,
            score_rows=score_rows,
        )

    @staticmethod
    def _is_missing_score(value: Any) -> bool:
        """Return True when *value* is not a usable numeric metric score.

        ``None``, NaN, and anything ``float()`` cannot convert count as
        missing. A real score of ``0`` / ``0.0`` does NOT count as missing:
        the previous ``value or nan`` idiom treated every falsy score as NaN
        and so over-reported the NaN rate.
        """
        import math

        if value is None:
            return True
        try:
            return math.isnan(float(value))
        except (TypeError, ValueError):
            # Non-numeric cell: report it as missing instead of aborting the
            # run after all scoring work has already been done.
            return True

    def _log_nan_summary(self, score_rows: list[dict[str, Any]]) -> None:
        """Log, for each scenario metric, how many rows lack a usable score."""
        total = len(score_rows)
        for metric_name in self.scenario.metrics:
            nan_count = sum(
                1 for row in score_rows if self._is_missing_score(row.get(metric_name))
            )
            logger.info("[eval] %-22s NaN=%d/%d (%.0f%%)",
                        metric_name, nan_count, total,
                        100 * nan_count / total if total else 0)

    async def _enrich_online_samples(
        self,
        samples: list[NormalizedSample],
    ) -> tuple[list[NormalizedSample], list[InvalidSample]]:
        """Populate answers and contexts by calling the configured application adapter.

        Each sample is passed to ``app_adapter.enrich_sample``; calls are
        dispatched through ``gather_with_limit`` using the scenario's
        ``runtime.app_limit()``.

        Args:
            samples: Normalized samples to enrich.

        Returns:
            ``(valid, invalid)``: samples that came back with both a non-empty
            answer and non-empty contexts, and ``InvalidSample`` entries for
            adapter exceptions or incomplete payloads.

        Raises:
            ValueError: If no app adapter is configured.
        """
        if self.app_adapter is None:
            raise ValueError("online mode requires an app adapter.")

        valid: list[NormalizedSample] = []
        invalid: list[InvalidSample] = []
        total = len(samples)

        async def enrich_with_capture(idx: int, sample: NormalizedSample) -> NormalizedSample | InvalidSample:
            """Convert adapter exceptions into invalid samples instead of aborting the run."""
            # Shortened id keeps log lines aligned and compact.
            sid = sample.sample_id[:12]
            logger.debug("[adapter] [%d/%d] calling adapter sample=%s question=%r",
                         idx + 1, total, sid, (sample.question or "")[:60])
            t0 = time.monotonic()
            try:
                result = await self.app_adapter.enrich_sample(sample)
                elapsed = time.monotonic() - t0
                ans_len = len(result.answer or "")
                ctx_count = len(result.contexts or [])
                logger.info("[adapter] [%d/%d] OK sample=%-12s ans_len=%d ctx_count=%d elapsed=%.1fs",
                            idx + 1, total, sid, ans_len, ctx_count, elapsed)
                return result
            except Exception as exc:
                # Deliberately broad: one failing sample must not abort the
                # whole run; the failure is recorded and reported instead.
                elapsed = time.monotonic() - t0
                error_type = type(exc).__name__
                logger.warning("[adapter] [%d/%d] FAIL sample=%-12s %s: %s (elapsed=%.1fs)",
                               idx + 1, total, sid, error_type, exc, elapsed)
                return InvalidSample(
                    sample_id=sample.sample_id,
                    error=f"adapter failed [{error_type}]: {exc}",
                    raw=sample.raw,
                )

        # Default arguments bind the current index/sample to each factory;
        # a plain closure would see only the last loop values.
        factories = [
            (lambda _idx=i, _sample=sample: enrich_with_capture(_idx, _sample))
            for i, sample in enumerate(samples)
        ]
        results = await gather_with_limit(factories, self.scenario.runtime.app_limit())

        for sample in results:
            if isinstance(sample, InvalidSample):
                invalid.append(sample)
                continue
            # Treat incomplete adapter payloads as invalid so reporting stays explicit.
            errors: list[str] = []
            if not sample.answer:
                errors.append("adapter returned empty answer")
            if not sample.contexts:
                errors.append("adapter returned empty contexts")
            if errors:
                logger.warning("[adapter] incomplete payload sample=%s errors=%s",
                               sample.sample_id[:12], errors)
                invalid.append(
                    InvalidSample(
                        sample_id=sample.sample_id,
                        error="; ".join(errors),
                        raw=sample.raw,
                    )
                )
                continue
            valid.append(sample)

        logger.info("[adapter] enrichment summary: valid=%d invalid=%d of total=%d",
                    len(valid), len(invalid), total)
        return valid, invalid

    def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:
        """Combine sample data, metric results, run metadata, and weight columns.

        Args:
            sample: The scored sample; supplies the base record, contexts and
                metadata (``doc_name`` is looked up for the sample weight).
            score: Scoring result; this method reads its ``metrics`` mapping
                and its ``error`` attribute.

        Returns:
            A flat dict: the sample record updated with the metric values plus
            ``error``, ``judge_model``, ``embedding_model``, ``run_id``,
            ``weighted_score`` and ``sample_weight``.
        """
        record = sample.to_record()
        record["contexts"] = sample.contexts
        record.update(score.metrics)
        record["error"] = score.error
        record["judge_model"] = self.scenario.judge_model
        record["embedding_model"] = self.scenario.embedding_model
        # NOTE(review): the row-level "run_id" is the scenario name, whereas
        # evaluate() derives a timestamp-based run_id for the result object.
        # Confirm this is intended before relying on it as a join key.
        record["run_id"] = self.scenario.scenario_name
        # Weighted score columns — enable post-hoc weighted aggregation in reporting.
        record["weighted_score"] = compute_weighted_score(
            score.metrics, self.scenario.metric_weights
        )
        doc_name = str(sample.metadata.get("doc_name", "") or "")
        record["sample_weight"] = resolve_weight(
            self.scenario.doc_weights, doc_name, default=1.0
        )
        return record
|