feat(logging): add structured evaluation logs for metric-level debugging
- pipeline.py: log each metric score/timeout/error with sample_id, elapsed time, and score value; log NaN list per sample; progress counter N/total after each sample completes
- evaluator.py: log eval start, dataset counts, adapter enrichment progress (per-sample OK/FAIL with elapsed), metric scoring summary, and per-metric NaN rate at end of run
- runner.py: _setup_logging() helper writes to stderr + optional file; ragas/httpx/openai noisy loggers throttled to WARNING
- main.py: add --log-file and --log-level CLI flags

Usage: python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -3,12 +3,16 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from rag_eval.shared.models import MetricScore, NormalizedSample
|
||||
|
||||
logger = logging.getLogger("rag_eval.metrics.pipeline")
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class MetricPipeline:
|
||||
@@ -22,12 +26,43 @@ class MetricPipeline:
|
||||
results = {name: math.nan for name in self.metrics}
|
||||
errors: list[str] = []
|
||||
|
||||
sid = sample.sample_id[:12]
|
||||
ans_len = len(sample.answer or "")
|
||||
ctx_count = len(sample.contexts or [])
|
||||
logger.debug(
|
||||
"[score] sample=%s ans_len=%d ctx_count=%d question=%r",
|
||||
sid, ans_len, ctx_count,
|
||||
(sample.question or "")[:80],
|
||||
)
|
||||
|
||||
for name, metric in self.metrics.items():
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
result = await self._run_metric(name, metric, sample)
|
||||
results[name] = float(result.value)
|
||||
score_val = float(result.value)
|
||||
results[name] = score_val
|
||||
elapsed = time.monotonic() - t0
|
||||
logger.info(
|
||||
"[metric OK ] sample=%-12s %-20s score=%.4f elapsed=%.1fs",
|
||||
sid, name, score_val, elapsed,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
elapsed = time.monotonic() - t0
|
||||
msg = f"timeout after {self.metric_timeout_seconds}s"
|
||||
errors.append(f"{name}: {msg}")
|
||||
logger.warning(
|
||||
"[metric TMO] sample=%-12s %-20s TIMEOUT after %.1fs",
|
||||
sid, name, elapsed,
|
||||
)
|
||||
except Exception as exc:
|
||||
elapsed = time.monotonic() - t0
|
||||
exc_type = type(exc).__name__
|
||||
errors.append(f"{name}: {exc}")
|
||||
logger.warning(
|
||||
"[metric ERR] sample=%-12s %-20s %s: %s (elapsed=%.1fs)",
|
||||
sid, name, exc_type, exc, elapsed,
|
||||
)
|
||||
|
||||
return MetricScore(metrics=results, error=" | ".join(errors))
|
||||
|
||||
async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
|
||||
@@ -72,11 +107,22 @@ class MetricPipeline:
|
||||
max_concurrency: int,
|
||||
) -> list[MetricScore]:
|
||||
"""Score all samples while respecting the configured concurrency limit."""
|
||||
total = len(samples)
|
||||
logger.info("[pipeline] scoring %d samples concurrency=%d timeout=%ss",
|
||||
total, max_concurrency, self.metric_timeout_seconds)
|
||||
semaphore = asyncio.Semaphore(max(1, max_concurrency))
|
||||
completed = 0
|
||||
|
||||
async def guarded(sample: NormalizedSample) -> MetricScore:
|
||||
async def guarded(idx: int, sample: NormalizedSample) -> MetricScore:
|
||||
"""Throttle a single sample-scoring coroutine with the shared semaphore."""
|
||||
nonlocal completed
|
||||
async with semaphore:
|
||||
return await self.score_sample(sample)
|
||||
result = await self.score_sample(sample)
|
||||
completed += 1
|
||||
nan_metrics = [k for k, v in result.metrics.items() if math.isnan(v)]
|
||||
status = f"NaN={nan_metrics}" if nan_metrics else "all OK"
|
||||
logger.info("[pipeline] progress %d/%d sample=%-12s %s",
|
||||
completed, total, sample.sample_id[:12], status)
|
||||
return result
|
||||
|
||||
return await asyncio.gather(*(guarded(sample) for sample in samples))
|
||||
return await asyncio.gather(*(guarded(i, s) for i, s in enumerate(samples)))
|
||||
|
||||
Reference in New Issue
Block a user