feat(logging): add structured evaluation logs for metric-level debugging

- pipeline.py: log each metric score/timeout/error with sample_id, elapsed time, and score value; log the NaN list per sample; emit a progress counter N/total after each sample completes
- evaluator.py: log eval start, dataset counts, adapter enrichment progress (per-sample OK/FAIL with elapsed), metric scoring summary, and per-metric NaN rate at end of run
- runner.py: _setup_logging() helper writes to stderr + optional file; ragas/httpx/openai noisy loggers throttled to WARNING
- main.py: add --log-file and --log-level CLI flags

Usage: python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 10:48:41 +08:00
parent 1ff4a3943a
commit 629304aa6d
4 changed files with 164 additions and 10 deletions
--- a/rag_eval/metrics/pipeline.py
+++ b/rag_eval/metrics/pipeline.py
@@ -3,12 +3,16 @@
 from __future__ import annotations

 import asyncio
+import logging
 import math
+import time
 from dataclasses import dataclass
 from typing import Any

 from rag_eval.shared.models import MetricScore, NormalizedSample

+logger = logging.getLogger("rag_eval.metrics.pipeline")
+

@dataclass(slots=True)
 class MetricPipeline:
@@ -22,12 +26,43 @@ class MetricPipeline:
        results = {name: math.nan for name in self.metrics}
        errors: list[str] = []

+        sid = sample.sample_id[:12]
+        ans_len = len(sample.answer or "")
+        ctx_count = len(sample.contexts or [])
+        logger.debug(
+            "[score] sample=%s  ans_len=%d  ctx_count=%d  question=%r",
+            sid, ans_len, ctx_count,
+            (sample.question or "")[:80],
+        )
+
        for name, metric in self.metrics.items():
+            t0 = time.monotonic()
            try:
                result = await self._run_metric(name, metric, sample)
-                results[name] = float(result.value)
+                score_val = float(result.value)
+                results[name] = score_val
+                elapsed = time.monotonic() - t0
+                logger.info(
+                    "[metric OK ] sample=%-12s  %-20s  score=%.4f  elapsed=%.1fs",
+                    sid, name, score_val, elapsed,
+                )
+            except asyncio.TimeoutError:
+                elapsed = time.monotonic() - t0
+                msg = f"timeout after {self.metric_timeout_seconds}s"
+                errors.append(f"{name}: {msg}")
+                logger.warning(
+                    "[metric TMO] sample=%-12s  %-20s  TIMEOUT after %.1fs",
+                    sid, name, elapsed,
+                )
            except Exception as exc:
+                elapsed = time.monotonic() - t0
+                exc_type = type(exc).__name__
                errors.append(f"{name}: {exc}")
+                logger.warning(
+                    "[metric ERR] sample=%-12s  %-20s  %s: %s  (elapsed=%.1fs)",
+                    sid, name, exc_type, exc, elapsed,
+                )
+
        return MetricScore(metrics=results, error=" | ".join(errors))

    async def _run_metric(self, name: str, metric: Any, sample: NormalizedSample) -> Any:
@@ -72,11 +107,22 @@ class MetricPipeline:
        max_concurrency: int,
    ) -> list[MetricScore]:
        """Score all samples while respecting the configured concurrency limit."""
+        total = len(samples)
+        logger.info("[pipeline] scoring %d samples  concurrency=%d  timeout=%ss",
+                    total, max_concurrency, self.metric_timeout_seconds)
        semaphore = asyncio.Semaphore(max(1, max_concurrency))
+        completed = 0

-        async def guarded(sample: NormalizedSample) -> MetricScore:
+        async def guarded(idx: int, sample: NormalizedSample) -> MetricScore:
            """Throttle a single sample-scoring coroutine with the shared semaphore."""
+            nonlocal completed
            async with semaphore:
-                return await self.score_sample(sample)
+                result = await self.score_sample(sample)
+                completed += 1
+                nan_metrics = [k for k, v in result.metrics.items() if math.isnan(v)]
+                status = f"NaN={nan_metrics}" if nan_metrics else "all OK"
+                logger.info("[pipeline] progress %d/%d  sample=%-12s  %s",
+                            completed, total, sample.sample_id[:12], status)
+                return result

-        return await asyncio.gather(*(guarded(sample) for sample in samples))
+        return await asyncio.gather(*(guarded(i, s) for i, s in enumerate(samples)))