feat(logging): add structured evaluation logs for metric-level debugging

- pipeline.py: log each metric score/timeout/error with sample_id,
  elapsed time, and score value; log NaN list per sample; progress
  counter N/total after each sample completes
- evaluator.py: log eval start, dataset counts, adapter enrichment
  progress (per-sample OK/FAIL with elapsed), metric scoring summary,
  and per-metric NaN rate at end of run
- runner.py: _setup_logging() helper writes to stderr + optional file;
  ragas/httpx/openai noisy loggers throttled to WARNING
- main.py: add --log-file and --log-level CLI flags

Usage:
  python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2026-06-16 10:48:41 +08:00
parent 1ff4a3943a
commit 629304aa6d
4 changed files with 164 additions and 10 deletions

View File

@@ -3,6 +3,8 @@
from __future__ import annotations
import asyncio
import logging
import time
from typing import Any
from rag_eval.adapters.base import AppAdapter
@@ -13,6 +15,8 @@ from rag_eval.metrics.pipeline import MetricPipeline
from rag_eval.shared.models import EvaluationResult, InvalidSample, NormalizedSample, Scenario
from rag_eval.shared.utils import utc_now_iso
logger = logging.getLogger("rag_eval.execution.evaluator")
class Evaluator:
"""Coordinate dataset loading, optional app execution, and metric scoring."""
@@ -31,27 +35,61 @@ class Evaluator:
def evaluate(self) -> EvaluationResult:
"""Execute the full evaluation flow and return the collected results."""
started_at = utc_now_iso()
scenario_name = self.scenario.scenario_name
mode = self.scenario.mode
logger.info("=" * 60)
logger.info("[eval] START scenario=%s mode=%s", scenario_name, mode)
logger.info("[eval] dataset=%s", self.scenario.dataset.path)
logger.info("[eval] metrics=%s", list(self.scenario.metrics))
logger.info("[eval] judge=%s embed=%s", self.scenario.judge_model, self.scenario.embedding_model)
raw_records = load_dataset_records(self.scenario.dataset.path)
logger.info("[eval] raw_records=%d", len(raw_records))
samples, invalid_samples = normalize_records(
raw_records,
mode=self.scenario.mode,
max_samples=self.scenario.runtime.max_samples,
)
logger.info("[eval] normalized: valid=%d invalid=%d", len(samples), len(invalid_samples))
if self.scenario.mode == "online":
# Online mode enriches each sample by calling the target application first.
logger.info("[eval] online mode: calling app adapter for %d samples ...", len(samples))
t0 = time.monotonic()
samples, online_invalids = asyncio.run(self._enrich_online_samples(samples))
elapsed = time.monotonic() - t0
invalid_samples.extend(online_invalids)
logger.info(
"[eval] adapter done: enriched=%d adapter_invalids=%d elapsed=%.1fs",
len(samples), len(online_invalids), elapsed,
)
logger.info("[eval] scoring %d samples with metric pipeline ...", len(samples))
t0 = time.monotonic()
metric_scores = asyncio.run(
self.metric_pipeline.score_samples(
samples,
max_concurrency=self.scenario.runtime.metric_limit(),
)
)
elapsed = time.monotonic() - t0
logger.info("[eval] metric scoring done elapsed=%.1fs", elapsed)
finished_at = utc_now_iso()
score_rows = [self._merge_score(sample, score) for sample, score in zip(samples, metric_scores)]
# Summary of NaN rates per metric
import math
for metric_name in self.scenario.metrics:
nan_count = sum(1 for row in score_rows if math.isnan(float(row.get(metric_name, float("nan")) or float("nan"))))
logger.info("[eval] %-22s NaN=%d/%d (%.0f%%)",
metric_name, nan_count, len(score_rows),
100 * nan_count / len(score_rows) if score_rows else 0)
run_id = finished_at.replace(":", "-")
logger.info("[eval] DONE run_id=%s total_valid=%d total_invalid=%d",
run_id, len(samples), len(invalid_samples))
logger.info("=" * 60)
return EvaluationResult(
scenario=self.scenario,
run_id=run_id,
@@ -72,13 +110,27 @@ class Evaluator:
valid: list[NormalizedSample] = []
invalid: list[InvalidSample] = []
total = len(samples)
async def enrich_with_capture(sample: NormalizedSample) -> NormalizedSample | InvalidSample:
async def enrich_with_capture(idx: int, sample: NormalizedSample) -> NormalizedSample | InvalidSample:
"""Convert adapter exceptions into invalid samples instead of aborting the run."""
sid = sample.sample_id[:12]
logger.debug("[adapter] [%d/%d] calling adapter sample=%s question=%r",
idx + 1, total, sid, (sample.question or "")[:60])
t0 = time.monotonic()
try:
return await self.app_adapter.enrich_sample(sample)
result = await self.app_adapter.enrich_sample(sample)
elapsed = time.monotonic() - t0
ans_len = len(result.answer or "")
ctx_count = len(result.contexts or [])
logger.info("[adapter] [%d/%d] OK sample=%-12s ans_len=%d ctx_count=%d elapsed=%.1fs",
idx + 1, total, sid, ans_len, ctx_count, elapsed)
return result
except Exception as exc:
elapsed = time.monotonic() - t0
error_type = type(exc).__name__
logger.warning("[adapter] [%d/%d] FAIL sample=%-12s %s: %s (elapsed=%.1fs)",
idx + 1, total, sid, error_type, exc, elapsed)
return InvalidSample(
sample_id=sample.sample_id,
error=f"adapter failed [{error_type}]: {exc}",
@@ -86,8 +138,8 @@ class Evaluator:
)
factories = [
(lambda sample=sample: enrich_with_capture(sample))
for sample in samples
(lambda _idx=i, _sample=sample: enrich_with_capture(_idx, _sample))
for i, sample in enumerate(samples)
]
results = await gather_with_limit(factories, self.scenario.runtime.app_limit())
@@ -102,6 +154,8 @@ class Evaluator:
if not sample.contexts:
errors.append("adapter returned empty contexts")
if errors:
logger.warning("[adapter] incomplete payload sample=%s errors=%s",
sample.sample_id[:12], errors)
invalid.append(
InvalidSample(
sample_id=sample.sample_id,
@@ -111,6 +165,9 @@ class Evaluator:
)
continue
valid.append(sample)
logger.info("[adapter] enrichment summary: valid=%d invalid=%d of total=%d",
len(valid), len(invalid), total)
return valid, invalid
def _merge_score(self, sample: NormalizedSample, score: Any) -> dict[str, Any]:

View File

@@ -2,6 +2,10 @@
from __future__ import annotations
import logging
import sys
from pathlib import Path
from rag_eval.adapters.http import HttpAppAdapter
from rag_eval.adapters.python import PythonFunctionAdapter
from rag_eval.config.loader import load_scenario
@@ -12,6 +16,27 @@ from rag_eval.shared.models import Scenario
from .evaluator import Evaluator
logger = logging.getLogger("rag_eval.execution.runner")
def _setup_logging(log_file: Path | None = None, level: int = logging.INFO) -> None:
"""Configure root logger: always write to stderr, optionally also to a file."""
fmt = "%(asctime)s %(levelname)-8s %(name)s %(message)s"
datefmt = "%H:%M:%S"
handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
if log_file is not None:
log_file.parent.mkdir(parents=True, exist_ok=True)
fh = logging.FileHandler(log_file, encoding="utf-8")
fh.setFormatter(logging.Formatter(fmt, datefmt=datefmt))
handlers.append(fh)
logging.basicConfig(level=level, format=fmt, datefmt=datefmt, handlers=handlers, force=True)
# Also show ragas internal logs at WARNING so we can see LLM errors
logging.getLogger("ragas").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("openai").setLevel(logging.WARNING)
def build_adapter(scenario: Scenario):
"""Instantiate the adapter required by the resolved scenario, if any."""
@@ -27,16 +52,25 @@ def build_adapter(scenario: Scenario):
def run_scenario(
    scenario_path: str,
    settings: EvaluationSettings | None = None,
    log_file: Path | None = None,
    log_level: int = logging.INFO,
):
    """Evaluate a single scenario file and write its reporting artifacts.

    Args:
        scenario_path: Path of the scenario definition handed to ``load_scenario``.
        settings: Evaluation settings; a default ``EvaluationSettings()`` is
            created when this is omitted (or falsy).
        log_file: Optional log file forwarded to ``_setup_logging``.
        log_level: Logging level forwarded to ``_setup_logging``.

    Returns:
        The result object produced by ``Evaluator.evaluate()``.

    Raises:
        EnvironmentError: If the settings carry no OpenAI API key.
    """
    _setup_logging(log_file=log_file, level=log_level)
    logger.info("[runner] run_scenario path=%s", scenario_path)

    if not settings:
        settings = EvaluationSettings()
    # Fail before loading anything when the API key is missing.
    if not settings.openai_api_key:
        raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")

    scenario = load_scenario(scenario_path)
    logger.info(
        "[runner] scenario loaded: name=%s mode=%s max_samples=%s",
        scenario.scenario_name,
        scenario.mode,
        scenario.runtime.max_samples,
    )

    # Build the collaborators in the same order as before: adapter, then pipeline.
    app_adapter = build_adapter(scenario)
    metric_pipeline = build_metric_pipeline(scenario, settings)
    result = Evaluator(
        scenario=scenario,
        metric_pipeline=metric_pipeline,
        app_adapter=app_adapter,
    ).evaluate()

    write_run_artifacts(result)
    logger.info("[runner] artifacts written for run_id=%s", result.run_id)
    return result