feat(logging): add structured evaluation logs for metric-level debugging

- pipeline.py: log each metric score/timeout/error with sample_id, elapsed
  time, and score value; log NaN list per sample; progress counter N/total
  after each sample completes
- evaluator.py: log eval start, dataset counts, adapter enrichment progress
  (per-sample OK/FAIL with elapsed), metric scoring summary, and per-metric
  NaN rate at end of run
- runner.py: _setup_logging() helper writes to stderr + optional file;
  ragas/httpx/openai noisy loggers throttled to WARNING
- main.py: add --log-file and --log-level CLI flags

Usage:
  python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 10:48:41 +08:00
parent 1ff4a3943a
commit 629304aa6d
4 changed files with 164 additions and 10 deletions
--- a/main.py
+++ b/main.py
@@ -1,6 +1,8 @@
 from __future__ import annotations

 import argparse
+import logging
+from pathlib import Path

 from rag_eval.dataset_builder.runner import run_dataset_build
 from rag_eval.execution.runner import run_scenario
@@ -18,18 +20,33 @@ def parse_args() -> argparse.Namespace:
        "--dataset-build-config",
        help="Path to a YAML dataset build config file.",
    )
+    parser.add_argument(
+        "--log-file",
+        default=None,
+        help="Write evaluation logs to this file (in addition to stderr). "
+             "Example: logs/eval.log",
+    )
+    parser.add_argument(
+        "--log-level",
+        default="INFO",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
+        help="Logging verbosity level (default: INFO). Use DEBUG for per-metric detail.",
+    )
    return parser.parse_args()


 def main() -> None:
    """Dispatch the CLI call to the requested workflow."""
    args = parse_args()
+    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
+    log_file = Path(args.log_file) if args.log_file else None
+
    if args.dataset_build_config:
        result = run_dataset_build(args.dataset_build_config)
        print(f"Completed dataset build: {result.artifact_paths.root_dir}")
        return

-    result = run_scenario(args.scenario)
+    result = run_scenario(args.scenario, log_file=log_file, log_level=log_level)
    print(f"Completed run: {result.scenario.output_dir}")