# siemens_ragas/rag_eval/execution/runner.py

"""High-level scenario runner used by the package and CLI entrypoints."""

from __future__ import annotations

import logging
import sys
from pathlib import Path

from rag_eval.adapters.http import HttpAppAdapter
from rag_eval.adapters.python import PythonFunctionAdapter
from rag_eval.advisor import run_advisor
from rag_eval.config.loader import load_scenario
from rag_eval.metrics.factory import build_models, build_metric_pipeline
from rag_eval.reporting.writers import write_run_artifacts
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.models import Scenario

from .evaluator import Evaluator

logger = logging.getLogger("rag_eval.execution.runner")


def _setup_logging(log_file: Path | str | None = None, level: int = logging.INFO) -> None:
    """Configure the root logger for an evaluation run.

    Records are always written to stderr. When *log_file* is given they are
    also written to that file (UTF-8); missing parent directories are created.
    Handlers already attached to the root logger are replaced (``force=True``),
    so calling this more than once does not stack handlers.

    The ``ragas``, ``httpx`` and ``openai`` loggers are held at WARNING or
    stricter: they never log below WARNING, and when *level* is above WARNING
    they follow it.

    Args:
        log_file: Optional log file location. A plain string is accepted and
            converted to ``Path``.
        level: Threshold applied to the root logger.
    """
    fmt = "%(asctime)s  %(levelname)-8s  %(name)s  %(message)s"
    datefmt = "%H:%M:%S"

    handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
    if log_file is not None:
        log_path = Path(log_file)
        log_path.parent.mkdir(parents=True, exist_ok=True)
        handlers.append(logging.FileHandler(log_path, encoding="utf-8"))

    # basicConfig installs a Formatter built from fmt/datefmt on every handler
    # that does not already have one, so both handlers share the same layout.
    logging.basicConfig(level=level, format=fmt, datefmt=datefmt, handlers=handlers, force=True)

    # Root handlers do not re-check the root logger's level for records that
    # propagate up from child loggers, so pinning these loggers to WARNING
    # unconditionally would let their warnings through even when the caller
    # asked for ERROR or CRITICAL. Read the level back from the root logger so
    # a level given as a name (e.g. "DEBUG") is already resolved to its number.
    third_party_level = max(logging.getLogger().level, logging.WARNING)
    for noisy_name in ("ragas", "httpx", "openai"):
        logging.getLogger(noisy_name).setLevel(third_party_level)


def build_adapter(scenario: Scenario):
    """Create the application adapter described by ``scenario.app_adapter``.

    Returns:
        ``None`` when the scenario has no adapter configuration; otherwise an
        ``HttpAppAdapter`` for type ``"http"`` or a ``PythonFunctionAdapter``
        for type ``"python"``, constructed from the adapter configuration.

    Raises:
        ValueError: If the configured adapter type is neither ``"http"`` nor
            ``"python"``.
    """
    adapter_config = scenario.app_adapter
    if adapter_config is None:
        return None

    adapter_type = adapter_config.type
    if adapter_type == "http":
        adapter_cls = HttpAppAdapter
    elif adapter_type == "python":
        adapter_cls = PythonFunctionAdapter
    else:
        raise ValueError(f"Unsupported adapter type: {adapter_type}")
    return adapter_cls(adapter_config)


def run_scenario(
    scenario_path: str,
    settings: EvaluationSettings | None = None,
    log_file: Path | None = None,
    log_level: int = logging.INFO,
):
    """Execute a single scenario from start to finish and return its result.

    Steps, in order: configure logging, resolve settings, require an OpenAI
    API key, load the scenario, build the judge LLM and embeddings, build the
    adapter and metric pipeline, run the evaluator, write the run artifacts,
    and finally hand the result to the optimization advisor.

    Args:
        scenario_path: Location of the scenario definition passed to
            ``load_scenario``.
        settings: Evaluation settings; a fresh ``EvaluationSettings()`` is
            used when none is supplied.
        log_file: Optional file that receives the log output in addition to
            stderr.
        log_level: Logging threshold forwarded to ``_setup_logging``.

    Returns:
        The object returned by ``Evaluator.evaluate()``.

    Raises:
        EnvironmentError: If the resolved settings carry no OpenAI API key.
    """
    _setup_logging(log_file=log_file, level=log_level)
    logger.info("[runner] run_scenario  path=%s", scenario_path)

    if not settings:
        settings = EvaluationSettings()
    if not settings.openai_api_key:
        raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")

    scenario = load_scenario(scenario_path)
    logger.info(
        "[runner] scenario loaded: name=%s  mode=%s  max_samples=%s",
        scenario.scenario_name,
        scenario.mode,
        scenario.runtime.max_samples,
    )

    # The judge LLM is created a single time and shared by the metric pipeline
    # and the advisor step below.
    judge_llm, embedding_model = build_models(
        scenario.judge_model, scenario.embedding_model, settings
    )

    app_adapter = build_adapter(scenario)
    metric_pipeline = build_metric_pipeline(
        scenario, settings, llm=judge_llm, embeddings=embedding_model
    )

    run_result = Evaluator(
        scenario=scenario,
        metric_pipeline=metric_pipeline,
        app_adapter=app_adapter,
    ).evaluate()
    write_run_artifacts(run_result)
    logger.info("[runner] artifacts written for run_id=%s", run_result.run_id)

    # Always invoked; run_advisor is expected to do nothing unless
    # scenario.optimization_advisor is True.
    run_advisor(run_result, scenario, judge_llm)

    return run_result
first commit 2026-06-12 14:02:15 +08:00			`"""High-level scenario runner used by the package and CLI entrypoints."""`

			`from __future__ import annotations`

feat(logging): add structured evaluation logs for metric-level debugging - pipeline.py: log each metric score/timeout/error with sample_id, elapsed time, and score value; log NaN list per sample; progress counter N/total after each sample completes - evaluator.py: log eval start, dataset counts, adapter enrichment progress (per-sample OK/FAIL with elapsed), metric scoring summary, and per-metric NaN rate at end of run - runner.py: _setup_logging() helper writes to stderr + optional file; ragas/httpx/openai noisy loggers throttled to WARNING - main.py: add --log-file and --log-level CLI flags Usage: python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 10:48:41 +08:00			`import logging`
			`import sys`
			`from pathlib import Path`

first commit 2026-06-12 14:02:15 +08:00			`from rag_eval.adapters.http import HttpAppAdapter`
			`from rag_eval.adapters.python import PythonFunctionAdapter`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`from rag_eval.advisor import run_advisor`
first commit 2026-06-12 14:02:15 +08:00			`from rag_eval.config.loader import load_scenario`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`from rag_eval.metrics.factory import build_models, build_metric_pipeline`
first commit 2026-06-12 14:02:15 +08:00			`from rag_eval.reporting.writers import write_run_artifacts`
			`from rag_eval.settings import EvaluationSettings`
			`from rag_eval.shared.models import Scenario`

			`from .evaluator import Evaluator`

feat(logging): add structured evaluation logs for metric-level debugging - pipeline.py: log each metric score/timeout/error with sample_id, elapsed time, and score value; log NaN list per sample; progress counter N/total after each sample completes - evaluator.py: log eval start, dataset counts, adapter enrichment progress (per-sample OK/FAIL with elapsed), metric scoring summary, and per-metric NaN rate at end of run - runner.py: _setup_logging() helper writes to stderr + optional file; ragas/httpx/openai noisy loggers throttled to WARNING - main.py: add --log-file and --log-level CLI flags Usage: python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 10:48:41 +08:00			`logger = logging.getLogger("rag_eval.execution.runner")`


			`def _setup_logging(log_file: Path \| None = None, level: int = logging.INFO) -> None:`
			`"""Configure root logger: always write to stderr, optionally also to a file."""`
			`fmt = "%(asctime)s %(levelname)-8s %(name)s %(message)s"`
			`datefmt = "%H:%M:%S"`

			`handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]`
			`if log_file is not None:`
			`log_file.parent.mkdir(parents=True, exist_ok=True)`
			`fh = logging.FileHandler(log_file, encoding="utf-8")`
			`fh.setFormatter(logging.Formatter(fmt, datefmt=datefmt))`
			`handlers.append(fh)`

			`logging.basicConfig(level=level, format=fmt, datefmt=datefmt, handlers=handlers, force=True)`
			`# Also show ragas internal logs at WARNING so we can see LLM errors`
			`logging.getLogger("ragas").setLevel(logging.WARNING)`
			`logging.getLogger("httpx").setLevel(logging.WARNING)`
			`logging.getLogger("openai").setLevel(logging.WARNING)`

first commit 2026-06-12 14:02:15 +08:00
			`def build_adapter(scenario: Scenario):`
			`"""Instantiate the adapter required by the resolved scenario, if any."""`
			`if scenario.app_adapter is None:`
			`return None`
			`if scenario.app_adapter.type == "http":`
			`return HttpAppAdapter(scenario.app_adapter)`
			`if scenario.app_adapter.type == "python":`
			`return PythonFunctionAdapter(scenario.app_adapter)`
			`raise ValueError(f"Unsupported adapter type: {scenario.app_adapter.type}")`


			`def run_scenario(`
			`scenario_path: str,`
			`settings: EvaluationSettings \| None = None,`
feat(logging): add structured evaluation logs for metric-level debugging - pipeline.py: log each metric score/timeout/error with sample_id, elapsed time, and score value; log NaN list per sample; progress counter N/total after each sample completes - evaluator.py: log eval start, dataset counts, adapter enrichment progress (per-sample OK/FAIL with elapsed), metric scoring summary, and per-metric NaN rate at end of run - runner.py: _setup_logging() helper writes to stderr + optional file; ragas/httpx/openai noisy loggers throttled to WARNING - main.py: add --log-file and --log-level CLI flags Usage: python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 10:48:41 +08:00			`log_file: Path \| None = None,`
			`log_level: int = logging.INFO,`
first commit 2026-06-12 14:02:15 +08:00			`):`
			`"""Run one scenario end to end and persist its reporting artifacts."""`
feat(logging): add structured evaluation logs for metric-level debugging - pipeline.py: log each metric score/timeout/error with sample_id, elapsed time, and score value; log NaN list per sample; progress counter N/total after each sample completes - evaluator.py: log eval start, dataset counts, adapter enrichment progress (per-sample OK/FAIL with elapsed), metric scoring summary, and per-metric NaN rate at end of run - runner.py: _setup_logging() helper writes to stderr + optional file; ragas/httpx/openai noisy loggers throttled to WARNING - main.py: add --log-file and --log-level CLI flags Usage: python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 10:48:41 +08:00			`_setup_logging(log_file=log_file, level=log_level)`
			`logger.info("[runner] run_scenario path=%s", scenario_path)`

first commit 2026-06-12 14:02:15 +08:00			`settings = settings or EvaluationSettings()`
			`if not settings.openai_api_key:`
			`raise EnvironmentError("OPENAI_API_KEY must be set before running the evaluator.")`

			`scenario = load_scenario(scenario_path)`
feat(logging): add structured evaluation logs for metric-level debugging - pipeline.py: log each metric score/timeout/error with sample_id, elapsed time, and score value; log NaN list per sample; progress counter N/total after each sample completes - evaluator.py: log eval start, dataset counts, adapter enrichment progress (per-sample OK/FAIL with elapsed), metric scoring summary, and per-metric NaN rate at end of run - runner.py: _setup_logging() helper writes to stderr + optional file; ragas/httpx/openai noisy loggers throttled to WARNING - main.py: add --log-file and --log-level CLI flags Usage: python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 10:48:41 +08:00			`logger.info("[runner] scenario loaded: name=%s mode=%s max_samples=%s",`
			`scenario.scenario_name, scenario.mode, scenario.runtime.max_samples)`

feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`# Build models once; reuse llm in both MetricPipeline and advisor.`
			`llm, embeddings = build_models(scenario.judge_model, scenario.embedding_model, settings)`

first commit 2026-06-12 14:02:15 +08:00			`adapter = build_adapter(scenario)`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`pipeline = build_metric_pipeline(scenario, settings, llm=llm, embeddings=embeddings)`
first commit 2026-06-12 14:02:15 +08:00			`evaluator = Evaluator(scenario=scenario, metric_pipeline=pipeline, app_adapter=adapter)`
			`result = evaluator.evaluate()`
			`write_run_artifacts(result)`
feat(logging): add structured evaluation logs for metric-level debugging - pipeline.py: log each metric score/timeout/error with sample_id, elapsed time, and score value; log NaN list per sample; progress counter N/total after each sample completes - evaluator.py: log eval start, dataset counts, adapter enrichment progress (per-sample OK/FAIL with elapsed), metric scoring summary, and per-metric NaN rate at end of run - runner.py: _setup_logging() helper writes to stderr + optional file; ragas/httpx/openai noisy loggers throttled to WARNING - main.py: add --log-file and --log-level CLI flags Usage: python main.py --scenario scenarios/online/... --log-file logs/eval.log --log-level DEBUG Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 10:48:41 +08:00			`logger.info("[runner] artifacts written for run_id=%s", result.run_id)`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00
			`# Optimization advisor — runs only if scenario.optimization_advisor is True.`
			`run_advisor(result, scenario, llm)`

first commit 2026-06-12 14:02:15 +08:00			`return result`