# siemens_ragas/rag_eval/shared/models.py

"""Shared runtime data models exchanged across the evaluation pipeline."""

from __future__ import annotations

from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Literal


# Allowed values for ``Scenario.mode``.
Mode = Literal["offline", "online"]
# Allowed values for ``AppAdapterConfig.type``.
AdapterType = Literal["http", "python"]


def _serialize_paths(value: Any) -> Any:
    """Recursively replace ``Path`` objects in a payload with POSIX strings.

    Dicts, lists, and tuples are rebuilt with every contained value converted;
    dict keys are left untouched. Any other value is returned unchanged.

    Args:
        value: A ``Path``, a container (dict / list / tuple) that may hold
            ``Path`` objects at any depth, or an arbitrary other object.

    Returns:
        ``value.as_posix()`` for a ``Path``; a new container of the same kind
        (dict, list, or plain tuple) for containers; otherwise ``value``
        itself.
    """
    if isinstance(value, Path):
        return value.as_posix()
    if isinstance(value, dict):
        return {key: _serialize_paths(item) for key, item in value.items()}
    if isinstance(value, list):
        return [_serialize_paths(item) for item in value]
    if isinstance(value, tuple):
        # dataclasses.asdict() keeps tuples as tuples, so a Path stored inside
        # one would otherwise slip through unconverted.
        return tuple(_serialize_paths(item) for item in value)
    return value


@dataclass(slots=True)
class RuntimeConfig:
    """Concurrency and sampling controls for one evaluation run."""

    batch_size: int = 4
    app_concurrency: int | None = None
    metric_concurrency: int | None = None
    max_samples: int | None = None

    def metric_limit(self) -> int:
        """Return the effective metric-scoring concurrency limit."""
        return self.metric_concurrency or self.batch_size

    def app_limit(self) -> int:
        """Return the effective application-call concurrency limit."""
        return self.app_concurrency or self.batch_size


@dataclass(slots=True)
class AppAdapterConfig:
    """Resolved adapter configuration used by online scenarios.

    Only ``type`` is required; every other field has a default. This class
    performs no validation of its own.
    """

    # Adapter flavour: "http" or "python" (see AdapterType).
    type: AdapterType
    # NOTE(review): presumably the target URL for the "http" adapter — confirm.
    endpoint: str | None = None
    # NOTE(review): presumably the HTTP verb used with ``endpoint`` — confirm.
    method: str = "POST"
    # Timeout in seconds (per the field name); defaults to 30.
    timeout_seconds: int = 30
    # NOTE(review): presumably a reference to the Python entry point for the
    # "python" adapter; the expected string format is not visible here.
    callable: str | None = None
    # The three mappings below each default to a fresh empty dict per instance.
    request_template: dict[str, Any] = field(default_factory=dict)
    response_mapping: dict[str, str] = field(default_factory=dict)
    static_kwargs: dict[str, Any] = field(default_factory=dict)


@dataclass(slots=True)
class DatasetConfig:
    """Dataset location information for a scenario."""

    # File-system location of the dataset.
    path: Path
    # Optional format hint; ``None`` when not specified.
    # NOTE(review): accepted values are not defined in this module — confirm
    # against the dataset loader.
    format: str | None = None


@dataclass(slots=True)
class Scenario:
    """Fully resolved description of one evaluation scenario.

    Bundles the dataset location, model names, metric list, output directory,
    runtime limits, and (optionally) the application adapter settings.
    """

    scenario_name: str
    mode: Mode
    dataset: DatasetConfig
    judge_model: str
    embedding_model: str
    metrics: list[str]
    output_dir: Path
    runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
    app_adapter: AppAdapterConfig | None = None
    source_path: Path | None = None
    # Opt-in flag for the optimization advisor step; off by default.
    optimization_advisor: bool = False

    def snapshot(self) -> dict[str, Any]:
        """Produce a plain-dict copy of the scenario with paths stringified.

        Returns:
            The ``dataclasses.asdict`` form of this scenario (nested
            dataclasses become dicts) after ``Path`` values have been
            converted to POSIX strings.
        """
        payload = asdict(self)
        return _serialize_paths(payload)


@dataclass(slots=True)
class NormalizedSample:
    """Canonical sample shape used by adapters, metrics, and reporting."""

    sample_id: str
    question: str
    contexts: list[str]
    answer: str
    ground_truth: str
    scenario: str = ""
    language: str = ""
    retrieval_config: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)
    raw: dict[str, Any] = field(default_factory=dict)

    def to_record(self) -> dict[str, Any]:
        """Convert the sample into a flat record for CSV and artifact generation."""
        record = {
            "sample_id": self.sample_id,
            "question": self.question,
            "contexts": self.contexts,
            "answer": self.answer,
            "ground_truth": self.ground_truth,
            "scenario": self.scenario,
            "language": self.language,
            "retrieval_config": self.retrieval_config,
        }
        record.update(self.metadata)
        return record


@dataclass(slots=True)
class InvalidSample:
    """A dataset or adapter sample that could not be evaluated."""

    sample_id: str
    error: str
    raw: dict[str, Any]

    def to_record(self) -> dict[str, Any]:
        """Convert the invalid sample into a flat reporting row."""
        record = {"sample_id": self.sample_id, "error": self.error}
        record.update(self.raw)
        return record


@dataclass(slots=True)
class MetricScore:
    """Metric values and accumulated errors for one evaluated sample."""

    # Maps a metric name to its score; a value may be ``None``.
    # NOTE(review): ``None`` presumably marks a metric that could not be
    # computed — confirm against the metric pipeline.
    metrics: dict[str, float | None]
    # Error text for this sample; empty string by default.
    error: str = ""


@dataclass(slots=True)
class EvaluationResult:
    """Aggregate result object returned after a scenario completes.

    All fields are required; none has a default value.
    """

    # The scenario this result belongs to.
    scenario: Scenario
    run_id: str
    # Start/finish markers stored as strings.
    # NOTE(review): presumably timestamps; the exact format is not defined in
    # this module — confirm against the runner.
    started_at: str
    finished_at: str
    # Samples that were evaluated and samples that could not be.
    valid_samples: list[NormalizedSample]
    invalid_samples: list[InvalidSample]
    # One dict per scored row; the key schema is not defined in this module.
    score_rows: list[dict[str, Any]]


@dataclass(slots=True)
class RunArtifactPaths:
    """Canonical file-system paths for all artifacts produced by one run.

    Every path is required except ``advice_md``, which defaults to ``None``.
    """

    root_dir: Path
    scenario_snapshot: Path
    scores_csv: Path
    invalid_csv: Path
    summary_md: Path
    metadata_json: Path
    # Optional path; ``None`` unless one is supplied.
    # NOTE(review): presumably the optimization-advice markdown file written
    # when the advisor is enabled — confirm against the advisor writer.
    advice_md: Path | None = None
first commit 2026-06-12 14:02:15 +08:00			`"""Shared runtime data models exchanged across the evaluation pipeline."""`

			`from __future__ import annotations`

			`from dataclasses import asdict, dataclass, field`
			`from pathlib import Path`
			`from typing import Any, Literal`


			`Mode = Literal["offline", "online"]`
			`AdapterType = Literal["http", "python"]`


			`def _serialize_paths(value: Any) -> Any:`
			`"""Convert Path instances nested inside snapshot payloads into POSIX strings."""`
			`if isinstance(value, Path):`
			`return value.as_posix()`
			`if isinstance(value, dict):`
			`return {key: _serialize_paths(item) for key, item in value.items()}`
			`if isinstance(value, list):`
			`return [_serialize_paths(item) for item in value]`
			`return value`


			`@dataclass(slots=True)`
			`class RuntimeConfig:`
			`"""Concurrency and sampling controls for one evaluation run."""`

			`batch_size: int = 4`
			`app_concurrency: int \| None = None`
			`metric_concurrency: int \| None = None`
			`max_samples: int \| None = None`

			`def metric_limit(self) -> int:`
			`"""Return the effective metric-scoring concurrency limit."""`
			`return self.metric_concurrency or self.batch_size`

			`def app_limit(self) -> int:`
			`"""Return the effective application-call concurrency limit."""`
			`return self.app_concurrency or self.batch_size`


			`@dataclass(slots=True)`
			`class AppAdapterConfig:`
			`"""Resolved adapter configuration used by online scenarios."""`

			`type: AdapterType`
			`endpoint: str \| None = None`
			`method: str = "POST"`
			`timeout_seconds: int = 30`
			`callable: str \| None = None`
			`request_template: dict[str, Any] = field(default_factory=dict)`
			`response_mapping: dict[str, str] = field(default_factory=dict)`
			`static_kwargs: dict[str, Any] = field(default_factory=dict)`


			`@dataclass(slots=True)`
			`class DatasetConfig:`
			`"""Dataset location information for a scenario."""`

			`path: Path`
			`format: str \| None = None`


			`@dataclass(slots=True)`
			`class Scenario:`
			`"""Resolved evaluation scenario consumed by the execution pipeline."""`

			`scenario_name: str`
			`mode: Mode`
			`dataset: DatasetConfig`
			`judge_model: str`
			`embedding_model: str`
			`metrics: list[str]`
			`output_dir: Path`
			`runtime: RuntimeConfig = field(default_factory=RuntimeConfig)`
			`app_adapter: AppAdapterConfig \| None = None`
			`source_path: Path \| None = None`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`optimization_advisor: bool = False`
first commit 2026-06-12 14:02:15 +08:00
			`def snapshot(self) -> dict[str, Any]:`
			`"""Serialize the scenario into a reporting-friendly dictionary snapshot."""`
			`return _serialize_paths(asdict(self))`


			`@dataclass(slots=True)`
			`class NormalizedSample:`
			`"""Canonical sample shape used by adapters, metrics, and reporting."""`

			`sample_id: str`
			`question: str`
			`contexts: list[str]`
			`answer: str`
			`ground_truth: str`
			`scenario: str = ""`
			`language: str = ""`
			`retrieval_config: str = ""`
			`metadata: dict[str, Any] = field(default_factory=dict)`
			`raw: dict[str, Any] = field(default_factory=dict)`

			`def to_record(self) -> dict[str, Any]:`
			`"""Convert the sample into a flat record for CSV and artifact generation."""`
			`record = {`
			`"sample_id": self.sample_id,`
			`"question": self.question,`
			`"contexts": self.contexts,`
			`"answer": self.answer,`
			`"ground_truth": self.ground_truth,`
			`"scenario": self.scenario,`
			`"language": self.language,`
			`"retrieval_config": self.retrieval_config,`
			`}`
			`record.update(self.metadata)`
			`return record`


			`@dataclass(slots=True)`
			`class InvalidSample:`
			`"""A dataset or adapter sample that could not be evaluated."""`

			`sample_id: str`
			`error: str`
			`raw: dict[str, Any]`

			`def to_record(self) -> dict[str, Any]:`
			`"""Convert the invalid sample into a flat reporting row."""`
			`record = {"sample_id": self.sample_id, "error": self.error}`
			`record.update(self.raw)`
			`return record`


			`@dataclass(slots=True)`
			`class MetricScore:`
			`"""Metric values and accumulated errors for one evaluated sample."""`

			`metrics: dict[str, float \| None]`
			`error: str = ""`


			`@dataclass(slots=True)`
			`class EvaluationResult:`
			`"""Aggregate result object returned after a scenario completes."""`

			`scenario: Scenario`
			`run_id: str`
			`started_at: str`
			`finished_at: str`
			`valid_samples: list[NormalizedSample]`
			`invalid_samples: list[InvalidSample]`
			`score_rows: list[dict[str, Any]]`


			`@dataclass(slots=True)`
			`class RunArtifactPaths:`
			`"""Canonical file-system paths for all artifacts produced by one run."""`

			`root_dir: Path`
			`scenario_snapshot: Path`
			`scores_csv: Path`
			`invalid_csv: Path`
			`summary_md: Path`
			`metadata_json: Path`
feat(advisor): add optimization advisor module - rag_eval/advisor/: new package with rules engine, LLM analyzer, writer - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples) - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback - writer.py: writes optimization_advice.md + log summary - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False) - Scenario.optimization_advisor: new bool field (default False) - ScenarioModel: same field added, loader.py透传 - RunArtifactPaths.advice_md: new path field - factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings - runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end - siemens online YAML: optimization_advisor: true enabled - tests: 9 rules tests + 6 writer tests, all pass - docs: advisor section added to engine-flow.md and architecture.md Co-Authored-By: Claude <noreply@anthropic.com> 2026-06-16 17:06:19 +08:00			`advice_md: Path \| None = None`