siemens_ragas/rag_eval/datasets/loader.py

"""Load raw evaluation dataset records from disk.

Supports CSV and JSONL formats. Returns a list of plain dicts — normalization
into NormalizedSample is handled by normalizers.py.
"""

from __future__ import annotations

import csv
import json
from pathlib import Path
from typing import Any


def load_dataset_records(path: Path | str) -> list[dict[str, Any]]:
    """Read every raw record from the dataset file at *path*.

    The parser is picked from the file extension (compared case-insensitively):
    ``.jsonl`` and ``.ndjson`` are read as JSON Lines, while ``.csv`` and any
    other extension are read as CSV. Records are returned as plain dicts with
    their values untouched; CSV cells that hold JSON-encoded lists stay
    strings, and parsing them is left to the normalizers.

    Raises:
        FileNotFoundError: If *path* is not an existing regular file.
    """
    source = Path(path)
    if not source.is_file():
        raise FileNotFoundError(f"Dataset file not found: {source}")

    # CSV doubles as the handler for ".csv" and as the fallback for unknown
    # extensions, so only the JSON Lines suffixes need to be singled out.
    is_json_lines = source.suffix.lower() in {".jsonl", ".ndjson"}
    parse = _load_jsonl if is_json_lines else _load_csv
    return parse(source)


def _load_csv(path: Path) -> list[dict[str, Any]]:
    """Read a CSV file into a list of row dicts."""
    with path.open(encoding="utf-8", newline="") as fh:
        reader = csv.DictReader(fh)
        return [dict(row) for row in reader]


def _load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSONL file into a list of record dicts."""
    records: list[dict[str, Any]] = []
    with path.open(encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, 1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc
            if not isinstance(obj, dict):
                raise ValueError(f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}")
            records.append(obj)
    return records
Add RAGAS evaluation web console (FastAPI + vanilla JS) - webapp/: FastAPI backend with runs/scenarios/evaluations API routers; services for run_reader, report_builder, scenario_scanner, task_manager (lazy ragas import — server boots even without ragas); Pydantic models - webapp/static/: single-page console (layout A: left-nav + main area); report detail with metric cards, Chart.js distribution histogram, grouping table, lowest-score sample review; trigger evaluation + log polling - webmain.py: uvicorn entry point (alongside existing main.py CLI) - start.bat: Windows one-click launcher with env checks and auto-browser open - rag_eval/datasets/: implement missing loader + normalizer modules (load_dataset_records, normalize_records) required by evaluator - scripts/seed_sample_run.py: generate realistic demo run artifacts - .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com> 2026-06-15 15:53:57 +08:00			`"""Load raw evaluation dataset records from disk.`

			`Supports CSV and JSONL formats. Returns a list of plain dicts — normalization`
			`into NormalizedSample is handled by normalizers.py.`
			`"""`

			`from __future__ import annotations`

			`import csv`
			`import json`
			`from pathlib import Path`
			`from typing import Any`


			`def load_dataset_records(path: Path \| str) -> list[dict[str, Any]]:`
			`"""Load raw records from a CSV or JSONL file.`

			`Each row becomes a plain dict. Lists stored as JSON strings in CSV columns`
			`are left as-is; normalizers handle parsing.`
			`"""`
			`file_path = Path(path)`
			`if not file_path.is_file():`
			`raise FileNotFoundError(f"Dataset file not found: {file_path}")`

			`suffix = file_path.suffix.lower()`
			`if suffix in (".jsonl", ".ndjson"):`
			`return _load_jsonl(file_path)`
			`if suffix in (".csv",):`
			`return _load_csv(file_path)`
			`# Fall back to CSV for unknown extensions.`
			`return _load_csv(file_path)`


			`def _load_csv(path: Path) -> list[dict[str, Any]]:`
			`"""Read a CSV file into a list of row dicts."""`
			`with path.open(encoding="utf-8", newline="") as fh:`
			`reader = csv.DictReader(fh)`
			`return [dict(row) for row in reader]`


			`def _load_jsonl(path: Path) -> list[dict[str, Any]]:`
			`"""Read a JSONL file into a list of record dicts."""`
			`records: list[dict[str, Any]] = []`
			`with path.open(encoding="utf-8") as fh:`
			`for lineno, line in enumerate(fh, 1):`
			`line = line.strip()`
			`if not line:`
			`continue`
			`try:`
			`obj = json.loads(line)`
			`except json.JSONDecodeError as exc:`
			`raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc`
			`if not isinstance(obj, dict):`
			`raise ValueError(f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}")`
			`records.append(obj)`
			`return records`