"""Load raw evaluation dataset records from disk. Supports CSV and JSONL formats. Returns a list of plain dicts — normalization into NormalizedSample is handled by normalizers.py. """ from __future__ import annotations import csv import json from pathlib import Path from typing import Any def load_dataset_records(path: Path | str) -> list[dict[str, Any]]: """Load raw records from a CSV or JSONL file. Each row becomes a plain dict. Lists stored as JSON strings in CSV columns are left as-is; normalizers handle parsing. """ file_path = Path(path) if not file_path.is_file(): raise FileNotFoundError(f"Dataset file not found: {file_path}") suffix = file_path.suffix.lower() if suffix in (".jsonl", ".ndjson"): return _load_jsonl(file_path) if suffix in (".csv",): return _load_csv(file_path) # Fall back to CSV for unknown extensions. return _load_csv(file_path) def _load_csv(path: Path) -> list[dict[str, Any]]: """Read a CSV file into a list of row dicts.""" with path.open(encoding="utf-8", newline="") as fh: reader = csv.DictReader(fh) return [dict(row) for row in reader] def _load_jsonl(path: Path) -> list[dict[str, Any]]: """Read a JSONL file into a list of record dicts.""" records: list[dict[str, Any]] = [] with path.open(encoding="utf-8") as fh: for lineno, line in enumerate(fh, 1): line = line.strip() if not line: continue try: obj = json.loads(line) except json.JSONDecodeError as exc: raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc if not isinstance(obj, dict): raise ValueError(f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}") records.append(obj) return records