57 lines
1.9 KiB
Python
57 lines
1.9 KiB
Python
|
|
"""Load raw evaluation dataset records from disk.

Supports CSV and JSONL formats. Returns a list of plain dicts — normalization
into NormalizedSample is handled by normalizers.py.
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import csv
|
||
|
|
import json
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
|
||
|
|
def load_dataset_records(path: Path | str) -> list[dict[str, Any]]:
    """Read a dataset file from disk and return its raw records.

    The loader is chosen from the file extension, compared case-insensitively:
    ``.jsonl`` and ``.ndjson`` are read as JSON Lines, while ``.csv`` and any
    other extension are read as CSV. Records are returned as plain dicts with
    no further processing; CSV cells holding JSON-encoded lists stay strings
    and are left for the normalizers to parse.

    Args:
        path: Location of the dataset file, as a ``Path`` or a string.

    Returns:
        One dict per record, as produced by the selected loader.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing file.
    """
    source = Path(path)
    if source.is_file():
        extension = source.suffix.lower()
        # CSV serves both the ".csv" extension and every unrecognised one.
        reader = _load_jsonl if extension in {".jsonl", ".ndjson"} else _load_csv
        return reader(source)
    raise FileNotFoundError(f"Dataset file not found: {source}")
|
||
|
|
|
||
|
|
|
||
|
|
def _load_csv(path: Path) -> list[dict[str, Any]]:
    """Read a CSV file into a list of row dicts.

    The first row supplies the column names; each following row becomes one
    dict keyed by those names, with values exactly as ``csv.DictReader``
    yields them.

    Args:
        path: Location of the CSV file.

    Returns:
        One dict per data row, in file order. An empty file or a file that
        contains only a header row yields an empty list.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte-order
    # mark (written by Excel and similar tools); with plain "utf-8" the BOM
    # would stay attached to the first column name. newline="" hands line
    # ending handling to the csv module so newlines inside quoted fields
    # survive intact.
    with path.open(encoding="utf-8-sig", newline="") as fh:
        return [dict(row) for row in csv.DictReader(fh)]
|
||
|
|
|
||
|
|
|
||
|
|
def _load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSONL file into a list of record dicts.

    Every non-blank line must hold exactly one JSON object. Blank and
    whitespace-only lines are skipped.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        The decoded objects, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON, or if it decodes
            to something other than a JSON object. The message carries the
            1-based line number and the file path.
    """
    records: list[dict[str, Any]] = []
    # "utf-8-sig" reads plain UTF-8 unchanged but strips a leading byte-order
    # mark. str.strip() does not remove U+FEFF and json.loads rejects text
    # that starts with it, so a BOM would otherwise make line 1 fail.
    with path.open(encoding="utf-8-sig") as fh:
        for lineno, raw_line in enumerate(fh, 1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc
            if not isinstance(obj, dict):
                raise ValueError(
                    f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}"
                )
            records.append(obj)
    return records
|