Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers; services for run_reader, report_builder, scenario_scanner, task_manager (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area); report detail with metric cards, Chart.js distribution histogram, grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
2026-06-15 15:53:57 +08:00
parent 9cbdc1d95d
commit e89695e490
26 changed files with 2496 additions and 2 deletions
--- a/rag_eval/datasets/__init__.py
+++ b/rag_eval/datasets/__init__.py
@@ -0,0 +1 @@
+"""Dataset loading and normalization for the RAG evaluation platform."""
--- a/rag_eval/datasets/loader.py
+++ b/rag_eval/datasets/loader.py
@@ -0,0 +1,56 @@
+"""Load raw evaluation dataset records from disk.
+
+Supports CSV and JSONL formats. Returns a list of plain dicts — normalization
+into NormalizedSample is handled by normalizers.py.
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+from pathlib import Path
+from typing import Any
+
+
+def load_dataset_records(path: Path | str) -> list[dict[str, Any]]:
+    """Read every raw record from a dataset file on disk.
+
+    Files whose extension is ``.jsonl`` or ``.ndjson`` (compared
+    case-insensitively) are parsed as JSON Lines. Every other extension,
+    ``.csv`` included, is parsed as CSV. Rows come back as plain dicts and no
+    cell values are converted here.
+
+    Raises:
+        FileNotFoundError: If ``path`` does not refer to an existing file.
+    """
+    source = Path(path)
+    if not source.is_file():
+        raise FileNotFoundError(f"Dataset file not found: {source}")
+
+    # CSV doubles as the fallback for unrecognised extensions, so the only
+    # real decision is whether the file is JSON Lines.
+    is_json_lines = source.suffix.lower() in {".jsonl", ".ndjson"}
+    read_records = _load_jsonl if is_json_lines else _load_csv
+    return read_records(source)
+
+
+def _load_csv(path: Path) -> list[dict[str, Any]]:
+    """Read a CSV file into a list of row dicts.
+
+    The first row supplies the keys (``csv.DictReader`` semantics) and every
+    following row becomes one plain ``dict``.
+
+    The file is decoded as ``utf-8-sig`` rather than ``utf-8`` so that a
+    leading byte-order mark is dropped. Spreadsheet tools commonly write one,
+    and with plain ``utf-8`` it would be glued onto the first column name,
+    making that column impossible to find by name. Files without a BOM
+    decode exactly as before.
+    """
+    # newline="" lets the csv module handle embedded line breaks itself.
+    with path.open(encoding="utf-8-sig", newline="") as fh:
+        return [dict(row) for row in csv.DictReader(fh)]
+
+
+def _load_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Read a JSONL file into a list of record dicts.
+
+    Each non-blank line must hold one JSON object; blank lines are skipped.
+
+    The file is decoded as ``utf-8-sig`` rather than ``utf-8`` so that a
+    leading byte-order mark is dropped. ``str.strip()`` does not remove a
+    BOM, so with plain ``utf-8`` the first line of a BOM-prefixed file would
+    be rejected as invalid JSON. Files without a BOM decode exactly as before.
+
+    Raises:
+        ValueError: If a non-blank line is not valid JSON, or is valid JSON
+            but not an object. The message carries the 1-based line number.
+    """
+    records: list[dict[str, Any]] = []
+    with path.open(encoding="utf-8-sig") as fh:
+        for lineno, line in enumerate(fh, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc
+            # Downstream code treats every record as a mapping, so arrays and
+            # scalars are rejected here rather than later.
+            if not isinstance(obj, dict):
+                raise ValueError(f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}")
+            records.append(obj)
+    return records
--- a/rag_eval/datasets/normalizers.py
+++ b/rag_eval/datasets/normalizers.py
@@ -0,0 +1,105 @@
+"""Normalize raw dataset records into NormalizedSample and InvalidSample objects.
+
+Handles both offline mode (records already contain answer + contexts) and online
+mode (records only contain question + ground_truth; adapter fills the rest).
+"""
+
+from __future__ import annotations
+
+import uuid
+from typing import Any
+
+from rag_eval.shared.models import InvalidSample, NormalizedSample
+from rag_eval.shared.utils import parse_contexts
+
+# Record keys that map onto dedicated NormalizedSample fields. When the
+# leftover columns of a raw record are gathered into ``metadata``, every key
+# listed here is excluded so the same value is not stored twice.
+_CORE_FIELDS = {
+    "sample_id",
+    "question",
+    "contexts",
+    "answer",
+    "ground_truth",
+    "scenario",
+    "language",
+    "retrieval_config",
+}
+
+
+def _get_str(record: dict[str, Any], key: str, default: str = "") -> str:
+    """Look up ``key`` in ``record`` and return it as whitespace-stripped text.
+
+    ``default`` is returned when the key is absent, when its value is
+    ``None``, or when the stripped text equals ``"nan"`` in any letter case.
+    Any other value, including one that strips down to an empty string, is
+    returned as its stripped ``str()`` form.
+    """
+    found = record.get(key)
+    if found is not None:
+        cleaned = str(found).strip()
+        if cleaned.lower() != "nan":
+            return cleaned
+    return default
+
+
+def normalize_records(
+    records: list[dict[str, Any]],
+    mode: str = "offline",
+    max_samples: int | None = None,
+) -> tuple[list[NormalizedSample], list[InvalidSample]]:
+    """Split raw record dicts into usable samples and rejected ones.
+
+    Only the first ``max_samples`` records are considered when that argument
+    is given. A record without a question is always rejected. When ``mode``
+    is ``"offline"`` a record is also rejected if ground_truth, answer or
+    contexts is missing, and all missing items are reported together in one
+    error string. Any other ``mode`` value skips that second check, so those
+    fields may be empty on the returned samples.
+
+    Returns:
+        A ``(valid, invalid)`` pair, each list in input order.
+    """
+    selected = records if max_samples is None else records[:max_samples]
+    require_complete = mode == "offline"
+
+    accepted: list[NormalizedSample] = []
+    rejected: list[InvalidSample] = []
+
+    for record in selected:
+        # Records that carry no id of their own get a random 12-hex-char one.
+        sample_id = _get_str(record, "sample_id") or uuid.uuid4().hex[:12]
+
+        question = _get_str(record, "question")
+        if not question:
+            rejected.append(
+                InvalidSample(
+                    sample_id=sample_id,
+                    error="missing required field: question",
+                    raw=record,
+                )
+            )
+            continue
+
+        ground_truth = _get_str(record, "ground_truth")
+        contexts = parse_contexts(record.get("contexts"))
+        answer = _get_str(record, "answer")
+
+        if require_complete:
+            # Pair each required value with the complaint raised when it is
+            # empty; the order here fixes the order within the error string.
+            requirements = (
+                (ground_truth, "missing ground_truth"),
+                (answer, "missing answer"),
+                (contexts, "missing or empty contexts"),
+            )
+            problems = [
+                complaint for present, complaint in requirements if not present
+            ]
+            if problems:
+                rejected.append(
+                    InvalidSample(
+                        sample_id=sample_id,
+                        error="; ".join(problems),
+                        raw=record,
+                    )
+                )
+                continue
+
+        # Whatever is not a core field travels along as opaque metadata.
+        extras = {
+            name: cell
+            for name, cell in record.items()
+            if name not in _CORE_FIELDS
+        }
+
+        sample = NormalizedSample(
+            sample_id=sample_id,
+            question=question,
+            contexts=contexts,
+            answer=answer,
+            ground_truth=ground_truth,
+            scenario=_get_str(record, "scenario"),
+            language=_get_str(record, "language"),
+            retrieval_config=_get_str(record, "retrieval_config"),
+            metadata=extras,
+            raw=record,
+        )
+        accepted.append(sample)
+
+    return accepted, rejected
				`@@ -0,0 +1 @@`
				`"""Dataset loading and normalization for the RAG evaluation platform."""`