Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers; services for run_reader, report_builder, scenario_scanner, task_manager (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area); report detail with metric cards, Chart.js distribution histogram, grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
2026-06-15 15:53:57 +08:00
parent 9cbdc1d95d
commit e89695e490
26 changed files with 2496 additions and 2 deletions
--- a/webapp/models.py
+++ b/webapp/models.py
@@ -0,0 +1,129 @@
+"""Pydantic request and response models for the evaluation console HTTP API."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class RunSummary(BaseModel):
+    """Compact per-run summary for list views; only run_id and scenario_name are required."""
+
+    run_id: str  # required, no default
+    scenario_name: str  # required, no default
+    mode: str = ""
+    judge_model: str = ""
+    embedding_model: str = ""
+    started_at: str = ""  # carried as a plain string; no timestamp format is enforced here
+    finished_at: str = ""  # carried as a plain string; no timestamp format is enforced here
+    dataset: str = ""
+    total_samples: int = 0
+    valid_samples: int = 0  # NOTE(review): presumably valid + invalid == total; not validated here
+    invalid_samples: int = 0
+    metrics: list[str] = Field(default_factory=list)
+    metric_means: dict[str, float | None] = Field(default_factory=dict)  # values may be None
+    output_path: str = ""
+
+
+class GroupStat(BaseModel):
+    """Mean metric values for one slice of samples that share a metadata field value."""
+
+    key: str  # required; presumably the shared metadata value for this slice — TODO confirm
+    count: int  # required; presumably the number of samples in the slice — TODO confirm
+    means: dict[str, float | None] = Field(default_factory=dict)  # values may be None
+
+
+class DistributionBin(BaseModel):
+    """One histogram bucket of sample counts for a single metric; all fields are required."""
+
+    label: str  # display text for the bucket
+    lower: float  # NOTE(review): bound inclusivity is not specified here — confirm with producer
+    upper: float  # NOTE(review): bound inclusivity is not specified here — confirm with producer
+    count: int
+
+
+class SampleScore(BaseModel):
+    """Per-sample row for the lowest-score review table; only sample_id is required."""
+
+    sample_id: str  # required, no default
+    question: str = ""
+    contexts: list[str] = Field(default_factory=list)
+    answer: str = ""
+    ground_truth: str = ""
+    language: str = ""
+    difficulty: str = ""
+    question_type: str = ""
+    metrics: dict[str, float | None] = Field(default_factory=dict)  # values may be None
+    mean_score: float | None = None  # presumably the mean over `metrics` — TODO confirm
+    error: str = ""  # presumably empty when the sample has no error — TODO confirm
+
+
+class ReportData(BaseModel):
+    """Aggregated report payload for the report detail page; every field has a default."""
+
+    metrics: list[str] = Field(default_factory=list)
+    metric_means: dict[str, float | None] = Field(default_factory=dict)  # values may be None
+    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)  # presumably keyed by metric — confirm
+    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)  # presumably keyed by metadata field — confirm
+    lowest_samples: list[SampleScore] = Field(default_factory=list)
+    summary_markdown: str = ""
+
+
+class RunDetail(BaseModel):
+    """Full payload for a single run: summary metadata plus the report (both required)."""
+
+    summary: RunSummary  # required, no default
+    report: ReportData  # required, no default
+
+
+class ScenarioInfo(BaseModel):
+    """One discoverable scenario YAML file that can be evaluated from the UI; only path is required."""
+
+    path: str  # required, no default
+    scenario_name: str = ""
+    mode: str = ""
+    dataset: str = ""
+    judge_model: str = ""
+    metrics: list[str] = Field(default_factory=list)
+    error: str = ""  # presumably set when the scenario file could not be read — TODO confirm
+
+
+class TaskStatus(BaseModel):
+    """State of one background evaluation task tracked by the task manager."""
+
+    task_id: str  # required, no default
+    scenario_path: str  # required, no default
+    status: str  # required; free-form string, allowed values are not constrained by this model
+    logs: list[str] = Field(default_factory=list)
+    run_id: str | None = None  # presumably filled in once the task yields a run — TODO confirm
+    error: str | None = None
+    created_at: str = ""  # carried as a plain string; no timestamp format is enforced here
+    finished_at: str = ""  # carried as a plain string; no timestamp format is enforced here
+
+
+class TriggerEvaluationRequest(BaseModel):
+    """Request body for launching an evaluation run from the UI."""
+
+    scenario_path: str  # required; NOTE(review): no path validation happens in this model
+
+
+class TriggerEvaluationResponse(BaseModel):
+    """Response returned immediately after queuing an evaluation task."""
+
+    task_id: str  # required; presumably the same id that TaskStatus.task_id reports — confirm
+
+
+def jsonable(value: Any) -> Any:
+    """Replace NaN/inf floats with None, recursing through dicts and lists."""
+    from math import isfinite
+
+    if isinstance(value, list):
+        return [jsonable(element) for element in value]
+    if isinstance(value, dict):
+        return {name: jsonable(entry) for name, entry in value.items()}
+    if not isinstance(value, float):
+        # Everything else (str, int, None, tuple, ...) passes through untouched.
+        return value
+    # NaN and +/-inf have no JSON representation, so they become None.
+    return value if isfinite(value) else None