Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 15:53:57 +08:00
parent 9cbdc1d95d
commit e89695e490
26 changed files with 2496 additions and 2 deletions

129
webapp/models.py Normal file
View File

@@ -0,0 +1,129 @@
"""Pydantic response models for the evaluation console HTTP API."""
from __future__ import annotations
from typing import Any
from pydantic import BaseModel, Field
class RunSummary(BaseModel):
    """Compact description of a single evaluation run for list views.

    Only ``run_id`` and ``scenario_name`` are required. Every other field
    carries a default (empty string, zero, or an empty container), so the
    model validates when those values are omitted.
    """

    # Required identifiers of the run and the scenario it belongs to.
    run_id: str
    scenario_name: str
    # Descriptive metadata; each defaults to "" when not supplied.
    mode: str = ""
    judge_model: str = ""
    embedding_model: str = ""
    # Timestamps are kept as plain strings; this model enforces no format.
    started_at: str = ""
    finished_at: str = ""
    dataset: str = ""
    # Sample counters, all defaulting to 0.
    # NOTE(review): presumably total == valid + invalid — not enforced here.
    total_samples: int = 0
    valid_samples: int = 0
    invalid_samples: int = 0
    # Names of the metrics reported for this run (fresh list per instance).
    metrics: list[str] = Field(default_factory=list)
    # Metric name -> mean value; None is an allowed value
    # (presumably for a missing or non-finite mean — confirm with producer).
    metric_means: dict[str, float | None] = Field(default_factory=dict)
    output_path: str = ""
class GroupStat(BaseModel):
    """Mean metric values for one slice of samples grouped by a metadata field.

    ``key`` and ``count`` are required; ``means`` defaults to an empty dict.
    """

    # Value of the grouping field that identifies this slice.
    key: str
    # Number of samples in the slice.
    count: int
    # Metric name -> mean value for the slice; None is an allowed value.
    means: dict[str, float | None] = Field(default_factory=dict)
class DistributionBin(BaseModel):
    """One histogram bucket of sample counts for a single metric.

    All four fields are required; none has a default.
    """

    # Display label for the bucket.
    label: str
    # Bucket bounds. Whether each end is inclusive is decided by the code
    # that builds the bins, not by this model.
    lower: float
    upper: float
    # Number of samples that fall into the bucket.
    count: int
class SampleScore(BaseModel):
    """Per-sample row used for the lowest-score review table.

    Only ``sample_id`` is required; all other fields have defaults.
    """

    sample_id: str
    # Sample content; text fields default to "" and contexts to an empty list.
    question: str = ""
    contexts: list[str] = Field(default_factory=list)
    answer: str = ""
    ground_truth: str = ""
    # Sample metadata, stored as free-form strings.
    # NOTE(review): presumably the fields used for report groupings — confirm.
    language: str = ""
    difficulty: str = ""
    question_type: str = ""
    # Metric name -> score for this sample; None is an allowed value.
    metrics: dict[str, float | None] = Field(default_factory=dict)
    # Aggregate score for the sample; defaults to None.
    # NOTE(review): how it is derived from ``metrics`` is not visible here.
    mean_score: float | None = None
    # Error text associated with the sample; "" by default.
    error: str = ""
class ReportData(BaseModel):
    """Aggregated report payload rendered by the report detail page.

    Every field has a default, so ``ReportData()`` is a valid empty report.
    """

    # Names of the metrics covered by the report.
    metrics: list[str] = Field(default_factory=list)
    # Metric name -> mean value; None is an allowed value.
    metric_means: dict[str, float | None] = Field(default_factory=dict)
    # String key -> histogram buckets (presumably keyed by metric name).
    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)
    # String key -> per-group statistics (presumably keyed by grouping field).
    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
    # Sample rows for the lowest-score review; ordering is up to the producer.
    lowest_samples: list[SampleScore] = Field(default_factory=list)
    # Markdown text of the summary; "" by default.
    summary_markdown: str = ""
class RunDetail(BaseModel):
    """Full payload for a single run: summary metadata plus the report.

    Both nested models are required.
    """

    # List-view metadata for the run.
    summary: RunSummary
    # Aggregated report data for the same run.
    report: ReportData
class ScenarioInfo(BaseModel):
    """One discoverable scenario YAML file that can be evaluated from the UI.

    Only ``path`` is required; the remaining fields default to empty values.
    """

    # Location of the scenario file, as a string.
    path: str
    # Scenario properties; each defaults to "" (or an empty list for metrics).
    scenario_name: str = ""
    mode: str = ""
    dataset: str = ""
    judge_model: str = ""
    metrics: list[str] = Field(default_factory=list)
    # Error text; "" by default.
    # NOTE(review): presumably set when the file cannot be parsed — confirm
    # against the scenario scanner.
    error: str = ""
class TaskStatus(BaseModel):
    """State of a background evaluation task tracked by the task manager.

    ``task_id``, ``scenario_path`` and ``status`` are required.
    """

    task_id: str
    scenario_path: str
    # Free-form status string; the set of valid values is not constrained here.
    status: str
    # Log lines for the task (fresh list per instance).
    logs: list[str] = Field(default_factory=list)
    # Both default to None.
    # NOTE(review): presumably filled in once the task produces a run or
    # fails — confirm against the task manager.
    run_id: str | None = None
    error: str | None = None
    # Timestamps as plain strings; "" by default, no format enforced.
    created_at: str = ""
    finished_at: str = ""
class TriggerEvaluationRequest(BaseModel):
    """Request body for launching an evaluation run from the UI."""

    # Required: path of the scenario to evaluate, as a string.
    scenario_path: str
class TriggerEvaluationResponse(BaseModel):
    """Response returned immediately after queuing an evaluation task."""

    # Required: identifier of the queued task.
    task_id: str
def jsonable(value: Any) -> Any:
    """Recursively replace non-finite floats with None so the payload is valid JSON.

    Standard JSON has no representation for NaN or +/-infinity, so those
    values are mapped to ``None`` (serialized as ``null``).

    Args:
        value: Any value. Floats are checked directly; dicts, lists and
            tuples are walked recursively. Anything else is returned as-is.

    Returns:
        The sanitized value. Dicts come back as new dicts with the same
        keys. Lists and tuples both come back as new lists, since JSON
        serializes a tuple as an array anyway.
    """
    # Function-scope import so the helper needs no file-level import.
    import math

    if isinstance(value, float):
        # isfinite() is False for NaN, +inf and -inf alike.
        return value if math.isfinite(value) else None
    if isinstance(value, dict):
        return {key: jsonable(item) for key, item in value.items()}
    # Tuples must be walked too; otherwise a NaN nested inside one would
    # slip through and still produce invalid JSON.
    if isinstance(value, (list, tuple)):
        return [jsonable(item) for item in value]
    return value