Add RAGAS evaluation web console (FastAPI + vanilla JS)
- webapp/: FastAPI backend with runs/scenarios/evaluations API routers; services for run_reader, report_builder, scenario_scanner, task_manager (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area); report detail with metric cards, Chart.js distribution histogram, grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
This commit is contained in:
129
webapp/models.py
Normal file
129
webapp/models.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""Pydantic response models for the evaluation console HTTP API."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class RunSummary(BaseModel):
    """Headline facts about one evaluation run, sized for list views.

    Only ``run_id`` and ``scenario_name`` are required; every other field
    falls back to an empty string, zero, or an empty container.
    """

    # --- identity -----------------------------------------------------
    run_id: str
    scenario_name: str

    # --- run configuration --------------------------------------------
    mode: str = ""
    judge_model: str = ""
    embedding_model: str = ""

    # --- timing (plain strings; the format is chosen by the producer) --
    started_at: str = ""
    finished_at: str = ""

    # --- dataset and sample accounting --------------------------------
    dataset: str = ""
    total_samples: int = 0
    valid_samples: int = 0
    invalid_samples: int = 0

    # --- metrics: names, plus a per-metric mean that may be None -------
    metrics: list[str] = Field(default_factory=list)
    metric_means: dict[str, float | None] = Field(default_factory=dict)

    # Where the run's output lives; presumably a filesystem path — confirm.
    output_path: str = ""
|
||||
|
||||
|
||||
class GroupStat(BaseModel):
    """Per-group metric averages for samples sharing one metadata value.

    ``key`` and ``count`` are required; ``means`` defaults to an empty dict.
    """

    # The metadata value that identifies this group.
    key: str
    # Number of samples in the group.
    count: int
    # Metric name -> mean for this group; a mean may be None.
    means: dict[str, float | None] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class DistributionBin(BaseModel):
    """A single histogram bucket for one metric's score distribution.

    All four fields are required.
    """

    # Display text for the bucket.
    label: str
    # Bucket bounds; which ends are inclusive is decided by whoever builds
    # the bins, not by this model.
    lower: float
    upper: float
    # Number of samples that fall into the bucket.
    count: int
|
||||
|
||||
|
||||
class SampleScore(BaseModel):
    """One sample's content and scores, as shown in the lowest-score table.

    Only ``sample_id`` is required; text fields default to ``""``,
    containers to empty, and ``mean_score`` to ``None``.
    """

    sample_id: str

    # --- sample content -----------------------------------------------
    question: str = ""
    contexts: list[str] = Field(default_factory=list)
    answer: str = ""
    ground_truth: str = ""

    # --- sample metadata ----------------------------------------------
    language: str = ""
    difficulty: str = ""
    question_type: str = ""

    # --- scores ---------------------------------------------------------
    # Metric name -> score for this sample; a score may be None.
    metrics: dict[str, float | None] = Field(default_factory=dict)
    # Presumably the average over this sample's metric scores — confirm
    # against the report builder.
    mean_score: float | None = None

    # Error text attached to the sample; empty string by default.
    error: str = ""
|
||||
|
||||
|
||||
class ReportData(BaseModel):
    """Aggregated numbers and text that back the report detail page.

    Every field is optional and defaults to an empty container or string.
    """

    # Metric names, and the mean for each (a mean may be None).
    metrics: list[str] = Field(default_factory=list)
    metric_means: dict[str, float | None] = Field(default_factory=dict)

    # Histogram buckets, keyed by string (presumably the metric name — confirm).
    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)
    # Group statistics, keyed by string (presumably the grouping field — confirm).
    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)

    # Rows for the lowest-score sample review.
    lowest_samples: list[SampleScore] = Field(default_factory=list)

    # Summary text in Markdown.
    summary_markdown: str = ""
|
||||
|
||||
|
||||
class RunDetail(BaseModel):
    """Everything known about one run: its summary paired with its report.

    Both fields are required.
    """

    # Run metadata, the same shape used in list views.
    summary: RunSummary
    # Aggregated report payload for the run.
    report: ReportData
|
||||
|
||||
|
||||
class ScenarioInfo(BaseModel):
    """A scenario YAML file that was discovered and can be run from the UI.

    Only ``path`` is required; the remaining fields default to empty values.
    """

    # Location of the scenario file.
    path: str

    # --- values describing the scenario --------------------------------
    scenario_name: str = ""
    mode: str = ""
    dataset: str = ""
    judge_model: str = ""
    metrics: list[str] = Field(default_factory=list)

    # Error text for this scenario; presumably set when the file could not
    # be read or parsed — confirm against the scenario scanner.
    error: str = ""
|
||||
|
||||
|
||||
class TaskStatus(BaseModel):
    """Snapshot of one background evaluation task held by the task manager.

    ``task_id``, ``scenario_path`` and ``status`` are required.
    """

    task_id: str
    scenario_path: str
    # Free-form state string; the set of allowed values is not defined here.
    status: str

    # Log lines collected for the task.
    logs: list[str] = Field(default_factory=list)

    # Both default to None; presumably filled in on success / failure
    # respectively — confirm against the task manager.
    run_id: str | None = None
    error: str | None = None

    # Timestamps as plain strings, empty by default.
    created_at: str = ""
    finished_at: str = ""
|
||||
|
||||
|
||||
class TriggerEvaluationRequest(BaseModel):
    """Body the UI sends to start an evaluation run."""

    # Path of the scenario to evaluate (required).
    scenario_path: str
|
||||
|
||||
|
||||
class TriggerEvaluationResponse(BaseModel):
    """Reply sent as soon as an evaluation task has been queued."""

    # Identifier of the queued task (required).
    task_id: str
|
||||
|
||||
|
||||
def jsonable(value: Any) -> Any:
    """Return *value* with every non-finite float replaced by ``None``.

    JSON cannot represent NaN or +/-infinity, so those floats are mapped to
    ``None`` to keep the serialized payload valid.

    Dicts, lists and tuples are rebuilt recursively. Dict keys are left
    untouched; only values are sanitized. Tuples come back as plain tuples
    (a namedtuple loses its subclass). Anything else is returned unchanged.

    Args:
        value: Arbitrary, possibly nested, data destined for a JSON response.

    Returns:
        A sanitized structure of the same shape as *value*.
    """
    # Function-scope import keeps this helper self-contained.
    import math

    if isinstance(value, float):
        # isfinite() is False for NaN, +inf and -inf alike.
        return value if math.isfinite(value) else None
    if isinstance(value, dict):
        return {key: jsonable(item) for key, item in value.items()}
    if isinstance(value, list):
        return [jsonable(item) for item in value]
    if isinstance(value, tuple):
        # Tuples serialize as JSON arrays too, so a NaN hidden inside one
        # would otherwise slip through unsanitized.
        return tuple(jsonable(item) for item in value)
    return value
|
||||
Reference in New Issue
Block a user