130 lines
3.4 KiB
Python
130 lines
3.4 KiB
Python
|
|
"""Pydantic response models for the evaluation console HTTP API."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
from pydantic import BaseModel, Field
|
||
|
|
|
||
|
|
|
||
|
|
class RunSummary(BaseModel):
    """Compact description of a single evaluation run for list views.

    Only ``run_id`` and ``scenario_name`` are required. Every other field
    carries a default (empty string, zero, or an empty container), so the
    model validates when just those two values are supplied.
    """

    # Required identifiers.
    run_id: str
    scenario_name: str
    # Optional descriptive strings; each defaults to "" when not supplied.
    mode: str = ""
    judge_model: str = ""
    embedding_model: str = ""
    # Timestamps are carried as plain strings; no format is enforced here.
    started_at: str = ""
    finished_at: str = ""
    dataset: str = ""
    # Sample counters; all default to 0.
    # NOTE(review): presumably total == valid + invalid, but nothing in this
    # model checks that -- confirm against the code that builds it.
    total_samples: int = 0
    valid_samples: int = 0
    invalid_samples: int = 0
    # A fresh empty list/dict is created per instance via default_factory.
    metrics: list[str] = Field(default_factory=list)
    # Values may be None (presumably keyed by metric name -- TODO confirm).
    metric_means: dict[str, float | None] = Field(default_factory=dict)
    output_path: str = ""
|
||
|
|
|
||
|
|
|
||
|
|
class GroupStat(BaseModel):
    """Mean metric values for one slice of samples grouped by a metadata field.

    ``key`` and ``count`` are required; ``means`` defaults to an empty dict.
    """

    # Label identifying this group (presumably the shared metadata value).
    key: str
    # Size of the group; no lower bound is enforced by the model.
    count: int
    # Values may be None (presumably keyed by metric name -- TODO confirm).
    means: dict[str, float | None] = Field(default_factory=dict)
|
||
|
|
|
||
|
|
|
||
|
|
class DistributionBin(BaseModel):
    """One histogram bucket of sample counts for a single metric.

    All four fields are required; none has a default.
    """

    # Display text for the bucket.
    label: str
    # Bucket bounds. Whether each bound is inclusive or exclusive is not
    # specified by this model -- check the code that produces the bins.
    lower: float
    upper: float
    # Number of samples counted in this bucket.
    count: int
|
||
|
|
|
||
|
|
|
||
|
|
class SampleScore(BaseModel):
    """Per-sample row used for the lowest-score review table.

    Only ``sample_id`` is required. Text fields default to the empty string,
    containers to empty, and ``mean_score`` to ``None``.
    """

    # Required identifier of the sample.
    sample_id: str
    # Sample content; each piece is optional and defaults to empty.
    question: str = ""
    contexts: list[str] = Field(default_factory=list)
    answer: str = ""
    ground_truth: str = ""
    # Free-form metadata strings; no set of allowed values is enforced here.
    language: str = ""
    difficulty: str = ""
    question_type: str = ""
    # Values may be None (presumably keyed by metric name -- TODO confirm).
    metrics: dict[str, float | None] = Field(default_factory=dict)
    # None by default; the model does not compute it from ``metrics``.
    mean_score: float | None = None
    # Defaults to "" (note: TaskStatus.error uses None for "no error" instead).
    error: str = ""
|
||
|
|
|
||
|
|
|
||
|
|
class ReportData(BaseModel):
    """Aggregated report payload rendered by the report detail page.

    Every field has a default, so ``ReportData()`` is a valid empty report.
    """

    metrics: list[str] = Field(default_factory=list)
    # Values may be None (presumably keyed by metric name -- TODO confirm).
    metric_means: dict[str, float | None] = Field(default_factory=dict)
    # Maps a string key to its histogram buckets (key presumably a metric name).
    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)
    # Maps a string key to its per-group statistics (key presumably the
    # metadata field that was grouped on -- TODO confirm).
    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
    # Rows for the review table; ordering is decided by the producer, not here.
    lowest_samples: list[SampleScore] = Field(default_factory=list)
    # Free-form text; the name suggests Markdown, nothing here validates it.
    summary_markdown: str = ""
|
||
|
|
|
||
|
|
|
||
|
|
class RunDetail(BaseModel):
    """Full payload for a single run: summary metadata plus the report.

    Both fields are required nested models.
    """

    # Run metadata (see RunSummary).
    summary: RunSummary
    # Aggregated results for the same run (see ReportData).
    report: ReportData
|
||
|
|
|
||
|
|
|
||
|
|
class ScenarioInfo(BaseModel):
    """One discoverable scenario YAML file that can be evaluated from the UI.

    Only ``path`` is required; the remaining fields default to empty values.
    """

    # Location of the scenario file, kept as a plain string.
    path: str
    # Descriptive fields; each defaults to "" / [] when not supplied.
    scenario_name: str = ""
    mode: str = ""
    dataset: str = ""
    judge_model: str = ""
    metrics: list[str] = Field(default_factory=list)
    # Defaults to ""; presumably set when the file could not be loaded or
    # parsed -- TODO confirm against the discovery code.
    error: str = ""
|
||
|
|
|
||
|
|
|
||
|
|
class TaskStatus(BaseModel):
    """State of a background evaluation task tracked by the task manager.

    ``task_id``, ``scenario_path`` and ``status`` are required; everything
    else has a default.
    """

    task_id: str
    scenario_path: str
    # Free-form string; the set of valid states is not constrained here.
    status: str
    # Log lines; a fresh empty list is created per instance.
    logs: list[str] = Field(default_factory=list)
    # None by default (presumably filled in once the task has produced a
    # run -- TODO confirm).
    run_id: str | None = None
    # None by default, unlike SampleScore/ScenarioInfo which default to "".
    error: str | None = None
    # Timestamps are carried as plain strings; no format is enforced here.
    created_at: str = ""
    finished_at: str = ""
|
||
|
|
|
||
|
|
|
||
|
|
class TriggerEvaluationRequest(BaseModel):
    """Request body for launching an evaluation run from the UI.

    The single field is required.
    """

    # Scenario to evaluate, as a plain string. This model performs no path
    # validation or sanitization; that must happen in the request handler.
    scenario_path: str
|
||
|
|
|
||
|
|
|
||
|
|
class TriggerEvaluationResponse(BaseModel):
    """Response returned immediately after queuing an evaluation task.

    The single field is required.
    """

    # Identifier of the queued task (presumably the same value exposed as
    # TaskStatus.task_id -- TODO confirm).
    task_id: str
|
||
|
|
|
||
|
|
|
||
|
|
def jsonable(value: Any) -> Any:
    """Recursively replace NaN/inf floats with None so the payload stays valid JSON.

    Strict JSON has no literal for ``NaN`` or ``±Infinity``, so any such float
    found in ``value`` is mapped to ``None`` (serialized as ``null``).

    Args:
        value: Arbitrary data, typically nested dicts, lists and tuples of
            scalars.

    Returns:
        ``None`` when ``value`` is a non-finite float. For a dict, list or
        tuple, a new plain ``dict`` / ``list`` / ``tuple`` whose values have
        been sanitized recursively (dict keys are left untouched). Any other
        object is returned unchanged.
    """
    import math

    if isinstance(value, float):
        return value if math.isfinite(value) else None
    if isinstance(value, dict):
        return {key: jsonable(item) for key, item in value.items()}
    if isinstance(value, list):
        return [jsonable(item) for item in value]
    if isinstance(value, tuple):
        # JSON encoders emit tuples as arrays, so a NaN hidden inside one
        # would otherwise still reach the payload.
        return tuple(jsonable(item) for item in value)
    return value
|