- llm_analyzer.py: use llm.langchain_llm.ainvoke() (correct RAGAS 0.4.3 API)
- webapp/models.py: add advice_markdown field to ReportData
- webapp/services/run_reader.py: add read_advice_markdown() reading optimization_advice.md
- webapp/services/report_builder.py: pass advice_markdown into ReportData
- .env.example: OPENAI_TIMEOUT_SECONDS 30→180, RAGAS_METRIC_TIMEOUT_SECONDS 45→300

Co-Authored-By: Claude <noreply@anthropic.com>
175 lines
4.6 KiB
Python
175 lines
4.6 KiB
Python
"""Pydantic response models for the evaluation console HTTP API."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime, timezone
|
|
from typing import Any
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
def _utcnow_iso() -> str:
    """Return the current time in UTC as an ISO 8601 string.

    The timestamp is timezone-aware, so the result carries an explicit
    ``+00:00`` offset.
    """
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
|
|
|
|
|
|
class RunSummary(BaseModel):
    """Summary record describing one evaluation run, intended for list views.

    Only ``run_id`` and ``scenario_name`` are required; every other field
    falls back to an empty string, zero, or an empty container.
    """

    # Identity of the run.
    run_id: str
    scenario_name: str

    # Run configuration.
    mode: str = ""
    judge_model: str = ""
    embedding_model: str = ""

    # Timestamps are kept as plain strings; this model enforces no format.
    started_at: str = ""
    finished_at: str = ""

    # Dataset reference and sample counts.
    dataset: str = ""
    total_samples: int = 0
    valid_samples: int = 0
    invalid_samples: int = 0

    # Metric names and the mean per metric; a mean may be None.
    metrics: list[str] = Field(default_factory=list)
    metric_means: dict[str, float | None] = Field(default_factory=dict)

    output_path: str = ""
|
|
|
|
|
|
class GroupStat(BaseModel):
    """Per-group metric means for samples sharing one metadata value."""

    # Value of the grouping field that identifies this slice.
    key: str
    # Number of samples in the slice.
    count: int
    # Metric name -> mean for the slice; a mean may be None.
    means: dict[str, float | None] = Field(default_factory=dict)
|
|
|
|
|
|
class DistributionBin(BaseModel):
    """A single histogram bucket for one metric's sample counts.

    All four fields are required.
    """

    # Display label for the bucket.
    label: str
    # Bucket bounds; NOTE(review): inclusive/exclusive edges are decided by
    # the producer, not by this model.
    lower: float
    upper: float
    # Number of samples that fall into the bucket.
    count: int
|
|
|
|
|
|
class SampleScore(BaseModel):
    """One sample's row in the lowest-score review table.

    Only ``sample_id`` is required; text fields default to ``""`` and
    containers default to empty.
    """

    sample_id: str

    # Sample content.
    question: str = ""
    contexts: list[str] = Field(default_factory=list)
    answer: str = ""
    ground_truth: str = ""

    # Sample metadata.
    language: str = ""
    difficulty: str = ""
    question_type: str = ""

    # Metric name -> score for this sample; a score may be None.
    metrics: dict[str, float | None] = Field(default_factory=dict)
    # Aggregate score for the sample; None when unavailable.
    mean_score: float | None = None

    # Error text attached to the sample; empty by default.
    error: str = ""
|
|
|
|
|
|
class ReportData(BaseModel):
    """Aggregated report content consumed by the report detail page.

    Every field is optional and defaults to an empty container or string.
    """

    # Metric names and their overall means; a mean may be None.
    metrics: list[str] = Field(default_factory=list)
    metric_means: dict[str, float | None] = Field(default_factory=dict)

    # Lists of histogram bins and group statistics, each keyed by a string.
    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)
    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)

    # Rows for the lowest-score review table.
    lowest_samples: list[SampleScore] = Field(default_factory=list)

    # Markdown documents attached to the report.
    summary_markdown: str = ""
    # Content of optimization_advice.md; empty when it was not generated.
    advice_markdown: str = ""
|
|
|
|
|
|
class RunDetail(BaseModel):
    """Complete payload for one run: its summary plus the aggregated report.

    Both parts are required.
    """

    # Run metadata, the same shape used in list views.
    summary: RunSummary
    # Aggregated report content for the run.
    report: ReportData
|
|
|
|
|
|
class ScenarioInfo(BaseModel):
    """A discoverable scenario YAML file that the UI can offer for evaluation.

    Only ``path`` is required; the remaining fields default to empty values.
    """

    # Location of the scenario file.
    path: str

    # Descriptive fields for the scenario.
    scenario_name: str = ""
    mode: str = ""
    dataset: str = ""
    judge_model: str = ""
    metrics: list[str] = Field(default_factory=list)

    # Error text for this scenario; empty by default.
    error: str = ""
|
|
|
|
|
|
class TaskStatus(BaseModel):
    """Snapshot of a background evaluation task held by the task manager."""

    # Required identity and state.
    task_id: str
    scenario_path: str
    # NOTE(review): free-form string; allowed values are not constrained here.
    status: str

    # Log lines collected for the task.
    logs: list[str] = Field(default_factory=list)

    # Optional run id and error text; both default to None.
    run_id: str | None = None
    error: str | None = None

    # Timestamps as plain strings; both default to "".
    created_at: str = ""
    finished_at: str = ""
|
|
|
|
|
|
class TriggerEvaluationRequest(BaseModel):
    """Body of the request that starts an evaluation run from the UI."""

    # Path of the scenario to evaluate (required).
    scenario_path: str
|
|
|
|
|
|
class TriggerEvaluationResponse(BaseModel):
    """Reply sent as soon as an evaluation task has been queued."""

    # Identifier of the queued task (required).
    task_id: str
|
|
|
|
|
|
class LLMProfile(BaseModel):
    """Named, reusable LLM connection configuration.

    The connection fields are required. ``timeout_seconds`` defaults to 30,
    and both timestamps default to the UTC time at instantiation.
    """

    # Identity.
    profile_id: str
    name: str

    # Connection settings.
    model: str
    base_url: str
    # NOTE(review): plain string, so it is included whenever the model is
    # serialized; confirm callers redact it before returning profiles.
    api_key: str
    timeout_seconds: int = 30

    # ISO 8601 UTC timestamps produced by _utcnow_iso when not supplied.
    created_at: str = Field(default_factory=_utcnow_iso)
    updated_at: str = Field(default_factory=_utcnow_iso)
|
|
|
|
|
|
class CreateProfileRequest(BaseModel):
    """Body of the request that creates or updates an LLM profile.

    Has the same connection fields as ``LLMProfile`` but no id or timestamps.
    """

    name: str
    model: str
    base_url: str
    api_key: str
    # Defaults to 30; no range constraint is applied by this model.
    timeout_seconds: int = 30
|
|
|
|
|
|
class ProfileApplyRequest(BaseModel):
    """Body of the request that patches LLM profile choices into a scenario YAML."""

    # Scenario file to patch (required).
    scenario_path: str

    # Profile ids for each role; all three default to None.
    # NOTE(review): presumably None leaves that role untouched; confirm
    # against the handler.
    judge_profile_id: str | None = None
    answer_profile_id: str | None = None
    dataset_profile_id: str | None = None
|
|
|
|
|
|
class ProfileApplyResponse(BaseModel):
    """Reply returned once a scenario YAML has been patched with profile settings."""

    # Scenario file that was patched (required).
    scenario_path: str
    # Names of the patched fields; empty by default.
    patched_fields: list[str] = Field(default_factory=list)
|
|
|
|
|
|
def jsonable(value: Any) -> Any:
    """Replace non-finite floats with None so the payload stays valid JSON.

    Recurses through dicts (values only), lists, and tuples. Any other value,
    including finite floats, is returned unchanged.

    Args:
        value: Arbitrary payload, possibly nested.

    Returns:
        A payload of the same shape with every NaN, +inf, and -inf float
        replaced by ``None``. Dicts and lists are rebuilt as new objects;
        tuples keep their tuple (or namedtuple) type.
    """
    import math

    if isinstance(value, float):
        return value if math.isfinite(value) else None
    if isinstance(value, dict):
        return {key: jsonable(item) for key, item in value.items()}
    if isinstance(value, list):
        return [jsonable(item) for item in value]
    if isinstance(value, tuple):
        # Tuples serialize as JSON arrays, so NaN/inf inside them must be
        # cleaned as well.
        cleaned = [jsonable(item) for item in value]
        # Namedtuples take their fields positionally; plain tuples take one
        # iterable.
        if hasattr(value, "_fields"):
            return type(value)(*cleaned)
        return tuple(cleaned)
    return value
|