Add RAGAS evaluation web console (FastAPI + vanilla JS)

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 15:53:57 +08:00
parent 9cbdc1d95d
commit e89695e490
26 changed files with 2496 additions and 2 deletions

View File

@@ -0,0 +1 @@
"""Service package for the evaluation console (filesystem readers and task runner)."""

View File

@@ -0,0 +1,188 @@
"""Aggregate a run's per-sample scores into the report payload for the UI.
All aggregation reads only the standard scores.csv produced by the reporting
layer, plus the metric list resolved by run_reader. The output mirrors the
report detail page: metric means, per-metric distribution histograms, grouped
means by difficulty / question_type / language (whichever columns are present),
and the lowest-scoring samples for review.
"""
from __future__ import annotations
import math
from pathlib import Path
import pandas as pd
from webapp.services.text_utils import parse_contexts
from webapp.models import (
DistributionBin,
GroupStat,
ReportData,
SampleScore,
)
from webapp.services import run_reader
# Number of equal-width buckets each metric's score histogram is split into;
# _distribution derives the bucket width as 1.0 / DISTRIBUTION_BIN_COUNT.
DISTRIBUTION_BIN_COUNT = 5
# Metadata columns that samples are grouped by. _groupings skips any field
# that is missing from the scores frame or has no non-empty values.
GROUPING_FIELDS = ("difficulty", "question_type", "language")
# Maximum number of lowest-scoring samples returned for manual review.
LOWEST_SAMPLE_COUNT = 10
def _round_or_none(value: float | None) -> float | None:
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
if value is None:
return None
if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
return None
return round(float(value), 4)
def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
    """Compute the mean of each metric column across all scored samples.

    Args:
        frame: The per-sample scores table.
        metrics: Metric column names to aggregate.

    Returns:
        A mapping with one entry per requested metric. The value is the mean
        rounded to four decimals, or None when the column is absent or holds
        no numeric values.
    """
    means: dict[str, float | None] = {}
    for metric in metrics:
        if metric not in frame.columns:
            means[metric] = None
            continue
        # Coerce instead of passing numeric_only=True: a column that contains
        # even one non-numeric cell is read as object dtype, and
        # Series.mean(numeric_only=True) raises on such columns. Coercion
        # turns those cells into NaN, which mean() skips.
        scores = pd.to_numeric(frame[metric], errors="coerce")
        means[metric] = _round_or_none(scores.mean())
    return means
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
    """Bucket one metric's scores into fixed-width [0, 1] histogram bins.

    Args:
        frame: The per-sample scores table.
        metric: Name of the metric column to bucket.

    Returns:
        DISTRIBUTION_BIN_COUNT bins in ascending order, or an empty list when
        the column is absent. Non-numeric cells are ignored; values outside
        [0, 1] fall in no bin.
    """
    if metric not in frame.columns:
        return []
    series = pd.to_numeric(frame[metric], errors="coerce").dropna()
    last_index = DISTRIBUTION_BIN_COUNT - 1
    bins: list[DistributionBin] = []
    for index in range(DISTRIBUTION_BIN_COUNT):
        # Derive edges by division rather than index * width: 3 * 0.2 is
        # 0.6000000000000001, which would push a score of exactly 0.6 into
        # the bin below the one it belongs to.
        lower = index / DISTRIBUTION_BIN_COUNT
        upper = (index + 1) / DISTRIBUTION_BIN_COUNT
        # Include the right edge in the final bin so 1.0 is counted.
        if index == last_index:
            mask = (series >= lower) & (series <= upper)
        else:
            mask = (series >= lower) & (series < upper)
        bins.append(
            DistributionBin(
                # Separate the two bounds; without it the label reads "0.00.2".
                label=f"{lower:.1f}-{upper:.1f}",
                lower=round(lower, 2),
                upper=round(upper, 2),
                count=int(mask.sum()),
            )
        )
    return bins
def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupStat]]:
    """Compute per-group metric means for each available grouping field.

    Args:
        frame: The per-sample scores table.
        metrics: Metric column names to average within each group.

    Returns:
        A mapping from grouping field to its groups sorted by key. Fields
        that are absent, or that have no non-empty labels, are omitted.
    """
    groupings: dict[str, list[GroupStat]] = {}
    for field in GROUPING_FIELDS:
        if field not in frame.columns:
            continue
        # Normalize labels once and group on the normalized text, so that
        # "easy" and "easy " end up in a single group instead of two groups
        # sharing the same displayed key.
        labels = frame[field].astype(str).str.strip().fillna("")
        # Missing cells stringify to "nan"; treat them like empty labels.
        labelled = ~labels.isin(("", "nan"))
        # Skip fields that are entirely empty so the UI does not render noise.
        if not labelled.any():
            continue
        stats: list[GroupStat] = []
        for key, group in frame[labelled].groupby(labels[labelled]):
            means = {
                # Coerce rather than use numeric_only=True, which raises on
                # object-dtype columns containing non-numeric cells.
                metric: _round_or_none(
                    pd.to_numeric(group[metric], errors="coerce").mean()
                )
                for metric in metrics
                if metric in group.columns
            }
            stats.append(GroupStat(key=str(key), count=int(len(group)), means=means))
        if stats:
            stats.sort(key=lambda item: item.key)
            groupings[field] = stats
    return groupings
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
"""Average a single sample's available metric scores for ranking."""
values = [
float(row[metric])
for metric in metrics
if metric in row and pd.notna(row[metric])
]
if not values:
return None
return sum(values) / len(values)
def _cell_text(row: pd.Series, column: str) -> str:
"""Safely read a string cell, returning '' for missing or NaN values."""
if column not in row or pd.isna(row[column]):
return ""
return str(row[column]).strip()
def _lowest_samples(frame: pd.DataFrame, metrics: list[str]) -> list[SampleScore]:
    """Select and shape the lowest-scoring samples for the review table.

    Args:
        frame: The per-sample scores table.
        metrics: Metric names used for each sample's scores and ranking mean.

    Returns:
        At most LOWEST_SAMPLE_COUNT samples ordered from lowest to highest
        mean score. Samples with no usable score are placed first.
    """
    if frame.empty:
        return []
    ranked: list[tuple[float, SampleScore]] = []
    for _, row in frame.iterrows():
        mean_score = _sample_mean(row, metrics)
        scores: dict[str, float | None] = {}
        for metric in metrics:
            if metric not in row or pd.isna(row[metric]):
                continue
            try:
                scores[metric] = _round_or_none(float(row[metric]))
            except (TypeError, ValueError):
                # Leave non-numeric cells out of the per-sample mapping
                # instead of failing the whole report.
                continue
        sample = SampleScore(
            sample_id=_cell_text(row, "sample_id"),
            question=_cell_text(row, "question"),
            contexts=parse_contexts(row["contexts"]) if "contexts" in row else [],
            answer=_cell_text(row, "answer"),
            ground_truth=_cell_text(row, "ground_truth"),
            language=_cell_text(row, "language"),
            difficulty=_cell_text(row, "difficulty"),
            question_type=_cell_text(row, "question_type"),
            metrics=scores,
            mean_score=_round_or_none(mean_score),
            error=_cell_text(row, "error"),
        )
        # Samples without any score get a key below every [0, 1] score, so
        # the ascending sort puts them FIRST: they are treated as the worst
        # results and surfaced at the top of the review list.
        sort_key = mean_score if mean_score is not None else -1.0
        ranked.append((sort_key, sample))
    ranked.sort(key=lambda item: item[0])
    return [sample for _, sample in ranked[:LOWEST_SAMPLE_COUNT]]
def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    """Assemble the aggregated report payload for a single run directory.

    When the scores table is empty or no metrics are requested, the payload
    carries only the metric names, a None mean for each of them, and the
    summary markdown. Otherwise it also carries the means, the distributions
    for metrics present as columns, the groupings and the lowest samples.
    """
    scores = run_reader.read_scores_frame(run_dir)
    summary_text = run_reader.read_summary_markdown(run_dir)

    has_data = bool(metrics) and not scores.empty
    if not has_data:
        return ReportData(
            metrics=metrics,
            metric_means=dict.fromkeys(metrics),
            summary_markdown=summary_text,
        )

    histograms = {}
    for name in metrics:
        if name in scores.columns:
            histograms[name] = _distribution(scores, name)

    return ReportData(
        metrics=metrics,
        metric_means=_metric_means(scores, metrics),
        distributions=histograms,
        groupings=_groupings(scores, metrics),
        lowest_samples=_lowest_samples(scores, metrics),
        summary_markdown=summary_text,
    )

View File

@@ -0,0 +1,222 @@
"""Read evaluation run artifacts from disk into API-friendly structures.
A "run" is any directory under the configured output roots that contains a
metadata.json file. This service stays decoupled from rag_eval internals: it
only reads the standard artifact files (metadata.json, scores.csv, summary.md,
scenario.snapshot.yaml) that the reporting layer writes.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
import pandas as pd
import yaml
from webapp.models import RunSummary
# Directory names, relative to the repository root, that are searched for run
# outputs. _candidate_roots ignores any that do not exist on disk.
DEFAULT_OUTPUT_ROOTS = ("outputs", "runs")
def _repo_root() -> Path:
"""Return the siemens_ragas repository root (parent of the webapp package)."""
return Path(__file__).resolve().parents[2]
def _candidate_roots(extra_roots: list[Path] | None = None) -> list[Path]:
    """List the output directories that exist and may hold run artifacts.

    The default roots (resolved against the repository root) come first,
    followed by any caller-supplied extras; non-directories are dropped.
    """
    repo = _repo_root()
    candidates = [repo / name for name in DEFAULT_OUTPUT_ROOTS]
    candidates.extend(extra_roots or [])
    return [path for path in candidates if path.is_dir()]
def _read_json(path: Path) -> dict[str, Any]:
"""Load a JSON file, returning an empty dict on any failure."""
try:
return json.loads(path.read_text(encoding="utf-8"))
except (OSError, ValueError):
return {}
def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
"""Read the configured metric list from a scenario snapshot if present."""
snapshot = run_dir / "scenario.snapshot.yaml"
if not snapshot.is_file():
return []
try:
payload = yaml.safe_load(snapshot.read_text(encoding="utf-8")) or {}
except (OSError, yaml.YAMLError):
return []
metrics = payload.get("metrics")
if isinstance(metrics, list):
return [str(item) for item in metrics]
return []
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
    """Find every evaluation run directory under the candidate roots.

    Args:
        extra_roots: Additional directories to search besides the defaults.

    Returns:
        Each directory whose ``metadata.json`` is a JSON object containing a
        ``scenario_name`` key, listed once in discovery order.
    """
    run_dirs: list[Path] = []
    seen: set[Path] = set()
    for root in _candidate_roots(extra_roots):
        for metadata_path in root.rglob("metadata.json"):
            run_dir = metadata_path.parent
            # Overlapping roots can yield the same directory twice; check
            # before parsing so accepted runs are not read again.
            if run_dir in seen:
                continue
            # Other metadata.json files (e.g. from dataset builds) may sit
            # under the same roots; only evaluation-run metadata is expected
            # to carry "scenario_name", so that key is the sole filter here.
            metadata = _read_json(metadata_path)
            if not isinstance(metadata, dict) or "scenario_name" not in metadata:
                continue
            seen.add(run_dir)
            run_dirs.append(run_dir)
    return run_dirs
def _metric_means(run_dir: Path, metrics: list[str]) -> dict[str, float | None]:
"""Compute per-metric mean scores from a run's scores.csv."""
scores_path = run_dir / "scores.csv"
if not scores_path.is_file():
return {}
try:
frame = pd.read_csv(scores_path)
except (OSError, ValueError, pd.errors.ParserError):
return {}
means: dict[str, float | None] = {}
for metric in metrics:
if metric in frame.columns:
mean_value = frame[metric].mean(numeric_only=True)
means[metric] = None if pd.isna(mean_value) else round(float(mean_value), 4)
else:
means[metric] = None
return means
def _metadata_text(metadata: dict[str, Any], key: str) -> str:
    """Return a metadata value as text, mapping a missing or null value to ''."""
    value = metadata.get(key)
    return "" if value is None else str(value)


def _metadata_count(metadata: dict[str, Any], key: str) -> int:
    """Return a metadata value as an int, falling back to 0 when unusable."""
    try:
        return int(metadata.get(key) or 0)
    except (TypeError, ValueError):
        return 0


def build_run_summary(run_dir: Path) -> RunSummary | None:
    """Assemble a RunSummary from one run directory's artifacts.

    Args:
        run_dir: Directory holding ``metadata.json`` and related artifacts.

    Returns:
        The summary, or None when the metadata is not a JSON object with a
        ``scenario_name`` key.
    """
    metadata = _read_json(run_dir / "metadata.json")
    if not isinstance(metadata, dict) or "scenario_name" not in metadata:
        return None
    metrics = _read_metrics_from_snapshot(run_dir)
    if not metrics:
        # Fall back to numeric score columns inferred from the scores file.
        metrics = _infer_metrics_from_scores(run_dir)
    valid = _metadata_count(metadata, "valid_samples")
    invalid = _metadata_count(metadata, "invalid_samples")
    run_id = str(metadata.get("run_id") or run_dir.name)
    # Text fields go through _metadata_text so a JSON null becomes "" and not
    # the literal string "None". This matters for finished_at: callers that
    # use ``finished_at or started_at`` need the empty string to fall back.
    return RunSummary(
        run_id=run_id,
        scenario_name=_metadata_text(metadata, "scenario_name"),
        mode=_metadata_text(metadata, "mode"),
        judge_model=_metadata_text(metadata, "judge_model"),
        embedding_model=_metadata_text(metadata, "embedding_model"),
        started_at=_metadata_text(metadata, "started_at"),
        finished_at=_metadata_text(metadata, "finished_at"),
        dataset=_metadata_text(metadata, "dataset"),
        total_samples=valid + invalid,
        valid_samples=valid,
        invalid_samples=invalid,
        metrics=metrics,
        metric_means=_metric_means(run_dir, metrics),
        output_path=run_dir.as_posix(),
    )
# Columns in scores.csv that are sample fields or run metadata rather than
# metric scores. _infer_metrics_from_scores skips these names before it looks
# at the column dtype, so they are never reported as metrics even if numeric.
NON_METRIC_COLUMNS = {
    "sample_id",
    "question",
    "contexts",
    "answer",
    "ground_truth",
    "scenario",
    "language",
    "retrieval_config",
    "error",
    "judge_model",
    "embedding_model",
    "run_id",
    "difficulty",
    "question_type",
    "doc_id",
    "doc_name",
    "section_path",
    "page_start",
    "page_end",
    "source_chunk_ids",
    "review_status",
    "review_notes",
}
def _infer_metrics_from_scores(run_dir: Path) -> list[str]:
    """Guess the metric columns of a run's scores.csv.

    Only the first data row is read. A column counts as a metric when its
    name is not in NON_METRIC_COLUMNS and pandas inferred a numeric dtype for
    it. Returns an empty list if the file is missing or cannot be parsed.
    """
    csv_path = run_dir / "scores.csv"
    if not csv_path.is_file():
        return []
    try:
        preview = pd.read_csv(csv_path, nrows=1)
    except (OSError, ValueError, pd.errors.ParserError):
        return []
    return [
        str(name)
        for name in preview.columns
        if name not in NON_METRIC_COLUMNS
        and pd.api.types.is_numeric_dtype(preview[name])
    ]
def list_run_summaries(extra_roots: list[Path] | None = None) -> list[RunSummary]:
    """Collect a summary for every discovered run, most recent first.

    Runs are ordered by ``finished_at`` (falling back to ``started_at`` when
    that is empty), descending. Directories that yield no summary are skipped.
    """
    candidates = (build_run_summary(path) for path in discover_run_dirs(extra_roots))
    found = [entry for entry in candidates if entry is not None]
    return sorted(
        found,
        key=lambda entry: entry.finished_at or entry.started_at,
        reverse=True,
    )
def find_run_dir(run_id: str, extra_roots: list[Path] | None = None) -> Path | None:
    """Return the first discovered run directory identified by ``run_id``.

    A directory matches when its metadata ``run_id`` (or, if that is empty,
    its folder name) equals ``run_id``. Returns None when nothing matches.
    """
    for candidate in discover_run_dirs(extra_roots):
        declared = _read_json(candidate / "metadata.json").get("run_id")
        if str(declared or candidate.name) == run_id:
            return candidate
    return None
def read_scores_frame(run_dir: Path) -> pd.DataFrame:
    """Read ``scores.csv`` from a run directory.

    Returns an empty dataframe when the file does not exist or cannot be
    parsed.
    """
    csv_path = run_dir / "scores.csv"
    if csv_path.is_file():
        try:
            return pd.read_csv(csv_path)
        except (OSError, ValueError, pd.errors.ParserError):
            pass
    return pd.DataFrame()
def read_summary_markdown(run_dir: Path) -> str:
    """Return the human-readable summary.md for a run, or an empty string.

    Args:
        run_dir: Run directory that may contain ``summary.md``.

    Returns:
        The file's text decoded as UTF-8, or ``""`` when the file is missing
        or unreadable. Undecodable bytes are replaced with U+FFFD.
    """
    summary_path = run_dir / "summary.md"
    if not summary_path.is_file():
        return ""
    try:
        # errors="replace" keeps a summary written in another encoding from
        # raising UnicodeDecodeError, which the OSError handler would not catch.
        return summary_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""

View File

@@ -0,0 +1,84 @@
"""Discover scenario YAML files that can be launched from the console.
Scanning is intentionally tolerant: a malformed scenario file is reported with
an error string rather than aborting the whole listing, so the UI can show the
user which files are runnable and which need fixing.
"""
from __future__ import annotations
from pathlib import Path
import yaml
from webapp.models import ScenarioInfo
def _repo_root() -> Path:
"""Return the siemens_ragas repository root (parent of the webapp package)."""
return Path(__file__).resolve().parents[2]
def _scenarios_root() -> Path:
    """Return the path of the ``scenarios`` directory under the repository root."""
    repo = _repo_root()
    return repo / "scenarios"
def _summarize_scenario(path: Path) -> ScenarioInfo:
    """Read a scenario file into a compact info object, capturing parse errors.

    Args:
        path: Scenario file located under the repository root.

    Returns:
        A ScenarioInfo carrying the repo-relative path. On a read, decode or
        YAML failure, or when the payload is not a mapping, only ``path`` and
        ``error`` are set.
    """
    relative = path.relative_to(_repo_root()).as_posix()
    try:
        payload = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    except (OSError, ValueError, yaml.YAMLError) as exc:
        # ValueError covers UnicodeDecodeError, so one badly encoded file is
        # reported as an error entry instead of aborting the whole listing.
        return ScenarioInfo(path=relative, error=f"无法解析: {exc}")
    if not isinstance(payload, dict):
        return ScenarioInfo(path=relative, error="场景文件格式不是 YAML 映射。")

    def text(key: str) -> str:
        """Return a field as text, mapping a missing or null value to ''."""
        value = payload.get(key)
        return "" if value is None else str(value)

    metrics = payload.get("metrics")
    metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []
    return ScenarioInfo(
        path=relative,
        scenario_name=text("scenario_name"),
        mode=text("mode"),
        dataset=text("dataset"),
        judge_model=text("judge_model"),
        metrics=metric_list,
    )
def list_scenarios() -> list[ScenarioInfo]:
    """Return every scenario YAML under scenarios/, sorted by path.

    Returns:
        One ScenarioInfo per ``*.yaml`` / ``*.yml`` file found recursively,
        or an empty list when the scenarios directory does not exist.
    """
    root = _scenarios_root()
    if not root.is_dir():
        return []
    # Merge both extensions before sorting so the result is ordered by path
    # as a whole, not as all .yaml files followed by all .yml files. The set
    # also guards against a file being matched by both patterns.
    paths = {*root.rglob("*.yaml"), *root.rglob("*.yml")}
    return [_summarize_scenario(path) for path in sorted(paths)]
def resolve_scenario_path(relative_or_absolute: str) -> Path | None:
    """Resolve a user-supplied scenario path safely within the repository.

    Only paths that live inside the repository's scenarios/ directory are
    accepted, which prevents the trigger endpoint from reading arbitrary files.

    Args:
        relative_or_absolute: Absolute path, or a path relative to the
            repository root.

    Returns:
        The resolved path when it is an existing file under scenarios/,
        otherwise None.
    """
    root = _repo_root()
    candidate = Path(relative_or_absolute)
    resolved = candidate if candidate.is_absolute() else (root / candidate)
    try:
        resolved = resolved.resolve()
    except (OSError, ValueError, RuntimeError):
        # The input is untrusted: an embedded NUL byte raises ValueError and,
        # on some Python versions, a symlink loop raises RuntimeError. Treat
        # both as "not a valid scenario" rather than letting them propagate.
        return None
    scenarios_root = _scenarios_root().resolve()
    # Compare resolved paths so ".." segments and symlinks cannot escape.
    if scenarios_root not in resolved.parents and resolved != scenarios_root:
        return None
    if not resolved.is_file():
        return None
    return resolved

View File

@@ -0,0 +1,161 @@
"""In-process background task manager for evaluation runs.
Evaluations run in a thread pool so the FastAPI event loop is never blocked.
The heavy rag_eval / ragas import is performed lazily inside the worker thread,
which keeps the web server bootable even when the evaluation dependencies are
broken — failures then surface as task errors in the UI instead of crashing
startup. This matches the "coarse status + logs" progress decision.
"""
from __future__ import annotations
import io
import threading
import uuid
from concurrent.futures import ThreadPoolExecutor
from contextlib import redirect_stderr, redirect_stdout
from datetime import datetime, timezone
from pathlib import Path
from webapp.models import TaskStatus
def _now_iso() -> str:
"""Return the current UTC time as an ISO 8601 string."""
return datetime.now(timezone.utc).isoformat()
class _LineCapture(io.TextIOBase):
"""A writable stream that appends captured lines to a task's log buffer."""
def __init__(self, sink: "EvaluationTask") -> None:
"""Bind the capture stream to the owning task."""
self._sink = sink
self._buffer = ""
def write(self, text: str) -> int:
"""Buffer text and flush complete lines into the task log."""
self._buffer += text
while "\n" in self._buffer:
line, self._buffer = self._buffer.split("\n", 1)
self._sink.append_log(line)
return len(text)
def flush(self) -> None:
"""Flush any trailing partial line into the task log."""
if self._buffer:
self._sink.append_log(self._buffer)
self._buffer = ""
class EvaluationTask:
    """State holder for one background evaluation run."""

    def __init__(self, task_id: str, scenario_path: str) -> None:
        """Create a task in the ``queued`` state for ``scenario_path``."""
        self._lock = threading.Lock()
        self.task_id = task_id
        self.scenario_path = scenario_path
        self.status = "queued"
        self.logs: list[str] = []
        self.run_id: str | None = None
        self.error: str | None = None
        self.created_at = _now_iso()
        self.finished_at = ""

    def append_log(self, line: str) -> None:
        """Add ``line`` to the log while holding the task lock."""
        with self._lock:
            self.logs.append(line)

    def snapshot(self) -> TaskStatus:
        """Build a TaskStatus from the current fields while holding the task lock.

        The log list is copied, so later appends do not affect the snapshot.
        """
        with self._lock:
            fields = {
                "task_id": self.task_id,
                "scenario_path": self.scenario_path,
                "status": self.status,
                "logs": list(self.logs),
                "run_id": self.run_id,
                "error": self.error,
                "created_at": self.created_at,
                "finished_at": self.finished_at,
            }
            return TaskStatus(**fields)
class TaskManager:
    """Owns the thread pool and registry of evaluation tasks."""

    def __init__(self, max_workers: int = 2) -> None:
        """Create a task manager backed by a small thread pool.

        Args:
            max_workers: Size of the worker thread pool.
        """
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._tasks: dict[str, EvaluationTask] = {}
        self._lock = threading.Lock()
        # redirect_stdout / redirect_stderr replace the process-wide
        # sys.stdout / sys.stderr. If two workers redirected at once their
        # output would interleave, and the out-of-order restore could leave
        # sys.stdout pointing at a finished task's capture. This lock lets
        # only one run hold the redirection at a time.
        self._redirect_lock = threading.Lock()

    def submit(self, scenario_path: str) -> str:
        """Register and schedule a new evaluation task, returning its id."""
        task_id = uuid.uuid4().hex[:12]
        task = EvaluationTask(task_id=task_id, scenario_path=scenario_path)
        with self._lock:
            self._tasks[task_id] = task
        self._executor.submit(self._run, task)
        return task_id

    def get(self, task_id: str) -> TaskStatus | None:
        """Return a snapshot of one task, or None if the id is unknown."""
        with self._lock:
            task = self._tasks.get(task_id)
        return task.snapshot() if task is not None else None

    def list_tasks(self) -> list[TaskStatus]:
        """Return snapshots of all known tasks, newest first."""
        with self._lock:
            tasks = list(self._tasks.values())
        snapshots = [task.snapshot() for task in tasks]
        snapshots.sort(key=lambda item: item.created_at, reverse=True)
        return snapshots

    def _run(self, task: EvaluationTask) -> None:
        """Execute one evaluation end to end inside a worker thread.

        Any exception is recorded on the task (status ``failed``) and not
        re-raised; ``finished_at`` is always set.
        """
        task.status = "running"
        task.append_log(f"[{_now_iso()}] 开始评估: {task.scenario_path}")
        capture = _LineCapture(task)
        try:
            # Lazy import keeps the web server bootable if ragas is unavailable.
            task.append_log("加载评估引擎 (rag_eval / ragas)...")
            from rag_eval.execution.runner import run_scenario

            absolute_path = self._to_absolute(task.scenario_path)
            task.append_log(f"运行场景文件: {absolute_path}")
            # Hold the lock for the whole redirected section so the global
            # streams are swapped and restored by one task at a time. A task
            # that has to wait here already reports status "running".
            with self._redirect_lock:
                with redirect_stdout(capture), redirect_stderr(capture):
                    result = run_scenario(str(absolute_path))
            capture.flush()
            task.run_id = getattr(result, "run_id", None)
            output_dir = getattr(getattr(result, "scenario", None), "output_dir", "")
            task.append_log(f"[{_now_iso()}] 评估完成。run_id={task.run_id}")
            if output_dir:
                task.append_log(f"结果目录: {output_dir}")
            task.status = "completed"
        except Exception as exc:  # noqa: BLE001 - surface any failure to the UI
            capture.flush()
            error_type = type(exc).__name__
            task.error = f"{error_type}: {exc}"
            task.append_log(f"[{_now_iso()}] 评估失败 [{error_type}]: {exc}")
            task.status = "failed"
        finally:
            task.finished_at = _now_iso()

    def _to_absolute(self, scenario_path: str) -> Path:
        """Resolve a scenario path against the repository root if relative.

        Absolute paths are returned unchanged (not resolved).
        """
        candidate = Path(scenario_path)
        if candidate.is_absolute():
            return candidate
        repo_root = Path(__file__).resolve().parents[2]
        return (repo_root / candidate).resolve()
# Module-level singleton shared by the FastAPI routes. It is created when this
# module is imported, using TaskManager's default worker count.
task_manager = TaskManager()

View File

@@ -0,0 +1,47 @@
"""Self-contained text helpers for the web layer.
These intentionally avoid importing from rag_eval so the web server has no
import-time dependency on the evaluation engine (and therefore boots even when
ragas is unavailable). The contexts parser mirrors rag_eval.shared.utils so the
console interprets serialized CSV context columns the same way the engine does.
"""
from __future__ import annotations
import ast
import json
import math
from typing import Any
def parse_contexts(value: Any) -> list[str]:
    """Normalize a context payload into a list of non-empty strings.

    Accepts native lists, JSON/Python-literal serialized lists (as written into
    scores.csv), and plain text, mirroring the engine's own parsing rules.

    Args:
        value: A list, a string, None, or a float NaN; anything else is
            converted with ``str()`` and treated as text.

    Returns:
        The stripped, non-empty context strings. Text that is not a
        serialized list is split on blank lines when it contains any, and is
        otherwise returned as a single-element list.
    """
    if isinstance(value, list):
        return [str(item).strip() for item in value if str(item).strip()]
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return []
    text = str(value).strip()
    if not text:
        return []
    # Accept serialized lists from CSV exports before falling back to plain text.
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Arbitrary prose is fed to these parsers, and ast.literal_eval
            # can fail with more than ValueError / SyntaxError on it (for
            # example TypeError on "{[1]: 2}"). Any such failure means the
            # text is not a serialized list. JSONDecodeError is a ValueError.
            continue
        if isinstance(parsed, list):
            return [str(item).strip() for item in parsed if str(item).strip()]
    # Preserve paragraph-style context dumps by splitting on blank lines first.
    if "\n\n" in text:
        chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
        if chunks:
            return chunks
    return [text]