Add RAGAS evaluation web console (FastAPI + vanilla JS)
- webapp/: FastAPI backend with runs/scenarios/evaluations API routers; services for run_reader, report_builder, scenario_scanner, task_manager (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area); report detail with metric cards, Chart.js distribution histogram, grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
This commit is contained in:
1
webapp/services/__init__.py
Normal file
1
webapp/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Service package for the evaluation console (filesystem readers and task runner)."""
|
||||
188
webapp/services/report_builder.py
Normal file
188
webapp/services/report_builder.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""Aggregate a run's per-sample scores into the report payload for the UI.
|
||||
|
||||
All aggregation reads only the standard scores.csv produced by the reporting
|
||||
layer, plus the metric list resolved by run_reader. The output mirrors the
|
||||
report detail page: metric means, per-metric distribution histograms, grouped
|
||||
means by difficulty / question_type, and the lowest-scoring samples for review.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from webapp.services.text_utils import parse_contexts
|
||||
from webapp.models import (
|
||||
DistributionBin,
|
||||
GroupStat,
|
||||
ReportData,
|
||||
SampleScore,
|
||||
)
|
||||
from webapp.services import run_reader
|
||||
|
||||
|
||||
# Histogram resolution: each metric's score range is cut into this many
# equal-width buckets.
DISTRIBUTION_BIN_COUNT = 5

# Sample metadata columns used to break metric means down by group. A column
# is only used when it is actually present in the scores data.
GROUPING_FIELDS = (
    "difficulty",
    "question_type",
    "language",
)

# Size of the "lowest-scoring samples" list surfaced for manual review.
LOWEST_SAMPLE_COUNT = 10
|
||||
|
||||
|
||||
def _round_or_none(value: float | None) -> float | None:
|
||||
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
|
||||
return None
|
||||
return round(float(value), 4)
|
||||
|
||||
|
||||
def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
    """Compute the mean of each metric column across all scored samples.

    Cells are coerced with ``pd.to_numeric(errors="coerce")`` before
    averaging, so a column that contains stray text (for example an error
    marker) is averaged over its numeric cells instead of raising.

    Args:
        frame: The per-sample scores table.
        metrics: Metric column names to summarize.

    Returns:
        A mapping with one entry per requested metric. The value is None when
        the column is missing or has no numeric cells.
    """
    means: dict[str, float | None] = {}
    for metric in metrics:
        if metric not in frame.columns:
            means[metric] = None
            continue
        scores = pd.to_numeric(frame[metric], errors="coerce")
        # An all-NaN (or empty) column averages to NaN, which maps to None.
        means[metric] = _round_or_none(scores.mean())
    return means
|
||||
|
||||
|
||||
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
    """Bucket one metric's scores into fixed-width [0, 1] histogram bins.

    Every bin is half-open ``[lower, upper)`` except the last, which also
    includes its right edge so a score of exactly 1.0 is counted. Non-numeric
    cells are ignored; scores outside [0, 1] fall in no bin.

    Args:
        frame: The per-sample scores table.
        metric: Name of the metric column to bucket.

    Returns:
        ``DISTRIBUTION_BIN_COUNT`` bins in ascending order, or an empty list
        when the column is absent.
    """
    if metric not in frame.columns:
        return []

    scores = pd.to_numeric(frame[metric], errors="coerce").dropna()
    last_index = DISTRIBUTION_BIN_COUNT - 1
    bins: list[DistributionBin] = []
    for index in range(DISTRIBUTION_BIN_COUNT):
        # Divide rather than multiply by a float width: 3 * 0.2 evaluates to
        # 0.6000000000000001, which would drop a score of exactly 0.6 into
        # the bin below the one its label advertises.
        lower = index / DISTRIBUTION_BIN_COUNT
        upper = (index + 1) / DISTRIBUTION_BIN_COUNT
        if index == last_index:
            in_bin = (scores >= lower) & (scores <= upper)
        else:
            in_bin = (scores >= lower) & (scores < upper)
        bins.append(
            DistributionBin(
                label=f"{lower:.1f}–{upper:.1f}",
                lower=round(lower, 2),
                upper=round(upper, 2),
                count=int(in_bin.sum()),
            )
        )
    return bins
|
||||
|
||||
|
||||
def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupStat]]:
    """Compute per-group metric means for each available grouping field.

    A grouping field is skipped when its column is missing or holds no usable
    label (only blanks / NaN). Labels are stripped before grouping so values
    that differ only by surrounding whitespace form a single group.

    Args:
        frame: The per-sample scores table.
        metrics: Metric column names to average inside each group.

    Returns:
        A mapping from grouping field name to its groups, sorted by label.
    """
    groupings: dict[str, list[GroupStat]] = {}
    for field in GROUPING_FIELDS:
        if field not in frame.columns:
            continue

        labels = frame[field].astype(str).str.strip()
        # NaN cells stringify to "nan"; treat them like blanks.
        usable = ~labels.isin(["", "nan"])
        if not usable.any():
            # Entirely empty field: skip so the UI does not render noise.
            continue

        stats: list[GroupStat] = []
        for key, group in frame.loc[usable].groupby(labels[usable]):
            means = {
                # Coerce so a stray text cell cannot make the mean raise.
                metric: _round_or_none(
                    pd.to_numeric(group[metric], errors="coerce").mean()
                )
                for metric in metrics
                if metric in group.columns
            }
            stats.append(GroupStat(key=str(key), count=int(len(group)), means=means))
        if stats:
            stats.sort(key=lambda item: item.key)
            groupings[field] = stats
    return groupings
|
||||
|
||||
|
||||
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
|
||||
"""Average a single sample's available metric scores for ranking."""
|
||||
values = [
|
||||
float(row[metric])
|
||||
for metric in metrics
|
||||
if metric in row and pd.notna(row[metric])
|
||||
]
|
||||
if not values:
|
||||
return None
|
||||
return sum(values) / len(values)
|
||||
|
||||
|
||||
def _cell_text(row: pd.Series, column: str) -> str:
|
||||
"""Safely read a string cell, returning '' for missing or NaN values."""
|
||||
if column not in row or pd.isna(row[column]):
|
||||
return ""
|
||||
return str(row[column]).strip()
|
||||
|
||||
|
||||
def _lowest_samples(frame: pd.DataFrame, metrics: list[str]) -> list[SampleScore]:
    """Select and shape the lowest-scoring samples for the review table.

    Rows are ranked by their mean metric score, ascending. Rows without any
    usable score get a sort key of -1.0, so they are listed first (treated as
    the worst case to review). Ties keep their original row order. Only the
    first ``LOWEST_SAMPLE_COUNT`` rows are converted into ``SampleScore``.

    Args:
        frame: The per-sample scores table.
        metrics: Metric column names that contribute to the ranking.

    Returns:
        At most ``LOWEST_SAMPLE_COUNT`` samples, worst first.
    """
    if frame.empty:
        return []

    # Rank first, then build payload objects only for the rows that are kept.
    ranked: list[tuple[float, int, pd.Series, float | None]] = []
    for position, (_, row) in enumerate(frame.iterrows()):
        mean_score = _sample_mean(row, metrics)
        sort_key = mean_score if mean_score is not None else -1.0
        ranked.append((sort_key, position, row, mean_score))
    ranked.sort(key=lambda item: (item[0], item[1]))

    samples: list[SampleScore] = []
    for _, _, row, mean_score in ranked[:LOWEST_SAMPLE_COUNT]:
        scores: dict[str, float | None] = {}
        for metric in metrics:
            if metric not in row:
                continue
            # Coerce instead of calling float() directly so a text cell in a
            # metric column is skipped rather than raising ValueError.
            score = pd.to_numeric(row[metric], errors="coerce")
            if pd.notna(score):
                scores[metric] = _round_or_none(float(score))
        samples.append(
            SampleScore(
                sample_id=_cell_text(row, "sample_id") or "—",
                question=_cell_text(row, "question"),
                contexts=parse_contexts(row["contexts"]) if "contexts" in row else [],
                answer=_cell_text(row, "answer"),
                ground_truth=_cell_text(row, "ground_truth"),
                language=_cell_text(row, "language"),
                difficulty=_cell_text(row, "difficulty"),
                question_type=_cell_text(row, "question_type"),
                metrics=scores,
                mean_score=_round_or_none(mean_score),
                error=_cell_text(row, "error"),
            )
        )
    return samples
|
||||
|
||||
|
||||
def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    """Assemble the aggregated report payload for a single run directory.

    When the run has no score rows, or no metrics were resolved, the payload
    carries only the metric names (with None means) and the summary text.
    """
    scores = run_reader.read_scores_frame(run_dir)
    summary_text = run_reader.read_summary_markdown(run_dir)

    has_data = bool(metrics) and not scores.empty
    if not has_data:
        return ReportData(
            metrics=metrics,
            metric_means=dict.fromkeys(metrics),
            summary_markdown=summary_text,
        )

    # Histograms are produced only for metrics that have a column in the data.
    histograms = {}
    for metric in metrics:
        if metric in scores.columns:
            histograms[metric] = _distribution(scores, metric)

    return ReportData(
        metrics=metrics,
        metric_means=_metric_means(scores, metrics),
        distributions=histograms,
        groupings=_groupings(scores, metrics),
        lowest_samples=_lowest_samples(scores, metrics),
        summary_markdown=summary_text,
    )
|
||||
222
webapp/services/run_reader.py
Normal file
222
webapp/services/run_reader.py
Normal file
@@ -0,0 +1,222 @@
|
||||
"""Read evaluation run artifacts from disk into API-friendly structures.
|
||||
|
||||
A "run" is any directory under the configured output roots that contains a
|
||||
metadata.json file. This service stays decoupled from rag_eval internals: it
|
||||
only reads the standard artifact files (metadata.json, scores.csv, summary.md,
|
||||
scenario.snapshot.yaml) that the reporting layer writes.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
import yaml
|
||||
|
||||
from webapp.models import RunSummary
|
||||
|
||||
|
||||
# Folder names, looked up directly under the repository root, that are
# searched for run outputs by default.
DEFAULT_OUTPUT_ROOTS = (
    "outputs",
    "runs",
)
|
||||
|
||||
|
||||
def _repo_root() -> Path:
|
||||
"""Return the siemens_ragas repository root (parent of the webapp package)."""
|
||||
return Path(__file__).resolve().parents[2]
|
||||
|
||||
|
||||
def _candidate_roots(extra_roots: list[Path] | None = None) -> list[Path]:
    """List the output directories that exist and may hold run artifacts.

    The default roots under the repository come first, followed by any
    caller-supplied extras; entries that are not directories are dropped.
    """
    repo = _repo_root()
    candidates = [repo / name for name in DEFAULT_OUTPUT_ROOTS]
    candidates.extend(extra_roots or [])
    return [path for path in candidates if path.is_dir()]
|
||||
|
||||
|
||||
def _read_json(path: Path) -> dict[str, Any]:
|
||||
"""Load a JSON file, returning an empty dict on any failure."""
|
||||
try:
|
||||
return json.loads(path.read_text(encoding="utf-8"))
|
||||
except (OSError, ValueError):
|
||||
return {}
|
||||
|
||||
|
||||
def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
|
||||
"""Read the configured metric list from a scenario snapshot if present."""
|
||||
snapshot = run_dir / "scenario.snapshot.yaml"
|
||||
if not snapshot.is_file():
|
||||
return []
|
||||
try:
|
||||
payload = yaml.safe_load(snapshot.read_text(encoding="utf-8")) or {}
|
||||
except (OSError, yaml.YAMLError):
|
||||
return []
|
||||
metrics = payload.get("metrics")
|
||||
if isinstance(metrics, list):
|
||||
return [str(item) for item in metrics]
|
||||
return []
|
||||
|
||||
|
||||
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
    """Collect every evaluation run directory found under the output roots.

    A directory counts as a run when it holds a ``metadata.json`` whose
    content includes a ``scenario_name`` key; other metadata files found
    under the roots are ignored. Each directory is reported once.
    """
    found: list[Path] = []
    visited: set[Path] = set()
    for root in _candidate_roots(extra_roots):
        for metadata_file in root.rglob("metadata.json"):
            folder = metadata_file.parent
            if folder in visited:
                continue
            if "scenario_name" not in _read_json(metadata_file):
                continue
            visited.add(folder)
            found.append(folder)
    return found
|
||||
|
||||
|
||||
def _metric_means(run_dir: Path, metrics: list[str]) -> dict[str, float | None]:
|
||||
"""Compute per-metric mean scores from a run's scores.csv."""
|
||||
scores_path = run_dir / "scores.csv"
|
||||
if not scores_path.is_file():
|
||||
return {}
|
||||
try:
|
||||
frame = pd.read_csv(scores_path)
|
||||
except (OSError, ValueError, pd.errors.ParserError):
|
||||
return {}
|
||||
means: dict[str, float | None] = {}
|
||||
for metric in metrics:
|
||||
if metric in frame.columns:
|
||||
mean_value = frame[metric].mean(numeric_only=True)
|
||||
means[metric] = None if pd.isna(mean_value) else round(float(mean_value), 4)
|
||||
else:
|
||||
means[metric] = None
|
||||
return means
|
||||
|
||||
|
||||
def build_run_summary(run_dir: Path) -> RunSummary | None:
    """Build the RunSummary for one run directory from its artifacts.

    Returns None when ``metadata.json`` has no ``scenario_name`` key. Metric
    names come from the scenario snapshot, falling back to the numeric
    columns of scores.csv when the snapshot yields none.
    """
    metadata = _read_json(run_dir / "metadata.json")
    if "scenario_name" not in metadata:
        return None

    def text(key: str) -> str:
        """Read one metadata field as a string, defaulting to ''."""
        return str(metadata.get(key, ""))

    metric_names = _read_metrics_from_snapshot(run_dir) or _infer_metrics_from_scores(run_dir)
    valid_count = int(metadata.get("valid_samples", 0) or 0)
    invalid_count = int(metadata.get("invalid_samples", 0) or 0)

    return RunSummary(
        # The folder name stands in when the metadata carries no run id.
        run_id=str(metadata.get("run_id") or run_dir.name),
        scenario_name=text("scenario_name"),
        mode=text("mode"),
        judge_model=text("judge_model"),
        embedding_model=text("embedding_model"),
        started_at=text("started_at"),
        finished_at=text("finished_at"),
        dataset=text("dataset"),
        total_samples=valid_count + invalid_count,
        valid_samples=valid_count,
        invalid_samples=invalid_count,
        metrics=metric_names,
        metric_means=_metric_means(run_dir, metric_names),
        output_path=run_dir.as_posix(),
    )
|
||||
|
||||
|
||||
# scores.csv columns that describe the sample or the run rather than holding
# a metric score; they are excluded when metric names are inferred.
NON_METRIC_COLUMNS = {
    # Sample content
    "sample_id", "question", "contexts", "answer", "ground_truth",
    # Run / configuration descriptors
    "scenario", "language", "retrieval_config", "error",
    "judge_model", "embedding_model", "run_id",
    # Sample classification
    "difficulty", "question_type",
    # Source document provenance
    "doc_id", "doc_name", "section_path",
    "page_start", "page_end", "source_chunk_ids",
    # Manual review bookkeeping
    "review_status", "review_notes",
}
|
||||
|
||||
|
||||
def _infer_metrics_from_scores(run_dir: Path) -> list[str]:
    """Guess metric names from scores.csv when no snapshot lists them.

    Only the first data row is read. Every column that is not a known sample
    field and that pandas parsed as numeric is treated as a metric. Returns
    an empty list when the file is missing or cannot be parsed.
    """
    scores_path = run_dir / "scores.csv"
    if not scores_path.is_file():
        return []
    try:
        preview = pd.read_csv(scores_path, nrows=1)
    except (OSError, ValueError, pd.errors.ParserError):
        return []
    return [
        str(column)
        for column in preview.columns
        if column not in NON_METRIC_COLUMNS
        and pd.api.types.is_numeric_dtype(preview[column])
    ]
|
||||
|
||||
|
||||
def list_run_summaries(extra_roots: list[Path] | None = None) -> list[RunSummary]:
    """Return a summary for every discovered run, most recent first.

    Runs are ordered by ``finished_at``, falling back to ``started_at`` when
    the finish time is empty; directories that yield no summary are skipped.
    """
    candidates = (build_run_summary(run_dir) for run_dir in discover_run_dirs(extra_roots))
    summaries = [summary for summary in candidates if summary is not None]
    return sorted(
        summaries,
        key=lambda item: item.finished_at or item.started_at,
        reverse=True,
    )
|
||||
|
||||
|
||||
def find_run_dir(run_id: str, extra_roots: list[Path] | None = None) -> Path | None:
    """Return the first run directory identified by ``run_id``, or None.

    A run is identified by the ``run_id`` in its metadata, or by its folder
    name when the metadata carries no run id.
    """

    def identifier(run_dir: Path) -> str:
        """Compute the id a run directory is known by."""
        metadata = _read_json(run_dir / "metadata.json")
        return str(metadata.get("run_id") or run_dir.name)

    matches = (
        run_dir
        for run_dir in discover_run_dirs(extra_roots)
        if identifier(run_dir) == run_id
    )
    return next(matches, None)
|
||||
|
||||
|
||||
def read_scores_frame(run_dir: Path) -> pd.DataFrame:
    """Read a run's scores.csv; a missing or unparseable file gives an empty frame."""
    csv_file = run_dir / "scores.csv"
    if not csv_file.is_file():
        return pd.DataFrame()
    try:
        frame = pd.read_csv(csv_file)
    except (OSError, ValueError, pd.errors.ParserError):
        frame = pd.DataFrame()
    return frame
|
||||
|
||||
|
||||
def read_summary_markdown(run_dir: Path) -> str:
    """Return the human-readable summary.md for a run, or an empty string.

    Args:
        run_dir: Run directory that may contain ``summary.md``.

    Returns:
        The file's text, or ``""`` when the file is missing, cannot be read,
        or is not valid UTF-8.
    """
    summary_path = run_dir / "summary.md"
    if not summary_path.is_file():
        return ""
    try:
        return summary_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # listed explicitly to honor the "or an empty string" contract.
        return ""
|
||||
84
webapp/services/scenario_scanner.py
Normal file
84
webapp/services/scenario_scanner.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Discover scenario YAML files that can be launched from the console.
|
||||
|
||||
Scanning is intentionally tolerant: a malformed scenario file is reported with
|
||||
an error string rather than aborting the whole listing, so the UI can show the
|
||||
user which files are runnable and which need fixing.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from webapp.models import ScenarioInfo
|
||||
|
||||
|
||||
def _repo_root() -> Path:
|
||||
"""Return the siemens_ragas repository root (parent of the webapp package)."""
|
||||
return Path(__file__).resolve().parents[2]
|
||||
|
||||
|
||||
def _scenarios_root() -> Path:
    """Return the path of the ``scenarios`` folder directly under the repository root."""
    return _repo_root().joinpath("scenarios")
|
||||
|
||||
|
||||
def _summarize_scenario(path: Path) -> ScenarioInfo:
    """Read a scenario file into a compact info object, capturing parse errors.

    Args:
        path: Scenario file located inside the repository.

    Returns:
        A ScenarioInfo whose ``path`` is repository-relative. When the file
        cannot be read, decoded or parsed, or is not a YAML mapping, only
        ``path`` and ``error`` are set so the listing can continue.
    """
    relative = path.relative_to(_repo_root()).as_posix()
    try:
        payload = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
        # UnicodeDecodeError is not an OSError; without it a single non-UTF-8
        # file would abort the whole scenario listing.
        return ScenarioInfo(path=relative, error=f"无法解析: {exc}")

    if not isinstance(payload, dict):
        return ScenarioInfo(path=relative, error="场景文件格式不是 YAML 映射。")

    metrics = payload.get("metrics")
    metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []

    return ScenarioInfo(
        path=relative,
        scenario_name=str(payload.get("scenario_name", "")),
        mode=str(payload.get("mode", "")),
        dataset=str(payload.get("dataset", "")),
        judge_model=str(payload.get("judge_model", "")),
        metrics=metric_list,
    )
|
||||
|
||||
|
||||
def list_scenarios() -> list[ScenarioInfo]:
    """Return every scenario YAML under scenarios/, sorted by path.

    Both ``*.yaml`` and ``*.yml`` files are collected recursively and ordered
    together, so the two extensions are interleaved by path rather than
    listed as two separate runs.

    Returns:
        One ScenarioInfo per file, or an empty list when the scenarios
        directory does not exist.
    """
    root = _scenarios_root()
    if not root.is_dir():
        return []

    # Merge both extensions before sorting to keep the promised global order.
    paths = sorted([*root.rglob("*.yaml"), *root.rglob("*.yml")])
    return [_summarize_scenario(path) for path in paths]
|
||||
|
||||
|
||||
def resolve_scenario_path(relative_or_absolute: str) -> Path | None:
    """Resolve a user-supplied scenario path safely within the repository.

    Only existing files located inside the repository's scenarios/ directory
    are accepted, which prevents the trigger endpoint from reading arbitrary
    files. Relative inputs are interpreted against the repository root.

    Args:
        relative_or_absolute: Path text as received from the client.

    Returns:
        The fully resolved file path, or None when the input cannot be
        resolved, points outside scenarios/, or is not a regular file.
    """
    candidate = Path(relative_or_absolute)
    if not candidate.is_absolute():
        candidate = _repo_root() / candidate
    try:
        resolved = candidate.resolve()
        scenarios_root = _scenarios_root().resolve()
    except (OSError, ValueError, RuntimeError):
        # ValueError: embedded NUL byte in the client-supplied text.
        # RuntimeError: symlink loop reported by older Python versions.
        return None

    # Checked after resolve() so ".." segments and symlinks cannot escape.
    # The scenarios directory itself is never a valid scenario file.
    if scenarios_root not in resolved.parents:
        return None
    if not resolved.is_file():
        return None
    return resolved
|
||||
161
webapp/services/task_manager.py
Normal file
161
webapp/services/task_manager.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""In-process background task manager for evaluation runs.
|
||||
|
||||
Evaluations run in a thread pool so the FastAPI event loop is never blocked.
|
||||
The heavy rag_eval / ragas import is performed lazily inside the worker thread,
|
||||
which keeps the web server bootable even when the evaluation dependencies are
|
||||
broken — failures then surface as task errors in the UI instead of crashing
|
||||
startup. This matches the "coarse status + logs" progress decision.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import threading
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from contextlib import redirect_stderr, redirect_stdout
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from webapp.models import TaskStatus
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
"""Return the current UTC time as an ISO 8601 string."""
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
class _LineCapture(io.TextIOBase):
|
||||
"""A writable stream that appends captured lines to a task's log buffer."""
|
||||
|
||||
def __init__(self, sink: "EvaluationTask") -> None:
|
||||
"""Bind the capture stream to the owning task."""
|
||||
self._sink = sink
|
||||
self._buffer = ""
|
||||
|
||||
def write(self, text: str) -> int:
|
||||
"""Buffer text and flush complete lines into the task log."""
|
||||
self._buffer += text
|
||||
while "\n" in self._buffer:
|
||||
line, self._buffer = self._buffer.split("\n", 1)
|
||||
self._sink.append_log(line)
|
||||
return len(text)
|
||||
|
||||
def flush(self) -> None:
|
||||
"""Flush any trailing partial line into the task log."""
|
||||
if self._buffer:
|
||||
self._sink.append_log(self._buffer)
|
||||
self._buffer = ""
|
||||
|
||||
|
||||
class EvaluationTask:
    """State record for one background evaluation run."""

    def __init__(self, task_id: str, scenario_path: str) -> None:
        """Create the task in the "queued" state for ``scenario_path``."""
        self._lock = threading.Lock()
        self.task_id = task_id
        self.scenario_path = scenario_path
        self.created_at = _now_iso()
        self.finished_at = ""
        self.status = "queued"
        self.run_id: str | None = None
        self.error: str | None = None
        self.logs: list[str] = []

    def append_log(self, line: str) -> None:
        """Add one line to the log while holding the task lock."""
        with self._lock:
            self.logs.append(line)

    def snapshot(self) -> TaskStatus:
        """Copy the current state into a TaskStatus while holding the task lock.

        The log list is copied, so later appends do not affect the snapshot.
        """
        with self._lock:
            logs_copy = self.logs[:]
            return TaskStatus(
                task_id=self.task_id,
                scenario_path=self.scenario_path,
                status=self.status,
                logs=logs_copy,
                run_id=self.run_id,
                error=self.error,
                created_at=self.created_at,
                finished_at=self.finished_at,
            )
|
||||
|
||||
|
||||
class TaskManager:
    """Owns the thread pool and registry of evaluation tasks."""

    def __init__(self, max_workers: int = 2) -> None:
        """Create a task manager backed by a small thread pool.

        Args:
            max_workers: Size of the worker pool. Evaluations themselves run
                one at a time (see ``_run``); further tasks stay "queued".
        """
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._tasks: dict[str, EvaluationTask] = {}
        self._lock = threading.Lock()
        # redirect_stdout/redirect_stderr replace the process-wide sys.stdout
        # and sys.stderr. If two workers overlapped, their save/restore pairs
        # could interleave and leave sys.stdout bound to a finished task's
        # capture, and each task would also swallow the other's output.
        self._capture_lock = threading.Lock()

    def submit(self, scenario_path: str) -> str:
        """Register and schedule a new evaluation task, returning its id."""
        task_id = uuid.uuid4().hex[:12]
        task = EvaluationTask(task_id=task_id, scenario_path=scenario_path)
        with self._lock:
            self._tasks[task_id] = task
        self._executor.submit(self._run, task)
        return task_id

    def get(self, task_id: str) -> TaskStatus | None:
        """Return a snapshot of one task, or None if the id is unknown."""
        with self._lock:
            task = self._tasks.get(task_id)
        return task.snapshot() if task is not None else None

    def list_tasks(self) -> list[TaskStatus]:
        """Return snapshots of all known tasks, newest first."""
        with self._lock:
            tasks = list(self._tasks.values())
        snapshots = [task.snapshot() for task in tasks]
        snapshots.sort(key=lambda item: item.created_at, reverse=True)
        return snapshots

    def _run(self, task: EvaluationTask) -> None:
        """Execute one evaluation end to end inside a worker thread.

        The task stays "queued" until it obtains the capture lock, then moves
        to "running" and finally to "completed" or "failed". Any exception is
        recorded on the task (``error`` plus a log line) instead of being
        raised. ``finished_at`` is written before the terminal status so a
        poller never sees a finished task without a finish time.
        """
        capture = _LineCapture(task)
        final_status = "failed"
        try:
            with self._capture_lock:
                task.status = "running"
                task.append_log(f"[{_now_iso()}] 开始评估: {task.scenario_path}")

                # Lazy import keeps the web server bootable if ragas is unavailable.
                task.append_log("加载评估引擎 (rag_eval / ragas)...")
                from rag_eval.execution.runner import run_scenario

                absolute_path = self._to_absolute(task.scenario_path)
                task.append_log(f"运行场景文件: {absolute_path}")

                try:
                    with redirect_stdout(capture), redirect_stderr(capture):
                        result = run_scenario(str(absolute_path))
                finally:
                    # Push any trailing partial line into the log, even on failure.
                    capture.flush()

            task.run_id = getattr(result, "run_id", None)
            output_dir = getattr(getattr(result, "scenario", None), "output_dir", "")
            task.append_log(f"[{_now_iso()}] 评估完成。run_id={task.run_id}")
            if output_dir:
                task.append_log(f"结果目录: {output_dir}")
            final_status = "completed"
        except Exception as exc:  # noqa: BLE001 - surface any failure to the UI
            error_type = type(exc).__name__
            task.error = f"{error_type}: {exc}"
            task.append_log(f"[{_now_iso()}] 评估失败 [{error_type}]: {exc}")
        finally:
            task.finished_at = _now_iso()
            task.status = final_status

    def _to_absolute(self, scenario_path: str) -> Path:
        """Resolve a scenario path against the repository root if relative.

        Absolute paths are returned unchanged.
        """
        candidate = Path(scenario_path)
        if candidate.is_absolute():
            return candidate
        repo_root = Path(__file__).resolve().parents[2]
        return (repo_root / candidate).resolve()
|
||||
|
||||
|
||||
# Module-level singleton shared by the FastAPI routes. It is created at import
# time with the default worker count, so every importer of this module uses
# the same task registry and thread pool.
task_manager = TaskManager()
|
||||
47
webapp/services/text_utils.py
Normal file
47
webapp/services/text_utils.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Self-contained text helpers for the web layer.
|
||||
|
||||
These intentionally avoid importing from rag_eval so the web server has no
|
||||
import-time dependency on the evaluation engine (and therefore boots even when
|
||||
ragas is unavailable). The contexts parser mirrors rag_eval.shared.utils so the
|
||||
console interprets serialized CSV context columns the same way the engine does.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
import json
|
||||
import math
|
||||
from typing import Any
|
||||
|
||||
|
||||
def parse_contexts(value: Any) -> list[str]:
    """Normalize a context payload into a list of non-empty strings.

    Accepts native lists, JSON / Python-literal serialized lists (as written
    into scores.csv), and plain text. Plain text containing blank lines is
    split into paragraphs; other text is returned as a single context.

    Args:
        value: A list, a string, None, or a float NaN (an empty CSV cell).

    Returns:
        The stripped, non-empty context strings; an empty list for None, NaN
        or blank input.
    """

    def non_empty(items: list[Any]) -> list[str]:
        """Stringify and strip each item, dropping the ones that end up empty."""
        stripped = (str(item).strip() for item in items)
        return [entry for entry in stripped if entry]

    if isinstance(value, list):
        return non_empty(value)
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return []

    text = str(value).strip()
    if not text:
        return []

    # Accept serialized lists from CSV exports before falling back to plain text.
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
            # Pathological cells (e.g. thousands of nested brackets) can
            # exhaust the parser; treat them as plain text like any other
            # unparseable input. JSONDecodeError is a ValueError.
            continue
        if isinstance(parsed, list):
            return non_empty(parsed)

    # Preserve paragraph-style context dumps by splitting on blank lines first.
    if "\n\n" in text:
        chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
        if chunks:
            return chunks

    return [text]
|
||||
Reference in New Issue
Block a user