Compare commits

..

4 Commits

Author SHA1 Message Date
wangwei
1ff4a3943a feat(dataset-builder): add retry logic and ASCII-safe logging for Siemens PDF pipeline
- question_generator.py: add max_retries=3/retry_delay=5s loop with
  linear backoff (retry_delay * attempt) on any generation exception; encode filenames
  with ascii/replace before printing to avoid UnicodeEncodeError on
  Windows cp1252 consoles
- runner.py: encode PDF filenames ASCII-safe for progress messages;
  catch generation failures per-document and skip (or re-raise) based
  on failure_mode, preventing one bad doc from aborting the whole build

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
2026-06-15 23:06:33 +08:00
wangwei
75ae7927ad Add Siemens CT document evaluation scenario (three-step pipeline)
- scenarios/siemens_build/siemens-pdf-build.yaml: dataset build for all 17
  Siemens medical-imaging PDFs (aliyun_docmind parser, 10 questions/doc,
  failure_mode=skip, ~170 questions total)
- scenarios/offline/siemens-pdf-offline-smoke.yaml: offline evaluation using
  source chunks as contexts and ground_truth as answer (up to 30 samples)
- scenarios/online/siemens-pdf-question-bank-online.yaml: online evaluation
  calling siemens_pdf_qa adapter, batch_size=4, up to 50 samples
- apps/siemens_pdf_qa/adapter.py: Siemens-specific adapter with bilingual
  (zh/en) system prompt and strict evidence-grounding for CT domain
- scripts/build_siemens_offline_smoke.py: helper to derive offline smoke CSV
  from completed dataset build artifacts (run after dataset build step)
- docs/superpowers/specs/2026-06-15-siemens-scenario-design.md: design spec

All three scenarios are automatically discovered by the web console.

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
2026-06-15 17:00:52 +08:00
wangwei
1288a366d1 Fix start.bat (ASCII-only, guaranteed window stays open) + add start.ps1
- start.bat: remove all Chinese characters (caused silent failure when
  Windows batch parser ran before chcp 65001 took effect); add :error
  label so window stays open with pause on any failure
- start.ps1: PowerShell alternative launcher with coloured output,
  works without worrying about cmd.exe encoding issues

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
2026-06-15 16:14:53 +08:00
wangwei
e89695e490 Add RAGAS evaluation web console (FastAPI + vanilla JS)
- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
2026-06-15 15:53:57 +08:00
36 changed files with 3042 additions and 26 deletions

6
.gitignore vendored
View File

@@ -17,5 +17,7 @@ wheels/
# outputs # outputs
outputs/ outputs/
# datasets # datasets — raw/normalized data files (large, not committed)
datasets/ # Note: rag_eval/datasets/ is source code and IS committed (see negation below)
datasets/
!rag_eval/datasets/

View File

@@ -0,0 +1,6 @@
"""Siemens PDF question bank adapter for online evaluation.
Wraps the generic pdf_question_bank adapter with a Siemens-specific system
prompt that instructs the model to answer in the same language as the question
(Chinese for Chinese CT documentation) and to cite only the provided evidence.
"""

View File

@@ -0,0 +1,170 @@
"""Online evaluation adapter for the Siemens medical-imaging PDF question bank.
Functionally identical to apps/pdf_question_bank/adapter.py but uses a
Siemens-specific system prompt that:
- Instructs the model to answer in the same language as the question
(important for Chinese CT documentation).
- Emphasises citation of source chunks and refusal when evidence is absent.
- Adds domain context (medical imaging / CT terminology).
The adapter contract is the same as all other adapters:
run(question, **kwargs) -> {"answer": str, "contexts": [str], "raw_response": {}}
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from openai import OpenAI
from rag_eval.settings import EvaluationSettings
from rag_eval.shared.utils import parse_contexts
# ── chunk cache (module-level, lives for the process lifetime) ────────────────
# Maps a resolved source_chunks.jsonl path to its {chunk_id: chunk payload}
# lookup. _load_source_chunks fills it on first use and returns the cached
# lookup on later calls for the same resolved path.
_CHUNK_CACHE: dict[Path, dict[str, dict[str, Any]]] = {}
def _resolve_source_chunks_path(source_chunks_path: str) -> Path:
"""Resolve the chunk artifact path; fall back to the latest timestamped run."""
resolved = Path(source_chunks_path).resolve()
if resolved.exists():
return resolved
if resolved.parent.name != "latest":
raise FileNotFoundError(resolved)
artifact_root = resolved.parent.parent
if not artifact_root.exists():
raise FileNotFoundError(resolved)
candidates = sorted(
[d for d in artifact_root.iterdir() if d.is_dir() and d.name != "latest"],
key=lambda p: p.name,
reverse=True,
)
for run_dir in candidates:
candidate = run_dir / resolved.name
if candidate.exists():
return candidate
raise FileNotFoundError(resolved)
def _load_source_chunks(source_chunks_path: str) -> dict[str, dict[str, Any]]:
    """Load source chunks keyed by chunk_id, caching the lookup per resolved file.

    Args:
        source_chunks_path: Path to a JSON Lines artifact; it is resolved via
            ``_resolve_source_chunks_path`` before reading.

    Returns:
        Mapping of stripped ``chunk_id`` string to the full row payload. When
        two rows share an id, the later row replaces the earlier one.

    Raises:
        FileNotFoundError: Propagated from path resolution.
        ValueError: If a non-blank row is not a JSON object or has a missing,
            null or blank ``chunk_id`` (``json.JSONDecodeError`` for malformed
            JSON is itself a ``ValueError``).
    """
    resolved = _resolve_source_chunks_path(source_chunks_path)
    cached = _CHUNK_CACHE.get(resolved)
    if cached is not None:
        return cached

    lookup: dict[str, dict[str, Any]] = {}
    # utf-8-sig drops a leading byte-order mark if one is present; without it
    # the first row of a BOM-prefixed file fails to parse. Files without a BOM
    # decode exactly as with plain utf-8.
    with resolved.open(encoding="utf-8-sig") as fh:
        for lineno, line in enumerate(fh, 1):
            text = line.strip()
            if not text:
                continue
            payload = json.loads(text)
            if not isinstance(payload, dict):
                raise ValueError(
                    f"source_chunks.jsonl row {lineno} is not a JSON object: {resolved}"
                )
            # A JSON null must count as missing; str(None) would otherwise
            # register the chunk under the literal id "None".
            raw_id = payload.get("chunk_id")
            chunk_id = "" if raw_id is None else str(raw_id).strip()
            if not chunk_id:
                raise ValueError(f"source_chunks.jsonl row {lineno} missing chunk_id: {resolved}")
            lookup[chunk_id] = payload

    _CHUNK_CACHE[resolved] = lookup
    return lookup
def _resolve_chunk_ids(raw: Any) -> list[str]:
    """Return the truthy entries that ``parse_contexts`` extracts from *raw*.

    *raw* is the value of the ``source_chunk_ids`` field of a sample.

    Raises:
        ValueError: If no truthy id remains after parsing.
    """
    chunk_ids = list(filter(None, parse_contexts(raw)))
    if chunk_ids:
        return chunk_ids
    raise ValueError("source_chunk_ids is required for siemens_pdf_qa adapter.")
def _build_messages(
question: str,
contexts: list[str],
metadata: dict[str, Any],
) -> list[dict[str, str]]:
"""Build a Siemens-domain grounded prompt for the answer model."""
evidence_lines = [f"[chunk {i}] {ctx}" for i, ctx in enumerate(contexts, 1)]
meta_lines = [
f"doc_name: {metadata.get('doc_name', '')}",
f"section_path: {metadata.get('section_path', '')}",
f"page_range: {metadata.get('page_start', '')}{metadata.get('page_end', '')}",
]
# Siemens-specific system prompt: bilingual awareness, medical domain, strict grounding
system_prompt = (
"你是西门子医疗影像知识库的问答助手Siemens Healthineers CT Knowledge Base QA"
"请严格根据下方【证据片段】回答问题,不得使用片段之外的任何知识。"
"若证据不足以回答,请明确说明「根据现有资料无法回答」。"
"请用与问题相同的语言(中文或英文)作答,简洁准确,必要时引用片段编号。"
)
user_prompt = "\n".join([
"【问题】",
question,
"",
"【文档元信息】",
*meta_lines,
"",
"【证据片段】",
*evidence_lines,
"",
"请基于以上证据片段作答。",
])
return [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
]
def run(
    question: str,
    *,
    source_chunks_path: str,
    model: str | None = None,
    client: OpenAI | None = None,
    **kwargs: Any,
) -> dict[str, Any]:
    """Answer one question from its cited source chunks via an OpenAI-compatible model.

    This is the adapter contract entry point used by the online evaluation runner.

    Args:
        question: The question to answer.
        source_chunks_path: Path to the source chunk JSON Lines artifact.
        model: Model name; when empty, ``EvaluationSettings().ragas_judge_model``
            is used instead.
        client: Ready-made client; when omitted, one is built from
            ``settings.openai_client_kwargs``.
        **kwargs: Per-sample fields. ``source_chunk_ids`` is required. The
            whole mapping is also handed to the prompt builder as document
            metadata, and ``doc_id`` / ``doc_name`` are echoed back in
            ``raw_response``.

    Returns:
        ``{"answer": str, "contexts": [str], "raw_response": {...}}`` where
        ``contexts`` holds the non-empty texts of the cited chunks in the
        order the ids were given.

    Raises:
        ValueError: If no chunk ids are given, a cited id is absent from the
            artifact, none of the cited chunks has text, or no model name is
            available.
    """
    chunk_ids = _resolve_chunk_ids(kwargs.get("source_chunk_ids"))
    chunk_lookup = _load_source_chunks(source_chunks_path)
    missing = [cid for cid in chunk_ids if cid not in chunk_lookup]
    if missing:
        raise ValueError("source_chunk_ids not found in artifact: " + ", ".join(missing))

    contexts: list[str] = []
    for cid in chunk_ids:
        raw_text = chunk_lookup[cid].get("text")
        # A JSON null text must not turn into the literal context "None".
        text = "" if raw_text is None else str(raw_text).strip()
        if text:
            contexts.append(text)
    if not contexts:
        raise ValueError("resolved source chunks contain no usable text.")

    settings = EvaluationSettings()
    # "or ''" keeps an unset judge model on the ValueError path below instead
    # of failing with AttributeError on None.strip().
    target_model = (model or settings.ragas_judge_model or "").strip()
    if not target_model:
        raise ValueError("A model name is required for siemens_pdf_qa adapter.")

    llm_client = client or OpenAI(**settings.openai_client_kwargs)
    completion = llm_client.chat.completions.create(
        model=target_model,
        messages=_build_messages(question, contexts, kwargs),
        temperature=0,
    )
    answer = str(completion.choices[0].message.content or "").strip()
    return {
        "answer": answer,
        "contexts": contexts,
        "raw_response": {
            "resolved_chunk_ids": chunk_ids,
            "doc_id": kwargs.get("doc_id", ""),
            "doc_name": kwargs.get("doc_name", ""),
            "model": target_model,
            "response_text": answer,
        },
    }

View File

@@ -0,0 +1,59 @@
# Siemens PDF 场景设计 Spec
- 日期：2026-06-15
- 状态:已确认,进入实现。
## 1. 目标
基于 `datasets/siemens-pdfs/`（17 个西门子医疗 CT 中文 PDF）跑通完整三步流水线：
```
dataset_build（PDF → 题库）→ offline smoke 评估 → online 评估
```
完全镜像现有 `sample-pdf-*` 模式（方案 A），不改动任何现有文件。
## 2. 参数决策
| 项目 | 值 |
|---|---|
| 输入 PDF | `datasets/siemens-pdfs/*.pdf`（17 个） |
| failure_mode | `skip`(单个文档解析失败不中断整批) |
| max_questions_per_document | 10（共 ~170 题） |
| max_source_chunks_per_question | 3 |
| generation model | `.env` 中的 `DATASET_GENERATOR_MODEL`（qwen3.6-plus） |
| judge model | `.env` 中的 `RAGAS_JUDGE_MODEL`（deepseek-v4-flash） |
| embedding model | `.env` 中的 `RAGAS_EMBEDDING_MODEL`（text-embedding-v3） |
| online answer model | `.env` 中的 `RAGAS_JUDGE_MODEL` |
| metrics | faithfulness / answer_relevancy / context_recall / context_precision |
## 3. 新增文件（5 个）
```
scenarios/siemens_build/siemens-pdf-build.yaml
scenarios/offline/siemens-pdf-offline-smoke.yaml
scenarios/online/siemens-pdf-question-bank-online.yaml
apps/siemens_pdf_qa/__init__.py
apps/siemens_pdf_qa/adapter.py
```
加上辅助脚本:
```
scripts/build_siemens_offline_smoke.py ← 从 build 产物生成 offline smoke CSV
```
## 4. 运行顺序
```
# 步骤 1：dataset build（PDF → 题库草稿 + source_chunks.jsonl）
python main.py --dataset-build-config scenarios/siemens_build/siemens-pdf-build.yaml
# 步骤 2：生成 offline smoke 数据集（一次性脚本，build 跑完后执行）
python scripts/build_siemens_offline_smoke.py
# 步骤 3：offline 评估（用 source chunks 作为 contexts，ground_truth 作为 answer）
python main.py --scenario scenarios/offline/siemens-pdf-offline-smoke.yaml
# 步骤 4：online 评估（实时调用 LLM 生成 answer，再评分）
python main.py --scenario scenarios/online/siemens-pdf-question-bank-online.yaml
```

View File

@@ -3,6 +3,7 @@
from __future__ import annotations from __future__ import annotations
import json import json
import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any from typing import Any
@@ -150,24 +151,39 @@ class OpenAIQuestionGenerator(QuestionGenerator):
max_questions: int, max_questions: int,
max_chunks_per_question: int, max_chunks_per_question: int,
job_name: str, job_name: str,
max_retries: int = 3,
retry_delay: float = 5.0,
) -> list[DraftQuestionSample]: ) -> list[DraftQuestionSample]:
"""Generate draft questions for one parsed document.""" """Generate draft questions for one parsed document, with retry on timeout/server errors."""
prompt = self._build_prompt( prompt = self._build_prompt(
document, document,
max_questions=max_questions, max_questions=max_questions,
max_chunks_per_question=max_chunks_per_question, max_chunks_per_question=max_chunks_per_question,
) )
response = self.client.chat.completions.create( last_exc: Exception | None = None
model=self.model, for attempt in range(1, max_retries + 1):
messages=[ try:
{"role": "system", "content": "You generate structured draft question banks from source documents."}, response = self.client.chat.completions.create(
{"role": "user", "content": prompt}, model=self.model,
], messages=[
response_format={"type": "json_object"}, {"role": "system", "content": "You generate structured draft question banks from source documents."},
) {"role": "user", "content": prompt},
content = response.choices[0].message.content or "{}" ],
payload = self._parse_response_payload(content) response_format={"type": "json_object"},
return [ )
self._build_sample(document=document, payload=item, index=index, job_name=job_name) content = response.choices[0].message.content or "{}"
for index, item in enumerate(payload[:max_questions], start=1) payload = self._parse_response_payload(content)
] return [
self._build_sample(document=document, payload=item, index=index, job_name=job_name)
for index, item in enumerate(payload[:max_questions], start=1)
]
except Exception as exc:
last_exc = exc
if attempt < max_retries:
wait = retry_delay * attempt
doc_name_safe = document.doc_name.encode("ascii", "replace").decode("ascii")
print(f" [warn] generate attempt {attempt}/{max_retries} failed for {doc_name_safe!r}: {exc}. Retrying in {wait:.0f}s...")
time.sleep(wait)
raise RuntimeError(
f"Question generation failed for {document.doc_name!r} after {max_retries} attempts"
) from last_exc

View File

@@ -111,12 +111,32 @@ def run_dataset_build(
continue continue
documents.append(document) documents.append(document)
generated = generator.generate( doc_name_safe = pdf_path.name.encode("ascii", "replace").decode("ascii")
document, print(f" [info] generating questions for: {doc_name_safe}")
max_questions=job.max_questions_per_document, try:
max_chunks_per_question=job.max_source_chunks_per_question, generated = generator.generate(
job_name=job.job_name, document,
) max_questions=job.max_questions_per_document,
max_chunks_per_question=job.max_source_chunks_per_question,
job_name=job.job_name,
)
except Exception as exc:
gen_failure = ParseFailure(file_path=pdf_path.as_posix(), error=f"generation failed: {exc}")
failures.append(gen_failure)
print(f" [warn] skipping {doc_name_safe} after generation failure: {exc}")
if job.failure_mode == "fail":
result = DatasetBuildResult(
job=job,
run_id=run_id,
artifact_paths=artifact_paths,
documents=documents,
draft_samples=draft_samples,
parse_failures=failures,
)
write_dataset_build_artifacts(result)
raise
continue
valid_generated = [] valid_generated = []
for sample in generated: for sample in generated:
errors = validate_draft_sample( errors = validate_draft_sample(
@@ -126,9 +146,9 @@ def run_dataset_build(
) )
if not errors: if not errors:
valid_generated.append(sample) valid_generated.append(sample)
draft_samples.extend( new_samples = dedupe_samples(valid_generated)[: job.max_questions_per_document]
dedupe_samples(valid_generated)[: job.max_questions_per_document] draft_samples.extend(new_samples)
) print(f" [info] {doc_name_safe}: {len(new_samples)} questions generated (total so far: {len(draft_samples)})")
result = DatasetBuildResult( result = DatasetBuildResult(
job=job, job=job,

View File

@@ -0,0 +1 @@
"""Dataset loading and normalization for the RAG evaluation platform."""

View File

@@ -0,0 +1,56 @@
"""Load raw evaluation dataset records from disk.
Supports CSV and JSONL formats. Returns a list of plain dicts — normalization
into NormalizedSample is handled by normalizers.py.
"""
from __future__ import annotations
import csv
import json
from pathlib import Path
from typing import Any
def load_dataset_records(path: Path | str) -> list[dict[str, Any]]:
    """Load raw dataset rows from *path* as plain dicts.

    Files ending in ``.jsonl`` or ``.ndjson`` (case-insensitive) are read as
    JSON Lines; every other extension, ``.csv`` included, is read as CSV.
    Values are returned as stored; any parsing of list-valued CSV columns is
    left to the normalizers.

    Raises:
        FileNotFoundError: If *path* is not an existing regular file.
    """
    file_path = Path(path)
    if not file_path.is_file():
        raise FileNotFoundError(f"Dataset file not found: {file_path}")

    is_json_lines = file_path.suffix.lower() in {".jsonl", ".ndjson"}
    reader = _load_jsonl if is_json_lines else _load_csv
    return reader(file_path)
def _load_csv(path: Path) -> list[dict[str, Any]]:
"""Read a CSV file into a list of row dicts."""
with path.open(encoding="utf-8", newline="") as fh:
reader = csv.DictReader(fh)
return [dict(row) for row in reader]
def _load_jsonl(path: Path) -> list[dict[str, Any]]:
"""Read a JSONL file into a list of record dicts."""
records: list[dict[str, Any]] = []
with path.open(encoding="utf-8") as fh:
for lineno, line in enumerate(fh, 1):
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
except json.JSONDecodeError as exc:
raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc
if not isinstance(obj, dict):
raise ValueError(f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}")
records.append(obj)
return records

View File

@@ -0,0 +1,105 @@
"""Normalize raw dataset records into NormalizedSample and InvalidSample objects.
Handles both offline mode (records already contain answer + contexts) and online
mode (records only contain question + ground_truth; adapter fills the rest).
"""
from __future__ import annotations
import uuid
from typing import Any
from rag_eval.shared.models import InvalidSample, NormalizedSample
from rag_eval.shared.utils import parse_contexts
# Record keys that normalize_records maps onto dedicated NormalizedSample
# attributes. They are excluded from the per-sample ``metadata`` dict; every
# other key of the raw record is copied into ``metadata`` unchanged.
_CORE_FIELDS = {
"sample_id",
"question",
"contexts",
"answer",
"ground_truth",
"scenario",
"language",
"retrieval_config",
}
def _get_str(record: dict[str, Any], key: str, default: str = "") -> str:
"""Return a string field from the record, coercing None/NaN to the default."""
value = record.get(key)
if value is None:
return default
text = str(value).strip()
return default if text.lower() == "nan" else text
def normalize_records(
    records: list[dict[str, Any]],
    mode: str = "offline",
    max_samples: int | None = None,
) -> tuple[list[NormalizedSample], list[InvalidSample]]:
    """Split raw records into normalized samples and rejected samples.

    Args:
        records: Raw row dicts, processed in order.
        mode: When equal to ``"offline"``, a record must carry a non-empty
            ``ground_truth``, ``answer`` and ``contexts``. Any other value
            skips that check.
        max_samples: When not ``None``, only the first *max_samples* records
            are considered; the cut is applied before validation.

    Returns:
        ``(valid, invalid)``. A record without a question is always invalid.
        A record without a ``sample_id`` is given a random 12-character hex
        id. Keys outside ``_CORE_FIELDS`` are passed through as ``metadata``.
    """
    selected = records if max_samples is None else records[:max_samples]
    offline = mode == "offline"
    accepted: list[NormalizedSample] = []
    rejected: list[InvalidSample] = []

    for record in selected:
        sample_id = _get_str(record, "sample_id") or uuid.uuid4().hex[:12]
        question = _get_str(record, "question")
        if not question:
            rejected.append(
                InvalidSample(
                    sample_id=sample_id,
                    error="missing required field: question",
                    raw=record,
                )
            )
            continue

        ground_truth = _get_str(record, "ground_truth")
        contexts = parse_contexts(record.get("contexts"))
        answer = _get_str(record, "answer")

        if offline:
            # Report every missing offline field at once, in a fixed order.
            required = (
                (ground_truth, "missing ground_truth"),
                (answer, "missing answer"),
                (contexts, "missing or empty contexts"),
            )
            problems = [message for present, message in required if not present]
            if problems:
                rejected.append(
                    InvalidSample(
                        sample_id=sample_id,
                        error="; ".join(problems),
                        raw=record,
                    )
                )
                continue

        # Extra columns travel along as opaque metadata for adapters and reporting.
        extras = {
            name: value
            for name, value in record.items()
            if name not in _CORE_FIELDS
        }
        accepted.append(
            NormalizedSample(
                sample_id=sample_id,
                question=question,
                contexts=contexts,
                answer=answer,
                ground_truth=ground_truth,
                scenario=_get_str(record, "scenario"),
                language=_get_str(record, "language"),
                retrieval_config=_get_str(record, "retrieval_config"),
                metadata=extras,
                raw=record,
            )
        )

    return accepted, rejected

View File

@@ -0,0 +1,15 @@
# Offline smoke scenario for the Siemens PDF question bank: scores a pre-built
# dataset directly, with no app adapter configured.
scenario_name: siemens-pdf-offline-smoke
mode: offline
app_adapter: null
# Produced by scripts/build_siemens_offline_smoke.py; run that script after the
# dataset build and before this scenario.
dataset: ../../datasets/normalized/siemens_pdf_offline_smoke.csv
judge_model: deepseek-v4-flash
embedding_model: text-embedding-v3
metrics:
- faithfulness
- answer_relevancy
- context_recall
- context_precision
output_dir: ../../outputs/siemens-pdf-offline-smoke
runtime:
batch_size: 4
# Caps the run at the first 30 records of the dataset.
max_samples: 30

View File

@@ -0,0 +1,22 @@
# Online scenario for the Siemens PDF question bank: answers are generated at
# evaluation time by the siemens_pdf_qa adapter and then scored.
scenario_name: siemens-pdf-question-bank-online
mode: online
# Question bank CSV written by the siemens-pdf-question-bank dataset build.
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
judge_model: deepseek-v4-flash
embedding_model: text-embedding-v3
metrics:
- faithfulness
- answer_relevancy
- context_recall
- context_precision
output_dir: ../../outputs/online/siemens-pdf-question-bank
runtime:
batch_size: 4
app_concurrency: 4
metric_concurrency: 4
max_samples: 50
app_adapter:
type: python
callable: apps.siemens_pdf_qa.adapter:run
# static_kwargs are the keyword arguments of the adapter's run() function.
static_kwargs:
# If latest/ does not contain the file, the adapter falls back to the run
# directory under the artifact root whose name sorts last.
# NOTE(review): the adapter resolves this path with Path.resolve(), i.e.
# relative to the working directory unless the scenario loader rewrites it
# first - confirm which one applies.
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
model: deepseek-v4-flash

View File

@@ -0,0 +1,17 @@
# Dataset build job: parses the Siemens PDFs and generates a draft question
# bank plus the source chunk artifacts used by the offline and online scenarios.
job_name: siemens-pdf-question-bank
input:
path: ../../datasets/siemens-pdfs
glob: "*.pdf"
parser:
provider: aliyun_docmind
# skip: a document that fails is recorded and the build continues with the
# next one instead of aborting.
failure_mode: skip
generation:
output_type: online_question_bank
review_mode: draft_with_manual_review
max_questions_per_document: 10
max_source_chunks_per_question: 3
output:
dataset_path: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
artifact_dir: ../../outputs/dataset-builds/siemens-pdf-question-bank
runtime:
max_documents: 17

View File

@@ -0,0 +1,72 @@
"""Build the Siemens offline smoke dataset from a completed dataset_build run.
Must be run AFTER `python main.py --dataset-build-config
scenarios/siemens_build/siemens-pdf-build.yaml` has completed successfully.
It uses the stable `latest/` alias so you don't need to know the run_id.
Usage:
python scripts/build_siemens_offline_smoke.py
Output:
datasets/normalized/siemens_pdf_offline_smoke.csv
(referenced by scenarios/offline/siemens-pdf-offline-smoke.yaml)
"""
from __future__ import annotations
from pathlib import Path
# ---------------------------------------------------------------------------
# Paths — all anchored at the repository root (the parent of scripts/)
# ---------------------------------------------------------------------------
# This file lives in scripts/, so parents[1] is the repository root.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Draft question bank of the most recent dataset build (read via latest/).
DRAFT_DATASET_PATH = (
REPO_ROOT / "outputs" / "dataset-builds" / "siemens-pdf-question-bank"
/ "latest" / "dataset_draft.csv"
)
# Source chunks of the same build, next to the draft dataset.
SOURCE_CHUNKS_PATH = (
REPO_ROOT / "outputs" / "dataset-builds" / "siemens-pdf-question-bank"
/ "latest" / "source_chunks.jsonl"
)
# Destination CSV; referenced by scenarios/offline/siemens-pdf-offline-smoke.yaml.
OUTPUT_PATH = REPO_ROOT / "datasets" / "normalized" / "siemens_pdf_offline_smoke.csv"
def main() -> None:
    """Turn the latest Siemens build artifacts into the offline smoke CSV.

    Checks that the draft dataset and the source chunks exist, delegates the
    conversion to ``build_offline_smoke_dataset``, then reads the written CSV
    back and prints its row count and, for a non-empty file, the value counts
    of the ``language`` and ``difficulty`` columns (``{}`` for a column that
    is absent).

    Raises:
        FileNotFoundError: If either build artifact is missing.
    """
    if not DRAFT_DATASET_PATH.exists():
        draft_hint = (
            f"Draft dataset not found: {DRAFT_DATASET_PATH}\n"
            "Run the dataset build first:\n"
            " python main.py --dataset-build-config "
            "scenarios/siemens_build/siemens-pdf-build.yaml"
        )
        raise FileNotFoundError(draft_hint)
    if not SOURCE_CHUNKS_PATH.exists():
        chunks_hint = (
            f"Source chunks not found: {SOURCE_CHUNKS_PATH}\n"
            "Run the dataset build first."
        )
        raise FileNotFoundError(chunks_hint)

    # Deferred import: keeps this module importable before rag_eval is fully set up.
    from rag_eval.dataset_builder.offline_converter import build_offline_smoke_dataset

    written_path = build_offline_smoke_dataset(
        draft_dataset_path=DRAFT_DATASET_PATH,
        source_chunks_path=SOURCE_CHUNKS_PATH,
        output_path=OUTPUT_PATH,
    )

    import pandas as pd

    table = pd.read_csv(written_path)
    row_total = len(table)
    print(f"Offline smoke dataset written to: {written_path}")
    print(f"Total rows: {row_total}")
    if row_total > 0:

        def _distribution(column: str) -> dict:
            """Value counts of *column*, or an empty dict when it is absent."""
            if column not in table.columns:
                return {}
            return table[column].value_counts().to_dict()

        lang_counts = _distribution("language")
        diff_counts = _distribution("difficulty")
        print(f"Language distribution: {lang_counts}")
        print(f"Difficulty distribution: {diff_counts}")


if __name__ == "__main__":
    main()

236
scripts/seed_sample_run.py Normal file
View File

@@ -0,0 +1,236 @@
"""Generate a realistic sample evaluation run so the console has demo data.
This writes the standard run artifacts (metadata.json, scores.csv, summary.md,
scenario.snapshot.yaml) under outputs/, exactly mirroring what the reporting
layer produces, but without needing ragas or any network calls. It lets the
report board render immediately for demos and local development.
Usage:
python scripts/seed_sample_run.py
"""
from __future__ import annotations
import csv
import json
from pathlib import Path
# This file lives in scripts/, so parents[1] is the repository root.
REPO_ROOT = Path(__file__).resolve().parents[1]
# The sample run is written to outputs/<SCENARIO_NAME>/<RUN_ID>/.
SCENARIO_NAME = "kba-knowledge-base-offline-baseline"
RUN_ID = "2026-06-15T08-30-00+00-00"
# Model names are only recorded in the artifacts; this script calls no model.
JUDGE_MODEL = "deepseek-distill-qwen-32b"
EMBEDDING_MODEL = "text-embedding-v3"
# Metric names; each one is both a key of every SAMPLES row and a scores.csv column.
METRICS = ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]
# Each row mirrors a scores.csv record: sample fields + metric scores + metadata.
# Scores are hand-tuned to exercise the full UI (greens, yellows, reds, a long
# tail in the distribution, and clear weak groups by difficulty).
# Every row carries the keys sample_id, language, difficulty, question_type,
# question, contexts (a list of strings), answer, ground_truth and one float
# per entry of METRICS.
SAMPLES = [
{
"sample_id": "kba-001", "language": "zh", "difficulty": "easy", "question_type": "fact",
"question": "员工入职满3年可享受多少天年休假",
"contexts": ["员工入司满1年不满10年的年休假5天。", "年休假在每年1月1日起可申请。"],
"answer": "根据规定入职满3年的员工可享受5天年休假。",
"ground_truth": "员工入司满1年不满10年的年休假5天。",
"faithfulness": 0.98, "answer_relevancy": 0.95, "context_recall": 0.97, "context_precision": 0.92,
},
{
"sample_id": "kba-002", "language": "zh", "difficulty": "easy", "question_type": "fact",
"question": "公司报销差旅费的截止提交时间是什么时候?",
"contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"],
"answer": "差旅费需在出差结束后30天内提交报销。",
"ground_truth": "差旅费报销须在出差结束后30天内提交。",
"faithfulness": 0.96, "answer_relevancy": 0.93, "context_recall": 0.90, "context_precision": 0.88,
},
{
"sample_id": "kba-003", "language": "zh", "difficulty": "medium", "question_type": "procedure",
"question": "申请远程办公需要经过哪些审批流程?",
"contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"],
"answer": "需先由直属主管审批再提交人力资源部备案每月不超过8天。",
"ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案每月上限8天。",
"faithfulness": 0.91, "answer_relevancy": 0.88, "context_recall": 0.85, "context_precision": 0.79,
},
{
"sample_id": "kba-004", "language": "en", "difficulty": "medium", "question_type": "fact",
"question": "How many days of paternity leave are employees entitled to?",
"contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."],
"answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.",
"ground_truth": "Employees are entitled to 15 days of paternity leave.",
"faithfulness": 0.89, "answer_relevancy": 0.86, "context_recall": 0.82, "context_precision": 0.74,
},
{
"sample_id": "kba-005", "language": "zh", "difficulty": "medium", "question_type": "comparison",
"question": "正式员工与试用期员工在医疗保险待遇上有何区别?",
"contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"],
"answer": "正式员工额外享受补充医疗保险,试用期员工仅有基础医疗保险。",
"ground_truth": "正式员工在基础医疗保险外另享补充医疗保险,试用期员工仅享基础医疗保险。",
"faithfulness": 0.84, "answer_relevancy": 0.83, "context_recall": 0.78, "context_precision": 0.71,
},
{
"sample_id": "kba-006", "language": "zh", "difficulty": "hard", "question_type": "summary",
"question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。",
"contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"],
"answer": "第三方共享需签保密协议敏感数据须经数据保护官批准记录留存3年。",
"ground_truth": "向第三方共享数据须签署保密协议敏感数据共享须经数据保护官批准且共享记录至少留存3年。",
"faithfulness": 0.79, "answer_relevancy": 0.81, "context_recall": 0.70, "context_precision": 0.62,
},
{
"sample_id": "kba-007", "language": "zh", "difficulty": "hard", "question_type": "procedure",
"question": "跨部门项目预算超支时的审批升级路径是怎样的?",
"contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"],
"answer": "超支10%以内项目经理批10%-20%需总监批超20%需财务委员会审批。",
"ground_truth": "超支10%以内由项目经理审批10%-20%由部门总监审批超过20%须提交财务委员会审批。",
"faithfulness": 0.58, "answer_relevancy": 0.72, "context_recall": 0.55, "context_precision": 0.48,
},
{
"sample_id": "kba-008", "language": "zh", "difficulty": "hard", "question_type": "fact",
"question": "员工持股计划的最低锁定期是多少年?",
"contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"],
"answer": "员工持股计划的最低锁定期为3年。",
"ground_truth": "员工持股计划的最低锁定期为4年。",
"faithfulness": 0.22, "answer_relevancy": 0.65, "context_recall": 0.18, "context_precision": 0.30,
},
{
"sample_id": "kba-009", "language": "en", "difficulty": "hard", "question_type": "comparison",
"question": "What is the difference in notice period between voluntary and involuntary termination?",
"contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."],
"answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.",
"ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.",
"faithfulness": 0.35, "answer_relevancy": 0.70, "context_recall": 0.40, "context_precision": 0.33,
},
{
"sample_id": "kba-010", "language": "zh", "difficulty": "medium", "question_type": "fact",
"question": "公司规定的标准工作时间是每周多少小时?",
"contexts": ["标准工作时间为每周40小时。", "加班需事先申请。"],
"answer": "公司标准工作时间为每周40小时。",
"ground_truth": "公司标准工作时间为每周40小时。",
"faithfulness": 0.99, "answer_relevancy": 0.96, "context_recall": 0.95, "context_precision": 0.90,
},
]
# Two samples that failed normalization, to exercise the invalid count display.
# The keys match the invalid.csv columns (sample_id, error, question); the
# list length is reported as invalid_samples in metadata.json and summary.md.
INVALID_SAMPLES = [
{"sample_id": "kba-011", "error": "missing ground_truth", "question": "公司年会在什么时候举办?"},
{"sample_id": "kba-012", "error": "empty contexts after retrieval", "question": "停车场如何申请月卡?"},
]
def _output_dir() -> Path:
    """Build the path ``<repo>/outputs/<SCENARIO_NAME>/<RUN_ID>`` for the sample run."""
    return REPO_ROOT.joinpath("outputs", SCENARIO_NAME, RUN_ID)
def _write_scores_csv(path: Path) -> None:
    """Write scores.csv: one row per entry of SAMPLES.

    Each row holds the sample fields, one column per metric in METRICS, an
    empty ``error`` column and the run-level columns ``judge_model``,
    ``embedding_model`` and ``run_id``. ``contexts`` is stored as a JSON list
    string with non-ASCII characters kept as-is.

    Args:
        path: Destination file; it is created or overwritten.
    """
    fieldnames = [
        "sample_id", "question", "contexts", "answer", "ground_truth",
        "scenario", "language", "difficulty", "question_type",
        *METRICS, "error", "judge_model", "embedding_model", "run_id",
    ]
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for sample in SAMPLES:
            row = {
                "sample_id": sample["sample_id"],
                "question": sample["question"],
                # Serialize contexts as a JSON list, matching engine CSV output.
                "contexts": json.dumps(sample["contexts"], ensure_ascii=False),
                "answer": sample["answer"],
                "ground_truth": sample["ground_truth"],
                "scenario": SCENARIO_NAME,
                "language": sample["language"],
                "difficulty": sample["difficulty"],
                "question_type": sample["question_type"],
                "error": "",
                "judge_model": JUDGE_MODEL,
                "embedding_model": EMBEDDING_MODEL,
                # The run identifier, the same value metadata.json records;
                # this column previously carried the scenario name by mistake.
                "run_id": RUN_ID,
            }
            row.update((metric, sample[metric]) for metric in METRICS)
            writer.writerow(row)
def _write_invalid_csv(path: Path) -> None:
    """Write invalid.csv: a header row plus one row per entry of INVALID_SAMPLES."""
    columns = ["sample_id", "error", "question"]
    with path.open("w", encoding="utf-8", newline="") as out:
        sink = csv.DictWriter(out, fieldnames=columns)
        sink.writeheader()
        for record in INVALID_SAMPLES:
            sink.writerow(record)
def _metric_mean(metric: str) -> float:
    """Return the average of *metric* over SAMPLES, rounded to 4 decimal places."""
    values = [sample[metric] for sample in SAMPLES]
    return round(sum(values) / len(values), 4)
def _write_metadata(path: Path) -> None:
    """Write metadata.json describing the sample run.

    The file is UTF-8 JSON with a 2-space indent and non-ASCII characters
    kept as-is. Sample counts are taken from SAMPLES and INVALID_SAMPLES.
    """
    payload = dict(
        run_id=RUN_ID,
        scenario_name=SCENARIO_NAME,
        mode="offline",
        judge_model=JUDGE_MODEL,
        embedding_model=EMBEDDING_MODEL,
        started_at="2026-06-15T08:29:12+00:00",
        finished_at="2026-06-15T08:31:45+00:00",
        dataset="datasets/normalized/kba_knowledge_base_baseline.csv",
        valid_samples=len(SAMPLES),
        invalid_samples=len(INVALID_SAMPLES),
    )
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
def _write_summary(path: Path) -> None:
    """Write summary.md: run facts followed by the mean of every metric.

    The output ends with a single trailing newline; each metric mean is
    formatted with four decimal places.
    """
    valid_count = len(SAMPLES)
    invalid_count = len(INVALID_SAMPLES)
    header = [
        f"# {SCENARIO_NAME}",
        "",
        f"- run_id: `{RUN_ID}`",
        "- mode: `offline`",
        f"- total_samples: `{valid_count + invalid_count}`",
        f"- valid_samples: `{valid_count}`",
        f"- invalid_samples: `{invalid_count}`",
        f"- judge_model: `{JUDGE_MODEL}`",
        "",
        "## Metric Means",
        "",
    ]
    metric_lines = [
        f"- {metric}: `{_metric_mean(metric):.4f}`"
        for metric in METRICS
    ]
    path.write_text("\n".join([*header, *metric_lines]) + "\n", encoding="utf-8")
def _write_scenario_snapshot(path: Path) -> None:
    """Write scenario.snapshot.yaml with the scenario name, mode, models and metric list.

    Keys are emitted in insertion order (``sort_keys=False``) and non-ASCII
    characters are written unescaped.
    """
    import yaml

    fields = [
        ("scenario_name", SCENARIO_NAME),
        ("mode", "offline"),
        ("judge_model", JUDGE_MODEL),
        ("embedding_model", EMBEDDING_MODEL),
        ("metrics", METRICS),
    ]
    document = yaml.safe_dump(dict(fields), sort_keys=False, allow_unicode=True)
    path.write_text(document, encoding="utf-8")
def main() -> None:
    """Create the run directory and write every sample artifact into it."""
    run_dir = _output_dir()
    run_dir.mkdir(parents=True, exist_ok=True)

    # Artifact file name -> writer, executed in this order.
    artifact_writers = (
        ("scores.csv", _write_scores_csv),
        ("invalid.csv", _write_invalid_csv),
        ("metadata.json", _write_metadata),
        ("summary.md", _write_summary),
        ("scenario.snapshot.yaml", _write_scenario_snapshot),
    )
    for file_name, write_artifact in artifact_writers:
        write_artifact(run_dir / file_name)

    print(f"Sample run written to: {run_dir}")
    print("Start the console with: python webmain.py")


if __name__ == "__main__":
    main()

113
start.bat Normal file
View File

@@ -0,0 +1,113 @@
@echo off
setlocal

:: Siemens RAGAS Console launcher for cmd.exe.
:: This file must stay ASCII-only: non-ASCII bytes can be misparsed by the
:: batch interpreter depending on the active console code page.

echo.
echo ============================================================
echo Siemens RAGAS Console - Starting...
echo ============================================================
echo.

:: Change to the directory where this script lives (siemens_ragas/)
cd /d "%~dp0"
echo Working directory: %CD%
echo.

:: ----------------------------------------------------------------
:: 1. Check Python
:: ----------------------------------------------------------------
python --version >nul 2>&1
if errorlevel 1 (
    echo [ERROR] Python not found. Please install Python 3.12+ and add it to PATH.
    goto :error
)
for /f "tokens=*" %%v in ('python --version 2^>^&1') do echo [OK] %%v

:: ----------------------------------------------------------------
:: 2. Check FastAPI / uvicorn
::    Packages are installed with "python -m pip" so they land in the same
::    interpreter that the import probe and the server use.
:: ----------------------------------------------------------------
python -c "import fastapi, uvicorn" >nul 2>&1
if errorlevel 1 (
    echo [INFO] Installing fastapi and uvicorn...
    python -m pip install fastapi uvicorn --quiet
    if errorlevel 1 (
        echo [ERROR] Failed to install fastapi/uvicorn.
        echo Run manually: python -m pip install fastapi uvicorn
        goto :error
    )
    echo [OK] fastapi and uvicorn installed.
) else (
    echo [OK] fastapi / uvicorn ready.
)

:: ----------------------------------------------------------------
:: 3. Check ragas version
:: ----------------------------------------------------------------
python -c "import ragas; assert ragas.__version__ == '0.4.3', ragas.__version__" >nul 2>&1
if errorlevel 1 (
    echo [INFO] Installing ragas==0.4.3 ...
    python -m pip install "ragas==0.4.3" --quiet
    if errorlevel 1 (
        echo [WARN] ragas install failed. Dashboard still works; evaluation trigger will show an error.
    ) else (
        echo [OK] ragas 0.4.3 installed.
    )
) else (
    echo [OK] ragas 0.4.3 ready.
)

:: ----------------------------------------------------------------
:: 4. Seed demo data if no runs exist yet
:: ----------------------------------------------------------------
if not exist "outputs\kba-knowledge-base-offline-baseline" (
    echo [INFO] No run data found. Generating demo data...
    python scripts\seed_sample_run.py
    if errorlevel 1 (
        echo [WARN] Demo data generation failed. Dashboard may be empty.
    ) else (
        echo [OK] Demo data generated.
    )
) else (
    echo [OK] Run data found, skipping demo generation.
)

:: ----------------------------------------------------------------
:: 5. Pick an available port
::    The trailing space in the search string keeps ":8800" from also
::    matching longer port numbers such as ":88001".
:: ----------------------------------------------------------------
set PORT=8800
netstat -ano 2>nul | findstr /c:":8800 " | findstr "LISTENING" >nul 2>&1
if not errorlevel 1 (
    echo [WARN] Port 8800 in use, trying 8801...
    set PORT=8801
    netstat -ano 2>nul | findstr /c:":8801 " | findstr "LISTENING" >nul 2>&1
    if not errorlevel 1 (
        echo [ERROR] Ports 8800 and 8801 are both in use.
        echo Run manually: python webmain.py --port 8802
        goto :error
    )
)

echo.
echo ============================================================
echo Console URL : http://127.0.0.1:%PORT%
echo Press Ctrl+C to stop the server
echo ============================================================
echo.

:: Open browser after 2-second delay (non-blocking)
start /b cmd /c "timeout /t 2 >nul && start http://127.0.0.1:%PORT%"

:: Launch uvicorn (blocking - window stays open while server runs)
python webmain.py --host 127.0.0.1 --port %PORT%

echo.
echo Server stopped.
pause
exit /b 0

:error
echo.
echo ============================================================
echo Startup failed. See error above.
echo ============================================================
pause
exit /b 1

111
start.ps1 Normal file
View File

@@ -0,0 +1,111 @@
# start.ps1 — Siemens RAGAS Console launcher for Windows PowerShell
# Usage: Right-click -> "Run with PowerShell", or: powershell -ExecutionPolicy Bypass -File start.ps1

$ErrorActionPreference = "Stop"
Set-Location $PSScriptRoot

function Test-PythonSnippet {
    # Run a Python one-liner silently and report whether it exited with 0.
    param([string]$Code)
    # Under "Stop", Windows PowerShell 5.1 turns redirected stderr of a native
    # command into a terminating error, which would abort the script exactly
    # when the probe fails. Relax the preference for this function scope only.
    $ErrorActionPreference = "Continue"
    & python -c $Code 2>&1 | Out-Null
    return ($LASTEXITCODE -eq 0)
}

function Test-PortListening {
    # Report whether netstat shows a LISTENING socket on the given port.
    param([int]$Port)
    $hits = netstat -ano 2>$null | Select-String ":$Port\s" | Select-String "LISTENING"
    return [bool]$hits
}

Write-Host ""
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host " Siemens RAGAS Console - Starting..." -ForegroundColor Cyan
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host ""
Write-Host "Working directory: $PSScriptRoot"
Write-Host ""

# ----------------------------------------------------------------
# 1. Check Python
# ----------------------------------------------------------------
try {
    $pyver = & python --version 2>&1
    # A launcher stub can exist on PATH without a usable interpreter behind it;
    # the exit code is the reliable signal.
    if ($LASTEXITCODE -ne 0) {
        throw "python exited with code $LASTEXITCODE"
    }
    Write-Host "[OK] $pyver" -ForegroundColor Green
} catch {
    Write-Host "[ERROR] Python not found. Please install Python 3.12+ and add to PATH." -ForegroundColor Red
    Read-Host "Press Enter to exit"
    exit 1
}

# ----------------------------------------------------------------
# 2. Check FastAPI / uvicorn
#    "python -m pip" installs into the interpreter that was just probed.
# ----------------------------------------------------------------
if (-not (Test-PythonSnippet "import fastapi, uvicorn")) {
    Write-Host "[INFO] Installing fastapi and uvicorn..." -ForegroundColor Yellow
    & python -m pip install fastapi uvicorn --quiet
    if ($LASTEXITCODE -ne 0) {
        Write-Host "[ERROR] Failed to install fastapi/uvicorn. Run: python -m pip install fastapi uvicorn" -ForegroundColor Red
        Read-Host "Press Enter to exit"
        exit 1
    }
    Write-Host "[OK] fastapi / uvicorn installed." -ForegroundColor Green
} else {
    Write-Host "[OK] fastapi / uvicorn ready." -ForegroundColor Green
}

# ----------------------------------------------------------------
# 3. Check ragas version
# ----------------------------------------------------------------
if (-not (Test-PythonSnippet "import ragas; assert ragas.__version__ == '0.4.3', ragas.__version__")) {
    Write-Host "[INFO] Installing ragas==0.4.3 (evaluation engine)..." -ForegroundColor Yellow
    & python -m pip install "ragas==0.4.3" --quiet
    if ($LASTEXITCODE -ne 0) {
        Write-Host "[WARN] ragas install failed. Dashboard works; evaluation trigger will show error." -ForegroundColor Yellow
    } else {
        Write-Host "[OK] ragas 0.4.3 installed." -ForegroundColor Green
    }
} else {
    Write-Host "[OK] ragas 0.4.3 ready." -ForegroundColor Green
}

# ----------------------------------------------------------------
# 4. Seed demo data if missing
# ----------------------------------------------------------------
if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
    Write-Host "[INFO] No run data found. Generating demo data..." -ForegroundColor Yellow
    & python scripts\seed_sample_run.py
    if ($LASTEXITCODE -ne 0) {
        Write-Host "[WARN] Demo data generation failed. Dashboard may be empty." -ForegroundColor Yellow
    } else {
        Write-Host "[OK] Demo data generated." -ForegroundColor Green
    }
} else {
    Write-Host "[OK] Run data found, skipping demo generation." -ForegroundColor Green
}

# ----------------------------------------------------------------
# 5. Pick an available port
# ----------------------------------------------------------------
$PORT = 8800
if (Test-PortListening $PORT) {
    Write-Host "[WARN] Port $PORT in use, trying 8801..." -ForegroundColor Yellow
    $PORT = 8801
    if (Test-PortListening $PORT) {
        Write-Host "[ERROR] Ports 8800 and 8801 are both in use." -ForegroundColor Red
        Write-Host " Run manually: python webmain.py --port 8802"
        Read-Host "Press Enter to exit"
        exit 1
    }
}

Write-Host ""
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host " Console URL : http://127.0.0.1:$PORT" -ForegroundColor Green
Write-Host " Press Ctrl+C to stop the server" -ForegroundColor Cyan
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host ""

# Open browser after 2-second delay
Start-Job -ScriptBlock {
    param($port)
    Start-Sleep 2
    Start-Process "http://127.0.0.1:$port"
} -ArgumentList $PORT | Out-Null

# Launch uvicorn (blocking)
& python webmain.py --host 127.0.0.1 --port $PORT

Write-Host ""
Write-Host "Server stopped."
Read-Host "Press Enter to exit"

5
webapp/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
"""Lightweight FastAPI web console layered on top of the rag_eval platform.
This package is additive and non-invasive: it imports rag_eval as a library and
reads run artifacts from disk. It never modifies the core evaluation modules.
"""

1
webapp/api/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""API router package for the evaluation console."""

44
webapp/api/evaluations.py Normal file
View File

@@ -0,0 +1,44 @@
"""Routes for triggering evaluations and polling background task status."""
from __future__ import annotations
from fastapi import APIRouter, HTTPException
from webapp.models import (
TaskStatus,
TriggerEvaluationRequest,
TriggerEvaluationResponse,
)
from webapp.services import scenario_scanner
from webapp.services.task_manager import task_manager
router = APIRouter(prefix="/api/evaluations", tags=["evaluations"])
@router.post("", response_model=TriggerEvaluationResponse)
def trigger_evaluation(request: TriggerEvaluationRequest) -> TriggerEvaluationResponse:
    """Queue a background evaluation for an accepted scenario path.

    Raises:
        HTTPException: 400 when the scenario scanner rejects the path.
    """
    scenario_path = request.scenario_path
    if scenario_scanner.resolve_scenario_path(scenario_path) is None:
        raise HTTPException(
            status_code=400,
            detail=f"无效或不允许的场景路径: {scenario_path}",
        )
    # The task manager receives the path exactly as the client sent it.
    queued_id = task_manager.submit(scenario_path)
    return TriggerEvaluationResponse(task_id=queued_id)
@router.get("/{task_id}", response_model=TaskStatus)
def get_task_status(task_id: str) -> TaskStatus:
    """Look up one evaluation task and return its status payload.

    Raises:
        HTTPException: 404 when the task manager does not know ``task_id``.
    """
    snapshot = task_manager.get(task_id)
    if snapshot is not None:
        return snapshot
    raise HTTPException(status_code=404, detail=f"未找到任务: {task_id}")
@router.get("", response_model=dict)
def list_tasks() -> dict[str, list]:
    """List every task the task manager currently tracks, as plain dicts."""
    serialized = []
    for task in task_manager.list_tasks():
        serialized.append(task.model_dump())
    return {"tasks": serialized}

32
webapp/api/runs.py Normal file
View File

@@ -0,0 +1,32 @@
"""Routes for listing evaluation runs and fetching a single run's report."""
from __future__ import annotations
from fastapi import APIRouter, HTTPException
from webapp.models import RunDetail
from webapp.services import report_builder, run_reader
router = APIRouter(prefix="/api/runs", tags=["runs"])
@router.get("")
def get_runs() -> dict[str, list]:
    """List the summary of every run the run reader can discover."""
    serialized = []
    for summary in run_reader.list_run_summaries():
        serialized.append(summary.model_dump())
    return {"runs": serialized}
@router.get("/{run_id}")
def get_run_detail(run_id: str) -> RunDetail:
    """Assemble the summary and aggregated report for a single run.

    Raises:
        HTTPException: 404 when no run directory matches ``run_id`` or when
            no summary can be built from that directory.
    """
    run_dir = run_reader.find_run_dir(run_id)
    if run_dir is None:
        raise HTTPException(status_code=404, detail=f"未找到运行: {run_id}")

    summary = run_reader.build_run_summary(run_dir)
    if summary is None:
        raise HTTPException(status_code=404, detail=f"运行元数据缺失: {run_id}")

    # The report is aggregated over the metric list resolved for the summary.
    return RunDetail(
        summary=summary,
        report=report_builder.build_report(run_dir, summary.metrics),
    )

16
webapp/api/scenarios.py Normal file
View File

@@ -0,0 +1,16 @@
"""Route for discovering scenario YAML files that can be evaluated."""
from __future__ import annotations
from fastapi import APIRouter
from webapp.services import scenario_scanner
router = APIRouter(prefix="/api/scenarios", tags=["scenarios"])
@router.get("")
def get_scenarios() -> dict[str, list]:
    """List every scenario reported by the scenario scanner, as plain dicts."""
    serialized = []
    for scenario in scenario_scanner.list_scenarios():
        serialized.append(scenario.model_dump())
    return {"scenarios": serialized}

129
webapp/models.py Normal file
View File

@@ -0,0 +1,129 @@
"""Pydantic response models for the evaluation console HTTP API."""
from __future__ import annotations
from typing import Any
from pydantic import BaseModel, Field
class RunSummary(BaseModel):
    """List-view description of one evaluation run.

    Only ``run_id`` and ``scenario_name`` are required; every other field has
    an empty default.
    """

    # Identity.
    run_id: str
    scenario_name: str

    # Values copied from the run's metadata.
    mode: str = ""
    judge_model: str = ""
    embedding_model: str = ""
    started_at: str = ""
    finished_at: str = ""
    dataset: str = ""

    # Sample counts.
    total_samples: int = 0
    valid_samples: int = 0
    invalid_samples: int = 0

    # Metric names and their mean score (None when no mean is available).
    metrics: list[str] = Field(default_factory=list)
    metric_means: dict[str, float | None] = Field(default_factory=dict)

    # Location of the run directory.
    output_path: str = ""
class GroupStat(BaseModel):
    """Metric means for the samples that share one value of a grouping field."""

    # Group value and number of samples in the group.
    key: str
    count: int

    # Metric name -> mean within the group (None when unavailable).
    means: dict[str, float | None] = Field(default_factory=dict)
class DistributionBin(BaseModel):
    """A single histogram bucket for one metric."""

    # Display text for the bucket.
    label: str

    # Bucket edges.
    lower: float
    upper: float

    # Number of samples counted in the bucket.
    count: int
class SampleScore(BaseModel):
    """One sample row as shown in the lowest-score review table."""

    sample_id: str

    # Sample content.
    question: str = ""
    contexts: list[str] = Field(default_factory=list)
    answer: str = ""
    ground_truth: str = ""

    # Sample metadata.
    language: str = ""
    difficulty: str = ""
    question_type: str = ""

    # Per-metric scores and their average (None when no score exists).
    metrics: dict[str, float | None] = Field(default_factory=dict)
    mean_score: float | None = None

    # Error text recorded for the sample, if any.
    error: str = ""
class ReportData(BaseModel):
    """Aggregated report payload for the report detail page.

    Every field defaults to an empty container or string.
    """

    # Metric names and overall means.
    metrics: list[str] = Field(default_factory=list)
    metric_means: dict[str, float | None] = Field(default_factory=dict)

    # Metric name -> histogram buckets.
    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)

    # Grouping field name -> per-group statistics.
    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)

    # Samples surfaced for manual review.
    lowest_samples: list[SampleScore] = Field(default_factory=list)

    # Contents of the run's summary.md.
    summary_markdown: str = ""
class RunDetail(BaseModel):
    """Complete single-run payload: the run summary and its report."""

    summary: RunSummary
    report: ReportData
class ScenarioInfo(BaseModel):
    """Description of one scenario YAML file offered by the console."""

    # Path of the scenario file.
    path: str

    # Values read from the scenario file.
    scenario_name: str = ""
    mode: str = ""
    dataset: str = ""
    judge_model: str = ""
    metrics: list[str] = Field(default_factory=list)

    # Problem description when the file could not be read as a scenario.
    error: str = ""
class TaskStatus(BaseModel):
    """API view of one background evaluation task."""

    # Identity and the scenario the task was started for.
    task_id: str
    scenario_path: str

    # Current state and the log lines captured so far.
    status: str
    logs: list[str] = Field(default_factory=list)

    # Outcome fields; None until a value is recorded.
    run_id: str | None = None
    error: str | None = None

    # Timestamps as strings.
    created_at: str = ""
    finished_at: str = ""
class TriggerEvaluationRequest(BaseModel):
    """Body of the request that starts an evaluation run."""

    # Scenario file to evaluate, as supplied by the client.
    scenario_path: str
class TriggerEvaluationResponse(BaseModel):
    """Reply sent as soon as an evaluation task has been queued."""

    # Identifier used to poll the task's status.
    task_id: str
def jsonable(value: Any) -> Any:
    """Replace NaN/inf floats with None so the payload stays valid JSON.

    Dicts, lists and tuples are walked recursively and rebuilt as the same
    container type. Tuples are included because a non-finite float inside one
    would otherwise reach the JSON encoder untouched. Any other value is
    returned unchanged.

    Args:
        value: Arbitrary payload fragment.

    Returns:
        The sanitized value.
    """
    import math

    if isinstance(value, float):
        return value if math.isfinite(value) else None
    if isinstance(value, dict):
        return {key: jsonable(item) for key, item in value.items()}
    if isinstance(value, list):
        return [jsonable(item) for item in value]
    if isinstance(value, tuple):
        return tuple(jsonable(item) for item in value)
    return value

49
webapp/server.py Normal file
View File

@@ -0,0 +1,49 @@
"""FastAPI application factory for the RAGAS evaluation console.
The app mounts three JSON API routers and serves the single-page static
frontend. It imports rag_eval only lazily (inside the task manager worker), so
the server starts even when the evaluation dependencies are not yet installed.
"""
from __future__ import annotations
from pathlib import Path
from fastapi import FastAPI
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from webapp.api import evaluations, runs, scenarios
STATIC_DIR = Path(__file__).resolve().parent / "static"
def create_app() -> FastAPI:
    """Create the console application with its routers, pages and assets.

    Returns:
        A FastAPI instance exposing the runs, scenarios and evaluations
        routers, a health endpoint, the index page and the /static mount.
    """
    app = FastAPI(
        title="Siemens RAGAS 评估控制台",
        description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
        version="0.1.0",
    )

    for api_module in (runs, scenarios, evaluations):
        app.include_router(api_module.router)

    @app.get("/api/health", tags=["meta"])
    def health() -> dict[str, str]:
        """Answer with a fixed payload so clients can check reachability."""
        return {"status": "ok"}

    @app.get("/", include_in_schema=False)
    def index() -> FileResponse:
        """Return the index.html entry document from the static directory."""
        return FileResponse(STATIC_DIR / "index.html")

    # Assets are served under /static; the JSON API stays under /api.
    static_files = StaticFiles(directory=STATIC_DIR)
    app.mount("/static", static_files, name="static")
    return app


app = create_app()

View File

@@ -0,0 +1 @@
"""Service package for the evaluation console (filesystem readers and task runner)."""

View File

@@ -0,0 +1,188 @@
"""Aggregate a run's per-sample scores into the report payload for the UI.
All aggregation reads only the standard scores.csv produced by the reporting
layer, plus the metric list resolved by run_reader. The output mirrors the
report detail page: metric means, per-metric distribution histograms, grouped
means by difficulty / question_type, and the lowest-scoring samples for review.
"""
from __future__ import annotations
import math
from pathlib import Path
import pandas as pd
from webapp.services.text_utils import parse_contexts
from webapp.models import (
DistributionBin,
GroupStat,
ReportData,
SampleScore,
)
from webapp.services import run_reader
# Number of equal-width buckets used for metric score histograms.
DISTRIBUTION_BIN_COUNT = 5
# Metadata columns that we group samples by when present in the data.
GROUPING_FIELDS = ("difficulty", "question_type", "language")
# How many lowest-scoring samples to surface for manual review.
LOWEST_SAMPLE_COUNT = 10
def _round_or_none(value: float | None) -> float | None:
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
if value is None:
return None
if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
return None
return round(float(value), 4)
def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
    """Compute the mean of each metric column across all scored samples.

    Columns are coerced with ``pd.to_numeric(errors="coerce")`` first, so a
    column that pandas loaded as text (for example because one cell holds a
    non-numeric string) is averaged over its numeric cells instead of raising.

    Args:
        frame: Per-sample scores.
        metrics: Metric column names to average.

    Returns:
        Metric name -> rounded mean, or None when the column is missing or
        holds no numeric value.
    """
    means: dict[str, float | None] = {}
    for metric in metrics:
        if metric not in frame.columns:
            means[metric] = None
            continue
        scores = pd.to_numeric(frame[metric], errors="coerce")
        means[metric] = _round_or_none(scores.mean())
    return means
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
    """Bucket one metric's scores into equal-width histogram bins over [0, 1].

    Bin edges are computed as ``index / DISTRIBUTION_BIN_COUNT`` rather than
    by multiplying an accumulated width: ``3 * 0.2`` evaluates to
    ``0.6000000000000001``, which would push a score of exactly 0.6 into the
    bin below the one it belongs to.

    Args:
        frame: Per-sample scores.
        metric: Column to bucket.

    Returns:
        One bin per bucket, or an empty list when the column is missing.
        Non-numeric cells and scores outside [0, 1] are not counted.
    """
    if metric not in frame.columns:
        return []

    scores = pd.to_numeric(frame[metric], errors="coerce").dropna()
    last_index = DISTRIBUTION_BIN_COUNT - 1
    bins: list[DistributionBin] = []
    for index in range(DISTRIBUTION_BIN_COUNT):
        lower = index / DISTRIBUTION_BIN_COUNT
        upper = (index + 1) / DISTRIBUTION_BIN_COUNT
        at_or_above = scores >= lower
        # Only the final bin is closed on the right, so 1.0 is counted once.
        below = scores <= upper if index == last_index else scores < upper
        bins.append(
            DistributionBin(
                label=f"{lower:.1f}-{upper:.1f}",
                lower=round(lower, 2),
                upper=round(upper, 2),
                count=int((at_or_above & below).sum()),
            )
        )
    return bins
def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupStat]]:
    """Compute per-group metric means for each available grouping field.

    Group keys are the stripped string form of the cell, so values that differ
    only in surrounding whitespace are aggregated together. Empty keys and the
    string "nan" (how missing cells render) are ignored. Metric columns are
    coerced to numeric before averaging so a text-typed column cannot raise.

    Args:
        frame: Per-sample scores and metadata.
        metrics: Metric column names to average within each group.

    Returns:
        Grouping field -> group statistics sorted by key. Fields that are
        absent or have no usable value are omitted.
    """
    groupings: dict[str, list[GroupStat]] = {}
    for field in GROUPING_FIELDS:
        if field not in frame.columns:
            continue
        keys = frame[field].astype(str).str.strip()
        usable = ~keys.isin(["", "nan"])
        # Skip fields that are entirely empty so the UI does not render noise.
        if not usable.any():
            continue

        stats: list[GroupStat] = []
        for key, group in frame[usable].groupby(keys[usable]):
            means = {
                metric: _round_or_none(
                    pd.to_numeric(group[metric], errors="coerce").mean()
                )
                for metric in metrics
                if metric in group.columns
            }
            stats.append(GroupStat(key=str(key), count=int(len(group)), means=means))
        stats.sort(key=lambda item: item.key)
        groupings[field] = stats
    return groupings
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
"""Average a single sample's available metric scores for ranking."""
values = [
float(row[metric])
for metric in metrics
if metric in row and pd.notna(row[metric])
]
if not values:
return None
return sum(values) / len(values)
def _cell_text(row: pd.Series, column: str) -> str:
    """Return a cell as stripped text, or '' when it is absent or NaN.

    Args:
        row: One sample row.
        column: Label to read from the row.
    """
    if column in row:
        cell = row[column]
        if not pd.isna(cell):
            return str(cell).strip()
    return ""
def _lowest_samples(frame: pd.DataFrame, metrics: list[str]) -> list[SampleScore]:
    """Select and shape the lowest-scoring samples for the review table.

    Each sample is ranked by the mean of its usable metric scores. Cells that
    are missing, NaN, or not interpretable as a number are ignored instead of
    raising. Samples with no usable score get a sort key of -1.0 and therefore
    appear first, ahead of every scored sample.

    Args:
        frame: Per-sample scores and metadata.
        metrics: Metric column names used for ranking.

    Returns:
        At most ``LOWEST_SAMPLE_COUNT`` samples in ascending order of mean
        score; an empty list for an empty frame.
    """
    if frame.empty:
        return []

    ranked: list[tuple[float, SampleScore]] = []
    for _, row in frame.iterrows():
        scores: dict[str, float] = {}
        for metric in metrics:
            if metric not in row or pd.isna(row[metric]):
                continue
            # errors="coerce" turns unparseable text into NaN rather than raising.
            value = pd.to_numeric(row[metric], errors="coerce")
            if pd.notna(value):
                scores[metric] = float(value)
        mean_score = sum(scores.values()) / len(scores) if scores else None

        sample = SampleScore(
            sample_id=_cell_text(row, "sample_id") or "",
            question=_cell_text(row, "question"),
            contexts=parse_contexts(row["contexts"]) if "contexts" in row else [],
            answer=_cell_text(row, "answer"),
            ground_truth=_cell_text(row, "ground_truth"),
            language=_cell_text(row, "language"),
            difficulty=_cell_text(row, "difficulty"),
            question_type=_cell_text(row, "question_type"),
            metrics={name: _round_or_none(score) for name, score in scores.items()},
            mean_score=_round_or_none(mean_score),
            error=_cell_text(row, "error"),
        )
        # -1.0 is below any score in [0, 1], so unscored samples are listed
        # first and get reviewed before the scored ones.
        sort_key = mean_score if mean_score is not None else -1.0
        ranked.append((sort_key, sample))

    # Sort on the numeric key only; SampleScore objects are never compared.
    ranked.sort(key=lambda item: item[0])
    return [sample for _, sample in ranked[:LOWEST_SAMPLE_COUNT]]
def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    """Aggregate one run directory into the report payload.

    Args:
        run_dir: Directory holding the run's artifacts.
        metrics: Metric names to report on.

    Returns:
        The full report when scores and metrics are both available; otherwise
        a report carrying only the metric names (each with a None mean) and
        the summary markdown.
    """
    frame = run_reader.read_scores_frame(run_dir)
    summary_markdown = run_reader.read_summary_markdown(run_dir)

    if metrics and not frame.empty:
        distributions = {}
        for metric in metrics:
            if metric in frame.columns:
                distributions[metric] = _distribution(frame, metric)
        return ReportData(
            metrics=metrics,
            metric_means=_metric_means(frame, metrics),
            distributions=distributions,
            groupings=_groupings(frame, metrics),
            lowest_samples=_lowest_samples(frame, metrics),
            summary_markdown=summary_markdown,
        )

    # Nothing to aggregate: keep the metric names but report no values.
    return ReportData(
        metrics=metrics,
        metric_means=dict.fromkeys(metrics),
        summary_markdown=summary_markdown,
    )

View File

@@ -0,0 +1,222 @@
"""Read evaluation run artifacts from disk into API-friendly structures.
A "run" is any directory under the configured output roots that contains a
metadata.json file. This service stays decoupled from rag_eval internals: it
only reads the standard artifact files (metadata.json, scores.csv, summary.md,
scenario.snapshot.yaml) that the reporting layer writes.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
import pandas as pd
import yaml
from webapp.models import RunSummary
# Directory names that commonly hold run outputs, relative to the repo root.
DEFAULT_OUTPUT_ROOTS = ("outputs", "runs")
def _repo_root() -> Path:
    """Return the directory three levels above this file's resolved path."""
    this_file = Path(__file__).resolve()
    return this_file.parents[2]
def _candidate_roots(extra_roots: list[Path] | None = None) -> list[Path]:
    """List the output directories that exist on disk.

    Args:
        extra_roots: Additional directories to consider after the defaults.

    Returns:
        The default roots under the repository root followed by the extra
        roots, keeping only paths that are existing directories.
    """
    base = _repo_root()
    defaults = [base / name for name in DEFAULT_OUTPUT_ROOTS]
    extras = list(extra_roots or [])
    return [candidate for candidate in [*defaults, *extras] if candidate.is_dir()]
def _read_json(path: Path) -> dict[str, Any]:
    """Load a JSON object from a file, returning an empty dict on any failure.

    A file that parses to something other than a JSON object (a list, string,
    number, ...) also yields an empty dict, so callers can rely on dict
    methods such as ``.get`` on the result.

    Args:
        path: File to read as UTF-8 JSON.

    Returns:
        The parsed object, or ``{}`` when the file is unreadable, is not valid
        JSON, or does not hold an object at the top level.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers JSONDecodeError and UnicodeDecodeError.
        return {}
    return payload if isinstance(payload, dict) else {}
def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
    """Read the configured metric list from a scenario snapshot if present.

    Args:
        run_dir: Run directory that may contain scenario.snapshot.yaml.

    Returns:
        The snapshot's ``metrics`` entries as strings. An empty list when the
        file is missing, cannot be read or decoded, is not valid YAML, is not
        a mapping at the top level, or has no list under ``metrics``.
    """
    snapshot = run_dir / "scenario.snapshot.yaml"
    if not snapshot.is_file():
        return []
    try:
        payload = yaml.safe_load(snapshot.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        return []
    # A snapshot whose document is a list or scalar has no .get(); treat it
    # like a missing snapshot instead of raising AttributeError.
    if not isinstance(payload, dict):
        return []
    metrics = payload.get("metrics")
    if isinstance(metrics, list):
        return [str(item) for item in metrics]
    return []
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
    """Collect the directories under the output roots that hold a run.

    Args:
        extra_roots: Additional directories to search after the defaults.

    Returns:
        Each directory whose metadata.json contains a ``scenario_name`` key,
        in discovery order and without repeats.
    """
    found: list[Path] = []
    visited: set[Path] = set()
    for root in _candidate_roots(extra_roots):
        for metadata_path in root.rglob("metadata.json"):
            # Other tooling also writes metadata.json files; a directory only
            # counts as an evaluation run when its metadata names a scenario.
            if "scenario_name" not in _read_json(metadata_path):
                continue
            run_dir = metadata_path.parent
            if run_dir not in visited:
                visited.add(run_dir)
                found.append(run_dir)
    return found
def _metric_means(run_dir: Path, metrics: list[str]) -> dict[str, float | None]:
"""Compute per-metric mean scores from a run's scores.csv."""
scores_path = run_dir / "scores.csv"
if not scores_path.is_file():
return {}
try:
frame = pd.read_csv(scores_path)
except (OSError, ValueError, pd.errors.ParserError):
return {}
means: dict[str, float | None] = {}
for metric in metrics:
if metric in frame.columns:
mean_value = frame[metric].mean(numeric_only=True)
means[metric] = None if pd.isna(mean_value) else round(float(mean_value), 4)
else:
means[metric] = None
return means
def _sample_count(value: Any) -> int:
    """Convert a metadata sample count to int, using 0 for unusable values."""
    try:
        return int(value or 0)
    except (TypeError, ValueError):
        return 0


def build_run_summary(run_dir: Path) -> RunSummary | None:
    """Assemble a RunSummary from one run directory's artifacts.

    The metric list comes from the scenario snapshot when it has one, and is
    otherwise inferred from scores.csv. Sample counts that cannot be read as
    integers count as 0, so one malformed metadata.json cannot break a listing
    of all runs.

    Args:
        run_dir: Directory holding the run's artifacts.

    Returns:
        The summary, or None when metadata.json lacks ``scenario_name``.
    """
    metadata = _read_json(run_dir / "metadata.json")
    if "scenario_name" not in metadata:
        return None

    metrics = _read_metrics_from_snapshot(run_dir)
    if not metrics:
        # Fall back to numeric score columns inferred from the scores file.
        metrics = _infer_metrics_from_scores(run_dir)

    valid = _sample_count(metadata.get("valid_samples", 0))
    invalid = _sample_count(metadata.get("invalid_samples", 0))
    # The folder name stands in when the metadata carries no run_id.
    run_id = str(metadata.get("run_id") or run_dir.name)
    return RunSummary(
        run_id=run_id,
        scenario_name=str(metadata.get("scenario_name", "")),
        mode=str(metadata.get("mode", "")),
        judge_model=str(metadata.get("judge_model", "")),
        embedding_model=str(metadata.get("embedding_model", "")),
        started_at=str(metadata.get("started_at", "")),
        finished_at=str(metadata.get("finished_at", "")),
        dataset=str(metadata.get("dataset", "")),
        total_samples=valid + invalid,
        valid_samples=valid,
        invalid_samples=invalid,
        metrics=metrics,
        metric_means=_metric_means(run_dir, metrics),
        output_path=run_dir.as_posix(),
    )
# scores.csv columns that describe the sample or the run instead of holding a
# metric score; they are excluded when metric names are inferred.
NON_METRIC_COLUMNS = {
    # Sample content
    "sample_id", "question", "contexts", "answer", "ground_truth",
    # Sample / scenario attributes
    "scenario", "language", "retrieval_config", "difficulty", "question_type",
    # Run bookkeeping
    "error", "judge_model", "embedding_model", "run_id",
    # Source document provenance
    "doc_id", "doc_name", "section_path", "page_start", "page_end",
    "source_chunk_ids",
    # Review workflow
    "review_status", "review_notes",
}
def _infer_metrics_from_scores(run_dir: Path) -> list[str]:
    """Guess the metric columns of scores.csv.

    Only the first data row is loaded, so the dtype test below reflects that
    row alone.

    Args:
        run_dir: Run directory that may contain scores.csv.

    Returns:
        Names of numeric columns not listed in ``NON_METRIC_COLUMNS``, in file
        order; an empty list when the file is missing or cannot be parsed.
    """
    scores_path = run_dir / "scores.csv"
    if not scores_path.is_file():
        return []
    try:
        first_row = pd.read_csv(scores_path, nrows=1)
    except (OSError, ValueError, pd.errors.ParserError):
        return []
    return [
        str(column)
        for column in first_row.columns
        if column not in NON_METRIC_COLUMNS
        and pd.api.types.is_numeric_dtype(first_row[column])
    ]
def list_run_summaries(extra_roots: list[Path] | None = None) -> list[RunSummary]:
    """Summarize every discoverable run, newest first.

    Runs are ordered by ``finished_at``, falling back to ``started_at`` when
    the finish time is empty. Directories without a usable summary are left
    out.

    Args:
        extra_roots: Additional directories to search after the defaults.
    """
    collected = [
        summary
        for run_dir in discover_run_dirs(extra_roots)
        if (summary := build_run_summary(run_dir)) is not None
    ]
    return sorted(
        collected,
        key=lambda item: item.finished_at or item.started_at,
        reverse=True,
    )
def find_run_dir(run_id: str, extra_roots: list[Path] | None = None) -> Path | None:
    """Find the first run directory identified by ``run_id``.

    A directory is identified by the ``run_id`` in its metadata.json, or by
    its folder name when the metadata has no run_id.

    Args:
        run_id: Identifier to look for.
        extra_roots: Additional directories to search after the defaults.

    Returns:
        The matching directory, or None when there is no match.
    """

    def _identifier(run_dir: Path) -> str:
        metadata = _read_json(run_dir / "metadata.json")
        return str(metadata.get("run_id") or run_dir.name)

    candidates = discover_run_dirs(extra_roots)
    return next((path for path in candidates if _identifier(path) == run_id), None)
def read_scores_frame(run_dir: Path) -> pd.DataFrame:
    """Read scores.csv from a run directory.

    Args:
        run_dir: Run directory that may contain scores.csv.

    Returns:
        The parsed dataframe, or an empty dataframe when the file is missing
        or cannot be parsed.
    """
    scores_path = run_dir / "scores.csv"
    if scores_path.is_file():
        try:
            return pd.read_csv(scores_path)
        except (OSError, ValueError, pd.errors.ParserError):
            pass
    return pd.DataFrame()
def read_summary_markdown(run_dir: Path) -> str:
    """Return the human-readable summary.md for a run, or an empty string.

    The file is decoded as UTF-8 with undecodable bytes replaced by U+FFFD. A
    strict decode would raise ``UnicodeDecodeError``, which is not an
    ``OSError`` and would escape the handler below.

    Args:
        run_dir: Run directory that may contain summary.md.

    Returns:
        The file's text, or "" when the file is missing or unreadable.
    """
    summary_path = run_dir / "summary.md"
    if not summary_path.is_file():
        return ""
    try:
        return summary_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""

View File

@@ -0,0 +1,84 @@
"""Discover scenario YAML files that can be launched from the console.
Scanning is intentionally tolerant: a malformed scenario file is reported with
an error string rather than aborting the whole listing, so the UI can show the
user which files are runnable and which need fixing.
"""
from __future__ import annotations
from pathlib import Path
import yaml
from webapp.models import ScenarioInfo
def _repo_root() -> Path:
    """Return the directory three levels above this file's resolved path."""
    this_file = Path(__file__).resolve()
    return this_file.parents[2]
def _scenarios_root() -> Path:
    """Return the ``scenarios`` directory located under the repository root."""
    return _repo_root().joinpath("scenarios")
def _summarize_scenario(path: Path) -> ScenarioInfo:
    """Read a scenario file into a compact info object, capturing parse errors.

    Read, decode and YAML errors are reported through the ``error`` field
    rather than raised, so a single bad file cannot abort a scan.
    ``UnicodeDecodeError`` is listed explicitly because it is neither an
    ``OSError`` nor a ``yaml.YAMLError``.

    Args:
        path: Scenario file located under the repository root.

    Returns:
        Scenario details, or an info object carrying only ``path`` and
        ``error`` when the file cannot be used.
    """
    relative = path.relative_to(_repo_root()).as_posix()
    try:
        payload = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
        return ScenarioInfo(path=relative, error=f"无法解析: {exc}")
    if not isinstance(payload, dict):
        return ScenarioInfo(path=relative, error="场景文件格式不是 YAML 映射。")

    metrics = payload.get("metrics")
    metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []
    return ScenarioInfo(
        path=relative,
        scenario_name=str(payload.get("scenario_name", "")),
        mode=str(payload.get("mode", "")),
        dataset=str(payload.get("dataset", "")),
        judge_model=str(payload.get("judge_model", "")),
        metrics=metric_list,
    )
def list_scenarios() -> list[ScenarioInfo]:
    """Return every scenario YAML under scenarios/, sorted by path.

    ``*.yaml`` and ``*.yml`` files are merged before sorting, so the result is
    in one overall path order rather than all ``.yaml`` files followed by all
    ``.yml`` files.

    Returns:
        One info object per scenario file; an empty list when the scenarios
        directory does not exist.
    """
    root = _scenarios_root()
    if not root.is_dir():
        return []
    scenario_paths = sorted({*root.rglob("*.yaml"), *root.rglob("*.yml")})
    return [_summarize_scenario(path) for path in scenario_paths]
def resolve_scenario_path(relative_or_absolute: str) -> Path | None:
    """Resolve a user-supplied scenario path safely within the repository.

    Only existing files located inside the repository's scenarios/ directory
    are accepted, which prevents the trigger endpoint from reading arbitrary
    files. The check runs on the fully resolved path, so ``..`` segments and
    symlinks cannot be used to step outside that directory.

    Args:
        relative_or_absolute: Path from the client; relative paths are taken
            relative to the repository root.

    Returns:
        The resolved path, or None when the input is malformed, lies outside
        scenarios/, or is not an existing file.
    """
    root = _repo_root()
    try:
        candidate = Path(relative_or_absolute)
        resolved = candidate if candidate.is_absolute() else (root / candidate)
        resolved = resolved.resolve()
    except (OSError, ValueError, RuntimeError):
        # The input is untrusted. Besides OSError, resolving can raise
        # ValueError (e.g. an embedded NUL character) and, before Python 3.13,
        # RuntimeError for symlink loops. All of these mean "not usable".
        return None

    scenarios_root = _scenarios_root().resolve()
    if scenarios_root not in resolved.parents and resolved != scenarios_root:
        return None
    if not resolved.is_file():
        return None
    return resolved

View File

@@ -0,0 +1,161 @@
"""In-process background task manager for evaluation runs.
Evaluations run in a thread pool so the FastAPI event loop is never blocked.
The heavy rag_eval / ragas import is performed lazily inside the worker thread,
which keeps the web server bootable even when the evaluation dependencies are
broken — failures then surface as task errors in the UI instead of crashing
startup. This matches the "coarse status + logs" progress decision.
"""
from __future__ import annotations
import io
import threading
import uuid
from concurrent.futures import ThreadPoolExecutor
from contextlib import redirect_stderr, redirect_stdout
from datetime import datetime, timezone
from pathlib import Path
from webapp.models import TaskStatus
def _now_iso() -> str:
    """Format the present moment, in UTC, as an ISO 8601 timestamp string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
class _LineCapture(io.TextIOBase):
    """Text stream that forwards every completed line to a task's log."""

    def __init__(self, sink: "EvaluationTask") -> None:
        """Remember the owning task and start with an empty partial line."""
        self._sink = sink
        self._buffer = ""

    def write(self, text: str) -> int:
        """Accumulate text, emitting each newline-terminated line to the log.

        Returns the number of characters accepted, as the stream protocol
        expects.
        """
        # Everything before the final "\n" is complete; the last piece is the
        # still-open line and is kept for the next write or flush.
        *finished, self._buffer = (self._buffer + text).split("\n")
        for line in finished:
            self._sink.append_log(line)
        return len(text)

    def flush(self) -> None:
        """Emit the pending partial line, if any, and reset the buffer."""
        pending, self._buffer = self._buffer, ""
        if pending:
            self._sink.append_log(pending)
class EvaluationTask:
    """State record for one background evaluation run; mutated as it runs."""

    def __init__(self, task_id: str, scenario_path: str) -> None:
        """Create a task in the ``queued`` state for ``scenario_path``."""
        self._lock = threading.Lock()
        self.task_id = task_id
        self.scenario_path = scenario_path
        self.status = "queued"
        self.logs: list[str] = []
        self.run_id: str | None = None
        self.error: str | None = None
        self.created_at = _now_iso()
        self.finished_at = ""

    def append_log(self, line: str) -> None:
        """Add a single line to the task log while holding the task lock."""
        with self._lock:
            self.logs.append(line)

    def snapshot(self) -> TaskStatus:
        """Build a ``TaskStatus`` from the current field values.

        The fields are read under the task lock and the log list is copied,
        so later log appends do not affect the returned object.
        """
        with self._lock:
            state = {
                "task_id": self.task_id,
                "scenario_path": self.scenario_path,
                "status": self.status,
                "logs": list(self.logs),
                "run_id": self.run_id,
                "error": self.error,
                "created_at": self.created_at,
                "finished_at": self.finished_at,
            }
        return TaskStatus(**state)
class TaskManager:
    """Owns the thread pool and the registry of evaluation tasks."""

    def __init__(self, max_workers: int = 2) -> None:
        """Create a task manager backed by a small thread pool.

        Args:
            max_workers: Maximum number of evaluations executed concurrently.
        """
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._tasks: dict[str, EvaluationTask] = {}
        self._lock = threading.Lock()

    def submit(self, scenario_path: str) -> str:
        """Register and schedule a new evaluation task, returning its id."""
        task_id = uuid.uuid4().hex[:12]
        task = EvaluationTask(task_id=task_id, scenario_path=scenario_path)
        with self._lock:
            self._tasks[task_id] = task
        self._executor.submit(self._run, task)
        return task_id

    def get(self, task_id: str) -> TaskStatus | None:
        """Return a snapshot of one task, or None if the id is unknown."""
        with self._lock:
            task = self._tasks.get(task_id)
        return task.snapshot() if task is not None else None

    def list_tasks(self) -> list[TaskStatus]:
        """Return snapshots of all known tasks, newest first."""
        with self._lock:
            tasks = list(self._tasks.values())
        snapshots = [task.snapshot() for task in tasks]
        snapshots.sort(key=lambda item: item.created_at, reverse=True)
        return snapshots

    def _run(self, task: EvaluationTask) -> None:
        """Execute one evaluation end to end inside a worker thread.

        The outcome is recorded on ``task`` (status, run_id, error, logs)
        rather than raised to the caller.
        """
        task.status = "running"
        task.append_log(f"[{_now_iso()}] 开始评估: {task.scenario_path}")
        capture = _LineCapture(task)
        try:
            # Lazy import keeps the web server bootable if ragas is unavailable.
            task.append_log("加载评估引擎 (rag_eval / ragas)...")
            from rag_eval.execution.runner import run_scenario

            absolute_path = self._to_absolute(task.scenario_path)
            task.append_log(f"运行场景文件: {absolute_path}")
            # NOTE(review): redirect_stdout/redirect_stderr replace the
            # process-wide sys.stdout/sys.stderr, so with more than one worker
            # the output of concurrent runs can land in the wrong task log --
            # confirm whether that is acceptable for this console.
            with redirect_stdout(capture), redirect_stderr(capture):
                result = run_scenario(str(absolute_path))
            capture.flush()
            task.run_id = getattr(result, "run_id", None)
            output_dir = getattr(getattr(result, "scenario", None), "output_dir", "")
            task.append_log(f"[{_now_iso()}] 评估完成。run_id={task.run_id}")
            if output_dir:
                task.append_log(f"结果目录: {output_dir}")
            task.status = "completed"
        except (Exception, SystemExit) as exc:  # noqa: BLE001 - surface any failure to the UI
            # SystemExit is caught as well: the executor would swallow it
            # silently and the task would stay in "running" forever.
            capture.flush()
            error_type = type(exc).__name__
            task.error = f"{error_type}: {exc}"
            task.append_log(f"[{_now_iso()}] 评估失败 [{error_type}]: {exc}")
            task.status = "failed"
        finally:
            task.finished_at = _now_iso()

    def _to_absolute(self, scenario_path: str) -> Path:
        """Resolve a scenario path against the repository root if relative.

        Absolute paths are returned unchanged. Relative ones are joined to the
        directory two levels above this module's directory and resolved.
        """
        candidate = Path(scenario_path)
        if candidate.is_absolute():
            return candidate
        repo_root = Path(__file__).resolve().parents[2]
        return (repo_root / candidate).resolve()
# Module-level singleton shared by the FastAPI routes. It is created when this
# module is imported, using TaskManager's default arguments.
task_manager = TaskManager()

View File

@@ -0,0 +1,47 @@
"""Self-contained text helpers for the web layer.
These intentionally avoid importing from rag_eval so the web server has no
import-time dependency on the evaluation engine (and therefore boots even when
ragas is unavailable). The contexts parser mirrors rag_eval.shared.utils so the
console interprets serialized CSV context columns the same way the engine does.
"""
from __future__ import annotations
import ast
import json
import math
from typing import Any
def parse_contexts(value: Any) -> list[str]:
    """Normalize a context payload into a list of non-empty strings.

    Accepts native lists, JSON/Python-literal serialized lists (as written into
    scores.csv), and plain text, mirroring the engine's own parsing rules.

    Args:
        value: A list, ``None``, a float NaN, or any object whose ``str()``
            form is either a serialized list or free text.

    Returns:
        The stripped, non-empty context strings. Text that is not a serialized
        list is split on blank lines when it has any, and is otherwise
        returned as a single-element list.
    """
    if isinstance(value, list):
        return [str(item).strip() for item in value if str(item).strip()]
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return []
    text = str(value).strip()
    if not text:
        return []
    # Accept serialized lists from CSV exports before falling back to plain text.
    # json.JSONDecodeError is a ValueError. ast.literal_eval can also raise
    # TypeError (e.g. "{[]: 1}"), RecursionError and MemoryError on odd input;
    # any of those simply means "this is not a serialized list".
    not_a_literal = (ValueError, SyntaxError, TypeError, RecursionError, MemoryError)
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(text)
        except not_a_literal:
            continue
        if isinstance(parsed, list):
            return [str(item).strip() for item in parsed if str(item).strip()]
    # Preserve paragraph-style context dumps by splitting on blank lines first.
    if "\n\n" in text:
        chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
        if chunks:
            return chunks
    return [text]

267
webapp/static/css/app.css Normal file
View File

@@ -0,0 +1,267 @@
/* Siemens RAGAS 评估控制台 — 样式表
配色取自西门子品牌色(petrol / 深青)与中性灰,呼应企业语境。 */
:root {
--petrol: #009999;
--petrol-dark: #007a7a;
--ink: #0f1b2d;
--ink-soft: #1a2942;
--slate: #64748b;
--slate-light: #94a3b8;
--line: #e2e8f0;
--bg: #f4f6f9;
--surface: #ffffff;
--good: #16a34a;
--warn: #eab308;
--bad: #dc2626;
--shadow: 0 1px 3px rgba(15, 27, 45, 0.08), 0 1px 2px rgba(15, 27, 45, 0.04);
--radius: 10px;
font-synthesis: none;
}
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: "Segoe UI", "Microsoft YaHei", system-ui, -apple-system, sans-serif;
background: var(--bg);
color: var(--ink);
font-size: 14px;
line-height: 1.5;
}
.app { display: flex; min-height: 100vh; }
/* ---------- 左侧导航 ---------- */
.sidebar {
width: 208px;
flex-shrink: 0;
background: linear-gradient(180deg, var(--ink) 0%, var(--ink-soft) 100%);
color: #cbd5e1;
display: flex;
flex-direction: column;
padding: 20px 14px;
position: sticky;
top: 0;
height: 100vh;
}
.brand { padding: 0 8px 22px; }
.brand-mark {
font-size: 20px; font-weight: 700; letter-spacing: 1px; color: #fff;
}
.brand-sub { font-size: 12px; color: var(--petrol); margin-top: 2px; letter-spacing: 2px; }
.nav { display: flex; flex-direction: column; gap: 4px; flex: 1; }
/* Sidebar navigation buttons. */
.nav-item {
  display: flex; align-items: center; gap: 10px;
  background: transparent; border: none; color: #cbd5e1;
  padding: 10px 12px; border-radius: 8px; cursor: pointer;
  font-size: 14px; text-align: left; width: 100%;
  transition: background 0.15s, color 0.15s;
}
.nav-item:hover { background: rgba(255, 255, 255, 0.06); color: #fff; }
.nav-item.active { background: var(--petrol); color: #fff; }
.nav-item:disabled { opacity: 0.4; cursor: not-allowed; }
.nav-ico { width: 18px; text-align: center; color: var(--petrol); font-weight: 700; }
/* The icon turns white on the active item. This rule was declared twice;
   one copy is enough. */
.nav-item.active .nav-ico { color: #fff; }
.sidebar-foot {
display: flex; align-items: center; gap: 8px;
font-size: 12px; color: var(--slate-light);
padding: 12px 8px 0; border-top: 1px solid rgba(255, 255, 255, 0.08);
}
.dot { width: 8px; height: 8px; border-radius: 50%; background: var(--slate-light); }
.dot.ok { background: var(--good); }
.dot.bad { background: var(--bad); }
/* ---------- 主内容区 ---------- */
.main { flex: 1; display: flex; flex-direction: column; min-width: 0; }
.topbar {
display: flex; align-items: center; justify-content: space-between;
padding: 18px 28px; background: var(--surface); border-bottom: 1px solid var(--line);
position: sticky; top: 0; z-index: 5;
}
.topbar h1 { font-size: 18px; font-weight: 600; }
.view { padding: 24px 28px; }
/* ---------- 按钮 ---------- */
.btn {
border: 1px solid var(--line); background: var(--surface); color: var(--ink);
padding: 8px 16px; border-radius: 8px; cursor: pointer; font-size: 13px;
transition: all 0.15s; font-family: inherit;
}
.btn:hover { border-color: var(--petrol); color: var(--petrol); }
.btn-primary { background: var(--petrol); border-color: var(--petrol); color: #fff; }
.btn-primary:hover { background: var(--petrol-dark); border-color: var(--petrol-dark); color: #fff; }
.btn-primary:disabled { background: var(--slate-light); border-color: var(--slate-light); cursor: not-allowed; }
.btn-ghost { background: transparent; }
/* ---------- 运行列表 ---------- */
.runs-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); gap: 16px; }
.run-card {
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
padding: 16px; cursor: pointer; transition: all 0.15s; box-shadow: var(--shadow);
}
.run-card:hover { border-color: var(--petrol); transform: translateY(-1px); }
.run-card-head { display: flex; justify-content: space-between; align-items: flex-start; gap: 10px; }
.run-card-title { font-size: 15px; font-weight: 600; word-break: break-all; }
.run-card-meta { font-size: 12px; color: var(--slate); margin-top: 6px; line-height: 1.7; }
.run-card-metrics { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 12px; }
/* Compact "metric: value" chip shown on each run card. */
.metric-chip {
  font-size: 12px; padding: 3px 8px; border-radius: 6px; background: var(--bg);
  border: 1px solid var(--line);
}
.metric-chip b { font-variant-numeric: tabular-nums; }
/* app.js tags the value with good / warn / bad / na; without these rules the
   score class had no visible effect on the chip. */
.metric-chip b.good { color: var(--good); }
.metric-chip b.warn { color: var(--warn); }
.metric-chip b.bad { color: var(--bad); }
.metric-chip b.na { color: var(--slate-light); }
/* ---------- 通用面板 ---------- */
.panel {
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
padding: 20px; box-shadow: var(--shadow); margin-bottom: 18px;
}
.panel h2 { font-size: 16px; margin-bottom: 6px; }
.panel-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 12px; }
.muted { color: var(--slate); }
.tiny { font-size: 11px; margin-top: 8px; }
.tight { margin: 0 !important; }
code {
background: var(--bg); border: 1px solid var(--line); border-radius: 4px;
padding: 1px 6px; font-size: 12px; font-family: "Cascadia Code", Consolas, monospace;
}
/* ---------- 新建评估 ---------- */
.scenario-list { display: flex; flex-direction: column; gap: 8px; margin: 16px 0; }
.scenario-item {
display: flex; align-items: center; justify-content: space-between; gap: 12px;
border: 1px solid var(--line); border-radius: 8px; padding: 12px 14px; cursor: pointer;
transition: all 0.15s;
}
.scenario-item:hover { border-color: var(--petrol); background: #f0fbfb; }
.scenario-item.selected { border-color: var(--petrol); background: #e6f7f7; box-shadow: inset 0 0 0 1px var(--petrol); }
.scenario-item.invalid { opacity: 0.55; cursor: not-allowed; }
.scenario-name { font-weight: 600; font-size: 14px; }
.scenario-path { font-size: 12px; color: var(--slate); font-family: monospace; }
.scenario-tags { display: flex; gap: 6px; align-items: center; flex-shrink: 0; }
.tag {
font-size: 11px; padding: 2px 8px; border-radius: 999px; background: var(--bg);
border: 1px solid var(--line); color: var(--slate);
}
.tag.mode-online { background: #eff6ff; color: #1d4ed8; border-color: #bfdbfe; }
.tag.mode-offline { background: #f0fdf4; color: #15803d; border-color: #bbf7d0; }
.run-actions { display: flex; align-items: center; gap: 14px; }
.selected-scenario { font-size: 13px; }
/* ---------- 任务进度 ---------- */
.task-head { display: flex; align-items: center; gap: 12px; margin-bottom: 12px; }
.badge {
font-size: 12px; padding: 3px 10px; border-radius: 999px; font-weight: 600;
background: var(--bg); color: var(--slate); border: 1px solid var(--line);
}
.badge.queued { background: #f1f5f9; color: var(--slate); }
.badge.running { background: #fef9c3; color: #854d0e; border-color: #fde68a; }
.badge.completed { background: #dcfce7; color: #166534; border-color: #bbf7d0; }
.badge.failed { background: #fee2e2; color: #991b1b; border-color: #fecaca; }
.log-box {
background: #0b1220; color: #cbd5e1; border-radius: 8px; padding: 14px;
font-family: "Cascadia Code", Consolas, monospace; font-size: 12px; line-height: 1.7;
max-height: 320px; overflow-y: auto; white-space: pre-wrap; word-break: break-word;
}
.task-actions { margin-top: 12px; }
/* ---------- 报告详情 ---------- */
.report-meta {
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
padding: 14px 18px; display: flex; justify-content: space-between; align-items: center;
flex-wrap: wrap; gap: 10px; box-shadow: var(--shadow); margin-bottom: 18px;
}
.report-meta-title { font-size: 15px; font-weight: 600; }
.report-meta-info { font-size: 12px; color: var(--slate); }
.status-pill { font-size: 12px; font-weight: 600; }
.status-pill.completed { color: var(--good); }
.section-label {
font-size: 12px; font-weight: 600; letter-spacing: 0.5px; color: var(--slate);
text-transform: uppercase; margin: 18px 0 10px;
}
.metric-cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 12px; }
.metric-card {
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
padding: 16px; text-align: center; box-shadow: var(--shadow);
}
.metric-value { font-size: 28px; font-weight: 700; font-variant-numeric: tabular-nums; }
.metric-value.good { color: var(--good); }
.metric-value.warn { color: var(--warn); }
.metric-value.bad { color: var(--bad); }
.metric-value.na { color: var(--slate-light); }
.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
.report-half { margin-bottom: 0; }
.select {
border: 1px solid var(--line); border-radius: 6px; padding: 5px 10px; font-size: 12px;
background: var(--surface); color: var(--ink); font-family: inherit; cursor: pointer;
}
.grouping-tabs { display: flex; gap: 6px; margin-bottom: 10px; flex-wrap: wrap; }
.grouping-tab {
font-size: 12px; padding: 4px 10px; border-radius: 6px; border: 1px solid var(--line);
background: var(--surface); cursor: pointer; color: var(--slate);
}
.grouping-tab.active { background: var(--petrol); color: #fff; border-color: var(--petrol); }
/* Per-group metric means table in the report view. */
table.group-table { width: 100%; border-collapse: collapse; font-size: 12px; }
table.group-table th, table.group-table td { padding: 6px 8px; text-align: left; }
table.group-table th { color: var(--slate); border-bottom: 1px solid var(--line); font-weight: 600; }
table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: tabular-nums; }
/* report.js tags each score cell with good / warn / bad / na; these rules make
   that classification visible. */
table.group-table td.good { color: var(--good); }
table.group-table td.warn { color: var(--warn); }
table.group-table td.bad { color: var(--bad); }
table.group-table td.na { color: var(--slate-light); }
/* 最低分样本表 */
.lowest-table {
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
overflow: hidden; box-shadow: var(--shadow);
}
.lowest-row {
display: grid; grid-template-columns: 90px 1fr auto; gap: 12px; align-items: center;
padding: 11px 16px; border-bottom: 1px solid #f1f5f9; cursor: pointer; transition: background 0.12s;
}
.lowest-row:hover { background: var(--bg); }
.lowest-row .sid { font-size: 12px; color: var(--slate); font-family: monospace; }
.lowest-row .q { font-size: 13px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
.lowest-row .scores { display: flex; gap: 8px; }
.score-badge {
font-size: 12px; padding: 2px 8px; border-radius: 6px; font-variant-numeric: tabular-nums;
font-weight: 600;
}
.score-badge.good { background: #dcfce7; color: #166534; }
.score-badge.warn { background: #fef9c3; color: #854d0e; }
.score-badge.bad { background: #fee2e2; color: #991b1b; }
.score-badge.na { background: var(--bg); color: var(--slate-light); }
.lowest-detail { padding: 0 16px; background: #fcfdfe; border-bottom: 1px solid #f1f5f9; }
.lowest-detail-inner { padding: 14px 0; font-size: 13px; line-height: 1.7; }
.detail-field { margin-bottom: 10px; }
.detail-label { font-size: 12px; color: var(--slate); font-weight: 600; margin-bottom: 3px; }
.detail-context { color: #475569; font-size: 12px; }
.detail-context .ctx-item {
padding: 4px 0; border-bottom: 1px dashed var(--line);
}
.detail-gt { color: var(--good); }
.empty { text-align: center; padding: 60px 20px; color: var(--slate); }
.empty p { margin-bottom: 8px; }
.spinner { display: inline-block; width: 14px; height: 14px; border: 2px solid var(--line);
border-top-color: var(--petrol); border-radius: 50%; animation: spin 0.7s linear infinite;
vertical-align: middle; }
@keyframes spin { to { transform: rotate(360deg); } }
@media (max-width: 880px) {
.report-row { grid-template-columns: 1fr; }
.sidebar { width: 64px; }
.brand-sub, .nav-item span:not(.nav-ico), .sidebar-foot span:last-child { display: none; }
}

118
webapp/static/index.html Normal file
View File

@@ -0,0 +1,118 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Siemens RAGAS 评估控制台</title>
<link rel="stylesheet" href="/static/css/app.css" />
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
</head>
<body>
<div class="app">
<!-- 左侧导航(布局 A) -->
<aside class="sidebar">
<div class="brand">
<div class="brand-mark">RAGAS</div>
<div class="brand-sub">评估控制台</div>
</div>
<nav class="nav">
<button class="nav-item" data-view="runs">
<span class="nav-ico"></span><span>运行列表</span>
</button>
<button class="nav-item" data-view="new">
<span class="nav-ico"></span><span>新建评估</span>
</button>
<button class="nav-item" data-view="report" data-requires-run="1">
<span class="nav-ico"></span><span>报告详情</span>
</button>
</nav>
<div class="sidebar-foot">
<span class="dot" id="health-dot"></span>
<span id="health-text">连接中…</span>
</div>
</aside>
<!-- 主内容区 -->
<main class="main">
<header class="topbar">
<h1 id="view-title">运行列表</h1>
<button class="btn btn-ghost" id="refresh-btn">刷新</button>
</header>
<!-- 运行列表视图 -->
<section class="view" id="view-runs">
<div id="runs-container" class="runs-grid"></div>
<div class="empty" id="runs-empty" hidden>
<p>暂无评估运行。</p>
<p class="muted">从「新建评估」触发一次,或运行示例数据生成脚本:<code>python scripts/seed_sample_run.py</code></p>
</div>
</section>
<!-- 新建评估视图 -->
<section class="view" id="view-new" hidden>
<div class="panel">
<h2>选择场景并运行</h2>
<p class="muted"><code>scenarios/</code> 下选择一个场景配置,点击运行后在下方查看实时状态与日志。</p>
<div class="scenario-list" id="scenario-list"></div>
<div class="run-actions">
<button class="btn btn-primary" id="run-btn" disabled>运行评估</button>
<span class="selected-scenario muted" id="selected-scenario">未选择场景</span>
</div>
</div>
<div class="panel" id="task-panel" hidden>
<div class="task-head">
<h2>评估进度</h2>
<span class="badge" id="task-status">queued</span>
</div>
<pre class="log-box" id="task-log"></pre>
<div class="task-actions">
<button class="btn btn-primary" id="view-report-btn" hidden>查看报告</button>
</div>
</div>
</section>
<!-- 报告详情视图 -->
<section class="view" id="view-report" hidden>
<div class="empty" id="report-empty">
<p>请先从「运行列表」选择一次运行。</p>
</div>
<div id="report-content" hidden>
<!-- 顶部元信息条 -->
<div class="report-meta" id="report-meta"></div>
<!-- ① 指标均值卡片 -->
<div class="section-label">① 指标均值 OVERVIEW</div>
<div class="metric-cards" id="metric-cards"></div>
<!-- ② 分布 + ③ 分组 并排 -->
<div class="report-row">
<div class="panel report-half">
<div class="panel-head">
<div class="section-label tight">② 分数分布</div>
<select id="dist-metric-select" class="select"></select>
</div>
<canvas id="dist-chart" height="160"></canvas>
<p class="muted tiny">暴露长尾失败样本</p>
</div>
<div class="panel report-half">
<div class="section-label tight">③ 分组均值</div>
<div id="grouping-tabs" class="grouping-tabs"></div>
<div id="grouping-table"></div>
<p class="muted tiny">定位薄弱类别</p>
</div>
</div>
<!-- ④ 最低分样本逐条复核 -->
<div class="section-label">④ 最低分样本(点击展开逐条复核)</div>
<div class="lowest-table" id="lowest-table"></div>
</div>
</section>
</main>
</div>
<script src="/static/js/api.js"></script>
<script src="/static/js/report.js"></script>
<script src="/static/js/runner.js"></script>
<script src="/static/js/app.js"></script>
</body>
</html>

46
webapp/static/js/api.js Normal file
View File

@@ -0,0 +1,46 @@
// api.js — 控制台后端 HTTP 接口的轻量封装。
// Thin wrapper around the console backend's HTTP endpoints.
const API = {
  // JSON GET. Rejects with an Error whose message is the server's detail text
  // (or a generic message that includes the status code).
  async get(path) {
    const resp = await fetch(path);
    if (!resp.ok) {
      throw new Error(await API._extractError(resp));
    }
    return resp.json();
  },

  // JSON POST. `body` is serialized as JSON; a missing body is sent as {}.
  async post(path, body) {
    const resp = await fetch(path, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body || {}),
    });
    if (!resp.ok) {
      throw new Error(await API._extractError(resp));
    }
    return resp.json();
  },

  // Turn a `detail` payload into readable text. FastAPI validation errors
  // deliver `detail` as a list of { loc, msg, ... } objects; passing that
  // straight to `new Error()` would display "[object Object]".
  _formatDetail(detail, status) {
    const fallback = `请求失败 (${status})`;
    if (typeof detail === "string") return detail || fallback;
    if (Array.isArray(detail)) {
      const parts = detail.map((item) => {
        if (typeof item === "string") return item;
        if (item && typeof item === "object" && item.msg) {
          const where = Array.isArray(item.loc) ? `${item.loc.join(".")}: ` : "";
          return `${where}${item.msg}`;
        }
        return JSON.stringify(item);
      });
      return parts.join("; ") || fallback;
    }
    if (detail && typeof detail === "object") return JSON.stringify(detail);
    return detail ? String(detail) : fallback;
  },

  // Best-effort extraction of the error text from a failed response; falls
  // back to the generic message when the body is not JSON.
  async _extractError(resp) {
    try {
      const data = await resp.json();
      return API._formatDetail(data && data.detail, resp.status);
    } catch (_e) {
      return API._formatDetail(null, resp.status);
    }
  },

  health() { return API.get("/api/health"); },
  runs() { return API.get("/api/runs"); },
  runDetail(runId) { return API.get(`/api/runs/${encodeURIComponent(runId)}`); },
  scenarios() { return API.get("/api/scenarios"); },
  triggerEvaluation(scenarioPath) {
    return API.post("/api/evaluations", { scenario_path: scenarioPath });
  },
  taskStatus(taskId) { return API.get(`/api/evaluations/${encodeURIComponent(taskId)}`); },
};

152
webapp/static/js/app.js Normal file
View File

@@ -0,0 +1,152 @@
// app.js — 视图路由、运行列表渲染、健康检查。整个控制台的入口编排。
// Entry-point orchestration for the console: view routing, run list rendering
// and the periodic health check.
const App = {
  currentRunId: null,
  // Name of the view currently shown; set by switchView().
  activeView: null,
  views: ["runs", "new", "report"],
  titles: { runs: "运行列表", new: "新建评估", report: "报告详情" },

  // Wire up navigation, show the first view and start the health polling.
  init() {
    document.querySelectorAll(".nav-item").forEach((btn) => {
      btn.addEventListener("click", () => App.switchView(btn.dataset.view));
    });
    document.getElementById("refresh-btn").addEventListener("click", () => App.refreshCurrent());
    Runner.init();
    App.switchView("runs");
    App.checkHealth();
    setInterval(App.checkHealth, 15000);
  },

  // Show one main view, sync the nav highlight and title, then load its data.
  // The report view renders its own placeholder when no run is selected.
  switchView(view) {
    App.views.forEach((name) => {
      const el = document.getElementById(`view-${name}`);
      if (el) el.hidden = name !== view;
    });
    document.querySelectorAll(".nav-item").forEach((btn) => {
      btn.classList.toggle("active", btn.dataset.view === view);
    });
    document.getElementById("view-title").textContent = App.titles[view] || view;
    App.activeView = view;
    if (view === "runs") App.loadRuns();
    if (view === "new") Runner.loadScenarios();
    if (view === "report") Report.render(App.currentRunId);
  },

  // Reload the data of whichever view is currently shown.
  refreshCurrent() {
    App.switchView(App.activeView || "runs");
  },

  // Fetch the run list and render one card per run.
  async loadRuns() {
    const container = document.getElementById("runs-container");
    const empty = document.getElementById("runs-empty");
    // Hide the empty-state first so it cannot linger next to an error message.
    empty.hidden = true;
    container.innerHTML = '<p class="muted">加载中…</p>';
    try {
      const data = await API.runs();
      const runs = data.runs || [];
      container.innerHTML = "";
      if (runs.length === 0) {
        empty.hidden = false;
        return;
      }
      runs.forEach((run) => container.appendChild(App.renderRunCard(run)));
    } catch (err) {
      container.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
    }
  },

  // Format a score with two decimals, or "n/a" when it is not a finite number.
  // This keeps a single malformed value from throwing and losing the whole list.
  formatScore(value) {
    return typeof value === "number" && Number.isFinite(value) ? value.toFixed(2) : "n/a";
  },

  // Build the clickable card element for one run summary.
  renderRunCard(run) {
    const card = document.createElement("div");
    card.className = "run-card";
    card.addEventListener("click", () => {
      App.currentRunId = run.run_id;
      App.enableReportNav();
      App.switchView("report");
    });
    const chips = (run.metrics || [])
      .map((m) => {
        const val = run.metric_means ? run.metric_means[m] : null;
        const cls = App.scoreClass(val);
        const text = App.formatScore(val);
        return `<span class="metric-chip">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
      })
      .join("");
    card.innerHTML = `
      <div class="run-card-head">
        <div class="run-card-title">${App.escape(run.scenario_name || run.run_id)}</div>
      </div>
      <div class="run-card-meta">
        <div>${App.escape(run.mode || "—")} · judge: ${App.escape(run.judge_model || "—")}</div>
        <div>${App.escape(run.valid_samples)} 有效 / ${App.escape(run.invalid_samples)} 无效 · ${App.escape(App.shortTime(run.finished_at))}</div>
      </div>
      <div class="run-card-metrics">${chips}</div>
    `;
    return card;
  },

  // Enable the report nav item (called once a run has been selected).
  enableReportNav() {
    const btn = document.querySelector('.nav-item[data-view="report"]');
    if (btn) btn.disabled = false;
  },

  // Map a score to the good / warn / bad / na colour class.
  scoreClass(value) {
    if (value === null || value === undefined) return "na";
    if (value >= 0.8) return "good";
    if (value >= 0.65) return "warn";
    return "bad";
  },

  // Abbreviate well-known metric names to save horizontal space on cards.
  shortMetric(name) {
    const map = {
      faithfulness: "faith.",
      answer_relevancy: "ans.rel.",
      context_recall: "ctx.recall",
      context_precision: "ctx.prec.",
    };
    return map[name] || name;
  },

  // Trim an ISO timestamp to minute precision for display.
  shortTime(iso) {
    if (!iso) return "—";
    return String(iso).replace("T", " ").slice(0, 16);
  },

  // HTML-escape text for use in element content and in quoted attribute
  // values. Quotes are escaped too, because callers interpolate the result
  // into attributes such as title="...".
  escape(text) {
    const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
    return (text == null ? "" : String(text)).replace(/[&<>"']/g, (ch) => entities[ch]);
  },

  // Ping the backend and update the status dot in the sidebar footer.
  async checkHealth() {
    const dot = document.getElementById("health-dot");
    const label = document.getElementById("health-text");
    try {
      await API.health();
      dot.className = "dot ok";
      label.textContent = "服务正常";
    } catch (_e) {
      dot.className = "dot bad";
      label.textContent = "服务离线";
    }
  },
};
document.addEventListener("DOMContentLoaded", App.init);

258
webapp/static/js/report.js Normal file
View File

@@ -0,0 +1,258 @@
// report.js — 报告详情页渲染:元信息、指标卡片、分布图、分组表、低分样本复核。
const Report = {
distChart: null,
currentDetail: null,
activeGrouping: null,
// 加载并渲染指定运行的完整报告。
async render(runId) {
const empty = document.getElementById("report-empty");
const content = document.getElementById("report-content");
if (!runId) {
empty.hidden = false;
content.hidden = true;
return;
}
empty.hidden = true;
content.hidden = false;
content.style.opacity = "0.4";
try {
const detail = await API.runDetail(runId);
Report.currentDetail = detail;
Report.renderMeta(detail.summary);
Report.renderMetricCards(detail.summary, detail.report);
Report.renderDistribution(detail.report);
Report.renderGroupings(detail.report);
Report.renderLowest(detail.report);
content.style.opacity = "1";
} catch (err) {
empty.hidden = false;
content.hidden = true;
empty.innerHTML = `<p>加载报告失败:${App.escape(err.message)}</p>`;
}
},
// 顶部元信息条。
renderMeta(summary) {
const el = document.getElementById("report-meta");
el.innerHTML = `
<div>
<div class="report-meta-title">${App.escape(summary.scenario_name || summary.run_id)}
<span class="status-pill completed">● completed</span></div>
<div class="report-meta-info">run_id: ${App.escape(summary.run_id)}</div>
</div>
<div class="report-meta-info">
${App.escape(summary.mode || "—")} · judge: ${App.escape(summary.judge_model || "—")}
· ${summary.total_samples} 样本 (${summary.valid_samples} 有效 / ${summary.invalid_samples} 无效)
· ${App.escape(App.shortTime(summary.finished_at))}
</div>
`;
},
// ① 指标均值卡片。
renderMetricCards(summary, report) {
const wrap = document.getElementById("metric-cards");
wrap.innerHTML = "";
const metrics = report.metrics && report.metrics.length ? report.metrics : summary.metrics;
metrics.forEach((metric) => {
const value = report.metric_means ? report.metric_means[metric] : null;
const cls = App.scoreClass(value);
const text = value === null || value === undefined ? "n/a" : value.toFixed(2);
const card = document.createElement("div");
card.className = "metric-card";
card.innerHTML = `
<div class="metric-value ${cls}">${text}</div>
<div class="metric-name">${App.escape(metric)}</div>
`;
wrap.appendChild(card);
});
},
// ② 分数分布直方图(可切换指标)。
renderDistribution(report) {
const select = document.getElementById("dist-metric-select");
const distributions = report.distributions || {};
const metricsWithDist = Object.keys(distributions);
select.innerHTML = "";
if (metricsWithDist.length === 0) {
Report._drawDistChart([], []);
return;
}
metricsWithDist.forEach((metric) => {
const opt = document.createElement("option");
opt.value = metric;
opt.textContent = metric;
select.appendChild(opt);
});
select.onchange = () => Report._updateDistChart(select.value);
Report._updateDistChart(metricsWithDist[0]);
},
// 用选定指标的分箱数据刷新直方图。
_updateDistChart(metric) {
const distributions = Report.currentDetail.report.distributions || {};
const bins = distributions[metric] || [];
const labels = bins.map((b) => b.label);
const counts = bins.map((b) => b.count);
const colors = bins.map((b) => Report._binColor(b.lower));
Report._drawDistChart(labels, counts, colors);
},
// 低分箱偏红、高分箱偏绿,直观暴露长尾。
_binColor(lower) {
if (lower >= 0.8) return "#16a34a";
if (lower >= 0.6) return "#84cc16";
if (lower >= 0.4) return "#eab308";
if (lower >= 0.2) return "#f97316";
return "#dc2626";
},
// 实际绘制 Chart.js 柱状图。
_drawDistChart(labels, counts, colors) {
const canvas = document.getElementById("dist-chart");
if (Report.distChart) Report.distChart.destroy();
Report.distChart = new Chart(canvas, {
type: "bar",
data: {
labels,
datasets: [{ data: counts, backgroundColor: colors || "#009999", borderRadius: 4 }],
},
options: {
responsive: true,
plugins: { legend: { display: false } },
scales: {
y: { beginAtZero: true, ticks: { precision: 0 }, grid: { color: "#f1f5f9" } },
x: { grid: { display: false } },
},
},
});
},
// ③ 分组均值difficulty / question_type / language
renderGroupings(report) {
const tabsEl = document.getElementById("grouping-tabs");
const tableEl = document.getElementById("grouping-table");
const groupings = report.groupings || {};
const fields = Object.keys(groupings);
tabsEl.innerHTML = "";
if (fields.length === 0) {
tableEl.innerHTML = '<p class="muted tiny">数据集未包含可分组字段difficulty / question_type。</p>';
return;
}
const fieldLabels = { difficulty: "难度", question_type: "类型", language: "语言" };
Report.activeGrouping = fields[0];
fields.forEach((field) => {
const tab = document.createElement("button");
tab.className = "grouping-tab" + (field === Report.activeGrouping ? " active" : "");
tab.textContent = fieldLabels[field] || field;
tab.onclick = () => {
Report.activeGrouping = field;
tabsEl.querySelectorAll(".grouping-tab").forEach((t) => t.classList.remove("active"));
tab.classList.add("active");
Report._drawGroupTable(report, field);
};
tabsEl.appendChild(tab);
});
Report._drawGroupTable(report, Report.activeGrouping);
},
// 渲染单个分组字段的均值表。
_drawGroupTable(report, field) {
const tableEl = document.getElementById("grouping-table");
const stats = report.groupings[field] || [];
const metrics = report.metrics || [];
let head = "<tr><th>组</th><th>样本</th>";
metrics.forEach((m) => (head += `<th>${App.escape(App.shortMetric(m))}</th>`));
head += "</tr>";
let body = "";
stats.forEach((stat) => {
body += `<tr><td>${App.escape(stat.key)}</td><td>${stat.count}</td>`;
metrics.forEach((m) => {
const v = stat.means ? stat.means[m] : null;
const cls = App.scoreClass(v);
const text = v === null || v === undefined ? "—" : v.toFixed(2);
body += `<td class="${cls}">${text}</td>`;
});
body += "</tr>";
});
tableEl.innerHTML = `<table class="group-table">${head}${body}</table>`;
},
// ④ 最低分样本逐条复核表(点击展开)。
renderLowest(report) {
const wrap = document.getElementById("lowest-table");
const samples = report.lowest_samples || [];
wrap.innerHTML = "";
if (samples.length === 0) {
wrap.innerHTML = '<div class="lowest-detail-inner" style="padding:16px">暂无可复核样本。</div>';
return;
}
const metrics = report.metrics || [];
samples.forEach((sample, idx) => {
const row = document.createElement("div");
row.className = "lowest-row";
const scoreBadges = metrics
.map((m) => {
const v = sample.metrics ? sample.metrics[m] : null;
const cls = App.scoreClass(v);
const text = v === null || v === undefined ? "—" : v.toFixed(2);
return `<span class="score-badge ${cls}" title="${App.escape(m)}">${text}</span>`;
})
.join("");
row.innerHTML = `
<span class="sid">${App.escape(sample.sample_id)}</span>
<span class="q">${App.escape(sample.question || "—")}</span>
<span class="scores">${scoreBadges}</span>
`;
const detail = document.createElement("div");
detail.className = "lowest-detail";
detail.hidden = true;
detail.innerHTML = Report._detailHtml(sample);
row.addEventListener("click", () => {
detail.hidden = !detail.hidden;
});
wrap.appendChild(row);
wrap.appendChild(detail);
});
},
// Expanded detail for a single sample: question / contexts / answer / ground_truth.
_detailHtml(sample) {
const contexts = (sample.contexts || [])
.map((c, i) => `<div class="ctx-item">[${i + 1}] ${App.escape(c)}</div>`)
.join("");
const errorBlock = sample.error
? `<div class="detail-field"><div class="detail-label">错误 error</div><div style="color:#dc2626">${App.escape(sample.error)}</div></div>`
: "";
return `
<div class="lowest-detail-inner">
<div class="detail-field">
<div class="detail-label">问题 question</div>
<div>${App.escape(sample.question || "—")}</div>
</div>
<div class="detail-field">
<div class="detail-label">检索片段 contexts</div>
<div class="detail-context">${contexts || "(空)"}</div>
</div>
<div class="detail-field">
<div class="detail-label">生成答案 answer</div>
<div>${App.escape(sample.answer || "—")}</div>
</div>
<div class="detail-field">
<div class="detail-label">标准答案 ground_truth</div>
<div class="detail-gt">${App.escape(sample.ground_truth || "—")}</div>
</div>
${errorBlock}
</div>
`;
},
};

133
webapp/static/js/runner.js Normal file
View File

@@ -0,0 +1,133 @@
// runner.js — 新建评估视图:列出场景、触发评估、轮询任务状态与日志。
/**
 * Controller for the "new evaluation" view: lists scenarios, triggers an
 * evaluation, then polls the task for status and logs.
 */
const Runner = {
  // Path of the scenario currently selected in the list (null = none).
  selectedScenario: null,
  // Handle of the active polling interval (null = not polling).
  pollTimer: null,
  // run_id of the most recently completed task; used by the "view report" button.
  lastRunId: null,

  /** Wire up the run button and the "view report" button. */
  init() {
    document.getElementById("run-btn").addEventListener("click", () => Runner.trigger());
    document.getElementById("view-report-btn").addEventListener("click", () => {
      if (Runner.lastRunId) {
        App.currentRunId = Runner.lastRunId;
        App.enableReportNav();
        App.switchView("report");
      }
    });
  },

  /** Fetch the scenarios via API.scenarios() and render them into #scenario-list. */
  async loadScenarios() {
    const list = document.getElementById("scenario-list");
    list.innerHTML = '<p class="muted">加载中…</p>';
    try {
      const data = await API.scenarios();
      const scenarios = data.scenarios || [];
      if (scenarios.length === 0) {
        list.innerHTML = '<p class="muted">未在 scenarios/ 下找到场景文件。</p>';
        return;
      }
      list.innerHTML = "";
      scenarios.forEach((sc) => list.appendChild(Runner.renderScenarioItem(sc)));
    } catch (err) {
      list.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
    }
  },

  /**
   * Build the DOM element for one scenario entry.
   *
   * Entries carrying `sc.error` are marked "invalid" and get no click handler,
   * so they cannot be selected.
   *
   * @param {Object} sc - Scenario descriptor; reads `path`, `scenario_name`,
   *   `mode`, `metrics` and `error`.
   * @returns {HTMLElement} The scenario item element.
   */
  renderScenarioItem(sc) {
    const item = document.createElement("div");
    const invalid = !!sc.error;
    item.className = "scenario-item" + (invalid ? " invalid" : "");
    const modeTag = sc.mode
      ? `<span class="tag mode-${App.escape(sc.mode)}">${App.escape(sc.mode)}</span>`
      : "";
    const metricCount = (sc.metrics || []).length;
    item.innerHTML = `
<div>
<div class="scenario-name">${App.escape(sc.scenario_name || sc.path)}</div>
<div class="scenario-path">${App.escape(sc.path)}</div>
${sc.error ? `<div class="scenario-path" style="color:#dc2626">${App.escape(sc.error)}</div>` : ""}
</div>
<div class="scenario-tags">
${modeTag}
<span class="tag">${metricCount} 指标</span>
</div>
`;
    if (!invalid) {
      item.addEventListener("click", () => {
        document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected"));
        item.classList.add("selected");
        Runner.selectedScenario = sc.path;
        document.getElementById("selected-scenario").textContent = sc.path;
        document.getElementById("run-btn").disabled = false;
      });
    }
    return item;
  },

  /**
   * Trigger an evaluation for the selected scenario and start polling it.
   * The run button stays disabled until the task finishes or triggering fails.
   */
  async trigger() {
    if (!Runner.selectedScenario) return;
    const runBtn = document.getElementById("run-btn");
    runBtn.disabled = true;
    const panel = document.getElementById("task-panel");
    const logBox = document.getElementById("task-log");
    const statusBadge = document.getElementById("task-status");
    const reportBtn = document.getElementById("view-report-btn");
    panel.hidden = false;
    reportBtn.hidden = true;
    logBox.textContent = "";
    Runner._setStatus(statusBadge, "queued");
    try {
      const resp = await API.triggerEvaluation(Runner.selectedScenario);
      Runner.poll(resp.task_id);
    } catch (err) {
      Runner._setStatus(statusBadge, "failed");
      logBox.textContent = `触发失败:${err.message}`;
      runBtn.disabled = false;
    }
  },

  /**
   * Poll a task's status every 1200 ms, refreshing the log box and badge.
   *
   * Polling stops when the task reports "completed" or "failed", or when a
   * status request throws. A tick is skipped while the previous request is
   * still pending, and results that belong to a superseded poll are ignored,
   * so a late response can never overwrite newer state.
   *
   * @param {string} taskId - Identifier passed to API.taskStatus.
   */
  poll(taskId) {
    const logBox = document.getElementById("task-log");
    const statusBadge = document.getElementById("task-status");
    const reportBtn = document.getElementById("view-report-btn");
    const runBtn = document.getElementById("run-btn");
    if (Runner.pollTimer) clearInterval(Runner.pollTimer);

    let inFlight = false;
    // Stop exactly this poll; never touch a timer started by a later poll().
    const stop = () => {
      clearInterval(timer);
      if (Runner.pollTimer === timer) Runner.pollTimer = null;
    };
    const timer = setInterval(async () => {
      // setInterval does not wait for the async callback, so guard against
      // overlapping requests when the server answers slower than the interval.
      if (inFlight) return;
      inFlight = true;
      try {
        const status = await API.taskStatus(taskId);
        if (Runner.pollTimer !== timer) return; // superseded while awaiting
        logBox.textContent = (status.logs || []).join("\n");
        logBox.scrollTop = logBox.scrollHeight;
        Runner._setStatus(statusBadge, status.status);
        if (status.status === "completed" || status.status === "failed") {
          stop();
          runBtn.disabled = false;
          if (status.status === "completed" && status.run_id) {
            Runner.lastRunId = status.run_id;
            reportBtn.hidden = false;
          }
        }
      } catch (err) {
        if (Runner.pollTimer !== timer) return; // superseded while awaiting
        stop();
        logBox.textContent += `\n轮询失败:${err.message}`;
        runBtn.disabled = false;
      } finally {
        inFlight = false;
      }
    }, 1200);
    Runner.pollTimer = timer;
  },

  /**
   * Update the status badge's text and colour class.
   *
   * @param {HTMLElement} badge - Badge element to update.
   * @param {string} status - Status string; also appended as a CSS class.
   */
  _setStatus(badge, status) {
    badge.textContent = status;
    badge.className = "badge " + status;
  },
};

42
webmain.py Normal file
View File

@@ -0,0 +1,42 @@
"""CLI entry point that launches the evaluation console web server.
Run alongside the existing main.py CLI; both share the same rag_eval library
and the same runs/ artifacts. Example:
python webmain.py
python webmain.py --host 0.0.0.0 --port 8800
"""
from __future__ import annotations
import argparse
import uvicorn
def parse_args() -> argparse.Namespace:
"""Parse host/port/reload options for the console server."""
parser = argparse.ArgumentParser(description="Launch the RAGAS evaluation console.")
parser.add_argument("--host", default="127.0.0.1", help="Bind host (default 127.0.0.1).")
parser.add_argument("--port", type=int, default=8800, help="Bind port (default 8800).")
parser.add_argument(
"--reload",
action="store_true",
help="Enable auto-reload for local development.",
)
return parser.parse_args()
def main() -> None:
    """Parse the CLI options and run uvicorn with them."""
    options = parse_args()
    server_kwargs = {
        "host": options.host,
        "port": options.port,
        "reload": options.reload,
    }
    # The application is referenced by its import string, not an imported object.
    uvicorn.run("webapp.server:app", **server_kwargs)
# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()