Compare commits
4 Commits
9cbdc1d95d
...
1ff4a3943a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1ff4a3943a | ||
|
|
75ae7927ad | ||
|
|
1288a366d1 | ||
|
|
e89695e490 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -17,5 +17,7 @@ wheels/
|
|||||||
# outputs
|
# outputs
|
||||||
outputs/
|
outputs/
|
||||||
|
|
||||||
# datasets
|
# datasets — raw/normalized data files (large, not committed)
|
||||||
|
# Note: rag_eval/datasets/ is source code and IS committed (see negation below)
|
||||||
datasets/
|
datasets/
|
||||||
|
!rag_eval/datasets/
|
||||||
6
apps/siemens_pdf_qa/__init__.py
Normal file
6
apps/siemens_pdf_qa/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
"""Siemens PDF question bank adapter for online evaluation.
|
||||||
|
|
||||||
|
Wraps the generic pdf_question_bank adapter with a Siemens-specific system
|
||||||
|
prompt that instructs the model to answer in the same language as the question
|
||||||
|
(Chinese for Chinese CT documentation) and to cite only the provided evidence.
|
||||||
|
"""
|
||||||
170
apps/siemens_pdf_qa/adapter.py
Normal file
170
apps/siemens_pdf_qa/adapter.py
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
"""Online evaluation adapter for the Siemens medical-imaging PDF question bank.
|
||||||
|
|
||||||
|
Functionally identical to apps/pdf_question_bank/adapter.py but uses a
|
||||||
|
Siemens-specific system prompt that:
|
||||||
|
- Instructs the model to answer in the same language as the question
|
||||||
|
(important for Chinese CT documentation).
|
||||||
|
- Emphasises citation of source chunks and refusal when evidence is absent.
|
||||||
|
- Adds domain context (medical imaging / CT terminology).
|
||||||
|
|
||||||
|
The adapter contract is the same as all other adapters:
|
||||||
|
run(question, **kwargs) -> {"answer": str, "contexts": [str], "raw_response": {}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
from rag_eval.settings import EvaluationSettings
|
||||||
|
from rag_eval.shared.utils import parse_contexts
|
||||||
|
|
||||||
|
|
||||||
|
# ── chunk cache (module-level, lives for the process lifetime) ────────────────
|
||||||
|
_CHUNK_CACHE: dict[Path, dict[str, dict[str, Any]]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_source_chunks_path(source_chunks_path: str) -> Path:
|
||||||
|
"""Resolve the chunk artifact path; fall back to the latest timestamped run."""
|
||||||
|
resolved = Path(source_chunks_path).resolve()
|
||||||
|
if resolved.exists():
|
||||||
|
return resolved
|
||||||
|
if resolved.parent.name != "latest":
|
||||||
|
raise FileNotFoundError(resolved)
|
||||||
|
artifact_root = resolved.parent.parent
|
||||||
|
if not artifact_root.exists():
|
||||||
|
raise FileNotFoundError(resolved)
|
||||||
|
candidates = sorted(
|
||||||
|
[d for d in artifact_root.iterdir() if d.is_dir() and d.name != "latest"],
|
||||||
|
key=lambda p: p.name,
|
||||||
|
reverse=True,
|
||||||
|
)
|
||||||
|
for run_dir in candidates:
|
||||||
|
candidate = run_dir / resolved.name
|
||||||
|
if candidate.exists():
|
||||||
|
return candidate
|
||||||
|
raise FileNotFoundError(resolved)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_source_chunks(source_chunks_path: str) -> dict[str, dict[str, Any]]:
    """Load source chunks keyed by ``chunk_id``, caching the result per file.

    The path is first resolved with ``_resolve_source_chunks_path``; the parsed
    lookup is stored in the module-level ``_CHUNK_CACHE`` under that resolved
    path, so each artifact file is parsed at most once per process.

    Blank lines are skipped. When two rows share a ``chunk_id`` the later row
    replaces the earlier one.

    Raises:
        FileNotFoundError: propagated from path resolution.
        json.JSONDecodeError: when a non-blank line is not valid JSON.
        ValueError: when a row is not a JSON object or has no ``chunk_id``.
    """
    resolved = _resolve_source_chunks_path(source_chunks_path)
    cached = _CHUNK_CACHE.get(resolved)
    if cached is not None:
        return cached

    lookup: dict[str, dict[str, Any]] = {}
    with resolved.open(encoding="utf-8") as fh:
        for lineno, line in enumerate(fh, 1):
            text = line.strip()
            if not text:
                continue
            payload = json.loads(text)
            # A valid-JSON row that is not an object (list, string, number)
            # has no .get(); report it with the same context as a missing id
            # instead of failing with an AttributeError.
            if not isinstance(payload, dict):
                raise ValueError(
                    f"source_chunks.jsonl row {lineno} is not a JSON object: {resolved}"
                )
            chunk_id = str(payload.get("chunk_id", "")).strip()
            if not chunk_id:
                raise ValueError(f"source_chunks.jsonl row {lineno} missing chunk_id: {resolved}")
            lookup[chunk_id] = payload

    # Only cache after the whole file parsed successfully.
    _CHUNK_CACHE[resolved] = lookup
    return lookup
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_chunk_ids(raw: Any) -> list[str]:
    """Turn the ``source_chunk_ids`` column value into a list of usable ids.

    The raw value is handed to ``parse_contexts`` and falsy entries are
    dropped from its result.

    Raises:
        ValueError: when no id remains after filtering.
    """
    chunk_ids = list(filter(None, parse_contexts(raw)))
    if chunk_ids:
        return chunk_ids
    raise ValueError("source_chunk_ids is required for siemens_pdf_qa adapter.")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_messages(
|
||||||
|
question: str,
|
||||||
|
contexts: list[str],
|
||||||
|
metadata: dict[str, Any],
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Build a Siemens-domain grounded prompt for the answer model."""
|
||||||
|
evidence_lines = [f"[chunk {i}] {ctx}" for i, ctx in enumerate(contexts, 1)]
|
||||||
|
meta_lines = [
|
||||||
|
f"doc_name: {metadata.get('doc_name', '')}",
|
||||||
|
f"section_path: {metadata.get('section_path', '')}",
|
||||||
|
f"page_range: {metadata.get('page_start', '')}–{metadata.get('page_end', '')}",
|
||||||
|
]
|
||||||
|
# Siemens-specific system prompt: bilingual awareness, medical domain, strict grounding
|
||||||
|
system_prompt = (
|
||||||
|
"你是西门子医疗影像知识库的问答助手(Siemens Healthineers CT Knowledge Base QA)。"
|
||||||
|
"请严格根据下方【证据片段】回答问题,不得使用片段之外的任何知识。"
|
||||||
|
"若证据不足以回答,请明确说明「根据现有资料无法回答」。"
|
||||||
|
"请用与问题相同的语言(中文或英文)作答,简洁准确,必要时引用片段编号。"
|
||||||
|
)
|
||||||
|
user_prompt = "\n".join([
|
||||||
|
"【问题】",
|
||||||
|
question,
|
||||||
|
"",
|
||||||
|
"【文档元信息】",
|
||||||
|
*meta_lines,
|
||||||
|
"",
|
||||||
|
"【证据片段】",
|
||||||
|
*evidence_lines,
|
||||||
|
"",
|
||||||
|
"请基于以上证据片段作答。",
|
||||||
|
])
|
||||||
|
return [
|
||||||
|
{"role": "system", "content": system_prompt},
|
||||||
|
{"role": "user", "content": user_prompt},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def run(
    question: str,
    *,
    source_chunks_path: str,
    model: str | None = None,
    client: OpenAI | None = None,
    **kwargs: Any,
) -> dict[str, Any]:
    """Answer one question from its cited source chunks (adapter entry point).

    Args:
        question: The question to answer.
        source_chunks_path: Path to the chunk artifact, resolved and loaded via
            ``_load_source_chunks``.
        model: Answer model name; falls back to ``settings.ragas_judge_model``
            when falsy.
        client: Client to use; a new ``OpenAI`` client is built from
            ``settings.openai_client_kwargs`` when falsy.
        **kwargs: Per-sample fields. ``source_chunk_ids`` is required;
            ``doc_id`` and ``doc_name`` are echoed in ``raw_response``; the
            whole mapping is passed to ``_build_messages`` as metadata.

    Returns:
        A dict with ``answer``, ``contexts`` (the non-empty chunk texts, in the
        order of the requested ids) and ``raw_response``.

    Raises:
        ValueError: when chunk ids are missing or unknown, no chunk has usable
            text, or no model name is available.
    """
    requested_ids = _resolve_chunk_ids(kwargs.get("source_chunk_ids"))
    chunks_by_id = _load_source_chunks(source_chunks_path)

    unknown_ids = [cid for cid in requested_ids if cid not in chunks_by_id]
    if unknown_ids:
        raise ValueError("source_chunk_ids not found in artifact: " + ", ".join(unknown_ids))

    # Keep only chunks whose text is non-empty after stripping whitespace.
    stripped_texts = [
        str(chunks_by_id[cid].get("text", "")).strip() for cid in requested_ids
    ]
    contexts = [text for text in stripped_texts if text]
    if not contexts:
        raise ValueError("resolved source chunks contain no usable text.")

    settings = EvaluationSettings()
    target_model = (model or settings.ragas_judge_model).strip()
    if not target_model:
        raise ValueError("A model name is required for siemens_pdf_qa adapter.")

    llm_client = client or OpenAI(**settings.openai_client_kwargs)
    messages = _build_messages(question, contexts, kwargs)
    completion = llm_client.chat.completions.create(
        model=target_model,
        messages=messages,
        temperature=0,
    )
    # message.content may be None; normalise to a stripped string.
    answer = str(completion.choices[0].message.content or "").strip()

    raw_response = {
        "resolved_chunk_ids": requested_ids,
        "doc_id": kwargs.get("doc_id", ""),
        "doc_name": kwargs.get("doc_name", ""),
        "model": target_model,
        "response_text": answer,
    }
    return {"answer": answer, "contexts": contexts, "raw_response": raw_response}
|
||||||
59
docs/superpowers/specs/2026-06-15-siemens-scenario-design.md
Normal file
59
docs/superpowers/specs/2026-06-15-siemens-scenario-design.md
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
# Siemens PDF 场景设计 Spec
|
||||||
|
|
||||||
|
- 日期:2026-06-15
|
||||||
|
- 状态:已确认,进入实现。
|
||||||
|
|
||||||
|
## 1. 目标
|
||||||
|
|
||||||
|
基于 `datasets/siemens-pdfs/`(17 个西门子医疗 CT 中文 PDF),跑通完整三步流水线:
|
||||||
|
|
||||||
|
```
|
||||||
|
dataset_build(PDF→题库)→ offline smoke 评估 → online 评估
|
||||||
|
```
|
||||||
|
|
||||||
|
完全镜像现有 `sample-pdf-*` 模式(方案 A),不改动任何现有文件。
|
||||||
|
|
||||||
|
## 2. 参数决策
|
||||||
|
|
||||||
|
| 项目 | 值 |
|
||||||
|
|---|---|
|
||||||
|
| 输入 PDF | `datasets/siemens-pdfs/*.pdf`(17 个) |
|
||||||
|
| failure_mode | `skip`(单个文档解析失败不中断整批) |
|
||||||
|
| max_questions_per_document | 10(共 ~170 题) |
|
||||||
|
| max_source_chunks_per_question | 3 |
|
||||||
|
| generation model | `.env` 的 `DATASET_GENERATOR_MODEL`(qwen3.6-plus) |
|
||||||
|
| judge model | `.env` 的 `RAGAS_JUDGE_MODEL`(deepseek-v4-flash) |
|
||||||
|
| embedding model | `.env` 的 `RAGAS_EMBEDDING_MODEL`(text-embedding-v3) |
|
||||||
|
| online answer model | `.env` 的 `RAGAS_JUDGE_MODEL` |
|
||||||
|
| metrics | faithfulness / answer_relevancy / context_recall / context_precision |
|
||||||
|
|
||||||
|
## 3. 新增文件(5 个)
|
||||||
|
|
||||||
|
```
|
||||||
|
scenarios/siemens_build/siemens-pdf-build.yaml
|
||||||
|
scenarios/offline/siemens-pdf-offline-smoke.yaml
|
||||||
|
scenarios/online/siemens-pdf-question-bank-online.yaml
|
||||||
|
apps/siemens_pdf_qa/__init__.py
|
||||||
|
apps/siemens_pdf_qa/adapter.py
|
||||||
|
```
|
||||||
|
|
||||||
|
加上辅助脚本:
|
||||||
|
```
|
||||||
|
scripts/build_siemens_offline_smoke.py ← 从 build 产物生成 offline smoke CSV
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4. 运行顺序
|
||||||
|
|
||||||
|
```
|
||||||
|
# 步骤 1:dataset build(PDF → 题库草稿 + source_chunks.jsonl)
|
||||||
|
python main.py --dataset-build-config scenarios/siemens_build/siemens-pdf-build.yaml
|
||||||
|
|
||||||
|
# 步骤 2:生成 offline smoke 数据集(一次性脚本,build 跑完后执行)
|
||||||
|
python scripts/build_siemens_offline_smoke.py
|
||||||
|
|
||||||
|
# 步骤 3:offline 评估(用 source chunks 作为 contexts,ground_truth 作为 answer)
|
||||||
|
python main.py --scenario scenarios/offline/siemens-pdf-offline-smoke.yaml
|
||||||
|
|
||||||
|
# 步骤 4:online 评估(实时调用 LLM 生成 answer,再评分)
|
||||||
|
python main.py --scenario scenarios/online/siemens-pdf-question-bank-online.yaml
|
||||||
|
```
|
||||||
@@ -3,6 +3,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
@@ -150,13 +151,18 @@ class OpenAIQuestionGenerator(QuestionGenerator):
|
|||||||
max_questions: int,
|
max_questions: int,
|
||||||
max_chunks_per_question: int,
|
max_chunks_per_question: int,
|
||||||
job_name: str,
|
job_name: str,
|
||||||
|
max_retries: int = 3,
|
||||||
|
retry_delay: float = 5.0,
|
||||||
) -> list[DraftQuestionSample]:
|
) -> list[DraftQuestionSample]:
|
||||||
"""Generate draft questions for one parsed document."""
|
"""Generate draft questions for one parsed document, with retry on timeout/server errors."""
|
||||||
prompt = self._build_prompt(
|
prompt = self._build_prompt(
|
||||||
document,
|
document,
|
||||||
max_questions=max_questions,
|
max_questions=max_questions,
|
||||||
max_chunks_per_question=max_chunks_per_question,
|
max_chunks_per_question=max_chunks_per_question,
|
||||||
)
|
)
|
||||||
|
last_exc: Exception | None = None
|
||||||
|
for attempt in range(1, max_retries + 1):
|
||||||
|
try:
|
||||||
response = self.client.chat.completions.create(
|
response = self.client.chat.completions.create(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
messages=[
|
messages=[
|
||||||
@@ -171,3 +177,13 @@ class OpenAIQuestionGenerator(QuestionGenerator):
|
|||||||
self._build_sample(document=document, payload=item, index=index, job_name=job_name)
|
self._build_sample(document=document, payload=item, index=index, job_name=job_name)
|
||||||
for index, item in enumerate(payload[:max_questions], start=1)
|
for index, item in enumerate(payload[:max_questions], start=1)
|
||||||
]
|
]
|
||||||
|
except Exception as exc:
|
||||||
|
last_exc = exc
|
||||||
|
if attempt < max_retries:
|
||||||
|
wait = retry_delay * attempt
|
||||||
|
doc_name_safe = document.doc_name.encode("ascii", "replace").decode("ascii")
|
||||||
|
print(f" [warn] generate attempt {attempt}/{max_retries} failed for {doc_name_safe!r}: {exc}. Retrying in {wait:.0f}s...")
|
||||||
|
time.sleep(wait)
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Question generation failed for {document.doc_name!r} after {max_retries} attempts"
|
||||||
|
) from last_exc
|
||||||
|
|||||||
@@ -111,12 +111,32 @@ def run_dataset_build(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
|
doc_name_safe = pdf_path.name.encode("ascii", "replace").decode("ascii")
|
||||||
|
print(f" [info] generating questions for: {doc_name_safe}")
|
||||||
|
try:
|
||||||
generated = generator.generate(
|
generated = generator.generate(
|
||||||
document,
|
document,
|
||||||
max_questions=job.max_questions_per_document,
|
max_questions=job.max_questions_per_document,
|
||||||
max_chunks_per_question=job.max_source_chunks_per_question,
|
max_chunks_per_question=job.max_source_chunks_per_question,
|
||||||
job_name=job.job_name,
|
job_name=job.job_name,
|
||||||
)
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
gen_failure = ParseFailure(file_path=pdf_path.as_posix(), error=f"generation failed: {exc}")
|
||||||
|
failures.append(gen_failure)
|
||||||
|
print(f" [warn] skipping {doc_name_safe} after generation failure: {exc}")
|
||||||
|
if job.failure_mode == "fail":
|
||||||
|
result = DatasetBuildResult(
|
||||||
|
job=job,
|
||||||
|
run_id=run_id,
|
||||||
|
artifact_paths=artifact_paths,
|
||||||
|
documents=documents,
|
||||||
|
draft_samples=draft_samples,
|
||||||
|
parse_failures=failures,
|
||||||
|
)
|
||||||
|
write_dataset_build_artifacts(result)
|
||||||
|
raise
|
||||||
|
continue
|
||||||
|
|
||||||
valid_generated = []
|
valid_generated = []
|
||||||
for sample in generated:
|
for sample in generated:
|
||||||
errors = validate_draft_sample(
|
errors = validate_draft_sample(
|
||||||
@@ -126,9 +146,9 @@ def run_dataset_build(
|
|||||||
)
|
)
|
||||||
if not errors:
|
if not errors:
|
||||||
valid_generated.append(sample)
|
valid_generated.append(sample)
|
||||||
draft_samples.extend(
|
new_samples = dedupe_samples(valid_generated)[: job.max_questions_per_document]
|
||||||
dedupe_samples(valid_generated)[: job.max_questions_per_document]
|
draft_samples.extend(new_samples)
|
||||||
)
|
print(f" [info] {doc_name_safe}: {len(new_samples)} questions generated (total so far: {len(draft_samples)})")
|
||||||
|
|
||||||
result = DatasetBuildResult(
|
result = DatasetBuildResult(
|
||||||
job=job,
|
job=job,
|
||||||
|
|||||||
1
rag_eval/datasets/__init__.py
Normal file
1
rag_eval/datasets/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Dataset loading and normalization for the RAG evaluation platform."""
|
||||||
56
rag_eval/datasets/loader.py
Normal file
56
rag_eval/datasets/loader.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
"""Load raw evaluation dataset records from disk.
|
||||||
|
|
||||||
|
Supports CSV and JSONL formats. Returns a list of plain dicts — normalization
|
||||||
|
into NormalizedSample is handled by normalizers.py.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset_records(path: Path | str) -> list[dict[str, Any]]:
    """Read raw dataset records from a CSV or JSONL file.

    Files ending in ``.jsonl`` or ``.ndjson`` (case-insensitive) are read as
    JSON Lines; every other extension, including ``.csv``, is read as CSV.
    Rows are returned as plain dicts without further parsing.

    Raises:
        FileNotFoundError: when ``path`` is not an existing regular file.
    """
    file_path = Path(path)
    if not file_path.is_file():
        raise FileNotFoundError(f"Dataset file not found: {file_path}")

    # CSV is both the explicit choice for ".csv" and the fallback for
    # unknown extensions, so only the JSON Lines case needs detecting.
    is_json_lines = file_path.suffix.lower() in {".jsonl", ".ndjson"}
    if is_json_lines:
        return _load_jsonl(file_path)
    return _load_csv(file_path)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_csv(path: Path) -> list[dict[str, Any]]:
|
||||||
|
"""Read a CSV file into a list of row dicts."""
|
||||||
|
with path.open(encoding="utf-8", newline="") as fh:
|
||||||
|
reader = csv.DictReader(fh)
|
||||||
|
return [dict(row) for row in reader]
|
||||||
|
|
||||||
|
|
||||||
|
def _load_jsonl(path: Path) -> list[dict[str, Any]]:
|
||||||
|
"""Read a JSONL file into a list of record dicts."""
|
||||||
|
records: list[dict[str, Any]] = []
|
||||||
|
with path.open(encoding="utf-8") as fh:
|
||||||
|
for lineno, line in enumerate(fh, 1):
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
obj = json.loads(line)
|
||||||
|
except json.JSONDecodeError as exc:
|
||||||
|
raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc
|
||||||
|
if not isinstance(obj, dict):
|
||||||
|
raise ValueError(f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}")
|
||||||
|
records.append(obj)
|
||||||
|
return records
|
||||||
105
rag_eval/datasets/normalizers.py
Normal file
105
rag_eval/datasets/normalizers.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""Normalize raw dataset records into NormalizedSample and InvalidSample objects.
|
||||||
|
|
||||||
|
Handles both offline mode (records already contain answer + contexts) and online
|
||||||
|
mode (records only contain question + ground_truth; adapter fills the rest).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rag_eval.shared.models import InvalidSample, NormalizedSample
|
||||||
|
from rag_eval.shared.utils import parse_contexts
|
||||||
|
|
||||||
|
# Fields we always strip from the raw record before storing it in metadata.
|
||||||
|
_CORE_FIELDS = {
|
||||||
|
"sample_id",
|
||||||
|
"question",
|
||||||
|
"contexts",
|
||||||
|
"answer",
|
||||||
|
"ground_truth",
|
||||||
|
"scenario",
|
||||||
|
"language",
|
||||||
|
"retrieval_config",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_str(record: dict[str, Any], key: str, default: str = "") -> str:
|
||||||
|
"""Return a string field from the record, coercing None/NaN to the default."""
|
||||||
|
value = record.get(key)
|
||||||
|
if value is None:
|
||||||
|
return default
|
||||||
|
text = str(value).strip()
|
||||||
|
return default if text.lower() == "nan" else text
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_records(
    records: list[dict[str, Any]],
    mode: str = "offline",
    max_samples: int | None = None,
) -> tuple[list[NormalizedSample], list[InvalidSample]]:
    """Split raw record dicts into normalized samples and rejected samples.

    Args:
        records: Raw rows, e.g. as returned by the dataset loader.
        mode: With ``"offline"`` a record must carry ``ground_truth``,
            ``answer`` and non-empty ``contexts``; any other value skips those
            checks (the online adapter supplies answer and contexts later).
        max_samples: When not ``None``, only ``records[:max_samples]`` is
            processed.

    Returns:
        ``(valid, invalid)``. A record without a ``sample_id`` receives a
        random 12-character hex id. Every record lacking a question is
        rejected regardless of mode.
    """
    selected = records if max_samples is None else records[:max_samples]
    enforce_complete = mode == "offline"

    valid: list[NormalizedSample] = []
    invalid: list[InvalidSample] = []

    for raw in selected:
        sample_id = _get_str(raw, "sample_id") or uuid.uuid4().hex[:12]

        question = _get_str(raw, "question")
        if not question:
            rejected = InvalidSample(
                sample_id=sample_id,
                error="missing required field: question",
                raw=raw,
            )
            invalid.append(rejected)
            continue

        ground_truth = _get_str(raw, "ground_truth")
        contexts = parse_contexts(raw.get("contexts"))
        answer = _get_str(raw, "answer")

        if enforce_complete:
            # Order fixes the order of messages in the joined error string.
            requirements = (
                (ground_truth, "missing ground_truth"),
                (answer, "missing answer"),
                (contexts, "missing or empty contexts"),
            )
            problems = [message for value, message in requirements if not value]
            if problems:
                rejected = InvalidSample(
                    sample_id=sample_id,
                    error="; ".join(problems),
                    raw=raw,
                )
                invalid.append(rejected)
                continue

        # Columns outside the core schema travel along as opaque metadata.
        extras = {}
        for key, value in raw.items():
            if key not in _CORE_FIELDS:
                extras[key] = value

        normalized = NormalizedSample(
            sample_id=sample_id,
            question=question,
            contexts=contexts,
            answer=answer,
            ground_truth=ground_truth,
            scenario=_get_str(raw, "scenario"),
            language=_get_str(raw, "language"),
            retrieval_config=_get_str(raw, "retrieval_config"),
            metadata=extras,
            raw=raw,
        )
        valid.append(normalized)

    return valid, invalid
|
||||||
15
scenarios/offline/siemens-pdf-offline-smoke.yaml
Normal file
15
scenarios/offline/siemens-pdf-offline-smoke.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
scenario_name: siemens-pdf-offline-smoke
|
||||||
|
mode: offline
|
||||||
|
app_adapter: null
|
||||||
|
dataset: ../../datasets/normalized/siemens_pdf_offline_smoke.csv
|
||||||
|
judge_model: deepseek-v4-flash
|
||||||
|
embedding_model: text-embedding-v3
|
||||||
|
metrics:
|
||||||
|
- faithfulness
|
||||||
|
- answer_relevancy
|
||||||
|
- context_recall
|
||||||
|
- context_precision
|
||||||
|
output_dir: ../../outputs/siemens-pdf-offline-smoke
|
||||||
|
runtime:
|
||||||
|
batch_size: 4
|
||||||
|
max_samples: 30
|
||||||
22
scenarios/online/siemens-pdf-question-bank-online.yaml
Normal file
22
scenarios/online/siemens-pdf-question-bank-online.yaml
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
scenario_name: siemens-pdf-question-bank-online
|
||||||
|
mode: online
|
||||||
|
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
|
||||||
|
judge_model: deepseek-v4-flash
|
||||||
|
embedding_model: text-embedding-v3
|
||||||
|
metrics:
|
||||||
|
- faithfulness
|
||||||
|
- answer_relevancy
|
||||||
|
- context_recall
|
||||||
|
- context_precision
|
||||||
|
output_dir: ../../outputs/online/siemens-pdf-question-bank
|
||||||
|
runtime:
|
||||||
|
batch_size: 4
|
||||||
|
app_concurrency: 4
|
||||||
|
metric_concurrency: 4
|
||||||
|
max_samples: 50
|
||||||
|
app_adapter:
|
||||||
|
type: python
|
||||||
|
callable: apps.siemens_pdf_qa.adapter:run
|
||||||
|
static_kwargs:
|
||||||
|
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
|
||||||
|
model: deepseek-v4-flash
|
||||||
17
scenarios/siemens_build/siemens-pdf-build.yaml
Normal file
17
scenarios/siemens_build/siemens-pdf-build.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
job_name: siemens-pdf-question-bank
|
||||||
|
input:
|
||||||
|
path: ../../datasets/siemens-pdfs
|
||||||
|
glob: "*.pdf"
|
||||||
|
parser:
|
||||||
|
provider: aliyun_docmind
|
||||||
|
failure_mode: skip
|
||||||
|
generation:
|
||||||
|
output_type: online_question_bank
|
||||||
|
review_mode: draft_with_manual_review
|
||||||
|
max_questions_per_document: 10
|
||||||
|
max_source_chunks_per_question: 3
|
||||||
|
output:
|
||||||
|
dataset_path: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
|
||||||
|
artifact_dir: ../../outputs/dataset-builds/siemens-pdf-question-bank
|
||||||
|
runtime:
|
||||||
|
max_documents: 17
|
||||||
72
scripts/build_siemens_offline_smoke.py
Normal file
72
scripts/build_siemens_offline_smoke.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
"""Build the Siemens offline smoke dataset from a completed dataset_build run.
|
||||||
|
|
||||||
|
Must be run AFTER `python main.py --dataset-build-config
|
||||||
|
scenarios/siemens_build/siemens-pdf-build.yaml` has completed successfully.
|
||||||
|
|
||||||
|
It uses the stable `latest/` alias so you don't need to know the run_id.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/build_siemens_offline_smoke.py
|
||||||
|
|
||||||
|
Output:
|
||||||
|
datasets/normalized/siemens_pdf_offline_smoke.csv
|
||||||
|
(referenced by scenarios/offline/siemens-pdf-offline-smoke.yaml)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Paths — all relative to the siemens_ragas/ repository root
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Repository root: the directory one level above this script's own directory.
REPO_ROOT = Path(__file__).resolve().parents[1]

# The "latest" alias directory of the Siemens dataset build artifacts.
_LATEST_BUILD_DIR = REPO_ROOT.joinpath(
    "outputs", "dataset-builds", "siemens-pdf-question-bank", "latest"
)

# Inputs read from the build, and the CSV this script produces.
DRAFT_DATASET_PATH = _LATEST_BUILD_DIR / "dataset_draft.csv"
SOURCE_CHUNKS_PATH = _LATEST_BUILD_DIR / "source_chunks.jsonl"
OUTPUT_PATH = REPO_ROOT.joinpath("datasets", "normalized", "siemens_pdf_offline_smoke.csv")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Build the offline smoke CSV from the latest Siemens build artifacts.

    Checks that the draft dataset and the source chunks exist, converts them
    with ``build_offline_smoke_dataset``, then prints the output path, the row
    count and, for a non-empty result, the value counts of the ``language``
    and ``difficulty`` columns (an empty dict when a column is absent).

    Raises:
        FileNotFoundError: when either build artifact is missing.
    """
    prerequisites = (
        (
            DRAFT_DATASET_PATH,
            f"Draft dataset not found: {DRAFT_DATASET_PATH}\n"
            "Run the dataset build first:\n"
            " python main.py --dataset-build-config "
            "scenarios/siemens_build/siemens-pdf-build.yaml",
        ),
        (
            SOURCE_CHUNKS_PATH,
            f"Source chunks not found: {SOURCE_CHUNKS_PATH}\n"
            "Run the dataset build first.",
        ),
    )
    for required_path, message in prerequisites:
        if not required_path.exists():
            raise FileNotFoundError(message)

    # Imported lazily so that importing this script does not require rag_eval.
    from rag_eval.dataset_builder.offline_converter import build_offline_smoke_dataset

    output = build_offline_smoke_dataset(
        draft_dataset_path=DRAFT_DATASET_PATH,
        source_chunks_path=SOURCE_CHUNKS_PATH,
        output_path=OUTPUT_PATH,
    )

    import pandas as pd

    frame = pd.read_csv(output)
    row_count = len(frame)
    print(f"Offline smoke dataset written to: {output}")
    print(f"Total rows: {row_count}")
    if row_count > 0:
        if "language" in frame.columns:
            lang_counts = frame["language"].value_counts().to_dict()
        else:
            lang_counts = {}
        if "difficulty" in frame.columns:
            diff_counts = frame["difficulty"].value_counts().to_dict()
        else:
            diff_counts = {}
        print(f"Language distribution: {lang_counts}")
        print(f"Difficulty distribution: {diff_counts}")


if __name__ == "__main__":
    main()
|
||||||
236
scripts/seed_sample_run.py
Normal file
236
scripts/seed_sample_run.py
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
"""Generate a realistic sample evaluation run so the console has demo data.
|
||||||
|
|
||||||
|
This writes the standard run artifacts (metadata.json, scores.csv, summary.md,
|
||||||
|
scenario.snapshot.yaml) under outputs/, exactly mirroring what the reporting
|
||||||
|
layer produces, but without needing ragas or any network calls. It lets the
|
||||||
|
report board render immediately for demos and local development.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/seed_sample_run.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Repository root: the directory one level above this script's own directory.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Identity of the seeded run; artifacts go under outputs/<scenario>/<run id>.
SCENARIO_NAME = "kba-knowledge-base-offline-baseline"
RUN_ID = "2026-06-15T08-30-00+00-00"

# Model names recorded with the seeded scores.
JUDGE_MODEL = "deepseek-distill-qwen-32b"
EMBEDDING_MODEL = "text-embedding-v3"

# Metric columns, in the order they appear in scores.csv.
METRICS = [
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
]
|
||||||
|
|
||||||
|
# Each row mirrors a scores.csv record: sample fields + metric scores + metadata.
|
||||||
|
# Scores are hand-tuned to exercise the full UI (greens, yellows, reds, a long
|
||||||
|
# tail in the distribution, and clear weak groups by difficulty).
|
||||||
|
# Ten scored demo samples. Each dict carries the sample fields, two grouping
# columns (difficulty, question_type) and one score per metric.
SAMPLES = [
    # --- easy: all four scores at or above 0.88 ---
    {
        "sample_id": "kba-001", "language": "zh", "difficulty": "easy", "question_type": "fact",
        "question": "员工入职满3年可享受多少天年休假?",
        "contexts": ["员工入司满1年不满10年的,年休假5天。", "年休假在每年1月1日起可申请。"],
        "answer": "根据规定,入职满3年的员工可享受5天年休假。",
        "ground_truth": "员工入司满1年不满10年的,年休假5天。",
        "faithfulness": 0.98, "answer_relevancy": 0.95, "context_recall": 0.97, "context_precision": 0.92,
    },
    {
        "sample_id": "kba-002", "language": "zh", "difficulty": "easy", "question_type": "fact",
        "question": "公司报销差旅费的截止提交时间是什么时候?",
        "contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"],
        "answer": "差旅费需在出差结束后30天内提交报销。",
        "ground_truth": "差旅费报销须在出差结束后30天内提交。",
        "faithfulness": 0.96, "answer_relevancy": 0.93, "context_recall": 0.90, "context_precision": 0.88,
    },
    # --- medium: mid-range scores ---
    {
        "sample_id": "kba-003", "language": "zh", "difficulty": "medium", "question_type": "procedure",
        "question": "申请远程办公需要经过哪些审批流程?",
        "contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"],
        "answer": "需先由直属主管审批,再提交人力资源部备案,每月不超过8天。",
        "ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案,每月上限8天。",
        "faithfulness": 0.91, "answer_relevancy": 0.88, "context_recall": 0.85, "context_precision": 0.79,
    },
    {
        "sample_id": "kba-004", "language": "en", "difficulty": "medium", "question_type": "fact",
        "question": "How many days of paternity leave are employees entitled to?",
        "contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."],
        "answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.",
        "ground_truth": "Employees are entitled to 15 days of paternity leave.",
        "faithfulness": 0.89, "answer_relevancy": 0.86, "context_recall": 0.82, "context_precision": 0.74,
    },
    {
        "sample_id": "kba-005", "language": "zh", "difficulty": "medium", "question_type": "comparison",
        "question": "正式员工与试用期员工在医疗保险待遇上有何区别?",
        "contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"],
        "answer": "正式员工额外享受补充医疗保险,试用期员工仅有基础医疗保险。",
        "ground_truth": "正式员工在基础医疗保险外另享补充医疗保险,试用期员工仅享基础医疗保险。",
        "faithfulness": 0.84, "answer_relevancy": 0.83, "context_recall": 0.78, "context_precision": 0.71,
    },
    # --- hard: the lowest scores in the set ---
    {
        "sample_id": "kba-006", "language": "zh", "difficulty": "hard", "question_type": "summary",
        "question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。",
        "contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"],
        "answer": "第三方共享需签保密协议,敏感数据须经数据保护官批准,记录留存3年。",
        "ground_truth": "向第三方共享数据须签署保密协议,敏感数据共享须经数据保护官批准,且共享记录至少留存3年。",
        "faithfulness": 0.79, "answer_relevancy": 0.81, "context_recall": 0.70, "context_precision": 0.62,
    },
    {
        # The answer's ">20%" clause has no counterpart in the contexts.
        "sample_id": "kba-007", "language": "zh", "difficulty": "hard", "question_type": "procedure",
        "question": "跨部门项目预算超支时的审批升级路径是怎样的?",
        "contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"],
        "answer": "超支10%以内项目经理批,10%-20%需总监批,超20%需财务委员会审批。",
        "ground_truth": "超支10%以内由项目经理审批,10%-20%由部门总监审批,超过20%须提交财务委员会审批。",
        "faithfulness": 0.58, "answer_relevancy": 0.72, "context_recall": 0.55, "context_precision": 0.48,
    },
    {
        # Contexts do not mention the stock plan; the answer (3 years)
        # disagrees with the ground truth (4 years).
        "sample_id": "kba-008", "language": "zh", "difficulty": "hard", "question_type": "fact",
        "question": "员工持股计划的最低锁定期是多少年?",
        "contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"],
        "answer": "员工持股计划的最低锁定期为3年。",
        "ground_truth": "员工持股计划的最低锁定期为4年。",
        "faithfulness": 0.22, "answer_relevancy": 0.65, "context_recall": 0.18, "context_precision": 0.30,
    },
    {
        # Contexts cover only the voluntary notice period.
        "sample_id": "kba-009", "language": "en", "difficulty": "hard", "question_type": "comparison",
        "question": "What is the difference in notice period between voluntary and involuntary termination?",
        "contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."],
        "answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.",
        "ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.",
        "faithfulness": 0.35, "answer_relevancy": 0.70, "context_recall": 0.40, "context_precision": 0.33,
    },
    # --- a high-scoring medium sample closes the list ---
    {
        "sample_id": "kba-010", "language": "zh", "difficulty": "medium", "question_type": "fact",
        "question": "公司规定的标准工作时间是每周多少小时?",
        "contexts": ["标准工作时间为每周40小时。", "加班需事先申请。"],
        "answer": "公司标准工作时间为每周40小时。",
        "ground_truth": "公司标准工作时间为每周40小时。",
        "faithfulness": 0.99, "answer_relevancy": 0.96, "context_recall": 0.95, "context_precision": 0.90,
    },
]
|
||||||
|
|
||||||
|
# Two samples that failed normalization, to exercise the invalid count display.
|
||||||
|
# Unscored rows for invalid.csv; keys match that file's three columns
# (sample_id, error, question).
INVALID_SAMPLES = [
    {"sample_id": "kba-011", "error": "missing ground_truth", "question": "公司年会在什么时候举办?"},
    {"sample_id": "kba-012", "error": "empty contexts after retrieval", "question": "停车场如何申请月卡?"},
]
|
||||||
|
|
||||||
|
|
||||||
|
def _output_dir() -> Path:
    """Build the run directory path that receives the sample artifacts.

    The location is ``REPO_ROOT / "outputs" / SCENARIO_NAME / RUN_ID``.
    """
    outputs_root = REPO_ROOT / "outputs"
    return outputs_root / SCENARIO_NAME / RUN_ID
|
||||||
|
|
||||||
|
|
||||||
|
def _write_scores_csv(path: Path) -> None:
    """Write scores.csv with one row per entry in SAMPLES.

    Each row carries the sample fields, one column per metric in METRICS, and
    the run-level metadata columns (judge model, embedding model, run id).

    Args:
        path: Destination file; it is created or overwritten.
    """
    fieldnames = [
        "sample_id", "question", "contexts", "answer", "ground_truth",
        "scenario", "language", "difficulty", "question_type",
        *METRICS, "error", "judge_model", "embedding_model", "run_id",
    ]
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for sample in SAMPLES:
            row = {
                "sample_id": sample["sample_id"],
                "question": sample["question"],
                # Serialize contexts as a JSON list, matching engine CSV output.
                "contexts": json.dumps(sample["contexts"], ensure_ascii=False),
                "answer": sample["answer"],
                "ground_truth": sample["ground_truth"],
                "scenario": SCENARIO_NAME,
                "language": sample["language"],
                "difficulty": sample["difficulty"],
                "question_type": sample["question_type"],
                "error": "",
                "judge_model": JUDGE_MODEL,
                "embedding_model": EMBEDDING_MODEL,
                # Use the run identifier, not the scenario name, so this column
                # agrees with the run_id written to metadata.json and summary.md.
                "run_id": RUN_ID,
            }
            row.update({metric: sample[metric] for metric in METRICS})
            writer.writerow(row)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_invalid_csv(path: Path) -> None:
    """Dump INVALID_SAMPLES to invalid.csv, one row per unscored sample."""
    columns = ["sample_id", "error", "question"]
    with path.open("w", encoding="utf-8", newline="") as out:
        csv_writer = csv.DictWriter(out, fieldnames=columns)
        csv_writer.writeheader()
        for record in INVALID_SAMPLES:
            csv_writer.writerow(record)
|
||||||
|
|
||||||
|
|
||||||
|
def _metric_mean(metric: str) -> float:
    """Return the average of ``metric`` over SAMPLES, rounded to four decimals."""
    total = sum(sample[metric] for sample in SAMPLES)
    return round(total / len(SAMPLES), 4)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_metadata(path: Path) -> None:
    """Serialize the run-level metadata to ``path`` as indented UTF-8 JSON.

    The timestamps and dataset path are fixed demo values; the sample counts
    are taken from SAMPLES and INVALID_SAMPLES.
    """
    payload = dict(
        run_id=RUN_ID,
        scenario_name=SCENARIO_NAME,
        mode="offline",
        judge_model=JUDGE_MODEL,
        embedding_model=EMBEDDING_MODEL,
        started_at="2026-06-15T08:29:12+00:00",
        finished_at="2026-06-15T08:31:45+00:00",
        dataset="datasets/normalized/kba_knowledge_base_baseline.csv",
        valid_samples=len(SAMPLES),
        invalid_samples=len(INVALID_SAMPLES),
    )
    # ensure_ascii=False keeps non-ASCII text readable in the file.
    text = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(text, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_summary(path: Path) -> None:
    """Render summary.md: a run header followed by one line per metric mean."""
    valid_count = len(SAMPLES)
    invalid_count = len(INVALID_SAMPLES)

    header = [
        f"# {SCENARIO_NAME}",
        "",
        f"- run_id: `{RUN_ID}`",
        "- mode: `offline`",
        f"- total_samples: `{valid_count + invalid_count}`",
        f"- valid_samples: `{valid_count}`",
        f"- invalid_samples: `{invalid_count}`",
        f"- judge_model: `{JUDGE_MODEL}`",
        "",
        "## Metric Means",
        "",
    ]
    metric_lines = [f"- {metric}: `{_metric_mean(metric):.4f}`" for metric in METRICS]

    # The file always ends with a single trailing newline.
    path.write_text("\n".join(header + metric_lines) + "\n", encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_scenario_snapshot(path: Path) -> None:
    """Write scenario.snapshot.yaml carrying the scenario settings and metric list."""
    # Imported here rather than at module level, as in the original layout.
    import yaml

    snapshot = dict(
        scenario_name=SCENARIO_NAME,
        mode="offline",
        judge_model=JUDGE_MODEL,
        embedding_model=EMBEDDING_MODEL,
        metrics=METRICS,
    )
    # sort_keys=False preserves the insertion order above in the emitted YAML.
    rendered = yaml.safe_dump(snapshot, sort_keys=False, allow_unicode=True)
    path.write_text(rendered, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Create the run directory and write every sample artifact into it."""
    run_dir = _output_dir()
    run_dir.mkdir(parents=True, exist_ok=True)

    # (file name, writer) pairs, executed in this order.
    artifact_writers = (
        ("scores.csv", _write_scores_csv),
        ("invalid.csv", _write_invalid_csv),
        ("metadata.json", _write_metadata),
        ("summary.md", _write_summary),
        ("scenario.snapshot.yaml", _write_scenario_snapshot),
    )
    for filename, write_artifact in artifact_writers:
        write_artifact(run_dir / filename)

    print(f"Sample run written to: {run_dir}")
    print("Start the console with: python webmain.py")


if __name__ == "__main__":
    main()
|
||||||
113
start.bat
Normal file
113
start.bat
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
@echo off
setlocal

echo.
echo ============================================================
echo Siemens RAGAS Console - Starting...
echo ============================================================
echo.

:: Change to the directory where this script lives (siemens_ragas/)
cd /d "%~dp0"
echo Working directory: %CD%
echo.

:: ----------------------------------------------------------------
:: 1. Check Python
:: ----------------------------------------------------------------
python --version >nul 2>&1
if errorlevel 1 (
    echo [ERROR] Python not found. Please install Python 3.12+ and add it to PATH.
    goto :error
)
for /f "tokens=*" %%v in ('python --version 2^>^&1') do echo [OK] %%v

:: ----------------------------------------------------------------
:: 2. Check FastAPI / uvicorn
::    Packages are installed with "python -m pip" so they always land in
::    the same interpreter that the import probe and the server use.
:: ----------------------------------------------------------------
python -c "import fastapi, uvicorn" >nul 2>&1
if errorlevel 1 (
    echo [INFO] Installing fastapi and uvicorn...
    python -m pip install fastapi uvicorn --quiet
    if errorlevel 1 (
        echo [ERROR] Failed to install fastapi/uvicorn.
        echo Run manually: pip install fastapi uvicorn
        goto :error
    )
    echo [OK] fastapi and uvicorn installed.
) else (
    echo [OK] fastapi / uvicorn ready.
)

:: ----------------------------------------------------------------
:: 3. Check ragas version (a failed install is only a warning)
:: ----------------------------------------------------------------
python -c "import ragas; assert ragas.__version__ == '0.4.3', ragas.__version__" >nul 2>&1
if errorlevel 1 (
    echo [INFO] Installing ragas==0.4.3 ...
    python -m pip install "ragas==0.4.3" --quiet
    if errorlevel 1 (
        echo [WARN] ragas install failed. Dashboard still works; evaluation trigger will show an error.
    ) else (
        echo [OK] ragas 0.4.3 installed.
    )
) else (
    echo [OK] ragas 0.4.3 ready.
)

:: ----------------------------------------------------------------
:: 4. Seed demo data if no runs exist yet
:: ----------------------------------------------------------------
if not exist "outputs\kba-knowledge-base-offline-baseline" (
    echo [INFO] No run data found. Generating demo data...
    python scripts\seed_sample_run.py
    if errorlevel 1 (
        echo [WARN] Demo data generation failed. Dashboard may be empty.
    ) else (
        echo [OK] Demo data generated.
    )
) else (
    echo [OK] Run data found, skipping demo generation.
)

:: ----------------------------------------------------------------
:: 5. Pick an available port
::    %PORT% is only expanded after the block below has closed, so no
::    delayed expansion is needed.
:: ----------------------------------------------------------------
set PORT=8800
netstat -ano 2>nul | findstr ":8800" | findstr "LISTENING" >nul 2>&1
if not errorlevel 1 (
    echo [WARN] Port 8800 in use, trying 8801...
    set PORT=8801
    netstat -ano 2>nul | findstr ":8801" | findstr "LISTENING" >nul 2>&1
    if not errorlevel 1 (
        echo [ERROR] Ports 8800 and 8801 are both in use.
        echo Run manually: python webmain.py --port 8802
        goto :error
    )
)

echo.
echo ============================================================
echo Console URL : http://127.0.0.1:%PORT%
echo Press Ctrl+C to stop the server
echo ============================================================
echo.

:: Open browser after 2-second delay (non-blocking)
start /b cmd /c "timeout /t 2 >nul && start http://127.0.0.1:%PORT%"

:: Launch uvicorn (blocking - window stays open while server runs)
python webmain.py --host 127.0.0.1 --port %PORT%

echo.
echo Server stopped.
pause
exit /b 0

:error
echo.
echo ============================================================
echo Startup failed. See error above.
echo ============================================================
pause
exit /b 1
|
||||||
111
start.ps1
Normal file
111
start.ps1
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
# start.ps1 - Siemens RAGAS Console launcher for Windows PowerShell
# Usage: Right-click -> "Run with PowerShell", or: powershell -ExecutionPolicy Bypass -File start.ps1

$ErrorActionPreference = "Stop"
Set-Location $PSScriptRoot

# Run a Python one-liner quietly and report whether it exited with code 0.
# The preference is relaxed inside this function only: with "Stop" in effect,
# Windows PowerShell turns redirected stderr from a native command (here, a
# Python traceback) into a terminating error, which would abort the script
# before $LASTEXITCODE could be inspected.
function Test-PythonSnippet {
    param([string]$Code)
    $ErrorActionPreference = "Continue"
    & python -c $Code 2>&1 | Out-Null
    return ($LASTEXITCODE -eq 0)
}

Write-Host ""
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host " Siemens RAGAS Console - Starting..." -ForegroundColor Cyan
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host ""
Write-Host "Working directory: $PSScriptRoot"
Write-Host ""

# ----------------------------------------------------------------
# 1. Check Python
# ----------------------------------------------------------------
try {
    $pyver = & python --version 2>&1
    Write-Host "[OK] $pyver" -ForegroundColor Green
} catch {
    Write-Host "[ERROR] Python not found. Please install Python 3.12+ and add to PATH." -ForegroundColor Red
    Read-Host "Press Enter to exit"
    exit 1
}

# ----------------------------------------------------------------
# 2. Check FastAPI / uvicorn
#    "python -m pip" installs into the same interpreter that was probed.
# ----------------------------------------------------------------
if (-not (Test-PythonSnippet "import fastapi, uvicorn")) {
    Write-Host "[INFO] Installing fastapi and uvicorn..." -ForegroundColor Yellow
    & python -m pip install fastapi uvicorn --quiet
    if ($LASTEXITCODE -ne 0) {
        Write-Host "[ERROR] Failed to install fastapi/uvicorn. Run: pip install fastapi uvicorn" -ForegroundColor Red
        Read-Host "Press Enter to exit"
        exit 1
    }
    Write-Host "[OK] fastapi / uvicorn installed." -ForegroundColor Green
} else {
    Write-Host "[OK] fastapi / uvicorn ready." -ForegroundColor Green
}

# ----------------------------------------------------------------
# 3. Check ragas version (a failed install is only a warning)
# ----------------------------------------------------------------
if (-not (Test-PythonSnippet "import ragas; assert ragas.__version__ == '0.4.3', ragas.__version__")) {
    Write-Host "[INFO] Installing ragas==0.4.3 (evaluation engine)..." -ForegroundColor Yellow
    & python -m pip install "ragas==0.4.3" --quiet
    if ($LASTEXITCODE -ne 0) {
        Write-Host "[WARN] ragas install failed. Dashboard works; evaluation trigger will show error." -ForegroundColor Yellow
    } else {
        Write-Host "[OK] ragas 0.4.3 installed." -ForegroundColor Green
    }
} else {
    Write-Host "[OK] ragas 0.4.3 ready." -ForegroundColor Green
}

# ----------------------------------------------------------------
# 4. Seed demo data if missing
# ----------------------------------------------------------------
if (-not (Test-Path "outputs\kba-knowledge-base-offline-baseline")) {
    Write-Host "[INFO] No run data found. Generating demo data..." -ForegroundColor Yellow
    & python scripts\seed_sample_run.py
    if ($LASTEXITCODE -ne 0) {
        Write-Host "[WARN] Demo data generation failed. Dashboard may be empty." -ForegroundColor Yellow
    } else {
        Write-Host "[OK] Demo data generated." -ForegroundColor Green
    }
} else {
    Write-Host "[OK] Run data found, skipping demo generation." -ForegroundColor Green
}

# ----------------------------------------------------------------
# 5. Pick an available port
# ----------------------------------------------------------------
$PORT = 8800
$inUse = netstat -ano 2>$null | Select-String ":$PORT\s" | Select-String "LISTENING"
if ($inUse) {
    Write-Host "[WARN] Port $PORT in use, trying 8801..." -ForegroundColor Yellow
    $PORT = 8801
    $inUse = netstat -ano 2>$null | Select-String ":$PORT\s" | Select-String "LISTENING"
    if ($inUse) {
        Write-Host "[ERROR] Ports 8800 and 8801 are both in use." -ForegroundColor Red
        Write-Host " Run manually: python webmain.py --port 8802"
        Read-Host "Press Enter to exit"
        exit 1
    }
}

Write-Host ""
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host " Console URL : http://127.0.0.1:$PORT" -ForegroundColor Green
Write-Host " Press Ctrl+C to stop the server" -ForegroundColor Cyan
Write-Host "============================================================" -ForegroundColor Cyan
Write-Host ""

# Open browser after 2-second delay
Start-Job -ScriptBlock {
    param($port)
    Start-Sleep 2
    Start-Process "http://127.0.0.1:$port"
} -ArgumentList $PORT | Out-Null

# Launch uvicorn (blocking)
& python webmain.py --host 127.0.0.1 --port $PORT

Write-Host ""
Write-Host "Server stopped."
Read-Host "Press Enter to exit"
|
||||||
5
webapp/__init__.py
Normal file
5
webapp/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""Lightweight FastAPI web console layered on top of the rag_eval platform.
|
||||||
|
|
||||||
|
This package is additive and non-invasive: it imports rag_eval as a library and
|
||||||
|
reads run artifacts from disk. It never modifies the core evaluation modules.
|
||||||
|
"""
|
||||||
1
webapp/api/__init__.py
Normal file
1
webapp/api/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""API router package for the evaluation console."""
|
||||||
44
webapp/api/evaluations.py
Normal file
44
webapp/api/evaluations.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
"""Routes for triggering evaluations and polling background task status."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from webapp.models import (
|
||||||
|
TaskStatus,
|
||||||
|
TriggerEvaluationRequest,
|
||||||
|
TriggerEvaluationResponse,
|
||||||
|
)
|
||||||
|
from webapp.services import scenario_scanner
|
||||||
|
from webapp.services.task_manager import task_manager
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/evaluations", tags=["evaluations"])


@router.post("", response_model=TriggerEvaluationResponse)
def trigger_evaluation(request: TriggerEvaluationRequest) -> TriggerEvaluationResponse:
    """Reject unresolvable scenario paths with HTTP 400, otherwise queue a task.

    Returns the id handed back by ``task_manager.submit``.
    """
    # NOTE(review): the path is validated via resolve_scenario_path, but the
    # raw request value is what gets submitted - confirm submit() re-resolves it.
    if scenario_scanner.resolve_scenario_path(request.scenario_path) is None:
        raise HTTPException(
            status_code=400,
            detail=f"无效或不允许的场景路径: {request.scenario_path}",
        )

    return TriggerEvaluationResponse(task_id=task_manager.submit(request.scenario_path))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{task_id}", response_model=TaskStatus)
def get_task_status(task_id: str) -> TaskStatus:
    """Look up one evaluation task; respond with HTTP 404 when it is unknown."""
    found = task_manager.get(task_id)
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail=f"未找到任务: {task_id}")
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=dict)
def list_tasks() -> dict[str, list]:
    """List every task the task manager reports, serialized under ``tasks``."""
    serialized = []
    for task in task_manager.list_tasks():
        serialized.append(task.model_dump())
    return {"tasks": serialized}
|
||||||
32
webapp/api/runs.py
Normal file
32
webapp/api/runs.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""Routes for listing evaluation runs and fetching a single run's report."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from webapp.models import RunDetail
|
||||||
|
from webapp.services import report_builder, run_reader
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/runs", tags=["runs"])


@router.get("")
def get_runs() -> dict[str, list]:
    """List the summaries reported by run_reader, serialized under ``runs``."""
    serialized = []
    for summary in run_reader.list_run_summaries():
        serialized.append(summary.model_dump())
    return {"runs": serialized}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{run_id}")
def get_run_detail(run_id: str) -> RunDetail:
    """Assemble the summary and aggregated report for a single run.

    Responds with HTTP 404 when the run directory cannot be found or when no
    summary can be built for it.
    """
    run_dir = run_reader.find_run_dir(run_id)
    if run_dir is None:
        raise HTTPException(status_code=404, detail=f"未找到运行: {run_id}")

    summary = run_reader.build_run_summary(run_dir)
    if summary is None:
        raise HTTPException(status_code=404, detail=f"运行元数据缺失: {run_id}")

    # The report is aggregated over the metric list carried by the summary.
    return RunDetail(
        summary=summary,
        report=report_builder.build_report(run_dir, summary.metrics),
    )
|
||||||
16
webapp/api/scenarios.py
Normal file
16
webapp/api/scenarios.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
"""Route for discovering scenario YAML files that can be evaluated."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
from webapp.services import scenario_scanner
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/scenarios", tags=["scenarios"])


@router.get("")
def get_scenarios() -> dict[str, list]:
    """List the scenarios reported by scenario_scanner, serialized under ``scenarios``."""
    serialized = []
    for item in scenario_scanner.list_scenarios():
        serialized.append(item.model_dump())
    return {"scenarios": serialized}
|
||||||
129
webapp/models.py
Normal file
129
webapp/models.py
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
"""Pydantic response models for the evaluation console HTTP API."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class RunSummary(BaseModel):
    """Summary record for one evaluation run, used for list views.

    Only ``run_id`` and ``scenario_name`` are required; every other field
    falls back to an empty string, zero, or an empty container.
    """

    # Identification (required).
    run_id: str
    scenario_name: str

    # Run configuration and timing, all carried as plain strings.
    mode: str = ""
    judge_model: str = ""
    embedding_model: str = ""
    started_at: str = ""
    finished_at: str = ""
    dataset: str = ""

    # Sample counts.
    total_samples: int = 0
    valid_samples: int = 0
    invalid_samples: int = 0

    # Metric names and their means; a mean may be None.
    metrics: list[str] = Field(default_factory=list)
    metric_means: dict[str, float | None] = Field(default_factory=dict)

    output_path: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class GroupStat(BaseModel):
    """Metric means for one group of samples sharing a metadata value."""

    # The group's metadata value and the number of samples in the group.
    key: str
    count: int
    # Metric name -> mean within the group; a mean may be None.
    means: dict[str, float | None] = Field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class DistributionBin(BaseModel):
    """A single histogram bucket: its label, bounds, and sample count."""

    label: str
    # Bounds of the bucket.
    lower: float
    upper: float
    # Number of samples counted in the bucket.
    count: int
|
||||||
|
|
||||||
|
|
||||||
|
class SampleScore(BaseModel):
    """One sample's content and scores, as shown in the lowest-score review table.

    Only ``sample_id`` is required; text fields default to ``""``.
    """

    sample_id: str

    # Sample content.
    question: str = ""
    contexts: list[str] = Field(default_factory=list)
    answer: str = ""
    ground_truth: str = ""

    # Metadata used for grouping.
    language: str = ""
    difficulty: str = ""
    question_type: str = ""

    # Per-metric scores plus their average; either may be missing (None).
    metrics: dict[str, float | None] = Field(default_factory=dict)
    mean_score: float | None = None

    error: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class ReportData(BaseModel):
    """Aggregated report for one run, rendered by the report detail page.

    Every field is optional and defaults to an empty container or string.
    """

    metrics: list[str] = Field(default_factory=list)
    # Metric name -> overall mean; a mean may be None.
    metric_means: dict[str, float | None] = Field(default_factory=dict)
    # Metric name -> histogram buckets.
    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)
    # Grouping field name -> per-group statistics.
    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
    lowest_samples: list[SampleScore] = Field(default_factory=list)
    summary_markdown: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class RunDetail(BaseModel):
    """Complete payload for one run: its summary together with its report."""

    # Both parts are required.
    summary: RunSummary
    report: ReportData
|
||||||
|
|
||||||
|
|
||||||
|
class ScenarioInfo(BaseModel):
    """Description of one scenario YAML file that the UI can offer for evaluation.

    Only ``path`` is required; the remaining fields default to empty values.
    """

    path: str

    scenario_name: str = ""
    mode: str = ""
    dataset: str = ""
    judge_model: str = ""
    metrics: list[str] = Field(default_factory=list)

    # Empty string by default.
    error: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class TaskStatus(BaseModel):
    """Snapshot of one background evaluation task kept by the task manager."""

    # Required identification and state.
    task_id: str
    scenario_path: str
    status: str

    logs: list[str] = Field(default_factory=list)

    # Unset (None) by default.
    run_id: str | None = None
    error: str | None = None

    created_at: str = ""
    finished_at: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class TriggerEvaluationRequest(BaseModel):
    """Body of the request that starts an evaluation run from the UI."""

    # Path of the scenario to evaluate (required).
    scenario_path: str
|
||||||
|
|
||||||
|
|
||||||
|
class TriggerEvaluationResponse(BaseModel):
    """Reply sent right after an evaluation task has been queued."""

    # Identifier of the queued task.
    task_id: str
|
||||||
|
|
||||||
|
|
||||||
|
def jsonable(value: Any) -> Any:
    """Recursively replace non-finite floats (NaN, +/-inf) with None.

    Dicts and lists are rebuilt with their items converted; any other value,
    including finite floats, is returned unchanged.
    """
    import math

    if isinstance(value, dict):
        return {name: jsonable(entry) for name, entry in value.items()}
    if isinstance(value, list):
        return [jsonable(entry) for entry in value]
    if isinstance(value, float) and not math.isfinite(value):
        return None
    return value
|
||||||
49
webapp/server.py
Normal file
49
webapp/server.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
"""FastAPI application factory for the RAGAS evaluation console.
|
||||||
|
|
||||||
|
The app mounts three JSON API routers and serves the single-page static
|
||||||
|
frontend. It imports rag_eval only lazily (inside the task manager worker), so
|
||||||
|
the server starts even when the evaluation dependencies are not yet installed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.responses import FileResponse
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
|
||||||
|
from webapp.api import evaluations, runs, scenarios
|
||||||
|
|
||||||
|
# Directory holding the single-page frontend, located next to this module.
STATIC_DIR = Path(__file__).resolve().parent / "static"


def create_app() -> FastAPI:
    """Create the FastAPI app with its API routers, health check, and frontend.

    API routes live under /api, the index document is served at /, and the
    remaining static assets are mounted under /static.
    """
    application = FastAPI(
        title="Siemens RAGAS 评估控制台",
        description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
        version="0.1.0",
    )

    # Register the JSON API routers in a fixed order.
    for api_module in (runs, scenarios, evaluations):
        application.include_router(api_module.router)

    @application.get("/api/health", tags=["meta"])
    def health() -> dict[str, str]:
        """Liveness probe that always answers with status ``ok``."""
        return {"status": "ok"}

    @application.get("/", include_in_schema=False)
    def index() -> FileResponse:
        """Return the console's index.html entry document."""
        return FileResponse(STATIC_DIR / "index.html")

    # Mounted after the routes above are registered, so / and /api stay
    # handled by the application itself.
    application.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

    return application


app = create_app()
|
||||||
1
webapp/services/__init__.py
Normal file
1
webapp/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Service package for the evaluation console (filesystem readers and task runner)."""
|
||||||
188
webapp/services/report_builder.py
Normal file
188
webapp/services/report_builder.py
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
"""Aggregate a run's per-sample scores into the report payload for the UI.
|
||||||
|
|
||||||
|
All aggregation reads only the standard scores.csv produced by the reporting
|
||||||
|
layer, plus the metric list resolved by run_reader. The output mirrors the
|
||||||
|
report detail page: metric means, per-metric distribution histograms, grouped
|
||||||
|
means by difficulty / question_type, and the lowest-scoring samples for review.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from webapp.services.text_utils import parse_contexts
|
||||||
|
from webapp.models import (
|
||||||
|
DistributionBin,
|
||||||
|
GroupStat,
|
||||||
|
ReportData,
|
||||||
|
SampleScore,
|
||||||
|
)
|
||||||
|
from webapp.services import run_reader
|
||||||
|
|
||||||
|
|
||||||
|
# Histograms split the score range into this many equal-width buckets.
DISTRIBUTION_BIN_COUNT = 5

# Metadata columns used to group samples, when the column is present.
GROUPING_FIELDS = (
    "difficulty",
    "question_type",
    "language",
)

# Number of lowest-scoring samples surfaced for manual review.
LOWEST_SAMPLE_COUNT = 10
|
||||||
|
|
||||||
|
|
||||||
|
def _round_or_none(value: float | None) -> float | None:
|
||||||
|
"""Round a float to four places, mapping NaN/None to None for clean JSON."""
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
|
||||||
|
return None
|
||||||
|
return round(float(value), 4)
|
||||||
|
|
||||||
|
|
||||||
|
def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
    """Compute the mean of each metric column across the rows of ``frame``.

    Args:
        frame: Per-sample scores, one column per metric.
        metrics: Metric names to report, in output order.

    Returns:
        A mapping from every requested metric to its rounded mean, or to None
        when the column is absent or holds no numeric value.
    """
    means: dict[str, float | None] = {}
    for metric in metrics:
        if metric not in frame.columns:
            means[metric] = None
            continue
        # Coerce first: Series.mean(numeric_only=True) raises on object-dtype
        # columns, which a single non-numeric cell is enough to produce.
        # Unparseable cells become NaN and are skipped by mean().
        scores = pd.to_numeric(frame[metric], errors="coerce")
        means[metric] = _round_or_none(scores.mean())
    return means
|
||||||
|
|
||||||
|
|
||||||
|
def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
    """Bucket one metric's scores into equal-width histogram bins over [0, 1].

    Bins are half-open ``[lower, upper)`` except the last, which also includes
    its upper edge so a score of 1.0 is counted. Non-numeric cells and scores
    outside [0, 1] are not counted in any bin.

    Args:
        frame: Per-sample scores.
        metric: Name of the metric column to bucket.

    Returns:
        DISTRIBUTION_BIN_COUNT bins in ascending order, or an empty list when
        the column is absent.
    """
    if metric not in frame.columns:
        return []

    series = pd.to_numeric(frame[metric], errors="coerce").dropna()
    last_index = DISTRIBUTION_BIN_COUNT - 1

    bins: list[DistributionBin] = []
    for index in range(DISTRIBUTION_BIN_COUNT):
        # Derive each edge by division rather than index * width: 3 * 0.2 is
        # 0.6000000000000001, which would push a score of exactly 0.6 into
        # the bin below the one it belongs to.
        lower = index / DISTRIBUTION_BIN_COUNT
        upper = (index + 1) / DISTRIBUTION_BIN_COUNT
        at_or_above = series >= lower
        below = series <= upper if index == last_index else series < upper
        bins.append(
            DistributionBin(
                label=f"{lower:.1f}–{upper:.1f}",
                lower=round(lower, 2),
                upper=round(upper, 2),
                count=int((at_or_above & below).sum()),
            )
        )
    return bins
|
||||||
|
|
||||||
|
|
||||||
|
def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupStat]]:
    """Compute per-group metric means for each available grouping field.

    Args:
        frame: Per-sample scores with optional metadata columns.
        metrics: Metric names to average within each group.

    Returns:
        A mapping from grouping field to its groups sorted by key. Fields that
        are absent or entirely empty are omitted, as are rows whose key is
        empty or missing.
    """
    groupings: dict[str, list[GroupStat]] = {}
    for field in GROUPING_FIELDS:
        if field not in frame.columns:
            continue

        # Normalize once and group by the normalized key, so values that
        # differ only by surrounding whitespace fall into one group instead of
        # yielding duplicate keys. Missing cells stringify to "nan".
        keys = frame[field].astype(str).str.strip()
        # Skip fields that are entirely empty so the UI does not render noise.
        if keys.replace("nan", "").eq("").all():
            continue

        stats: list[GroupStat] = []
        for key, group in frame.groupby(keys):
            key_text = str(key)
            if not key_text or key_text == "nan":
                continue
            # Coerce before averaging: Series.mean(numeric_only=True) raises
            # on object-dtype metric columns.
            means = {
                metric: _round_or_none(
                    pd.to_numeric(group[metric], errors="coerce").mean()
                )
                for metric in metrics
                if metric in group.columns
            }
            stats.append(GroupStat(key=key_text, count=int(len(group)), means=means))

        if stats:
            stats.sort(key=lambda item: item.key)
            groupings[field] = stats
    return groupings
|
||||||
|
|
||||||
|
|
||||||
|
def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
|
||||||
|
"""Average a single sample's available metric scores for ranking."""
|
||||||
|
values = [
|
||||||
|
float(row[metric])
|
||||||
|
for metric in metrics
|
||||||
|
if metric in row and pd.notna(row[metric])
|
||||||
|
]
|
||||||
|
if not values:
|
||||||
|
return None
|
||||||
|
return sum(values) / len(values)
|
||||||
|
|
||||||
|
|
||||||
|
def _cell_text(row: pd.Series, column: str) -> str:
|
||||||
|
"""Safely read a string cell, returning '' for missing or NaN values."""
|
||||||
|
if column not in row or pd.isna(row[column]):
|
||||||
|
return ""
|
||||||
|
return str(row[column]).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _lowest_samples(frame: pd.DataFrame, metrics: list[str]) -> list[SampleScore]:
    """Select and shape the lowest-scoring samples for the review table.

    Rows are ranked by the mean of their available metric scores (ascending)
    and the first ``LOWEST_SAMPLE_COUNT`` are converted to ``SampleScore``.

    Args:
        frame: Per-sample scores table.
        metrics: Metric column names used for ranking and reported per sample.

    Returns:
        At most ``LOWEST_SAMPLE_COUNT`` samples, lowest mean first. Rows with
        no score at all are ranked ahead of scored rows.
    """
    if frame.empty:
        return []

    # Rank first and shape afterwards, so only the rows that are actually
    # returned pay for context parsing and SampleScore construction.
    ranked: list[tuple[float, pd.Series, float | None]] = []
    for _, row in frame.iterrows():
        mean_score = _sample_mean(row, metrics)
        # Samples without any score get a key of -1.0. Assuming metric scores
        # are non-negative, this sorts them FIRST, i.e. they are surfaced as the
        # worst cases for review.
        sort_key = mean_score if mean_score is not None else -1.0
        ranked.append((sort_key, row, mean_score))

    # list.sort is stable, so rows with equal keys keep their frame order.
    ranked.sort(key=lambda item: item[0])

    lowest: list[SampleScore] = []
    for _, row, mean_score in ranked[:LOWEST_SAMPLE_COUNT]:
        scores = {}
        for metric in metrics:
            if metric not in row or not pd.notna(row[metric]):
                continue
            try:
                value = float(row[metric])
            except (TypeError, ValueError):
                # Non-numeric score cell: omit the metric instead of failing.
                continue
            scores[metric] = _round_or_none(value)
        lowest.append(
            SampleScore(
                sample_id=_cell_text(row, "sample_id") or "—",
                question=_cell_text(row, "question"),
                contexts=parse_contexts(row["contexts"]) if "contexts" in row else [],
                answer=_cell_text(row, "answer"),
                ground_truth=_cell_text(row, "ground_truth"),
                language=_cell_text(row, "language"),
                difficulty=_cell_text(row, "difficulty"),
                question_type=_cell_text(row, "question_type"),
                metrics=scores,
                mean_score=_round_or_none(mean_score),
                error=_cell_text(row, "error"),
            )
        )
    return lowest
|
||||||
|
|
||||||
|
|
||||||
|
def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    """Assemble the aggregated report payload for a single run directory.

    The scores table and summary markdown are read through ``run_reader``.
    When there are no score rows or no metrics, a report carrying only the
    metric names (each mapped to ``None``) and the summary text is returned.
    """
    frame = run_reader.read_scores_frame(run_dir)
    summary_markdown = run_reader.read_summary_markdown(run_dir)

    has_scores = not frame.empty and bool(metrics)
    if not has_scores:
        return ReportData(
            metrics=metrics,
            metric_means=dict.fromkeys(metrics),
            summary_markdown=summary_markdown,
        )

    # Only metrics that exist as columns get a histogram.
    distributions = {}
    for metric in metrics:
        if metric in frame.columns:
            distributions[metric] = _distribution(frame, metric)

    return ReportData(
        metrics=metrics,
        metric_means=_metric_means(frame, metrics),
        distributions=distributions,
        groupings=_groupings(frame, metrics),
        lowest_samples=_lowest_samples(frame, metrics),
        summary_markdown=summary_markdown,
    )
|
||||||
222
webapp/services/run_reader.py
Normal file
222
webapp/services/run_reader.py
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
"""Read evaluation run artifacts from disk into API-friendly structures.
|
||||||
|
|
||||||
|
A "run" is any directory under the configured output roots that contains a
|
||||||
|
metadata.json file. This service stays decoupled from rag_eval internals: it
|
||||||
|
only reads the standard artifact files (metadata.json, scores.csv, summary.md,
|
||||||
|
scenario.snapshot.yaml) that the reporting layer writes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from webapp.models import RunSummary
|
||||||
|
|
||||||
|
|
||||||
|
# Directory names, relative to the repository root, that are searched for run
# outputs. Entries that do not exist on disk are skipped by _candidate_roots.
DEFAULT_OUTPUT_ROOTS = ("outputs", "runs")
|
||||||
|
|
||||||
|
|
||||||
|
def _repo_root() -> Path:
|
||||||
|
"""Return the siemens_ragas repository root (parent of the webapp package)."""
|
||||||
|
return Path(__file__).resolve().parents[2]
|
||||||
|
|
||||||
|
|
||||||
|
def _candidate_roots(extra_roots: list[Path] | None = None) -> list[Path]:
    """List the output directories that exist and may hold run artifacts.

    The default roots (``DEFAULT_OUTPUT_ROOTS`` under the repository root) come
    first, followed by ``extra_roots`` in the order given. Paths that are not
    existing directories are dropped.
    """
    repo = _repo_root()
    defaults = [repo / name for name in DEFAULT_OUTPUT_ROOTS]
    extras = list(extra_roots) if extra_roots else []
    return [path for path in (*defaults, *extras) if path.is_dir()]
|
||||||
|
|
||||||
|
|
||||||
|
def _read_json(path: Path) -> dict[str, Any]:
|
||||||
|
"""Load a JSON file, returning an empty dict on any failure."""
|
||||||
|
try:
|
||||||
|
return json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
except (OSError, ValueError):
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
|
||||||
|
"""Read the configured metric list from a scenario snapshot if present."""
|
||||||
|
snapshot = run_dir / "scenario.snapshot.yaml"
|
||||||
|
if not snapshot.is_file():
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
payload = yaml.safe_load(snapshot.read_text(encoding="utf-8")) or {}
|
||||||
|
except (OSError, yaml.YAMLError):
|
||||||
|
return []
|
||||||
|
metrics = payload.get("metrics")
|
||||||
|
if isinstance(metrics, list):
|
||||||
|
return [str(item) for item in metrics]
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
    """Find every evaluation run directory under the candidate roots.

    A directory counts as a run when it contains a ``metadata.json`` whose
    content has a ``scenario_name`` key.

    Args:
        extra_roots: Additional directories to search besides the defaults.

    Returns:
        Run directories in discovery order, each listed once even when the
        searched roots overlap.
    """
    run_dirs: list[Path] = []
    seen: set[Path] = set()
    for root in _candidate_roots(extra_roots):
        for metadata_path in root.rglob("metadata.json"):
            run_dir = metadata_path.parent
            # Already accepted via another root: skip before re-parsing the file.
            if run_dir in seen:
                continue
            # Other tooling (e.g. dataset builds) may also write a
            # metadata.json; only run metadata carries "scenario_name".
            if "scenario_name" not in _read_json(metadata_path):
                continue
            seen.add(run_dir)
            run_dirs.append(run_dir)
    return run_dirs
|
||||||
|
|
||||||
|
|
||||||
|
def _metric_means(run_dir: Path, metrics: list[str]) -> dict[str, float | None]:
|
||||||
|
"""Compute per-metric mean scores from a run's scores.csv."""
|
||||||
|
scores_path = run_dir / "scores.csv"
|
||||||
|
if not scores_path.is_file():
|
||||||
|
return {}
|
||||||
|
try:
|
||||||
|
frame = pd.read_csv(scores_path)
|
||||||
|
except (OSError, ValueError, pd.errors.ParserError):
|
||||||
|
return {}
|
||||||
|
means: dict[str, float | None] = {}
|
||||||
|
for metric in metrics:
|
||||||
|
if metric in frame.columns:
|
||||||
|
mean_value = frame[metric].mean(numeric_only=True)
|
||||||
|
means[metric] = None if pd.isna(mean_value) else round(float(mean_value), 4)
|
||||||
|
else:
|
||||||
|
means[metric] = None
|
||||||
|
return means
|
||||||
|
|
||||||
|
|
||||||
|
def build_run_summary(run_dir: Path) -> RunSummary | None:
    """Build the ``RunSummary`` for one run directory.

    Returns ``None`` when the directory's ``metadata.json`` has no
    ``scenario_name`` key. Metric names come from the scenario snapshot, and
    fall back to the numeric columns of ``scores.csv`` when the snapshot
    yields none.
    """
    metadata = _read_json(run_dir / "metadata.json")
    if "scenario_name" not in metadata:
        return None

    def text(key: str) -> str:
        # Missing keys become ""; present values are stringified as-is.
        return str(metadata.get(key, ""))

    def count(key: str) -> int:
        # Missing or falsy values (None, "", 0) count as zero.
        return int(metadata.get(key, 0) or 0)

    metrics = _read_metrics_from_snapshot(run_dir) or _infer_metrics_from_scores(run_dir)
    valid = count("valid_samples")
    invalid = count("invalid_samples")

    return RunSummary(
        run_id=str(metadata.get("run_id") or run_dir.name),
        scenario_name=text("scenario_name"),
        mode=text("mode"),
        judge_model=text("judge_model"),
        embedding_model=text("embedding_model"),
        started_at=text("started_at"),
        finished_at=text("finished_at"),
        dataset=text("dataset"),
        total_samples=valid + invalid,
        valid_samples=valid,
        invalid_samples=invalid,
        metrics=metrics,
        metric_means=_metric_means(run_dir, metrics),
        output_path=run_dir.as_posix(),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Columns of scores.csv that describe the sample or the run rather than hold a
# metric score; they are excluded when metric names are inferred from the file.
NON_METRIC_COLUMNS = {
    # Sample content
    "sample_id", "question", "contexts", "answer", "ground_truth", "error",
    # Sample classification
    "language", "difficulty", "question_type",
    # Run configuration
    "scenario", "retrieval_config", "judge_model", "embedding_model", "run_id",
    # Source document location
    "doc_id", "doc_name", "section_path", "page_start", "page_end",
    "source_chunk_ids",
    # Review workflow
    "review_status", "review_notes",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_metrics_from_scores(run_dir: Path) -> list[str]:
|
||||||
|
"""Infer metric column names from a scores.csv when no snapshot is available."""
|
||||||
|
scores_path = run_dir / "scores.csv"
|
||||||
|
if not scores_path.is_file():
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
frame = pd.read_csv(scores_path, nrows=1)
|
||||||
|
except (OSError, ValueError, pd.errors.ParserError):
|
||||||
|
return []
|
||||||
|
metrics: list[str] = []
|
||||||
|
for column in frame.columns:
|
||||||
|
if column in NON_METRIC_COLUMNS:
|
||||||
|
continue
|
||||||
|
if pd.api.types.is_numeric_dtype(frame[column]):
|
||||||
|
metrics.append(str(column))
|
||||||
|
return metrics
|
||||||
|
|
||||||
|
|
||||||
|
def list_run_summaries(extra_roots: list[Path] | None = None) -> list[RunSummary]:
    """Return a summary for every discovered run, most recent first.

    Runs are ordered by ``finished_at``, falling back to ``started_at`` when
    the finish time is empty; the timestamps are compared as strings.
    """
    candidates = (build_run_summary(run_dir) for run_dir in discover_run_dirs(extra_roots))
    summaries = [summary for summary in candidates if summary is not None]
    return sorted(
        summaries,
        key=lambda item: item.finished_at or item.started_at,
        reverse=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def find_run_dir(run_id: str, extra_roots: list[Path] | None = None) -> Path | None:
    """Return the first discovered run directory identified by ``run_id``.

    A run is identified by the ``run_id`` stored in its metadata, or by its
    folder name when the metadata has no (or an empty) ``run_id``. ``None`` is
    returned when nothing matches.
    """
    for candidate in discover_run_dirs(extra_roots):
        declared = _read_json(candidate / "metadata.json").get("run_id")
        if str(declared or candidate.name) == run_id:
            return candidate
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def read_scores_frame(run_dir: Path) -> pd.DataFrame:
    """Load ``scores.csv`` from a run directory.

    An empty DataFrame is returned when the file does not exist or cannot be
    read or parsed.
    """
    csv_file = run_dir / "scores.csv"
    if not csv_file.is_file():
        return pd.DataFrame()
    try:
        loaded = pd.read_csv(csv_file)
    except (OSError, ValueError, pd.errors.ParserError):
        loaded = pd.DataFrame()
    return loaded
|
||||||
|
|
||||||
|
|
||||||
|
def read_summary_markdown(run_dir: Path) -> str:
    """Return the human-readable ``summary.md`` for a run, or an empty string.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of raising, so a badly encoded summary still displays
    rather than failing the whole report.

    Returns:
        The file's text, or ``""`` when the file is missing or unreadable.
    """
    summary_path = run_dir / "summary.md"
    if not summary_path.is_file():
        return ""
    try:
        return summary_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""
|
||||||
84
webapp/services/scenario_scanner.py
Normal file
84
webapp/services/scenario_scanner.py
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
"""Discover scenario YAML files that can be launched from the console.
|
||||||
|
|
||||||
|
Scanning is intentionally tolerant: a malformed scenario file is reported with
|
||||||
|
an error string rather than aborting the whole listing, so the UI can show the
|
||||||
|
user which files are runnable and which need fixing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from webapp.models import ScenarioInfo
|
||||||
|
|
||||||
|
|
||||||
|
def _repo_root() -> Path:
|
||||||
|
"""Return the siemens_ragas repository root (parent of the webapp package)."""
|
||||||
|
return Path(__file__).resolve().parents[2]
|
||||||
|
|
||||||
|
|
||||||
|
def _scenarios_root() -> Path:
    """Return the path of the ``scenarios`` directory under the repository root."""
    return _repo_root().joinpath("scenarios")
|
||||||
|
|
||||||
|
|
||||||
|
def _summarize_scenario(path: Path) -> ScenarioInfo:
    """Read a scenario file into a compact info object, capturing parse errors.

    Args:
        path: Scenario file located under the repository root.

    Returns:
        A ``ScenarioInfo`` with the file's repository-relative path. When the
        file cannot be read, decoded or parsed, or is not a YAML mapping, only
        ``path`` and ``error`` are set; otherwise the scenario fields are
        filled from the mapping.
    """
    relative = path.relative_to(_repo_root()).as_posix()
    try:
        payload = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
        # UnicodeDecodeError is included so one badly encoded file is reported
        # on its own entry instead of aborting the whole listing.
        return ScenarioInfo(path=relative, error=f"无法解析: {exc}")

    if not isinstance(payload, dict):
        return ScenarioInfo(path=relative, error="场景文件格式不是 YAML 映射。")

    metrics = payload.get("metrics")
    metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []

    return ScenarioInfo(
        path=relative,
        scenario_name=str(payload.get("scenario_name", "")),
        mode=str(payload.get("mode", "")),
        dataset=str(payload.get("dataset", "")),
        judge_model=str(payload.get("judge_model", "")),
        metrics=metric_list,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def list_scenarios() -> list[ScenarioInfo]:
    """Return every scenario YAML under scenarios/, sorted by path.

    Both ``*.yaml`` and ``*.yml`` files are collected recursively and ordered
    together, so the two extensions are interleaved by path rather than listed
    one extension after the other.

    Returns:
        One ``ScenarioInfo`` per file, or an empty list when the scenarios
        directory does not exist.
    """
    root = _scenarios_root()
    if not root.is_dir():
        return []

    paths = sorted([*root.rglob("*.yaml"), *root.rglob("*.yml")])
    return [_summarize_scenario(path) for path in paths]
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_scenario_path(relative_or_absolute: str) -> Path | None:
    """Resolve a user-supplied scenario path safely within the repository.

    Only paths that live inside the repository's scenarios/ directory are
    accepted, which prevents the trigger endpoint from reading arbitrary files.

    Args:
        relative_or_absolute: Absolute path, or path relative to the
            repository root.

    Returns:
        The fully resolved path when it is an existing file inside the
        scenarios directory; ``None`` otherwise, including when the input
        cannot be resolved at all.
    """
    root = _repo_root()
    candidate = Path(relative_or_absolute)
    resolved = candidate if candidate.is_absolute() else (root / candidate)
    try:
        resolved = resolved.resolve()
    except (OSError, ValueError, RuntimeError):
        # The input is untrusted. Besides OSError, resolution can fail with
        # ValueError (e.g. an embedded NUL byte) or, on some Python versions,
        # RuntimeError for a symlink loop. All of these mean "not a usable path".
        return None

    # The containment check runs on the resolved path so that ".." segments and
    # symlinks cannot escape the scenarios directory.
    scenarios_root = _scenarios_root().resolve()
    if scenarios_root not in resolved.parents and resolved != scenarios_root:
        return None
    if not resolved.is_file():
        return None
    return resolved
|
||||||
161
webapp/services/task_manager.py
Normal file
161
webapp/services/task_manager.py
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
"""In-process background task manager for evaluation runs.
|
||||||
|
|
||||||
|
Evaluations run in a thread pool so the FastAPI event loop is never blocked.
|
||||||
|
The heavy rag_eval / ragas import is performed lazily inside the worker thread,
|
||||||
|
which keeps the web server bootable even when the evaluation dependencies are
|
||||||
|
broken — failures then surface as task errors in the UI instead of crashing
|
||||||
|
startup. This matches the "coarse status + logs" progress decision.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import threading
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from contextlib import redirect_stderr, redirect_stdout
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from webapp.models import TaskStatus
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
"""Return the current UTC time as an ISO 8601 string."""
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
class _LineCapture(io.TextIOBase):
|
||||||
|
"""A writable stream that appends captured lines to a task's log buffer."""
|
||||||
|
|
||||||
|
def __init__(self, sink: "EvaluationTask") -> None:
|
||||||
|
"""Bind the capture stream to the owning task."""
|
||||||
|
self._sink = sink
|
||||||
|
self._buffer = ""
|
||||||
|
|
||||||
|
def write(self, text: str) -> int:
|
||||||
|
"""Buffer text and flush complete lines into the task log."""
|
||||||
|
self._buffer += text
|
||||||
|
while "\n" in self._buffer:
|
||||||
|
line, self._buffer = self._buffer.split("\n", 1)
|
||||||
|
self._sink.append_log(line)
|
||||||
|
return len(text)
|
||||||
|
|
||||||
|
def flush(self) -> None:
|
||||||
|
"""Flush any trailing partial line into the task log."""
|
||||||
|
if self._buffer:
|
||||||
|
self._sink.append_log(self._buffer)
|
||||||
|
self._buffer = ""
|
||||||
|
|
||||||
|
|
||||||
|
class EvaluationTask:
    """State record for one background evaluation run.

    The worker updates the attributes as the run progresses; ``snapshot``
    copies them into a ``TaskStatus`` for the API.
    """

    def __init__(self, task_id: str, scenario_path: str) -> None:
        """Create a task in the "queued" state for ``scenario_path``."""
        # Guards the log list and the snapshot copy.
        self._lock = threading.Lock()
        self.task_id = task_id
        self.scenario_path = scenario_path
        self.created_at = _now_iso()
        self.finished_at = ""
        self.status = "queued"
        self.run_id: str | None = None
        self.error: str | None = None
        self.logs: list[str] = []

    def append_log(self, line: str) -> None:
        """Add one line to the task log while holding the task lock."""
        with self._lock:
            self.logs.append(line)

    def snapshot(self) -> TaskStatus:
        """Copy the current state into a ``TaskStatus`` while holding the task lock.

        The log list is copied, so later log lines do not appear in a snapshot
        that was already returned.
        """
        with self._lock:
            state = {
                "task_id": self.task_id,
                "scenario_path": self.scenario_path,
                "status": self.status,
                "logs": list(self.logs),
                "run_id": self.run_id,
                "error": self.error,
                "created_at": self.created_at,
                "finished_at": self.finished_at,
            }
            return TaskStatus(**state)
|
||||||
|
|
||||||
|
|
||||||
|
class TaskManager:
    """Owns the thread pool and registry of evaluation tasks.

    Tasks are kept in an in-memory dict keyed by task id; nothing in this class
    removes entries from it.
    """

    def __init__(self, max_workers: int = 2) -> None:
        """Create a task manager backed by a small thread pool.

        Args:
            max_workers: Maximum number of evaluations that run at once.
        """
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._tasks: dict[str, EvaluationTask] = {}
        # Guards the task registry only; each task has its own lock.
        self._lock = threading.Lock()

    def submit(self, scenario_path: str) -> str:
        """Register and schedule a new evaluation task, returning its id.

        The id is the first 12 hex characters of a random UUID.
        """
        task_id = uuid.uuid4().hex[:12]
        task = EvaluationTask(task_id=task_id, scenario_path=scenario_path)
        with self._lock:
            self._tasks[task_id] = task
        self._executor.submit(self._run, task)
        return task_id

    def get(self, task_id: str) -> TaskStatus | None:
        """Return a snapshot of one task, or None if the id is unknown."""
        with self._lock:
            task = self._tasks.get(task_id)
        return task.snapshot() if task is not None else None

    def list_tasks(self) -> list[TaskStatus]:
        """Return snapshots of all known tasks, newest first.

        Ordering compares the ``created_at`` strings.
        """
        with self._lock:
            tasks = list(self._tasks.values())
        snapshots = [task.snapshot() for task in tasks]
        snapshots.sort(key=lambda item: item.created_at, reverse=True)
        return snapshots

    def _run(self, task: EvaluationTask) -> None:
        """Execute one evaluation end to end inside a worker thread.

        Moves the task from "running" to "completed" or "failed", records the
        run id or the error text, and always stamps ``finished_at``. Exceptions
        are recorded on the task and are not re-raised.
        """
        task.status = "running"
        task.append_log(f"[{_now_iso()}] 开始评估: {task.scenario_path}")

        capture = _LineCapture(task)
        try:
            # Lazy import keeps the web server bootable if ragas is unavailable.
            task.append_log("加载评估引擎 (rag_eval / ragas)...")
            from rag_eval.execution.runner import run_scenario

            absolute_path = self._to_absolute(task.scenario_path)
            task.append_log(f"运行场景文件: {absolute_path}")

            # NOTE(review): redirect_stdout/redirect_stderr rebind sys.stdout and
            # sys.stderr for the whole process, not just this thread. With more
            # than one worker, output from a concurrent run (or from any other
            # thread) can land in this task's log. Confirm whether per-task log
            # isolation is required.
            with redirect_stdout(capture), redirect_stderr(capture):
                result = run_scenario(str(absolute_path))
            # Push out a trailing line that did not end with a newline.
            capture.flush()

            # getattr with defaults: the shape of the result object is not
            # assumed beyond these optional attributes.
            task.run_id = getattr(result, "run_id", None)
            output_dir = getattr(getattr(result, "scenario", None), "output_dir", "")
            task.append_log(f"[{_now_iso()}] 评估完成。run_id={task.run_id}")
            if output_dir:
                task.append_log(f"结果目录: {output_dir}")
            task.status = "completed"
        except Exception as exc:  # noqa: BLE001 - surface any failure to the UI
            capture.flush()
            error_type = type(exc).__name__
            task.error = f"{error_type}: {exc}"
            task.append_log(f"[{_now_iso()}] 评估失败 [{error_type}]: {exc}")
            task.status = "failed"
        finally:
            # Set after the status change, so a poller may briefly observe a
            # terminal status with an empty finished_at.
            task.finished_at = _now_iso()

    def _to_absolute(self, scenario_path: str) -> Path:
        """Resolve a scenario path against the repository root if relative.

        Absolute paths are returned unchanged (not resolved).
        """
        candidate = Path(scenario_path)
        if candidate.is_absolute():
            return candidate
        repo_root = Path(__file__).resolve().parents[2]
        return (repo_root / candidate).resolve()
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton shared by the FastAPI routes. It is created at import
# time with the default pool size (max_workers=2).
task_manager = TaskManager()
|
||||||
47
webapp/services/text_utils.py
Normal file
47
webapp/services/text_utils.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
"""Self-contained text helpers for the web layer.
|
||||||
|
|
||||||
|
These intentionally avoid importing from rag_eval so the web server has no
|
||||||
|
import-time dependency on the evaluation engine (and therefore boots even when
|
||||||
|
ragas is unavailable). The contexts parser mirrors rag_eval.shared.utils so the
|
||||||
|
console interprets serialized CSV context columns the same way the engine does.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ast
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def _non_empty_strings(items: list[Any]) -> list[str]:
    """Stringify and strip each item, dropping those that end up empty."""
    stripped = (str(item).strip() for item in items)
    return [text for text in stripped if text]


def parse_contexts(value: Any) -> list[str]:
    """Normalize a context payload into a list of non-empty strings.

    Accepts native lists, JSON/Python-literal serialized lists (as written into
    scores.csv), and plain text, mirroring the engine's own parsing rules.

    Args:
        value: A list, a string, ``None``, a float NaN, or any other object
            (which is stringified).

    Returns:
        The stripped, non-empty context strings. ``None``, NaN and blank input
        give an empty list. Text that is not a serialized list is split on
        blank lines when it contains any, and is otherwise returned as a
        single-element list.
    """
    if isinstance(value, list):
        return _non_empty_strings(value)
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return []

    text = str(value).strip()
    if not text:
        return []

    # Accept serialized lists from CSV exports before falling back to plain text.
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
            # json.JSONDecodeError is a ValueError. literal_eval may also raise
            # TypeError (e.g. an unhashable dict key such as "{[1]: 2}") or
            # RecursionError/MemoryError on pathological input. Ordinary prose
            # must never fail here; it is simply not a serialized list.
            continue
        if isinstance(parsed, list):
            return _non_empty_strings(parsed)

    # Preserve paragraph-style context dumps by splitting on blank lines first.
    if "\n\n" in text:
        chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
        if chunks:
            return chunks

    return [text]
|
||||||
267
webapp/static/css/app.css
Normal file
267
webapp/static/css/app.css
Normal file
@@ -0,0 +1,267 @@
|
|||||||
|
/* Siemens RAGAS evaluation console — stylesheet.
   The palette is based on the Siemens brand colour (petrol / deep teal) plus
   neutral greys, to match the corporate context. */

/* Design tokens shared by every rule below. */
:root {
  /* Brand accent and its darker hover shade. */
  --petrol: #009999;
  --petrol-dark: #007a7a;
  /* Dark text / sidebar background tones. */
  --ink: #0f1b2d;
  --ink-soft: #1a2942;
  /* Secondary text greys. */
  --slate: #64748b;
  --slate-light: #94a3b8;
  /* Borders, page background, card background. */
  --line: #e2e8f0;
  --bg: #f4f6f9;
  --surface: #ffffff;
  /* Status colours. */
  --good: #16a34a;
  --warn: #eab308;
  --bad: #dc2626;
  --shadow: 0 1px 3px rgba(15, 27, 45, 0.08), 0 1px 2px rgba(15, 27, 45, 0.04);
  --radius: 10px;
  font-synthesis: none;
}
|
||||||
|
|
||||||
|
/* Global reset: border-box sizing and no default margins or padding. */
* { box-sizing: border-box; margin: 0; padding: 0; }

/* Base typography; the font stack includes Microsoft YaHei for Chinese text. */
body {
  font-family: "Segoe UI", "Microsoft YaHei", system-ui, -apple-system, sans-serif;
  background: var(--bg);
  color: var(--ink);
  font-size: 14px;
  line-height: 1.5;
}

/* Page shell: sidebar and main column side by side, at least viewport height. */
.app { display: flex; min-height: 100vh; }
|
||||||
|
|
||||||
|
/* ---------- Left navigation ---------- */
/* Fixed-width column that stays pinned while the main area scrolls. */
.sidebar {
  width: 208px;
  flex-shrink: 0;
  background: linear-gradient(180deg, var(--ink) 0%, var(--ink-soft) 100%);
  color: #cbd5e1;
  display: flex;
  flex-direction: column;
  padding: 20px 14px;
  position: sticky;
  top: 0;
  height: 100vh;
}

.brand { padding: 0 8px 22px; }
.brand-mark {
  font-size: 20px; font-weight: 700; letter-spacing: 1px; color: #fff;
}
.brand-sub { font-size: 12px; color: var(--petrol); margin-top: 2px; letter-spacing: 2px; }

/* flex: 1 lets the nav take the free space so the footer sits at the bottom. */
.nav { display: flex; flex-direction: column; gap: 4px; flex: 1; }
.nav-item {
  display: flex; align-items: center; gap: 10px;
  background: transparent; border: none; color: #cbd5e1;
  padding: 10px 12px; border-radius: 8px; cursor: pointer;
  font-size: 14px; text-align: left; width: 100%;
  transition: background 0.15s, color 0.15s;
}
.nav-item:hover { background: rgba(255, 255, 255, 0.06); color: #fff; }
.nav-item.active { background: var(--petrol); color: #fff; }
.nav-item:disabled { opacity: 0.4; cursor: not-allowed; }
.nav-ico { width: 18px; text-align: center; color: var(--petrol); font-weight: 700; }
/* The icon turns white on the petrol background of the active item.
   (This rule used to be declared twice; a single copy has the same effect.) */
.nav-item.active .nav-ico { color: #fff; }

.sidebar-foot {
  display: flex; align-items: center; gap: 8px;
  font-size: 12px; color: var(--slate-light);
  padding: 12px 8px 0; border-top: 1px solid rgba(255, 255, 255, 0.08);
}
/* Status dot: grey by default, green for .ok, red for .bad. */
.dot { width: 8px; height: 8px; border-radius: 50%; background: var(--slate-light); }
.dot.ok { background: var(--good); }
.dot.bad { background: var(--bad); }
|
||||||
|
|
||||||
|
/* ---------- Main content area ---------- */
/* min-width: 0 allows the flex child to shrink below its content width. */
.main { flex: 1; display: flex; flex-direction: column; min-width: 0; }

/* Header bar that stays at the top of the main column while scrolling. */
.topbar {
  display: flex; align-items: center; justify-content: space-between;
  padding: 18px 28px; background: var(--surface); border-bottom: 1px solid var(--line);
  position: sticky; top: 0; z-index: 5;
}
.topbar h1 { font-size: 18px; font-weight: 600; }

.view { padding: 24px 28px; }
|
||||||
|
|
||||||
|
/* ---------- Buttons ---------- */
/* Default button: neutral outline that picks up the accent colour on hover. */
.btn {
  border: 1px solid var(--line); background: var(--surface); color: var(--ink);
  padding: 8px 16px; border-radius: 8px; cursor: pointer; font-size: 13px;
  transition: all 0.15s; font-family: inherit;
}
.btn:hover { border-color: var(--petrol); color: var(--petrol); }
/* Primary button: filled with the accent colour. */
.btn-primary { background: var(--petrol); border-color: var(--petrol); color: #fff; }
.btn-primary:hover { background: var(--petrol-dark); border-color: var(--petrol-dark); color: #fff; }
/* Declared after :hover with equal specificity, so the disabled look also wins while hovered. */
.btn-primary:disabled { background: var(--slate-light); border-color: var(--slate-light); cursor: not-allowed; }
.btn-ghost { background: transparent; }
|
||||||
|
|
||||||
|
/* ---------- Run list ---------- */
/* Responsive card grid: as many columns of at least 320px as fit. */
.runs-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); gap: 16px; }
.run-card {
  background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
  padding: 16px; cursor: pointer; transition: all 0.15s; box-shadow: var(--shadow);
}
.run-card:hover { border-color: var(--petrol); transform: translateY(-1px); }
.run-card-head { display: flex; justify-content: space-between; align-items: flex-start; gap: 10px; }
/* break-all lets long titles wrap at any character. */
.run-card-title { font-size: 15px; font-weight: 600; word-break: break-all; }
.run-card-meta { font-size: 12px; color: var(--slate); margin-top: 6px; line-height: 1.7; }
.run-card-metrics { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 12px; }
.metric-chip {
  font-size: 12px; padding: 3px 8px; border-radius: 6px; background: var(--bg);
  border: 1px solid var(--line);
}
/* Tabular figures keep the digits of metric values the same width. */
.metric-chip b { font-variant-numeric: tabular-nums; }
|
||||||
|
|
||||||
|
/* ---------- Generic panels ---------- */
.panel {
  background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
  padding: 20px; box-shadow: var(--shadow); margin-bottom: 18px;
}
.panel h2 { font-size: 16px; margin-bottom: 6px; }
.panel-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 12px; }

/* Small text utilities. */
.muted { color: var(--slate); }
.tiny { font-size: 11px; margin-top: 8px; }
/* !important so the utility overrides margins set by other rules. */
.tight { margin: 0 !important; }
/* Inline code chip. */
code {
  background: var(--bg); border: 1px solid var(--line); border-radius: 4px;
  padding: 1px 6px; font-size: 12px; font-family: "Cascadia Code", Consolas, monospace;
}
|
||||||
|
|
||||||
|
/* ---------- New evaluation ---------- */
.scenario-list { display: flex; flex-direction: column; gap: 8px; margin: 16px 0; }
/* One selectable scenario row. */
.scenario-item {
  display: flex; align-items: center; justify-content: space-between; gap: 12px;
  border: 1px solid var(--line); border-radius: 8px; padding: 12px 14px; cursor: pointer;
  transition: all 0.15s;
}
.scenario-item:hover { border-color: var(--petrol); background: #f0fbfb; }
/* The inset shadow adds a second 1px ring inside the border for the selected row. */
.scenario-item.selected { border-color: var(--petrol); background: #e6f7f7; box-shadow: inset 0 0 0 1px var(--petrol); }
/* Dimmed row with a not-allowed cursor. */
.scenario-item.invalid { opacity: 0.55; cursor: not-allowed; }
.scenario-name { font-weight: 600; font-size: 14px; }
.scenario-path { font-size: 12px; color: var(--slate); font-family: monospace; }
.scenario-tags { display: flex; gap: 6px; align-items: center; flex-shrink: 0; }
/* Pill-shaped label; the mode variants below recolour it. */
.tag {
  font-size: 11px; padding: 2px 8px; border-radius: 999px; background: var(--bg);
  border: 1px solid var(--line); color: var(--slate);
}
.tag.mode-online { background: #eff6ff; color: #1d4ed8; border-color: #bfdbfe; }
.tag.mode-offline { background: #f0fdf4; color: #15803d; border-color: #bbf7d0; }

.run-actions { display: flex; align-items: center; gap: 14px; }
.selected-scenario { font-size: 13px; }
|
||||||
|
|
||||||
|
/* ---------- 任务进度 ---------- */
|
||||||
|
.task-head { display: flex; align-items: center; gap: 12px; margin-bottom: 12px; }
|
||||||
|
.badge {
|
||||||
|
font-size: 12px; padding: 3px 10px; border-radius: 999px; font-weight: 600;
|
||||||
|
background: var(--bg); color: var(--slate); border: 1px solid var(--line);
|
||||||
|
}
|
||||||
|
.badge.queued { background: #f1f5f9; color: var(--slate); }
|
||||||
|
.badge.running { background: #fef9c3; color: #854d0e; border-color: #fde68a; }
|
||||||
|
.badge.completed { background: #dcfce7; color: #166534; border-color: #bbf7d0; }
|
||||||
|
.badge.failed { background: #fee2e2; color: #991b1b; border-color: #fecaca; }
|
||||||
|
.log-box {
|
||||||
|
background: #0b1220; color: #cbd5e1; border-radius: 8px; padding: 14px;
|
||||||
|
font-family: "Cascadia Code", Consolas, monospace; font-size: 12px; line-height: 1.7;
|
||||||
|
max-height: 320px; overflow-y: auto; white-space: pre-wrap; word-break: break-word;
|
||||||
|
}
|
||||||
|
.task-actions { margin-top: 12px; }
|
||||||
|
|
||||||
|
/* ---------- 报告详情 ---------- */
|
||||||
|
.report-meta {
|
||||||
|
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
|
||||||
|
padding: 14px 18px; display: flex; justify-content: space-between; align-items: center;
|
||||||
|
flex-wrap: wrap; gap: 10px; box-shadow: var(--shadow); margin-bottom: 18px;
|
||||||
|
}
|
||||||
|
.report-meta-title { font-size: 15px; font-weight: 600; }
|
||||||
|
.report-meta-info { font-size: 12px; color: var(--slate); }
|
||||||
|
.status-pill { font-size: 12px; font-weight: 600; }
|
||||||
|
.status-pill.completed { color: var(--good); }
|
||||||
|
|
||||||
|
.section-label {
|
||||||
|
font-size: 12px; font-weight: 600; letter-spacing: 0.5px; color: var(--slate);
|
||||||
|
text-transform: uppercase; margin: 18px 0 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.metric-cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 12px; }
|
||||||
|
.metric-card {
|
||||||
|
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
|
||||||
|
padding: 16px; text-align: center; box-shadow: var(--shadow);
|
||||||
|
}
|
||||||
|
.metric-value { font-size: 28px; font-weight: 700; font-variant-numeric: tabular-nums; }
|
||||||
|
.metric-value.good { color: var(--good); }
|
||||||
|
.metric-value.warn { color: var(--warn); }
|
||||||
|
.metric-value.bad { color: var(--bad); }
|
||||||
|
.metric-value.na { color: var(--slate-light); }
|
||||||
|
.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
|
||||||
|
|
||||||
|
.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
||||||
|
.report-half { margin-bottom: 0; }
|
||||||
|
|
||||||
|
.select {
|
||||||
|
border: 1px solid var(--line); border-radius: 6px; padding: 5px 10px; font-size: 12px;
|
||||||
|
background: var(--surface); color: var(--ink); font-family: inherit; cursor: pointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
.grouping-tabs { display: flex; gap: 6px; margin-bottom: 10px; flex-wrap: wrap; }
|
||||||
|
.grouping-tab {
|
||||||
|
font-size: 12px; padding: 4px 10px; border-radius: 6px; border: 1px solid var(--line);
|
||||||
|
background: var(--surface); cursor: pointer; color: var(--slate);
|
||||||
|
}
|
||||||
|
.grouping-tab.active { background: var(--petrol); color: #fff; border-color: var(--petrol); }
|
||||||
|
|
||||||
|
table.group-table { width: 100%; border-collapse: collapse; font-size: 12px; }
|
||||||
|
table.group-table th, table.group-table td { padding: 6px 8px; text-align: left; }
|
||||||
|
table.group-table th { color: var(--slate); border-bottom: 1px solid var(--line); font-weight: 600; }
|
||||||
|
table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: tabular-nums; }
|
||||||
|
|
||||||
|
/* 最低分样本表 */
|
||||||
|
.lowest-table {
|
||||||
|
background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
|
||||||
|
overflow: hidden; box-shadow: var(--shadow);
|
||||||
|
}
|
||||||
|
.lowest-row {
|
||||||
|
display: grid; grid-template-columns: 90px 1fr auto; gap: 12px; align-items: center;
|
||||||
|
padding: 11px 16px; border-bottom: 1px solid #f1f5f9; cursor: pointer; transition: background 0.12s;
|
||||||
|
}
|
||||||
|
.lowest-row:hover { background: var(--bg); }
|
||||||
|
.lowest-row .sid { font-size: 12px; color: var(--slate); font-family: monospace; }
|
||||||
|
.lowest-row .q { font-size: 13px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
||||||
|
.lowest-row .scores { display: flex; gap: 8px; }
|
||||||
|
.score-badge {
|
||||||
|
font-size: 12px; padding: 2px 8px; border-radius: 6px; font-variant-numeric: tabular-nums;
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
.score-badge.good { background: #dcfce7; color: #166534; }
|
||||||
|
.score-badge.warn { background: #fef9c3; color: #854d0e; }
|
||||||
|
.score-badge.bad { background: #fee2e2; color: #991b1b; }
|
||||||
|
.score-badge.na { background: var(--bg); color: var(--slate-light); }
|
||||||
|
|
||||||
|
.lowest-detail { padding: 0 16px; background: #fcfdfe; border-bottom: 1px solid #f1f5f9; }
|
||||||
|
.lowest-detail-inner { padding: 14px 0; font-size: 13px; line-height: 1.7; }
|
||||||
|
.detail-field { margin-bottom: 10px; }
|
||||||
|
.detail-label { font-size: 12px; color: var(--slate); font-weight: 600; margin-bottom: 3px; }
|
||||||
|
.detail-context { color: #475569; font-size: 12px; }
|
||||||
|
.detail-context .ctx-item {
|
||||||
|
padding: 4px 0; border-bottom: 1px dashed var(--line);
|
||||||
|
}
|
||||||
|
.detail-gt { color: var(--good); }
|
||||||
|
|
||||||
|
.empty { text-align: center; padding: 60px 20px; color: var(--slate); }
|
||||||
|
.empty p { margin-bottom: 8px; }
|
||||||
|
|
||||||
|
.spinner { display: inline-block; width: 14px; height: 14px; border: 2px solid var(--line);
|
||||||
|
border-top-color: var(--petrol); border-radius: 50%; animation: spin 0.7s linear infinite;
|
||||||
|
vertical-align: middle; }
|
||||||
|
@keyframes spin { to { transform: rotate(360deg); } }
|
||||||
|
|
||||||
|
@media (max-width: 880px) {
|
||||||
|
.report-row { grid-template-columns: 1fr; }
|
||||||
|
.sidebar { width: 64px; }
|
||||||
|
.brand-sub, .nav-item span:not(.nav-ico), .sidebar-foot span:last-child { display: none; }
|
||||||
|
}
|
||||||
118
webapp/static/index.html
Normal file
118
webapp/static/index.html
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="zh-CN">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<title>Siemens RAGAS 评估控制台</title>
|
||||||
|
<link rel="stylesheet" href="/static/css/app.css" />
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="app">
|
||||||
|
<!-- 左侧导航(布局 A) -->
|
||||||
|
<aside class="sidebar">
|
||||||
|
<div class="brand">
|
||||||
|
<div class="brand-mark">RAGAS</div>
|
||||||
|
<div class="brand-sub">评估控制台</div>
|
||||||
|
</div>
|
||||||
|
<nav class="nav">
|
||||||
|
<button class="nav-item" data-view="runs">
|
||||||
|
<span class="nav-ico">▢</span><span>运行列表</span>
|
||||||
|
</button>
|
||||||
|
<button class="nav-item" data-view="new">
|
||||||
|
<span class="nav-ico">+</span><span>新建评估</span>
|
||||||
|
</button>
|
||||||
|
<button class="nav-item" data-view="report" data-requires-run="1">
|
||||||
|
<span class="nav-ico">▤</span><span>报告详情</span>
|
||||||
|
</button>
|
||||||
|
</nav>
|
||||||
|
<div class="sidebar-foot">
|
||||||
|
<span class="dot" id="health-dot"></span>
|
||||||
|
<span id="health-text">连接中…</span>
|
||||||
|
</div>
|
||||||
|
</aside>
|
||||||
|
|
||||||
|
<!-- 主内容区 -->
|
||||||
|
<main class="main">
|
||||||
|
<header class="topbar">
|
||||||
|
<h1 id="view-title">运行列表</h1>
|
||||||
|
<button class="btn btn-ghost" id="refresh-btn">刷新</button>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<!-- 运行列表视图 -->
|
||||||
|
<section class="view" id="view-runs">
|
||||||
|
<div id="runs-container" class="runs-grid"></div>
|
||||||
|
<div class="empty" id="runs-empty" hidden>
|
||||||
|
<p>暂无评估运行。</p>
|
||||||
|
<p class="muted">从「新建评估」触发一次,或运行示例数据生成脚本:<code>python scripts/seed_sample_run.py</code></p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- 新建评估视图 -->
|
||||||
|
<section class="view" id="view-new" hidden>
|
||||||
|
<div class="panel">
|
||||||
|
<h2>选择场景并运行</h2>
|
||||||
|
<p class="muted">从 <code>scenarios/</code> 下选择一个场景配置,点击运行后在下方查看实时状态与日志。</p>
|
||||||
|
<div class="scenario-list" id="scenario-list"></div>
|
||||||
|
<div class="run-actions">
|
||||||
|
<button class="btn btn-primary" id="run-btn" disabled>运行评估</button>
|
||||||
|
<span class="selected-scenario muted" id="selected-scenario">未选择场景</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="panel" id="task-panel" hidden>
|
||||||
|
<div class="task-head">
|
||||||
|
<h2>评估进度</h2>
|
||||||
|
<span class="badge" id="task-status">queued</span>
|
||||||
|
</div>
|
||||||
|
<pre class="log-box" id="task-log"></pre>
|
||||||
|
<div class="task-actions">
|
||||||
|
<button class="btn btn-primary" id="view-report-btn" hidden>查看报告</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- 报告详情视图 -->
|
||||||
|
<section class="view" id="view-report" hidden>
|
||||||
|
<div class="empty" id="report-empty">
|
||||||
|
<p>请先从「运行列表」选择一次运行。</p>
|
||||||
|
</div>
|
||||||
|
<div id="report-content" hidden>
|
||||||
|
<!-- 顶部元信息条 -->
|
||||||
|
<div class="report-meta" id="report-meta"></div>
|
||||||
|
|
||||||
|
<!-- ① 指标均值卡片 -->
|
||||||
|
<div class="section-label">① 指标均值 OVERVIEW</div>
|
||||||
|
<div class="metric-cards" id="metric-cards"></div>
|
||||||
|
|
||||||
|
<!-- ② 分布 + ③ 分组 并排 -->
|
||||||
|
<div class="report-row">
|
||||||
|
<div class="panel report-half">
|
||||||
|
<div class="panel-head">
|
||||||
|
<div class="section-label tight">② 分数分布</div>
|
||||||
|
<select id="dist-metric-select" class="select"></select>
|
||||||
|
</div>
|
||||||
|
<canvas id="dist-chart" height="160"></canvas>
|
||||||
|
<p class="muted tiny">暴露长尾失败样本</p>
|
||||||
|
</div>
|
||||||
|
<div class="panel report-half">
|
||||||
|
<div class="section-label tight">③ 分组均值</div>
|
||||||
|
<div id="grouping-tabs" class="grouping-tabs"></div>
|
||||||
|
<div id="grouping-table"></div>
|
||||||
|
<p class="muted tiny">定位薄弱类别</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- ④ 最低分样本逐条复核 -->
|
||||||
|
<div class="section-label">④ 最低分样本(点击展开逐条复核)</div>
|
||||||
|
<div class="lowest-table" id="lowest-table"></div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script src="/static/js/api.js"></script>
|
||||||
|
<script src="/static/js/report.js"></script>
|
||||||
|
<script src="/static/js/runner.js"></script>
|
||||||
|
<script src="/static/js/app.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
46
webapp/static/js/api.js
Normal file
46
webapp/static/js/api.js
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
// api.js — 控制台后端 HTTP 接口的轻量封装。
|
||||||
|
|
||||||
|
// Thin wrapper around the console backend's HTTP/JSON endpoints.
// Every method returns a Promise that resolves with the parsed JSON body and
// rejects with an Error whose message is human-readable.
const API = {
  // Shared request path for get() and post(): performs the fetch, converts a
  // non-2xx response into a thrown Error, and otherwise parses the JSON body.
  async _request(path, options) {
    const resp = await fetch(path, options);
    if (!resp.ok) {
      throw new Error(await API._extractError(resp));
    }
    return resp.json();
  },

  // Generic JSON GET; rejects with the server-provided detail on failure.
  get(path) {
    return API._request(path);
  },

  // Generic JSON POST; a missing body is sent as an empty JSON object.
  post(path, body) {
    return API._request(path, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body || {}),
    });
  },

  // Best-effort extraction of a readable message from an error response.
  // Always resolves with a string: `detail` may be a plain string, or a
  // structured value (e.g. a list of validation errors carrying a `msg`
  // field), which would otherwise surface as "[object Object]" once wrapped
  // in an Error. Falls back to a status-code message when the body is not
  // JSON or carries no usable detail.
  async _extractError(resp) {
    const fallback = `请求失败 (${resp.status})`;
    let data;
    try {
      data = await resp.json();
    } catch (_e) {
      return fallback;
    }
    const detail = data ? data.detail : null;
    if (!detail) return fallback;
    if (typeof detail === "string") return detail;
    if (Array.isArray(detail)) {
      const parts = detail.map((entry) => {
        if (typeof entry === "string") return entry;
        if (entry && typeof entry.msg === "string") return entry.msg;
        return JSON.stringify(entry);
      });
      return parts.join("; ") || fallback;
    }
    return JSON.stringify(detail);
  },

  health() { return API.get("/api/health"); },
  runs() { return API.get("/api/runs"); },
  runDetail(runId) { return API.get(`/api/runs/${encodeURIComponent(runId)}`); },
  scenarios() { return API.get("/api/scenarios"); },
  triggerEvaluation(scenarioPath) {
    return API.post("/api/evaluations", { scenario_path: scenarioPath });
  },
  taskStatus(taskId) { return API.get(`/api/evaluations/${encodeURIComponent(taskId)}`); },
};
|
||||||
152
webapp/static/js/app.js
Normal file
152
webapp/static/js/app.js
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
// app.js — 视图路由、运行列表渲染、健康检查。整个控制台的入口编排。
|
||||||
|
|
||||||
|
// Console entry point: view routing, run-list rendering, health polling, and
// small formatting helpers that the other scripts call through `App.*`.
const App = {
  // run_id of the run shown on the report view; null until a run is picked.
  currentRunId: null,
  // Name of the view last passed to switchView(); read by refreshCurrent().
  activeView: null,
  views: ["runs", "new", "report"],
  titles: { runs: "运行列表", new: "新建评估", report: "报告详情" },

  // Wire up navigation and the refresh button, show the first view and start
  // the periodic health check. Registered as a DOMContentLoaded listener, so
  // it must not rely on `this` (everything goes through `App.`).
  init() {
    document.querySelectorAll(".nav-item").forEach((btn) => {
      btn.addEventListener("click", () => App.switchView(btn.dataset.view));
    });
    document.getElementById("refresh-btn").addEventListener("click", () => App.refreshCurrent());

    Runner.init();
    App.switchView("runs");
    App.checkHealth();
    setInterval(App.checkHealth, 15000);
  },

  // Show exactly one view section, sync the nav highlight and page title, and
  // (re)load that view's data. When "report" is requested without a selected
  // run, Report.render(null) shows the placeholder.
  switchView(view) {
    App.views.forEach((name) => {
      const el = document.getElementById(`view-${name}`);
      if (el) el.hidden = name !== view;
    });
    document.querySelectorAll(".nav-item").forEach((btn) => {
      btn.classList.toggle("active", btn.dataset.view === view);
    });
    document.getElementById("view-title").textContent = App.titles[view] || view;
    App.activeView = view;

    if (view === "runs") App.loadRuns();
    if (view === "new") Runner.loadScenarios();
    if (view === "report") Report.render(App.currentRunId);
  },

  // Reload the data of whichever view is currently active.
  refreshCurrent() {
    App.switchView(App.activeView || "runs");
  },

  // Fetch the run list and render one card per run, or the empty-state
  // placeholder when there are none.
  async loadRuns() {
    const container = document.getElementById("runs-container");
    const empty = document.getElementById("runs-empty");
    // Hide a placeholder left over from a previous load so it cannot sit
    // next to the loading text or an error message.
    empty.hidden = true;
    container.innerHTML = '<p class="muted">加载中…</p>';
    try {
      const data = await API.runs();
      const runs = data.runs || [];
      container.innerHTML = "";
      if (runs.length === 0) {
        empty.hidden = false;
        return;
      }
      runs.forEach((run) => container.appendChild(App.renderRunCard(run)));
    } catch (err) {
      container.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
    }
  },

  // Build the clickable card element for one run; clicking it selects the
  // run and opens the report view.
  renderRunCard(run) {
    const card = document.createElement("div");
    card.className = "run-card";
    card.addEventListener("click", () => {
      App.currentRunId = run.run_id;
      App.enableReportNav();
      App.switchView("report");
    });

    const chips = (run.metrics || [])
      .map((m) => {
        const val = run.metric_means ? run.metric_means[m] : null;
        const cls = App.scoreClass(val);
        // Only real finite numbers are formatted; anything else shows "n/a"
        // instead of throwing on .toFixed().
        const text = typeof val === "number" && Number.isFinite(val) ? val.toFixed(2) : "n/a";
        return `<span class="metric-chip">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
      })
      .join("");

    card.innerHTML = `
      <div class="run-card-head">
        <div class="run-card-title">${App.escape(run.scenario_name || run.run_id)}</div>
      </div>
      <div class="run-card-meta">
        <div>${App.escape(run.mode || "—")} · judge: ${App.escape(run.judge_model || "—")}</div>
        <div>${App.escape(run.valid_samples)} 有效 / ${App.escape(run.invalid_samples)} 无效 · ${App.escape(App.shortTime(run.finished_at))}</div>
      </div>
      <div class="run-card-metrics">${chips}</div>
    `;
    return card;
  },

  // Clear the `disabled` flag on the report nav button once a run is selected.
  enableReportNav() {
    const btn = document.querySelector('.nav-item[data-view="report"]');
    if (btn) btn.disabled = false;
  },

  // Map a score to a colour class: >= 0.8 "good", >= 0.65 "warn", lower
  // "bad". Missing or non-numeric values (null, undefined, NaN, strings)
  // are "na".
  scoreClass(value) {
    if (typeof value !== "number" || !Number.isFinite(value)) return "na";
    if (value >= 0.8) return "good";
    if (value >= 0.65) return "warn";
    return "bad";
  },

  // Abbreviate well-known metric names to save horizontal space; unknown
  // names are returned unchanged.
  shortMetric(name) {
    const map = {
      faithfulness: "faith.",
      answer_relevancy: "ans.rel.",
      context_recall: "ctx.recall",
      context_precision: "ctx.prec.",
    };
    // Own-property check so names such as "constructor" are not resolved
    // against Object.prototype.
    return Object.prototype.hasOwnProperty.call(map, name) ? map[name] : name;
  },

  // Trim a timestamp string to minute precision ("YYYY-MM-DD HH:MM" for an
  // ISO-8601 input); falsy input yields an em dash.
  shortTime(iso) {
    if (!iso) return "—";
    return String(iso).replace("T", " ").slice(0, 16);
  },

  // HTML-escape a value for interpolation into markup. Quotes are encoded as
  // well as & < >, so the result is safe inside double- or single-quoted
  // attribute values (title="…", class="…"), not only in element content.
  // null/undefined become the empty string.
  escape(text) {
    if (text == null) return "";
    const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
    return String(text).replace(/[&<>"']/g, (ch) => entities[ch]);
  },

  // Ping the health endpoint and update the status dot and label in the
  // sidebar footer.
  async checkHealth() {
    const dot = document.getElementById("health-dot");
    const label = document.getElementById("health-text");
    try {
      await API.health();
      dot.className = "dot ok";
      label.textContent = "服务正常";
    } catch (_e) {
      dot.className = "dot bad";
      label.textContent = "服务离线";
    }
  },
};
|
||||||
|
|
||||||
|
document.addEventListener("DOMContentLoaded", App.init);
|
||||||
258
webapp/static/js/report.js
Normal file
258
webapp/static/js/report.js
Normal file
@@ -0,0 +1,258 @@
|
|||||||
|
// report.js — 报告详情页渲染:元信息、指标卡片、分布图、分组表、低分样本复核。
|
||||||
|
|
||||||
|
// Report detail view: meta bar, metric cards, score-distribution chart,
// per-group means table, and the expandable list of lowest-scoring samples.
const Report = {
  // Live Chart.js instance for the distribution chart (destroyed on redraw).
  distChart: null,
  // Last successfully loaded run detail; read by _updateDistChart().
  currentDetail: null,
  // Grouping field whose table is currently shown.
  activeGrouping: null,
  // Incremented on every render() call so a slow, superseded request can be
  // recognised and ignored when it finally resolves.
  _renderToken: 0,
  // Original markup of #report-empty, captured before the first error
  // message replaces it, so the placeholder can be restored later.
  _placeholderHtml: null,

  // Load and render the full report for `runId`. A falsy runId shows the
  // "pick a run first" placeholder instead.
  async render(runId) {
    const empty = document.getElementById("report-empty");
    const content = document.getElementById("report-content");
    if (Report._placeholderHtml === null) Report._placeholderHtml = empty.innerHTML;
    const token = ++Report._renderToken;

    if (!runId) {
      // Restore the placeholder in case an earlier failure overwrote it.
      empty.innerHTML = Report._placeholderHtml;
      empty.hidden = false;
      content.hidden = true;
      return;
    }
    empty.hidden = true;
    content.hidden = false;
    content.style.opacity = "0.4"; // dim stale content while loading

    try {
      const detail = await API.runDetail(runId);
      if (token !== Report._renderToken) return; // a newer render took over
      Report.currentDetail = detail;
      Report.renderMeta(detail.summary);
      Report.renderMetricCards(detail.summary, detail.report);
      Report.renderDistribution(detail.report);
      Report.renderGroupings(detail.report);
      Report.renderLowest(detail.report);
    } catch (err) {
      if (token !== Report._renderToken) return;
      empty.hidden = false;
      content.hidden = true;
      empty.innerHTML = `<p>加载报告失败:${App.escape(err.message)}</p>`;
    } finally {
      // Undo the dimming on success and on failure, but leave it alone when
      // a newer render is still in flight.
      if (token === Report._renderToken) content.style.opacity = "1";
    }
  },

  // Format a score with two decimals; anything that is not a finite number
  // (null, undefined, NaN, strings) yields `placeholder`.
  _formatScore(value, placeholder) {
    return typeof value === "number" && Number.isFinite(value) ? value.toFixed(2) : placeholder;
  },

  // Escape a value for use inside a quoted HTML attribute. Quotes are encoded
  // here explicitly rather than relying on App.escape to do so.
  _attr(text) {
    return App.escape(text).replace(/"/g, "&quot;").replace(/'/g, "&#39;");
  },

  // Top meta bar: scenario name, run id, mode, judge model, sample counts,
  // finish time.
  renderMeta(summary) {
    const el = document.getElementById("report-meta");
    el.innerHTML = `
      <div>
        <div class="report-meta-title">${App.escape(summary.scenario_name || summary.run_id)}
          <span class="status-pill completed">● completed</span></div>
        <div class="report-meta-info">run_id: ${App.escape(summary.run_id)}</div>
      </div>
      <div class="report-meta-info">
        ${App.escape(summary.mode || "—")} · judge: ${App.escape(summary.judge_model || "—")}
        · ${App.escape(summary.total_samples)} 样本 (${App.escape(summary.valid_samples)} 有效 / ${App.escape(summary.invalid_samples)} 无效)
        · ${App.escape(App.shortTime(summary.finished_at))}
      </div>
    `;
  },

  // Section 1: one card per metric showing its mean.
  renderMetricCards(summary, report) {
    const wrap = document.getElementById("metric-cards");
    wrap.innerHTML = "";
    // Prefer the report's metric list, fall back to the summary's, and treat
    // "neither present" as no cards rather than an exception.
    const metrics = (report.metrics && report.metrics.length ? report.metrics : summary.metrics) || [];
    metrics.forEach((metric) => {
      const value = report.metric_means ? report.metric_means[metric] : null;
      const card = document.createElement("div");
      card.className = "metric-card";
      card.innerHTML = `
        <div class="metric-value ${App.scoreClass(value)}">${Report._formatScore(value, "n/a")}</div>
        <div class="metric-name">${App.escape(metric)}</div>
      `;
      wrap.appendChild(card);
    });
  },

  // Section 2: score-distribution histogram with a metric selector.
  renderDistribution(report) {
    const select = document.getElementById("dist-metric-select");
    const distributions = report.distributions || {};
    const metricsWithDist = Object.keys(distributions);

    select.innerHTML = "";
    if (metricsWithDist.length === 0) {
      Report._drawDistChart([], []);
      return;
    }
    metricsWithDist.forEach((metric) => {
      const opt = document.createElement("option");
      opt.value = metric;
      opt.textContent = metric;
      select.appendChild(opt);
    });
    select.onchange = () => Report._updateDistChart(select.value);
    Report._updateDistChart(metricsWithDist[0]);
  },

  // Redraw the histogram from the bins of the chosen metric, taken from the
  // currently loaded run detail.
  _updateDistChart(metric) {
    const distributions = Report.currentDetail.report.distributions || {};
    const bins = distributions[metric] || [];
    Report._drawDistChart(
      bins.map((b) => b.label),
      bins.map((b) => b.count),
      bins.map((b) => Report._binColor(b.lower)),
    );
  },

  // Bar colour by the bin's lower bound: red for the lowest bins through
  // green for the highest, so the failing tail stands out.
  _binColor(lower) {
    if (lower >= 0.8) return "#16a34a";
    if (lower >= 0.6) return "#84cc16";
    if (lower >= 0.4) return "#eab308";
    if (lower >= 0.2) return "#f97316";
    return "#dc2626";
  },

  // Draw the Chart.js bar chart, replacing any previous instance. `colors`
  // is optional; a single default colour is used when it is omitted.
  _drawDistChart(labels, counts, colors) {
    const canvas = document.getElementById("dist-chart");
    if (Report.distChart) {
      Report.distChart.destroy();
      Report.distChart = null;
    }
    // Chart.js is loaded from a CDN by the page; if that script is missing,
    // skip the chart instead of aborting the rest of the report.
    if (typeof Chart === "undefined") {
      console.warn("Chart.js is not available; skipping distribution chart.");
      return;
    }
    Report.distChart = new Chart(canvas, {
      type: "bar",
      data: {
        labels,
        datasets: [{ data: counts, backgroundColor: colors || "#009999", borderRadius: 4 }],
      },
      options: {
        responsive: true,
        plugins: { legend: { display: false } },
        scales: {
          y: { beginAtZero: true, ticks: { precision: 0 }, grid: { color: "#f1f5f9" } },
          x: { grid: { display: false } },
        },
      },
    });
  },

  // Section 3: per-group means, one tab per grouping field found in the
  // report (known fields get a localised tab label).
  renderGroupings(report) {
    const tabsEl = document.getElementById("grouping-tabs");
    const tableEl = document.getElementById("grouping-table");
    const groupings = report.groupings || {};
    const fields = Object.keys(groupings);

    tabsEl.innerHTML = "";
    if (fields.length === 0) {
      tableEl.innerHTML = '<p class="muted tiny">数据集未包含可分组字段(difficulty / question_type)。</p>';
      return;
    }

    const fieldLabels = { difficulty: "难度", question_type: "类型", language: "语言" };
    Report.activeGrouping = fields[0];
    fields.forEach((field) => {
      const tab = document.createElement("button");
      tab.className = "grouping-tab" + (field === Report.activeGrouping ? " active" : "");
      tab.textContent = fieldLabels[field] || field;
      tab.onclick = () => {
        Report.activeGrouping = field;
        tabsEl.querySelectorAll(".grouping-tab").forEach((t) => t.classList.remove("active"));
        tab.classList.add("active");
        Report._drawGroupTable(report, field);
      };
      tabsEl.appendChild(tab);
    });
    Report._drawGroupTable(report, Report.activeGrouping);
  },

  // Render the means table for a single grouping field: one row per group,
  // one column per metric.
  _drawGroupTable(report, field) {
    const tableEl = document.getElementById("grouping-table");
    const stats = (report.groupings || {})[field] || [];
    const metrics = report.metrics || [];

    const headCells = metrics.map((m) => `<th>${App.escape(App.shortMetric(m))}</th>`).join("");
    const head = `<tr><th>组</th><th>样本</th>${headCells}</tr>`;

    const body = stats
      .map((stat) => {
        const cells = metrics
          .map((m) => {
            const v = stat.means ? stat.means[m] : null;
            return `<td class="${App.scoreClass(v)}">${Report._formatScore(v, "—")}</td>`;
          })
          .join("");
        return `<tr><td>${App.escape(stat.key)}</td><td>${App.escape(stat.count)}</td>${cells}</tr>`;
      })
      .join("");

    tableEl.innerHTML = `<table class="group-table">${head}${body}</table>`;
  },

  // Section 4: lowest-scoring samples; clicking a row toggles its detail.
  renderLowest(report) {
    const wrap = document.getElementById("lowest-table");
    const samples = report.lowest_samples || [];
    wrap.innerHTML = "";
    if (samples.length === 0) {
      wrap.innerHTML = '<div class="lowest-detail-inner" style="padding:16px">暂无可复核样本。</div>';
      return;
    }
    const metrics = report.metrics || [];
    samples.forEach((sample) => {
      const row = document.createElement("div");
      row.className = "lowest-row";
      const scoreBadges = metrics
        .map((m) => {
          const v = sample.metrics ? sample.metrics[m] : null;
          // The metric name lands inside title="…", so it needs
          // attribute-safe escaping.
          return `<span class="score-badge ${App.scoreClass(v)}" title="${Report._attr(m)}">${Report._formatScore(v, "—")}</span>`;
        })
        .join("");
      row.innerHTML = `
        <span class="sid">${App.escape(sample.sample_id)}</span>
        <span class="q">${App.escape(sample.question || "—")}</span>
        <span class="scores">${scoreBadges}</span>
      `;

      const detail = document.createElement("div");
      detail.className = "lowest-detail";
      detail.hidden = true;
      detail.innerHTML = Report._detailHtml(sample);

      row.addEventListener("click", () => {
        detail.hidden = !detail.hidden;
      });
      wrap.appendChild(row);
      wrap.appendChild(detail);
    });
  },

  // Expanded detail markup for one sample: question, contexts, answer,
  // ground truth, and the error text when the sample carries one.
  _detailHtml(sample) {
    const contexts = (sample.contexts || [])
      .map((c, i) => `<div class="ctx-item">[${i + 1}] ${App.escape(c)}</div>`)
      .join("");
    const errorBlock = sample.error
      ? `<div class="detail-field"><div class="detail-label">错误 error</div><div style="color:#dc2626">${App.escape(sample.error)}</div></div>`
      : "";
    return `
      <div class="lowest-detail-inner">
        <div class="detail-field">
          <div class="detail-label">问题 question</div>
          <div>${App.escape(sample.question || "—")}</div>
        </div>
        <div class="detail-field">
          <div class="detail-label">检索片段 contexts</div>
          <div class="detail-context">${contexts || "(空)"}</div>
        </div>
        <div class="detail-field">
          <div class="detail-label">生成答案 answer</div>
          <div>${App.escape(sample.answer || "—")}</div>
        </div>
        <div class="detail-field">
          <div class="detail-label">标准答案 ground_truth</div>
          <div class="detail-gt">${App.escape(sample.ground_truth || "—")}</div>
        </div>
        ${errorBlock}
      </div>
    `;
  },
};
|
||||||
133
webapp/static/js/runner.js
Normal file
133
webapp/static/js/runner.js
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
// runner.js — 新建评估视图:列出场景、触发评估、轮询任务状态与日志。
|
||||||
|
|
||||||
|
const Runner = {
  // Path of the scenario currently selected in the list (null until one is clicked).
  selectedScenario: null,
  // Handle of the active polling interval, or null when no poll is running.
  pollTimer: null,
  // run_id of the most recently completed evaluation; read by the "view report" button.
  lastRunId: null,

  // Wire up the run button and the "view report" button.
  init() {
    document.getElementById("run-btn").addEventListener("click", () => Runner.trigger());
    document.getElementById("view-report-btn").addEventListener("click", () => {
      if (Runner.lastRunId) {
        App.currentRunId = Runner.lastRunId;
        App.enableReportNav();
        App.switchView("report");
      }
    });
  },

  // Fetch the scenario list from the API and render it into #scenario-list.
  async loadScenarios() {
    const list = document.getElementById("scenario-list");
    list.innerHTML = '<p class="muted">加载中…</p>';
    try {
      const data = await API.scenarios();
      const scenarios = data.scenarios || [];
      if (scenarios.length === 0) {
        list.innerHTML = '<p class="muted">未在 scenarios/ 下找到场景文件。</p>';
        return;
      }
      list.innerHTML = "";
      scenarios.forEach((sc) => list.appendChild(Runner.renderScenarioItem(sc)));
    } catch (err) {
      list.innerHTML = `<p class="muted">加载失败:${App.escape(err.message)}</p>`;
    }
  },

  // Build the DOM element for one scenario entry.
  // Entries carrying an `error` field are marked invalid and are not selectable.
  renderScenarioItem(sc) {
    const item = document.createElement("div");
    const invalid = !!sc.error;
    item.className = "scenario-item" + (invalid ? " invalid" : "");

    const modeTag = sc.mode
      ? `<span class="tag mode-${App.escape(sc.mode)}">${App.escape(sc.mode)}</span>`
      : "";
    const metricCount = (sc.metrics || []).length;

    item.innerHTML = `
      <div>
        <div class="scenario-name">${App.escape(sc.scenario_name || sc.path)}</div>
        <div class="scenario-path">${App.escape(sc.path)}</div>
        ${sc.error ? `<div class="scenario-path" style="color:#dc2626">${App.escape(sc.error)}</div>` : ""}
      </div>
      <div class="scenario-tags">
        ${modeTag}
        <span class="tag">${metricCount} 指标</span>
      </div>
    `;

    if (!invalid) {
      item.addEventListener("click", () => {
        document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected"));
        item.classList.add("selected");
        Runner.selectedScenario = sc.path;
        document.getElementById("selected-scenario").textContent = sc.path;
        document.getElementById("run-btn").disabled = false;
      });
    }
    return item;
  },

  // Trigger an evaluation for the selected scenario and start polling its task.
  async trigger() {
    if (!Runner.selectedScenario) return;
    const runBtn = document.getElementById("run-btn");
    runBtn.disabled = true;

    const panel = document.getElementById("task-panel");
    const logBox = document.getElementById("task-log");
    const statusBadge = document.getElementById("task-status");
    const reportBtn = document.getElementById("view-report-btn");
    panel.hidden = false;
    reportBtn.hidden = true;
    logBox.textContent = "";
    Runner._setStatus(statusBadge, "queued");

    try {
      const resp = await API.triggerEvaluation(Runner.selectedScenario);
      Runner.poll(resp.task_id);
    } catch (err) {
      Runner._setStatus(statusBadge, "failed");
      logBox.textContent = `触发失败:${err.message}`;
      runBtn.disabled = false;
    }
  },

  // Poll the task status every 1200 ms, refreshing the log box and status badge,
  // until the task reports "completed" or "failed" or a request throws.
  poll(taskId) {
    const logBox = document.getElementById("task-log");
    const statusBadge = document.getElementById("task-status");
    const reportBtn = document.getElementById("view-report-btn");
    const runBtn = document.getElementById("run-btn");

    if (Runner.pollTimer) clearInterval(Runner.pollTimer);

    // True while a status request is pending, so a slow response cannot
    // overlap with the next tick and apply results out of order.
    let inFlight = false;

    // Stop this poll only; never clear a timer that belongs to a newer poll.
    const stop = () => {
      clearInterval(timer);
      if (Runner.pollTimer === timer) Runner.pollTimer = null;
    };

    const timer = setInterval(async () => {
      if (inFlight) return;
      inFlight = true;
      try {
        const status = await API.taskStatus(taskId);
        // A newer poll replaced this one while the request was pending:
        // drop the stale response instead of overwriting fresher UI state.
        if (Runner.pollTimer !== timer) return;

        logBox.textContent = (status.logs || []).join("\n");
        logBox.scrollTop = logBox.scrollHeight;
        Runner._setStatus(statusBadge, status.status);

        if (status.status === "completed" || status.status === "failed") {
          stop();
          runBtn.disabled = false;
          if (status.status === "completed" && status.run_id) {
            Runner.lastRunId = status.run_id;
            reportBtn.hidden = false;
          }
        }
      } catch (err) {
        if (Runner.pollTimer !== timer) return;
        stop();
        logBox.textContent += `\n轮询失败:${err.message}`;
        runBtn.disabled = false;
      } finally {
        inFlight = false;
      }
    }, 1200);
    Runner.pollTimer = timer;
  },

  // Set the badge text to the status and its class to "badge <status>".
  _setStatus(badge, status) {
    badge.textContent = status;
    badge.className = "badge " + status;
  },
};
|
||||||
42
webmain.py
Normal file
42
webmain.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
"""CLI entry point that launches the evaluation console web server.
|
||||||
|
|
||||||
|
Run alongside the existing main.py CLI; both share the same rag_eval library
|
||||||
|
and the same runs/ artifacts. Example:
|
||||||
|
|
||||||
|
python webmain.py
|
||||||
|
python webmain.py --host 0.0.0.0 --port 8800
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import uvicorn
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse host/port/reload options for the console server.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads them from ``sys.argv[1:]``, which is what the
            command-line entry point relies on. Passing an explicit list
            allows programmatic use without touching ``sys.argv``.

    Returns:
        A namespace with ``host`` (str, default ``"127.0.0.1"``), ``port``
        (int, default ``8800``) and ``reload`` (bool, default ``False``).
    """
    parser = argparse.ArgumentParser(description="Launch the RAGAS evaluation console.")
    parser.add_argument("--host", default="127.0.0.1", help="Bind host (default 127.0.0.1).")
    parser.add_argument("--port", type=int, default=8800, help="Bind port (default 8800).")
    parser.add_argument(
        "--reload",
        action="store_true",
        help="Enable auto-reload for local development.",
    )
    return parser.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse the command-line options and serve the console with uvicorn.

    The application is handed to uvicorn as the import string
    ``"webapp.server:app"`` together with the parsed host, port and
    reload settings.
    """
    options = parse_args()
    server_settings = {
        "host": options.host,
        "port": options.port,
        "reload": options.reload,
    }
    uvicorn.run("webapp.server:app", **server_settings)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user