From e89695e49000016be4ce413587b5078e920df8c4 Mon Sep 17 00:00:00 2001 From: wangwei Date: Mon, 15 Jun 2026 15:53:57 +0800 Subject: [PATCH] Add RAGAS evaluation web console (FastAPI + vanilla JS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - webapp/: FastAPI backend with runs/scenarios/evaluations API routers; services for run_reader, report_builder, scenario_scanner, task_manager (lazy ragas import — server boots even without ragas); Pydantic models - webapp/static/: single-page console (layout A: left-nav + main area); report detail with metric cards, Chart.js distribution histogram, grouping table, lowest-score sample review; trigger evaluation + log polling - webmain.py: uvicorn entry point (alongside existing main.py CLI) - start.bat: Windows one-click launcher with env checks and auto-browser open - rag_eval/datasets/: implement missing loader + normalizer modules (load_dataset_records, normalize_records) required by evaluator - scripts/seed_sample_run.py: generate realistic demo run artifacts - .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source Co-Authored-By: Claude Sonnet 4 --- .gitignore | 6 +- rag_eval/datasets/__init__.py | 1 + rag_eval/datasets/loader.py | 56 ++++++ rag_eval/datasets/normalizers.py | 105 +++++++++++ scripts/seed_sample_run.py | 236 ++++++++++++++++++++++++ start.bat | 99 +++++++++++ webapp/__init__.py | 5 + webapp/api/__init__.py | 1 + webapp/api/evaluations.py | 44 +++++ webapp/api/runs.py | 32 ++++ webapp/api/scenarios.py | 16 ++ webapp/models.py | 129 ++++++++++++++ webapp/server.py | 49 +++++ webapp/services/__init__.py | 1 + webapp/services/report_builder.py | 188 ++++++++++++++++++++ webapp/services/run_reader.py | 222 +++++++++++++++++++++++ webapp/services/scenario_scanner.py | 84 +++++++++ webapp/services/task_manager.py | 161 +++++++++++++++++ webapp/services/text_utils.py | 47 +++++ webapp/static/css/app.css | 267 ++++++++++++++++++++++++++++ 
webapp/static/index.html | 118 ++++++++++++ webapp/static/js/api.js | 46 +++++ webapp/static/js/app.js | 152 ++++++++++++++++ webapp/static/js/report.js | 258 +++++++++++++++++++++++++++ webapp/static/js/runner.js | 133 ++++++++++++++ webmain.py | 42 +++++ 26 files changed, 2496 insertions(+), 2 deletions(-) create mode 100644 rag_eval/datasets/__init__.py create mode 100644 rag_eval/datasets/loader.py create mode 100644 rag_eval/datasets/normalizers.py create mode 100644 scripts/seed_sample_run.py create mode 100644 start.bat create mode 100644 webapp/__init__.py create mode 100644 webapp/api/__init__.py create mode 100644 webapp/api/evaluations.py create mode 100644 webapp/api/runs.py create mode 100644 webapp/api/scenarios.py create mode 100644 webapp/models.py create mode 100644 webapp/server.py create mode 100644 webapp/services/__init__.py create mode 100644 webapp/services/report_builder.py create mode 100644 webapp/services/run_reader.py create mode 100644 webapp/services/scenario_scanner.py create mode 100644 webapp/services/task_manager.py create mode 100644 webapp/services/text_utils.py create mode 100644 webapp/static/css/app.css create mode 100644 webapp/static/index.html create mode 100644 webapp/static/js/api.js create mode 100644 webapp/static/js/app.js create mode 100644 webapp/static/js/report.js create mode 100644 webapp/static/js/runner.js create mode 100644 webmain.py diff --git a/.gitignore b/.gitignore index 6c851d2..f2d09f1 100644 --- a/.gitignore +++ b/.gitignore @@ -17,5 +17,7 @@ wheels/ # outputs outputs/ -# datasets -datasets/ \ No newline at end of file +# datasets — raw/normalized data files (large, not committed) +# Note: rag_eval/datasets/ is source code and IS committed (see negation below) +datasets/ +!rag_eval/datasets/ \ No newline at end of file diff --git a/rag_eval/datasets/__init__.py b/rag_eval/datasets/__init__.py new file mode 100644 index 0000000..750585e --- /dev/null +++ b/rag_eval/datasets/__init__.py @@ -0,0 +1 @@ 
def load_dataset_records(path: Path | str) -> list[dict[str, Any]]:
    """Load raw records from a CSV or JSONL file.

    Each row becomes a plain dict. Lists stored as JSON strings in CSV columns
    are left as-is; normalizers handle parsing.

    Args:
        path: Location of the dataset file. ``.jsonl`` / ``.ndjson`` files are
            parsed as JSON Lines; every other extension is parsed as CSV.

    Returns:
        One dict per CSV row, or per non-blank JSONL line, in file order.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file.
        ValueError: If a JSONL line is not valid JSON or is not a JSON object.
    """
    file_path = Path(path)
    if not file_path.is_file():
        raise FileNotFoundError(f"Dataset file not found: {file_path}")

    if file_path.suffix.lower() in (".jsonl", ".ndjson"):
        return _load_jsonl(file_path)
    # ``.csv`` and any unrecognised extension are both treated as CSV.
    return _load_csv(file_path)


def _load_csv(path: Path) -> list[dict[str, Any]]:
    """Read a CSV file into a list of row dicts keyed by the header row."""
    # ``utf-8-sig`` transparently drops a leading byte-order mark (written by
    # Excel and many Windows tools). With plain ``utf-8`` the BOM would be glued
    # onto the first header, e.g. ``"\ufeffsample_id"``, and that column would
    # silently go missing downstream. Files without a BOM decode identically.
    with path.open(encoding="utf-8-sig", newline="") as fh:
        return [dict(row) for row in csv.DictReader(fh)]


def _load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSONL file into a list of record dicts, skipping blank lines."""
    records: list[dict[str, Any]] = []
    # ``utf-8-sig`` for the same reason as the CSV reader: a BOM in front of
    # the first ``{`` would otherwise make ``json.loads`` reject line 1.
    with path.open(encoding="utf-8-sig") as fh:
        for lineno, line in enumerate(fh, 1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc
            if not isinstance(obj, dict):
                raise ValueError(f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}")
            records.append(obj)
    return records
# Keys that map onto dedicated NormalizedSample attributes; everything else in
# a raw record is carried along as metadata.
_CORE_FIELDS = {
    "sample_id",
    "question",
    "contexts",
    "answer",
    "ground_truth",
    "scenario",
    "language",
    "retrieval_config",
}


def _get_str(record: dict[str, Any], key: str, default: str = "") -> str:
    """Read ``record[key]`` as stripped text, falling back to ``default``.

    The fallback is used when the key is absent, the value is ``None``, or the
    stripped text equals ``"nan"`` case-insensitively.
    """
    raw_value = record.get(key)
    if raw_value is None:
        return default
    cleaned = str(raw_value).strip()
    if cleaned.lower() == "nan":
        return default
    return cleaned


def normalize_records(
    records: list[dict[str, Any]],
    mode: str = "offline",
    max_samples: int | None = None,
) -> tuple[list[NormalizedSample], list[InvalidSample]]:
    """Split raw dicts into usable samples and rejected samples.

    A record without a question is always rejected. When ``mode`` is
    ``"offline"`` a record must additionally carry ground_truth, answer and a
    non-empty contexts value; in any other mode those may be blank.

    Args:
        records: Raw rows as produced by the dataset loader.
        mode: ``"offline"`` enables the strict completeness checks.
        max_samples: When given, only ``records[:max_samples]`` is processed.

    Returns:
        ``(valid, invalid)`` lists, each in input order.
    """
    selected = records if max_samples is None else records[:max_samples]

    accepted: list[NormalizedSample] = []
    rejected: list[InvalidSample] = []

    for record in selected:
        # Records without an id get a random 12-hex-character one.
        identifier = _get_str(record, "sample_id") or uuid.uuid4().hex[:12]

        question_text = _get_str(record, "question")
        if not question_text:
            rejected.append(
                InvalidSample(
                    sample_id=identifier,
                    error="missing required field: question",
                    raw=record,
                )
            )
            continue

        reference = _get_str(record, "ground_truth")
        context_list = parse_contexts(record.get("contexts"))
        reply = _get_str(record, "answer")

        if mode == "offline":
            # (message, value) pairs, in the order the messages are reported.
            requirements = (
                ("missing ground_truth", reference),
                ("missing answer", reply),
                ("missing or empty contexts", context_list),
            )
            problems = [message for message, present in requirements if not present]
            if problems:
                rejected.append(
                    InvalidSample(
                        sample_id=identifier,
                        error="; ".join(problems),
                        raw=record,
                    )
                )
                continue

        # Any column that is not a core field travels along as metadata.
        extras = {}
        for column, cell in record.items():
            if column in _CORE_FIELDS:
                continue
            extras[column] = cell

        accepted.append(
            NormalizedSample(
                sample_id=identifier,
                question=question_text,
                contexts=context_list,
                answer=reply,
                ground_truth=reference,
                scenario=_get_str(record, "scenario"),
                language=_get_str(record, "language"),
                retrieval_config=_get_str(record, "retrieval_config"),
                metadata=extras,
                raw=record,
            )
        )

    return accepted, rejected
+SAMPLES = [ + { + "sample_id": "kba-001", "language": "zh", "difficulty": "easy", "question_type": "fact", + "question": "员工入职满3年可享受多少天年休假?", + "contexts": ["员工入司满1年不满10年的,年休假5天。", "年休假在每年1月1日起可申请。"], + "answer": "根据规定,入职满3年的员工可享受5天年休假。", + "ground_truth": "员工入司满1年不满10年的,年休假5天。", + "faithfulness": 0.98, "answer_relevancy": 0.95, "context_recall": 0.97, "context_precision": 0.92, + }, + { + "sample_id": "kba-002", "language": "zh", "difficulty": "easy", "question_type": "fact", + "question": "公司报销差旅费的截止提交时间是什么时候?", + "contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"], + "answer": "差旅费需在出差结束后30天内提交报销。", + "ground_truth": "差旅费报销须在出差结束后30天内提交。", + "faithfulness": 0.96, "answer_relevancy": 0.93, "context_recall": 0.90, "context_precision": 0.88, + }, + { + "sample_id": "kba-003", "language": "zh", "difficulty": "medium", "question_type": "procedure", + "question": "申请远程办公需要经过哪些审批流程?", + "contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"], + "answer": "需先由直属主管审批,再提交人力资源部备案,每月不超过8天。", + "ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案,每月上限8天。", + "faithfulness": 0.91, "answer_relevancy": 0.88, "context_recall": 0.85, "context_precision": 0.79, + }, + { + "sample_id": "kba-004", "language": "en", "difficulty": "medium", "question_type": "fact", + "question": "How many days of paternity leave are employees entitled to?", + "contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."], + "answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.", + "ground_truth": "Employees are entitled to 15 days of paternity leave.", + "faithfulness": 0.89, "answer_relevancy": 0.86, "context_recall": 0.82, "context_precision": 0.74, + }, + { + "sample_id": "kba-005", "language": "zh", "difficulty": "medium", "question_type": "comparison", + "question": "正式员工与试用期员工在医疗保险待遇上有何区别?", + "contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"], + "answer": "正式员工额外享受补充医疗保险,试用期员工仅有基础医疗保险。", + "ground_truth": 
"正式员工在基础医疗保险外另享补充医疗保险,试用期员工仅享基础医疗保险。", + "faithfulness": 0.84, "answer_relevancy": 0.83, "context_recall": 0.78, "context_precision": 0.71, + }, + { + "sample_id": "kba-006", "language": "zh", "difficulty": "hard", "question_type": "summary", + "question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。", + "contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"], + "answer": "第三方共享需签保密协议,敏感数据须经数据保护官批准,记录留存3年。", + "ground_truth": "向第三方共享数据须签署保密协议,敏感数据共享须经数据保护官批准,且共享记录至少留存3年。", + "faithfulness": 0.79, "answer_relevancy": 0.81, "context_recall": 0.70, "context_precision": 0.62, + }, + { + "sample_id": "kba-007", "language": "zh", "difficulty": "hard", "question_type": "procedure", + "question": "跨部门项目预算超支时的审批升级路径是怎样的?", + "contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"], + "answer": "超支10%以内项目经理批,10%-20%需总监批,超20%需财务委员会审批。", + "ground_truth": "超支10%以内由项目经理审批,10%-20%由部门总监审批,超过20%须提交财务委员会审批。", + "faithfulness": 0.58, "answer_relevancy": 0.72, "context_recall": 0.55, "context_precision": 0.48, + }, + { + "sample_id": "kba-008", "language": "zh", "difficulty": "hard", "question_type": "fact", + "question": "员工持股计划的最低锁定期是多少年?", + "contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"], + "answer": "员工持股计划的最低锁定期为3年。", + "ground_truth": "员工持股计划的最低锁定期为4年。", + "faithfulness": 0.22, "answer_relevancy": 0.65, "context_recall": 0.18, "context_precision": 0.30, + }, + { + "sample_id": "kba-009", "language": "en", "difficulty": "hard", "question_type": "comparison", + "question": "What is the difference in notice period between voluntary and involuntary termination?", + "contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."], + "answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.", + "ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.", + "faithfulness": 0.35, "answer_relevancy": 0.70, "context_recall": 0.40, "context_precision": 0.33, + }, + { + 
def _write_scores_csv(path: Path) -> None:
    """Write scores.csv with sample fields, metric scores, and metadata columns.

    One row is written per entry in ``SAMPLES``. The header lists the sample
    fields first, then one column per name in ``METRICS``, then the run-level
    columns (``error``, ``judge_model``, ``embedding_model``, ``run_id``).

    Args:
        path: Destination file; it is created or overwritten.
    """
    fieldnames = [
        "sample_id", "question", "contexts", "answer", "ground_truth",
        "scenario", "language", "difficulty", "question_type",
        *METRICS, "error", "judge_model", "embedding_model", "run_id",
    ]
    # newline="" lets the csv module control line endings itself.
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for sample in SAMPLES:
            row = {
                "sample_id": sample["sample_id"],
                "question": sample["question"],
                # Serialize contexts as a JSON list, matching engine CSV output.
                "contexts": json.dumps(sample["contexts"], ensure_ascii=False),
                "answer": sample["answer"],
                "ground_truth": sample["ground_truth"],
                "scenario": SCENARIO_NAME,
                "language": sample["language"],
                "difficulty": sample["difficulty"],
                "question_type": sample["question_type"],
                "error": "",
                "judge_model": JUDGE_MODEL,
                "embedding_model": EMBEDDING_MODEL,
                # Must be the run identifier (the same value metadata.json and
                # summary.md record), not the scenario name.
                "run_id": RUN_ID,
            }
            row.update({metric: sample[metric] for metric in METRICS})
            writer.writerow(row)
def main() -> None:
    """Create the sample run directory and write every demo artifact into it."""
    target = _output_dir()
    target.mkdir(parents=True, exist_ok=True)

    # (file name, writer) pairs, executed in this order.
    artifacts = (
        ("scores.csv", _write_scores_csv),
        ("invalid.csv", _write_invalid_csv),
        ("metadata.json", _write_metadata),
        ("summary.md", _write_summary),
        ("scenario.snapshot.yaml", _write_scenario_snapshot),
    )
    for filename, write_artifact in artifacts:
        write_artifact(target / filename)

    print(f"Sample run written to: {target}")
    print("Start the console with: python webmain.py")
+ +:: ---- 切换到脚本所在目录(即 siemens_ragas/)------------------- +cd /d "%~dp0" + +:: ---- 检查 Python --------------------------------------------------- +python --version >nul 2>&1 +if errorlevel 1 ( + echo [错误] 未找到 Python,请确认已安装 Python 3.12+ 并加入 PATH。 + pause + exit /b 1 +) +for /f "tokens=*" %%v in ('python --version 2^>^&1') do set PY_VER=%%v +echo [OK] %PY_VER% + +:: ---- 检查 FastAPI / uvicorn --------------------------------------- +python -c "import fastapi, uvicorn" >nul 2>&1 +if errorlevel 1 ( + echo [提示] 正在安装 FastAPI 和 uvicorn... + pip install fastapi uvicorn --quiet + if errorlevel 1 ( + echo [错误] 安装依赖失败,请手动运行: pip install fastapi uvicorn + pause + exit /b 1 + ) + echo [OK] FastAPI / uvicorn 安装完成。 +) else ( + echo [OK] FastAPI / uvicorn 已就绪。 +) + +:: ---- 检查 ragas 版本 ---------------------------------------------- +python -c "import ragas; assert ragas.__version__ == '0.4.3', ragas.__version__" >nul 2>&1 +if errorlevel 1 ( + echo [提示] 正在安装 ragas==0.4.3(评估引擎依赖)... + pip install "ragas==0.4.3" --quiet + if errorlevel 1 ( + echo [警告] ragas 安装失败。 + echo 控制台仍可启动:报告看板可用,触发评估功能将显示错误。 + echo. + ) else ( + echo [OK] ragas 0.4.3 安装完成。 + ) +) else ( + echo [OK] ragas 0.4.3 已就绪。 +) + +:: ---- 检查是否有示例数据,没有则自动生成 --------------------------- +set SAMPLE_META=outputs\kba-knowledge-base-offline-baseline\2026-06-15T08-30-00+00-00\metadata.json +if not exist "%SAMPLE_META%" ( + echo [提示] 未找到示例运行数据,正在生成... + python scripts\seed_sample_run.py + if errorlevel 1 ( + echo [警告] 示例数据生成失败,看板可能为空。继续启动... + ) else ( + echo [OK] 示例数据已生成。 + ) +) else ( + echo [OK] 已有运行数据,跳过示例生成。 +) + +:: ---- 检查端口是否已占用 ------------------------------------------ +set PORT=8800 +netstat -ano | findstr /r ":%PORT%[^0-9]" | findstr "LISTENING" >nul 2>&1 +if not errorlevel 1 ( + echo [警告] 端口 %PORT% 已被占用,尝试使用 8801... 
+ set PORT=8801 + netstat -ano | findstr /r ":8801[^0-9]" | findstr "LISTENING" >nul 2>&1 + if not errorlevel 1 ( + echo [错误] 端口 8800 和 8801 均被占用,请手动指定端口: + echo python webmain.py --port ^<端口号^> + pause + exit /b 1 + ) +) + +echo. +echo ============================================================ +echo 启动控制台:http://127.0.0.1:%PORT% +echo 按 Ctrl+C 停止服务 +echo ============================================================ +echo. + +:: ---- 稍等 1 秒后在默认浏览器打开页面 ---------------------------- +start /b cmd /c "timeout /t 2 >nul && start http://127.0.0.1:%PORT%" + +:: ---- 启动 uvicorn ------------------------------------------------- +python webmain.py --host 127.0.0.1 --port %PORT% + +echo. +echo 服务已停止。 +pause diff --git a/webapp/__init__.py b/webapp/__init__.py new file mode 100644 index 0000000..68c9b41 --- /dev/null +++ b/webapp/__init__.py @@ -0,0 +1,5 @@ +"""Lightweight FastAPI web console layered on top of the rag_eval platform. + +This package is additive and non-invasive: it imports rag_eval as a library and +reads run artifacts from disk. It never modifies the core evaluation modules. 
@router.post("", response_model=TriggerEvaluationResponse)
def trigger_evaluation(request: TriggerEvaluationRequest) -> TriggerEvaluationResponse:
    """Queue a background evaluation for the requested scenario file.

    Responds with HTTP 400 when the scenario scanner cannot resolve the path.
    """
    scenario_path = request.scenario_path
    if scenario_scanner.resolve_scenario_path(scenario_path) is None:
        raise HTTPException(
            status_code=400,
            detail=f"无效或不允许的场景路径: {scenario_path}",
        )

    # NOTE(review): the raw request path is submitted, not the resolved one;
    # presumably the task manager resolves it again — confirm.
    return TriggerEvaluationResponse(task_id=task_manager.submit(scenario_path))


@router.get("/{task_id}", response_model=TaskStatus)
def get_task_status(task_id: str) -> TaskStatus:
    """Look up one evaluation task; responds with HTTP 404 when it is unknown."""
    task = task_manager.get(task_id)
    if task is not None:
        return task
    raise HTTPException(status_code=404, detail=f"未找到任务: {task_id}")


@router.get("", response_model=dict)
def list_tasks() -> dict[str, list]:
    """List every task the task manager currently knows about."""
    dumped = []
    for task in task_manager.list_tasks():
        dumped.append(task.model_dump())
    return {"tasks": dumped}
@router.get("")
def get_runs() -> dict[str, list]:
    """List a summary for each run the run reader can find."""
    dumped = []
    for summary in run_reader.list_run_summaries():
        dumped.append(summary.model_dump())
    return {"runs": dumped}


@router.get("/{run_id}")
def get_run_detail(run_id: str) -> RunDetail:
    """Assemble the summary plus aggregated report for a single run.

    Responds with HTTP 404 when the run directory cannot be found or when no
    summary can be built for it.
    """
    run_dir = run_reader.find_run_dir(run_id)
    if run_dir is None:
        raise HTTPException(status_code=404, detail=f"未找到运行: {run_id}")

    summary = run_reader.build_run_summary(run_dir)
    if summary is None:
        raise HTTPException(status_code=404, detail=f"运行元数据缺失: {run_id}")

    return RunDetail(
        summary=summary,
        report=report_builder.build_report(run_dir, summary.metrics),
    )
class RunSummary(BaseModel):
    """Compact description of a single evaluation run for list views.

    Only ``run_id`` and ``scenario_name`` are required; every other field has
    an empty or zero default so a partially described run still validates.
    """

    run_id: str
    scenario_name: str
    mode: str = ""
    judge_model: str = ""
    embedding_model: str = ""
    # Timestamps are kept as plain strings; no parsing happens in this model.
    started_at: str = ""
    finished_at: str = ""
    dataset: str = ""
    total_samples: int = 0
    valid_samples: int = 0
    invalid_samples: int = 0
    # Metric names; the runs route passes this list to the report builder.
    metrics: list[str] = Field(default_factory=list)
    # Metric name -> mean score; None stands for "no value available".
    metric_means: dict[str, float | None] = Field(default_factory=dict)
    output_path: str = ""


class GroupStat(BaseModel):
    """Mean metric values for one slice of samples grouped by a metadata field."""

    # Value of the grouping column shared by every sample in this slice.
    key: str
    # Number of samples in the slice.
    count: int
    # Metric name -> mean within the slice; None when no mean is available.
    means: dict[str, float | None] = Field(default_factory=dict)


class DistributionBin(BaseModel):
    """One histogram bucket of sample counts for a single metric."""

    # Display text for the bucket, e.g. "0.2–0.4".
    label: str
    # Bucket bounds on the score axis.
    lower: float
    upper: float
    # Number of samples whose score falls inside the bucket.
    count: int


class SampleScore(BaseModel):
    """Per-sample row used for the lowest-score review table."""

    sample_id: str
    question: str = ""
    contexts: list[str] = Field(default_factory=list)
    answer: str = ""
    ground_truth: str = ""
    language: str = ""
    difficulty: str = ""
    question_type: str = ""
    # Metric name -> this sample's score; None when the score is missing.
    metrics: dict[str, float | None] = Field(default_factory=dict)
    # Average over the sample's available metric scores, None when there are none.
    mean_score: float | None = None
    error: str = ""


class ReportData(BaseModel):
    """Aggregated report payload rendered by the report detail page."""

    metrics: list[str] = Field(default_factory=list)
    metric_means: dict[str, float | None] = Field(default_factory=dict)
    # Metric name -> its histogram buckets.
    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)
    # Grouping field name (e.g. "difficulty") -> per-group statistics.
    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
    lowest_samples: list[SampleScore] = Field(default_factory=list)
    summary_markdown: str = ""
def jsonable(value: Any) -> Any:
    """Convert NaN/inf floats into None so the payload stays valid JSON.

    Dicts, lists and tuples are walked recursively and rebuilt as the same
    kind of container; dict keys are left untouched. Any other value is
    returned unchanged.

    Args:
        value: A scalar or an arbitrarily nested dict/list/tuple structure.

    Returns:
        A copy of ``value`` in which every non-finite float is ``None``.
    """
    import math

    if isinstance(value, float):
        return value if math.isfinite(value) else None
    if isinstance(value, dict):
        return {key: jsonable(item) for key, item in value.items()}
    if isinstance(value, list):
        return [jsonable(item) for item in value]
    if isinstance(value, tuple):
        # JSON encoders emit tuples as arrays, so a NaN hidden inside one would
        # still reach the output unless the tuple is cleaned as well.
        return tuple(jsonable(item) for item in value)
    return value
def create_app() -> FastAPI:
    """Build and configure the FastAPI application instance.

    Registers the runs, scenarios and evaluations routers, a ``/api/health``
    liveness route, the ``/`` route that returns ``index.html``, and finally
    the static asset mount.

    Returns:
        The configured application.
    """
    app = FastAPI(
        title="Siemens RAGAS 评估控制台",
        description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
        version="0.1.0",
    )

    # JSON API routers.
    app.include_router(runs.router)
    app.include_router(scenarios.router)
    app.include_router(evaluations.router)

    @app.get("/api/health", tags=["meta"])
    def health() -> dict[str, str]:
        """Report basic liveness so the UI can confirm the server is reachable."""
        return {"status": "ok"}

    # include_in_schema=False keeps this HTML route out of the generated API schema.
    @app.get("/", include_in_schema=False)
    def index() -> FileResponse:
        """Serve the single-page console entry document."""
        return FileResponse(STATIC_DIR / "index.html")

    # Serve CSS/JS assets under /static while keeping API routes at /api.
    app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

    return app
# Number of equal-width buckets used for metric score histograms.
DISTRIBUTION_BIN_COUNT = 5

# Metadata columns that we group samples by when present in the data.
GROUPING_FIELDS = ("difficulty", "question_type", "language")

# How many lowest-scoring samples to surface for manual review.
LOWEST_SAMPLE_COUNT = 10


def _numeric(series: pd.Series) -> pd.Series:
    """Coerce a column to numbers, turning unparseable cells into NaN."""
    return pd.to_numeric(series, errors="coerce")


def _round_or_none(value: float | None) -> float | None:
    """Round a number to four places, mapping None/NaN/inf to None.

    Values that cannot be converted to ``float`` also yield None, so a stray
    non-numeric cell can never break JSON serialisation of the report.
    """
    if value is None:
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # Checked after conversion so NaN held in non-``float`` numeric types is
    # caught as well.
    if not math.isfinite(number):
        return None
    return round(number, 4)


def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
    """Compute the mean of each metric column across all scored samples.

    Metrics without a column in ``frame`` map to None. Cells are coerced to
    numbers first (the same way ``_distribution`` does), so a column that was
    read as text because of one stray value does not raise.
    """
    means: dict[str, float | None] = {}
    for metric in metrics:
        if metric in frame.columns:
            means[metric] = _round_or_none(_numeric(frame[metric]).mean())
        else:
            means[metric] = None
    return means


def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
    """Bucket one metric's scores into fixed-width [0,1] histogram bins.

    Returns an empty list when the metric column is absent. Every bin is
    half-open ``[lower, upper)`` except the last, which also includes 1.0.
    Scores outside [0, 1] are not counted in any bin.
    """
    if metric not in frame.columns:
        return []

    scores = _numeric(frame[metric]).dropna()
    last_index = DISTRIBUTION_BIN_COUNT - 1
    bins: list[DistributionBin] = []
    for index in range(DISTRIBUTION_BIN_COUNT):
        # Divide instead of multiplying by a pre-computed width: 3 * 0.2 is
        # 0.6000000000000001, which would push a score of exactly 0.6 into the
        # bucket below the one it belongs to.
        lower = index / DISTRIBUTION_BIN_COUNT
        upper = (index + 1) / DISTRIBUTION_BIN_COUNT
        if index == last_index:
            # Include the right edge in the final bin so 1.0 is counted.
            mask = (scores >= lower) & (scores <= upper)
        else:
            mask = (scores >= lower) & (scores < upper)
        bins.append(
            DistributionBin(
                label=f"{lower:.1f}–{upper:.1f}",
                lower=round(lower, 2),
                upper=round(upper, 2),
                count=int(mask.sum()),
            )
        )
    return bins


def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupStat]]:
    """Compute per-group metric means for each available grouping field.

    Fields from ``GROUPING_FIELDS`` that are missing from ``frame`` or contain
    only blank values are omitted. Group keys are whitespace-stripped before
    grouping, so ``"easy"`` and ``" easy"`` form a single group.
    """
    groupings: dict[str, list[GroupStat]] = {}
    for field in GROUPING_FIELDS:
        if field not in frame.columns:
            continue
        # Normalised keys: stripped text, with NaN (rendered as "nan") blanked.
        keys = frame[field].astype(str).str.strip().replace("nan", "")
        # Skip fields that are entirely empty so the UI does not render noise.
        if keys.eq("").all():
            continue

        stats: list[GroupStat] = []
        for key, group in frame.groupby(keys):
            key_text = str(key)
            if not key_text:
                continue
            means = {
                metric: _round_or_none(_numeric(group[metric]).mean())
                for metric in metrics
                if metric in group.columns
            }
            stats.append(GroupStat(key=key_text, count=int(len(group)), means=means))
        if stats:
            stats.sort(key=lambda item: item.key)
            groupings[field] = stats
    return groupings


def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
    """Average a single sample's available metric scores for ranking.

    Missing, NaN and non-numeric cells are ignored; returns None when no
    metric has a usable score.
    """
    values: list[float] = []
    for metric in metrics:
        if metric not in row or pd.isna(row[metric]):
            continue
        score = pd.to_numeric(row[metric], errors="coerce")
        if pd.notna(score):
            values.append(float(score))
    if not values:
        return None
    return sum(values) / len(values)


def _cell_text(row: pd.Series, column: str) -> str:
    """Safely read a string cell, returning '' for missing or NaN values."""
    if column not in row or pd.isna(row[column]):
        return ""
    return str(row[column]).strip()
def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
    """Assemble the aggregated report payload for a single run directory.

    Args:
        run_dir: Directory holding the run's scores.csv and summary.md.
        metrics: Metric column names resolved for this run.

    Returns:
        A ReportData carrying metric means, per-metric histograms, grouped
        means, and the lowest-scoring samples. When there are no scores or no
        metrics, only the metric list (all means None) and the summary
        markdown are populated.
    """
    scores = run_reader.read_scores_frame(run_dir)
    markdown = run_reader.read_summary_markdown(run_dir)

    has_scores = not scores.empty and bool(metrics)
    if not has_scores:
        # Nothing to aggregate: report every metric as "no value".
        empty_means = {name: None for name in metrics}
        return ReportData(
            metrics=metrics,
            metric_means=empty_means,
            summary_markdown=markdown,
        )

    # Histograms are only built for metrics that actually have a column.
    histograms = {}
    for name in metrics:
        if name in scores.columns:
            histograms[name] = _distribution(scores, name)

    means = _metric_means(scores, metrics)
    grouped = _groupings(scores, metrics)
    worst = _lowest_samples(scores, metrics)

    return ReportData(
        metrics=metrics,
        metric_means=means,
        distributions=histograms,
        groupings=grouped,
        lowest_samples=worst,
        summary_markdown=markdown,
    )
+"""Read evaluation run artifacts from disk into API-friendly structures. + +A "run" is any directory under the configured output roots that contains a +metadata.json file. This service stays decoupled from rag_eval internals: it +only reads the standard artifact files (metadata.json, scores.csv, summary.md, +scenario.snapshot.yaml) that the reporting layer writes. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pandas as pd +import yaml + +from webapp.models import RunSummary + + +# Directory names that commonly hold run outputs, relative to the repo root. +DEFAULT_OUTPUT_ROOTS = ("outputs", "runs") + + +def _repo_root() -> Path: + """Return the siemens_ragas repository root (parent of the webapp package).""" + return Path(__file__).resolve().parents[2] + + +def _candidate_roots(extra_roots: list[Path] | None = None) -> list[Path]: + """Collect existing output directories that may contain run artifacts.""" + root = _repo_root() + roots: list[Path] = [] + for name in DEFAULT_OUTPUT_ROOTS: + candidate = root / name + if candidate.is_dir(): + roots.append(candidate) + for extra in extra_roots or []: + if extra.is_dir(): + roots.append(extra) + return roots + + +def _read_json(path: Path) -> dict[str, Any]: + """Load a JSON file, returning an empty dict on any failure.""" + try: + return json.loads(path.read_text(encoding="utf-8")) + except (OSError, ValueError): + return {} + + +def _read_metrics_from_snapshot(run_dir: Path) -> list[str]: + """Read the configured metric list from a scenario snapshot if present.""" + snapshot = run_dir / "scenario.snapshot.yaml" + if not snapshot.is_file(): + return [] + try: + payload = yaml.safe_load(snapshot.read_text(encoding="utf-8")) or {} + except (OSError, yaml.YAMLError): + return [] + metrics = payload.get("metrics") + if isinstance(metrics, list): + return [str(item) for item in metrics] + return [] + + +def discover_run_dirs(extra_roots: list[Path] | 
None = None) -> list[Path]: + """Find every run directory (one that contains metadata.json) under the roots.""" + run_dirs: list[Path] = [] + seen: set[Path] = set() + for root in _candidate_roots(extra_roots): + for metadata_path in root.rglob("metadata.json"): + run_dir = metadata_path.parent + # A dataset-build metadata.json also exists; keep only evaluation runs + # by requiring a scores.csv alongside, or a recognizable run metadata. + metadata = _read_json(metadata_path) + if "scenario_name" not in metadata: + continue + if run_dir in seen: + continue + seen.add(run_dir) + run_dirs.append(run_dir) + return run_dirs + + +def _metric_means(run_dir: Path, metrics: list[str]) -> dict[str, float | None]: + """Compute per-metric mean scores from a run's scores.csv.""" + scores_path = run_dir / "scores.csv" + if not scores_path.is_file(): + return {} + try: + frame = pd.read_csv(scores_path) + except (OSError, ValueError, pd.errors.ParserError): + return {} + means: dict[str, float | None] = {} + for metric in metrics: + if metric in frame.columns: + mean_value = frame[metric].mean(numeric_only=True) + means[metric] = None if pd.isna(mean_value) else round(float(mean_value), 4) + else: + means[metric] = None + return means + + +def build_run_summary(run_dir: Path) -> RunSummary | None: + """Assemble a RunSummary from one run directory's artifacts.""" + metadata = _read_json(run_dir / "metadata.json") + if "scenario_name" not in metadata: + return None + + metrics = _read_metrics_from_snapshot(run_dir) + if not metrics: + # Fall back to numeric score columns inferred from the scores file. 
def _infer_metrics_from_scores(run_dir: Path) -> list[str]:
    """Guess the metric columns of a run from its scores.csv header row.

    Used when no scenario snapshot lists the metrics. Only the first data row
    is read; a column counts as a metric when it is not a known sample field
    and pandas infers a numeric dtype for it.

    Args:
        run_dir: Run directory expected to contain scores.csv.

    Returns:
        Metric column names in file order, or an empty list when the file is
        missing or cannot be parsed.
    """
    csv_path = run_dir / "scores.csv"
    if not csv_path.is_file():
        return []

    try:
        head = pd.read_csv(csv_path, nrows=1)
    except (OSError, ValueError, pd.errors.ParserError):
        return []

    is_numeric = pd.api.types.is_numeric_dtype
    return [
        str(name)
        for name in head.columns
        if name not in NON_METRIC_COLUMNS and is_numeric(head[name])
    ]
def read_summary_markdown(run_dir: Path) -> str:
    """Return the human-readable summary.md for a run, or an empty string.

    Args:
        run_dir: Run directory that may contain summary.md.

    Returns:
        The file's text, or ``""`` when the file is missing, unreadable, or
        not valid UTF-8.
    """
    summary_path = run_dir / "summary.md"
    if not summary_path.is_file():
        return ""
    try:
        return summary_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError; without it a
        # summary saved in another encoding would escape this handler.
        return ""
def list_scenarios() -> list[ScenarioInfo]:
    """Return every scenario YAML under scenarios/, sorted by path.

    Both ``*.yaml`` and ``*.yml`` files are discovered recursively. Files that
    fail to parse are still listed; their ScenarioInfo carries an error string.

    Returns:
        One ScenarioInfo per scenario file in ascending path order, or an
        empty list when the scenarios directory does not exist.
    """
    root = _scenarios_root()
    if not root.is_dir():
        return []

    # Merge both extensions before sorting; listing all *.yaml and then all
    # *.yml left the combined result out of path order.
    paths = sorted({*root.rglob("*.yaml"), *root.rglob("*.yml")})
    return [_summarize_scenario(path) for path in paths]
+ """ + root = _repo_root() + candidate = Path(relative_or_absolute) + resolved = candidate if candidate.is_absolute() else (root / candidate) + try: + resolved = resolved.resolve() + except OSError: + return None + + scenarios_root = _scenarios_root().resolve() + if scenarios_root not in resolved.parents and resolved != scenarios_root: + return None + if not resolved.is_file(): + return None + return resolved diff --git a/webapp/services/task_manager.py b/webapp/services/task_manager.py new file mode 100644 index 0000000..27a5725 --- /dev/null +++ b/webapp/services/task_manager.py @@ -0,0 +1,161 @@ +"""In-process background task manager for evaluation runs. + +Evaluations run in a thread pool so the FastAPI event loop is never blocked. +The heavy rag_eval / ragas import is performed lazily inside the worker thread, +which keeps the web server bootable even when the evaluation dependencies are +broken — failures then surface as task errors in the UI instead of crashing +startup. This matches the "coarse status + logs" progress decision. 
+""" + +from __future__ import annotations + +import io +import threading +import uuid +from concurrent.futures import ThreadPoolExecutor +from contextlib import redirect_stderr, redirect_stdout +from datetime import datetime, timezone +from pathlib import Path + +from webapp.models import TaskStatus + + +def _now_iso() -> str: + """Return the current UTC time as an ISO 8601 string.""" + return datetime.now(timezone.utc).isoformat() + + +class _LineCapture(io.TextIOBase): + """A writable stream that appends captured lines to a task's log buffer.""" + + def __init__(self, sink: "EvaluationTask") -> None: + """Bind the capture stream to the owning task.""" + self._sink = sink + self._buffer = "" + + def write(self, text: str) -> int: + """Buffer text and flush complete lines into the task log.""" + self._buffer += text + while "\n" in self._buffer: + line, self._buffer = self._buffer.split("\n", 1) + self._sink.append_log(line) + return len(text) + + def flush(self) -> None: + """Flush any trailing partial line into the task log.""" + if self._buffer: + self._sink.append_log(self._buffer) + self._buffer = "" + + +class EvaluationTask: + """Mutable state for a single background evaluation run.""" + + def __init__(self, task_id: str, scenario_path: str) -> None: + """Initialize a queued task for the given scenario path.""" + self.task_id = task_id + self.scenario_path = scenario_path + self.status = "queued" + self.logs: list[str] = [] + self.run_id: str | None = None + self.error: str | None = None + self.created_at = _now_iso() + self.finished_at = "" + self._lock = threading.Lock() + + def append_log(self, line: str) -> None: + """Append one log line in a thread-safe manner.""" + with self._lock: + self.logs.append(line) + + def snapshot(self) -> TaskStatus: + """Return an immutable copy of the current task state for the API.""" + with self._lock: + return TaskStatus( + task_id=self.task_id, + scenario_path=self.scenario_path, + status=self.status, + 
class TaskManager:
    """Own the worker thread pool and the registry of evaluation tasks.

    Evaluations execute one at a time even when the pool has several workers.
    Log capture relies on ``redirect_stdout`` / ``redirect_stderr``, which
    replace the process-wide ``sys.stdout`` / ``sys.stderr``; two overlapping
    runs would mix their output and could restore the streams in the wrong
    order, leaving ``sys.stdout`` bound to a finished task's capture.
    """

    def __init__(self, max_workers: int = 2) -> None:
        """Create a task manager backed by a small thread pool.

        Args:
            max_workers: Size of the worker pool. Extra workers only hold
                waiting tasks; actual evaluation is serialized by a lock.
        """
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._tasks: dict[str, EvaluationTask] = {}
        # Guards the task registry.
        self._lock = threading.Lock()
        # Guards the process-global stdout/stderr redirection in _execute.
        self._run_lock = threading.Lock()

    def submit(self, scenario_path: str) -> str:
        """Register and schedule a new evaluation task, returning its id."""
        task_id = uuid.uuid4().hex[:12]
        task = EvaluationTask(task_id=task_id, scenario_path=scenario_path)
        with self._lock:
            self._tasks[task_id] = task
        self._executor.submit(self._run, task)
        return task_id

    def get(self, task_id: str) -> TaskStatus | None:
        """Return a snapshot of one task, or None if the id is unknown."""
        with self._lock:
            task = self._tasks.get(task_id)
        return task.snapshot() if task is not None else None

    def list_tasks(self) -> list[TaskStatus]:
        """Return snapshots of all known tasks, newest first."""
        with self._lock:
            tasks = list(self._tasks.values())
        snapshots = [task.snapshot() for task in tasks]
        snapshots.sort(key=lambda item: item.created_at, reverse=True)
        return snapshots

    def _run(self, task: EvaluationTask) -> None:
        """Execute one evaluation end to end inside a worker thread.

        The task keeps its "queued" status while it waits for a running
        evaluation to release the lock.
        """
        with self._run_lock:
            self._execute(task)

    def _execute(self, task: EvaluationTask) -> None:
        """Run the scenario, capturing its output and outcome on the task."""
        task.status = "running"
        task.append_log(f"[{_now_iso()}] 开始评估: {task.scenario_path}")

        capture = _LineCapture(task)
        try:
            # Lazy import keeps the web server bootable if ragas is unavailable.
            task.append_log("加载评估引擎 (rag_eval / ragas)...")
            from rag_eval.execution.runner import run_scenario

            absolute_path = self._to_absolute(task.scenario_path)
            task.append_log(f"运行场景文件: {absolute_path}")

            with redirect_stdout(capture), redirect_stderr(capture):
                result = run_scenario(str(absolute_path))
            # Push any trailing partial line into the log.
            capture.flush()

            task.run_id = getattr(result, "run_id", None)
            output_dir = getattr(getattr(result, "scenario", None), "output_dir", "")
            task.append_log(f"[{_now_iso()}] 评估完成。run_id={task.run_id}")
            if output_dir:
                task.append_log(f"结果目录: {output_dir}")
            task.status = "completed"
        except Exception as exc:  # noqa: BLE001 - surface any failure to the UI
            capture.flush()
            error_type = type(exc).__name__
            task.error = f"{error_type}: {exc}"
            task.append_log(f"[{_now_iso()}] 评估失败 [{error_type}]: {exc}")
            task.status = "failed"
        finally:
            task.finished_at = _now_iso()

    def _to_absolute(self, scenario_path: str) -> Path:
        """Resolve a scenario path against the repository root if relative."""
        candidate = Path(scenario_path)
        if candidate.is_absolute():
            return candidate
        repo_root = Path(__file__).resolve().parents[2]
        return (repo_root / candidate).resolve()
def parse_contexts(value: Any) -> list[str]:
    """Normalize a context payload into a list of non-empty strings.

    Args:
        value: A native list, a JSON / Python-literal serialized list (as
            written into scores.csv), plain text, None, or a float NaN.

    Returns:
        Stripped, non-empty context strings. Serialized lists are decoded;
        other text is split on blank lines when it has any, otherwise it is
        returned as a single-item list. None, NaN, and blank text give ``[]``.
    """
    if isinstance(value, list):
        return [str(item).strip() for item in value if str(item).strip()]
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return []

    text = str(value).strip()
    if not text:
        return []

    # Accept serialized lists from CSV exports before falling back to plain text.
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval can raise more than ValueError/SyntaxError on
            # free-form text (e.g. TypeError for "{[]: 1}"); any of these
            # just means "not a serialized list".
            continue
        if isinstance(parsed, list):
            return [str(item).strip() for item in parsed if str(item).strip()]

    # Preserve paragraph-style context dumps by splitting on blank lines first.
    if "\n\n" in text:
        chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
        if chunks:
            return chunks

    return [text]
+.nav-item.active { background: var(--petrol); color: #fff; } +.nav-item.active .nav-ico { color: #fff; } +.nav-item:disabled { opacity: 0.4; cursor: not-allowed; } +.nav-ico { width: 18px; text-align: center; color: var(--petrol); font-weight: 700; } +.nav-item.active .nav-ico { color: #fff; } + +.sidebar-foot { + display: flex; align-items: center; gap: 8px; + font-size: 12px; color: var(--slate-light); + padding: 12px 8px 0; border-top: 1px solid rgba(255, 255, 255, 0.08); +} +.dot { width: 8px; height: 8px; border-radius: 50%; background: var(--slate-light); } +.dot.ok { background: var(--good); } +.dot.bad { background: var(--bad); } + +/* ---------- 主内容区 ---------- */ +.main { flex: 1; display: flex; flex-direction: column; min-width: 0; } + +.topbar { + display: flex; align-items: center; justify-content: space-between; + padding: 18px 28px; background: var(--surface); border-bottom: 1px solid var(--line); + position: sticky; top: 0; z-index: 5; +} +.topbar h1 { font-size: 18px; font-weight: 600; } + +.view { padding: 24px 28px; } + +/* ---------- 按钮 ---------- */ +.btn { + border: 1px solid var(--line); background: var(--surface); color: var(--ink); + padding: 8px 16px; border-radius: 8px; cursor: pointer; font-size: 13px; + transition: all 0.15s; font-family: inherit; +} +.btn:hover { border-color: var(--petrol); color: var(--petrol); } +.btn-primary { background: var(--petrol); border-color: var(--petrol); color: #fff; } +.btn-primary:hover { background: var(--petrol-dark); border-color: var(--petrol-dark); color: #fff; } +.btn-primary:disabled { background: var(--slate-light); border-color: var(--slate-light); cursor: not-allowed; } +.btn-ghost { background: transparent; } + +/* ---------- 运行列表 ---------- */ +.runs-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); gap: 16px; } +.run-card { + background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius); + padding: 16px; cursor: pointer; 
transition: all 0.15s; box-shadow: var(--shadow); +} +.run-card:hover { border-color: var(--petrol); transform: translateY(-1px); } +.run-card-head { display: flex; justify-content: space-between; align-items: flex-start; gap: 10px; } +.run-card-title { font-size: 15px; font-weight: 600; word-break: break-all; } +.run-card-meta { font-size: 12px; color: var(--slate); margin-top: 6px; line-height: 1.7; } +.run-card-metrics { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 12px; } +.metric-chip { + font-size: 12px; padding: 3px 8px; border-radius: 6px; background: var(--bg); + border: 1px solid var(--line); +} +.metric-chip b { font-variant-numeric: tabular-nums; } + +/* ---------- 通用面板 ---------- */ +.panel { + background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius); + padding: 20px; box-shadow: var(--shadow); margin-bottom: 18px; +} +.panel h2 { font-size: 16px; margin-bottom: 6px; } +.panel-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 12px; } + +.muted { color: var(--slate); } +.tiny { font-size: 11px; margin-top: 8px; } +.tight { margin: 0 !important; } +code { + background: var(--bg); border: 1px solid var(--line); border-radius: 4px; + padding: 1px 6px; font-size: 12px; font-family: "Cascadia Code", Consolas, monospace; +} + +/* ---------- 新建评估 ---------- */ +.scenario-list { display: flex; flex-direction: column; gap: 8px; margin: 16px 0; } +.scenario-item { + display: flex; align-items: center; justify-content: space-between; gap: 12px; + border: 1px solid var(--line); border-radius: 8px; padding: 12px 14px; cursor: pointer; + transition: all 0.15s; +} +.scenario-item:hover { border-color: var(--petrol); background: #f0fbfb; } +.scenario-item.selected { border-color: var(--petrol); background: #e6f7f7; box-shadow: inset 0 0 0 1px var(--petrol); } +.scenario-item.invalid { opacity: 0.55; cursor: not-allowed; } +.scenario-name { font-weight: 600; font-size: 14px; } 
+.scenario-path { font-size: 12px; color: var(--slate); font-family: monospace; } +.scenario-tags { display: flex; gap: 6px; align-items: center; flex-shrink: 0; } +.tag { + font-size: 11px; padding: 2px 8px; border-radius: 999px; background: var(--bg); + border: 1px solid var(--line); color: var(--slate); +} +.tag.mode-online { background: #eff6ff; color: #1d4ed8; border-color: #bfdbfe; } +.tag.mode-offline { background: #f0fdf4; color: #15803d; border-color: #bbf7d0; } + +.run-actions { display: flex; align-items: center; gap: 14px; } +.selected-scenario { font-size: 13px; } + +/* ---------- 任务进度 ---------- */ +.task-head { display: flex; align-items: center; gap: 12px; margin-bottom: 12px; } +.badge { + font-size: 12px; padding: 3px 10px; border-radius: 999px; font-weight: 600; + background: var(--bg); color: var(--slate); border: 1px solid var(--line); +} +.badge.queued { background: #f1f5f9; color: var(--slate); } +.badge.running { background: #fef9c3; color: #854d0e; border-color: #fde68a; } +.badge.completed { background: #dcfce7; color: #166534; border-color: #bbf7d0; } +.badge.failed { background: #fee2e2; color: #991b1b; border-color: #fecaca; } +.log-box { + background: #0b1220; color: #cbd5e1; border-radius: 8px; padding: 14px; + font-family: "Cascadia Code", Consolas, monospace; font-size: 12px; line-height: 1.7; + max-height: 320px; overflow-y: auto; white-space: pre-wrap; word-break: break-word; +} +.task-actions { margin-top: 12px; } + +/* ---------- 报告详情 ---------- */ +.report-meta { + background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius); + padding: 14px 18px; display: flex; justify-content: space-between; align-items: center; + flex-wrap: wrap; gap: 10px; box-shadow: var(--shadow); margin-bottom: 18px; +} +.report-meta-title { font-size: 15px; font-weight: 600; } +.report-meta-info { font-size: 12px; color: var(--slate); } +.status-pill { font-size: 12px; font-weight: 600; } +.status-pill.completed { color: 
var(--good); } + +.section-label { + font-size: 12px; font-weight: 600; letter-spacing: 0.5px; color: var(--slate); + text-transform: uppercase; margin: 18px 0 10px; +} + +.metric-cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 12px; } +.metric-card { + background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius); + padding: 16px; text-align: center; box-shadow: var(--shadow); +} +.metric-value { font-size: 28px; font-weight: 700; font-variant-numeric: tabular-nums; } +.metric-value.good { color: var(--good); } +.metric-value.warn { color: var(--warn); } +.metric-value.bad { color: var(--bad); } +.metric-value.na { color: var(--slate-light); } +.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; } + +.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; } +.report-half { margin-bottom: 0; } + +.select { + border: 1px solid var(--line); border-radius: 6px; padding: 5px 10px; font-size: 12px; + background: var(--surface); color: var(--ink); font-family: inherit; cursor: pointer; +} + +.grouping-tabs { display: flex; gap: 6px; margin-bottom: 10px; flex-wrap: wrap; } +.grouping-tab { + font-size: 12px; padding: 4px 10px; border-radius: 6px; border: 1px solid var(--line); + background: var(--surface); cursor: pointer; color: var(--slate); +} +.grouping-tab.active { background: var(--petrol); color: #fff; border-color: var(--petrol); } + +table.group-table { width: 100%; border-collapse: collapse; font-size: 12px; } +table.group-table th, table.group-table td { padding: 6px 8px; text-align: left; } +table.group-table th { color: var(--slate); border-bottom: 1px solid var(--line); font-weight: 600; } +table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: tabular-nums; } + +/* 最低分样本表 */ +.lowest-table { + background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius); + overflow: hidden; box-shadow: var(--shadow); +} 
+.lowest-row { + display: grid; grid-template-columns: 90px 1fr auto; gap: 12px; align-items: center; + padding: 11px 16px; border-bottom: 1px solid #f1f5f9; cursor: pointer; transition: background 0.12s; +} +.lowest-row:hover { background: var(--bg); } +.lowest-row .sid { font-size: 12px; color: var(--slate); font-family: monospace; } +.lowest-row .q { font-size: 13px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } +.lowest-row .scores { display: flex; gap: 8px; } +.score-badge { + font-size: 12px; padding: 2px 8px; border-radius: 6px; font-variant-numeric: tabular-nums; + font-weight: 600; +} +.score-badge.good { background: #dcfce7; color: #166534; } +.score-badge.warn { background: #fef9c3; color: #854d0e; } +.score-badge.bad { background: #fee2e2; color: #991b1b; } +.score-badge.na { background: var(--bg); color: var(--slate-light); } + +.lowest-detail { padding: 0 16px; background: #fcfdfe; border-bottom: 1px solid #f1f5f9; } +.lowest-detail-inner { padding: 14px 0; font-size: 13px; line-height: 1.7; } +.detail-field { margin-bottom: 10px; } +.detail-label { font-size: 12px; color: var(--slate); font-weight: 600; margin-bottom: 3px; } +.detail-context { color: #475569; font-size: 12px; } +.detail-context .ctx-item { + padding: 4px 0; border-bottom: 1px dashed var(--line); +} +.detail-gt { color: var(--good); } + +.empty { text-align: center; padding: 60px 20px; color: var(--slate); } +.empty p { margin-bottom: 8px; } + +.spinner { display: inline-block; width: 14px; height: 14px; border: 2px solid var(--line); + border-top-color: var(--petrol); border-radius: 50%; animation: spin 0.7s linear infinite; + vertical-align: middle; } +@keyframes spin { to { transform: rotate(360deg); } } + +@media (max-width: 880px) { + .report-row { grid-template-columns: 1fr; } + .sidebar { width: 64px; } + .brand-sub, .nav-item span:not(.nav-ico), .sidebar-foot span:last-child { display: none; } +} diff --git a/webapp/static/index.html 
b/webapp/static/index.html new file mode 100644 index 0000000..c270cbb --- /dev/null +++ b/webapp/static/index.html @@ -0,0 +1,118 @@ + + + + + + Siemens RAGAS 评估控制台 + + + + +
+ + + + +
+
+

运行列表

+ +
+ + +
+
+ +
+ + + + + + +
+
+ + + + + + +
diff --git a/webapp/static/js/api.js b/webapp/static/js/api.js
new file mode 100644
index 0000000..28fcca2
--- /dev/null
+++ b/webapp/static/js/api.js
@@ -0,0 +1,59 @@
+// api.js — thin wrapper around the console backend's HTTP API.
+
+const API = {
+  // Shared fetch helper: resolves with the parsed JSON body when resp.ok is
+  // true, otherwise throws an Error whose message comes from _extractError.
+  async _request(path, init) {
+    const resp = await fetch(path, init);
+    if (!resp.ok) {
+      throw new Error(await API._extractError(resp));
+    }
+    return resp.json();
+  },
+
+  // Generic JSON GET; rejects with a readable Error on a non-OK response.
+  async get(path) {
+    return API._request(path);
+  },
+
+  // Generic JSON POST; a missing body is sent as an empty JSON object.
+  async post(path, body) {
+    return API._request(path, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify(body || {}),
+    });
+  },
+
+  // Best-effort extraction of a readable message from an error response.
+  // `detail` is not always a string: FastAPI request-validation failures (422)
+  // send a list of { loc, msg, type } objects, which would show up as
+  // "[object Object]" once wrapped in an Error, so those are flattened here.
+  async _extractError(resp) {
+    const fallback = `请求失败 (${resp.status})`; // user-facing: "request failed (<status>)"
+    let data;
+    try {
+      data = await resp.json();
+    } catch (_e) {
+      return fallback; // body is not valid JSON
+    }
+    const detail = data ? data.detail : null;
+    if (typeof detail === "string") return detail || fallback;
+    if (Array.isArray(detail)) {
+      const parts = detail.map((item) =>
+        item && typeof item.msg === "string" ? item.msg : JSON.stringify(item)
+      );
+      return parts.length ? parts.join("; ") : fallback;
+    }
+    return detail ? JSON.stringify(detail) : fallback;
+  },
+
+  health() { return API.get("/api/health"); },
+  runs() { return API.get("/api/runs"); },
+  runDetail(runId) { return API.get(`/api/runs/${encodeURIComponent(runId)}`); },
+  scenarios() { return API.get("/api/scenarios"); },
+  triggerEvaluation(scenarioPath) {
+    return API.post("/api/evaluations", { scenario_path: scenarioPath });
+  },
+  taskStatus(taskId) { return API.get(`/api/evaluations/${encodeURIComponent(taskId)}`); },
+};
diff --git a/webapp/static/js/app.js b/webapp/static/js/app.js
new file mode 100644
index 0000000..e36a66b
--- /dev/null
+++ b/webapp/static/js/app.js
@@ -0,0 +1,152 @@
+// app.js — 视图路由、运行列表渲染、健康检查。整个控制台的入口编排。
+
+const App = {
+  currentRunId: null,
+  views: ["runs", "new", "report"],
+  titles: { runs: "运行列表", new: "新建评估", report: "报告详情" },
+
+  // 初始化:绑定导航、加载首屏、启动健康检查。
+  init() {
+    document.querySelectorAll(".nav-item").forEach((btn) => {
+      btn.addEventListener("click", () => App.switchView(btn.dataset.view));
+    });
+    document.getElementById("refresh-btn").addEventListener("click", ()
=> App.refreshCurrent()); + + Runner.init(); + App.switchView("runs"); + App.checkHealth(); + setInterval(App.checkHealth, 15000); + }, + + // 切换主视图,并同步导航高亮与标题。 + switchView(view) { + if (view === "report" && !App.currentRunId) { + // 没有选中的运行时,报告页显示占位。 + } + App.views.forEach((name) => { + const el = document.getElementById(`view-${name}`); + if (el) el.hidden = name !== view; + }); + document.querySelectorAll(".nav-item").forEach((btn) => { + btn.classList.toggle("active", btn.dataset.view === view); + }); + document.getElementById("view-title").textContent = App.titles[view] || view; + App.activeView = view; + + if (view === "runs") App.loadRuns(); + if (view === "new") Runner.loadScenarios(); + if (view === "report") Report.render(App.currentRunId); + }, + + // 刷新当前视图的数据。 + refreshCurrent() { + App.switchView(App.activeView || "runs"); + }, + + // 加载并渲染运行列表。 + async loadRuns() { + const container = document.getElementById("runs-container"); + const empty = document.getElementById("runs-empty"); + container.innerHTML = '

加载中…

'; + try { + const data = await API.runs(); + const runs = data.runs || []; + if (runs.length === 0) { + container.innerHTML = ""; + empty.hidden = false; + return; + } + empty.hidden = true; + container.innerHTML = ""; + runs.forEach((run) => container.appendChild(App.renderRunCard(run))); + } catch (err) { + container.innerHTML = `

加载失败:${App.escape(err.message)}

`; + } + }, + + // 构造一张运行卡片。 + renderRunCard(run) { + const card = document.createElement("div"); + card.className = "run-card"; + card.addEventListener("click", () => { + App.currentRunId = run.run_id; + App.enableReportNav(); + App.switchView("report"); + }); + + const chips = (run.metrics || []) + .map((m) => { + const val = run.metric_means ? run.metric_means[m] : null; + const cls = App.scoreClass(val); + const text = val === null || val === undefined ? "n/a" : val.toFixed(2); + return `${App.escape(App.shortMetric(m))} ${text}`; + }) + .join(""); + + card.innerHTML = ` +
+
${App.escape(run.scenario_name || run.run_id)}
+
+
+
${App.escape(run.mode || "—")} · judge: ${App.escape(run.judge_model || "—")}
+
${run.valid_samples} 有效 / ${run.invalid_samples} 无效 · ${App.escape(App.shortTime(run.finished_at))}
+
+
${chips}
+ `; + return card; + }, + + // 启用报告导航项(选中运行后)。 + enableReportNav() { + const btn = document.querySelector('.nav-item[data-view="report"]'); + if (btn) btn.disabled = false; + }, + + // 根据分值返回 good/warn/bad/na 配色类。 + scoreClass(value) { + if (value === null || value === undefined) return "na"; + if (value >= 0.8) return "good"; + if (value >= 0.65) return "warn"; + return "bad"; + }, + + // 指标名缩写,节省卡片横向空间。 + shortMetric(name) { + const map = { + faithfulness: "faith.", + answer_relevancy: "ans.rel.", + context_recall: "ctx.recall", + context_precision: "ctx.prec.", + }; + return map[name] || name; + }, + + // 截取时间戳到分钟,便于阅读。 + shortTime(iso) { + if (!iso) return "—"; + return String(iso).replace("T", " ").slice(0, 16); + }, + + // 简单 HTML 转义,防止注入。 + escape(text) { + const div = document.createElement("div"); + div.textContent = text == null ? "" : String(text); + return div.innerHTML; + }, + + // 健康检查,更新左下角状态点。 + async checkHealth() { + const dot = document.getElementById("health-dot"); + const label = document.getElementById("health-text"); + try { + await API.health(); + dot.className = "dot ok"; + label.textContent = "服务正常"; + } catch (_e) { + dot.className = "dot bad"; + label.textContent = "服务离线"; + } + }, +}; + +document.addEventListener("DOMContentLoaded", App.init); diff --git a/webapp/static/js/report.js b/webapp/static/js/report.js new file mode 100644 index 0000000..882e27e --- /dev/null +++ b/webapp/static/js/report.js @@ -0,0 +1,258 @@ +// report.js — 报告详情页渲染:元信息、指标卡片、分布图、分组表、低分样本复核。 + +const Report = { + distChart: null, + currentDetail: null, + activeGrouping: null, + + // 加载并渲染指定运行的完整报告。 + async render(runId) { + const empty = document.getElementById("report-empty"); + const content = document.getElementById("report-content"); + if (!runId) { + empty.hidden = false; + content.hidden = true; + return; + } + empty.hidden = true; + content.hidden = false; + content.style.opacity = "0.4"; + + try { + const detail = await API.runDetail(runId); + 
Report.currentDetail = detail; + Report.renderMeta(detail.summary); + Report.renderMetricCards(detail.summary, detail.report); + Report.renderDistribution(detail.report); + Report.renderGroupings(detail.report); + Report.renderLowest(detail.report); + content.style.opacity = "1"; + } catch (err) { + empty.hidden = false; + content.hidden = true; + empty.innerHTML = `

加载报告失败:${App.escape(err.message)}

`; + } + }, + + // 顶部元信息条。 + renderMeta(summary) { + const el = document.getElementById("report-meta"); + el.innerHTML = ` +
+
${App.escape(summary.scenario_name || summary.run_id)} + ● completed
+
run_id: ${App.escape(summary.run_id)}
+
+
+ ${App.escape(summary.mode || "—")} · judge: ${App.escape(summary.judge_model || "—")} + · ${summary.total_samples} 样本 (${summary.valid_samples} 有效 / ${summary.invalid_samples} 无效) + · ${App.escape(App.shortTime(summary.finished_at))} +
+ `; + }, + + // ① 指标均值卡片。 + renderMetricCards(summary, report) { + const wrap = document.getElementById("metric-cards"); + wrap.innerHTML = ""; + const metrics = report.metrics && report.metrics.length ? report.metrics : summary.metrics; + metrics.forEach((metric) => { + const value = report.metric_means ? report.metric_means[metric] : null; + const cls = App.scoreClass(value); + const text = value === null || value === undefined ? "n/a" : value.toFixed(2); + const card = document.createElement("div"); + card.className = "metric-card"; + card.innerHTML = ` +
${text}
+
${App.escape(metric)}
+ `; + wrap.appendChild(card); + }); + }, + + // ② 分数分布直方图(可切换指标)。 + renderDistribution(report) { + const select = document.getElementById("dist-metric-select"); + const distributions = report.distributions || {}; + const metricsWithDist = Object.keys(distributions); + + select.innerHTML = ""; + if (metricsWithDist.length === 0) { + Report._drawDistChart([], []); + return; + } + metricsWithDist.forEach((metric) => { + const opt = document.createElement("option"); + opt.value = metric; + opt.textContent = metric; + select.appendChild(opt); + }); + select.onchange = () => Report._updateDistChart(select.value); + Report._updateDistChart(metricsWithDist[0]); + }, + + // 用选定指标的分箱数据刷新直方图。 + _updateDistChart(metric) { + const distributions = Report.currentDetail.report.distributions || {}; + const bins = distributions[metric] || []; + const labels = bins.map((b) => b.label); + const counts = bins.map((b) => b.count); + const colors = bins.map((b) => Report._binColor(b.lower)); + Report._drawDistChart(labels, counts, colors); + }, + + // 低分箱偏红、高分箱偏绿,直观暴露长尾。 + _binColor(lower) { + if (lower >= 0.8) return "#16a34a"; + if (lower >= 0.6) return "#84cc16"; + if (lower >= 0.4) return "#eab308"; + if (lower >= 0.2) return "#f97316"; + return "#dc2626"; + }, + + // 实际绘制 Chart.js 柱状图。 + _drawDistChart(labels, counts, colors) { + const canvas = document.getElementById("dist-chart"); + if (Report.distChart) Report.distChart.destroy(); + Report.distChart = new Chart(canvas, { + type: "bar", + data: { + labels, + datasets: [{ data: counts, backgroundColor: colors || "#009999", borderRadius: 4 }], + }, + options: { + responsive: true, + plugins: { legend: { display: false } }, + scales: { + y: { beginAtZero: true, ticks: { precision: 0 }, grid: { color: "#f1f5f9" } }, + x: { grid: { display: false } }, + }, + }, + }); + }, + + // ③ 分组均值(difficulty / question_type / language)。 + renderGroupings(report) { + const tabsEl = document.getElementById("grouping-tabs"); + const tableEl = 
document.getElementById("grouping-table"); + const groupings = report.groupings || {}; + const fields = Object.keys(groupings); + + tabsEl.innerHTML = ""; + if (fields.length === 0) { + tableEl.innerHTML = '

数据集未包含可分组字段(difficulty / question_type)。

'; + return; + } + + const fieldLabels = { difficulty: "难度", question_type: "类型", language: "语言" }; + Report.activeGrouping = fields[0]; + fields.forEach((field) => { + const tab = document.createElement("button"); + tab.className = "grouping-tab" + (field === Report.activeGrouping ? " active" : ""); + tab.textContent = fieldLabels[field] || field; + tab.onclick = () => { + Report.activeGrouping = field; + tabsEl.querySelectorAll(".grouping-tab").forEach((t) => t.classList.remove("active")); + tab.classList.add("active"); + Report._drawGroupTable(report, field); + }; + tabsEl.appendChild(tab); + }); + Report._drawGroupTable(report, Report.activeGrouping); + }, + + // 渲染单个分组字段的均值表。 + _drawGroupTable(report, field) { + const tableEl = document.getElementById("grouping-table"); + const stats = report.groupings[field] || []; + const metrics = report.metrics || []; + + let head = "组样本"; + metrics.forEach((m) => (head += `${App.escape(App.shortMetric(m))}`)); + head += ""; + + let body = ""; + stats.forEach((stat) => { + body += `${App.escape(stat.key)}${stat.count}`; + metrics.forEach((m) => { + const v = stat.means ? stat.means[m] : null; + const cls = App.scoreClass(v); + const text = v === null || v === undefined ? "—" : v.toFixed(2); + body += `${text}`; + }); + body += ""; + }); + tableEl.innerHTML = `${head}${body}
`; + }, + + // ④ 最低分样本逐条复核表(点击展开)。 + renderLowest(report) { + const wrap = document.getElementById("lowest-table"); + const samples = report.lowest_samples || []; + wrap.innerHTML = ""; + if (samples.length === 0) { + wrap.innerHTML = '
暂无可复核样本。
'; + return; + } + const metrics = report.metrics || []; + samples.forEach((sample, idx) => { + const row = document.createElement("div"); + row.className = "lowest-row"; + const scoreBadges = metrics + .map((m) => { + const v = sample.metrics ? sample.metrics[m] : null; + const cls = App.scoreClass(v); + const text = v === null || v === undefined ? "—" : v.toFixed(2); + return `${text}`; + }) + .join(""); + row.innerHTML = ` + ${App.escape(sample.sample_id)} + ${App.escape(sample.question || "—")} + ${scoreBadges} + `; + + const detail = document.createElement("div"); + detail.className = "lowest-detail"; + detail.hidden = true; + detail.innerHTML = Report._detailHtml(sample); + + row.addEventListener("click", () => { + detail.hidden = !detail.hidden; + }); + wrap.appendChild(row); + wrap.appendChild(detail); + }); + }, + + // 单条样本的展开详情:question / contexts / answer / ground_truth。 + _detailHtml(sample) { + const contexts = (sample.contexts || []) + .map((c, i) => `
[${i + 1}] ${App.escape(c)}
`) + .join(""); + const errorBlock = sample.error + ? `
错误 error
${App.escape(sample.error)}
` + : ""; + return ` +
+
+
问题 question
+
${App.escape(sample.question || "—")}
+
+
+
检索片段 contexts
+
${contexts || "(空)"}
+
+
+
生成答案 answer
+
${App.escape(sample.answer || "—")}
+
+
+
标准答案 ground_truth
+
${App.escape(sample.ground_truth || "—")}
+
+ ${errorBlock} +
+ `; + }, +}; diff --git a/webapp/static/js/runner.js b/webapp/static/js/runner.js new file mode 100644 index 0000000..b448f03 --- /dev/null +++ b/webapp/static/js/runner.js @@ -0,0 +1,133 @@ +// runner.js — 新建评估视图:列出场景、触发评估、轮询任务状态与日志。 + +const Runner = { + selectedScenario: null, + pollTimer: null, + + // 绑定运行按钮。 + init() { + document.getElementById("run-btn").addEventListener("click", () => Runner.trigger()); + document.getElementById("view-report-btn").addEventListener("click", () => { + if (Runner.lastRunId) { + App.currentRunId = Runner.lastRunId; + App.enableReportNav(); + App.switchView("report"); + } + }); + }, + + // 加载并渲染可触发的场景列表。 + async loadScenarios() { + const list = document.getElementById("scenario-list"); + list.innerHTML = '

加载中…

'; + try { + const data = await API.scenarios(); + const scenarios = data.scenarios || []; + if (scenarios.length === 0) { + list.innerHTML = '

未在 scenarios/ 下找到场景文件。

'; + return; + } + list.innerHTML = ""; + scenarios.forEach((sc) => list.appendChild(Runner.renderScenarioItem(sc))); + } catch (err) { + list.innerHTML = `

加载失败:${App.escape(err.message)}

`; + } + }, + + // 构造单个场景条目。 + renderScenarioItem(sc) { + const item = document.createElement("div"); + const invalid = !!sc.error; + item.className = "scenario-item" + (invalid ? " invalid" : ""); + + const modeTag = sc.mode + ? `${App.escape(sc.mode)}` + : ""; + const metricCount = (sc.metrics || []).length; + + item.innerHTML = ` +
+
${App.escape(sc.scenario_name || sc.path)}
+
${App.escape(sc.path)}
+ ${sc.error ? `
${App.escape(sc.error)}
` : ""} +
+
+ ${modeTag} + ${metricCount} 指标 +
+ `; + + if (!invalid) { + item.addEventListener("click", () => { + document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected")); + item.classList.add("selected"); + Runner.selectedScenario = sc.path; + document.getElementById("selected-scenario").textContent = sc.path; + document.getElementById("run-btn").disabled = false; + }); + } + return item; + }, + + // 触发评估并开始轮询。 + async trigger() { + if (!Runner.selectedScenario) return; + const runBtn = document.getElementById("run-btn"); + runBtn.disabled = true; + + const panel = document.getElementById("task-panel"); + const logBox = document.getElementById("task-log"); + const statusBadge = document.getElementById("task-status"); + const reportBtn = document.getElementById("view-report-btn"); + panel.hidden = false; + reportBtn.hidden = true; + logBox.textContent = ""; + Runner._setStatus(statusBadge, "queued"); + + try { + const resp = await API.triggerEvaluation(Runner.selectedScenario); + Runner.poll(resp.task_id); + } catch (err) { + Runner._setStatus(statusBadge, "failed"); + logBox.textContent = `触发失败:${err.message}`; + runBtn.disabled = false; + } + }, + + // 周期性轮询任务状态,刷新日志与徽标。 + poll(taskId) { + const logBox = document.getElementById("task-log"); + const statusBadge = document.getElementById("task-status"); + const reportBtn = document.getElementById("view-report-btn"); + const runBtn = document.getElementById("run-btn"); + + if (Runner.pollTimer) clearInterval(Runner.pollTimer); + Runner.pollTimer = setInterval(async () => { + try { + const status = await API.taskStatus(taskId); + logBox.textContent = (status.logs || []).join("\n"); + logBox.scrollTop = logBox.scrollHeight; + Runner._setStatus(statusBadge, status.status); + + if (status.status === "completed" || status.status === "failed") { + clearInterval(Runner.pollTimer); + runBtn.disabled = false; + if (status.status === "completed" && status.run_id) { + Runner.lastRunId = status.run_id; + reportBtn.hidden = false; + } + } 
+      } catch (err) {
+        clearInterval(Runner.pollTimer);
+        logBox.textContent += `\n轮询失败:${err.message}`;
+        runBtn.disabled = false;
+      }
+    }, 1200);
+  },
+
+  // 更新状态徽标的文本与配色类。
+  _setStatus(badge, status) {
+    badge.textContent = status;
+    badge.className = "badge " + status;
+  },
+};
diff --git a/webmain.py b/webmain.py
new file mode 100644
index 0000000..30c06d7
--- /dev/null
+++ b/webmain.py
@@ -0,0 +1,58 @@
+"""CLI entry point that launches the evaluation console web server.
+
+Run alongside the existing main.py CLI; both share the same rag_eval library
+and the same runs/ artifacts. Example:
+
+    python webmain.py
+    python webmain.py --host 0.0.0.0 --port 8800
+"""
+
+from __future__ import annotations
+
+import argparse
+from collections.abc import Sequence
+
+import uvicorn
+
+
+def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
+    """Parse host/port/reload options for the console server.
+
+    Args:
+        argv: Argument strings to parse. ``None`` (the default) makes
+            argparse read ``sys.argv[1:]``, which is the original behavior.
+
+    Returns:
+        Namespace with ``host`` (str), ``port`` (int) and ``reload`` (bool).
+    """
+    parser = argparse.ArgumentParser(description="Launch the RAGAS evaluation console.")
+    parser.add_argument("--host", default="127.0.0.1", help="Bind host (default 127.0.0.1).")
+    parser.add_argument("--port", type=int, default=8800, help="Bind port (default 8800).")
+    parser.add_argument(
+        "--reload",
+        action="store_true",
+        help="Enable auto-reload for local development.",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv: Sequence[str] | None = None) -> None:
+    """Start uvicorn with the configured application.
+
+    Args:
+        argv: Optional argument strings forwarded to :func:`parse_args`;
+            ``None`` means use the process command line.
+    """
+    args = parse_args(argv)
+    # The app is passed as an import string rather than an object; uvicorn
+    # requires that form when reload is enabled.
+    uvicorn.run(
+        "webapp.server:app",
+        host=args.host,
+        port=args.port,
+        reload=args.reload,
+    )
+
+
+if __name__ == "__main__":
+    main()