From e89695e49000016be4ce413587b5078e920df8c4 Mon Sep 17 00:00:00 2001
From: wangwei <Wei.Wang@t-systems.com>
Date: Mon, 15 Jun 2026 15:53:57 +0800
Subject: [PATCH] Add RAGAS evaluation web console (FastAPI + vanilla JS)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- webapp/: FastAPI backend with runs/scenarios/evaluations API routers;
  services for run_reader, report_builder, scenario_scanner, task_manager
  (lazy ragas import — server boots even without ragas); Pydantic models
- webapp/static/: single-page console (layout A: left-nav + main area);
  report detail with metric cards, Chart.js distribution histogram,
  grouping table, lowest-score sample review; trigger evaluation + log polling
- webmain.py: uvicorn entry point (alongside existing main.py CLI)
- start.bat: Windows one-click launcher with env checks and auto-browser open
- rag_eval/datasets/: implement missing loader + normalizer modules
  (load_dataset_records, normalize_records) required by evaluator
- scripts/seed_sample_run.py: generate realistic demo run artifacts
- .gitignore: exclude datasets/ data files but keep rag_eval/datasets/ source

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
---
 .gitignore                          |   6 +-
 rag_eval/datasets/__init__.py       |   1 +
 rag_eval/datasets/loader.py         |  56 ++++++
 rag_eval/datasets/normalizers.py    | 105 +++++++++++
 scripts/seed_sample_run.py          | 236 ++++++++++++++++++++++++
 start.bat                           |  99 +++++++++++
 webapp/__init__.py                  |   5 +
 webapp/api/__init__.py              |   1 +
 webapp/api/evaluations.py           |  44 +++++
 webapp/api/runs.py                  |  32 ++++
 webapp/api/scenarios.py             |  16 ++
 webapp/models.py                    | 129 ++++++++++++++
 webapp/server.py                    |  49 +++++
 webapp/services/__init__.py         |   1 +
 webapp/services/report_builder.py   | 188 ++++++++++++++++++++
 webapp/services/run_reader.py       | 222 +++++++++++++++++++++++
 webapp/services/scenario_scanner.py |  84 +++++++++
 webapp/services/task_manager.py     | 161 +++++++++++++++++
 webapp/services/text_utils.py       |  47 +++++
 webapp/static/css/app.css           | 267 ++++++++++++++++++++++++++++
 webapp/static/index.html            | 118 ++++++++++++
 webapp/static/js/api.js             |  46 +++++
 webapp/static/js/app.js             | 152 ++++++++++++++++
 webapp/static/js/report.js          | 258 +++++++++++++++++++++++++++
 webapp/static/js/runner.js          | 133 ++++++++++++++
 webmain.py                          |  42 +++++
 26 files changed, 2496 insertions(+), 2 deletions(-)
 create mode 100644 rag_eval/datasets/__init__.py
 create mode 100644 rag_eval/datasets/loader.py
 create mode 100644 rag_eval/datasets/normalizers.py
 create mode 100644 scripts/seed_sample_run.py
 create mode 100644 start.bat
 create mode 100644 webapp/__init__.py
 create mode 100644 webapp/api/__init__.py
 create mode 100644 webapp/api/evaluations.py
 create mode 100644 webapp/api/runs.py
 create mode 100644 webapp/api/scenarios.py
 create mode 100644 webapp/models.py
 create mode 100644 webapp/server.py
 create mode 100644 webapp/services/__init__.py
 create mode 100644 webapp/services/report_builder.py
 create mode 100644 webapp/services/run_reader.py
 create mode 100644 webapp/services/scenario_scanner.py
 create mode 100644 webapp/services/task_manager.py
 create mode 100644 webapp/services/text_utils.py
 create mode 100644 webapp/static/css/app.css
 create mode 100644 webapp/static/index.html
 create mode 100644 webapp/static/js/api.js
 create mode 100644 webapp/static/js/app.js
 create mode 100644 webapp/static/js/report.js
 create mode 100644 webapp/static/js/runner.js
 create mode 100644 webmain.py

diff --git a/.gitignore b/.gitignore
index 6c851d2..f2d09f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,5 +17,7 @@ wheels/
 # outputs
 outputs/
 
-# datasets
-datasets/
\ No newline at end of file
+# datasets — raw/normalized data files (large, not committed)
+# Note: rag_eval/datasets/ is source code and IS committed (see negation below)
+datasets/
+!rag_eval/datasets/
\ No newline at end of file
diff --git a/rag_eval/datasets/__init__.py b/rag_eval/datasets/__init__.py
new file mode 100644
index 0000000..750585e
--- /dev/null
+++ b/rag_eval/datasets/__init__.py
@@ -0,0 +1 @@
+"""Dataset loading and normalization for the RAG evaluation platform."""
diff --git a/rag_eval/datasets/loader.py b/rag_eval/datasets/loader.py
new file mode 100644
index 0000000..7939a27
--- /dev/null
+++ b/rag_eval/datasets/loader.py
@@ -0,0 +1,56 @@
+"""Load raw evaluation dataset records from disk.
+
+Supports CSV and JSONL formats. Returns a list of plain dicts — normalization
+into NormalizedSample is handled by normalizers.py.
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+from pathlib import Path
+from typing import Any
+
+
+def load_dataset_records(path: Path | str) -> list[dict[str, Any]]:
+    """Load raw records from a CSV or JSONL file.
+
+    Each row becomes a plain dict. Lists stored as JSON strings in CSV columns
+    are left as-is; normalizers handle parsing.
+    """
+    file_path = Path(path)
+    if not file_path.is_file():
+        raise FileNotFoundError(f"Dataset file not found: {file_path}")
+
+    suffix = file_path.suffix.lower()
+    if suffix in (".jsonl", ".ndjson"):
+        return _load_jsonl(file_path)
+    if suffix in (".csv",):
+        return _load_csv(file_path)
+    # Fall back to CSV for unknown extensions.
+    return _load_csv(file_path)
+
+
+def _load_csv(path: Path) -> list[dict[str, Any]]:
+    """Read a CSV file into a list of row dicts, one per data row keyed by header."""
+    # utf-8-sig drops the BOM that Excel/Windows exports prepend to the first header.
+    with path.open(encoding="utf-8-sig", newline="") as fh:
+        return [dict(row) for row in csv.DictReader(fh)]
+
+
+def _load_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Read a JSONL file into record dicts; blank lines are skipped, bad lines raise ValueError."""
+    records: list[dict[str, Any]] = []
+    with path.open(encoding="utf-8-sig") as fh:  # utf-8-sig: tolerate a leading BOM
+        for lineno, line in enumerate(fh, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise ValueError(f"Invalid JSON on line {lineno} of {path}: {exc}") from exc
+            if not isinstance(obj, dict):
+                raise ValueError(f"Expected JSON object on line {lineno} of {path}, got {type(obj).__name__}")
+            records.append(obj)
+    return records
diff --git a/rag_eval/datasets/normalizers.py b/rag_eval/datasets/normalizers.py
new file mode 100644
index 0000000..d77b3f8
--- /dev/null
+++ b/rag_eval/datasets/normalizers.py
@@ -0,0 +1,105 @@
+"""Normalize raw dataset records into NormalizedSample and InvalidSample objects.
+
+Handles both offline mode (records already contain answer + contexts) and online
+mode (records only contain question + ground_truth; adapter fills the rest).
+"""
+
+from __future__ import annotations
+
+import uuid
+from typing import Any
+
+from rag_eval.shared.models import InvalidSample, NormalizedSample
+from rag_eval.shared.utils import parse_contexts
+
+# Fields we always strip from the raw record before storing it in metadata.
+_CORE_FIELDS = {
+    "sample_id",
+    "question",
+    "contexts",
+    "answer",
+    "ground_truth",
+    "scenario",
+    "language",
+    "retrieval_config",
+}
+
+
+def _get_str(record: dict[str, Any], key: str, default: str = "") -> str:
+    """Return a string field from the record, coercing None/NaN to the default."""
+    value = record.get(key)
+    if value is None:
+        return default
+    text = str(value).strip()
+    return default if text.lower() == "nan" else text
+
+
+def normalize_records(
+    records: list[dict[str, Any]],
+    mode: str = "offline",
+    max_samples: int | None = None,
+) -> tuple[list[NormalizedSample], list[InvalidSample]]:
+    """Convert raw dicts into NormalizedSample / InvalidSample collections.
+
+    In offline mode every record must already contain answer and contexts.
+    In online mode those fields may be absent; they will be filled by the adapter.
+    """
+    if max_samples is not None:
+        records = records[:max_samples]
+
+    valid: list[NormalizedSample] = []
+    invalid: list[InvalidSample] = []
+
+    for raw in records:
+        sample_id = _get_str(raw, "sample_id") or uuid.uuid4().hex[:12]
+
+        question = _get_str(raw, "question")
+        if not question:
+            invalid.append(InvalidSample(
+                sample_id=sample_id,
+                error="missing required field: question",
+                raw=raw,
+            ))
+            continue
+
+        ground_truth = _get_str(raw, "ground_truth")
+        contexts = parse_contexts(raw.get("contexts"))
+        answer = _get_str(raw, "answer")
+
+        if mode == "offline":
+            errors: list[str] = []
+            if not ground_truth:
+                errors.append("missing ground_truth")
+            if not answer:
+                errors.append("missing answer")
+            if not contexts:
+                errors.append("missing or empty contexts")
+            if errors:
+                invalid.append(InvalidSample(
+                    sample_id=sample_id,
+                    error="; ".join(errors),
+                    raw=raw,
+                ))
+                continue
+
+        # Collect any extra columns as opaque metadata for adapters and reporting.
+        metadata = {
+            key: value
+            for key, value in raw.items()
+            if key not in _CORE_FIELDS
+        }
+
+        valid.append(NormalizedSample(
+            sample_id=sample_id,
+            question=question,
+            contexts=contexts,
+            answer=answer,
+            ground_truth=ground_truth,
+            scenario=_get_str(raw, "scenario"),
+            language=_get_str(raw, "language"),
+            retrieval_config=_get_str(raw, "retrieval_config"),
+            metadata=metadata,
+            raw=raw,
+        ))
+
+    return valid, invalid
diff --git a/scripts/seed_sample_run.py b/scripts/seed_sample_run.py
new file mode 100644
index 0000000..0d8c18b
--- /dev/null
+++ b/scripts/seed_sample_run.py
@@ -0,0 +1,236 @@
+"""Generate a realistic sample evaluation run so the console has demo data.
+
+This writes the standard run artifacts (metadata.json, scores.csv, summary.md,
+scenario.snapshot.yaml) under outputs/, exactly mirroring what the reporting
+layer produces, but without needing ragas or any network calls. It lets the
+report board render immediately for demos and local development.
+
+Usage:
+    python scripts/seed_sample_run.py
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+
+SCENARIO_NAME = "kba-knowledge-base-offline-baseline"
+RUN_ID = "2026-06-15T08-30-00+00-00"
+JUDGE_MODEL = "deepseek-distill-qwen-32b"
+EMBEDDING_MODEL = "text-embedding-v3"
+METRICS = ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]
+
+# Each row mirrors a scores.csv record: sample fields + metric scores + metadata.
+# Scores are hand-tuned to exercise the full UI (greens, yellows, reds, a long
+# tail in the distribution, and clear weak groups by difficulty).
+SAMPLES = [
+    {
+        "sample_id": "kba-001", "language": "zh", "difficulty": "easy", "question_type": "fact",
+        "question": "员工入职满3年可享受多少天年休假？",
+        "contexts": ["员工入司满1年不满10年的，年休假5天。", "年休假在每年1月1日起可申请。"],
+        "answer": "根据规定，入职满3年的员工可享受5天年休假。",
+        "ground_truth": "员工入司满1年不满10年的，年休假5天。",
+        "faithfulness": 0.98, "answer_relevancy": 0.95, "context_recall": 0.97, "context_precision": 0.92,
+    },
+    {
+        "sample_id": "kba-002", "language": "zh", "difficulty": "easy", "question_type": "fact",
+        "question": "公司报销差旅费的截止提交时间是什么时候？",
+        "contexts": ["差旅费报销须在出差结束后30天内提交。", "逾期提交需部门经理审批。"],
+        "answer": "差旅费需在出差结束后30天内提交报销。",
+        "ground_truth": "差旅费报销须在出差结束后30天内提交。",
+        "faithfulness": 0.96, "answer_relevancy": 0.93, "context_recall": 0.90, "context_precision": 0.88,
+    },
+    {
+        "sample_id": "kba-003", "language": "zh", "difficulty": "medium", "question_type": "procedure",
+        "question": "申请远程办公需要经过哪些审批流程？",
+        "contexts": ["远程办公申请需先由直属主管审批。", "随后提交人力资源部备案。", "每月远程办公不超过8天。"],
+        "answer": "需先由直属主管审批，再提交人力资源部备案，每月不超过8天。",
+        "ground_truth": "远程办公申请须经直属主管审批并报人力资源部备案，每月上限8天。",
+        "faithfulness": 0.91, "answer_relevancy": 0.88, "context_recall": 0.85, "context_precision": 0.79,
+    },
+    {
+        "sample_id": "kba-004", "language": "en", "difficulty": "medium", "question_type": "fact",
+        "question": "How many days of paternity leave are employees entitled to?",
+        "contexts": ["Employees are entitled to 15 days of paternity leave.", "Leave must be taken within 6 months of birth."],
+        "answer": "Employees are entitled to 15 days of paternity leave, to be taken within 6 months.",
+        "ground_truth": "Employees are entitled to 15 days of paternity leave.",
+        "faithfulness": 0.89, "answer_relevancy": 0.86, "context_recall": 0.82, "context_precision": 0.74,
+    },
+    {
+        "sample_id": "kba-005", "language": "zh", "difficulty": "medium", "question_type": "comparison",
+        "question": "正式员工与试用期员工在医疗保险待遇上有何区别？",
+        "contexts": ["正式员工享受补充医疗保险。", "试用期员工享受基础医疗保险。"],
+        "answer": "正式员工额外享受补充医疗保险，试用期员工仅有基础医疗保险。",
+        "ground_truth": "正式员工在基础医疗保险外另享补充医疗保险，试用期员工仅享基础医疗保险。",
+        "faithfulness": 0.84, "answer_relevancy": 0.83, "context_recall": 0.78, "context_precision": 0.71,
+    },
+    {
+        "sample_id": "kba-006", "language": "zh", "difficulty": "hard", "question_type": "summary",
+        "question": "请总结公司数据安全政策中关于第三方数据共享的核心要求。",
+        "contexts": ["第三方共享数据须签署保密协议。", "敏感数据共享须经数据保护官批准。", "共享记录须留存至少3年。"],
+        "answer": "第三方共享需签保密协议，敏感数据须经数据保护官批准，记录留存3年。",
+        "ground_truth": "向第三方共享数据须签署保密协议，敏感数据共享须经数据保护官批准，且共享记录至少留存3年。",
+        "faithfulness": 0.79, "answer_relevancy": 0.81, "context_recall": 0.70, "context_precision": 0.62,
+    },
+    {
+        "sample_id": "kba-007", "language": "zh", "difficulty": "hard", "question_type": "procedure",
+        "question": "跨部门项目预算超支时的审批升级路径是怎样的？",
+        "contexts": ["预算超支10%以内由项目经理审批。", "超支10%-20%需部门总监审批。"],
+        "answer": "超支10%以内项目经理批，10%-20%需总监批，超20%需财务委员会审批。",
+        "ground_truth": "超支10%以内由项目经理审批，10%-20%由部门总监审批，超过20%须提交财务委员会审批。",
+        "faithfulness": 0.58, "answer_relevancy": 0.72, "context_recall": 0.55, "context_precision": 0.48,
+    },
+    {
+        "sample_id": "kba-008", "language": "zh", "difficulty": "hard", "question_type": "fact",
+        "question": "员工持股计划的最低锁定期是多少年？",
+        "contexts": ["员工福利包括弹性工作制。", "公司提供年度体检。"],
+        "answer": "员工持股计划的最低锁定期为3年。",
+        "ground_truth": "员工持股计划的最低锁定期为4年。",
+        "faithfulness": 0.22, "answer_relevancy": 0.65, "context_recall": 0.18, "context_precision": 0.30,
+    },
+    {
+        "sample_id": "kba-009", "language": "en", "difficulty": "hard", "question_type": "comparison",
+        "question": "What is the difference in notice period between voluntary and involuntary termination?",
+        "contexts": ["Voluntary resignation requires 30 days notice.", "The probation period lasts 3 months."],
+        "answer": "Voluntary termination needs 30 days notice; involuntary termination needs 60 days.",
+        "ground_truth": "Voluntary resignation requires 30 days notice; involuntary termination requires 60 days notice.",
+        "faithfulness": 0.35, "answer_relevancy": 0.70, "context_recall": 0.40, "context_precision": 0.33,
+    },
+    {
+        "sample_id": "kba-010", "language": "zh", "difficulty": "medium", "question_type": "fact",
+        "question": "公司规定的标准工作时间是每周多少小时？",
+        "contexts": ["标准工作时间为每周40小时。", "加班需事先申请。"],
+        "answer": "公司标准工作时间为每周40小时。",
+        "ground_truth": "公司标准工作时间为每周40小时。",
+        "faithfulness": 0.99, "answer_relevancy": 0.96, "context_recall": 0.95, "context_precision": 0.90,
+    },
+]
+
+# Two samples that failed normalization, to exercise the invalid count display.
+INVALID_SAMPLES = [
+    {"sample_id": "kba-011", "error": "missing ground_truth", "question": "公司年会在什么时候举办？"},
+    {"sample_id": "kba-012", "error": "empty contexts after retrieval", "question": "停车场如何申请月卡？"},
+]
+
+
+def _output_dir() -> Path:
+    """Return the run directory where sample artifacts are written."""
+    return REPO_ROOT / "outputs" / SCENARIO_NAME / RUN_ID
+
+
+def _write_scores_csv(path: Path) -> None:
+    """Write scores.csv at *path*: one row per SAMPLES entry with sample fields, metric scores, and run metadata."""
+    fieldnames = [
+        "sample_id", "question", "contexts", "answer", "ground_truth",
+        "scenario", "language", "difficulty", "question_type",
+        *METRICS, "error", "judge_model", "embedding_model", "run_id",
+    ]
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+        for sample in SAMPLES:
+            row = {
+                "sample_id": sample["sample_id"],
+                "question": sample["question"],
+                # Serialize contexts as a JSON list, matching engine CSV output.
+                "contexts": json.dumps(sample["contexts"], ensure_ascii=False),
+                "answer": sample["answer"],
+                "ground_truth": sample["ground_truth"],
+                "scenario": SCENARIO_NAME,
+                "language": sample["language"],
+                "difficulty": sample["difficulty"],
+                "question_type": sample["question_type"],
+                "error": "",
+                "judge_model": JUDGE_MODEL,
+                "embedding_model": EMBEDDING_MODEL,
+                "run_id": RUN_ID,  # must match metadata.json, not the scenario name
+            }
+            for metric in METRICS:
+                row[metric] = sample[metric]
+            writer.writerow(row)
+
+
+def _write_invalid_csv(path: Path) -> None:
+    """Write invalid.csv with the small set of unscored samples."""
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=["sample_id", "error", "question"])
+        writer.writeheader()
+        writer.writerows(INVALID_SAMPLES)
+
+
+def _metric_mean(metric: str) -> float:
+    """Compute the mean of one metric across the valid samples."""
+    return round(sum(sample[metric] for sample in SAMPLES) / len(SAMPLES), 4)
+
+
+def _write_metadata(path: Path) -> None:
+    """Write metadata.json mirroring the reporting layer's schema."""
+    metadata = {
+        "run_id": RUN_ID,
+        "scenario_name": SCENARIO_NAME,
+        "mode": "offline",
+        "judge_model": JUDGE_MODEL,
+        "embedding_model": EMBEDDING_MODEL,
+        "started_at": "2026-06-15T08:29:12+00:00",
+        "finished_at": "2026-06-15T08:31:45+00:00",
+        "dataset": "datasets/normalized/kba_knowledge_base_baseline.csv",
+        "valid_samples": len(SAMPLES),
+        "invalid_samples": len(INVALID_SAMPLES),
+    }
+    path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def _write_summary(path: Path) -> None:
+    """Write a human-readable summary.md echoing the metric means."""
+    lines = [
+        f"# {SCENARIO_NAME}",
+        "",
+        f"- run_id: `{RUN_ID}`",
+        "- mode: `offline`",
+        f"- total_samples: `{len(SAMPLES) + len(INVALID_SAMPLES)}`",
+        f"- valid_samples: `{len(SAMPLES)}`",
+        f"- invalid_samples: `{len(INVALID_SAMPLES)}`",
+        f"- judge_model: `{JUDGE_MODEL}`",
+        "",
+        "## Metric Means",
+        "",
+    ]
+    for metric in METRICS:
+        lines.append(f"- {metric}: `{_metric_mean(metric):.4f}`")
+    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+def _write_scenario_snapshot(path: Path) -> None:
+    """Write scenario.snapshot.yaml so the reader resolves the metric list."""
+    import yaml
+
+    snapshot = {
+        "scenario_name": SCENARIO_NAME,
+        "mode": "offline",
+        "judge_model": JUDGE_MODEL,
+        "embedding_model": EMBEDDING_MODEL,
+        "metrics": METRICS,
+    }
+    path.write_text(yaml.safe_dump(snapshot, sort_keys=False, allow_unicode=True), encoding="utf-8")
+
+
+def main() -> None:
+    """Write all sample run artifacts into the run directory, creating it if needed (existing files are overwritten)."""
+    run_dir = _output_dir()
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    _write_scores_csv(run_dir / "scores.csv")
+    _write_invalid_csv(run_dir / "invalid.csv")
+    _write_metadata(run_dir / "metadata.json")
+    _write_summary(run_dir / "summary.md")
+    _write_scenario_snapshot(run_dir / "scenario.snapshot.yaml")
+
+    print(f"Sample run written to: {run_dir}")
+    print("Start the console with: python webmain.py")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/start.bat b/start.bat
new file mode 100644
index 0000000..353b309
--- /dev/null
+++ b/start.bat
@@ -0,0 +1,99 @@
+@echo off
+chcp 65001 >nul
+setlocal enabledelayedexpansion
+
+echo.
+echo ============================================================
+echo   Siemens RAGAS 评估控制台  启动脚本
+echo ============================================================
+echo.
+
+:: ---- 切换到脚本所在目录（即 siemens_ragas/）-------------------
+cd /d "%~dp0"
+
+:: ---- 检查 Python ---------------------------------------------------
+python --version >nul 2>&1
+if errorlevel 1 (
+    echo [错误] 未找到 Python，请确认已安装 Python 3.12+ 并加入 PATH。
+    pause
+    exit /b 1
+)
+for /f "tokens=*" %%v in ('python --version 2^>^&1') do set PY_VER=%%v
+echo [OK] %PY_VER%
+
+:: ---- Check FastAPI / uvicorn; install via "python -m pip" so the probed interpreter gets them ----
+python -c "import fastapi, uvicorn" >nul 2>&1
+if errorlevel 1 (
+    echo [提示] 正在安装 FastAPI 和 uvicorn...
+    python -m pip install fastapi uvicorn --quiet
+    if errorlevel 1 (
+        echo [错误] 安装依赖失败，请手动运行: pip install fastapi uvicorn
+        pause
+        exit /b 1
+    )
+    echo [OK] FastAPI / uvicorn 安装完成。
+) else (
+    echo [OK] FastAPI / uvicorn 已就绪。
+)
+
+:: ---- Check the ragas version; install via "python -m pip" so the probed interpreter gets it ----
+python -c "import ragas; assert ragas.__version__ == '0.4.3', ragas.__version__" >nul 2>&1
+if errorlevel 1 (
+    echo [提示] 正在安装 ragas==0.4.3（评估引擎依赖）...
+    python -m pip install "ragas==0.4.3" --quiet
+    if errorlevel 1 (
+        echo [警告] ragas 安装失败。
+        echo         控制台仍可启动：报告看板可用，触发评估功能将显示错误。
+        echo.
+    ) else (
+        echo [OK] ragas 0.4.3 安装完成。
+    )
+) else (
+    echo [OK] ragas 0.4.3 已就绪。
+)
+
+:: ---- 检查是否有示例数据，没有则自动生成 ---------------------------
+set SAMPLE_META=outputs\kba-knowledge-base-offline-baseline\2026-06-15T08-30-00+00-00\metadata.json
+if not exist "%SAMPLE_META%" (
+    echo [提示] 未找到示例运行数据，正在生成...
+    python scripts\seed_sample_run.py
+    if errorlevel 1 (
+        echo [警告] 示例数据生成失败，看板可能为空。继续启动...
+    ) else (
+        echo [OK] 示例数据已生成。
+    )
+) else (
+    echo [OK] 已有运行数据，跳过示例生成。
+)
+
+:: ---- 检查端口是否已占用 ------------------------------------------
+set PORT=8800
+netstat -ano | findstr /r ":%PORT%[^0-9]" | findstr "LISTENING" >nul 2>&1
+if not errorlevel 1 (
+    echo [警告] 端口 %PORT% 已被占用，尝试使用 8801...
+    set PORT=8801
+    netstat -ano | findstr /r ":8801[^0-9]" | findstr "LISTENING" >nul 2>&1
+    if not errorlevel 1 (
+        echo [错误] 端口 8800 和 8801 均被占用，请手动指定端口：
+        echo         python webmain.py --port ^<端口号^>
+        pause
+        exit /b 1
+    )
+)
+
+echo.
+echo ============================================================
+echo   启动控制台：http://127.0.0.1:%PORT%
+echo   按 Ctrl+C 停止服务
+echo ============================================================
+echo.
+
+:: ---- Wait 2 seconds, then open the page in the default browser ----
+start /b cmd /c "timeout /t 2 >nul && start http://127.0.0.1:%PORT%"
+
+:: ---- 启动 uvicorn -------------------------------------------------
+python webmain.py --host 127.0.0.1 --port %PORT%
+
+echo.
+echo 服务已停止。
+pause
diff --git a/webapp/__init__.py b/webapp/__init__.py
new file mode 100644
index 0000000..68c9b41
--- /dev/null
+++ b/webapp/__init__.py
@@ -0,0 +1,5 @@
+"""Lightweight FastAPI web console layered on top of the rag_eval platform.
+
+This package is additive and non-invasive: it imports rag_eval as a library and
+reads run artifacts from disk. It never modifies the core evaluation modules.
+"""
diff --git a/webapp/api/__init__.py b/webapp/api/__init__.py
new file mode 100644
index 0000000..f471954
--- /dev/null
+++ b/webapp/api/__init__.py
@@ -0,0 +1 @@
+"""API router package for the evaluation console."""
diff --git a/webapp/api/evaluations.py b/webapp/api/evaluations.py
new file mode 100644
index 0000000..3775b47
--- /dev/null
+++ b/webapp/api/evaluations.py
@@ -0,0 +1,44 @@
+"""Routes for triggering evaluations and polling background task status."""
+
+from __future__ import annotations
+
+from fastapi import APIRouter, HTTPException
+
+from webapp.models import (
+    TaskStatus,
+    TriggerEvaluationRequest,
+    TriggerEvaluationResponse,
+)
+from webapp.services import scenario_scanner
+from webapp.services.task_manager import task_manager
+
+router = APIRouter(prefix="/api/evaluations", tags=["evaluations"])
+
+
+@router.post("", response_model=TriggerEvaluationResponse)
+def trigger_evaluation(request: TriggerEvaluationRequest) -> TriggerEvaluationResponse:
+    """Validate the scenario path and queue a background evaluation task; responds 400 if the path is rejected."""
+    resolved = scenario_scanner.resolve_scenario_path(request.scenario_path)
+    if resolved is None:
+        raise HTTPException(
+            status_code=400,
+            detail=f"无效或不允许的场景路径: {request.scenario_path}",
+        )
+    # NOTE(review): `resolved` is only None-checked; the raw request path is submitted - confirm submit() re-resolves it.
+    task_id = task_manager.submit(request.scenario_path)
+    return TriggerEvaluationResponse(task_id=task_id)
+
+
+@router.get("/{task_id}", response_model=TaskStatus)
+def get_task_status(task_id: str) -> TaskStatus:
+    """Return the current status and logs for one evaluation task."""
+    status = task_manager.get(task_id)
+    if status is None:
+        raise HTTPException(status_code=404, detail=f"未找到任务: {task_id}")
+    return status
+
+
+@router.get("", response_model=dict)
+def list_tasks() -> dict[str, list]:
+    """Return all known evaluation tasks for this server session."""
+    return {"tasks": [task.model_dump() for task in task_manager.list_tasks()]}
diff --git a/webapp/api/runs.py b/webapp/api/runs.py
new file mode 100644
index 0000000..da3765d
--- /dev/null
+++ b/webapp/api/runs.py
@@ -0,0 +1,32 @@
+"""Routes for listing evaluation runs and fetching a single run's report."""
+
+from __future__ import annotations
+
+from fastapi import APIRouter, HTTPException
+
+from webapp.models import RunDetail
+from webapp.services import report_builder, run_reader
+
+router = APIRouter(prefix="/api/runs", tags=["runs"])
+
+
+@router.get("")
+def get_runs() -> dict[str, list]:
+    """Return summaries for every discoverable evaluation run."""
+    summaries = run_reader.list_run_summaries()
+    return {"runs": [summary.model_dump() for summary in summaries]}
+
+
+@router.get("/{run_id}")
+def get_run_detail(run_id: str) -> RunDetail:
+    """Return the full summary and aggregated report for one run."""
+    run_dir = run_reader.find_run_dir(run_id)
+    if run_dir is None:
+        raise HTTPException(status_code=404, detail=f"未找到运行: {run_id}")
+
+    summary = run_reader.build_run_summary(run_dir)
+    if summary is None:
+        raise HTTPException(status_code=404, detail=f"运行元数据缺失: {run_id}")
+
+    report = report_builder.build_report(run_dir, summary.metrics)
+    return RunDetail(summary=summary, report=report)
diff --git a/webapp/api/scenarios.py b/webapp/api/scenarios.py
new file mode 100644
index 0000000..45aa31c
--- /dev/null
+++ b/webapp/api/scenarios.py
@@ -0,0 +1,16 @@
+"""Route for discovering scenario YAML files that can be evaluated."""
+
+from __future__ import annotations
+
+from fastapi import APIRouter
+
+from webapp.services import scenario_scanner
+
+router = APIRouter(prefix="/api/scenarios", tags=["scenarios"])
+
+
+@router.get("")
+def get_scenarios() -> dict[str, list]:
+    """Return every scenario file found under the scenarios/ directory."""
+    scenarios = scenario_scanner.list_scenarios()
+    return {"scenarios": [item.model_dump() for item in scenarios]}
diff --git a/webapp/models.py b/webapp/models.py
new file mode 100644
index 0000000..03dd6a9
--- /dev/null
+++ b/webapp/models.py
@@ -0,0 +1,129 @@
+"""Pydantic response models for the evaluation console HTTP API."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class RunSummary(BaseModel):
+    """Compact description of a single evaluation run for list views."""
+
+    run_id: str
+    scenario_name: str
+    mode: str = ""
+    judge_model: str = ""
+    embedding_model: str = ""
+    started_at: str = ""
+    finished_at: str = ""
+    dataset: str = ""
+    total_samples: int = 0
+    valid_samples: int = 0
+    invalid_samples: int = 0
+    metrics: list[str] = Field(default_factory=list)
+    metric_means: dict[str, float | None] = Field(default_factory=dict)
+    output_path: str = ""
+
+
+class GroupStat(BaseModel):
+    """Mean metric values for one slice of samples grouped by a metadata field."""
+
+    key: str
+    count: int
+    means: dict[str, float | None] = Field(default_factory=dict)
+
+
+class DistributionBin(BaseModel):
+    """One histogram bucket of sample counts for a single metric."""
+
+    label: str
+    lower: float
+    upper: float
+    count: int
+
+
+class SampleScore(BaseModel):
+    """Per-sample row used for the lowest-score review table."""
+
+    sample_id: str
+    question: str = ""
+    contexts: list[str] = Field(default_factory=list)
+    answer: str = ""
+    ground_truth: str = ""
+    language: str = ""
+    difficulty: str = ""
+    question_type: str = ""
+    metrics: dict[str, float | None] = Field(default_factory=dict)
+    mean_score: float | None = None
+    error: str = ""
+
+
+class ReportData(BaseModel):
+    """Aggregated report payload rendered by the report detail page."""
+
+    metrics: list[str] = Field(default_factory=list)
+    metric_means: dict[str, float | None] = Field(default_factory=dict)
+    distributions: dict[str, list[DistributionBin]] = Field(default_factory=dict)
+    groupings: dict[str, list[GroupStat]] = Field(default_factory=dict)
+    lowest_samples: list[SampleScore] = Field(default_factory=list)
+    summary_markdown: str = ""
+
+
+class RunDetail(BaseModel):
+    """Full payload for a single run: summary metadata plus the report."""
+
+    summary: RunSummary
+    report: ReportData
+
+
+class ScenarioInfo(BaseModel):
+    """One discoverable scenario YAML file that can be evaluated from the UI."""
+
+    path: str
+    scenario_name: str = ""
+    mode: str = ""
+    dataset: str = ""
+    judge_model: str = ""
+    metrics: list[str] = Field(default_factory=list)
+    error: str = ""
+
+
+class TaskStatus(BaseModel):
+    """State of a background evaluation task tracked by the task manager."""
+
+    task_id: str
+    scenario_path: str
+    status: str
+    logs: list[str] = Field(default_factory=list)
+    run_id: str | None = None
+    error: str | None = None
+    created_at: str = ""
+    finished_at: str = ""
+
+
+class TriggerEvaluationRequest(BaseModel):
+    """Request body for launching an evaluation run from the UI."""
+
+    scenario_path: str
+
+
+class TriggerEvaluationResponse(BaseModel):
+    """Response returned immediately after queuing an evaluation task."""
+
+    task_id: str
+
+
+def jsonable(value: Any) -> Any:
+    """Convert NaN/inf floats into None so the payload stays valid JSON."""
+    import math
+
+    if isinstance(value, float):
+        if math.isnan(value) or math.isinf(value):
+            return None
+        return value
+    if isinstance(value, dict):
+        return {key: jsonable(item) for key, item in value.items()}
+    if isinstance(value, list):
+        return [jsonable(item) for item in value]
+    return value
diff --git a/webapp/server.py b/webapp/server.py
new file mode 100644
index 0000000..49ea03d
--- /dev/null
+++ b/webapp/server.py
@@ -0,0 +1,49 @@
+"""FastAPI application factory for the RAGAS evaluation console.
+
+The app mounts three JSON API routers and serves the single-page static
+frontend. It imports rag_eval only lazily (inside the task manager worker), so
+the server starts even when the evaluation dependencies are not yet installed.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from fastapi import FastAPI
+from fastapi.responses import FileResponse
+from fastapi.staticfiles import StaticFiles
+
+from webapp.api import evaluations, runs, scenarios
+
+STATIC_DIR = Path(__file__).resolve().parent / "static"
+
+
+def create_app() -> FastAPI:
+    """Build the FastAPI app: API routers, /api/health, the index page and /static assets."""
+    app = FastAPI(
+        title="Siemens RAGAS 评估控制台",
+        description="RAGAS 评估子系统的可视化报告与评估触发控制台。",
+        version="0.1.0",
+    )
+
+    app.include_router(runs.router)
+    app.include_router(scenarios.router)
+    app.include_router(evaluations.router)
+
+    @app.get("/api/health", tags=["meta"])
+    def health() -> dict[str, str]:
+        """Report basic liveness so the UI can confirm the server is reachable."""
+        return {"status": "ok"}
+
+    @app.get("/", include_in_schema=False)
+    def index() -> FileResponse:
+        """Serve the single-page console entry document."""
+        return FileResponse(STATIC_DIR / "index.html")
+
+    # Static assets (served from STATIC_DIR) live under /static.
+    app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
+
+    return app
+
+
+app = create_app()
diff --git a/webapp/services/__init__.py b/webapp/services/__init__.py
new file mode 100644
index 0000000..fbdd2df
--- /dev/null
+++ b/webapp/services/__init__.py
@@ -0,0 +1 @@
+"""Service package for the evaluation console (filesystem readers and task runner)."""
diff --git a/webapp/services/report_builder.py b/webapp/services/report_builder.py
new file mode 100644
index 0000000..1320578
--- /dev/null
+++ b/webapp/services/report_builder.py
@@ -0,0 +1,188 @@
+"""Aggregate a run's per-sample scores into the report payload for the UI.
+
+All aggregation reads only the standard scores.csv produced by the reporting
+layer, plus the metric list resolved by run_reader. The output mirrors the
+report detail page: metric means, per-metric distribution histograms, grouped
+means by difficulty / question_type / language, and the lowest-scoring samples for review.
+"""
+
+from __future__ import annotations
+
+import math
+from pathlib import Path
+
+import pandas as pd
+
+from webapp.services.text_utils import parse_contexts
+from webapp.models import (
+    DistributionBin,
+    GroupStat,
+    ReportData,
+    SampleScore,
+)
+from webapp.services import run_reader
+
+
+# Number of equal-width buckets used for metric score histograms.
+DISTRIBUTION_BIN_COUNT = 5
+
+# Metadata columns that we group samples by when present in the data.
+GROUPING_FIELDS = ("difficulty", "question_type", "language")
+
+# How many lowest-scoring samples to surface for manual review.
+LOWEST_SAMPLE_COUNT = 10
+
+
+def _round_or_none(value: float | None) -> float | None:
+    """Return ``value`` rounded to 4 decimals, or None for None/NaN/infinite input."""
+    if value is None:
+        return None
+    # Only real floats are tested for NaN/inf; other numerics go straight to rounding.
+    non_finite = isinstance(value, float) and not math.isfinite(value)
+    return None if non_finite else round(float(value), 4)
+
+
+def _metric_means(frame: pd.DataFrame, metrics: list[str]) -> dict[str, float | None]:
+    """Return each metric's mean over ``frame``; missing or unscorable columns map to None."""
+    means: dict[str, float | None] = {}
+    for metric in metrics:
+        means[metric] = None
+        if metric in frame.columns:
+            # Coerce first: mean(numeric_only=True) raises on object-typed columns.
+            means[metric] = _round_or_none(pd.to_numeric(frame[metric], errors="coerce").mean())
+    return means
+
+
+def _distribution(frame: pd.DataFrame, metric: str) -> list[DistributionBin]:
+    """Count one metric's scores in equal-width bins over [0, 1]; other values are not counted."""
+    bins: list[DistributionBin] = []
+    if metric not in frame.columns:
+        return bins
+
+    series = pd.to_numeric(frame[metric], errors="coerce").dropna()
+    # Divide per edge: 3 * (1 / 5) is 0.6000000000000001, which put 0.6 in the wrong bin.
+    for index in range(DISTRIBUTION_BIN_COUNT):
+        lower = index / DISTRIBUTION_BIN_COUNT
+        upper = (index + 1) / DISTRIBUTION_BIN_COUNT
+        # Include the right edge in the final bin so 1.0 is counted.
+        if index == DISTRIBUTION_BIN_COUNT - 1:
+            mask = (series >= lower) & (series <= upper)
+        else:
+            mask = (series >= lower) & (series < upper)
+        bins.append(
+            DistributionBin(
+                label=f"{lower:.1f}–{upper:.1f}",
+                lower=round(lower, 2),
+                upper=round(upper, 2),
+                count=int(mask.sum()),
+            )
+        )
+    return bins
+
+
+def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupStat]]:
+    """Compute per-group metric means for each grouping field present in ``frame``."""
+    groupings: dict[str, list[GroupStat]] = {}
+    for field in GROUPING_FIELDS:
+        if field not in frame.columns:
+            continue
+        # Normalized labels: stripped text, with missing cells ("nan") blanked.
+        labels = frame[field].astype(str).str.strip().replace("nan", "")
+        if labels.eq("").all():
+            continue  # entirely empty field: nothing worth rendering
+
+        stats: list[GroupStat] = []
+        # Group on the normalized labels so " easy" and "easy" form one group.
+        for key, group in frame.groupby(labels):
+            if not key:
+                continue
+            means = {
+                metric: _round_or_none(pd.to_numeric(group[metric], errors="coerce").mean())
+                for metric in metrics
+                if metric in group.columns
+            }
+            stats.append(GroupStat(key=str(key), count=int(len(group)), means=means))
+        if stats:
+            stats.sort(key=lambda item: item.key)
+            groupings[field] = stats
+    return groupings
+
+
+def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
+    """Return the mean of the row's non-missing metric scores, or None if it has none."""
+    total, scored = 0.0, 0
+    for metric in metrics:
+        if metric not in row or pd.isna(row[metric]):
+            continue
+        total += float(row[metric])
+        scored += 1
+    # A row with no usable score has no mean at all.
+    return total / scored if scored else None
+
+
+def _cell_text(row: pd.Series, column: str) -> str:
+    """Return the cell as stripped text; absent columns and NaN cells yield ''."""
+    present = column in row and not pd.isna(row[column])
+    # Stringify only when there is a real value to show.
+    return str(row[column]).strip() if present else ""
+
+
+def _lowest_samples(frame: pd.DataFrame, metrics: list[str]) -> list[SampleScore]:
+    """Return up to LOWEST_SAMPLE_COUNT samples with the lowest mean metric score."""
+    if frame.empty:
+        return []
+
+    enriched: list[tuple[float, SampleScore]] = []
+    for _, row in frame.iterrows():
+        mean_score = _sample_mean(row, metrics)
+        sample = SampleScore(
+            sample_id=_cell_text(row, "sample_id") or "—",
+            question=_cell_text(row, "question"),
+            contexts=parse_contexts(row["contexts"]) if "contexts" in row else [],
+            answer=_cell_text(row, "answer"),
+            ground_truth=_cell_text(row, "ground_truth"),
+            language=_cell_text(row, "language"),
+            difficulty=_cell_text(row, "difficulty"),
+            question_type=_cell_text(row, "question_type"),
+            metrics={
+                metric: _round_or_none(float(row[metric]))
+                for metric in metrics
+                if metric in row and pd.notna(row[metric])
+            },
+            mean_score=_round_or_none(mean_score),
+            error=_cell_text(row, "error"),
+        )
+        # Unscored samples get -1.0, so the ascending sort lists them first for review.
+        sort_key = mean_score if mean_score is not None else -1.0
+        enriched.append((sort_key, sample))
+
+    enriched.sort(key=lambda item: item[0])
+    return [sample for _, sample in enriched[:LOWEST_SAMPLE_COUNT]]
+
+
+def build_report(run_dir: Path, metrics: list[str]) -> ReportData:
+    """Assemble the ReportData for one run from its scores.csv and summary.md."""
+    summary_markdown = run_reader.read_summary_markdown(run_dir)
+    frame = run_reader.read_scores_frame(run_dir)
+
+    # Without scores or metrics there is nothing to aggregate: return a
+    # skeleton whose means are all None and whose other sections are empty.
+    if frame.empty or not metrics:
+        return ReportData(
+            metrics=metrics,
+            metric_means=dict.fromkeys(metrics),
+            summary_markdown=summary_markdown,
+        )
+
+    # Histograms only for metrics that actually have a column in the scores.
+    scored_metrics = [name for name in metrics if name in frame.columns]
+    histograms = {name: _distribution(frame, name) for name in scored_metrics}
+
+    return ReportData(
+        metrics=metrics,
+        metric_means=_metric_means(frame, metrics),
+        distributions=histograms,
+        groupings=_groupings(frame, metrics),
+        lowest_samples=_lowest_samples(frame, metrics),
+        summary_markdown=summary_markdown,
+    )
diff --git a/webapp/services/run_reader.py b/webapp/services/run_reader.py
new file mode 100644
index 0000000..0d9eb27
--- /dev/null
+++ b/webapp/services/run_reader.py
@@ -0,0 +1,222 @@
+"""Read evaluation run artifacts from disk into API-friendly structures.
+
+A "run" is any directory under the configured output roots that contains a
+metadata.json file. This service stays decoupled from rag_eval internals: it
+only reads the standard artifact files (metadata.json, scores.csv, summary.md,
+scenario.snapshot.yaml) that the reporting layer writes.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+import yaml
+
+from webapp.models import RunSummary
+
+
+# Directory names that commonly hold run outputs, relative to the repo root.
+DEFAULT_OUTPUT_ROOTS = ("outputs", "runs")
+
+
+def _repo_root() -> Path:
+    """Return the directory two levels above this module's folder (the repository root)."""
+    return Path(__file__).resolve().parent.parent.parent
+
+
+def _candidate_roots(extra_roots: list[Path] | None = None) -> list[Path]:
+    """List the directories that may hold run artifacts.
+
+    The default roots under the repository come first, followed by any
+    caller-supplied extras; entries that are not existing directories are
+    dropped.
+    """
+    repo = _repo_root()
+    candidates = [repo / name for name in DEFAULT_OUTPUT_ROOTS]
+    if extra_roots:
+        candidates.extend(extra_roots)
+    return [path for path in candidates if path.is_dir()]
+
+
+def _read_json(path: Path) -> dict[str, Any]:
+    """Parse a UTF-8 JSON file; {} on read/parse errors. NOTE(review): non-object JSON is returned as-is."""
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        return {}
+
+
+def _read_metrics_from_snapshot(run_dir: Path) -> list[str]:
+    """Return the metric list from scenario.snapshot.yaml, or [] if absent or unusable."""
+    snapshot = run_dir / "scenario.snapshot.yaml"
+    if not snapshot.is_file():
+        return []
+    try:
+        payload = yaml.safe_load(snapshot.read_text(encoding="utf-8"))
+    except (OSError, ValueError, yaml.YAMLError):  # ValueError covers bad UTF-8
+        return []
+    metrics = payload.get("metrics") if isinstance(payload, dict) else None
+    if isinstance(metrics, list):
+        return [str(item) for item in metrics]
+    return []
+
+
+def discover_run_dirs(extra_roots: list[Path] | None = None) -> list[Path]:
+    """Find every evaluation run directory under the candidate output roots."""
+    run_dirs: list[Path] = []
+    seen: set[Path] = set()
+    for root in _candidate_roots(extra_roots):
+        for metadata_path in root.rglob("metadata.json"):
+            run_dir = metadata_path.parent
+            # Overlapping roots can reach a directory twice; skip before re-parsing.
+            if run_dir in seen:
+                continue
+            # Dataset builds also write metadata.json; only evaluation runs carry
+            # a scenario_name key, so that key is the sole filter applied here.
+            if "scenario_name" not in _read_json(metadata_path):
+                continue
+            seen.add(run_dir)
+            run_dirs.append(run_dir)
+    return run_dirs
+
+
+def _metric_means(run_dir: Path, metrics: list[str]) -> dict[str, float | None]:
+    """Compute per-metric mean scores from a run's scores.csv ({} if it cannot be read)."""
+    scores_path = run_dir / "scores.csv"
+    if not scores_path.is_file():
+        return {}
+    try:
+        frame = pd.read_csv(scores_path)
+    except (OSError, ValueError, pd.errors.ParserError):
+        return {}
+    means: dict[str, float | None] = {}
+    for metric in metrics:
+        if metric in frame.columns:
+            mean_value = pd.to_numeric(frame[metric], errors="coerce").mean()  # text -> NaN
+            means[metric] = None if pd.isna(mean_value) else round(float(mean_value), 4)
+        else:
+            means[metric] = None
+    return means
+
+
+def build_run_summary(run_dir: Path) -> RunSummary | None:
+    """Build the RunSummary for ``run_dir``; None when its metadata lacks scenario_name."""
+    metadata = _read_json(run_dir / "metadata.json")
+    if "scenario_name" not in metadata:
+        return None
+
+    def text(key: str) -> str:
+        """Read one metadata field as a string, defaulting to ''."""
+        return str(metadata.get(key, ""))
+
+    # Prefer the scenario snapshot's metric list; otherwise infer from scores.csv.
+    metrics = _read_metrics_from_snapshot(run_dir) or _infer_metrics_from_scores(run_dir)
+    valid = int(metadata.get("valid_samples", 0) or 0)
+    invalid = int(metadata.get("invalid_samples", 0) or 0)
+
+    return RunSummary(
+        run_id=str(metadata.get("run_id") or run_dir.name),
+        scenario_name=text("scenario_name"),
+        mode=text("mode"),
+        judge_model=text("judge_model"),
+        embedding_model=text("embedding_model"),
+        started_at=text("started_at"),
+        finished_at=text("finished_at"),
+        dataset=text("dataset"),
+        total_samples=valid + invalid,
+        valid_samples=valid,
+        invalid_samples=invalid,
+        metrics=metrics,
+        metric_means=_metric_means(run_dir, metrics),
+        output_path=run_dir.as_posix(),
+    )
+
+
+# Columns in scores.csv that are sample fields rather than metric scores.
+NON_METRIC_COLUMNS = {
+    "sample_id",
+    "question",
+    "contexts",
+    "answer",
+    "ground_truth",
+    "scenario",
+    "language",
+    "retrieval_config",
+    "error",
+    "judge_model",
+    "embedding_model",
+    "run_id",
+    "difficulty",
+    "question_type",
+    "doc_id",
+    "doc_name",
+    "section_path",
+    "page_start",
+    "page_end",
+    "source_chunk_ids",
+    "review_status",
+    "review_notes",
+}
+
+
+def _infer_metrics_from_scores(run_dir: Path) -> list[str]:
+    """Guess metric names from scores.csv: numeric columns not in NON_METRIC_COLUMNS."""
+    scores_path = run_dir / "scores.csv"
+    if not scores_path.is_file():
+        return []
+    try:
+        # Only the header and first data row are read, so dtypes reflect that row.
+        sample = pd.read_csv(scores_path, nrows=1)
+    except (OSError, ValueError, pd.errors.ParserError):
+        return []
+    # Known sample/metadata columns are excluded even when they are numeric.
+    return [
+        str(column)
+        for column in sample.columns
+        if column not in NON_METRIC_COLUMNS and pd.api.types.is_numeric_dtype(sample[column])
+    ]
+
+
+def list_run_summaries(extra_roots: list[Path] | None = None) -> list[RunSummary]:
+    """Summarize every discovered run, most recently finished first.
+
+    Runs with no finished_at are ordered by started_at; timestamps compare as strings.
+    """
+    built = (build_run_summary(run_dir) for run_dir in discover_run_dirs(extra_roots))
+    summaries = [summary for summary in built if summary is not None]
+    summaries.sort(key=lambda item: item.finished_at or item.started_at, reverse=True)
+    return summaries
+
+
+def find_run_dir(run_id: str, extra_roots: list[Path] | None = None) -> Path | None:
+    """Return the first discovered run whose metadata run_id (else folder name) equals run_id."""
+    for run_dir in discover_run_dirs(extra_roots):
+        declared = _read_json(run_dir / "metadata.json").get("run_id")
+        if str(declared or run_dir.name) == run_id:
+            return run_dir
+    return None
+
+
+def read_scores_frame(run_dir: Path) -> pd.DataFrame:
+    """Return the run's scores.csv as a dataframe; empty when absent or unparseable."""
+    scores_path = run_dir / "scores.csv"
+    if scores_path.is_file():
+        try:
+            return pd.read_csv(scores_path)
+        except (OSError, ValueError, pd.errors.ParserError):
+            pass  # unreadable scores: fall through to the empty frame
+    return pd.DataFrame()
+
+
+def read_summary_markdown(run_dir: Path) -> str:
+    """Return the text of the run's summary.md, or '' when absent or unreadable."""
+    summary_path = run_dir / "summary.md"
+    if not summary_path.is_file():
+        return ""
+    try:
+        return summary_path.read_text(encoding="utf-8")
+    except (OSError, ValueError):  # ValueError: file is not valid UTF-8
+        return ""
diff --git a/webapp/services/scenario_scanner.py b/webapp/services/scenario_scanner.py
new file mode 100644
index 0000000..910f316
--- /dev/null
+++ b/webapp/services/scenario_scanner.py
@@ -0,0 +1,84 @@
+"""Discover scenario YAML files that can be launched from the console.
+
+Scanning is intentionally tolerant: a malformed scenario file is reported with
+an error string rather than aborting the whole listing, so the UI can show the
+user which files are runnable and which need fixing.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import yaml
+
+from webapp.models import ScenarioInfo
+
+
+def _repo_root() -> Path:
+    """Return the directory two levels above this module's folder (the repository root)."""
+    return Path(__file__).resolve().parent.parent.parent
+
+
+def _scenarios_root() -> Path:
+    """Return the scenarios/ directory directly under the repository root."""
+    return _repo_root().joinpath("scenarios")
+
+
+def _summarize_scenario(path: Path) -> ScenarioInfo:
+    """Read a scenario file into a compact info object, capturing read/parse errors."""
+    relative = path.relative_to(_repo_root()).as_posix()
+    try:
+        payload = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+    except (OSError, ValueError, yaml.YAMLError) as exc:  # ValueError: undecodable bytes
+        return ScenarioInfo(path=relative, error=f"无法解析: {exc}")
+
+    if not isinstance(payload, dict):
+        return ScenarioInfo(path=relative, error="场景文件格式不是 YAML 映射。")
+
+    metrics = payload.get("metrics")
+    metric_list = [str(item) for item in metrics] if isinstance(metrics, list) else []
+
+    return ScenarioInfo(
+        path=relative,
+        scenario_name=str(payload.get("scenario_name", "")),
+        mode=str(payload.get("mode", "")),
+        dataset=str(payload.get("dataset", "")),
+        judge_model=str(payload.get("judge_model", "")),
+        metrics=metric_list,
+    )
+
+
+def list_scenarios() -> list[ScenarioInfo]:
+    """Return every scenario YAML (*.yaml and *.yml) under scenarios/, sorted by path."""
+    root = _scenarios_root()
+    if not root.is_dir():
+        return []
+
+    # Sort both extensions together; listing all *.yaml before all *.yml would
+    # break the documented path order when both extensions are present.
+    paths = sorted(
+        [*root.rglob("*.yaml"), *root.rglob("*.yml")],
+    )
+    return [_summarize_scenario(path) for path in paths]
+
+
+def resolve_scenario_path(relative_or_absolute: str) -> Path | None:
+    """Resolve a user-supplied scenario path safely within the repository.
+
+    Only existing files inside the repository's scenarios/ directory are
+    accepted; anything else, including an unresolvable path, yields None.
+    """
+    root = _repo_root()
+    candidate = Path(relative_or_absolute)
+    resolved = candidate if candidate.is_absolute() else (root / candidate)
+    try:
+        resolved = resolved.resolve()
+    except (OSError, ValueError, RuntimeError):  # e.g. embedded NUL, symlink loop
+        return None
+
+    scenarios_root = _scenarios_root().resolve()
+    if scenarios_root not in resolved.parents and resolved != scenarios_root:
+        return None
+    if not resolved.is_file():
+        return None
+    return resolved
diff --git a/webapp/services/task_manager.py b/webapp/services/task_manager.py
new file mode 100644
index 0000000..27a5725
--- /dev/null
+++ b/webapp/services/task_manager.py
@@ -0,0 +1,161 @@
+"""In-process background task manager for evaluation runs.
+
+Evaluations run in a thread pool so the FastAPI event loop is never blocked.
+The heavy rag_eval / ragas import is performed lazily inside the worker thread,
+which keeps the web server bootable even when the evaluation dependencies are
+broken — failures then surface as task errors in the UI instead of crashing
+startup. This matches the "coarse status + logs" progress decision.
+"""
+
+from __future__ import annotations
+
+import io
+import threading
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import redirect_stderr, redirect_stdout
+from datetime import datetime, timezone
+from pathlib import Path
+
+from webapp.models import TaskStatus
+
+
+def _now_iso() -> str:
+    """Return the current timezone-aware UTC time formatted with ``isoformat()``."""
+    return datetime.now(tz=timezone.utc).isoformat()
+
+
+class _LineCapture(io.TextIOBase):
+    """Text stream that forwards every completed line to a task's log."""
+
+    def __init__(self, sink: "EvaluationTask") -> None:
+        """Remember the owning task and start with no pending partial line."""
+        self._sink = sink
+        self._buffer = ""
+
+    def write(self, text: str) -> int:
+        """Log each newline-terminated line; keep the unterminated tail buffered."""
+        # Everything before the last "\n" is complete; the remainder stays pending.
+        *finished, self._buffer = (self._buffer + text).split("\n")
+        for line in finished:
+            self._sink.append_log(line)
+        return len(text)
+
+    def flush(self) -> None:
+        """Emit any buffered partial line as a final log entry."""
+        pending, self._buffer = self._buffer, ""
+        if pending:
+            self._sink.append_log(pending)
+
+
+class EvaluationTask:
+    """Mutable state for a single background evaluation run."""
+
+    def __init__(self, task_id: str, scenario_path: str) -> None:
+        """Create a task in the "queued" state with an empty log and its own lock."""
+        self.task_id = task_id
+        self.scenario_path = scenario_path
+        self.status = "queued"
+        self.logs: list[str] = []
+        self.run_id: str | None = None
+        self.error: str | None = None
+        self.created_at = _now_iso()
+        self.finished_at = ""
+        self._lock = threading.Lock()
+
+    def append_log(self, line: str) -> None:
+        """Append one log line while holding the task lock."""
+        with self._lock:
+            self.logs.append(line)
+
+    def snapshot(self) -> TaskStatus:
+        """Copy the current state, including a copy of the log list, into a TaskStatus."""
+        with self._lock:
+            return TaskStatus(
+                task_id=self.task_id,
+                scenario_path=self.scenario_path,
+                status=self.status,
+                logs=list(self.logs),
+                run_id=self.run_id,
+                error=self.error,
+                created_at=self.created_at,
+                finished_at=self.finished_at,
+            )
+
+
+class TaskManager:
+    """Owns the thread pool and registry of evaluation tasks; output capture is serialized."""
+
+    def __init__(self, max_workers: int = 2) -> None:
+        """Create a task manager backed by a small thread pool."""
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._tasks: dict[str, EvaluationTask] = {}
+        self._lock = threading.Lock()
+        # redirect_stdout/stderr swap process-wide streams; allow one capture at a time.
+        self._capture_lock = threading.Lock()
+
+    def submit(self, scenario_path: str) -> str:
+        """Register and schedule a new evaluation task, returning its id."""
+        task_id = uuid.uuid4().hex[:12]
+        task = EvaluationTask(task_id=task_id, scenario_path=scenario_path)
+        with self._lock:
+            self._tasks[task_id] = task
+        self._executor.submit(self._run, task)
+        return task_id
+
+    def get(self, task_id: str) -> TaskStatus | None:
+        """Return a snapshot of one task, or None if the id is unknown."""
+        with self._lock:
+            task = self._tasks.get(task_id)
+        return task.snapshot() if task is not None else None
+
+    def list_tasks(self) -> list[TaskStatus]:
+        """Return snapshots of all known tasks, newest first."""
+        with self._lock:
+            tasks = list(self._tasks.values())
+        snapshots = [task.snapshot() for task in tasks]
+        snapshots.sort(key=lambda item: item.created_at, reverse=True)
+        return snapshots
+
+    def _run(self, task: EvaluationTask) -> None:
+        """Execute one evaluation end to end inside a worker thread."""
+        task.status = "running"
+        task.append_log(f"[{_now_iso()}] 开始评估: {task.scenario_path}")
+
+        capture = _LineCapture(task)
+        try:
+            # Lazy import keeps the web server bootable if ragas is unavailable.
+            task.append_log("加载评估引擎 (rag_eval / ragas)...")
+            from rag_eval.execution.runner import run_scenario
+
+            absolute_path = self._to_absolute(task.scenario_path)
+            task.append_log(f"运行场景文件: {absolute_path}")
+            with self._capture_lock, redirect_stdout(capture), redirect_stderr(capture):
+                result = run_scenario(str(absolute_path))
+            capture.flush()
+
+            task.run_id = getattr(result, "run_id", None)
+            output_dir = getattr(getattr(result, "scenario", None), "output_dir", "")
+            task.append_log(f"[{_now_iso()}] 评估完成。run_id={task.run_id}")
+            if output_dir:
+                task.append_log(f"结果目录: {output_dir}")
+            task.status = "completed"
+        except Exception as exc:  # noqa: BLE001 - surface any failure to the UI
+            capture.flush()
+            error_type = type(exc).__name__
+            task.error = f"{error_type}: {exc}"
+            task.append_log(f"[{_now_iso()}] 评估失败 [{error_type}]: {exc}")
+            task.status = "failed"
+        finally:
+            task.finished_at = _now_iso()
+
+    def _to_absolute(self, scenario_path: str) -> Path:
+        """Resolve a scenario path against the repository root if relative."""
+        candidate = Path(scenario_path)
+        if candidate.is_absolute():
+            return candidate
+        return (Path(__file__).resolve().parents[2] / candidate).resolve()
+
+
+# Module-level singleton shared by the FastAPI routes.
+task_manager = TaskManager()
diff --git a/webapp/services/text_utils.py b/webapp/services/text_utils.py
new file mode 100644
index 0000000..94b563a
--- /dev/null
+++ b/webapp/services/text_utils.py
@@ -0,0 +1,47 @@
+"""Self-contained text helpers for the web layer.
+
+These intentionally avoid importing from rag_eval so the web server has no
+import-time dependency on the evaluation engine (and therefore boots even when
+ragas is unavailable). The contexts parser mirrors rag_eval.shared.utils so the
+console interprets serialized CSV context columns the same way the engine does.
+"""
+
+from __future__ import annotations
+
+import ast
+import json
+import math
+from typing import Any
+
+
+def parse_contexts(value: Any) -> list[str]:
+    """Normalize a context payload into a list of non-empty strings.
+
+    Accepts native lists, JSON/Python-literal serialized lists (as written into
+    scores.csv), and plain text; text that fails to parse is never an error.
+    """
+    if isinstance(value, list):
+        return [str(item).strip() for item in value if str(item).strip()]
+    if value is None or (isinstance(value, float) and math.isnan(value)):
+        return []
+
+    text = str(value).strip()
+    if not text:
+        return []
+
+    # Accept serialized lists from CSV exports before falling back to plain text.
+    for parser in (json.loads, ast.literal_eval):
+        try:
+            parsed = parser(text)
+        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
+            continue
+        if isinstance(parsed, list):
+            return [str(item).strip() for item in parsed if str(item).strip()]
+
+    # Preserve paragraph-style context dumps by splitting on blank lines first.
+    if "\n\n" in text:
+        chunks = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
+        if chunks:
+            return chunks
+
+    return [text]
diff --git a/webapp/static/css/app.css b/webapp/static/css/app.css
new file mode 100644
index 0000000..22f7886
--- /dev/null
+++ b/webapp/static/css/app.css
@@ -0,0 +1,267 @@
+/* Siemens RAGAS evaluation console — stylesheet.
+   Palette: Siemens brand petrol (deep teal) plus neutral greys, to match the corporate context. */
+
+:root {
+  --petrol: #009999;
+  --petrol-dark: #007a7a;
+  --ink: #0f1b2d;
+  --ink-soft: #1a2942;
+  --slate: #64748b;
+  --slate-light: #94a3b8;
+  --line: #e2e8f0;
+  --bg: #f4f6f9;
+  --surface: #ffffff;
+  --good: #16a34a;
+  --warn: #eab308;
+  --bad: #dc2626;
+  --shadow: 0 1px 3px rgba(15, 27, 45, 0.08), 0 1px 2px rgba(15, 27, 45, 0.04);
+  --radius: 10px;
+  font-synthesis: none;
+}
+
+* { box-sizing: border-box; margin: 0; padding: 0; }
+
+body {
+  font-family: "Segoe UI", "Microsoft YaHei", system-ui, -apple-system, sans-serif;
+  background: var(--bg);
+  color: var(--ink);
+  font-size: 14px;
+  line-height: 1.5;
+}
+
+.app { display: flex; min-height: 100vh; }
+
+/* ---------- Left sidebar navigation ---------- */
+.sidebar {
+  width: 208px;
+  flex-shrink: 0;
+  background: linear-gradient(180deg, var(--ink) 0%, var(--ink-soft) 100%);
+  color: #cbd5e1;
+  display: flex;
+  flex-direction: column;
+  padding: 20px 14px;
+  position: sticky;
+  top: 0;
+  height: 100vh;
+}
+
+.brand { padding: 0 8px 22px; }
+.brand-mark { font-size: 20px; font-weight: 700; letter-spacing: 1px; color: #fff; }
+.brand-sub { font-size: 12px; color: var(--petrol); margin-top: 2px; letter-spacing: 2px; }
+
+.nav { display: flex; flex-direction: column; gap: 4px; flex: 1; }
+.nav-item {
+  display: flex; align-items: center; gap: 10px;
+  background: transparent; border: none; color: #cbd5e1;
+  padding: 10px 12px; border-radius: 8px; cursor: pointer;
+  font-size: 14px; text-align: left; width: 100%;
+  transition: background 0.15s, color 0.15s;
+}
+.nav-item:hover { background: rgba(255, 255, 255, 0.06); color: #fff; }
+.nav-item.active { background: var(--petrol); color: #fff; }
+.nav-item:disabled { opacity: 0.4; cursor: not-allowed; }
+.nav-ico { width: 18px; text-align: center; color: var(--petrol); font-weight: 700; }
+.nav-item.active .nav-ico { color: #fff; }
+
+.sidebar-foot {
+  display: flex; align-items: center; gap: 8px;
+  font-size: 12px; color: var(--slate-light);
+  padding: 12px 8px 0; border-top: 1px solid rgba(255, 255, 255, 0.08);
+}
+.dot { width: 8px; height: 8px; border-radius: 50%; background: var(--slate-light); }
+.dot.ok { background: var(--good); }
+.dot.bad { background: var(--bad); }
+
+/* ---------- Main content area ---------- */
+.main { flex: 1; display: flex; flex-direction: column; min-width: 0; }
+
+.topbar {
+  display: flex; align-items: center; justify-content: space-between;
+  padding: 18px 28px; background: var(--surface); border-bottom: 1px solid var(--line);
+  position: sticky; top: 0; z-index: 5;
+}
+.topbar h1 { font-size: 18px; font-weight: 600; }
+
+.view { padding: 24px 28px; }
+
+/* ---------- Buttons ---------- */
+.btn {
+  border: 1px solid var(--line); background: var(--surface); color: var(--ink);
+  padding: 8px 16px; border-radius: 8px; cursor: pointer; font-size: 13px;
+  transition: all 0.15s; font-family: inherit;
+}
+.btn:hover { border-color: var(--petrol); color: var(--petrol); }
+.btn-primary { background: var(--petrol); border-color: var(--petrol); color: #fff; }
+.btn-primary:hover { background: var(--petrol-dark); border-color: var(--petrol-dark); color: #fff; }
+.btn-primary:disabled { background: var(--slate-light); border-color: var(--slate-light); cursor: not-allowed; }
+.btn-ghost { background: transparent; }
+
+/* ---------- Run list ---------- */
+.runs-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); gap: 16px; }
+.run-card {
+  background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
+  padding: 16px; cursor: pointer; transition: all 0.15s; box-shadow: var(--shadow);
+}
+.run-card:hover { border-color: var(--petrol); transform: translateY(-1px); }
+.run-card-head { display: flex; justify-content: space-between; align-items: flex-start; gap: 10px; }
+.run-card-title { font-size: 15px; font-weight: 600; word-break: break-all; }
+.run-card-meta { font-size: 12px; color: var(--slate); margin-top: 6px; line-height: 1.7; }
+.run-card-metrics { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 12px; }
+.metric-chip {
+  font-size: 12px; padding: 3px 8px; border-radius: 6px; background: var(--bg);
+  border: 1px solid var(--line);
+}
+.metric-chip b { font-variant-numeric: tabular-nums; }
+/* Score colours for the good/warn/bad/na classes that app.js and report.js put on chip values and grouping-table cells. */
+.metric-chip b.good, table.group-table td.good { color: var(--good); }
+.metric-chip b.warn, table.group-table td.warn { color: var(--warn); }
+.metric-chip b.bad, table.group-table td.bad { color: var(--bad); }
+.metric-chip b.na, table.group-table td.na { color: var(--slate-light); }
+
+/* ---------- Generic panels ---------- */
+.panel {
+  background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
+  padding: 20px; box-shadow: var(--shadow); margin-bottom: 18px;
+}
+.panel h2 { font-size: 16px; margin-bottom: 6px; }
+.panel-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 12px; }
+
+.muted { color: var(--slate); }
+.tiny { font-size: 11px; margin-top: 8px; }
+.tight { margin: 0 !important; }
+code {
+  background: var(--bg); border: 1px solid var(--line); border-radius: 4px;
+  padding: 1px 6px; font-size: 12px; font-family: "Cascadia Code", Consolas, monospace;
+}
+
+/* ---------- New evaluation ---------- */
+.scenario-list { display: flex; flex-direction: column; gap: 8px; margin: 16px 0; }
+.scenario-item {
+  display: flex; align-items: center; justify-content: space-between; gap: 12px;
+  border: 1px solid var(--line); border-radius: 8px; padding: 12px 14px; cursor: pointer;
+  transition: all 0.15s;
+}
+.scenario-item:hover { border-color: var(--petrol); background: #f0fbfb; }
+.scenario-item.selected { border-color: var(--petrol); background: #e6f7f7; box-shadow: inset 0 0 0 1px var(--petrol); }
+.scenario-item.invalid { opacity: 0.55; cursor: not-allowed; }
+.scenario-name { font-weight: 600; font-size: 14px; }
+.scenario-path { font-size: 12px; color: var(--slate); font-family: monospace; }
+.scenario-tags { display: flex; gap: 6px; align-items: center; flex-shrink: 0; }
+.tag {
+  font-size: 11px; padding: 2px 8px; border-radius: 999px; background: var(--bg);
+  border: 1px solid var(--line); color: var(--slate);
+}
+.tag.mode-online { background: #eff6ff; color: #1d4ed8; border-color: #bfdbfe; }
+.tag.mode-offline { background: #f0fdf4; color: #15803d; border-color: #bbf7d0; }
+
+.run-actions { display: flex; align-items: center; gap: 14px; }
+.selected-scenario { font-size: 13px; }
+
+/* ---------- Task progress ---------- */
+.task-head { display: flex; align-items: center; gap: 12px; margin-bottom: 12px; }
+.badge {
+  font-size: 12px; padding: 3px 10px; border-radius: 999px; font-weight: 600;
+  background: var(--bg); color: var(--slate); border: 1px solid var(--line);
+}
+.badge.queued { background: #f1f5f9; color: var(--slate); }
+.badge.running { background: #fef9c3; color: #854d0e; border-color: #fde68a; }
+.badge.completed { background: #dcfce7; color: #166534; border-color: #bbf7d0; }
+.badge.failed { background: #fee2e2; color: #991b1b; border-color: #fecaca; }
+.log-box {
+  background: #0b1220; color: #cbd5e1; border-radius: 8px; padding: 14px;
+  font-family: "Cascadia Code", Consolas, monospace; font-size: 12px; line-height: 1.7;
+  max-height: 320px; overflow-y: auto; white-space: pre-wrap; word-break: break-word;
+}
+.task-actions { margin-top: 12px; }
+
+/* ---------- Report detail ---------- */
+.report-meta {
+  background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
+  padding: 14px 18px; display: flex; justify-content: space-between; align-items: center;
+  flex-wrap: wrap; gap: 10px; box-shadow: var(--shadow); margin-bottom: 18px;
+}
+.report-meta-title { font-size: 15px; font-weight: 600; }
+.report-meta-info { font-size: 12px; color: var(--slate); }
+.status-pill { font-size: 12px; font-weight: 600; }
+.status-pill.completed { color: var(--good); }
+
+.section-label {
+  font-size: 12px; font-weight: 600; letter-spacing: 0.5px; color: var(--slate);
+  text-transform: uppercase; margin: 18px 0 10px;
+}
+
+.metric-cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 12px; }
+.metric-card {
+  background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
+  padding: 16px; text-align: center; box-shadow: var(--shadow);
+}
+.metric-value { font-size: 28px; font-weight: 700; font-variant-numeric: tabular-nums; }
+.metric-value.good { color: var(--good); }
+.metric-value.warn { color: var(--warn); }
+.metric-value.bad { color: var(--bad); }
+.metric-value.na { color: var(--slate-light); }
+.metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
+
+.report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
+.report-half { margin-bottom: 0; }
+
+.select {
+  border: 1px solid var(--line); border-radius: 6px; padding: 5px 10px; font-size: 12px;
+  background: var(--surface); color: var(--ink); font-family: inherit; cursor: pointer;
+}
+
+.grouping-tabs { display: flex; gap: 6px; margin-bottom: 10px; flex-wrap: wrap; }
+.grouping-tab {
+  font-size: 12px; padding: 4px 10px; border-radius: 6px; border: 1px solid var(--line);
+  background: var(--surface); cursor: pointer; color: var(--slate);
+}
+.grouping-tab.active { background: var(--petrol); color: #fff; border-color: var(--petrol); }
+
+table.group-table { width: 100%; border-collapse: collapse; font-size: 12px; }
+table.group-table th, table.group-table td { padding: 6px 8px; text-align: left; }
+table.group-table th { color: var(--slate); border-bottom: 1px solid var(--line); font-weight: 600; }
+table.group-table td { border-bottom: 1px solid #f1f5f9; font-variant-numeric: tabular-nums; }
+
+/* Lowest-scoring samples table */
+.lowest-table {
+  background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius);
+  overflow: hidden; box-shadow: var(--shadow);
+}
+.lowest-row {
+  display: grid; grid-template-columns: 90px 1fr auto; gap: 12px; align-items: center;
+  padding: 11px 16px; border-bottom: 1px solid #f1f5f9; cursor: pointer; transition: background 0.12s;
+}
+.lowest-row:hover { background: var(--bg); }
+.lowest-row .sid { font-size: 12px; color: var(--slate); font-family: monospace; }
+.lowest-row .q { font-size: 13px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
+.lowest-row .scores { display: flex; gap: 8px; }
+.score-badge {
+  font-size: 12px; padding: 2px 8px; border-radius: 6px; font-variant-numeric: tabular-nums;
+  font-weight: 600;
+}
+.score-badge.good { background: #dcfce7; color: #166534; }
+.score-badge.warn { background: #fef9c3; color: #854d0e; }
+.score-badge.bad { background: #fee2e2; color: #991b1b; }
+.score-badge.na { background: var(--bg); color: var(--slate-light); }
+
+.lowest-detail { padding: 0 16px; background: #fcfdfe; border-bottom: 1px solid #f1f5f9; }
+.lowest-detail-inner { padding: 14px 0; font-size: 13px; line-height: 1.7; }
+.detail-field { margin-bottom: 10px; }
+.detail-label { font-size: 12px; color: var(--slate); font-weight: 600; margin-bottom: 3px; }
+.detail-context { color: #475569; font-size: 12px; }
+.detail-context .ctx-item { padding: 4px 0; border-bottom: 1px dashed var(--line); }
+.detail-gt { color: var(--good); }
+
+.empty { text-align: center; padding: 60px 20px; color: var(--slate); }
+.empty p { margin-bottom: 8px; }
+
+.spinner { display: inline-block; width: 14px; height: 14px; border: 2px solid var(--line);
+  border-top-color: var(--petrol); border-radius: 50%; animation: spin 0.7s linear infinite;
+  vertical-align: middle; }
+@keyframes spin { to { transform: rotate(360deg); } }
+
+@media (max-width: 880px) {
+  .report-row { grid-template-columns: 1fr; }
+  .sidebar { width: 64px; }
+  .brand-sub, .nav-item span:not(.nav-ico), .sidebar-foot span:last-child { display: none; }
+}
diff --git a/webapp/static/index.html b/webapp/static/index.html
new file mode 100644
index 0000000..c270cbb
--- /dev/null
+++ b/webapp/static/index.html
@@ -0,0 +1,118 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>Siemens RAGAS 评估控制台</title>
+  <link rel="stylesheet" href="/static/css/app.css" />
+  <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
+</head>
+<body>
+  <div class="app">
+    <!-- Left sidebar navigation (layout A); app.js shows the section whose id is view-<data-view> -->
+    <aside class="sidebar">
+      <div class="brand">
+        <div class="brand-mark">RAGAS</div>
+        <div class="brand-sub">评估控制台</div>
+      </div>
+      <nav class="nav">
+        <button class="nav-item" data-view="runs">
+          <span class="nav-ico">▢</span><span>运行列表</span>
+        </button>
+        <button class="nav-item" data-view="new">
+          <span class="nav-ico">＋</span><span>新建评估</span>
+        </button>
+        <button class="nav-item" data-view="report" data-requires-run="1">
+          <span class="nav-ico">▤</span><span>报告详情</span>
+        </button>
+      </nav>
+      <div class="sidebar-foot">
+        <span class="dot" id="health-dot"></span>
+        <span id="health-text">连接中…</span>
+      </div>
+    </aside>
+
+    <!-- Main content area -->
+    <main class="main">
+      <header class="topbar">
+        <h1 id="view-title">运行列表</h1>
+        <button class="btn btn-ghost" id="refresh-btn">刷新</button>
+      </header>
+
+      <!-- Run list view -->
+      <section class="view" id="view-runs">
+        <div id="runs-container" class="runs-grid"></div>
+        <div class="empty" id="runs-empty" hidden>
+          <p>暂无评估运行。</p>
+          <p class="muted">从「新建评估」触发一次，或运行示例数据生成脚本：<code>python scripts/seed_sample_run.py</code></p>
+        </div>
+      </section>
+
+      <!-- New evaluation view -->
+      <section class="view" id="view-new" hidden>
+        <div class="panel">
+          <h2>选择场景并运行</h2>
+          <p class="muted">从 <code>scenarios/</code> 下选择一个场景配置，点击运行后在下方查看实时状态与日志。</p>
+          <div class="scenario-list" id="scenario-list"></div>
+          <div class="run-actions">
+            <button class="btn btn-primary" id="run-btn" disabled>运行评估</button>
+            <span class="selected-scenario muted" id="selected-scenario">未选择场景</span>
+          </div>
+        </div>
+        <div class="panel" id="task-panel" hidden>
+          <div class="task-head">
+            <h2>评估进度</h2>
+            <span class="badge" id="task-status">queued</span>
+          </div>
+          <pre class="log-box" id="task-log"></pre>
+          <div class="task-actions">
+            <button class="btn btn-primary" id="view-report-btn" hidden>查看报告</button>
+          </div>
+        </div>
+      </section>
+
+      <!-- Report detail view -->
+      <section class="view" id="view-report" hidden>
+        <div class="empty" id="report-empty">
+          <p>请先从「运行列表」选择一次运行。</p>
+        </div>
+        <div id="report-content" hidden>
+          <!-- Top metadata bar -->
+          <div class="report-meta" id="report-meta"></div>
+
+          <!-- (1) Metric mean cards -->
+          <div class="section-label">① 指标均值 OVERVIEW</div>
+          <div class="metric-cards" id="metric-cards"></div>
+
+          <!-- (2) Distribution and (3) grouping, side by side -->
+          <div class="report-row">
+            <div class="panel report-half">
+              <div class="panel-head">
+                <div class="section-label tight">② 分数分布</div>
+                <select id="dist-metric-select" class="select"></select>
+              </div>
+              <canvas id="dist-chart" height="160"></canvas>
+              <p class="muted tiny">暴露长尾失败样本</p>
+            </div>
+            <div class="panel report-half">
+              <div class="section-label tight">③ 分组均值</div>
+              <div id="grouping-tabs" class="grouping-tabs"></div>
+              <div id="grouping-table"></div>
+              <p class="muted tiny">定位薄弱类别</p>
+            </div>
+          </div>
+
+          <!-- (4) Lowest-scoring samples, reviewed one by one -->
+          <div class="section-label">④ 最低分样本（点击展开逐条复核）</div>
+          <div class="lowest-table" id="lowest-table"></div>
+        </div>
+      </section>
+    </main>
+  </div>
+
+  <script src="/static/js/api.js"></script>
+  <script src="/static/js/report.js"></script>
+  <script src="/static/js/runner.js"></script>
+  <script src="/static/js/app.js"></script>
+</body>
+</html>
diff --git a/webapp/static/js/api.js b/webapp/static/js/api.js
new file mode 100644
index 0000000..28fcca2
--- /dev/null
+++ b/webapp/static/js/api.js
@@ -0,0 +1,46 @@
+// api.js — 控制台后端 HTTP 接口的轻量封装。
+
+const API = {
+  // 通用 JSON GET，失败时抛出带状态码的错误。
+  async get(path) {
+    const resp = await fetch(path);
+    if (!resp.ok) {
+      const detail = await API._extractError(resp);
+      throw new Error(detail);
+    }
+    return resp.json();
+  },
+
+  // 通用 JSON POST。
+  async post(path, body) {
+    const resp = await fetch(path, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify(body || {}),
+    });
+    if (!resp.ok) {
+      const detail = await API._extractError(resp);
+      throw new Error(detail);
+    }
+    return resp.json();
+  },
+
+  // 从错误响应中尽量解析出 detail 文本。
+  async _extractError(resp) {
+    try {
+      const data = await resp.json();
+      return data.detail || `请求失败 (${resp.status})`;
+    } catch (_e) {
+      return `请求失败 (${resp.status})`;
+    }
+  },
+
+  health() { return API.get("/api/health"); },
+  runs() { return API.get("/api/runs"); },
+  runDetail(runId) { return API.get(`/api/runs/${encodeURIComponent(runId)}`); },
+  scenarios() { return API.get("/api/scenarios"); },
+  triggerEvaluation(scenarioPath) {
+    return API.post("/api/evaluations", { scenario_path: scenarioPath });
+  },
+  taskStatus(taskId) { return API.get(`/api/evaluations/${encodeURIComponent(taskId)}`); },
+};
diff --git a/webapp/static/js/app.js b/webapp/static/js/app.js
new file mode 100644
index 0000000..e36a66b
--- /dev/null
+++ b/webapp/static/js/app.js
@@ -0,0 +1,152 @@
+// app.js — 视图路由、运行列表渲染、健康检查。整个控制台的入口编排。
+
+const App = {
+  currentRunId: null,
+  views: ["runs", "new", "report"],
+  titles: { runs: "运行列表", new: "新建评估", report: "报告详情" },
+
+  // 初始化：绑定导航、加载首屏、启动健康检查。
+  init() {
+    document.querySelectorAll(".nav-item").forEach((btn) => {
+      btn.addEventListener("click", () => App.switchView(btn.dataset.view));
+    });
+    document.getElementById("refresh-btn").addEventListener("click", () => App.refreshCurrent());
+
+    Runner.init();
+    App.switchView("runs");
+    App.checkHealth();
+    setInterval(App.checkHealth, 15000);
+  },
+
+  // 切换主视图，并同步导航高亮与标题。
+  switchView(view) {
+    if (view === "report" && !App.currentRunId) {
+      // 没有选中的运行时，报告页显示占位。
+    }
+    App.views.forEach((name) => {
+      const el = document.getElementById(`view-${name}`);
+      if (el) el.hidden = name !== view;
+    });
+    document.querySelectorAll(".nav-item").forEach((btn) => {
+      btn.classList.toggle("active", btn.dataset.view === view);
+    });
+    document.getElementById("view-title").textContent = App.titles[view] || view;
+    App.activeView = view;
+
+    if (view === "runs") App.loadRuns();
+    if (view === "new") Runner.loadScenarios();
+    if (view === "report") Report.render(App.currentRunId);
+  },
+
+  // 刷新当前视图的数据。
+  refreshCurrent() {
+    App.switchView(App.activeView || "runs");
+  },
+
+  // 加载并渲染运行列表。
+  async loadRuns() {
+    const container = document.getElementById("runs-container");
+    const empty = document.getElementById("runs-empty");
+    container.innerHTML = '<p class="muted">加载中…</p>';
+    try {
+      const data = await API.runs();
+      const runs = data.runs || [];
+      if (runs.length === 0) {
+        container.innerHTML = "";
+        empty.hidden = false;
+        return;
+      }
+      empty.hidden = true;
+      container.innerHTML = "";
+      runs.forEach((run) => container.appendChild(App.renderRunCard(run)));
+    } catch (err) {
+      container.innerHTML = `<p class="muted">加载失败：${App.escape(err.message)}</p>`;
+    }
+  },
+
+  // 构造一张运行卡片。
+  renderRunCard(run) {
+    const card = document.createElement("div");
+    card.className = "run-card";
+    card.addEventListener("click", () => {
+      App.currentRunId = run.run_id;
+      App.enableReportNav();
+      App.switchView("report");
+    });
+
+    const chips = (run.metrics || [])
+      .map((m) => {
+        const val = run.metric_means ? run.metric_means[m] : null;
+        const cls = App.scoreClass(val);
+        const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
+        return `<span class="metric-chip">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
+      })
+      .join("");
+
+    card.innerHTML = `
+      <div class="run-card-head">
+        <div class="run-card-title">${App.escape(run.scenario_name || run.run_id)}</div>
+      </div>
+      <div class="run-card-meta">
+        <div>${App.escape(run.mode || "—")} · judge: ${App.escape(run.judge_model || "—")}</div>
+        <div>${run.valid_samples} 有效 / ${run.invalid_samples} 无效 · ${App.escape(App.shortTime(run.finished_at))}</div>
+      </div>
+      <div class="run-card-metrics">${chips}</div>
+    `;
+    return card;
+  },
+
+  // 启用报告导航项（选中运行后）。
+  enableReportNav() {
+    const btn = document.querySelector('.nav-item[data-view="report"]');
+    if (btn) btn.disabled = false;
+  },
+
+  // 根据分值返回 good/warn/bad/na 配色类。
+  scoreClass(value) {
+    if (value === null || value === undefined) return "na";
+    if (value >= 0.8) return "good";
+    if (value >= 0.65) return "warn";
+    return "bad";
+  },
+
+  // 指标名缩写，节省卡片横向空间。
+  shortMetric(name) {
+    const map = {
+      faithfulness: "faith.",
+      answer_relevancy: "ans.rel.",
+      context_recall: "ctx.recall",
+      context_precision: "ctx.prec.",
+    };
+    return map[name] || name;
+  },
+
+  // 截取时间戳到分钟，便于阅读。
+  shortTime(iso) {
+    if (!iso) return "—";
+    return String(iso).replace("T", " ").slice(0, 16);
+  },
+
+  // 简单 HTML 转义，防止注入。
+  escape(text) {
+    const div = document.createElement("div");
+    div.textContent = text == null ? "" : String(text);
+    return div.innerHTML;
+  },
+
+  // 健康检查，更新左下角状态点。
+  async checkHealth() {
+    const dot = document.getElementById("health-dot");
+    const label = document.getElementById("health-text");
+    try {
+      await API.health();
+      dot.className = "dot ok";
+      label.textContent = "服务正常";
+    } catch (_e) {
+      dot.className = "dot bad";
+      label.textContent = "服务离线";
+    }
+  },
+};
+
+document.addEventListener("DOMContentLoaded", App.init);
diff --git a/webapp/static/js/report.js b/webapp/static/js/report.js
new file mode 100644
index 0000000..882e27e
--- /dev/null
+++ b/webapp/static/js/report.js
@@ -0,0 +1,258 @@
+// report.js — 报告详情页渲染：元信息、指标卡片、分布图、分组表、低分样本复核。
+
+const Report = {
+  distChart: null,
+  currentDetail: null,
+  activeGrouping: null,
+
+  // 加载并渲染指定运行的完整报告。
+  async render(runId) {
+    const empty = document.getElementById("report-empty");
+    const content = document.getElementById("report-content");
+    if (!runId) {
+      empty.hidden = false;
+      content.hidden = true;
+      return;
+    }
+    empty.hidden = true;
+    content.hidden = false;
+    content.style.opacity = "0.4";
+
+    try {
+      const detail = await API.runDetail(runId);
+      Report.currentDetail = detail;
+      Report.renderMeta(detail.summary);
+      Report.renderMetricCards(detail.summary, detail.report);
+      Report.renderDistribution(detail.report);
+      Report.renderGroupings(detail.report);
+      Report.renderLowest(detail.report);
+      content.style.opacity = "1";
+    } catch (err) {
+      empty.hidden = false;
+      content.hidden = true;
+      empty.innerHTML = `<p>加载报告失败：${App.escape(err.message)}</p>`;
+    }
+  },
+
+  // 顶部元信息条。
+  renderMeta(summary) {
+    const el = document.getElementById("report-meta");
+    el.innerHTML = `
+      <div>
+        <div class="report-meta-title">${App.escape(summary.scenario_name || summary.run_id)}
+          <span class="status-pill completed">● completed</span></div>
+        <div class="report-meta-info">run_id: ${App.escape(summary.run_id)}</div>
+      </div>
+      <div class="report-meta-info">
+        ${App.escape(summary.mode || "—")} · judge: ${App.escape(summary.judge_model || "—")}
+        · ${summary.total_samples} 样本 (${summary.valid_samples} 有效 / ${summary.invalid_samples} 无效)
+        · ${App.escape(App.shortTime(summary.finished_at))}
+      </div>
+    `;
+  },
+
+  // ① 指标均值卡片。
+  renderMetricCards(summary, report) {
+    const wrap = document.getElementById("metric-cards");
+    wrap.innerHTML = "";
+    const metrics = report.metrics && report.metrics.length ? report.metrics : summary.metrics;
+    metrics.forEach((metric) => {
+      const value = report.metric_means ? report.metric_means[metric] : null;
+      const cls = App.scoreClass(value);
+      const text = value === null || value === undefined ? "n/a" : value.toFixed(2);
+      const card = document.createElement("div");
+      card.className = "metric-card";
+      card.innerHTML = `
+        <div class="metric-value ${cls}">${text}</div>
+        <div class="metric-name">${App.escape(metric)}</div>
+      `;
+      wrap.appendChild(card);
+    });
+  },
+
+  // ② 分数分布直方图（可切换指标）。
+  renderDistribution(report) {
+    const select = document.getElementById("dist-metric-select");
+    const distributions = report.distributions || {};
+    const metricsWithDist = Object.keys(distributions);
+
+    select.innerHTML = "";
+    if (metricsWithDist.length === 0) {
+      Report._drawDistChart([], []);
+      return;
+    }
+    metricsWithDist.forEach((metric) => {
+      const opt = document.createElement("option");
+      opt.value = metric;
+      opt.textContent = metric;
+      select.appendChild(opt);
+    });
+    select.onchange = () => Report._updateDistChart(select.value);
+    Report._updateDistChart(metricsWithDist[0]);
+  },
+
+  // 用选定指标的分箱数据刷新直方图。
+  _updateDistChart(metric) {
+    const distributions = Report.currentDetail.report.distributions || {};
+    const bins = distributions[metric] || [];
+    const labels = bins.map((b) => b.label);
+    const counts = bins.map((b) => b.count);
+    const colors = bins.map((b) => Report._binColor(b.lower));
+    Report._drawDistChart(labels, counts, colors);
+  },
+
+  // 低分箱偏红、高分箱偏绿，直观暴露长尾。
+  _binColor(lower) {
+    if (lower >= 0.8) return "#16a34a";
+    if (lower >= 0.6) return "#84cc16";
+    if (lower >= 0.4) return "#eab308";
+    if (lower >= 0.2) return "#f97316";
+    return "#dc2626";
+  },
+
+  // 实际绘制 Chart.js 柱状图。
+  _drawDistChart(labels, counts, colors) {
+    const canvas = document.getElementById("dist-chart");
+    if (Report.distChart) Report.distChart.destroy();
+    Report.distChart = new Chart(canvas, {
+      type: "bar",
+      data: {
+        labels,
+        datasets: [{ data: counts, backgroundColor: colors || "#009999", borderRadius: 4 }],
+      },
+      options: {
+        responsive: true,
+        plugins: { legend: { display: false } },
+        scales: {
+          y: { beginAtZero: true, ticks: { precision: 0 }, grid: { color: "#f1f5f9" } },
+          x: { grid: { display: false } },
+        },
+      },
+    });
+  },
+
+  // ③ 分组均值（difficulty / question_type / language）。
+  renderGroupings(report) {
+    const tabsEl = document.getElementById("grouping-tabs");
+    const tableEl = document.getElementById("grouping-table");
+    const groupings = report.groupings || {};
+    const fields = Object.keys(groupings);
+
+    tabsEl.innerHTML = "";
+    if (fields.length === 0) {
+      tableEl.innerHTML = '<p class="muted tiny">数据集未包含可分组字段（difficulty / question_type）。</p>';
+      return;
+    }
+
+    const fieldLabels = { difficulty: "难度", question_type: "类型", language: "语言" };
+    Report.activeGrouping = fields[0];
+    fields.forEach((field) => {
+      const tab = document.createElement("button");
+      tab.className = "grouping-tab" + (field === Report.activeGrouping ? " active" : "");
+      tab.textContent = fieldLabels[field] || field;
+      tab.onclick = () => {
+        Report.activeGrouping = field;
+        tabsEl.querySelectorAll(".grouping-tab").forEach((t) => t.classList.remove("active"));
+        tab.classList.add("active");
+        Report._drawGroupTable(report, field);
+      };
+      tabsEl.appendChild(tab);
+    });
+    Report._drawGroupTable(report, Report.activeGrouping);
+  },
+
+  // 渲染单个分组字段的均值表。
+  _drawGroupTable(report, field) {
+    const tableEl = document.getElementById("grouping-table");
+    const stats = report.groupings[field] || [];
+    const metrics = report.metrics || [];
+
+    let head = "<tr><th>组</th><th>样本</th>";
+    metrics.forEach((m) => (head += `<th>${App.escape(App.shortMetric(m))}</th>`));
+    head += "</tr>";
+
+    let body = "";
+    stats.forEach((stat) => {
+      body += `<tr><td>${App.escape(stat.key)}</td><td>${stat.count}</td>`;
+      metrics.forEach((m) => {
+        const v = stat.means ? stat.means[m] : null;
+        const cls = App.scoreClass(v);
+        const text = v === null || v === undefined ? "—" : v.toFixed(2);
+        body += `<td class="${cls}">${text}</td>`;
+      });
+      body += "</tr>";
+    });
+    tableEl.innerHTML = `<table class="group-table">${head}${body}</table>`;
+  },
+
+  // ④ 最低分样本逐条复核表（点击展开）。
+  renderLowest(report) {
+    const wrap = document.getElementById("lowest-table");
+    const samples = report.lowest_samples || [];
+    wrap.innerHTML = "";
+    if (samples.length === 0) {
+      wrap.innerHTML = '<div class="lowest-detail-inner" style="padding:16px">暂无可复核样本。</div>';
+      return;
+    }
+    const metrics = report.metrics || [];
+    samples.forEach((sample, idx) => {
+      const row = document.createElement("div");
+      row.className = "lowest-row";
+      const scoreBadges = metrics
+        .map((m) => {
+          const v = sample.metrics ? sample.metrics[m] : null;
+          const cls = App.scoreClass(v);
+          const text = v === null || v === undefined ? "—" : v.toFixed(2);
+          return `<span class="score-badge ${cls}" title="${App.escape(m)}">${text}</span>`;
+        })
+        .join("");
+      row.innerHTML = `
+        <span class="sid">${App.escape(sample.sample_id)}</span>
+        <span class="q">${App.escape(sample.question || "—")}</span>
+        <span class="scores">${scoreBadges}</span>
+      `;
+
+      const detail = document.createElement("div");
+      detail.className = "lowest-detail";
+      detail.hidden = true;
+      detail.innerHTML = Report._detailHtml(sample);
+
+      row.addEventListener("click", () => {
+        detail.hidden = !detail.hidden;
+      });
+      wrap.appendChild(row);
+      wrap.appendChild(detail);
+    });
+  },
+
+  // 单条样本的展开详情：question / contexts / answer / ground_truth。
+  _detailHtml(sample) {
+    const contexts = (sample.contexts || [])
+      .map((c, i) => `<div class="ctx-item">[${i + 1}] ${App.escape(c)}</div>`)
+      .join("");
+    const errorBlock = sample.error
+      ? `<div class="detail-field"><div class="detail-label">错误 error</div><div style="color:#dc2626">${App.escape(sample.error)}</div></div>`
+      : "";
+    return `
+      <div class="lowest-detail-inner">
+        <div class="detail-field">
+          <div class="detail-label">问题 question</div>
+          <div>${App.escape(sample.question || "—")}</div>
+        </div>
+        <div class="detail-field">
+          <div class="detail-label">检索片段 contexts</div>
+          <div class="detail-context">${contexts || "（空）"}</div>
+        </div>
+        <div class="detail-field">
+          <div class="detail-label">生成答案 answer</div>
+          <div>${App.escape(sample.answer || "—")}</div>
+        </div>
+        <div class="detail-field">
+          <div class="detail-label">标准答案 ground_truth</div>
+          <div class="detail-gt">${App.escape(sample.ground_truth || "—")}</div>
+        </div>
+        ${errorBlock}
+      </div>
+    `;
+  },
+};
diff --git a/webapp/static/js/runner.js b/webapp/static/js/runner.js
new file mode 100644
index 0000000..b448f03
--- /dev/null
+++ b/webapp/static/js/runner.js
@@ -0,0 +1,133 @@
+// runner.js — 新建评估视图：列出场景、触发评估、轮询任务状态与日志。
+
+const Runner = {
+  selectedScenario: null,
+  pollTimer: null,
+
+  // 绑定运行按钮。
+  init() {
+    document.getElementById("run-btn").addEventListener("click", () => Runner.trigger());
+    document.getElementById("view-report-btn").addEventListener("click", () => {
+      if (Runner.lastRunId) {
+        App.currentRunId = Runner.lastRunId;
+        App.enableReportNav();
+        App.switchView("report");
+      }
+    });
+  },
+
+  // 加载并渲染可触发的场景列表。
+  async loadScenarios() {
+    const list = document.getElementById("scenario-list");
+    list.innerHTML = '<p class="muted">加载中…</p>';
+    try {
+      const data = await API.scenarios();
+      const scenarios = data.scenarios || [];
+      if (scenarios.length === 0) {
+        list.innerHTML = '<p class="muted">未在 scenarios/ 下找到场景文件。</p>';
+        return;
+      }
+      list.innerHTML = "";
+      scenarios.forEach((sc) => list.appendChild(Runner.renderScenarioItem(sc)));
+    } catch (err) {
+      list.innerHTML = `<p class="muted">加载失败：${App.escape(err.message)}</p>`;
+    }
+  },
+
+  // 构造单个场景条目。
+  renderScenarioItem(sc) {
+    const item = document.createElement("div");
+    const invalid = !!sc.error;
+    item.className = "scenario-item" + (invalid ? " invalid" : "");
+
+    const modeTag = sc.mode
+      ? `<span class="tag mode-${App.escape(sc.mode)}">${App.escape(sc.mode)}</span>`
+      : "";
+    const metricCount = (sc.metrics || []).length;
+
+    item.innerHTML = `
+      <div>
+        <div class="scenario-name">${App.escape(sc.scenario_name || sc.path)}</div>
+        <div class="scenario-path">${App.escape(sc.path)}</div>
+        ${sc.error ? `<div class="scenario-path" style="color:#dc2626">${App.escape(sc.error)}</div>` : ""}
+      </div>
+      <div class="scenario-tags">
+        ${modeTag}
+        <span class="tag">${metricCount} 指标</span>
+      </div>
+    `;
+
+    if (!invalid) {
+      item.addEventListener("click", () => {
+        document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected"));
+        item.classList.add("selected");
+        Runner.selectedScenario = sc.path;
+        document.getElementById("selected-scenario").textContent = sc.path;
+        document.getElementById("run-btn").disabled = false;
+      });
+    }
+    return item;
+  },
+
+  // 触发评估并开始轮询。
+  async trigger() {
+    if (!Runner.selectedScenario) return;
+    const runBtn = document.getElementById("run-btn");
+    runBtn.disabled = true;
+
+    const panel = document.getElementById("task-panel");
+    const logBox = document.getElementById("task-log");
+    const statusBadge = document.getElementById("task-status");
+    const reportBtn = document.getElementById("view-report-btn");
+    panel.hidden = false;
+    reportBtn.hidden = true;
+    logBox.textContent = "";
+    Runner._setStatus(statusBadge, "queued");
+
+    try {
+      const resp = await API.triggerEvaluation(Runner.selectedScenario);
+      Runner.poll(resp.task_id);
+    } catch (err) {
+      Runner._setStatus(statusBadge, "failed");
+      logBox.textContent = `触发失败：${err.message}`;
+      runBtn.disabled = false;
+    }
+  },
+
+  // 周期性轮询任务状态，刷新日志与徽标。
+  poll(taskId) {
+    const logBox = document.getElementById("task-log");
+    const statusBadge = document.getElementById("task-status");
+    const reportBtn = document.getElementById("view-report-btn");
+    const runBtn = document.getElementById("run-btn");
+
+    if (Runner.pollTimer) clearInterval(Runner.pollTimer);
+    Runner.pollTimer = setInterval(async () => {
+      try {
+        const status = await API.taskStatus(taskId);
+        logBox.textContent = (status.logs || []).join("\n");
+        logBox.scrollTop = logBox.scrollHeight;
+        Runner._setStatus(statusBadge, status.status);
+
+        if (status.status === "completed" || status.status === "failed") {
+          clearInterval(Runner.pollTimer);
+          runBtn.disabled = false;
+          if (status.status === "completed" && status.run_id) {
+            Runner.lastRunId = status.run_id;
+            reportBtn.hidden = false;
+          }
+        }
+      } catch (err) {
+        clearInterval(Runner.pollTimer);
+        logBox.textContent += `\n轮询失败：${err.message}`;
+        runBtn.disabled = false;
+      }
+    }, 1200);
+  },
+
+  // 更新状态徽标的文本与配色类。
+  _setStatus(badge, status) {
+    badge.textContent = status;
+    badge.className = "badge " + status;
+  },
+};
diff --git a/webmain.py b/webmain.py
new file mode 100644
index 0000000..30c06d7
--- /dev/null
+++ b/webmain.py
@@ -0,0 +1,42 @@
+"""CLI entry point that launches the evaluation console web server.
+
+Run alongside the existing main.py CLI; both share the same rag_eval library
+and the same runs/ artifacts. Example:
+
+    python webmain.py
+    python webmain.py --host 0.0.0.0 --port 8800
+"""
+
+from __future__ import annotations
+
+import argparse
+
+import uvicorn
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse host/port/reload options for the console server."""
+    parser = argparse.ArgumentParser(description="Launch the RAGAS evaluation console.")
+    parser.add_argument("--host", default="127.0.0.1", help="Bind host (default 127.0.0.1).")
+    parser.add_argument("--port", type=int, default=8800, help="Bind port (default 8800).")
+    parser.add_argument(
+        "--reload",
+        action="store_true",
+        help="Enable auto-reload for local development.",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Start uvicorn with the configured application."""
+    args = parse_args()
+    uvicorn.run(
+        "webapp.server:app",
+        host=args.host,
+        port=args.port,
+        reload=args.reload,
+    )
+
+
+if __name__ == "__main__":
+    main()