diff --git a/configs/llm_profiles.json b/configs/llm_profiles.json new file mode 100644 index 0000000..53ea2c8 --- /dev/null +++ b/configs/llm_profiles.json @@ -0,0 +1,64 @@ +{ + "profiles": [ + { + "profile_id": "c8e185a64fa0", + "name": "glm-5", + "model": "glm-5", + "base_url": "http://6.86.80.4:30080/v1", + "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8", + "timeout_seconds": 600, + "created_at": "2026-06-16T09:16:22.438297+00:00", + "updated_at": "2026-06-16T09:19:03.089865+00:00" + }, + { + "profile_id": "54ddfe5aeb46", + "name": "deepseek-v4-pro", + "model": "deepseek-v4-pro", + "base_url": "http://6.86.80.4:30080/v1", + "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8", + "timeout_seconds": 600, + "created_at": "2026-06-16T09:17:08.473904+00:00", + "updated_at": "2026-06-16T09:19:07.504082+00:00" + }, + { + "profile_id": "25d035eef194", + "name": "qwen3.5-flash", + "model": "qwen3.5-flash", + "base_url": "http://6.86.80.4:30080/v1", + "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8", + "timeout_seconds": 600, + "created_at": "2026-06-16T09:18:24.265619+00:00", + "updated_at": "2026-06-16T09:18:24.265619+00:00" + }, + { + "profile_id": "ff1d0f417a5d", + "name": "deepseek-v4-flash", + "model": "deepseek-v4-flash", + "base_url": "http://6.86.80.4:30080/v1", + "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8", + "timeout_seconds": 600, + "created_at": "2026-06-16T09:18:57.091549+00:00", + "updated_at": "2026-06-16T09:18:57.091549+00:00" + }, + { + "profile_id": "5b04c49df9df", + "name": "text-embedding-v4", + "model": "text-embedding-v4", + "base_url": "http://6.86.80.4:30080/v1", + "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8", + "timeout_seconds": 600, + "created_at": "2026-06-16T09:19:49.104004+00:00", + "updated_at": "2026-06-16T09:19:49.104004+00:00" + }, + { + "profile_id": "b4f7c82859d5", + "name": "text-embedding-v3", + "model": "text-embedding-v3", + "base_url": 
"http://6.86.80.4:30080/v1", + "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8", + "timeout_seconds": 600, + "created_at": "2026-06-16T09:20:18.266540+00:00", + "updated_at": "2026-06-16T09:20:18.266540+00:00" + } + ] +} \ No newline at end of file diff --git a/docs/superpowers/plans/2026-06-16-llm-profile-manager.md b/docs/superpowers/plans/2026-06-16-llm-profile-manager.md new file mode 100644 index 0000000..aa18537 --- /dev/null +++ b/docs/superpowers/plans/2026-06-16-llm-profile-manager.md @@ -0,0 +1,1387 @@ +# LLM Profile Manager Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a visual LLM configuration management feature to the siemens_ragas web console, allowing users to create/save named LLM profiles (model, base_url, api_key, timeout) and assign them to different task roles (judge, answer, dataset-build) when running evaluations, with selections written back to the scenario YAML before execution. + +**Architecture:** Backend adds a `ProfileManager` service (memory + JSON file persistence, mirroring the existing `TaskManager` pattern) plus a `llm_profiles` FastAPI router. A new `apply` endpoint patches the selected profile fields into the target scenario YAML file. Frontend adds a new "LLM配置" sidebar view (profiles.js) and extends the existing "新建评估" view (runner.js) with a role-assignment panel that appears after selecting a scenario. 
+ +**Tech Stack:** Python 3.11+, FastAPI, Pydantic v2, PyYAML (already installed), vanilla JS (ES2022), existing CSS design tokens + +--- + +## File Map + +### New files +- `webapp/api/llm_profiles.py` — FastAPI router: CRUD + apply endpoint +- `webapp/services/profile_manager.py` — in-memory + JSON persistence service +- `webapp/static/js/profiles.js` — frontend profile management view +- `configs/llm_profiles.json` — persistent storage (auto-created on first write) +- `tests/webapp/test_profile_manager.py` — unit tests for ProfileManager +- `tests/webapp/test_llm_profiles_api.py` — integration tests for the API router + +### Modified files +- `webapp/models.py` — add LLMProfile, ProfileApplyRequest, ProfileApplyResponse Pydantic models +- `webapp/server.py` — register `llm_profiles` router +- `webapp/static/index.html` — add "LLM配置" nav item; load profiles.js +- `webapp/static/js/api.js` — add profile + apply API calls +- `webapp/static/js/runner.js` — add LLM role-assignment panel after scenario selection +- `webapp/static/css/app.css` — add styles for profile cards, role-assignment panel, modal form + +--- + +## Task 1: Pydantic Models + +**Files:** +- Modify: `webapp/models.py` + +- [ ] **Step 1: Write failing test** + +```python +# tests/webapp/test_profile_manager.py +import pytest +from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse + +def test_llm_profile_defaults(): + p = LLMProfile( + profile_id="abc", + name="Test", + model="gpt-4", + base_url="http://localhost/v1", + api_key="sk-test", + ) + assert p.timeout_seconds == 30 + assert p.created_at != "" + assert p.updated_at != "" + +def test_profile_apply_request_fields(): + req = ProfileApplyRequest( + scenario_path="scenarios/offline/sample.yaml", + judge_profile_id="id1", + answer_profile_id="id2", + dataset_profile_id=None, + ) + assert req.judge_profile_id == "id1" + assert req.dataset_profile_id is None + +def test_profile_apply_response(): + resp = 
ProfileApplyResponse(scenario_path="scenarios/offline/sample.yaml", patched_fields=["judge_model"]) + assert "judge_model" in resp.patched_fields +``` + +- [ ] **Step 2: Run test to verify it fails** + +```bash +cd /c/Projects/AIProjects/Siemens-AIPOC/siemens_ragas +python -m pytest tests/webapp/test_profile_manager.py -v 2>&1 | head -30 +``` +Expected: ImportError or AttributeError (models not defined yet) + +- [ ] **Step 3: Add models to webapp/models.py** + +Append after the existing `TriggerEvaluationResponse` class (before `jsonable`): + +```python +class LLMProfile(BaseModel): + """A named LLM connection configuration that can be reused across tasks.""" + + profile_id: str + name: str + model: str + base_url: str + api_key: str + timeout_seconds: int = 30 + created_at: str = "" + updated_at: str = "" + + +class CreateProfileRequest(BaseModel): + """Request body for creating or updating an LLM profile.""" + + name: str + model: str + base_url: str + api_key: str + timeout_seconds: int = 30 + + +class ProfileApplyRequest(BaseModel): + """Request body to patch LLM profile selections into a scenario YAML.""" + + scenario_path: str + judge_profile_id: str | None = None + answer_profile_id: str | None = None + dataset_profile_id: str | None = None + + +class ProfileApplyResponse(BaseModel): + """Response after patching a scenario YAML with profile settings.""" + + scenario_path: str + patched_fields: list[str] = Field(default_factory=list) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +python -m pytest tests/webapp/test_profile_manager.py -v +``` +Expected: 3 tests pass + +- [ ] **Step 5: Commit** + +```bash +git add webapp/models.py tests/webapp/test_profile_manager.py +git commit -m "feat: add LLMProfile pydantic models" +``` + +--- + +## Task 2: ProfileManager Service + +**Files:** +- Create: `webapp/services/profile_manager.py` +- Create: `configs/` directory (auto-created) + +- [ ] **Step 1: Write failing tests** (append to 
`tests/webapp/test_profile_manager.py`) + +```python +import json, tempfile, pathlib +from webapp.services.profile_manager import ProfileManager + +def _make_manager(tmp_path): + store = tmp_path / "profiles.json" + return ProfileManager(store_path=store) + +def test_create_profile(tmp_path): + mgr = _make_manager(tmp_path) + p = mgr.create(name="Local", model="deepseek-v4-flash", + base_url="http://localhost/v1", api_key="sk-x") + assert p.profile_id != "" + assert p.name == "Local" + +def test_list_profiles(tmp_path): + mgr = _make_manager(tmp_path) + mgr.create(name="A", model="m1", base_url="http://a/v1", api_key="k1") + mgr.create(name="B", model="m2", base_url="http://b/v1", api_key="k2") + profiles = mgr.list_all() + assert len(profiles) == 2 + +def test_get_profile(tmp_path): + mgr = _make_manager(tmp_path) + created = mgr.create(name="X", model="m", base_url="http://x/v1", api_key="k") + fetched = mgr.get(created.profile_id) + assert fetched is not None + assert fetched.name == "X" + +def test_update_profile(tmp_path): + mgr = _make_manager(tmp_path) + p = mgr.create(name="Old", model="m", base_url="http://x/v1", api_key="k") + updated = mgr.update(p.profile_id, name="New", model="m2", + base_url="http://x/v1", api_key="k", timeout_seconds=60) + assert updated is not None + assert updated.name == "New" + assert updated.model == "m2" + assert updated.timeout_seconds == 60 + +def test_delete_profile(tmp_path): + mgr = _make_manager(tmp_path) + p = mgr.create(name="Del", model="m", base_url="http://x/v1", api_key="k") + assert mgr.delete(p.profile_id) is True + assert mgr.get(p.profile_id) is None + +def test_persistence(tmp_path): + store = tmp_path / "profiles.json" + mgr1 = ProfileManager(store_path=store) + p = mgr1.create(name="Persist", model="m", base_url="http://x/v1", api_key="k") + mgr2 = ProfileManager(store_path=store) + assert mgr2.get(p.profile_id) is not None + +def test_get_nonexistent(tmp_path): + mgr = _make_manager(tmp_path) + assert 
mgr.get("does-not-exist") is None + +def test_delete_nonexistent(tmp_path): + mgr = _make_manager(tmp_path) + assert mgr.delete("does-not-exist") is False +``` + +- [ ] **Step 2: Run test to verify it fails** + +```bash +python -m pytest tests/webapp/test_profile_manager.py -v -k "test_create or test_list or test_get or test_update or test_delete or test_persistence" 2>&1 | head -20 +``` +Expected: ImportError (module not found) + +- [ ] **Step 3: Create `webapp/services/profile_manager.py`** + +```python +"""In-memory + JSON-file LLM profile manager. + +Profiles are kept in a dict keyed by profile_id and written to a JSON file +on every mutation, so they survive server restarts. The pattern mirrors +TaskManager. A threading.Lock guards the dict and the store file because the +API routes are plain `def` handlers, which FastAPI runs in a worker thread +pool, so two requests can mutate profiles at the same time. +""" + +from __future__ import annotations + +import json +import threading +import uuid +from datetime import datetime, timezone +from pathlib import Path + +from webapp.models import LLMProfile + + +_DEFAULT_STORE = Path(__file__).resolve().parents[2] / "configs" / "llm_profiles.json" + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +class ProfileManager: + """Manages LLM profiles with in-memory cache and JSON file persistence.""" + + def __init__(self, store_path: Path = _DEFAULT_STORE) -> None: + self._store_path = store_path + self._lock = threading.Lock() + self._profiles: dict[str, LLMProfile] = {} + self._load() + + # ------------------------------------------------------------------ # + # Public API + # ------------------------------------------------------------------ # + + def list_all(self) -> list[LLMProfile]: + """Return all profiles sorted by creation time.""" + with self._lock: + return sorted(self._profiles.values(), key=lambda p: p.created_at) + + def get(self, profile_id: str) -> LLMProfile | None: + """Return one profile by id, or None if not found.""" + with 
self._lock: + return self._profiles.get(profile_id) + + def create( + self, + name: str, + model: str, + base_url: str, + api_key: str, + timeout_seconds: int = 30, + ) -> LLMProfile: + """Create and persist a new profile, returning it.""" + now = _now_iso() + profile = LLMProfile( + profile_id=uuid.uuid4().hex[:12], + name=name, + model=model, + base_url=base_url, + api_key=api_key, + timeout_seconds=timeout_seconds, + created_at=now, + updated_at=now, + ) + with self._lock: + self._profiles[profile.profile_id] = profile + self._persist() + return profile + + def update( + self, + profile_id: str, + name: str, + model: str, + base_url: str, + api_key: str, + timeout_seconds: int = 30, + ) -> LLMProfile | None: + """Update an existing profile in-place; returns None if not found.""" + with self._lock: + existing = self._profiles.get(profile_id) + if existing is None: + return None + updated = existing.model_copy(update={ + "name": name, + "model": model, + "base_url": base_url, + "api_key": api_key, + "timeout_seconds": timeout_seconds, + "updated_at": _now_iso(), + }) + self._profiles[profile_id] = updated + self._persist() + return updated + + def delete(self, profile_id: str) -> bool: + """Remove a profile; returns True if deleted, False if not found.""" + with self._lock: + if profile_id not in self._profiles: + return False + del self._profiles[profile_id] + self._persist() + return True + + # ------------------------------------------------------------------ # + # Persistence helpers + # ------------------------------------------------------------------ # + + def _load(self) -> None: + """Load profiles from the JSON store file, ignoring missing/corrupt files.""" + if not self._store_path.exists(): + return + try: + data = json.loads(self._store_path.read_text(encoding="utf-8")) + for raw in data.get("profiles", []): + p = LLMProfile.model_validate(raw) + self._profiles[p.profile_id] = p + except Exception: # noqa: BLE001 + pass # Corrupt store — start fresh + 
+ def _persist(self) -> None: + """Write current profiles to the JSON store file (must be called under lock).""" + self._store_path.parent.mkdir(parents=True, exist_ok=True) + payload = {"profiles": [p.model_dump() for p in self._profiles.values()]} + self._store_path.write_text( + json.dumps(payload, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + +# Module-level singleton shared by FastAPI routes. +profile_manager = ProfileManager() +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +python -m pytest tests/webapp/test_profile_manager.py -v +``` +Expected: All 11 tests pass + +- [ ] **Step 5: Commit** + +```bash +git add webapp/services/profile_manager.py tests/webapp/test_profile_manager.py +git commit -m "feat: add ProfileManager service with JSON persistence" +``` + +--- + +## Task 3: LLM Profiles API Router + +**Files:** +- Create: `webapp/api/llm_profiles.py` +- Create: `tests/webapp/test_llm_profiles_api.py` +- Modify: `webapp/server.py` + +- [ ] **Step 1: Write failing tests** + +```python +# tests/webapp/test_llm_profiles_api.py +"""Integration tests for /api/llm-profiles endpoints.""" +import json, pathlib, tempfile +import pytest +from fastapi.testclient import TestClient + + +@pytest.fixture() +def client(tmp_path, monkeypatch): + """TestClient with a fresh ProfileManager backed by a temp file.""" + store = tmp_path / "profiles.json" + # Patch the singleton before importing server + import webapp.services.profile_manager as pm_mod + from webapp.services.profile_manager import ProfileManager + fresh_mgr = ProfileManager(store_path=store) + monkeypatch.setattr(pm_mod, "profile_manager", fresh_mgr) + # Also patch inside the api module if already imported + import webapp.api.llm_profiles as api_mod + monkeypatch.setattr(api_mod, "profile_manager", fresh_mgr) + + from webapp.server import create_app + return TestClient(create_app()) + + +def test_list_empty(client): + resp = client.get("/api/llm-profiles") + assert resp.status_code 
== 200 + assert resp.json()["profiles"] == [] + + +def test_create_and_list(client): + body = {"name": "Test", "model": "m1", "base_url": "http://x/v1", "api_key": "k"} + resp = client.post("/api/llm-profiles", json=body) + assert resp.status_code == 201 + data = resp.json() + assert data["name"] == "Test" + assert data["profile_id"] != "" + + resp2 = client.get("/api/llm-profiles") + assert len(resp2.json()["profiles"]) == 1 + + +def test_update_profile(client): + body = {"name": "Old", "model": "m1", "base_url": "http://x/v1", "api_key": "k"} + pid = client.post("/api/llm-profiles", json=body).json()["profile_id"] + + upd = {"name": "New", "model": "m2", "base_url": "http://x/v1", "api_key": "k", "timeout_seconds": 60} + resp = client.put(f"/api/llm-profiles/{pid}", json=upd) + assert resp.status_code == 200 + assert resp.json()["name"] == "New" + assert resp.json()["timeout_seconds"] == 60 + + +def test_delete_profile(client): + body = {"name": "Del", "model": "m", "base_url": "http://x/v1", "api_key": "k"} + pid = client.post("/api/llm-profiles", json=body).json()["profile_id"] + resp = client.delete(f"/api/llm-profiles/{pid}") + assert resp.status_code == 200 + assert resp.json()["deleted"] is True + assert len(client.get("/api/llm-profiles").json()["profiles"]) == 0 + + +def test_update_nonexistent(client): + resp = client.put("/api/llm-profiles/nope", + json={"name": "X", "model": "m", "base_url": "http://x/v1", "api_key": "k"}) + assert resp.status_code == 404 + + +def test_delete_nonexistent(client): + resp = client.delete("/api/llm-profiles/nope") + assert resp.status_code == 404 +``` + +- [ ] **Step 2: Run test to verify it fails** + +```bash +python -m pytest tests/webapp/test_llm_profiles_api.py -v 2>&1 | head -20 +``` +Expected: ImportError (router not yet registered) + +- [ ] **Step 3: Create `webapp/api/llm_profiles.py`** + +```python +"""CRUD routes for LLM profiles plus the scenario-patching apply endpoint.""" + +from __future__ import annotations 
+ +from fastapi import APIRouter, HTTPException +from fastapi.responses import JSONResponse + +from webapp.models import ( + CreateProfileRequest, + LLMProfile, + ProfileApplyRequest, + ProfileApplyResponse, +) +from webapp.services.profile_manager import profile_manager +from webapp.services.yaml_patcher import apply_profiles_to_scenario + +router = APIRouter(prefix="/api/llm-profiles", tags=["llm-profiles"]) + + +@router.get("", response_model=dict) +def list_profiles() -> dict: + """Return all saved LLM profiles.""" + return {"profiles": [p.model_dump() for p in profile_manager.list_all()]} + + +@router.post("", status_code=201, response_model=LLMProfile) +def create_profile(request: CreateProfileRequest) -> LLMProfile: + """Create a new LLM profile.""" + return profile_manager.create( + name=request.name, + model=request.model, + base_url=request.base_url, + api_key=request.api_key, + timeout_seconds=request.timeout_seconds, + ) + + +@router.put("/{profile_id}", response_model=LLMProfile) +def update_profile(profile_id: str, request: CreateProfileRequest) -> LLMProfile: + """Update an existing LLM profile by id.""" + updated = profile_manager.update( + profile_id=profile_id, + name=request.name, + model=request.model, + base_url=request.base_url, + api_key=request.api_key, + timeout_seconds=request.timeout_seconds, + ) + if updated is None: + raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}") + return updated + + +@router.delete("/{profile_id}", response_model=dict) +def delete_profile(profile_id: str) -> dict: + """Delete an LLM profile by id.""" + deleted = profile_manager.delete(profile_id) + if not deleted: + raise HTTPException(status_code=404, detail=f"Profile not found: {profile_id}") + return {"deleted": True} + + +@router.post("/apply", response_model=ProfileApplyResponse) +def apply_profiles(request: ProfileApplyRequest) -> ProfileApplyResponse: + """Patch selected LLM profiles into the target scenario YAML file.""" + 
profiles: dict[str, LLMProfile | None] = { + "judge": profile_manager.get(request.judge_profile_id) if request.judge_profile_id else None, + "answer": profile_manager.get(request.answer_profile_id) if request.answer_profile_id else None, + "dataset": profile_manager.get(request.dataset_profile_id) if request.dataset_profile_id else None, + } + + missing = [role for role, pid in [ + ("judge", request.judge_profile_id), + ("answer", request.answer_profile_id), + ("dataset", request.dataset_profile_id), + ] if pid and profiles[role] is None] + + if missing: + raise HTTPException( + status_code=400, + detail=f"Profile(s) not found for roles: {', '.join(missing)}", + ) + + patched = apply_profiles_to_scenario( + scenario_path=request.scenario_path, + judge_profile=profiles["judge"], + answer_profile=profiles["answer"], + dataset_profile=profiles["dataset"], + ) + return ProfileApplyResponse( + scenario_path=request.scenario_path, + patched_fields=patched, + ) +``` + +- [ ] **Step 4: Register router in `webapp/server.py`** + +Replace the import line: +```python +from webapp.api import evaluations, runs, scenarios +``` +with: +```python +from webapp.api import evaluations, llm_profiles, runs, scenarios +``` + +And add inside `create_app()` after the existing `app.include_router` calls: +```python + app.include_router(llm_profiles.router) +``` + +- [ ] **Step 5: Run tests to verify they pass** + +```bash +python -m pytest tests/webapp/test_llm_profiles_api.py -v +``` +Expected: 6 tests pass (apply test comes in Task 4). Note: `webapp/api/llm_profiles.py` imports `webapp.services.yaml_patcher` at module level, so this run fails with ImportError until that module is created in Task 4 Step 3; re-run this step afterwards. + +- [ ] **Step 6: Commit** + +```bash +git add webapp/api/llm_profiles.py webapp/server.py tests/webapp/test_llm_profiles_api.py +git commit -m "feat: add /api/llm-profiles CRUD router" +``` + +--- + +## Task 4: YAML Patcher Service + +**Files:** +- Create: `webapp/services/yaml_patcher.py` +- Modify: `tests/webapp/test_llm_profiles_api.py` (add apply tests) + +This service reads a scenario YAML, patches the relevant LLM fields, and writes it back. 
+ +**YAML field mapping:** +- `judge_profile` → patches `judge_model` (string), `embedding_model` stays unchanged (same profile reused) +- `answer_profile` → patches `app_adapter.static_kwargs.model` (only if `app_adapter` exists and type=python) +- `dataset_profile` → patches `generation.model` (for dataset build configs) + +- [ ] **Step 1: Write failing tests** (append to `tests/webapp/test_llm_profiles_api.py`) + +```python +import yaml as yaml_lib + +def test_apply_judge_profile(client, tmp_path): + """Applying a judge profile patches judge_model in the YAML.""" + # Create a profile + body = {"name": "Judge", "model": "deepseek-v4-flash", "base_url": "http://x/v1", "api_key": "k"} + pid = client.post("/api/llm-profiles", json=body).json()["profile_id"] + + # Create a minimal scenario YAML + scenario_file = tmp_path / "test-scenario.yaml" + scenario_file.write_text( + "scenario_name: test\nmode: offline\njudge_model: old-model\nembedding_model: emb\n" + "dataset: data.csv\nmetrics: [faithfulness]\noutput_dir: outputs/test\n", + encoding="utf-8", + ) + + # Monkeypatch the repo root resolution so patcher resolves our temp file + import webapp.services.yaml_patcher as patcher_mod + import pathlib + orig_resolve = patcher_mod._resolve_scenario_path + + def fake_resolve(path_str): + return scenario_file + + import monkeypatch # this won't work — use the client fixture's monkeypatch + # NOTE: This test uses the patcher directly instead + from webapp.services.yaml_patcher import apply_profiles_to_scenario + from webapp.models import LLMProfile + judge_p = LLMProfile(profile_id="x", name="J", model="new-model", + base_url="http://x/v1", api_key="k", created_at="", updated_at="") + patched = apply_profiles_to_scenario( + scenario_path=str(scenario_file), + judge_profile=judge_p, + answer_profile=None, + dataset_profile=None, + _resolve_absolute=True, + ) + assert "judge_model" in patched + data = yaml_lib.safe_load(scenario_file.read_text()) + assert data["judge_model"] 
== "new-model" + + +def test_apply_answer_profile(tmp_path): + """Applying an answer profile patches app_adapter.static_kwargs.model.""" + from webapp.services.yaml_patcher import apply_profiles_to_scenario + from webapp.models import LLMProfile + + scenario_file = tmp_path / "online.yaml" + scenario_file.write_text( + "scenario_name: online\nmode: online\njudge_model: j\nembedding_model: emb\n" + "dataset: d.csv\nmetrics: [faithfulness]\noutput_dir: out\n" + "app_adapter:\n type: python\n callable: apps.foo:run\n" + " static_kwargs:\n model: old\n source_chunks_path: chunks.jsonl\n", + encoding="utf-8", + ) + answer_p = LLMProfile(profile_id="y", name="A", model="new-answer-model", + base_url="http://x/v1", api_key="k", created_at="", updated_at="") + patched = apply_profiles_to_scenario( + scenario_path=str(scenario_file), + judge_profile=None, + answer_profile=answer_p, + dataset_profile=None, + _resolve_absolute=True, + ) + assert "app_adapter.static_kwargs.model" in patched + data = yaml_lib.safe_load(scenario_file.read_text()) + assert data["app_adapter"]["static_kwargs"]["model"] == "new-answer-model" +``` + +- [ ] **Step 2: Run test to verify it fails** + +```bash +python -m pytest tests/webapp/test_llm_profiles_api.py::test_apply_judge_profile tests/webapp/test_llm_profiles_api.py::test_apply_answer_profile -v 2>&1 | head -20 +``` +Expected: ImportError (yaml_patcher not found) + +- [ ] **Step 3: Create `webapp/services/yaml_patcher.py`** + +```python +"""Patch LLM profile settings into scenario YAML files in-place. + +Only the fields that correspond to a provided (non-None) profile are touched. +The file is round-tripped through PyYAML (safe_load, then dump with sort_keys=False), +so key order is kept but YAML comments and the original formatting are lost on write. 
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import yaml + +from webapp.models import LLMProfile + + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[2] + + +def _resolve_scenario_path(path_str: str) -> Path: + """Resolve a scenario path; absolute paths are used as-is.""" + candidate = Path(path_str) + if candidate.is_absolute(): + return candidate + return (_repo_root() / candidate).resolve() + + +def apply_profiles_to_scenario( + scenario_path: str, + judge_profile: LLMProfile | None, + answer_profile: LLMProfile | None, + dataset_profile: LLMProfile | None, + _resolve_absolute: bool = False, +) -> list[str]: + """Patch the YAML file at *scenario_path* with the supplied profiles. + + Returns a list of dotted field names that were actually patched. + """ + if _resolve_absolute: + resolved = Path(scenario_path) + else: + resolved = _resolve_scenario_path(scenario_path) + + if not resolved.exists(): + raise FileNotFoundError(f"Scenario file not found: {resolved}") + + data: dict[str, Any] = yaml.safe_load(resolved.read_text(encoding="utf-8")) or {} + patched: list[str] = [] + + if judge_profile is not None: + data["judge_model"] = judge_profile.model + patched.append("judge_model") + + if answer_profile is not None: + adapter = data.get("app_adapter") + if isinstance(adapter, dict): + static_kwargs = adapter.setdefault("static_kwargs", {}) + static_kwargs["model"] = answer_profile.model + patched.append("app_adapter.static_kwargs.model") + + if dataset_profile is not None: + generation = data.get("generation") + if isinstance(generation, dict): + generation["model"] = dataset_profile.model + patched.append("generation.model") + + resolved.write_text( + yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False), + encoding="utf-8", + ) + return patched +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +python -m pytest tests/webapp/test_llm_profiles_api.py -v 
+``` +Expected: All tests pass + +- [ ] **Step 5: Commit** + +```bash +git add webapp/services/yaml_patcher.py tests/webapp/test_llm_profiles_api.py +git commit -m "feat: add yaml_patcher service to apply LLM profiles to scenario YAML" +``` + +--- + +## Task 5: Frontend — profiles.js (LLM配置管理页) + +**Files:** +- Create: `webapp/static/js/profiles.js` +- Modify: `webapp/static/js/api.js` +- Modify: `webapp/static/index.html` +- Modify: `webapp/static/css/app.css` + +This task adds the "LLM配置" sidebar page: list all profiles as cards, create new profile via inline form, edit/delete existing profiles. + +- [ ] **Step 1: Add profile API calls to `api.js`** + +Append to the `API` object (before the closing `};`): + +```js + profiles() { return API.get("/api/llm-profiles"); }, + createProfile(body) { return API.post("/api/llm-profiles", body); }, + updateProfile(id, body) { + return fetch(`/api/llm-profiles/${encodeURIComponent(id)}`, { + method: "PUT", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }).then(async r => { + if (!r.ok) { const d = await API._extractError(r); throw new Error(d); } + return r.json(); + }); + }, + deleteProfile(id) { + return fetch(`/api/llm-profiles/${encodeURIComponent(id)}`, { method: "DELETE" }) + .then(async r => { + if (!r.ok) { const d = await API._extractError(r); throw new Error(d); } + return r.json(); + }); + }, + applyProfiles(body) { return API.post("/api/llm-profiles/apply", body); }, +``` + +- [ ] **Step 2: Add nav item to `index.html`** + +In the ``): + +```html + +``` + +Add `"profiles"` to the views and add a new section at the bottom of `
` (before `
`): + +```html + + +``` + +- [ ] **Step 3: Create `webapp/static/js/profiles.js`** + +```js +// profiles.js — LLM 配置管理页面逻辑 + +const Profiles = { + _data: [], + + // 初始化:绑定按钮事件 + init() { + document.getElementById("add-profile-btn").addEventListener("click", () => Profiles.showForm()); + document.getElementById("save-profile-btn").addEventListener("click", () => Profiles.save()); + document.getElementById("cancel-profile-btn").addEventListener("click", () => Profiles.hideForm()); + }, + + // 加载并渲染 Profile 列表 + async load() { + const grid = document.getElementById("profile-cards"); + const empty = document.getElementById("profiles-empty"); + grid.innerHTML = '

加载中…

'; + try { + const data = await API.profiles(); + Profiles._data = data.profiles || []; + grid.innerHTML = ""; + if (Profiles._data.length === 0) { + empty.hidden = false; + } else { + empty.hidden = true; + Profiles._data.forEach(p => grid.appendChild(Profiles.renderCard(p))); + } + } catch (err) { + grid.innerHTML = `

加载失败:${App.escape(err.message)}

`; + } + }, + + // 渲染单个 Profile 卡片 + renderCard(p) { + const card = document.createElement("div"); + card.className = "profile-card"; + card.dataset.id = p.profile_id; + card.innerHTML = ` +
+
${App.escape(p.name)}
+
+ + +
+
+
模型 ${App.escape(p.model)}
+
Base URL ${App.escape(p.base_url)}
+
超时 ${p.timeout_seconds}s
+ `; + card.querySelector("[data-action=edit]").addEventListener("click", () => Profiles.showForm(p)); + card.querySelector("[data-action=delete]").addEventListener("click", () => Profiles.remove(p.profile_id, p.name)); + return card; + }, + + // 显示新建或编辑表单 + showForm(profile = null) { + const panel = document.getElementById("profile-form-panel"); + const title = document.getElementById("profile-form-title"); + panel.hidden = false; + title.textContent = profile ? "编辑 LLM 配置" : "新建 LLM 配置"; + document.getElementById("edit-profile-id").value = profile ? profile.profile_id : ""; + document.getElementById("pf-name").value = profile ? profile.name : ""; + document.getElementById("pf-model").value = profile ? profile.model : ""; + document.getElementById("pf-base-url").value = profile ? profile.base_url : ""; + document.getElementById("pf-api-key").value = profile ? profile.api_key : ""; + document.getElementById("pf-timeout").value = profile ? profile.timeout_seconds : 30; + document.getElementById("profile-form-error").textContent = ""; + panel.scrollIntoView({ behavior: "smooth", block: "start" }); + }, + + hideForm() { + document.getElementById("profile-form-panel").hidden = true; + }, + + // 保存(新建 or 更新) + async save() { + const id = document.getElementById("edit-profile-id").value; + const body = { + name: document.getElementById("pf-name").value.trim(), + model: document.getElementById("pf-model").value.trim(), + base_url: document.getElementById("pf-base-url").value.trim(), + api_key: document.getElementById("pf-api-key").value.trim(), + timeout_seconds: parseInt(document.getElementById("pf-timeout").value, 10) || 30, + }; + const errEl = document.getElementById("profile-form-error"); + if (!body.name || !body.model || !body.base_url || !body.api_key) { + errEl.textContent = "请填写所有必填字段(名称、模型、Base URL、API Key)"; + return; + } + try { + if (id) { + await API.updateProfile(id, body); + } else { + await API.createProfile(body); + } + Profiles.hideForm(); + await 
Profiles.load(); + } catch (err) { + errEl.textContent = `保存失败:${err.message}`; + } + }, + + // 删除 Profile + async remove(profileId, name) { + if (!confirm(`确认删除配置「${name}」?`)) return; + try { + await API.deleteProfile(profileId); + await Profiles.load(); + } catch (err) { + alert(`删除失败:${err.message}`); + } + }, + + // 获取当前已加载的 profiles(供 runner.js 使用) + getAll() { + return Profiles._data; + }, +}; +``` + +- [ ] **Step 4: Add CSS for profiles page to `app.css`** + +Append to `webapp/static/css/app.css`: + +```css +/* ---------- LLM 配置管理页 ---------- */ +.profile-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(300px, 1fr)); gap: 16px; } +.profile-card { + background: var(--surface); border: 1px solid var(--line); border-radius: var(--radius); + padding: 16px; box-shadow: var(--shadow); +} +.profile-card-head { display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px; } +.profile-card-name { font-size: 15px; font-weight: 600; } +.profile-card-actions { display: flex; gap: 6px; } +.profile-card-field { font-size: 12px; color: var(--slate); margin-top: 4px; } +.field-label { font-weight: 600; color: var(--ink); } + +/* Form */ +.profile-form { display: flex; flex-direction: column; gap: 12px; margin-top: 14px; max-width: 560px; } +.form-row { display: flex; flex-direction: column; gap: 4px; } +.form-label { font-size: 13px; font-weight: 600; } +.req { color: var(--bad); } +.form-input { + border: 1px solid var(--line); border-radius: 6px; padding: 8px 10px; + font-size: 13px; font-family: inherit; width: 100%; +} +.form-input:focus { outline: none; border-color: var(--petrol); } +.form-input-sm { max-width: 120px; } +.form-actions { display: flex; gap: 10px; align-items: center; margin-top: 4px; } +.form-error { font-size: 12px; color: var(--bad); } +.btn-sm { padding: 4px 10px; font-size: 12px; } +.btn-danger { color: var(--bad); border-color: var(--bad); } +.btn-danger:hover { background: #fee2e2; } +``` + +- [ ] 
**Step 5: Update `index.html` to load `profiles.js`** + +Add before the closing `</body>` tag: +```html +<script src="js/profiles.js"></script> +``` +(use the same `src` path prefix as the existing script tags, and place it before the `runner.js` and `app.js` script tags, since both reference `Profiles`) + +- [ ] **Step 6: Update `app.js` to handle the new view** + +In `App.views`, add `"profiles"`: +```js +views: ["runs", "new", "report", "profiles"], +``` + +In `App.titles`, add: +```js +profiles: "LLM 配置", +``` + +In `App.switchView`, add after `if (view === "report") Report.render(App.currentRunId);`: +```js + if (view === "profiles") { Profiles.load(); } +``` + +Also call `Profiles.init()` inside `App.init()`: +```js + Profiles.init(); +``` + +- [ ] **Step 7: Smoke test the server starts** + +```bash +cd /c/Projects/AIProjects/Siemens-AIPOC/siemens_ragas +python -c "from webapp.server import create_app; app = create_app(); print('OK')" +``` +Expected: `OK` + +- [ ] **Step 8: Commit** + +```bash +git add webapp/static/js/profiles.js webapp/static/js/api.js webapp/static/js/app.js webapp/static/index.html webapp/static/css/app.css +git commit -m "feat: add LLM配置 management page (profiles view)" +``` + +--- + +## Task 6: Frontend — LLM Role-Assignment Panel in runner.js + +**Files:** +- Modify: `webapp/static/js/runner.js` +- Modify: `webapp/static/css/app.css` +- Modify: `webapp/static/index.html` + +After the user selects a scenario, show a collapsible LLM assignment panel with dropdowns for Judge/Answer/Dataset roles. On "运行评估", first call `applyProfiles`, then trigger evaluation. + +- [ ] **Step 1: Add HTML for role-assignment panel to `index.html`** + +Inside `
`, after the `.run-actions` div and before the `task-panel`: + +```html + + +``` + +- [ ] **Step 2: Add CSS for role-assignment panel to `app.css`** + +```css +/* ---------- LLM 角色配置面板 ---------- */ +.llm-assignment-panel { border-left: 3px solid var(--petrol); } +.llm-role-rows { display: flex; flex-direction: column; gap: 10px; } +.llm-role-row { display: flex; align-items: center; gap: 14px; } +.llm-role-label { font-size: 13px; font-weight: 600; min-width: 180px; color: var(--ink); } +.llm-role-select { min-width: 240px; } +``` + +- [ ] **Step 3: Extend `runner.js` — add profile loading and apply logic** + +Replace the entire contents of `runner.js` with: + +```js +// runner.js — 新建评估视图:列出场景、LLM角色配置、触发评估、轮询任务状态与日志。 + +const Runner = { + selectedScenario: null, + pollTimer: null, + lastRunId: null, + + // 绑定运行按钮。 + init() { + document.getElementById("run-btn").addEventListener("click", () => Runner.trigger()); + document.getElementById("view-report-btn").addEventListener("click", () => { + if (Runner.lastRunId) { + App.currentRunId = Runner.lastRunId; + App.enableReportNav(); + App.switchView("report"); + } + }); + }, + + // 加载并渲染可触发的场景列表。 + async loadScenarios() { + const list = document.getElementById("scenario-list"); + list.innerHTML = '

加载中…

'; + try { + const data = await API.scenarios(); + const scenarios = data.scenarios || []; + if (scenarios.length === 0) { + list.innerHTML = '

未在 scenarios/ 下找到场景文件。

'; + return; + } + list.innerHTML = ""; + scenarios.forEach((sc) => list.appendChild(Runner.renderScenarioItem(sc))); + } catch (err) { + list.innerHTML = `

加载失败:${App.escape(err.message)}

`; + } + // 同时加载 profiles 供角色选择 + Runner._populateProfileSelects(); + }, + + // 填充三个角色下拉框 + async _populateProfileSelects() { + const profiles = Profiles.getAll().length > 0 + ? Profiles.getAll() + : (await API.profiles().catch(() => ({ profiles: [] }))).profiles; + + ["role-judge", "role-answer", "role-dataset"].forEach(id => { + const sel = document.getElementById(id); + // 保留第一个 placeholder option + sel.innerHTML = ''; + profiles.forEach(p => { + const opt = document.createElement("option"); + opt.value = p.profile_id; + opt.textContent = `${p.name} (${p.model})`; + sel.appendChild(opt); + }); + }); + }, + + // 构造单个场景条目。 + renderScenarioItem(sc) { + const item = document.createElement("div"); + const invalid = !!sc.error; + item.className = "scenario-item" + (invalid ? " invalid" : ""); + + const modeTag = sc.mode + ? `${App.escape(sc.mode)}` + : ""; + const metricCount = (sc.metrics || []).length; + + item.innerHTML = ` +
+
${App.escape(sc.scenario_name || sc.path)}
+
${App.escape(sc.path)}
+ ${sc.error ? `
${App.escape(sc.error)}
` : ""} +
+
+ ${modeTag} + ${metricCount} 指标 +
+ `; + + if (!invalid) { + item.addEventListener("click", () => { + document.querySelectorAll(".scenario-item").forEach((el) => el.classList.remove("selected")); + item.classList.add("selected"); + Runner.selectedScenario = sc.path; + document.getElementById("selected-scenario").textContent = sc.path; + document.getElementById("run-btn").disabled = false; + // 显示 LLM 角色面板 + document.getElementById("llm-assignment-panel").hidden = false; + }); + } + return item; + }, + + // 触发评估:先 apply profiles(若选了),再触发任务。 + async trigger() { + if (!Runner.selectedScenario) return; + const runBtn = document.getElementById("run-btn"); + runBtn.disabled = true; + + const panel = document.getElementById("task-panel"); + const logBox = document.getElementById("task-log"); + const statusBadge = document.getElementById("task-status"); + const reportBtn = document.getElementById("view-report-btn"); + panel.hidden = false; + reportBtn.hidden = true; + logBox.textContent = ""; + Runner._setStatus(statusBadge, "queued"); + + try { + // Step 1: apply LLM profiles to YAML if any selected + await Runner._applyProfilesIfNeeded(logBox); + + // Step 2: trigger evaluation + const resp = await API.triggerEvaluation(Runner.selectedScenario); + Runner.poll(resp.task_id); + } catch (err) { + Runner._setStatus(statusBadge, "failed"); + logBox.textContent = (logBox.textContent ? 
logBox.textContent + "\n" : "") + `触发失败:${err.message}`; + runBtn.disabled = false; + } + }, + + // 如果用户选了 profile,就先 apply 写回 YAML + async _applyProfilesIfNeeded(logBox) { + const judgeId = document.getElementById("role-judge").value; + const answerId = document.getElementById("role-answer").value; + const datasetId = document.getElementById("role-dataset").value; + + if (!judgeId && !answerId && !datasetId) return; // 全空,跳过 + + logBox.textContent = "正在将 LLM 配置写入场景文件…\n"; + const body = { + scenario_path: Runner.selectedScenario, + judge_profile_id: judgeId || null, + answer_profile_id: answerId || null, + dataset_profile_id: datasetId || null, + }; + const result = await API.applyProfiles(body); + const fields = (result.patched_fields || []).join(", "); + logBox.textContent += fields + ? `✓ 已更新字段:${fields}\n` + : "(未找到可更新的字段,继续运行)\n"; + }, + + // 周期性轮询任务状态,刷新日志与徽标。 + poll(taskId) { + const logBox = document.getElementById("task-log"); + const statusBadge = document.getElementById("task-status"); + const reportBtn = document.getElementById("view-report-btn"); + const runBtn = document.getElementById("run-btn"); + + if (Runner.pollTimer) clearInterval(Runner.pollTimer); + Runner.pollTimer = setInterval(async () => { + try { + const status = await API.taskStatus(taskId); + logBox.textContent = (status.logs || []).join("\n"); + logBox.scrollTop = logBox.scrollHeight; + Runner._setStatus(statusBadge, status.status); + + if (status.status === "completed" || status.status === "failed") { + clearInterval(Runner.pollTimer); + runBtn.disabled = false; + if (status.status === "completed" && status.run_id) { + Runner.lastRunId = status.run_id; + reportBtn.hidden = false; + } + } + } catch (err) { + clearInterval(Runner.pollTimer); + logBox.textContent += `\n轮询失败:${err.message}`; + runBtn.disabled = false; + } + }, 1200); + }, + + // 更新状态徽标的文本与配色类。 + _setStatus(badge, status) { + badge.textContent = status; + badge.className = "badge " + status; + }, +}; +``` + +- [ ] **Step 
4: Smoke-test server import** + +```bash +python -c "from webapp.server import create_app; app = create_app(); print('OK')" +``` +Expected: `OK` + +- [ ] **Step 5: Commit** + +```bash +git add webapp/static/js/runner.js webapp/static/index.html webapp/static/css/app.css +git commit -m "feat: add LLM role-assignment panel to 新建评估 view" +``` + +--- + +## Task 7: End-to-End Smoke Test & Init Files + +**Files:** +- Create: `tests/webapp/__init__.py` (if missing) +- Verify all tests pass + +- [ ] **Step 1: Ensure test package init files exist** + +```bash +# Check what init files exist +ls tests/ && ls tests/webapp/ 2>/dev/null || echo "no webapp dir" +``` + +Create missing init files: +```bash +touch tests/__init__.py 2>/dev/null; touch tests/webapp/__init__.py 2>/dev/null; echo done +``` + +- [ ] **Step 2: Run full test suite** + +```bash +python -m pytest tests/webapp/ -v +``` +Expected: All tests pass (≥ 17 tests) + +- [ ] **Step 3: Verify server starts and routes are registered** + +```bash +python -c " +from webapp.server import create_app +app = create_app() +routes = [r.path for r in app.routes] +assert '/api/llm-profiles' in routes or any('llm-profiles' in r for r in routes), 'Route missing' +print('Routes OK:', [r for r in routes if 'llm' in r or 'profile' in r]) +" +``` +Expected: prints routes including `/api/llm-profiles` + +- [ ] **Step 4: Final commit** + +```bash +git add tests/ +git commit -m "test: ensure test package structure and all webapp tests pass" +``` diff --git a/rag_eval/metrics/pipeline.py b/rag_eval/metrics/pipeline.py index 91865c2..d575a16 100644 --- a/rag_eval/metrics/pipeline.py +++ b/rag_eval/metrics/pipeline.py @@ -94,6 +94,23 @@ class MetricPipeline: reference=sample.ground_truth, retrieved_contexts=sample.contexts, ) + elif name == "noise_sensitivity": + coroutine = metric.ascore( + user_input=sample.question, + response=sample.answer, + reference=sample.ground_truth, + retrieved_contexts=sample.contexts, + ) + elif name == 
"factual_correctness": + coroutine = metric.ascore( + response=sample.answer, + reference=sample.ground_truth, + ) + elif name == "semantic_similarity": + coroutine = metric.ascore( + reference=sample.ground_truth, + response=sample.answer, + ) else: raise ValueError(f"Unsupported metric: {name}") diff --git a/rag_eval/metrics/registry.py b/rag_eval/metrics/registry.py index 797bd6b..3ab01d1 100644 --- a/rag_eval/metrics/registry.py +++ b/rag_eval/metrics/registry.py @@ -1,8 +1,13 @@ """Supported metric names recognized by scenario validation and pipeline setup.""" SUPPORTED_METRICS = { + # Core retrieval / generation metrics (always available). "faithfulness", "answer_relevancy", "context_recall", "context_precision", + # Robustness and end-to-end metrics (see 架构设计 §10.2). + "noise_sensitivity", # 鲁棒性:对检索噪声的敏感度 + "factual_correctness", # 端到端:回答相对标准答案的事实正确性 + "semantic_similarity", # 端到端:回答与标准答案的语义相似度(embedding,无 LLM 调用) } diff --git a/scenarios/offline/siemens-pdf-offline-smoke.yaml b/scenarios/offline/siemens-pdf-offline-smoke.yaml index 9494bd2..ca3e7ee 100644 --- a/scenarios/offline/siemens-pdf-offline-smoke.yaml +++ b/scenarios/offline/siemens-pdf-offline-smoke.yaml @@ -9,6 +9,10 @@ metrics: - answer_relevancy - context_recall - context_precision + # 可选:鲁棒性 / 端到端指标(数据集已含 ground_truth,取消注释即可启用) + # - noise_sensitivity # 鲁棒性:对检索噪声的敏感度 + # - factual_correctness # 端到端:事实正确性(相对标准答案) + # - semantic_similarity # 端到端:语义相似度(embedding,无 LLM 调用) output_dir: ../../outputs/siemens-pdf-offline-smoke runtime: batch_size: 4 diff --git a/scenarios/online/sample-pdf-question-bank-online.yaml b/scenarios/online/sample-pdf-question-bank-online.yaml index a433d00..d216a7e 100644 --- a/scenarios/online/sample-pdf-question-bank-online.yaml +++ b/scenarios/online/sample-pdf-question-bank-online.yaml @@ -1,13 +1,13 @@ scenario_name: sample-pdf-question-bank-online mode: online dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv -judge_model: deepseek-v4-pro 
+judge_model: qwen3.5-flash embedding_model: text-embedding-v3 metrics: - - faithfulness - - answer_relevancy - - context_recall - - context_precision +- faithfulness +- answer_relevancy +- context_recall +- context_precision output_dir: ../../outputs/online/sample-pdf-question-bank runtime: batch_size: 2 @@ -19,4 +19,4 @@ app_adapter: callable: apps.pdf_question_bank.adapter:run static_kwargs: source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl - model: deepseek-v4-flash + model: glm-5 diff --git a/scenarios/online/siemens-pdf-question-bank-online.yaml b/scenarios/online/siemens-pdf-question-bank-online.yaml index 4a614b4..e26ec3d 100644 --- a/scenarios/online/siemens-pdf-question-bank-online.yaml +++ b/scenarios/online/siemens-pdf-question-bank-online.yaml @@ -1,28 +1,26 @@ scenario_name: siemens-pdf-question-bank-online mode: online dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv -# judge_model: qwen3.5-flash judge_model: deepseek-v4-flash embedding_model: text-embedding-v3 -optimization_advisor: true # 评测结束后自动生成优化建议报告 +optimization_advisor: true metrics: - - faithfulness - - answer_relevancy - - context_recall - - context_precision - # 已启用:鲁棒性 / 端到端指标(数据集已含 ground_truth) - - noise_sensitivity # 鲁棒性:对检索噪声的敏感度 - - factual_correctness # 端到端:事实正确性(相对标准答案) - - semantic_similarity # 端到端:语义相似度(embedding,无 LLM 调用) +- faithfulness +- answer_relevancy +- context_recall +- context_precision +- noise_sensitivity +- factual_correctness +- semantic_similarity output_dir: ../../outputs/online/siemens-pdf-question-bank runtime: - batch_size: 4 - app_concurrency: 4 - metric_concurrency: 4 - max_samples: 50 + batch_size: 3 + app_concurrency: 3 + metric_concurrency: 3 + max_samples: 10 app_adapter: type: python callable: apps.siemens_pdf_qa.adapter:run static_kwargs: source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl - model: deepseek-v4-flash + model: glm-5