更新

2026-06-16 18:12:33 +08:00
parent ca01e44ad2
commit 24956bbf75
7 changed files with 1496 additions and 21 deletions
--- a/configs/llm_profiles.json
+++ b/configs/llm_profiles.json
@@ -0,0 +1,64 @@
+{
+  "profiles": [
+    {
+      "profile_id": "c8e185a64fa0",
+      "name": "glm-5",
+      "model": "glm-5",
+      "base_url": "http://6.86.80.4:30080/v1",
+      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
+      "timeout_seconds": 600,
+      "created_at": "2026-06-16T09:16:22.438297+00:00",
+      "updated_at": "2026-06-16T09:19:03.089865+00:00"
+    },
+    {
+      "profile_id": "54ddfe5aeb46",
+      "name": "deepseek-v4-pro",
+      "model": "deepseek-v4-pro",
+      "base_url": "http://6.86.80.4:30080/v1",
+      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
+      "timeout_seconds": 600,
+      "created_at": "2026-06-16T09:17:08.473904+00:00",
+      "updated_at": "2026-06-16T09:19:07.504082+00:00"
+    },
+    {
+      "profile_id": "25d035eef194",
+      "name": "qwen3.5-flash",
+      "model": "qwen3.5-flash",
+      "base_url": "http://6.86.80.4:30080/v1",
+      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
+      "timeout_seconds": 600,
+      "created_at": "2026-06-16T09:18:24.265619+00:00",
+      "updated_at": "2026-06-16T09:18:24.265619+00:00"
+    },
+    {
+      "profile_id": "ff1d0f417a5d",
+      "name": "deepseek-v4-flash",
+      "model": "deepseek-v4-flash",
+      "base_url": "http://6.86.80.4:30080/v1",
+      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
+      "timeout_seconds": 600,
+      "created_at": "2026-06-16T09:18:57.091549+00:00",
+      "updated_at": "2026-06-16T09:18:57.091549+00:00"
+    },
+    {
+      "profile_id": "5b04c49df9df",
+      "name": "text-embedding-v4",
+      "model": "text-embedding-v4",
+      "base_url": "http://6.86.80.4:30080/v1",
+      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
+      "timeout_seconds": 600,
+      "created_at": "2026-06-16T09:19:49.104004+00:00",
+      "updated_at": "2026-06-16T09:19:49.104004+00:00"
+    },
+    {
+      "profile_id": "b4f7c82859d5",
+      "name": "text-embedding-v3",
+      "model": "text-embedding-v3",
+      "base_url": "http://6.86.80.4:30080/v1",
+      "api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
+      "timeout_seconds": 600,
+      "created_at": "2026-06-16T09:20:18.266540+00:00",
+      "updated_at": "2026-06-16T09:20:18.266540+00:00"
+    }
+  ]
+}
--- a/docs/superpowers/plans/2026-06-16-llm-profile-manager.md
+++ b/docs/superpowers/plans/2026-06-16-llm-profile-manager.md
--- a/rag_eval/metrics/pipeline.py
+++ b/rag_eval/metrics/pipeline.py
@@ -94,6 +94,23 @@ class MetricPipeline:
                reference=sample.ground_truth,
                retrieved_contexts=sample.contexts,
            )
+        elif name == "noise_sensitivity":
+            coroutine = metric.ascore(
+                user_input=sample.question,
+                response=sample.answer,
+                reference=sample.ground_truth,
+                retrieved_contexts=sample.contexts,
+            )
+        elif name == "factual_correctness":
+            coroutine = metric.ascore(
+                response=sample.answer,
+                reference=sample.ground_truth,
+            )
+        elif name == "semantic_similarity":
+            coroutine = metric.ascore(
+                reference=sample.ground_truth,
+                response=sample.answer,
+            )
        else:
            raise ValueError(f"Unsupported metric: {name}")

--- a/rag_eval/metrics/registry.py
+++ b/rag_eval/metrics/registry.py
@@ -1,8 +1,13 @@
 """Supported metric names recognized by scenario validation and pipeline setup."""

 SUPPORTED_METRICS = {
+    # Core retrieval / generation metrics (always available).
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "context_precision",
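+    # Robustness and end-to-end metrics (see the architecture design doc, §10.2).
+    "noise_sensitivity",      # Robustness: sensitivity to retrieval noise
+    "factual_correctness",    # End-to-end: factual correctness of the answer against the reference answer
+    "semantic_similarity",    # End-to-end: semantic similarity between answer and reference (embedding-based, no LLM call)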
 }
--- a/scenarios/offline/siemens-pdf-offline-smoke.yaml
+++ b/scenarios/offline/siemens-pdf-offline-smoke.yaml
@@ -9,6 +9,10 @@ metrics:
  - answer_relevancy
  - context_recall
  - context_precision
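+  # Optional: robustness / end-to-end metrics (the dataset already includes ground_truth; uncomment to enable)
+  # - noise_sensitivity      # Robustness: sensitivity to retrieval noise
+  # - factual_correctness    # End-to-end: factual correctness (against the reference answer)
+  # - semantic_similarity    # End-to-end: semantic similarity (embedding-based, no LLM call)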
 output_dir: ../../outputs/siemens-pdf-offline-smoke
 runtime:
  batch_size: 4
--- a/scenarios/online/sample-pdf-question-bank-online.yaml
+++ b/scenarios/online/sample-pdf-question-bank-online.yaml
@@ -1,13 +1,13 @@
 scenario_name: sample-pdf-question-bank-online
 mode: online
 dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv
-judge_model: deepseek-v4-pro
+judge_model: qwen3.5-flash
 embedding_model: text-embedding-v3
 metrics:
-  - faithfulness
-  - answer_relevancy
-  - context_recall
-  - context_precision
+- faithfulness
+- answer_relevancy
+- context_recall
+- context_precision
 output_dir: ../../outputs/online/sample-pdf-question-bank
 runtime:
  batch_size: 2
@@ -19,4 +19,4 @@ app_adapter:
  callable: apps.pdf_question_bank.adapter:run
  static_kwargs:
    source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl
-    model: deepseek-v4-flash
+    model: glm-5
--- a/scenarios/online/siemens-pdf-question-bank-online.yaml
+++ b/scenarios/online/siemens-pdf-question-bank-online.yaml
@@ -1,28 +1,26 @@
 scenario_name: siemens-pdf-question-bank-online
 mode: online
 dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
-# judge_model: qwen3.5-flash
 judge_model: deepseek-v4-flash
 embedding_model: text-embedding-v3
-optimization_advisor: true      # 评测结束后自动生成优化建议报告
+optimization_advisor: true
 metrics:
-  - faithfulness
-  - answer_relevancy
-  - context_recall
-  - context_precision
-  # 已启用：鲁棒性 / 端到端指标（数据集已含 ground_truth）
-  - noise_sensitivity        # 鲁棒性：对检索噪声的敏感度
-  - factual_correctness      # 端到端：事实正确性（相对标准答案）
-  - semantic_similarity      # 端到端：语义相似度（embedding，无 LLM 调用）
+- faithfulness
+- answer_relevancy
+- context_recall
+- context_precision
+- noise_sensitivity
+- factual_correctness
+- semantic_similarity
 output_dir: ../../outputs/online/siemens-pdf-question-bank
 runtime:
-  batch_size: 4
-  app_concurrency: 4
-  metric_concurrency: 4
-  max_samples: 50
+  batch_size: 3
+  app_concurrency: 3
+  metric_concurrency: 3
+  max_samples: 10
 app_adapter:
  type: python
  callable: apps.siemens_pdf_qa.adapter:run
  static_kwargs:
    source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
-    model: deepseek-v4-flash
+    model: glm-5