This commit is contained in:
2026-06-16 18:12:33 +08:00
parent ca01e44ad2
commit 24956bbf75
7 changed files with 1496 additions and 21 deletions

64
configs/llm_profiles.json Normal file
View File

@@ -0,0 +1,64 @@
{
"profiles": [
{
"profile_id": "c8e185a64fa0",
"name": "glm-5",
"model": "glm-5",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:16:22.438297+00:00",
"updated_at": "2026-06-16T09:19:03.089865+00:00"
},
{
"profile_id": "54ddfe5aeb46",
"name": "deepseek-v4-pro",
"model": "deepseek-v4-pro",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:17:08.473904+00:00",
"updated_at": "2026-06-16T09:19:07.504082+00:00"
},
{
"profile_id": "25d035eef194",
"name": "qwen3.5-flash",
"model": "qwen3.5-flash",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:18:24.265619+00:00",
"updated_at": "2026-06-16T09:18:24.265619+00:00"
},
{
"profile_id": "ff1d0f417a5d",
"name": "deepseek-v4-flash",
"model": "deepseek-v4-flash",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:18:57.091549+00:00",
"updated_at": "2026-06-16T09:18:57.091549+00:00"
},
{
"profile_id": "5b04c49df9df",
"name": "text-embedding-v4",
"model": "text-embedding-v4",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:19:49.104004+00:00",
"updated_at": "2026-06-16T09:19:49.104004+00:00"
},
{
"profile_id": "b4f7c82859d5",
"name": "text-embedding-v3",
"model": "text-embedding-v3",
"base_url": "http://6.86.80.4:30080/v1",
"api_key": "sk-fVr9KmDZNC4pGDBQj0EUWz9bDmFzNxjYC9EzZpe2bVDsxtz8",
"timeout_seconds": 600,
"created_at": "2026-06-16T09:20:18.266540+00:00",
"updated_at": "2026-06-16T09:20:18.266540+00:00"
}
]
}

File diff suppressed because it is too large Load Diff

View File

@@ -94,6 +94,23 @@ class MetricPipeline:
reference=sample.ground_truth, reference=sample.ground_truth,
retrieved_contexts=sample.contexts, retrieved_contexts=sample.contexts,
) )
elif name == "noise_sensitivity":
coroutine = metric.ascore(
user_input=sample.question,
response=sample.answer,
reference=sample.ground_truth,
retrieved_contexts=sample.contexts,
)
elif name == "factual_correctness":
coroutine = metric.ascore(
response=sample.answer,
reference=sample.ground_truth,
)
elif name == "semantic_similarity":
coroutine = metric.ascore(
reference=sample.ground_truth,
response=sample.answer,
)
else: else:
raise ValueError(f"Unsupported metric: {name}") raise ValueError(f"Unsupported metric: {name}")

View File

@@ -1,8 +1,13 @@
"""Supported metric names recognized by scenario validation and pipeline setup.""" """Supported metric names recognized by scenario validation and pipeline setup."""
SUPPORTED_METRICS = { SUPPORTED_METRICS = {
# Core retrieval / generation metrics (always available).
"faithfulness", "faithfulness",
"answer_relevancy", "answer_relevancy",
"context_recall", "context_recall",
"context_precision", "context_precision",
# Robustness and end-to-end metrics (see 架构设计 §10.2).
"noise_sensitivity", # 鲁棒性:对检索噪声的敏感度
"factual_correctness", # 端到端:回答相对标准答案的事实正确性
"semantic_similarity", # 端到端:回答与标准答案的语义相似度(embedding, 无 LLM 调用)
} }

View File

@@ -9,6 +9,10 @@ metrics:
- answer_relevancy - answer_relevancy
- context_recall - context_recall
- context_precision - context_precision
# 可选:鲁棒性 / 端到端指标(数据集已含 ground_truth, 取消注释即可启用)
# - noise_sensitivity # 鲁棒性:对检索噪声的敏感度
# - factual_correctness # 端到端:事实正确性(相对标准答案)
# - semantic_similarity # 端到端:语义相似度(embedding, 无 LLM 调用)
output_dir: ../../outputs/siemens-pdf-offline-smoke output_dir: ../../outputs/siemens-pdf-offline-smoke
runtime: runtime:
batch_size: 4 batch_size: 4

View File

@@ -1,13 +1,13 @@
scenario_name: sample-pdf-question-bank-online scenario_name: sample-pdf-question-bank-online
mode: online mode: online
dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv
judge_model: deepseek-v4-pro judge_model: qwen3.5-flash
embedding_model: text-embedding-v3 embedding_model: text-embedding-v3
metrics: metrics:
- faithfulness - faithfulness
- answer_relevancy - answer_relevancy
- context_recall - context_recall
- context_precision - context_precision
output_dir: ../../outputs/online/sample-pdf-question-bank output_dir: ../../outputs/online/sample-pdf-question-bank
runtime: runtime:
batch_size: 2 batch_size: 2
@@ -19,4 +19,4 @@ app_adapter:
callable: apps.pdf_question_bank.adapter:run callable: apps.pdf_question_bank.adapter:run
static_kwargs: static_kwargs:
source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl
model: deepseek-v4-flash model: glm-5

View File

@@ -1,28 +1,26 @@
scenario_name: siemens-pdf-question-bank-online scenario_name: siemens-pdf-question-bank-online
mode: online mode: online
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
# judge_model: qwen3.5-flash
judge_model: deepseek-v4-flash judge_model: deepseek-v4-flash
embedding_model: text-embedding-v3 embedding_model: text-embedding-v3
optimization_advisor: true # 评测结束后自动生成优化建议报告 optimization_advisor: true
metrics: metrics:
- faithfulness - faithfulness
- answer_relevancy - answer_relevancy
- context_recall - context_recall
- context_precision - context_precision
# 已启用:鲁棒性 / 端到端指标(数据集已含 ground_truth) - noise_sensitivity
- noise_sensitivity # 鲁棒性:对检索噪声的敏感度 - factual_correctness
- factual_correctness # 端到端:事实正确性(相对标准答案) - semantic_similarity
- semantic_similarity # 端到端:语义相似度(embedding, 无 LLM 调用)
output_dir: ../../outputs/online/siemens-pdf-question-bank output_dir: ../../outputs/online/siemens-pdf-question-bank
runtime: runtime:
batch_size: 4 batch_size: 3
app_concurrency: 4 app_concurrency: 3
metric_concurrency: 4 metric_concurrency: 3
max_samples: 50 max_samples: 10
app_adapter: app_adapter:
type: python type: python
callable: apps.siemens_pdf_qa.adapter:run callable: apps.siemens_pdf_qa.adapter:run
static_kwargs: static_kwargs:
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
model: deepseek-v4-flash model: glm-5