更新
This commit is contained in:
64
configs/llm_profiles.json
Normal file
64
configs/llm_profiles.json
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
{
|
||||||
|
"profiles": [
|
||||||
|
{
|
||||||
|
"profile_id": "c8e185a64fa0",
|
||||||
|
"name": "glm-5",
|
||||||
|
"model": "glm-5",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "<redacted: load from environment or a secret store>",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:16:22.438297+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:19:03.089865+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "54ddfe5aeb46",
|
||||||
|
"name": "deepseek-v4-pro",
|
||||||
|
"model": "deepseek-v4-pro",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "<redacted: load from environment or a secret store>",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:17:08.473904+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:19:07.504082+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "25d035eef194",
|
||||||
|
"name": "qwen3.5-flash",
|
||||||
|
"model": "qwen3.5-flash",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "<redacted: load from environment or a secret store>",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:18:24.265619+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:18:24.265619+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "ff1d0f417a5d",
|
||||||
|
"name": "deepseek-v4-flash",
|
||||||
|
"model": "deepseek-v4-flash",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "<redacted: load from environment or a secret store>",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:18:57.091549+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:18:57.091549+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "5b04c49df9df",
|
||||||
|
"name": "text-embedding-v4",
|
||||||
|
"model": "text-embedding-v4",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "<redacted: load from environment or a secret store>",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:19:49.104004+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:19:49.104004+00:00"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"profile_id": "b4f7c82859d5",
|
||||||
|
"name": "text-embedding-v3",
|
||||||
|
"model": "text-embedding-v3",
|
||||||
|
"base_url": "http://6.86.80.4:30080/v1",
|
||||||
|
"api_key": "<redacted: load from environment or a secret store>",
|
||||||
|
"timeout_seconds": 600,
|
||||||
|
"created_at": "2026-06-16T09:20:18.266540+00:00",
|
||||||
|
"updated_at": "2026-06-16T09:20:18.266540+00:00"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
1387
docs/superpowers/plans/2026-06-16-llm-profile-manager.md
Normal file
1387
docs/superpowers/plans/2026-06-16-llm-profile-manager.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -94,6 +94,23 @@ class MetricPipeline:
|
|||||||
reference=sample.ground_truth,
|
reference=sample.ground_truth,
|
||||||
retrieved_contexts=sample.contexts,
|
retrieved_contexts=sample.contexts,
|
||||||
)
|
)
|
||||||
|
elif name == "noise_sensitivity":
|
||||||
|
coroutine = metric.ascore(
|
||||||
|
user_input=sample.question,
|
||||||
|
response=sample.answer,
|
||||||
|
reference=sample.ground_truth,
|
||||||
|
retrieved_contexts=sample.contexts,
|
||||||
|
)
|
||||||
|
elif name == "factual_correctness":
|
||||||
|
coroutine = metric.ascore(
|
||||||
|
response=sample.answer,
|
||||||
|
reference=sample.ground_truth,
|
||||||
|
)
|
||||||
|
elif name == "semantic_similarity":
|
||||||
|
coroutine = metric.ascore(
|
||||||
|
reference=sample.ground_truth,
|
||||||
|
response=sample.answer,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported metric: {name}")
|
raise ValueError(f"Unsupported metric: {name}")
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
"""Supported metric names recognized by scenario validation and pipeline setup."""
|
"""Supported metric names recognized by scenario validation and pipeline setup."""
|
||||||
|
|
||||||
SUPPORTED_METRICS = {
|
SUPPORTED_METRICS = {
|
||||||
|
# Core retrieval / generation metrics (always available).
|
||||||
"faithfulness",
|
"faithfulness",
|
||||||
"answer_relevancy",
|
"answer_relevancy",
|
||||||
"context_recall",
|
"context_recall",
|
||||||
"context_precision",
|
"context_precision",
|
||||||
|
# Robustness and end-to-end metrics (see 架构设计 §10.2).
|
||||||
|
"noise_sensitivity", # 鲁棒性:对检索噪声的敏感度
|
||||||
|
"factual_correctness", # 端到端:回答相对标准答案的事实正确性
|
||||||
|
"semantic_similarity", # 端到端:回答与标准答案的语义相似度(embedding,无 LLM 调用)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,6 +9,10 @@ metrics:
|
|||||||
- answer_relevancy
|
- answer_relevancy
|
||||||
- context_recall
|
- context_recall
|
||||||
- context_precision
|
- context_precision
|
||||||
|
# 可选:鲁棒性 / 端到端指标(数据集已含 ground_truth,取消注释即可启用)
|
||||||
|
# - noise_sensitivity # 鲁棒性:对检索噪声的敏感度
|
||||||
|
# - factual_correctness # 端到端:事实正确性(相对标准答案)
|
||||||
|
# - semantic_similarity # 端到端:语义相似度(embedding,无 LLM 调用)
|
||||||
output_dir: ../../outputs/siemens-pdf-offline-smoke
|
output_dir: ../../outputs/siemens-pdf-offline-smoke
|
||||||
runtime:
|
runtime:
|
||||||
batch_size: 4
|
batch_size: 4
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
scenario_name: sample-pdf-question-bank-online
|
scenario_name: sample-pdf-question-bank-online
|
||||||
mode: online
|
mode: online
|
||||||
dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv
|
dataset: ../../datasets/raw/generated/sample-pdf-question-bank.csv
|
||||||
judge_model: deepseek-v4-pro
|
judge_model: qwen3.5-flash
|
||||||
embedding_model: text-embedding-v3
|
embedding_model: text-embedding-v3
|
||||||
metrics:
|
metrics:
|
||||||
- faithfulness
|
- faithfulness
|
||||||
@@ -19,4 +19,4 @@ app_adapter:
|
|||||||
callable: apps.pdf_question_bank.adapter:run
|
callable: apps.pdf_question_bank.adapter:run
|
||||||
static_kwargs:
|
static_kwargs:
|
||||||
source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl
|
source_chunks_path: ../../outputs/dataset-builds/sample-pdf-question-bank/latest/source_chunks.jsonl
|
||||||
model: deepseek-v4-flash
|
model: glm-5
|
||||||
|
|||||||
@@ -1,28 +1,26 @@
|
|||||||
scenario_name: siemens-pdf-question-bank-online
|
scenario_name: siemens-pdf-question-bank-online
|
||||||
mode: online
|
mode: online
|
||||||
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
|
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv
|
||||||
# judge_model: qwen3.5-flash
|
|
||||||
judge_model: deepseek-v4-flash
|
judge_model: deepseek-v4-flash
|
||||||
embedding_model: text-embedding-v3
|
embedding_model: text-embedding-v3
|
||||||
optimization_advisor: true # 评测结束后自动生成优化建议报告
|
optimization_advisor: true
|
||||||
metrics:
|
metrics:
|
||||||
- faithfulness
|
- faithfulness
|
||||||
- answer_relevancy
|
- answer_relevancy
|
||||||
- context_recall
|
- context_recall
|
||||||
- context_precision
|
- context_precision
|
||||||
# 已启用:鲁棒性 / 端到端指标(数据集已含 ground_truth)
|
- noise_sensitivity
|
||||||
- noise_sensitivity # 鲁棒性:对检索噪声的敏感度
|
- factual_correctness
|
||||||
- factual_correctness # 端到端:事实正确性(相对标准答案)
|
- semantic_similarity
|
||||||
- semantic_similarity # 端到端:语义相似度(embedding,无 LLM 调用)
|
|
||||||
output_dir: ../../outputs/online/siemens-pdf-question-bank
|
output_dir: ../../outputs/online/siemens-pdf-question-bank
|
||||||
runtime:
|
runtime:
|
||||||
batch_size: 4
|
batch_size: 3
|
||||||
app_concurrency: 4
|
app_concurrency: 3
|
||||||
metric_concurrency: 4
|
metric_concurrency: 3
|
||||||
max_samples: 50
|
max_samples: 10
|
||||||
app_adapter:
|
app_adapter:
|
||||||
type: python
|
type: python
|
||||||
callable: apps.siemens_pdf_qa.adapter:run
|
callable: apps.siemens_pdf_qa.adapter:run
|
||||||
static_kwargs:
|
static_kwargs:
|
||||||
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
|
source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
|
||||||
model: deepseek-v4-flash
|
model: glm-5
|
||||||
|
|||||||
Reference in New Issue
Block a user