---
# Online-mode evaluation scenario for the siemens-pdf-question-bank dataset.
#
# NOTE(review): this file was reconstructed from a whitespace-collapsed
# single-line source. Nesting was inferred from key names; confirm that
# `max_samples` belongs under `runtime` and `model` under `static_kwargs`.

scenario_name: siemens-pdf-question-bank-online
mode: online
dataset: ../../datasets/raw/generated/siemens-pdf-question-bank.csv

# Alternative judge model, currently disabled:
# judge_model: qwen3.5-flash
judge_model: deepseek-v4-flash
embedding_model: text-embedding-v3

# Automatically generate an optimization-advice report after the evaluation
# finishes.
optimization_advisor: true

metrics:
  - faithfulness
  - answer_relevancy
  - context_recall
  - context_precision
  # Enabled: robustness / end-to-end metrics (the dataset already includes
  # ground_truth).
  - noise_sensitivity  # robustness: sensitivity to retrieval noise
  - factual_correctness  # end-to-end: factual correctness vs. reference answer
  - semantic_similarity  # end-to-end: semantic similarity (embedding, no LLM calls)

output_dir: ../../outputs/online/siemens-pdf-question-bank

runtime:
  batch_size: 4
  app_concurrency: 4
  metric_concurrency: 4
  max_samples: 50

app_adapter:
  type: python
  # Written as "<dotted module path>:<attribute>".
  callable: apps.siemens_pdf_qa.adapter:run
  static_kwargs:
    source_chunks_path: ../../outputs/dataset-builds/siemens-pdf-question-bank/latest/source_chunks.jsonl
    model: deepseek-v4-flash