feat(advisor): add optimization advisor module

- rag_eval/advisor/: new package with rules engine, LLM analyzer, and writer
- rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples)
- llm_analyzer.py: Chinese optimization report via judge_model, with graceful fallback
- writer.py: writes optimization_advice.md and a log summary
- __init__.py: run_advisor() entry point (no-op when optimization_advisor=False)
- Scenario.optimization_advisor: new bool field (default False)
- ScenarioModel: same field added; loader.py passes it through
- RunArtifactPaths.advice_md: new path field
- factory.py: build_models() is now public; build_metric_pipeline() accepts pre-built llm/embeddings
- runner.py: lifts llm, passes it to the pipeline and the advisor; calls run_advisor() at the end
- siemens online YAML: optimization_advisor: true enabled
- tests: 9 rules tests + 6 writer tests, all pass
- docs: advisor section added to engine-flow.md and architecture.md

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-16 17:06:19 +08:00
parent d68399d39b
commit f5c2dce64a
17 changed files with 2381 additions and 9 deletions
--- a/rag_eval/metrics/factory.py
+++ b/rag_eval/metrics/factory.py
@@ -18,7 +18,10 @@ from ragas.metrics.collections import (
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
+    FactualCorrectness,
    Faithfulness,
+    NoiseSensitivity,
+    SemanticSimilarity,
 )

 from .pipeline import MetricPipeline
@@ -39,19 +42,34 @@ def build_models(
 def build_metric_pipeline(
    scenario: Scenario,
    settings: EvaluationSettings,
+    llm: Any | None = None,
+    embeddings: Any | None = None,
 ) -> MetricPipeline:
-    """Build a metric pipeline containing only the metrics requested by the scenario."""
-    llm, embeddings = build_models(
-        scenario.judge_model,
-        scenario.embedding_model,
-        settings,
-    )
+    """Build a metric pipeline containing only the metrics requested by the scenario.
+
+    If llm and embeddings are provided (pre-built by the caller), they are reused.
+    Otherwise, new instances are created from scenario + settings.
+    """
+    if llm is None or embeddings is None:
+        llm, embeddings = build_models(
+            scenario.judge_model,
+            scenario.embedding_model,
+            settings,
+        )
+
    # Build the full registry once, then slice it by configured metric names.
    registry: dict[str, Any] = {
        "faithfulness": Faithfulness(llm=llm),
        "answer_relevancy": AnswerRelevancy(llm=llm, embeddings=embeddings),
        "context_recall": ContextRecall(llm=llm),
        "context_precision": ContextPrecision(llm=llm),
+        # Robustness / end-to-end metrics (架构设计 §10.2).
+        # NoiseSensitivity mode='relevant': sensitivity to noise from relevant contexts.
+        "noise_sensitivity": NoiseSensitivity(llm=llm),
+        # FactualCorrectness mode='f1': balances claim precision and recall vs. ground truth.
+        "factual_correctness": FactualCorrectness(llm=llm),
+        # SemanticSimilarity: embedding cosine between answer and ground truth (no LLM call).
+        "semantic_similarity": SemanticSimilarity(embeddings=embeddings),
    }
    return MetricPipeline(
        metrics={name: registry[name] for name in scenario.metrics},