siemens_ragas/rag_eval/advisor/rules.py

"""Rule-based diagnostic engine for RAG evaluation metric scores."""
from __future__ import annotations

import math
from dataclasses import dataclass, field
from typing import Any


@dataclass
class MetricRule:
    """Threshold configuration and diagnostic text for one metric."""
    warning_threshold: float
    critical_threshold: float
    higher_is_better: bool  # False for noise_sensitivity
    root_causes: list[str]
    suggested_actions: list[str]
    # Scores below this threshold trigger a "low" advisory (LLM suggestion requested).
    # Only applies to higher_is_better metrics; noise_sensitivity uses existing thresholds.
    advisory_threshold: float = 0.85


METRIC_RULES: dict[str, MetricRule] = {
    "faithfulness": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "生成回答包含检索片段中不支持的陈述（幻觉）",
            "生成阶段未严格遵循 grounding 约束",
            "校验阶段未开启或未生效",
        ],
        suggested_actions=[
            "强化生成 prompt 的 grounding 约束（'只依据参考资料作答'）",
            "开启校验阶段（validation: by_scenario）",
            "检查低分样本中模型是否引用了片段外的知识",
        ],
    ),
    "answer_relevancy": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "回答偏离问题主旨或包含大量冗余内容",
            "查询改写后问题语义漂移",
            "生成 prompt 格式约束不足",
        ],
        suggested_actions=[
            "优化查询改写 prompt，确保改写后语义不偏移",
            "在生成 prompt 中加入'简洁准确、直接回答问题'的约束",
            "检查低分样本的回答是否存在格式冗余或话题偏移",
        ],
    ),
    "context_recall": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "检索未能召回标准答案所涉及的关键信息",
            "单一查询未能覆盖问题的多个角度",
            "过召回数量不足，关键片段被截断",
        ],
        suggested_actions=[
            "启用多查询扩展（use_multi_query）覆盖不同措辞",
            "对多跳问题启用问题分解（sub_questions）",
            "加大过召回宽度（recall_top_k）",
            "对颗粒度细的问题尝试 Step-back 双路检索",
        ],
    ),
    "context_precision": MetricRule(
        warning_threshold=0.6,
        critical_threshold=0.4,
        higher_is_better=True,
        root_causes=[
            "检索引入过多与问题无关的片段",
            "重排未能将相关片段排在前列",
            "缺少相关性过滤，噪声片段进入上下文",
        ],
        suggested_actions=[
            "启用或优化 listwise 重排，将相关片段排在前列",
            "启用上下文压缩（compression）过滤无关句子",
            "启用相关性过滤（relevance_filter）丢弃明确无关片段",
            "缩小 rerank_keep_k（如从 8 降到 5）",
        ],
    ),
    "noise_sensitivity": MetricRule(
        warning_threshold=0.3,   # higher is worse; trigger when mean > threshold
        critical_threshold=0.5,
        higher_is_better=False,
        root_causes=[
            "回答中包含检索到的噪声片段所引入的错误陈述",
            "相关性过滤未能拦截干扰性片段",
            "生成阶段对噪声片段未加区分地引用",
        ],
        suggested_actions=[
            "启用相关性过滤（relevance_filter）拦截噪声",
            "优化重排，将不相关片段排到截断点之后",
            "在生成 prompt 中强调'来源冲突时并列陈述，不擅自下定论'",
        ],
    ),
    "factual_correctness": MetricRule(
        warning_threshold=0.6,
        critical_threshold=0.4,
        higher_is_better=True,
        root_causes=[
            "回答的事实陈述与标准答案存在偏差",
            "检索未能命中标准答案所依据的关键片段",
            "生成阶段对多个来源综合时产生事实错误",
        ],
        suggested_actions=[
            "重点检查低分样本，确认是检索遗漏还是生成错误",
            "提升 context_recall 以确保关键信息被检索到",
            "对事实型问题将 temperature 降至 0",
        ],
    ),
    "semantic_similarity": MetricRule(
        warning_threshold=0.7,
        critical_threshold=0.5,
        higher_is_better=True,
        root_causes=[
            "回答语义与标准答案差距较大",
            "回答过于简短或过于冗长，语义偏移",
            "检索到的片段质量不足，导致生成内容偏离",
        ],
        suggested_actions=[
            "检查低分样本的回答与标准答案的表述差异",
            "优化生成 prompt 使回答更贴近标准表述风格",
            "提升检索质量（context_recall / context_precision）",
        ],
    ),
}


@dataclass
class Diagnosis:
    """Diagnostic result for one metric that triggered a threshold."""
    metric: str
    mean_score: float
    threshold: float          # the triggered threshold
    severity: str             # "warning" | "critical"
    root_causes: list[str] = field(default_factory=list)
    suggested_actions: list[str] = field(default_factory=list)
    low_samples: list[dict[str, Any]] = field(default_factory=list)


def _mean_ignoring_nan(values: list[float]) -> float | None:
    valid = [v for v in values if not math.isnan(v)]
    if not valid:
        return None
    return sum(valid) / len(valid)


def _select_low_samples(
    rows: list[dict[str, Any]],
    metric: str,
    top_n: int,
    higher_is_better: bool,
) -> list[dict[str, Any]]:
    """Return the top_n worst-scoring rows for a metric, excluding NaN."""
    valid = [r for r in rows if metric in r and not math.isnan(float(r[metric]))]
    sorted_rows = sorted(valid, key=lambda r: float(r[metric]), reverse=not higher_is_better)
    worst = sorted_rows[:top_n]
    keep_keys = {"sample_id", "question", "answer", "ground_truth", metric}
    return [{k: v for k, v in row.items() if k in keep_keys} for row in worst]


def diagnose(
    score_rows: list[dict[str, Any]],
    metrics: list[str],
    top_low_samples: int = 3,
) -> list[Diagnosis]:
    """Analyse score_rows and return a Diagnosis for each metric below threshold.

    Args:
        score_rows: List of per-sample score dicts (from EvaluationResult.score_rows).
        metrics: Metric names to evaluate (from Scenario.metrics).
        top_low_samples: How many worst-scoring samples to attach per diagnosis.

    Returns:
        List of Diagnosis objects, one per triggered metric. Empty if all OK.
    """
    diagnoses: list[Diagnosis] = []

    for metric in metrics:
        rule = METRIC_RULES.get(metric)
        if rule is None:
            continue  # unknown metric, skip

        values = []
        for row in score_rows:
            raw = row.get(metric)
            if raw is None:
                continue
            try:
                v = float(raw)
            except (TypeError, ValueError):
                continue
            values.append(v)

        if not values:
            continue

        mean = _mean_ignoring_nan(values)
        if mean is None:
            continue

        # Determine severity (direction-aware)
        if rule.higher_is_better:
            if mean < rule.critical_threshold:
                severity = "critical"
                threshold = rule.critical_threshold
            elif mean < rule.warning_threshold:
                severity = "warning"
                threshold = rule.warning_threshold
            elif mean < rule.advisory_threshold:
                # Score is acceptable but below 0.85 — request LLM optimization advice.
                severity = "low"
                threshold = rule.advisory_threshold
            else:
                continue  # >= advisory_threshold → no diagnosis needed
        else:
            # lower is better (noise_sensitivity): keep existing two-tier logic
            if mean > rule.critical_threshold:
                severity = "critical"
                threshold = rule.critical_threshold
            elif mean > rule.warning_threshold:
                severity = "warning"
                threshold = rule.warning_threshold
            else:
                continue

        low_samples = _select_low_samples(score_rows, metric, top_low_samples, rule.higher_is_better)

        diagnoses.append(Diagnosis(
            metric=metric,
            mean_score=round(mean, 4),
            threshold=threshold,
            severity=severity,
            root_causes=list(rule.root_causes),
            suggested_actions=list(rule.suggested_actions),
            low_samples=low_samples,
        ))

    return diagnoses
-												feat(advisor): add optimization advisor module

- rag_eval/advisor/: new package with rules engine, LLM analyzer, writer
  - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples)
  - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback
  - writer.py: writes optimization_advice.md + log summary
  - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False)
- Scenario.optimization_advisor: new bool field (default False)
- ScenarioModel: same field added, loader.py透传
- RunArtifactPaths.advice_md: new path field
- factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings
- runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end
- siemens online YAML: optimization_advisor: true enabled
- tests: 9 rules tests + 6 writer tests, all pass
- docs: advisor section added to engine-flow.md and architecture.md

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2026-06-16 17:06:19 +08:00
+								"""Rule-based diagnostic engine for RAG evaluation metric scores."""
 								from __future__ import annotations
 								import math
 								from dataclasses import dataclass, field
 								from typing import Any
 								@dataclass
 								class MetricRule:
 								    """Threshold configuration and diagnostic text for one metric."""
 								    warning_threshold: float
 								    critical_threshold: float
 								    higher_is_better: bool  # False for noise_sensitivity
 								    root_causes: list[str]
 								    suggested_actions: list[str]
-												feat(advisor): add 0.85 advisory threshold triggering LLM suggestions

- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for scores in (warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-25 11:35:49 +08:00
+								    # Scores below this threshold trigger a "low" advisory (LLM suggestion requested).
 								    # Only applies to higher_is_better metrics; noise_sensitivity uses existing thresholds.
 								    advisory_threshold: float = 0.85
-												feat(advisor): add optimization advisor module

- rag_eval/advisor/: new package with rules engine, LLM analyzer, writer
  - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples)
  - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback
  - writer.py: writes optimization_advice.md + log summary
  - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False)
- Scenario.optimization_advisor: new bool field (default False)
- ScenarioModel: same field added, loader.py透传
- RunArtifactPaths.advice_md: new path field
- factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings
- runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end
- siemens online YAML: optimization_advisor: true enabled
- tests: 9 rules tests + 6 writer tests, all pass
- docs: advisor section added to engine-flow.md and architecture.md

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2026-06-16 17:06:19 +08:00
 								METRIC_RULES: dict[str, MetricRule] = {
 								    "faithfulness": MetricRule(
 								        warning_threshold=0.7,
 								        critical_threshold=0.5,
 								        higher_is_better=True,
 								        root_causes=[
 								            "生成回答包含检索片段中不支持的陈述（幻觉）",
 								            "生成阶段未严格遵循 grounding 约束",
 								            "校验阶段未开启或未生效",
 								        ],
 								        suggested_actions=[
 								            "强化生成 prompt 的 grounding 约束（'只依据参考资料作答'）",
 								            "开启校验阶段（validation: by_scenario）",
 								            "检查低分样本中模型是否引用了片段外的知识",
 								        ],
 								    ),
 								    "answer_relevancy": MetricRule(
 								        warning_threshold=0.7,
 								        critical_threshold=0.5,
 								        higher_is_better=True,
 								        root_causes=[
 								            "回答偏离问题主旨或包含大量冗余内容",
 								            "查询改写后问题语义漂移",
 								            "生成 prompt 格式约束不足",
 								        ],
 								        suggested_actions=[
 								            "优化查询改写 prompt，确保改写后语义不偏移",
 								            "在生成 prompt 中加入'简洁准确、直接回答问题'的约束",
 								            "检查低分样本的回答是否存在格式冗余或话题偏移",
 								        ],
 								    ),
 								    "context_recall": MetricRule(
 								        warning_threshold=0.7,
 								        critical_threshold=0.5,
 								        higher_is_better=True,
 								        root_causes=[
 								            "检索未能召回标准答案所涉及的关键信息",
 								            "单一查询未能覆盖问题的多个角度",
 								            "过召回数量不足，关键片段被截断",
 								        ],
 								        suggested_actions=[
 								            "启用多查询扩展（use_multi_query）覆盖不同措辞",
 								            "对多跳问题启用问题分解（sub_questions）",
 								            "加大过召回宽度（recall_top_k）",
 								            "对颗粒度细的问题尝试 Step-back 双路检索",
 								        ],
 								    ),
 								    "context_precision": MetricRule(
 								        warning_threshold=0.6,
 								        critical_threshold=0.4,
 								        higher_is_better=True,
 								        root_causes=[
 								            "检索引入过多与问题无关的片段",
 								            "重排未能将相关片段排在前列",
 								            "缺少相关性过滤，噪声片段进入上下文",
 								        ],
 								        suggested_actions=[
 								            "启用或优化 listwise 重排，将相关片段排在前列",
 								            "启用上下文压缩（compression）过滤无关句子",
 								            "启用相关性过滤（relevance_filter）丢弃明确无关片段",
 								            "缩小 rerank_keep_k（如从 8 降到 5）",
 								        ],
 								    ),
 								    "noise_sensitivity": MetricRule(
 								        warning_threshold=0.3,   # higher is worse; trigger when mean > threshold
 								        critical_threshold=0.5,
 								        higher_is_better=False,
 								        root_causes=[
 								            "回答中包含检索到的噪声片段所引入的错误陈述",
 								            "相关性过滤未能拦截干扰性片段",
 								            "生成阶段对噪声片段未加区分地引用",
 								        ],
 								        suggested_actions=[
 								            "启用相关性过滤（relevance_filter）拦截噪声",
 								            "优化重排，将不相关片段排到截断点之后",
 								            "在生成 prompt 中强调'来源冲突时并列陈述，不擅自下定论'",
 								        ],
 								    ),
 								    "factual_correctness": MetricRule(
 								        warning_threshold=0.6,
 								        critical_threshold=0.4,
 								        higher_is_better=True,
 								        root_causes=[
 								            "回答的事实陈述与标准答案存在偏差",
 								            "检索未能命中标准答案所依据的关键片段",
 								            "生成阶段对多个来源综合时产生事实错误",
 								        ],
 								        suggested_actions=[
 								            "重点检查低分样本，确认是检索遗漏还是生成错误",
 								            "提升 context_recall 以确保关键信息被检索到",
 								            "对事实型问题将 temperature 降至 0",
 								        ],
 								    ),
 								    "semantic_similarity": MetricRule(
 								        warning_threshold=0.7,
 								        critical_threshold=0.5,
 								        higher_is_better=True,
 								        root_causes=[
 								            "回答语义与标准答案差距较大",
 								            "回答过于简短或过于冗长，语义偏移",
 								            "检索到的片段质量不足，导致生成内容偏离",
 								        ],
 								        suggested_actions=[
 								            "检查低分样本的回答与标准答案的表述差异",
 								            "优化生成 prompt 使回答更贴近标准表述风格",
 								            "提升检索质量（context_recall / context_precision）",
 								        ],
 								    ),
 								}
 								@dataclass
 								class Diagnosis:
 								    """Diagnostic result for one metric that triggered a threshold."""
 								    metric: str
 								    mean_score: float
 								    threshold: float          # the triggered threshold
 								    severity: str             # "warning" | "critical"
 								    root_causes: list[str] = field(default_factory=list)
 								    suggested_actions: list[str] = field(default_factory=list)
 								    low_samples: list[dict[str, Any]] = field(default_factory=list)
 								def _mean_ignoring_nan(values: list[float]) -> float | None:
 								    valid = [v for v in values if not math.isnan(v)]
 								    if not valid:
 								        return None
 								    return sum(valid) / len(valid)
 								def _select_low_samples(
 								    rows: list[dict[str, Any]],
 								    metric: str,
 								    top_n: int,
 								    higher_is_better: bool,
 								) -> list[dict[str, Any]]:
 								    """Return the top_n worst-scoring rows for a metric, excluding NaN."""
 								    valid = [r for r in rows if metric in r and not math.isnan(float(r[metric]))]
 								    sorted_rows = sorted(valid, key=lambda r: float(r[metric]), reverse=not higher_is_better)
 								    worst = sorted_rows[:top_n]
 								    keep_keys = {"sample_id", "question", "answer", "ground_truth", metric}
 								    return [{k: v for k, v in row.items() if k in keep_keys} for row in worst]
 								def diagnose(
 								    score_rows: list[dict[str, Any]],
 								    metrics: list[str],
 								    top_low_samples: int = 3,
 								) -> list[Diagnosis]:
 								    """Analyse score_rows and return a Diagnosis for each metric below threshold.
 								    Args:
 								        score_rows: List of per-sample score dicts (from EvaluationResult.score_rows).
 								        metrics: Metric names to evaluate (from Scenario.metrics).
 								        top_low_samples: How many worst-scoring samples to attach per diagnosis.
 								    Returns:
 								        List of Diagnosis objects, one per triggered metric. Empty if all OK.
 								    """
 								    diagnoses: list[Diagnosis] = []
 								    for metric in metrics:
 								        rule = METRIC_RULES.get(metric)
 								        if rule is None:
 								            continue  # unknown metric, skip
 								        values = []
 								        for row in score_rows:
 								            raw = row.get(metric)
 								            if raw is None:
 								                continue
 								            try:
 								                v = float(raw)
 								            except (TypeError, ValueError):
 								                continue
 								            values.append(v)
 								        if not values:
 								            continue
 								        mean = _mean_ignoring_nan(values)
 								        if mean is None:
 								            continue
 								        # Determine severity (direction-aware)
 								        if rule.higher_is_better:
 								            if mean < rule.critical_threshold:
 								                severity = "critical"
 								                threshold = rule.critical_threshold
 								            elif mean < rule.warning_threshold:
 								                severity = "warning"
 								                threshold = rule.warning_threshold
-												feat(advisor): add 0.85 advisory threshold triggering LLM suggestions

- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for scores in (warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-25 11:35:49 +08:00
+								            elif mean < rule.advisory_threshold:
 								                # Score is acceptable but below 0.85 — request LLM optimization advice.
 								                severity = "low"
 								                threshold = rule.advisory_threshold
-												feat(advisor): add optimization advisor module

- rag_eval/advisor/: new package with rules engine, LLM analyzer, writer
  - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples)
  - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback
  - writer.py: writes optimization_advice.md + log summary
  - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False)
- Scenario.optimization_advisor: new bool field (default False)
- ScenarioModel: same field added, loader.py透传
- RunArtifactPaths.advice_md: new path field
- factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings
- runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end
- siemens online YAML: optimization_advisor: true enabled
- tests: 9 rules tests + 6 writer tests, all pass
- docs: advisor section added to engine-flow.md and architecture.md

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2026-06-16 17:06:19 +08:00
+								            else:
-												feat(advisor): add 0.85 advisory threshold triggering LLM suggestions

- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for scores in (warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-25 11:35:49 +08:00
+								                continue  # >= advisory_threshold → no diagnosis needed
-												feat(advisor): add optimization advisor module

- rag_eval/advisor/: new package with rules engine, LLM analyzer, writer
  - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples)
  - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback
  - writer.py: writes optimization_advice.md + log summary
  - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False)
- Scenario.optimization_advisor: new bool field (default False)
- ScenarioModel: same field added, loader.py透传
- RunArtifactPaths.advice_md: new path field
- factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings
- runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end
- siemens online YAML: optimization_advisor: true enabled
- tests: 9 rules tests + 6 writer tests, all pass
- docs: advisor section added to engine-flow.md and architecture.md

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2026-06-16 17:06:19 +08:00
+								        else:
-												feat(advisor): add 0.85 advisory threshold triggering LLM suggestions

- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for scores in (warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

											
										
										
											2026-06-25 11:35:49 +08:00
+								            # lower is better (noise_sensitivity): keep existing two-tier logic
-												feat(advisor): add optimization advisor module

- rag_eval/advisor/: new package with rules engine, LLM analyzer, writer
  - rules.py: 7-metric diagnostic rules (warning/critical thresholds, top-3 low samples)
  - llm_analyzer.py: Chinese optimization report via judge_model, graceful fallback
  - writer.py: writes optimization_advice.md + log summary
  - __init__.py: run_advisor() entry point (no-op when optimization_advisor=False)
- Scenario.optimization_advisor: new bool field (default False)
- ScenarioModel: same field added, loader.py透传
- RunArtifactPaths.advice_md: new path field
- factory.py: build_models() now public; build_metric_pipeline() accepts pre-built llm/embeddings
- runner.py: lifts llm, passes to pipeline and advisor; calls run_advisor() at end
- siemens online YAML: optimization_advisor: true enabled
- tests: 9 rules tests + 6 writer tests, all pass
- docs: advisor section added to engine-flow.md and architecture.md

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2026-06-16 17:06:19 +08:00
+								            if mean > rule.critical_threshold:
 								                severity = "critical"
 								                threshold = rule.critical_threshold
 								            elif mean > rule.warning_threshold:
 								                severity = "warning"
 								                threshold = rule.warning_threshold
 								            else:
 								                continue
 								        low_samples = _select_low_samples(score_rows, metric, top_low_samples, rule.higher_is_better)
 								        diagnoses.append(Diagnosis(
 								            metric=metric,
 								            mean_score=round(mean, 4),
 								            threshold=threshold,
 								            severity=severity,
 								            root_causes=list(rule.root_causes),
 								            suggested_actions=list(rule.suggested_actions),
 								            low_samples=low_samples,
 								        ))
 								    return diagnoses