feat(advisor): add 0.85 advisory threshold triggering LLM suggestions

- Add advisory_threshold=0.85 field to MetricRule (higher-is-better metrics)
- diagnose() now emits severity='low' for scores in [warning_threshold, 0.85)
- noise_sensitivity (lower-is-better) keeps its existing two-tier thresholds
- writer.py: severity labels mapped to Chinese (严重/警告/待优化)
- llm_analyzer.py: prompt explains low/warning/critical tiers in Chinese
- Tests: 5 new cases for 'low' severity, updated log summary assertions

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-06-25 11:35:49 +08:00
parent 4fd515d2d9
commit e1751447df
5 changed files with 66 additions and 11 deletions

View File

@@ -22,22 +22,31 @@ _PROMPT_TEMPLATE = """\
## 报告要求
1. 按指标分节(## 指标名 [severity]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
2. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
3. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先)
4. 语言简洁,面向工程师,不要废话,不要重复列表内容
1. 按指标分节(## 指标名 [严重程度]),先解释"为什么低"(结合低分样本具体分析),再给出"具体怎么改"
2. 严重程度说明:critical=严重(<阈值50%),warning=警告(<阈值70%),low=待优化(低于0.85,有提升空间)
3. "具体怎么改"要结合低分样本的实际内容,而不只是泛泛建议
4. 最后写一节 **## 优先优化次序**,按性价比排序(不增加 LLM 调用次数的优化优先;critical 和 warning 项优先于 low 项)
5. 语言简洁,面向工程师,不要废话,不要重复列表内容
只输出 Markdown 报告正文,不要任何前置说明。
"""
_SEVERITY_LABEL_ZH: dict[str, str] = {
"critical": "严重",
"warning": "警告",
"low": "待优化",
}
def _build_diagnosis_summary(diagnoses: list[Diagnosis]) -> str:
lines = []
for d in diagnoses:
direction = "(越低越好)" if d.metric == "noise_sensitivity" else ""
label = _SEVERITY_LABEL_ZH.get(d.severity, d.severity)
lines.append(
f"- **{d.metric}** {direction} 均值={d.mean_score:.4f}"
f"阈值={d.threshold},严重程度={d.severity}"
f"阈值={d.threshold},严重程度={label}"
)
lines.append(f" - 可能原因:{'; '.join(d.root_causes)}")
lines.append(f" - 建议动作:{'; '.join(d.suggested_actions)}")

View File

@@ -14,6 +14,9 @@ class MetricRule:
higher_is_better: bool # False for noise_sensitivity
root_causes: list[str]
suggested_actions: list[str]
# Scores at or above warning_threshold but below this value trigger a "low" advisory
# (an LLM optimization suggestion is requested). Only applies to higher_is_better
# metrics; noise_sensitivity keeps its existing two-tier thresholds.
advisory_threshold: float = 0.85
METRIC_RULES: dict[str, MetricRule] = {
@@ -208,10 +211,14 @@ def diagnose(
elif mean < rule.warning_threshold:
severity = "warning"
threshold = rule.warning_threshold
elif mean < rule.advisory_threshold:
# Score clears the warning threshold but is below the advisory threshold (0.85 by default) — request LLM optimization advice.
severity = "low"
threshold = rule.advisory_threshold
else:
continue # above warning threshold → no diagnosis
continue # >= advisory_threshold → no diagnosis needed
else:
# lower is better (noise_sensitivity)
# lower is better (noise_sensitivity): keep existing two-tier logic
if mean > rule.critical_threshold:
severity = "critical"
threshold = rule.critical_threshold

View File

@@ -8,12 +8,22 @@ from .rules import Diagnosis
logger = logging.getLogger("rag_eval.advisor")
# Chinese display labels for each severity tier.
_SEVERITY_LABEL: dict[str, str] = {
"critical": "严重",
"warning": "警告",
"low": "待优化",
}
def _format_log_summary(diagnoses: list[Diagnosis], advice_path: Path) -> str:
"""Return a single-line log summary of triggered diagnoses."""
if not diagnoses:
return "[advisor] 所有指标正常,无需优化建议。"
parts = [f"{d.metric}({d.mean_score:.2f}, {d.severity})" for d in diagnoses]
parts = [
f"{d.metric}({d.mean_score:.2f},{_SEVERITY_LABEL.get(d.severity, d.severity)})"
for d in diagnoses
]
triggered = " ".join(parts)
return f"[advisor] 触发诊断 {len(diagnoses)} 项: {triggered}{advice_path}"
@@ -24,7 +34,8 @@ def _build_fallback_report(diagnoses: list[Diagnosis]) -> str:
return ""
lines = ["## 规则诊断LLM 分析不可用)\n"]
for d in diagnoses:
lines.append(f"### {d.metric} [{d.severity}] 均值={d.mean_score:.4f}")
label = _SEVERITY_LABEL.get(d.severity, d.severity)
lines.append(f"### {d.metric} [{label}] 均值={d.mean_score:.4f}")
lines.append("\n**可能原因:**")
for cause in d.root_causes:
lines.append(f"- {cause}")