From 9828b1d44c0fbe48e5182bdc5049cd51087f1e0b Mon Sep 17 00:00:00 2001 From: wangwei Date: Sat, 27 Jun 2026 14:31:45 +0800 Subject: [PATCH] Add judge max_tokens setting and direction-aware metric display --- .env.example | 6 ++- rag_eval/metrics/factory.py | 8 ++- rag_eval/settings.py | 5 ++ tests/test_metric_presenter.py | 68 ++++++++++++++++++++++++ tests/test_webapp_report_builder.py | 27 ++++++++++ tests/webapp/test_profile_manager.py | 56 ++++++++++++++++++++ webapp/api/session_score_jobs.py | 35 +++++++++++++ webapp/models.py | 20 ++++++++ webapp/server.py | 8 ++- webapp/services/report_builder.py | 5 +- webapp/static/css/app.css | 1 + webapp/static/index.html | 1 + webapp/static/js/app.js | 9 ++-- webapp/static/js/metric_presenter.js | 77 ++++++++++++++++++++++++++++ webapp/static/js/report.js | 18 +++---- webapp/static/js/score_jobs.js | 2 +- 16 files changed, 323 insertions(+), 23 deletions(-) create mode 100644 tests/test_metric_presenter.py create mode 100644 webapp/static/js/metric_presenter.js diff --git a/.env.example b/.env.example index dc30131..3d61443 100644 --- a/.env.example +++ b/.env.example @@ -8,10 +8,12 @@ OPENAI_BASE_URL=http://6.86.80.4:30080/v1 OPENAI_TIMEOUT_SECONDS=180 # 默认评测模型(可在场景 YAML 或 Web 控制台 LLM 配置中覆盖) -# RAGAS_JUDGE_MODEL 需支持 max_tokens + json_object(gpt-5、gpt-4.1、gpt-4o 等) -# 注意:gpt-5.4/5.5/5.2 系列不支持 max_tokens,与 RAGAS 0.4.3 不兼容 +# RAGAS_JUDGE_MODEL 需支持 OpenAI 兼容 chat.completions + 结构化 JSON 输出 +# RAGAS_LLM_MAX_TOKENS 控制 Judge 评分链路的 completion budget;faithfulness 等 +# 结构化指标在 GPT-5 系列上通常需要 4096 或更高,避免 IncompleteOutputException RAGAS_JUDGE_MODEL=gpt-5 RAGAS_EMBEDDING_MODEL=text-embedding-3-small +RAGAS_LLM_MAX_TOKENS=4096 # 评估并发控制(启用 7 个指标时建议 RAGAS_METRIC_TIMEOUT_SECONDS=300) BATCH_SIZE=8 diff --git a/rag_eval/metrics/factory.py b/rag_eval/metrics/factory.py index df5683a..c4361d5 100644 --- a/rag_eval/metrics/factory.py +++ b/rag_eval/metrics/factory.py @@ -69,7 +69,13 @@ def build_models( """ client_kwargs = _resolve_openai_client_kwargs(judge_model, settings) client = 
AsyncOpenAI(**client_kwargs) - llm = llm_factory(judge_model, client=client) + # RAGAS structured-output judge calls can be truncated by the upstream default + # 1024 completion budget, especially for faithfulness and GPT-5 family models. + llm = llm_factory( + judge_model, + client=client, + max_tokens=max(1, int(settings.ragas_llm_max_tokens)), + ) embeddings = embedding_factory(provider="openai", model=embedding_model, client=client) return llm, embeddings diff --git a/rag_eval/settings.py b/rag_eval/settings.py index 750dc0f..d19609c 100644 --- a/rag_eval/settings.py +++ b/rag_eval/settings.py @@ -26,6 +26,11 @@ class EvaluationSettings(BaseSettings): default="text-embedding-3-small", alias="RAGAS_EMBEDDING_MODEL", ) + ragas_llm_max_tokens: int = Field( + default=4096, + alias="RAGAS_LLM_MAX_TOKENS", + gt=0, + ) openai_timeout_seconds: float = Field(default=30.0, alias="OPENAI_TIMEOUT_SECONDS") ragas_metric_timeout_seconds: float = Field(default=45.0, alias="RAGAS_METRIC_TIMEOUT_SECONDS") batch_size: int = Field(default=8, alias="BATCH_SIZE") diff --git a/tests/test_metric_presenter.py b/tests/test_metric_presenter.py new file mode 100644 index 0000000..d4f94f8 --- /dev/null +++ b/tests/test_metric_presenter.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import subprocess +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +def _run_node(script: str) -> str: + """Execute a short Node.js script and return stdout.""" + completed = subprocess.run( + ["node", "-e", script], + cwd=REPO_ROOT, + capture_output=True, + text=True, + encoding="utf-8", + check=True, + ) + return completed.stdout.strip() + + +def test_metric_presenter_applies_thresholds_and_noise_direction() -> None: + """MetricPresenter should centralize thresholds and inverse noise semantics.""" + metric_js = (REPO_ROOT / "webapp" / "static" / "js" / "metric_presenter.js").as_posix() + script = f""" +const fs = require("fs"); +const vm = require("vm"); +const 
code = fs.readFileSync("{metric_js}", "utf8"); +const sandbox = {{ window: {{}}, console }}; +vm.runInNewContext(code, sandbox); +const p = sandbox.window.MetricPresenter; +const result = {{ + faith085: p.scoreClass("faithfulness", 0.85), + faith070: p.scoreClass("faithfulness", 0.70), + faith064: p.scoreClass("faithfulness", 0.64), + noise010: p.scoreClass("noise_sensitivity", 0.10), + noise030: p.scoreClass("noise_sensitivity", 0.30), + noise050: p.scoreClass("noise_sensitivity", 0.50), + desc: p.describeMetric("faithfulness"), + noiseDesc: p.describeMetric("noise_sensitivity"), + noiseBin: p.binColor("noise_sensitivity", 0.0), + faithBin: p.binColor("faithfulness", 0.8) +}}; +console.log(JSON.stringify(result)); +""" + output = _run_node(script) + assert '"faith085":"good"' in output + assert '"faith070":"warn"' in output + assert '"faith064":"bad"' in output + assert '"noise010":"good"' in output + assert '"noise030":"warn"' in output + assert '"noise050":"bad"' in output + assert '"desc":"' in output + assert '"noiseDesc":"' in output + assert '"noiseBin":"#16a34a"' in output + assert '"faithBin":"#16a34a"' in output + + +def test_report_and_index_load_metric_presenter_helper() -> None: + """The report page should use the shared helper for card descriptions and colors.""" + index_html = (REPO_ROOT / "webapp" / "static" / "index.html").read_text(encoding="utf-8") + report_js = (REPO_ROOT / "webapp" / "static" / "js" / "report.js").read_text(encoding="utf-8") + app_js = (REPO_ROOT / "webapp" / "static" / "js" / "app.js").read_text(encoding="utf-8") + + assert "js/metric_presenter.js" in index_html + assert "MetricPresenter.describeMetric" in report_js + assert "MetricPresenter.scoreClass" in app_js diff --git a/tests/test_webapp_report_builder.py b/tests/test_webapp_report_builder.py index 8d492c1..b1be92d 100644 --- a/tests/test_webapp_report_builder.py +++ b/tests/test_webapp_report_builder.py @@ -88,3 +88,30 @@ def 
test_infer_metrics_excludes_weight_columns_without_snapshot(tmp_path: Path) ) assert _infer_metrics_from_scores(run_dir) == ["faithfulness"] + + +def test_build_report_ranks_noise_sensitivity_with_lower_values_as_better(tmp_path: Path) -> None: + """Lowest-sample review should treat higher noise sensitivity as worse.""" + run_dir = tmp_path / "run" + run_dir.mkdir(parents=True, exist_ok=True) + (run_dir / "scores.csv").write_text( + "\n".join( + [ + "sample_id,question,noise_sensitivity", + "s-good,q1,0.10", + "s-warn,q2,0.30", + "s-bad,q3,0.90", + ] + ), + encoding="utf-8", + ) + (run_dir / "summary.md").write_text("summary", encoding="utf-8") + (run_dir / "optimization_advice.md").write_text("", encoding="utf-8") + + report = build_report(run_dir, ["noise_sensitivity"]) + + assert [sample.sample_id for sample in report.lowest_samples[:3]] == [ + "s-bad", + "s-warn", + "s-good", + ] diff --git a/tests/webapp/test_profile_manager.py b/tests/webapp/test_profile_manager.py index 9dc8655..d0239e4 100644 --- a/tests/webapp/test_profile_manager.py +++ b/tests/webapp/test_profile_manager.py @@ -1,4 +1,6 @@ import pytest +from unittest.mock import sentinel + from webapp.models import LLMProfile, ProfileApplyRequest, ProfileApplyResponse def test_llm_profile_defaults(): @@ -147,3 +149,57 @@ def test_resolve_openai_client_kwargs_falls_back_to_env(tmp_path, monkeypatch): assert kwargs["api_key"] == "sk-env" assert kwargs["base_url"] == "http://env-base/v1" assert kwargs["timeout"] == 45.0 + + +def test_build_models_uses_high_default_max_tokens_for_structured_judge(monkeypatch): + """Structured RAGAS judge calls should use a larger completion budget by default.""" + import rag_eval.metrics.factory as factory + from rag_eval.settings import EvaluationSettings + + captured: dict[str, object] = {} + + def fake_llm_factory(model, client=None, **kwargs): + captured["model"] = model + captured["client"] = client + captured["kwargs"] = kwargs + return sentinel.llm + + 
monkeypatch.setattr(factory, "AsyncOpenAI", lambda **kwargs: sentinel.client) + monkeypatch.setattr(factory, "llm_factory", fake_llm_factory) + monkeypatch.setattr(factory, "embedding_factory", lambda **kwargs: sentinel.embeddings) + + llm, embeddings = factory.build_models( + "gpt-5", + "text-embedding-3-small", + EvaluationSettings(), + ) + + assert llm is sentinel.llm + assert embeddings is sentinel.embeddings + assert captured["model"] == "gpt-5" + assert captured["client"] is sentinel.client + assert captured["kwargs"] == {"max_tokens": 4096} + + +def test_build_models_allows_env_override_for_judge_max_tokens(monkeypatch): + """Operators should be able to raise the judge completion budget via settings.""" + import rag_eval.metrics.factory as factory + from rag_eval.settings import EvaluationSettings + + captured: dict[str, object] = {} + + def fake_llm_factory(model, client=None, **kwargs): + captured["kwargs"] = kwargs + return sentinel.llm + + monkeypatch.setattr(factory, "AsyncOpenAI", lambda **kwargs: sentinel.client) + monkeypatch.setattr(factory, "llm_factory", fake_llm_factory) + monkeypatch.setattr(factory, "embedding_factory", lambda **kwargs: sentinel.embeddings) + + factory.build_models( + "gpt-5", + "text-embedding-3-small", + EvaluationSettings(RAGAS_LLM_MAX_TOKENS=8192), + ) + + assert captured["kwargs"] == {"max_tokens": 8192} diff --git a/webapp/api/session_score_jobs.py b/webapp/api/session_score_jobs.py index d93b995..f09dc7f 100644 --- a/webapp/api/session_score_jobs.py +++ b/webapp/api/session_score_jobs.py @@ -44,6 +44,41 @@ logger = logging.getLogger("webapp.api.session_score_jobs") status_code=202, response_model=SessionScoreJobResponse, summary="提交 Session 异步评分(多样本批量聚合)", + description=( + "**用途**\n" + "- 适合 Dify 循环节点、批量问答评测、同一对话多轮累计评分。\n" + "- 相同 `session_id` 的多次调用不会生成多个独立报告,而是持续追加到同一个 session 报告。\n\n" + "**请求字段说明**\n" + "- `session_id`:会话唯一标识,同一会话必须保持一致。\n" + "- `question` / `answer`:本次待评分的问答对。\n" + "- `contexts`:检索片段拼接字符串,按 
`context_separator` 拆分。\n" + "- `ground_truth`:标准答案,可选;缺失时会自动跳过依赖它的指标。\n" + "- `metrics`:本次需要计算的指标列表。\n" + "- `judge_model` / `embedding_model`:可选;为空时回退到系统默认配置。\n\n" + "**处理行为**\n" + "1. 服务端立即返回 `202 Accepted`,并生成本次调用的 `job_id`。\n" + "2. 系统根据 `session_id` 计算固定 `run_id`,格式为 `session-`。\n" + "3. 本次评分完成后,会向该 session 的 `scores.csv` 追加一行样本数据。\n" + "4. 系统会基于当前 session 的全量样本重写 `summary.md`,并重新生成 `optimization_advice.md`。\n" + "5. 报告可在「运行列表」中按 `run_id` 查看;同一 session 的后续调用会持续增量更新该报告。\n\n" + "**后续查询接口**\n" + "- `GET /api/score/session/jobs/{job_id}`:查询本次调用状态与得分。\n" + "- `GET /api/score/sessions/{session_id}`:查询整个 session 的累计调用次数、指标均值、所有作业记录。\n" + "- `GET /api/runs/{run_id}`:查看完整评估报告内容。\n\n" + "**典型请求示例**\n" + "```json\n" + "{\n" + " \"session_id\": \"dify-session-001\",\n" + " \"question\": \"单源CT与双源CT在球管配置上有何本质区别?\",\n" + " \"answer\": \"单源CT只有一套球管-探测器系统,双源CT有两套独立的球管-探测器系统。\",\n" + " \"contexts\": \"双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管\",\n" + " \"context_separator\": \" |||| \",\n" + " \"metrics\": [\"answer_relevancy\", \"faithfulness\"],\n" + " \"judge_model\": \"gpt-5.5\",\n" + " \"embedding_model\": \"text-embedding-3-small\"\n" + "}\n" + "```" + ), responses={ 202: { "description": ( diff --git a/webapp/models.py b/webapp/models.py index e124102..9e46be2 100644 --- a/webapp/models.py +++ b/webapp/models.py @@ -542,6 +542,26 @@ class SessionScoreRequest(ScoreRequest): Each call adds a new sample row to the session's scores.csv. 
""" + model_config = ConfigDict( + json_schema_extra={ + "examples": [ + { + "summary": "Dify 会话批量评分", + "value": { + "session_id": "dify-session-001", + "question": "单源CT与双源CT在球管配置上有何本质区别?", + "answer": "单源CT只有一套球管-探测器系统,双源CT有两套独立的球管-探测器系统。", + "contexts": "双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管", + "context_separator": " |||| ", + "metrics": ["answer_relevancy", "faithfulness"], + "judge_model": "gpt-5.5", + "embedding_model": "text-embedding-3-small", + }, + } + ] + } + ) + session_id: str = Field( description=( "会话唯一标识符。相同 session_id 的多次调用合并为同一报告," diff --git a/webapp/server.py b/webapp/server.py index 211545b..c0ec655 100644 --- a/webapp/server.py +++ b/webapp/server.py @@ -75,8 +75,12 @@ OPENAPI_TAGS = [ "在「运行列表」页查看。\n\n" "**Session 批量评分 API** — `POST /api/score/session_async`\n\n" "适合 Dify 循环节点批量评估:同一 `session_id` 的多次调用合并为一个报告," - "每次调用新增一个样本行,指标均值和优化建议增量更新。\n" - "通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n\n" + "每次调用新增一个样本行,指标均值和优化建议增量更新。\n\n" + "**Session 模式调用流程**\n" + "1. `POST /api/score/session_async` 提交一条问答评分请求。\n" + "2. 用 `GET /api/score/session/jobs/{job_id}` 轮询单次调用状态。\n" + "3. 用 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n" + "4. 用 `GET /api/runs/{run_id}` 或在「运行列表」中查看完整报告。\n\n" "通过 `GET /api/score/jobs` 列出所有异步评分记录," "`GET /api/score/jobs/{job_id}` 查询单个任务状态。\n\n" "**鉴权**:若 `.env` 中配置了 `SCORE_API_TOKEN`,需携带 " diff --git a/webapp/services/report_builder.py b/webapp/services/report_builder.py index 1f56f65..0a1e204 100644 --- a/webapp/services/report_builder.py +++ b/webapp/services/report_builder.py @@ -37,6 +37,9 @@ GROUPING_FIELDS = ("difficulty", "question_type", "language") # How many lowest-scoring samples to surface for manual review. LOWEST_SAMPLE_COUNT = 10 +# Metrics whose lower raw value means stronger performance. 
+LOWER_IS_BETTER_METRICS = {"noise_sensitivity"} + def _round_or_none(value: float | None) -> float | None: """Round a float to four places, mapping NaN/None to None for clean JSON.""" @@ -105,7 +108,7 @@ def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupS def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None: """Average a single sample's available metric scores for ranking.""" values = [ - float(row[metric]) + (1.0 - float(row[metric])) if metric in LOWER_IS_BETTER_METRICS else float(row[metric]) for metric in metrics if metric in row and pd.notna(row[metric]) ] diff --git a/webapp/static/css/app.css b/webapp/static/css/app.css index aa41081..7282750 100644 --- a/webapp/static/css/app.css +++ b/webapp/static/css/app.css @@ -199,6 +199,7 @@ code { .metric-value.bad { color: var(--bad); } .metric-value.na { color: var(--slate-light); } .metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; } +.metric-desc { font-size: 12px; color: #64748b; margin-top: 6px; line-height: 1.45; } .report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; } .report-half { margin-bottom: 0; } diff --git a/webapp/static/index.html b/webapp/static/index.html index 42088a8..5cdd311 100644 --- a/webapp/static/index.html +++ b/webapp/static/index.html @@ -267,6 +267,7 @@ + diff --git a/webapp/static/js/app.js b/webapp/static/js/app.js index 7251d8c..bdfcb0c 100644 --- a/webapp/static/js/app.js +++ b/webapp/static/js/app.js @@ -147,7 +147,7 @@ const App = { const chips = (run.metrics || []) .map((m) => { const val = run.metric_means ? run.metric_means[m] : null; - const cls = App.scoreClass(val); + const cls = App.scoreClass(m, val); const text = val === null || val === undefined ? 
"n/a" : val.toFixed(2); return `${App.escape(App.shortMetric(m))} ${text}`; }) @@ -174,11 +174,8 @@ const App = { if (btn) btn.disabled = false; }, - scoreClass(value) { - if (value === null || value === undefined) return "na"; - if (value >= 0.8) return "good"; - if (value >= 0.65) return "warn"; - return "bad"; + scoreClass(metricName, value) { + return MetricPresenter.scoreClass(metricName, value); }, shortMetric(name) { diff --git a/webapp/static/js/metric_presenter.js b/webapp/static/js/metric_presenter.js new file mode 100644 index 0000000..99a5b0b --- /dev/null +++ b/webapp/static/js/metric_presenter.js @@ -0,0 +1,77 @@ +// metric_presenter.js — 统一维护指标语义(高分好 / 低分好)、颜色阈值与简要说明。 + +(function attachMetricPresenter(globalObj) { + const METRIC_META = { + faithfulness: { + direction: "higher_better", + description: "回答是否被检索内容直接支持,越高越可靠。", + }, + answer_relevancy: { + direction: "higher_better", + description: "回答与问题是否紧密相关,越高越切题。", + }, + context_recall: { + direction: "higher_better", + description: "检索片段覆盖标准答案关键信息的程度,越高越完整。", + }, + context_precision: { + direction: "higher_better", + description: "检索片段中有效信息的占比,越高越精准。", + }, + noise_sensitivity: { + direction: "lower_better", + description: "对噪声上下文的敏感程度,越低说明抗干扰能力越强。", + }, + factual_correctness: { + direction: "higher_better", + description: "回答与标准答案在事实层面的吻合程度,越高越准确。", + }, + semantic_similarity: { + direction: "higher_better", + description: "回答与标准答案在语义上的相似程度,越高越接近。", + }, + }; + + function isLowerBetter(metricName) { + return METRIC_META[metricName]?.direction === "lower_better"; + } + + function scoreClass(metricName, value) { + if (value === null || value === undefined || Number.isNaN(Number(value))) return "na"; + const numeric = Number(value); + if (isLowerBetter(metricName)) { + if (numeric <= 0.15) return "good"; + if (numeric <= 0.35) return "warn"; + return "bad"; + } + if (numeric >= 0.85) return "good"; + if (numeric >= 0.65) return "warn"; + return "bad"; + } + + function describeMetric(metricName) { 
+ return METRIC_META[metricName]?.description || "该指标用于衡量当前问答样本的评估表现。"; + } + + function binColor(metricName, lower) { + const numeric = Number(lower); + if (isLowerBetter(metricName)) { + if (numeric < 0.2) return "#16a34a"; + if (numeric < 0.4) return "#84cc16"; + if (numeric < 0.6) return "#eab308"; + if (numeric < 0.8) return "#f97316"; + return "#dc2626"; + } + if (numeric >= 0.8) return "#16a34a"; + if (numeric >= 0.6) return "#84cc16"; + if (numeric >= 0.4) return "#eab308"; + if (numeric >= 0.2) return "#f97316"; + return "#dc2626"; + } + + globalObj.MetricPresenter = { + scoreClass, + describeMetric, + binColor, + }; +})(window); diff --git a/webapp/static/js/report.js b/webapp/static/js/report.js index a76a7ab..42c5287 100644 --- a/webapp/static/js/report.js +++ b/webapp/static/js/report.js @@ -117,13 +117,15 @@ const Report = { const metrics = report.metrics && report.metrics.length ? report.metrics : summary.metrics; metrics.forEach((metric) => { const value = report.metric_means ? report.metric_means[metric] : null; - const cls = App.scoreClass(value); + const cls = App.scoreClass(metric, value); const text = value === null || value === undefined ? "n/a" : value.toFixed(2); + const description = MetricPresenter.describeMetric(metric); const card = document.createElement("div"); card.className = "metric-card"; card.innerHTML = `
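         <div class="metric-value ${cls}">${text}</div>
         <div class="metric-name">${App.escape(metric)}</div>
+        <div class="metric-desc">${App.escape(description)}</div>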
`; wrap.appendChild(card); }); @@ -168,17 +170,13 @@ const Report = { const bins = distributions[metric] || []; const labels = bins.map((b) => b.label); const counts = bins.map((b) => b.count); - const colors = bins.map((b) => Report._binColor(b.lower)); + const colors = bins.map((b) => Report._binColor(metric, b.lower)); Report._drawDistChart(labels, counts, colors); }, // 低分箱偏红、高分箱偏绿,直观暴露长尾。 - _binColor(lower) { - if (lower >= 0.8) return "#16a34a"; - if (lower >= 0.6) return "#84cc16"; - if (lower >= 0.4) return "#eab308"; - if (lower >= 0.2) return "#f97316"; - return "#dc2626"; + _binColor(metric, lower) { + return MetricPresenter.binColor(metric, lower); }, // 实际绘制 Chart.js 柱状图。 @@ -247,7 +245,7 @@ const Report = { body += `${App.escape(stat.key)}${stat.count}`; metrics.forEach((m) => { const v = stat.means ? stat.means[m] : null; - const cls = App.scoreClass(v); + const cls = App.scoreClass(m, v); const text = v === null || v === undefined ? "—" : v.toFixed(2); body += `${text}`; }); @@ -271,7 +269,7 @@ const Report = { const scoreBadges = metrics .map((m) => { const v = sample.metrics ? sample.metrics[m] : null; - const cls = App.scoreClass(v); + const cls = App.scoreClass(m, v); const text = v === null || v === undefined ? "—" : v.toFixed(2); return `${text}`; }) diff --git a/webapp/static/js/score_jobs.js b/webapp/static/js/score_jobs.js index 18f5d9c..519cc4a 100644 --- a/webapp/static/js/score_jobs.js +++ b/webapp/static/js/score_jobs.js @@ -50,7 +50,7 @@ const ScoreJobs = { if (job.status === "completed") { scoreHtml = Object.entries(job.scores || {}) .map(([k, v]) => { - const cls = App.scoreClass(v); + const cls = App.scoreClass(k, v); const text = v === null || v === undefined ? "n/a" : Number(v).toFixed(3); return `${App.escape(App.shortMetric(k))} ${text}`; })