update

2026-06-27 14:31:45 +08:00
parent 1df4010acc
commit 9828b1d44c
16 changed files with 323 additions and 23 deletions
--- a/webapp/api/session_score_jobs.py
+++ b/webapp/api/session_score_jobs.py
@@ -44,6 +44,41 @@ logger = logging.getLogger("webapp.api.session_score_jobs")
    status_code=202,
    response_model=SessionScoreJobResponse,
    summary="提交 Session 异步评分（多样本批量聚合）",
+    description=(
+        "**用途**\n"
+        "- 适合 Dify 循环节点、批量问答评测、同一对话多轮累计评分。\n"
+        "- 相同 `session_id` 的多次调用不会生成多个独立报告，而是持续追加到同一个 session 报告。\n\n"
+        "**请求字段说明**\n"
+        "- `session_id`：会话唯一标识，同一会话必须保持一致。\n"
+        "- `question` / `answer`：本次待评分的问答对。\n"
+        "- `contexts`：检索片段拼接字符串，按 `context_separator` 拆分。\n"
+        "- `ground_truth`：标准答案，可选；缺失时会自动跳过依赖它的指标。\n"
+        "- `metrics`：本次需要计算的指标列表。\n"
+        "- `judge_model` / `embedding_model`：可选；为空时回退到系统默认配置。\n\n"
+        "**处理行为**\n"
+        "1. 服务端立即返回 `202 Accepted`，并生成本次调用的 `job_id`。\n"
+        "2. 系统根据 `session_id` 计算固定 `run_id`，格式为 `session-<sanitized-session_id>`。\n"
+        "3. 本次评分完成后，会向该 session 的 `scores.csv` 追加一行样本数据。\n"
+        "4. 系统会基于当前 session 的全量样本重写 `summary.md`，并重新生成 `optimization_advice.md`。\n"
+        "5. 报告可在「运行列表」中按 `run_id` 查看；同一 session 的后续调用会持续增量更新该报告。\n\n"
+        "**后续查询接口**\n"
+        "- `GET /api/score/session/jobs/{job_id}`：查询本次调用状态与得分。\n"
+        "- `GET /api/score/sessions/{session_id}`：查询整个 session 的累计调用次数、指标均值、所有作业记录。\n"
+        "- `GET /api/runs/{run_id}`：查看完整评估报告内容。\n\n"
+        "**典型请求示例**\n"
+        "```json\n"
+        "{\n"
+        "  \"session_id\": \"dify-session-001\",\n"
+        "  \"question\": \"单源CT与双源CT在球管配置上有何本质区别？\",\n"
+        "  \"answer\": \"单源CT只有一套球管-探测器系统，双源CT有两套独立的球管-探测器系统。\",\n"
+        "  \"contexts\": \"双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管\",\n"
+        "  \"context_separator\": \" |||| \",\n"
+        "  \"metrics\": [\"answer_relevancy\", \"faithfulness\"],\n"
+        "  \"judge_model\": \"gpt-5.5\",\n"
+        "  \"embedding_model\": \"text-embedding-3-small\"\n"
+        "}\n"
+        "```"
+    ),
    responses={
        202: {
            "description": (
--- a/webapp/models.py
+++ b/webapp/models.py
@@ -542,6 +542,26 @@ class SessionScoreRequest(ScoreRequest):
    Each call adds a new sample row to the session's scores.csv.
    """

+    # JSON Schema `examples` is a list of plain example payloads. The OpenAPI-style
+    # {"summary": ..., "value": ...} wrapper is not a valid payload for this model.
+    # Scenario illustrated below: batch scoring of a Dify session.
+    model_config = ConfigDict(
+        json_schema_extra={
+            "examples": [
+                {
+                    "session_id": "dify-session-001",
+                    "question": "单源CT与双源CT在球管配置上有何本质区别？",
+                    "answer": "单源CT只有一套球管-探测器系统，双源CT有两套独立的球管-探测器系统。",
+                    "contexts": "双源CT采用两套管-探测器系统 |||| 单源CT只有一个球管",
+                    "context_separator": " |||| ",
+                    "metrics": ["answer_relevancy", "faithfulness"],
+                    "judge_model": "gpt-5.5",
+                    "embedding_model": "text-embedding-3-small",
+                }
+            ]
+        }
+    )
+
    session_id: str = Field(
        description=(
            "会话唯一标识符。相同 session_id 的多次调用合并为同一报告，"
--- a/webapp/server.py
+++ b/webapp/server.py
@@ -75,8 +75,12 @@ OPENAPI_TAGS = [
            "在「运行列表」页查看。\n\n"
            "**Session 批量评分 API** — `POST /api/score/session_async`\n\n"
            "适合 Dify 循环节点批量评估：同一 `session_id` 的多次调用合并为一个报告，"
-            "每次调用新增一个样本行，指标均值和优化建议增量更新。\n"
-            "通过 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n\n"
+            "每次调用新增一个样本行，指标均值和优化建议增量更新。\n\n"
+            "**Session 模式调用流程**\n"
+            "1. `POST /api/score/session_async` 提交一条问答评分请求。\n"
+            "2. 用 `GET /api/score/session/jobs/{job_id}` 轮询单次调用状态。\n"
+            "3. 用 `GET /api/score/sessions/{session_id}` 查看 session 聚合状态。\n"
+            "4. 用 `GET /api/runs/{run_id}` 或在「运行列表」中查看完整报告。\n\n"
            "通过 `GET /api/score/jobs` 列出所有异步评分记录，"
            "`GET /api/score/jobs/{job_id}` 查询单个任务状态。\n\n"
            "**鉴权**：若 `.env` 中配置了 `SCORE_API_TOKEN`，需携带 "
--- a/webapp/services/report_builder.py
+++ b/webapp/services/report_builder.py
@@ -37,6 +37,9 @@ GROUPING_FIELDS = ("difficulty", "question_type", "language")
 # How many lowest-scoring samples to surface for manual review.
 LOWEST_SAMPLE_COUNT = 10

+# Metrics whose lower raw value means stronger performance (frozen: shared module-level state).
+LOWER_IS_BETTER_METRICS = frozenset({"noise_sensitivity"})
+

 def _round_or_none(value: float | None) -> float | None:
    """Round a float to four places, mapping NaN/None to None for clean JSON."""
@@ -105,7 +108,7 @@ def _groupings(frame: pd.DataFrame, metrics: list[str]) -> dict[str, list[GroupS
 def _sample_mean(row: pd.Series, metrics: list[str]) -> float | None:
    """Average a single sample's available metric scores for ranking."""
    values = [
-        float(row[metric])
+        (1.0 - float(row[metric])) if metric in LOWER_IS_BETTER_METRICS else float(row[metric])
        for metric in metrics
        if metric in row and pd.notna(row[metric])
    ]
--- a/webapp/static/css/app.css
+++ b/webapp/static/css/app.css
@@ -199,6 +199,7 @@ code {
 .metric-value.bad { color: var(--bad); }
 .metric-value.na { color: var(--slate-light); }
 .metric-name { font-size: 12px; color: var(--slate); margin-top: 4px; }
+.metric-desc { font-size: 12px; color: #64748b; margin-top: 6px; line-height: 1.45; }

 .report-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
 .report-half { margin-bottom: 0; }
--- a/webapp/static/index.html
+++ b/webapp/static/index.html
@@ -267,6 +267,7 @@
  </div>

  <script src="/static/js/api.js"></script>
+  <script src="/static/js/metric_presenter.js"></script>
  <script src="/static/js/report.js"></script>
  <script src="/static/js/profiles.js"></script>
  <script src="/static/js/runner.js"></script>
--- a/webapp/static/js/app.js
+++ b/webapp/static/js/app.js
@@ -147,7 +147,7 @@ const App = {
    const chips = (run.metrics || [])
      .map((m) => {
        const val = run.metric_means ? run.metric_means[m] : null;
-        const cls = App.scoreClass(val);
+        const cls = App.scoreClass(m, val);
        const text = val === null || val === undefined ? "n/a" : val.toFixed(2);
        return `<span class="metric-chip" title="${App.escape(m)}">${App.escape(App.shortMetric(m))} <b class="${cls}">${text}</b></span>`;
      })
@@ -174,11 +174,8 @@ const App = {
    if (btn) btn.disabled = false;
  },

-  scoreClass(value) {
-    if (value === null || value === undefined) return "na";
-    if (value >= 0.8) return "good";
-    if (value >= 0.65) return "warn";
-    return "bad";
+  scoreClass(metricName, value) { // a lone argument is a legacy score-only call: scoreClass(value)
+    return arguments.length === 1 ? MetricPresenter.scoreClass(undefined, metricName) : MetricPresenter.scoreClass(metricName, value);
+  },

  shortMetric(name) {
--- a/webapp/static/js/metric_presenter.js
+++ b/webapp/static/js/metric_presenter.js
@@ -0,0 +1,77 @@
+// metric_presenter.js — 统一维护指标语义（高分好 / 低分好）、颜色阈值与简要说明。
+
+(function attachMetricPresenter(globalObj) {
+  const METRIC_META = { // metric name -> { direction, description text returned by describeMetric }
+    faithfulness: {
+      direction: "higher_better",
+      description: "回答是否被检索内容直接支持，越高越可靠。",
+    },
+    answer_relevancy: {
+      direction: "higher_better",
+      description: "回答与问题是否紧密相关，越高越切题。",
+    },
+    context_recall: {
+      direction: "higher_better",
+      description: "检索片段覆盖标准答案关键信息的程度，越高越完整。",
+    },
+    context_precision: {
+      direction: "higher_better",
+      description: "检索片段中有效信息的占比，越高越精准。",
+    },
+    noise_sensitivity: {
+      direction: "lower_better", // the only entry in this table where a smaller score is better
+      description: "对噪声上下文的敏感程度，越低说明抗干扰能力越强。",
+    },
+    factual_correctness: {
+      direction: "higher_better",
+      description: "回答与标准答案在事实层面的吻合程度，越高越准确。",
+    },
+    semantic_similarity: {
+      direction: "higher_better",
+      description: "回答与标准答案在语义上的相似程度，越高越接近。",
+    },
+  };
+
+  function isLowerBetter(metricName) {
+    return (METRIC_META[metricName] || {}).direction === "lower_better"; // metrics missing from the table -> false
+  }
+
+  // Map a score to a CSS class ("good" / "warn" / "bad" / "na"), honoring the metric's direction.
+  // NOTE(review): "good" starts at 0.85 here; the replaced App.scoreClass used 0.8 — confirm intended.
+  function scoreClass(metricName, value) {
+    const numeric = value === null || value === undefined ? NaN : Number(value);
+    if (Number.isNaN(numeric)) return "na"; // missing or non-numeric score
+    // Lower-is-better metrics use mirrored cutoffs (0.15 / 0.35 instead of 0.85 / 0.65).
+    const lowerBetter = isLowerBetter(metricName);
+    const good = lowerBetter ? numeric <= 0.15 : numeric >= 0.85;
+    const warn = lowerBetter ? numeric <= 0.35 : numeric >= 0.65;
+    if (good) return "good";
+    return warn ? "warn" : "bad";
+  }
+
+  function describeMetric(metricName) {
+    return (METRIC_META[metricName] || {}).description || "该指标用于衡量当前问答样本的评估表现。"; // generic text for unlisted metrics
+  }
+
+  // Pick the color for a distribution bin from the bin's lower bound.
+  // The palette runs from best (green) to worst (red) in 0.2-wide bands.
+  // Higher-is-better metrics test the bound against descending cutoffs (>=),
+  // lower-is-better metrics against ascending cutoffs (<), so the best band
+  // is always green regardless of direction.
+  function binColor(metricName, lower) {
+    const palette = ["#16a34a", "#84cc16", "#eab308", "#f97316", "#dc2626"];
+    const worst = palette.length - 1;
+    const numeric = Number(lower);
+    // Index of the first cutoff the bound satisfies; -1 (no match, e.g. NaN) means worst.
+    const band = isLowerBetter(metricName)
+      ? [0.2, 0.4, 0.6, 0.8].findIndex((cutoff) => numeric < cutoff)
+      : [0.8, 0.6, 0.4, 0.2].findIndex((cutoff) => numeric >= cutoff);
+    return palette[band === -1 ? worst : band];
+  }
+
+  globalObj.MetricPresenter = {
+    scoreClass,
+    describeMetric,
+    binColor,
+  };
+})(window);
--- a/webapp/static/js/report.js
+++ b/webapp/static/js/report.js
@@ -117,13 +117,15 @@ const Report = {
    const metrics = report.metrics && report.metrics.length ? report.metrics : summary.metrics;
    metrics.forEach((metric) => {
      const value = report.metric_means ? report.metric_means[metric] : null;
-      const cls = App.scoreClass(value);
+      const cls = App.scoreClass(metric, value);
      const text = value === null || value === undefined ? "n/a" : value.toFixed(2);
+      const description = MetricPresenter.describeMetric(metric);
      const card = document.createElement("div");
      card.className = "metric-card";
      card.innerHTML = `
        <div class="metric-value ${cls}">${text}</div>
        <div class="metric-name">${App.escape(metric)}</div>
+        <div class="metric-desc">${App.escape(description)}</div>
      `;
      wrap.appendChild(card);
    });
@@ -168,17 +170,13 @@ const Report = {
    const bins = distributions[metric] || [];
    const labels = bins.map((b) => b.label);
    const counts = bins.map((b) => b.count);
-    const colors = bins.map((b) => Report._binColor(b.lower));
+    const colors = bins.map((b) => Report._binColor(metric, b.lower));
    Report._drawDistChart(labels, counts, colors);
  },

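-  // 低分箱偏红、高分箱偏绿，直观暴露长尾。
+  // Bin colors follow each metric's direction (weak bins red, strong bins green) to expose the long tail.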
-  _binColor(lower) {
-    if (lower >= 0.8) return "#16a34a";
-    if (lower >= 0.6) return "#84cc16";
-    if (lower >= 0.4) return "#eab308";
-    if (lower >= 0.2) return "#f97316";
-    return "#dc2626";
+  _binColor(metric, lower) {
+    return MetricPresenter.binColor(metric, lower);
  },

  // 实际绘制 Chart.js 柱状图。
@@ -247,7 +245,7 @@ const Report = {
      body += `<tr><td>${App.escape(stat.key)}</td><td>${stat.count}</td>`;
      metrics.forEach((m) => {
        const v = stat.means ? stat.means[m] : null;
-        const cls = App.scoreClass(v);
+        const cls = App.scoreClass(m, v);
        const text = v === null || v === undefined ? "—" : v.toFixed(2);
        body += `<td class="${cls}">${text}</td>`;
      });
@@ -271,7 +269,7 @@ const Report = {
      const scoreBadges = metrics
        .map((m) => {
          const v = sample.metrics ? sample.metrics[m] : null;
-          const cls = App.scoreClass(v);
+          const cls = App.scoreClass(m, v);
          const text = v === null || v === undefined ? "—" : v.toFixed(2);
          return `<span class="score-badge ${cls}" title="${App.escape(m)}">${text}</span>`;
        })
--- a/webapp/static/js/score_jobs.js
+++ b/webapp/static/js/score_jobs.js
@@ -50,7 +50,7 @@ const ScoreJobs = {
    if (job.status === "completed") {
      scoreHtml = Object.entries(job.scores || {})
        .map(([k, v]) => {
-          const cls = App.scoreClass(v);
+          const cls = App.scoreClass(k, v);
          const text = v === null || v === undefined ? "n/a" : Number(v).toFixed(3);
          return `<span class="metric-chip" title="${App.escape(k)}">${App.escape(App.shortMetric(k))} <b class="${cls}">${text}</b></span>`;
        })