From f8e308b7dce2d10e24a3eb17f341859dd73c088d Mon Sep 17 00:00:00 2001
From: wangwei <Wei.Wang@t-systems.com>
Date: Tue, 23 Jun 2026 15:03:27 +0800
Subject: [PATCH] fix: use max_tokens=8 for chat model connectivity test

max_tokens=1 triggers 'min-output limit' errors on gpt-5.x models.
Using 8 tokens is still cheap but satisfies all known model minimums.
Falls back to max_completion_tokens=8 if max_tokens is not supported.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 webapp/api/llm_profiles.py | 44 +++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/webapp/api/llm_profiles.py b/webapp/api/llm_profiles.py
index b2bca65..02fff3c 100644
--- a/webapp/api/llm_profiles.py
+++ b/webapp/api/llm_profiles.py
@@ -64,23 +64,33 @@ def _do_connectivity_test(
             latency_ms = int((time.monotonic() - t0) * 1000)
             return ProfileTestResponse(ok=False, message=str(exc), latency_ms=latency_ms)
 
-    # Chat 模型：先用 max_completion_tokens，失败时 fallback 到 max_tokens
-    for kwargs in [{"max_completion_tokens": 1}, {"max_tokens": 1}]:
-        try:
-            client.chat.completions.create(
-                model=model,
-                messages=[{"role": "user", "content": "hi"}],
-                **kwargs,
-            )
-            latency_ms = int((time.monotonic() - t0) * 1000)
-            return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms)
-        except Exception as exc:  # noqa: BLE001
-            err_str = str(exc)
-            # 仅当错误明确提示参数名称问题时才重试
-            if "max_tokens" in err_str and "max_completion_tokens" in err_str and kwargs.get("max_completion_tokens"):
-                continue
-            latency_ms = int((time.monotonic() - t0) * 1000)
-            return ProfileTestResponse(ok=False, message=err_str, latency_ms=latency_ms)
+    # Chat models: probe with a small max_tokens cap; timeout/auth errors are returned directly.
+    # Avoids max_tokens=1, which triggers the min-output limit on some models (gpt-5.x).
+    try:
+        client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": "hi"}],
+            max_tokens=8,   # 足够小节省费用，同时满足各模型最小输出要求
+        )
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms)
+    except Exception as exc:  # noqa: BLE001
+        err_str = str(exc)
+        # 如果 max_tokens 不被支持，改用 max_completion_tokens 再试一次
+        if "max_tokens" in err_str and "max_completion_tokens" in err_str:
+            try:
+                client.chat.completions.create(
+                    model=model,
+                    messages=[{"role": "user", "content": "hi"}],
+                    max_completion_tokens=8,
+                )
+                latency_ms = int((time.monotonic() - t0) * 1000)
+                return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms)
+            except Exception as exc2:  # noqa: BLE001
+                latency_ms = int((time.monotonic() - t0) * 1000)
+                return ProfileTestResponse(ok=False, message=str(exc2), latency_ms=latency_ms)
+        latency_ms = int((time.monotonic() - t0) * 1000)
+        return ProfileTestResponse(ok=False, message=err_str, latency_ms=latency_ms)
 
     latency_ms = int((time.monotonic() - t0) * 1000)
     return ProfileTestResponse(ok=False, message="连接测试失败", latency_ms=latency_ms)