fix: support max_completion_tokens for newer models (gpt-5.x) in connectivity test

Newer OpenAI models (e.g. gpt-5.4) reject max_tokens and require max_completion_tokens. The connectivity test now tries max_completion_tokens first and falls back to max_tokens for older models and OpenAI-compatible APIs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-23 14:51:28 +08:00
parent 1dc7ab9727
commit 05419db1f9
1 changed file with 24 additions and 12 deletions
--- a/webapp/api/llm_profiles.py
+++ b/webapp/api/llm_profiles.py
@@ -29,24 +29,36 @@ def _do_connectivity_test(
    api_key: str,
    timeout_seconds: int,
 ) -> ProfileTestResponse:
-    """Send a minimal chat completion request and return the test result."""
+    """Send a minimal chat completion request and return the test result.
    Tries max_completion_tokens first (required by newer OpenAI models like gpt-5.x),
    then falls back to max_tokens for older models / compatible APIs.
    """
    client = OpenAI(
        api_key=api_key,
        base_url=base_url.rstrip("/"),
        timeout=float(timeout_seconds),
    )
    t0 = time.monotonic()
-    try:
+    # Try newer parameter first, fall back to legacy max_tokens on failure
-        client.chat.completions.create(
+    for kwargs in [{"max_completion_tokens": 1}, {"max_tokens": 1}]:
-            model=model,
+        try:
-            messages=[{"role": "user", "content": "hi"}],
+            client.chat.completions.create(
-            max_tokens=1,
+                model=model,
-        )
+                messages=[{"role": "user", "content": "hi"}],
-        latency_ms = int((time.monotonic() - t0) * 1000)
+                **kwargs,
-        return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms)
+            )
-    except Exception as exc:  # noqa: BLE001
+            latency_ms = int((time.monotonic() - t0) * 1000)
-        latency_ms = int((time.monotonic() - t0) * 1000)
+            return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms)
-        return ProfileTestResponse(ok=False, message=str(exc), latency_ms=latency_ms)
+        except Exception as exc:  # noqa: BLE001
            err_str = str(exc)
            # Only retry if the error is specifically about the token parameter name
            if "max_tokens" in err_str and "max_completion_tokens" in err_str and kwargs.get("max_completion_tokens"):
                continue
            latency_ms = int((time.monotonic() - t0) * 1000)
            return ProfileTestResponse(ok=False, message=err_str, latency_ms=latency_ms)
    latency_ms = int((time.monotonic() - t0) * 1000)
    return ProfileTestResponse(ok=False, message="连接测试失败", latency_ms=latency_ms)
@router.post("/probe", response_model=ProfileTestResponse, tags=["llm-profiles"])