From f8e308b7dce2d10e24a3eb17f341859dd73c088d Mon Sep 17 00:00:00 2001 From: wangwei Date: Tue, 23 Jun 2026 15:03:27 +0800 Subject: [PATCH] fix: use max_tokens=8 for chat model connectivity test max_tokens=1 triggers 'min-output limit' errors on gpt-5.x models. Using 8 tokens is still cheap but satisfies all known model minimums. Falls back to max_completion_tokens=8 if max_tokens is not supported. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- webapp/api/llm_profiles.py | 44 +++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/webapp/api/llm_profiles.py b/webapp/api/llm_profiles.py index b2bca65..02fff3c 100644 --- a/webapp/api/llm_profiles.py +++ b/webapp/api/llm_profiles.py @@ -64,23 +64,33 @@ def _do_connectivity_test( latency_ms = int((time.monotonic() - t0) * 1000) return ProfileTestResponse(ok=False, message=str(exc), latency_ms=latency_ms) - # Chat 模型:先用 max_completion_tokens,失败时 fallback 到 max_tokens - for kwargs in [{"max_completion_tokens": 1}, {"max_tokens": 1}]: - try: - client.chat.completions.create( - model=model, - messages=[{"role": "user", "content": "hi"}], - **kwargs, - ) - latency_ms = int((time.monotonic() - t0) * 1000) - return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms) - except Exception as exc: # noqa: BLE001 - err_str = str(exc) - # 仅当错误明确提示参数名称问题时才重试 - if "max_tokens" in err_str and "max_completion_tokens" in err_str and kwargs.get("max_completion_tokens"): - continue - latency_ms = int((time.monotonic() - t0) * 1000) - return ProfileTestResponse(ok=False, message=err_str, latency_ms=latency_ms) + # Chat models: cap the probe at max_tokens=8; timeout/auth errors are returned as-is + # (max_tokens=1 triggers the min-output limit on some models, e.g. gpt-5.x) + try: + client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "hi"}], + max_tokens=8, # 足够小节省费用,同时满足各模型最小输出要求 + ) + latency_ms = int((time.monotonic() - t0) * 1000) + return ProfileTestResponse(ok=True, message="连接成功", 
latency_ms=latency_ms) + except Exception as exc: # noqa: BLE001 + err_str = str(exc) + # 如果 max_tokens 不被支持,改用 max_completion_tokens 再试一次 + if "max_tokens" in err_str and "max_completion_tokens" in err_str: + try: + client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "hi"}], + max_completion_tokens=8, + ) + latency_ms = int((time.monotonic() - t0) * 1000) + return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms) + except Exception as exc2: # noqa: BLE001 + latency_ms = int((time.monotonic() - t0) * 1000) + return ProfileTestResponse(ok=False, message=str(exc2), latency_ms=latency_ms) + latency_ms = int((time.monotonic() - t0) * 1000) + return ProfileTestResponse(ok=False, message=err_str, latency_ms=latency_ms) latency_ms = int((time.monotonic() - t0) * 1000) return ProfileTestResponse(ok=False, message="连接测试失败", latency_ms=latency_ms)