fix: support max_completion_tokens for newer models (gpt-5.x) in connectivity test
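Newer OpenAI models (e.g. gpt-5.4) reject `max_tokens` and require `max_completion_tokens`. The connectivity test now sends `max_completion_tokens` first and retries with `max_tokens` for older models and OpenAI-compatible APIs. Note that the retry happens only when the error message mentions both parameter names; any other error is reported as a failed test immediately.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>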
This commit is contained in:
@@ -29,24 +29,36 @@ def _do_connectivity_test(
|
||||
api_key: str,
|
||||
timeout_seconds: int,
|
||||
) -> ProfileTestResponse:
|
||||
"""Send a minimal chat completion request and return the test result."""
|
||||
"""Send a minimal chat completion request and return the test result.
|
||||
|
||||
Tries max_completion_tokens first (required by newer OpenAI models like gpt-5.x),
|
||||
then falls back to max_tokens for older models / compatible APIs.
|
||||
"""
|
||||
client = OpenAI(
|
||||
api_key=api_key,
|
||||
base_url=base_url.rstrip("/"),
|
||||
timeout=float(timeout_seconds),
|
||||
)
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
max_tokens=1,
|
||||
)
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
return ProfileTestResponse(ok=False, message=str(exc), latency_ms=latency_ms)
|
||||
# Try newer parameter first, fall back to legacy max_tokens on failure
|
||||
for kwargs in [{"max_completion_tokens": 1}, {"max_tokens": 1}]:
|
||||
try:
|
||||
client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
**kwargs,
|
||||
)
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
return ProfileTestResponse(ok=True, message="连接成功", latency_ms=latency_ms)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
err_str = str(exc)
|
||||
# Only retry if the error is specifically about the token parameter name
|
||||
if "max_tokens" in err_str and "max_completion_tokens" in err_str and kwargs.get("max_completion_tokens"):
|
||||
continue
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
return ProfileTestResponse(ok=False, message=err_str, latency_ms=latency_ms)
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
return ProfileTestResponse(ok=False, message="连接测试失败", latency_ms=latency_ms)
|
||||
|
||||
|
||||
@router.post("/probe", response_model=ProfileTestResponse, tags=["llm-profiles"])
|
||||
|
||||
Reference in New Issue
Block a user