392 lines
13 KiB
Python
392 lines
13 KiB
Python
|
|
"""Qwen LLM客户端 - 支持OpenAI兼容API格式"""
|
|||
|
|
|
|||
|
|
import time
|
|||
|
|
import json
|
|||
|
|
from typing import List, Dict, Optional, Generator, AsyncGenerator
|
|||
|
|
from loguru import logger
|
|||
|
|
import httpx
|
|||
|
|
|
|||
|
|
from .base_client import BaseLLMClient, LLMResponse, LLMConfig, LLMProvider
|
|||
|
|
|
|||
|
|
|
|||
|
|
class QwenClient(BaseLLMClient):
    """Qwen API client speaking the OpenAI-compatible chat-completions format.

    Intended to be used through an OpenAI-compatible gateway such as new-api.
    Example model names:

    - qwen-turbo
    - qwen-plus
    - qwen-max
    - qwen3.5-flash (recommended: fast responses)
    - qwen3.5-plus
    - qwen-long
    - the qwen2.5 series
    """

    SUPPORTED_MODELS = [
        "qwen-turbo",
        "qwen-plus",
        "qwen-max",
        "qwen-max-longcontext",
        "qwen-long",
        "qwen3.5-flash",
        "qwen3.5-plus",
        "qwen3-plus",
        "qwen2.5-72b-instruct",
        "qwen2.5-32b-instruct",
        "qwen2.5-14b-instruct",
        "qwen2.5-7b-instruct"
    ]

    def __init__(self, config: LLMConfig):
        """Validate the provider and create the underlying HTTP client.

        Args:
            config: Client configuration. ``config.provider`` must be
                ``LLMProvider.QWEN`` or ``LLMProvider.QWEN_VL``.

        Raises:
            ValueError: If ``config.provider`` is any other provider.
        """
        if config.provider not in [LLMProvider.QWEN, LLMProvider.QWEN_VL]:
            raise ValueError(f"配置provider应为Qwen,实际为{config.provider}")
        super().__init__(config)
        self._init_client()

    def _init_client(self):
        """Create the synchronous ``httpx.Client`` used for every request."""
        # OpenAI-compatible API: bearer-token auth and JSON bodies.
        self._client = httpx.Client(
            base_url=self.config.base_url,
            headers={
                "Authorization": f"Bearer {self.config.api_key}",
                "Content-Type": "application/json"
            },
            timeout=self.config.timeout
        )
        logger.info(f"Qwen客户端初始化完成: {self.config.base_url} - {self.config.model}")

    def _build_payload(
        self,
        messages: List[Dict[str, str]],
        max_tokens: Optional[int],
        temperature: Optional[float],
        stream: bool,
        options: Dict[str, object]
    ) -> Dict[str, object]:
        """Build the request body shared by ``chat`` and ``stream_chat``.

        Args:
            messages: Chat messages, sent to the API unchanged.
            max_tokens: Per-call token limit; a falsy value falls back to
                ``config.max_tokens``.
            temperature: Per-call sampling temperature; only ``None`` falls
                back to ``config.temperature``.
            stream: Value of the ``stream`` flag in the request body.
            options: Extra keyword arguments of the caller; only ``top_p``
                is read from it.

        Returns:
            The JSON-serialisable request body.
        """
        return {
            "model": self.config.model,
            "messages": messages,
            "max_tokens": max_tokens or self.config.max_tokens,
            # An explicit 0.0 is a meaningful temperature, so test for None
            # rather than truthiness.
            "temperature": (
                self.config.temperature if temperature is None else temperature
            ),
            "top_p": options.get("top_p", self.config.top_p),
            "stream": stream
        }

    def chat(
        self,
        messages: List[Dict[str, str]],
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        **kwargs
    ) -> LLMResponse:
        """Run one non-streaming chat completion.

        Args:
            messages: Chat messages in OpenAI format.
            max_tokens: Optional override for ``config.max_tokens``.
            temperature: Optional override for ``config.temperature``
                (``0.0`` is honoured).
            **kwargs: Only ``top_p`` is read; it overrides ``config.top_p``.

        Returns:
            An ``LLMResponse``. Failures are not raised: they are logged and
            reported through the ``error`` field with empty ``content``.
        """
        start_time = time.time()

        try:
            payload = self._build_payload(
                messages, max_tokens, temperature, False, kwargs
            )

            # OpenAI-compatible endpoint path, relative to base_url.
            response = self._client.post("/chat/completions", json=payload)
            response.raise_for_status()

            data = response.json()
            latency_ms = int((time.time() - start_time) * 1000)

            # Tolerate a missing/empty "choices" list and null fields so a
            # sparse reply yields empty content instead of an IndexError.
            choices = data.get("choices") or [{}]
            first_choice = choices[0]
            message = first_choice.get("message") or {}

            return LLMResponse(
                content=message.get("content") or "",
                model=data.get("model", self.config.model),
                usage=data.get("usage", {}),
                finish_reason=first_choice.get("finish_reason", "stop"),
                latency_ms=latency_ms
            )

        except httpx.HTTPStatusError as e:
            logger.error(f"Qwen API错误: {e.response.status_code} - {e.response.text}")
            return LLMResponse(
                content="",
                model=self.config.model,
                error=f"API错误: {e.response.status_code} - {e.response.text[:200]}"
            )

        except Exception as e:
            # Boundary handler: callers receive the failure via ``error``.
            logger.error(f"Qwen调用失败: {e}")
            return LLMResponse(
                content="",
                model=self.config.model,
                error=str(e)
            )

    def stream_chat(
        self,
        messages: List[Dict[str, str]],
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        **kwargs
    ) -> Generator[str, None, None]:
        """Run a streaming chat completion and yield text as it arrives (SSE).

        Args:
            messages: Chat messages in OpenAI format.
            max_tokens: Optional override for ``config.max_tokens``.
            temperature: Optional override for ``config.temperature``
                (``0.0`` is honoured).
            **kwargs: Only ``top_p`` is read; it overrides ``config.top_p``.

        Yields:
            str: One text fragment per content-bearing event. On failure a
            single ``"[ERROR: ...]"`` string is yielded instead of raising.

        Example:
            for chunk in client.stream_chat(messages):
                print(chunk, end="", flush=True)
        """
        try:
            payload = self._build_payload(
                messages, max_tokens, temperature, True, kwargs
            )

            with self._client.stream("POST", "/chat/completions", json=payload) as response:
                # Without this check an HTTP error body would be parsed as an
                # (empty) event stream and the HTTPStatusError handler below
                # could never run.
                response.raise_for_status()

                for raw_line in response.iter_lines():
                    line = raw_line.strip()
                    # SSE payload lines look like "data: {...}"; the space
                    # after the colon is optional.
                    if not line.startswith("data:"):
                        continue
                    data_str = line[len("data:"):].strip()
                    if data_str == "[DONE]":
                        break
                    try:
                        data = json.loads(data_str)
                    except json.JSONDecodeError:
                        # Skip events that are not valid JSON.
                        continue
                    choices = data.get("choices") or []
                    if not choices:
                        continue  # skip events with no choices
                    delta = choices[0].get("delta") or {}
                    content = delta.get("content")
                    if content:
                        yield content

        except httpx.HTTPStatusError as e:
            logger.error(f"Qwen流式API错误: {e.response.status_code}")
            yield f"[ERROR: API返回错误 {e.response.status_code}]"

        except Exception as e:
            logger.error(f"Qwen流式调用失败: {e}")
            yield f"[ERROR: {str(e)}]"

    async def async_stream_chat(
        self,
        messages: List[Dict[str, str]],
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        **kwargs
    ) -> AsyncGenerator[str, None]:
        """Async wrapper around ``stream_chat`` (e.g. for FastAPI SSE responses).

        NOTE: this iterates the synchronous ``stream_chat`` generator
        directly, so each network read still runs on the event-loop thread;
        control is only handed back to the loop between chunks.

        Yields:
            str: One text fragment at a time, exactly as ``stream_chat``
            produces them.
        """
        import asyncio

        for chunk in self.stream_chat(messages, max_tokens, temperature, **kwargs):
            yield chunk
            # Give other tasks a chance to run between chunks.
            await asyncio.sleep(0)

    def get_available_models(self) -> List[str]:
        """Return the supported model names.

        A copy is returned so callers cannot mutate the class-level list.
        """
        return list(self.SUPPORTED_MODELS)

    def close(self):
        """Close the HTTP client if one was created."""
        # getattr guards against a partially initialised instance.
        client = getattr(self, "_client", None)
        if client:
            client.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
class QwenVLClient(BaseLLMClient):
    """Qwen VL multimodal client speaking the OpenAI-compatible format.

    Supported models:

    - qwen-vl-plus
    - qwen-vl-max
    - qwen3-vl-plus
    - qwen2-vl-7b-instruct
    - qwen2-vl-72b-instruct
    """

    SUPPORTED_MODELS = [
        "qwen-vl-plus",
        "qwen-vl-max",
        "qwen3-vl-plus",
        "qwen2-vl-7b-instruct",
        "qwen2-vl-72b-instruct"
    ]

    def __init__(self, config: LLMConfig):
        """Validate the provider and create the underlying HTTP client.

        Args:
            config: Client configuration. ``config.provider`` must be
                ``LLMProvider.QWEN_VL``.

        Raises:
            ValueError: If ``config.provider`` is any other provider.
        """
        if config.provider != LLMProvider.QWEN_VL:
            raise ValueError(f"配置provider应为QWEN_VL,实际为{config.provider}")
        super().__init__(config)
        self._init_client()

    def _init_client(self):
        """Create the synchronous ``httpx.Client`` used for every request."""
        self._client = httpx.Client(
            base_url=self.config.base_url,
            headers={
                "Authorization": f"Bearer {self.config.api_key}",
                "Content-Type": "application/json"
            },
            timeout=self.config.timeout
        )
        logger.info(f"QwenVL客户端初始化完成: {self.config.base_url} - {self.config.model}")

    def _build_payload(
        self,
        messages: List[Dict[str, str]],
        max_tokens: Optional[int],
        temperature: Optional[float],
        stream: bool,
        options: Dict[str, object]
    ) -> Dict[str, object]:
        """Build the request body shared by ``chat`` and ``stream_chat``.

        Args:
            messages: Chat messages, sent to the API unchanged.
            max_tokens: Per-call token limit; a falsy value falls back to
                ``config.max_tokens``.
            temperature: Per-call sampling temperature; only ``None`` falls
                back to ``config.temperature``.
            stream: Value of the ``stream`` flag in the request body.
            options: Extra keyword arguments of the caller; only ``top_p``
                is read from it.

        Returns:
            The JSON-serialisable request body.
        """
        return {
            "model": self.config.model,
            "messages": messages,
            "max_tokens": max_tokens or self.config.max_tokens,
            # An explicit 0.0 is a meaningful temperature, so test for None
            # rather than truthiness.
            "temperature": (
                self.config.temperature if temperature is None else temperature
            ),
            "top_p": options.get("top_p", self.config.top_p),
            "stream": stream
        }

    def chat(
        self,
        messages: List[Dict[str, str]],
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        **kwargs
    ) -> LLMResponse:
        """Run one non-streaming multimodal chat completion.

        Image input is supported through OpenAI-style content parts, e.g.:

            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
                    {"type": "text", "text": "Describe this image"}
                ]
            }

        Args:
            messages: Chat messages; passed to the API unchanged.
            max_tokens: Optional override for ``config.max_tokens``.
            temperature: Optional override for ``config.temperature``
                (``0.0`` is honoured).
            **kwargs: Only ``top_p`` is read; it overrides ``config.top_p``.

        Returns:
            An ``LLMResponse``. Failures are not raised: they are logged and
            reported through the ``error`` field with empty ``content``.
        """
        start_time = time.time()

        try:
            payload = self._build_payload(
                messages, max_tokens, temperature, False, kwargs
            )

            response = self._client.post("/chat/completions", json=payload)
            response.raise_for_status()

            data = response.json()
            latency_ms = int((time.time() - start_time) * 1000)

            # Tolerate a missing/empty "choices" list and null fields so a
            # sparse reply yields empty content instead of an IndexError.
            choices = data.get("choices") or [{}]
            first_choice = choices[0]
            message = first_choice.get("message") or {}

            return LLMResponse(
                content=message.get("content") or "",
                model=data.get("model", self.config.model),
                usage=data.get("usage", {}),
                finish_reason=first_choice.get("finish_reason", "stop"),
                latency_ms=latency_ms
            )

        except httpx.HTTPStatusError as e:
            logger.error(f"QwenVL API错误: {e.response.status_code} - {e.response.text}")
            return LLMResponse(
                content="",
                model=self.config.model,
                error=f"API错误: {e.response.status_code} - {e.response.text[:200]}"
            )

        except Exception as e:
            # Boundary handler: callers receive the failure via ``error``.
            logger.error(f"QwenVL调用失败: {e}")
            return LLMResponse(
                content="",
                model=self.config.model,
                error=str(e)
            )

    def stream_chat(
        self,
        messages: List[Dict[str, str]],
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        **kwargs
    ) -> Generator[str, None, None]:
        """Run a streaming multimodal chat completion (SSE).

        Args:
            messages: Chat messages; passed to the API unchanged.
            max_tokens: Optional override for ``config.max_tokens``.
            temperature: Optional override for ``config.temperature``
                (``0.0`` is honoured).
            **kwargs: Only ``top_p`` is read; it overrides ``config.top_p``.

        Yields:
            str: One text fragment per content-bearing event. On failure a
            single ``"[ERROR: ...]"`` string is yielded instead of raising.
        """
        try:
            payload = self._build_payload(
                messages, max_tokens, temperature, True, kwargs
            )

            with self._client.stream("POST", "/chat/completions", json=payload) as response:
                # Without this check an HTTP error body would be parsed as an
                # (empty) event stream and the caller would see no output
                # and no error.
                response.raise_for_status()

                for raw_line in response.iter_lines():
                    line = raw_line.strip()
                    # SSE payload lines look like "data: {...}"; the space
                    # after the colon is optional.
                    if not line.startswith("data:"):
                        continue
                    data_str = line[len("data:"):].strip()
                    if data_str == "[DONE]":
                        break
                    try:
                        data = json.loads(data_str)
                    except json.JSONDecodeError:
                        # Skip events that are not valid JSON.
                        continue
                    choices = data.get("choices") or []
                    if not choices:
                        continue  # skip events with no choices
                    delta = choices[0].get("delta") or {}
                    content = delta.get("content")
                    if content:
                        yield content

        except httpx.HTTPStatusError as e:
            logger.error(f"QwenVL流式API错误: {e.response.status_code}")
            yield f"[ERROR: API返回错误 {e.response.status_code}]"

        except Exception as e:
            logger.error(f"QwenVL流式调用失败: {e}")
            yield f"[ERROR: {str(e)}]"

    def get_available_models(self) -> List[str]:
        """Return the supported model names.

        A copy is returned so callers cannot mutate the class-level list.
        """
        return list(self.SUPPORTED_MODELS)

    def close(self):
        """Close the HTTP client if one was created."""
        # getattr guards against a partially initialised instance.
        client = getattr(self, "_client", None)
        if client:
            client.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def create_qwen_client(
    api_key: str,
    model: str = "qwen3.5-flash",
    base_url: str = "http://6.86.80.4:30080/v1",
    **kwargs
) -> QwenClient:
    """Convenience factory: build a ``QwenClient`` from plain arguments.

    Args:
        api_key: API key placed in the ``LLMConfig``.
        model: Model name placed in the ``LLMConfig``.
        base_url: Base URL of the OpenAI-compatible endpoint.
        **kwargs: Forwarded unchanged to ``LLMConfig``.

    Returns:
        A ``QwenClient`` configured with provider ``LLMProvider.QWEN``.
    """
    return QwenClient(
        LLMConfig(
            provider=LLMProvider.QWEN,
            model=model,
            api_key=api_key,
            base_url=base_url,
            **kwargs
        )
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def create_qwen_vl_client(
    api_key: str,
    model: str = "qwen3-vl-plus",
    base_url: str = "http://6.86.80.4:30080/v1",
    **kwargs
) -> QwenVLClient:
    """Convenience factory: build a ``QwenVLClient`` from plain arguments.

    Args:
        api_key: API key placed in the ``LLMConfig``.
        model: Model name placed in the ``LLMConfig``.
        base_url: Base URL of the OpenAI-compatible endpoint.
        **kwargs: Forwarded unchanged to ``LLMConfig``.

    Returns:
        A ``QwenVLClient`` configured with provider ``LLMProvider.QWEN_VL``.
    """
    return QwenVLClient(
        LLMConfig(
            provider=LLMProvider.QWEN_VL,
            model=model,
            api_key=api_key,
            base_url=base_url,
            **kwargs
        )
    )
|