v0.21.1-fastapi

2025-11-04 16:06:36 +08:00
parent 3e58c3d0e9
commit d57b5d76ae
218 changed files with 19617 additions and 72339 deletions

View File

@@ -132,8 +132,7 @@ class Base(ABC):
"tool_choice",
"logprobs",
"top_logprobs",
"extra_headers",
"enable_thinking"
"extra_headers"
}
gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
@@ -142,6 +141,22 @@ class Base(ABC):
def _chat(self, history, gen_conf, **kwargs):
logging.info("[HISTORY]" + json.dumps(history, ensure_ascii=False, indent=2))
if self.model_name.lower().find("qwq") >= 0:
logging.info(f"[INFO] {self.model_name} detected as reasoning model, using _chat_streamly")
final_ans = ""
tol_token = 0
for delta, tol in self._chat_streamly(history, gen_conf, with_reasoning=False, **kwargs):
if delta.startswith("<think>") or delta.endswith("</think>"):
continue
final_ans += delta
tol_token = tol
if len(final_ans.strip()) == 0:
final_ans = "**ERROR**: Empty response from reasoning model"
return final_ans.strip(), tol_token
if self.model_name.lower().find("qwen3") >= 0:
kwargs["extra_body"] = {"enable_thinking": False}
@@ -152,7 +167,7 @@ class Base(ABC):
ans = response.choices[0].message.content.strip()
if response.choices[0].finish_reason == "length":
ans = self._length_stop(ans)
return ans, self.total_token_count(response)
return ans, total_token_count_from_response(response)
def _chat_streamly(self, history, gen_conf, **kwargs):
logging.info("[HISTORY STREAMLY]" + json.dumps(history, ensure_ascii=False, indent=4))
@@ -178,7 +193,7 @@ class Base(ABC):
reasoning_start = False
ans = resp.choices[0].delta.content
tol = self.total_token_count(resp)
tol = total_token_count_from_response(resp)
if not tol:
tol = num_tokens_from_string(resp.choices[0].delta.content)
@@ -268,7 +283,7 @@ class Base(ABC):
for _ in range(self.max_rounds + 1):
logging.info(f"{self.tools=}")
response = self.client.chat.completions.create(model=self.model_name, messages=history, tools=self.tools, tool_choice="auto", **gen_conf)
tk_count += self.total_token_count(response)
tk_count += total_token_count_from_response(response)
if any([not response.choices, not response.choices[0].message]):
raise Exception(f"500 response structure error. Response: {response}")
@@ -386,7 +401,7 @@ class Base(ABC):
answer += resp.choices[0].delta.content
yield resp.choices[0].delta.content
tol = self.total_token_count(resp)
tol = total_token_count_from_response(resp)
if not tol:
total_tokens += num_tokens_from_string(resp.choices[0].delta.content)
else:
@@ -422,7 +437,7 @@ class Base(ABC):
if not resp.choices[0].delta.content:
resp.choices[0].delta.content = ""
continue
tol = self.total_token_count(resp)
tol = total_token_count_from_response(resp)
if not tol:
total_tokens += num_tokens_from_string(resp.choices[0].delta.content)
else:
@@ -457,9 +472,6 @@ class Base(ABC):
yield total_tokens
def total_token_count(self, resp):
return total_token_count_from_response(resp)
def _calculate_dynamic_ctx(self, history):
"""Calculate dynamic context window size"""
@@ -589,7 +601,7 @@ class BaiChuanChat(Base):
ans += LENGTH_NOTIFICATION_CN
else:
ans += LENGTH_NOTIFICATION_EN
return ans, self.total_token_count(response)
return ans, total_token_count_from_response(response)
def chat_streamly(self, system, history, gen_conf={}, **kwargs):
if system and history and history[0].get("role") != "system":
@@ -612,7 +624,7 @@ class BaiChuanChat(Base):
if not resp.choices[0].delta.content:
resp.choices[0].delta.content = ""
ans = resp.choices[0].delta.content
tol = self.total_token_count(resp)
tol = total_token_count_from_response(resp)
if not tol:
total_tokens += num_tokens_from_string(resp.choices[0].delta.content)
else:
@@ -676,9 +688,9 @@ class ZhipuChat(Base):
ans += LENGTH_NOTIFICATION_CN
else:
ans += LENGTH_NOTIFICATION_EN
tk_count = self.total_token_count(resp)
tk_count = total_token_count_from_response(resp)
if resp.choices[0].finish_reason == "stop":
tk_count = self.total_token_count(resp)
tk_count = total_token_count_from_response(resp)
yield ans
except Exception as e:
yield ans + "\n**ERROR**: " + str(e)
@@ -797,7 +809,7 @@ class MiniMaxChat(Base):
ans += LENGTH_NOTIFICATION_CN
else:
ans += LENGTH_NOTIFICATION_EN
return ans, self.total_token_count(response)
return ans, total_token_count_from_response(response)
def chat_streamly(self, system, history, gen_conf):
if system and history and history[0].get("role") != "system":
@@ -832,7 +844,7 @@ class MiniMaxChat(Base):
if "choices" in resp and "delta" in resp["choices"][0]:
text = resp["choices"][0]["delta"]["content"]
ans = text
tol = self.total_token_count(resp)
tol = total_token_count_from_response(resp)
if not tol:
total_tokens += num_tokens_from_string(text)
else:
@@ -871,7 +883,7 @@ class MistralChat(Base):
ans += LENGTH_NOTIFICATION_CN
else:
ans += LENGTH_NOTIFICATION_EN
return ans, self.total_token_count(response)
return ans, total_token_count_from_response(response)
def chat_streamly(self, system, history, gen_conf={}, **kwargs):
if system and history and history[0].get("role") != "system":
@@ -1095,7 +1107,7 @@ class BaiduYiyanChat(Base):
system = history[0]["content"] if history and history[0]["role"] == "system" else ""
response = self.client.do(model=self.model_name, messages=[h for h in history if h["role"] != "system"], system=system, **gen_conf).body
ans = response["result"]
return ans, self.total_token_count(response)
return ans, total_token_count_from_response(response)
def chat_streamly(self, system, history, gen_conf={}, **kwargs):
gen_conf["penalty_score"] = ((gen_conf.get("presence_penalty", 0) + gen_conf.get("frequency_penalty", 0)) / 2) + 1
@@ -1109,7 +1121,7 @@ class BaiduYiyanChat(Base):
for resp in response:
resp = resp.body
ans = resp["result"]
total_tokens = self.total_token_count(resp)
total_tokens = total_token_count_from_response(resp)
yield ans
@@ -1150,15 +1162,13 @@ class GoogleChat(Base):
else:
self.client = AnthropicVertex(region=region, project_id=project_id)
else:
import vertexai.generative_models as glm
from google.cloud import aiplatform
from google import genai
if access_token:
credits = service_account.Credentials.from_service_account_info(access_token)
aiplatform.init(credentials=credits, project=project_id, location=region)
credits = service_account.Credentials.from_service_account_info(access_token, scopes=scopes)
self.client = genai.Client(vertexai=True, project=project_id, location=region, credentials=credits)
else:
aiplatform.init(project=project_id, location=region)
self.client = glm.GenerativeModel(model_name=self.model_name)
self.client = genai.Client(vertexai=True, project=project_id, location=region)
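A minimal sketch of the Vertex-mode client construction used above, assuming service-account credentials; the scopes value is an assumption, since the scopes variable referenced in the diff is defined outside the shown hunk:

import json
from google import genai
from google.oauth2 import service_account

sa_info = json.load(open("service-account.json"))   # illustrative path to a service-account key
creds = service_account.Credentials.from_service_account_info(
    sa_info,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],   # assumed scope
)
client = genai.Client(vertexai=True, project=sa_info["project_id"], location="us-central1", credentials=creds)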
def _clean_conf(self, gen_conf):
if "claude" in self.model_name:
@@ -1167,6 +1177,7 @@ class GoogleChat(Base):
else:
if "max_tokens" in gen_conf:
gen_conf["max_output_tokens"] = gen_conf["max_tokens"]
del gen_conf["max_tokens"]
for k in list(gen_conf.keys()):
if k not in ["temperature", "top_p", "max_output_tokens"]:
del gen_conf[k]
@@ -1174,7 +1185,9 @@ class GoogleChat(Base):
def _chat(self, history, gen_conf={}, **kwargs):
system = history[0]["content"] if history and history[0]["role"] == "system" else ""
if "claude" in self.model_name:
gen_conf = self._clean_conf(gen_conf)
response = self.client.messages.create(
model=self.model_name,
messages=[h for h in history if h["role"] != "system"],
@@ -1190,25 +1203,63 @@ class GoogleChat(Base):
response["usage"]["input_tokens"] + response["usage"]["output_tokens"],
)
self.client._system_instruction = system
hist = []
# Gemini models with google-genai SDK
# Set default thinking_budget=0 if not specified
if "thinking_budget" not in gen_conf:
gen_conf["thinking_budget"] = 0
thinking_budget = gen_conf.pop("thinking_budget", 0)
gen_conf = self._clean_conf(gen_conf)
# Build GenerateContentConfig
try:
from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
except ImportError as e:
logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
raise
config_dict = {}
if system:
config_dict["system_instruction"] = system
if "temperature" in gen_conf:
config_dict["temperature"] = gen_conf["temperature"]
if "top_p" in gen_conf:
config_dict["top_p"] = gen_conf["top_p"]
if "max_output_tokens" in gen_conf:
config_dict["max_output_tokens"] = gen_conf["max_output_tokens"]
# Add ThinkingConfig
config_dict["thinking_config"] = ThinkingConfig(thinking_budget=thinking_budget)
config = GenerateContentConfig(**config_dict)
# Convert history to google-genai Content format
contents = []
for item in history:
if item["role"] == "system":
continue
hist.append(deepcopy(item))
item = hist[-1]
if "role" in item and item["role"] == "assistant":
item["role"] = "model"
if "content" in item:
item["parts"] = [
{
"text": item.pop("content"),
}
]
# google-genai uses 'model' instead of 'assistant'
role = "model" if item["role"] == "assistant" else item["role"]
content = Content(
role=role,
parts=[Part(text=item["content"])]
)
contents.append(content)
response = self.client.models.generate_content(
model=self.model_name,
contents=contents,
config=config
)
response = self.client.generate_content(hist, generation_config=gen_conf)
ans = response.text
return ans, response.usage_metadata.total_token_count
# Get token count from response
try:
total_tokens = response.usage_metadata.total_token_count
except Exception:
total_tokens = 0
return ans, total_tokens
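Outside the wrapper, the same request shape against the google-genai SDK looks roughly like this (an API-key client is shown for brevity, whereas the class above uses the Vertex AI client; the model name is illustrative):

from google import genai
from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig

client = genai.Client(api_key="...")                 # illustrative API-key client
config = GenerateContentConfig(
    system_instruction="You are a concise assistant.",
    temperature=0.2,
    thinking_config=ThinkingConfig(thinking_budget=0),   # thinking disabled by default, as above
)
contents = [Content(role="user", parts=[Part(text="Summarize RAG in one sentence.")])]
resp = client.models.generate_content(model="gemini-2.5-flash", contents=contents, config=config)
print(resp.text, resp.usage_metadata.total_token_count)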
def chat_streamly(self, system, history, gen_conf={}, **kwargs):
if "claude" in self.model_name:
@@ -1235,28 +1286,65 @@ class GoogleChat(Base):
yield total_tokens
else:
self.client._system_instruction = system
if "max_tokens" in gen_conf:
gen_conf["max_output_tokens"] = gen_conf["max_tokens"]
for k in list(gen_conf.keys()):
if k not in ["temperature", "top_p", "max_output_tokens"]:
del gen_conf[k]
for item in history:
if "role" in item and item["role"] == "assistant":
item["role"] = "model"
if "content" in item:
item["parts"] = item.pop("content")
# Gemini models with google-genai SDK
ans = ""
total_tokens = 0
# Set default thinking_budget=0 if not specified
if "thinking_budget" not in gen_conf:
gen_conf["thinking_budget"] = 0
thinking_budget = gen_conf.pop("thinking_budget", 0)
gen_conf = self._clean_conf(gen_conf)
# Build GenerateContentConfig
try:
response = self.model.generate_content(history, generation_config=gen_conf, stream=True)
for resp in response:
ans = resp.text
from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
except ImportError as e:
logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
raise
config_dict = {}
if system:
config_dict["system_instruction"] = system
if "temperature" in gen_conf:
config_dict["temperature"] = gen_conf["temperature"]
if "top_p" in gen_conf:
config_dict["top_p"] = gen_conf["top_p"]
if "max_output_tokens" in gen_conf:
config_dict["max_output_tokens"] = gen_conf["max_output_tokens"]
# Add ThinkingConfig
config_dict["thinking_config"] = ThinkingConfig(thinking_budget=thinking_budget)
config = GenerateContentConfig(**config_dict)
# Convert history to google-genai Content format
contents = []
for item in history:
# google-genai uses 'model' instead of 'assistant'
role = "model" if item["role"] == "assistant" else item["role"]
content = Content(
role=role,
parts=[Part(text=item["content"])]
)
contents.append(content)
try:
for chunk in self.client.models.generate_content_stream(
model=self.model_name,
contents=contents,
config=config
):
text = chunk.text
ans = text
total_tokens += num_tokens_from_string(text)
yield ans
except Exception as e:
yield ans + "\n**ERROR**: " + str(e)
yield response._chunks[-1].usage_metadata.total_token_count
yield total_tokens
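Callers consume the Gemini streaming path like the other chat wrappers: string deltas while the model generates, then a final integer token total. A hedged usage sketch, with construction details omitted:

# model: a configured GoogleChat instance (construction elided; arguments illustrative)
answer, tokens = "", 0
for out in model.chat_streamly("You are helpful.", [{"role": "user", "content": "Hi"}], {"temperature": 0.2}):
    if isinstance(out, int):
        tokens = out        # last yield: accumulated token count
    else:
        answer += out       # streamed text deltas
print(answer, tokens)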
class GPUStackChat(Base):
@@ -1334,6 +1422,9 @@ class LiteLLMBase(ABC):
self.bedrock_ak = json.loads(key).get("bedrock_ak", "")
self.bedrock_sk = json.loads(key).get("bedrock_sk", "")
self.bedrock_region = json.loads(key).get("bedrock_region", "")
elif self.provider == SupportedLiteLLMProvider.OpenRouter:
self.api_key = json.loads(key).get("api_key", "")
self.provider_order = json.loads(key).get("provider_order", "")
def _get_delay(self):
"""Calculate retry delay time"""
@@ -1378,14 +1469,13 @@ class LiteLLMBase(ABC):
timeout=self.timeout,
)
# response = self.client.chat.completions.create(model=self.model_name, messages=history, **gen_conf, **kwargs)
if any([not response.choices, not response.choices[0].message, not response.choices[0].message.content]):
return "", 0
ans = response.choices[0].message.content.strip()
if response.choices[0].finish_reason == "length":
ans = self._length_stop(ans)
return ans, self.total_token_count(response)
return ans, total_token_count_from_response(response)
def _chat_streamly(self, history, gen_conf, **kwargs):
logging.info("[HISTORY STREAMLY]" + json.dumps(history, ensure_ascii=False, indent=4))
@@ -1419,7 +1509,7 @@ class LiteLLMBase(ABC):
reasoning_start = False
ans = delta.content
tol = self.total_token_count(resp)
tol = total_token_count_from_response(resp)
if not tol:
tol = num_tokens_from_string(delta.content)
@@ -1529,6 +1619,24 @@ class LiteLLMBase(ABC):
"aws_region_name": self.bedrock_region,
}
)
if self.provider == SupportedLiteLLMProvider.OpenRouter:
if self.provider_order:
def _to_order_list(x):
if x is None:
return []
if isinstance(x, str):
return [s.strip() for s in x.split(",") if s.strip()]
if isinstance(x, (list, tuple)):
return [str(s).strip() for s in x if str(s).strip()]
return []
extra_body = {}
provider_cfg = {}
provider_order = _to_order_list(self.provider_order)
provider_cfg["order"] = provider_order
provider_cfg["allow_fallbacks"] = False
extra_body["provider"] = provider_cfg
completion_args.update({"extra_body": extra_body})
return completion_args
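For OpenRouter, the optional provider_order string is parsed into OpenRouter's provider-routing payload with fallbacks disabled. An illustration with made-up values:

provider_order = "openai, together"                  # illustrative comma-separated order
extra_body = {
    "provider": {
        "order": [s.strip() for s in provider_order.split(",") if s.strip()],
        "allow_fallbacks": False,
    }
}
# extra_body == {"provider": {"order": ["openai", "together"], "allow_fallbacks": False}}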
def chat_with_tools(self, system: str, history: list, gen_conf: dict = {}):
@@ -1554,7 +1662,7 @@ class LiteLLMBase(ABC):
timeout=self.timeout,
)
tk_count += self.total_token_count(response)
tk_count += total_token_count_from_response(response)
if not hasattr(response, "choices") or not response.choices or not response.choices[0].message:
raise Exception(f"500 response structure error. Response: {response}")
@@ -1686,7 +1794,7 @@ class LiteLLMBase(ABC):
answer += delta.content
yield delta.content
tol = self.total_token_count(resp)
tol = total_token_count_from_response(resp)
if not tol:
total_tokens += num_tokens_from_string(delta.content)
else:
@@ -1735,7 +1843,7 @@ class LiteLLMBase(ABC):
delta = resp.choices[0].delta
if not hasattr(delta, "content") or delta.content is None:
continue
tol = self.total_token_count(resp)
tol = total_token_count_from_response(resp)
if not tol:
total_tokens += num_tokens_from_string(delta.content)
else:
@@ -1769,17 +1877,6 @@ class LiteLLMBase(ABC):
yield total_tokens
def total_token_count(self, resp):
try:
return resp.usage.total_tokens
except Exception:
pass
try:
return resp["usage"]["total_tokens"]
except Exception:
pass
return 0
def _calculate_dynamic_ctx(self, history):
"""Calculate dynamic context window size"""

View File

@@ -13,12 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import json
import os
import tempfile
import logging
from abc import ABC
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin
import requests
from openai import OpenAI
@@ -38,6 +42,7 @@ class Base(ABC):
self.is_tools = False
self.tools = []
self.toolcall_sessions = {}
self.extra_body = None
def describe(self, image):
raise NotImplementedError("Please implement encode method!")
@@ -45,7 +50,7 @@ class Base(ABC):
def describe_with_prompt(self, image, prompt=None):
raise NotImplementedError("Please implement encode method!")
def _form_history(self, system, history, images=[]):
def _form_history(self, system, history, images=None):
hist = []
if system:
hist.append({"role": "system", "content": system})
@@ -73,24 +78,26 @@ class Base(ABC):
})
return pmpt
def chat(self, system, history, gen_conf, images=[], **kwargs):
def chat(self, system, history, gen_conf, images=None, **kwargs):
try:
response = self.client.chat.completions.create(
model=self.model_name,
messages=self._form_history(system, history, images)
messages=self._form_history(system, history, images),
extra_body=self.extra_body,
)
return response.choices[0].message.content.strip(), response.usage.total_tokens
except Exception as e:
return "**ERROR**: " + str(e), 0
def chat_streamly(self, system, history, gen_conf, images=[], **kwargs):
def chat_streamly(self, system, history, gen_conf, images=None, **kwargs):
ans = ""
tk_count = 0
try:
response = self.client.chat.completions.create(
model=self.model_name,
messages=self._form_history(system, history, images),
stream=True
stream=True,
extra_body=self.extra_body,
)
for resp in response:
if not resp.choices[0].delta.content:
@@ -167,6 +174,7 @@ class GptV4(Base):
def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", base_url="https://api.openai.com/v1", **kwargs):
if not base_url:
base_url = "https://api.openai.com/v1"
self.api_key = key
self.client = OpenAI(api_key=key, base_url=base_url)
self.model_name = model_name
self.lang = lang
@@ -177,6 +185,7 @@ class GptV4(Base):
res = self.client.chat.completions.create(
model=self.model_name,
messages=self.prompt(b64),
extra_body=self.extra_body,
)
return res.choices[0].message.content.strip(), total_token_count_from_response(res)
@@ -185,6 +194,7 @@ class GptV4(Base):
res = self.client.chat.completions.create(
model=self.model_name,
messages=self.vision_llm_prompt(b64, prompt),
extra_body=self.extra_body,
)
return res.choices[0].message.content.strip(), total_token_count_from_response(res)
@@ -218,6 +228,61 @@ class QWenCV(GptV4):
base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
super().__init__(key, model_name, lang=lang, base_url=base_url, **kwargs)
def chat(self, system, history, gen_conf, images=None, video_bytes=None, filename=""):
if video_bytes:
try:
summary, summary_num_tokens = self._process_video(video_bytes, filename)
return summary, summary_num_tokens
except Exception as e:
return "**ERROR**: " + str(e), 0
return "**ERROR**: Method chat not supported yet.", 0
def _process_video(self, video_bytes, filename):
from dashscope import MultiModalConversation
video_suffix = Path(filename).suffix or ".mp4"
with tempfile.NamedTemporaryFile(delete=False, suffix=video_suffix) as tmp:
tmp.write(video_bytes)
tmp_path = tmp.name
video_path = f"file://{tmp_path}"
messages = [
{
"role": "user",
"content": [
{
"video": video_path,
"fps": 2,
},
{
"text": "Please summarize this video in proper sentences.",
},
],
}
]
def call_api():
response = MultiModalConversation.call(
api_key=self.api_key,
model=self.model_name,
messages=messages,
)
summary = response["output"]["choices"][0]["message"].content[0]["text"]
return summary, num_tokens_from_string(summary)
try:
return call_api()
except Exception as e1:
import dashscope
dashscope.base_http_api_url = "https://dashscope-intl.aliyuncs.com/api/v1"
try:
return call_api()
except Exception as e2:
raise RuntimeError(f"Both default and intl endpoint failed.\nFirst error: {e1}\nSecond error: {e2}")
class HunyuanCV(GptV4):
_FACTORY_NAME = "Tencent Hunyuan"
@@ -249,6 +314,17 @@ class StepFunCV(GptV4):
self.lang = lang
Base.__init__(self, **kwargs)
class VolcEngineCV(GptV4):
_FACTORY_NAME = "VolcEngine"
def __init__(self, key, model_name, lang="Chinese", base_url="https://ark.cn-beijing.volces.com/api/v3", **kwargs):
if not base_url:
base_url = "https://ark.cn-beijing.volces.com/api/v3"
ark_api_key = json.loads(key).get("ark_api_key", "")
self.client = OpenAI(api_key=ark_api_key, base_url=base_url)
self.model_name = json.loads(key).get("ep_id", "") + json.loads(key).get("endpoint_id", "")
self.lang = lang
Base.__init__(self, **kwargs)
class LmStudioCV(GptV4):
_FACTORY_NAME = "LM-Studio"
@@ -327,10 +403,27 @@ class OpenRouterCV(GptV4):
):
if not base_url:
base_url = "https://openrouter.ai/api/v1"
self.client = OpenAI(api_key=key, base_url=base_url)
api_key = json.loads(key).get("api_key", "")
self.client = OpenAI(api_key=api_key, base_url=base_url)
self.model_name = model_name
self.lang = lang
Base.__init__(self, **kwargs)
provider_order = json.loads(key).get("provider_order", "")
self.extra_body = {}
if provider_order:
def _to_order_list(x):
if x is None:
return []
if isinstance(x, str):
return [s.strip() for s in x.split(",") if s.strip()]
if isinstance(x, (list, tuple)):
return [str(s).strip() for s in x if str(s).strip()]
return []
provider_cfg = {}
provider_order = _to_order_list(provider_order)
provider_cfg["order"] = provider_order
provider_cfg["allow_fallbacks"] = False
self.extra_body["provider"] = provider_cfg
class LocalAICV(GptV4):
@@ -413,7 +506,7 @@ class OllamaCV(Base):
options["frequency_penalty"] = gen_conf["frequency_penalty"]
return options
def _form_history(self, system, history, images=[]):
def _form_history(self, system, history, images=None):
hist = deepcopy(history)
if system and hist[0]["role"] == "user":
hist.insert(0, {"role": "system", "content": system})
@@ -454,7 +547,7 @@ class OllamaCV(Base):
except Exception as e:
return "**ERROR**: " + str(e), 0
def chat(self, system, history, gen_conf, images=[]):
def chat(self, system, history, gen_conf, images=None):
try:
response = self.client.chat(
model=self.model_name,
@@ -468,7 +561,7 @@ class OllamaCV(Base):
except Exception as e:
return "**ERROR**: " + str(e), 0
def chat_streamly(self, system, history, gen_conf, images=[]):
def chat_streamly(self, system, history, gen_conf, images=None):
ans = ""
try:
response = self.client.chat(
@@ -496,13 +589,14 @@ class GeminiCV(Base):
client.configure(api_key=key)
_client = client.get_default_generative_client()
self.api_key=key
self.model_name = model_name
self.model = GenerativeModel(model_name=self.model_name)
self.model._client = _client
self.lang = lang
Base.__init__(self, **kwargs)
def _form_history(self, system, history, images=[]):
def _form_history(self, system, history, images=None):
hist = []
if system:
hist.append({"role": "user", "parts": [system, history[0]["content"]]})
@@ -538,7 +632,15 @@ class GeminiCV(Base):
res = self.model.generate_content(input)
return res.text, total_token_count_from_response(res)
def chat(self, system, history, gen_conf, images=[]):
def chat(self, system, history, gen_conf, images=None, video_bytes=None, filename=""):
if video_bytes:
try:
summary, summary_num_tokens = self._process_video(video_bytes, filename)
return summary, summary_num_tokens
except Exception as e:
return "**ERROR**: " + str(e), 0
generation_config = dict(temperature=gen_conf.get("temperature", 0.3), top_p=gen_conf.get("top_p", 0.7))
try:
response = self.model.generate_content(
@@ -549,7 +651,7 @@ class GeminiCV(Base):
except Exception as e:
return "**ERROR**: " + str(e), 0
def chat_streamly(self, system, history, gen_conf, images=[]):
def chat_streamly(self, system, history, gen_conf, images=None):
ans = ""
response = None
try:
@@ -570,6 +672,46 @@ class GeminiCV(Base):
yield total_token_count_from_response(response)
def _process_video(self, video_bytes, filename):
from google import genai
from google.genai import types
video_size_mb = len(video_bytes) / (1024 * 1024)
client = genai.Client(api_key=self.api_key)
tmp_path = None
try:
if video_size_mb <= 20:
response = client.models.generate_content(
model="models/gemini-2.5-flash",
contents=types.Content(parts=[
types.Part(inline_data=types.Blob(data=video_bytes, mime_type="video/mp4")),
types.Part(text="Please summarize the video in proper sentences.")
])
)
else:
logging.info(f"Video size {video_size_mb:.2f}MB exceeds 20MB. Using Files API...")
video_suffix = Path(filename).suffix or ".mp4"
with tempfile.NamedTemporaryFile(delete=False, suffix=video_suffix) as tmp:
tmp.write(video_bytes)
tmp_path = Path(tmp.name)
uploaded_file = client.files.upload(file=tmp_path)
response = client.models.generate_content(
model="gemini-2.5-flash",
contents=[uploaded_file, "Please summarize this video in proper sentences."]
)
summary = response.text or ""
logging.info(f"Video summarized: {summary[:32]}...")
return summary, num_tokens_from_string(summary)
except Exception as e:
logging.error(f"Video processing failed: {e}")
raise
finally:
if tmp_path and tmp_path.exists():
tmp_path.unlink()
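A hedged usage sketch (key, model and file name are illustrative): videos up to roughly 20 MB are sent inline as a Blob, larger ones are uploaded through the google-genai Files API first.

cv = GeminiCV(key="AIza...", model_name="gemini-2.5-flash")   # illustrative arguments
with open("demo.mp4", "rb") as f:
    summary, n_tokens = cv.chat(system="", history=[], gen_conf={},
                                video_bytes=f.read(), filename="demo.mp4")
print(summary)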
class NvidiaCV(Base):
_FACTORY_NAME = "NVIDIA"
@@ -614,7 +756,7 @@ class NvidiaCV(Base):
response = response.json()
return (
response["choices"][0]["message"]["content"].strip(),
response["usage"]["total_tokens"],
total_token_count_from_response(response),
)
def _request(self, msg, gen_conf={}):
@@ -637,26 +779,26 @@ class NvidiaCV(Base):
response = self._request(vision_prompt)
return (
response["choices"][0]["message"]["content"].strip(),
response["usage"]["total_tokens"],
total_token_count_from_response(response)
)
def chat(self, system, history, gen_conf, images=[], **kwargs):
def chat(self, system, history, gen_conf, images=None, **kwargs):
try:
response = self._request(self._form_history(system, history, images), gen_conf)
return (
response["choices"][0]["message"]["content"].strip(),
response["usage"]["total_tokens"],
total_token_count_from_response(response)
)
except Exception as e:
return "**ERROR**: " + str(e), 0
def chat_streamly(self, system, history, gen_conf, images=[], **kwargs):
def chat_streamly(self, system, history, gen_conf, images=None, **kwargs):
total_tokens = 0
try:
response = self._request(self._form_history(system, history, images), gen_conf)
cnt = response["choices"][0]["message"]["content"]
if "usage" in response and "total_tokens" in response["usage"]:
total_tokens += response["usage"]["total_tokens"]
total_tokens += total_token_count_from_response(response)
for resp in cnt:
yield resp
except Exception as e:
@@ -716,7 +858,7 @@ class AnthropicCV(Base):
gen_conf["max_tokens"] = self.max_tokens
return gen_conf
def chat(self, system, history, gen_conf, images=[]):
def chat(self, system, history, gen_conf, images=None):
gen_conf = self._clean_conf(gen_conf)
ans = ""
try:
@@ -737,7 +879,7 @@ class AnthropicCV(Base):
except Exception as e:
return ans + "\n**ERROR**: " + str(e), 0
def chat_streamly(self, system, history, gen_conf, images=[]):
def chat_streamly(self, system, history, gen_conf, images=None):
gen_conf = self._clean_conf(gen_conf)
total_tokens = 0
try:
@@ -821,13 +963,13 @@ class GoogleCV(AnthropicCV, GeminiCV):
else:
return GeminiCV.describe_with_prompt(self, image, prompt)
def chat(self, system, history, gen_conf, images=[]):
def chat(self, system, history, gen_conf, images=None):
if "claude" in self.model_name:
return AnthropicCV.chat(self, system, history, gen_conf, images)
else:
return GeminiCV.chat(self, system, history, gen_conf, images)
def chat_streamly(self, system, history, gen_conf, images=[]):
def chat_streamly(self, system, history, gen_conf, images=None):
if "claude" in self.model_name:
for ans in AnthropicCV.chat_streamly(self, system, history, gen_conf, images):
yield ans

View File

@@ -234,8 +234,8 @@ class DeepInfraSeq2txt(Base):
self.client = OpenAI(api_key=key, base_url=base_url)
self.model_name = model_name
class CometAPISeq2txt(Base):
_FACTORY_NAME = "CometAPI"
@@ -244,7 +244,8 @@ class CometAPISeq2txt(Base):
base_url = "https://api.cometapi.com/v1"
self.client = OpenAI(api_key=key, base_url=base_url)
self.model_name = model_name
class DeerAPISeq2txt(Base):
_FACTORY_NAME = "DeerAPI"
@@ -253,3 +254,44 @@ class DeerAPISeq2txt(Base):
base_url = "https://api.deerapi.com/v1"
self.client = OpenAI(api_key=key, base_url=base_url)
self.model_name = model_name
class ZhipuSeq2txt(Base):
_FACTORY_NAME = "ZHIPU-AI"
def __init__(self, key, model_name="glm-asr", base_url="https://open.bigmodel.cn/api/paas/v4", **kwargs):
if not base_url:
base_url = "https://open.bigmodel.cn/api/paas/v4"
self.base_url = base_url
self.api_key = key
self.model_name = model_name
self.gen_conf = kwargs.get("gen_conf", {})
self.stream = kwargs.get("stream", False)
def transcription(self, audio_path):
payload = {
"model": self.model_name,
"temperature": str(self.gen_conf.get("temperature", 0.75)) or "0.75",
"stream": self.stream,
}
headers = {"Authorization": f"Bearer {self.api_key}"}
with open(audio_path, "rb") as audio_file:
files = {"file": audio_file}
try:
response = requests.post(
url=f"{self.base_url}/audio/transcriptions",
data=payload,
files=files,
headers=headers,
)
body = response.json()
if response.status_code == 200:
full_content = body["text"]
return full_content, num_tokens_from_string(full_content)
else:
error = body["error"]
return f"**ERROR**: code: {error['code']}, message: {error['message']}", 0
except Exception as e:
return "**ERROR**: " + str(e), 0