add gemini-3-pro-preview

This commit is contained in:
yongtenglei 2025-11-19 11:32:03 +08:00
parent 2dbd1fad46
commit 6de01dc601
2 changed files with 155 additions and 64 deletions

View file

@ -1429,6 +1429,13 @@
"status": "1", "status": "1",
"rank": "980", "rank": "980",
"llm": [ "llm": [
{
"llm_name": "gemini-3-pro-preview",
"tags": "LLM,CHAT,1M,IMAGE2TEXT",
"max_tokens": 1048576,
"model_type": "image2text",
"is_tools": true
},
{ {
"llm_name": "gemini-2.5-flash", "llm_name": "gemini-2.5-flash",
"tags": "LLM,CHAT,1024K,IMAGE2TEXT", "tags": "LLM,CHAT,1024K,IMAGE2TEXT",

View file

@ -723,29 +723,80 @@ class GeminiCV(Base):
_FACTORY_NAME = "Gemini" _FACTORY_NAME = "Gemini"
def __init__(self, key, model_name="gemini-1.0-pro-vision-latest", lang="Chinese", **kwargs):
    """Create a Gemini vision wrapper bound to a single model.

    Args:
        key: API key; stored as ``self.api_key`` and passed to ``genai.Client``.
        model_name: Model identifier stored as ``self.model_name``.
        lang: Language stored as ``self.lang``.
        **kwargs: Forwarded unchanged to ``Base.__init__``.
    """
    # Local import: the SDK is loaded when an instance is created.
    from google import genai

    self.api_key, self.model_name, self.lang = key, model_name, lang
    self.client = genai.Client(api_key=key)
    Base.__init__(self, **kwargs)
    logging.info(f"[GeminiCV] Initialized with model={self.model_name} lang={self.lang}")
def _image_to_part(self, image):
    """Wrap an image in a ``types.Part`` carrying inline binary data.

    Args:
        image: A ``data:<mime>;base64,<payload>`` string is used as-is. Any
            other value is first passed through ``self.image2base64``, which
            is expected to return a string of that same form.

    Returns:
        A ``types.Part`` whose ``inline_data`` blob holds the base64-decoded
        bytes and the MIME type read from the data-URL header.
    """
    from google.genai import types

    already_data_url = isinstance(image, str) and image.startswith("data:") and ";base64," in image
    data_url = image if already_data_url else self.image2base64(image)

    # Header is everything before the first comma, payload everything after it.
    header, payload = data_url.split(",", 1)
    # "data:<mime>;base64" -> "<mime>"
    mime_type = header.split(":", 1)[1].split(";", 1)[0]
    blob = types.Blob(mime_type=mime_type, data=base64.b64decode(payload))
    return types.Part(inline_data=blob)
def _form_history(self, system, history, images=None):
    """Build the ``contents`` list for a Gemini request.

    The first entry is always a ``user`` turn that combines the system
    prompt (if any), the first history message (if any, regardless of its
    own role) and every image that could be converted. Remaining history
    messages follow one per entry, with ``assistant`` mapped to ``model``
    and every other role mapped to ``user``.

    Args:
        system: System prompt text; skipped when falsy.
        history: List of dicts read through ``.get("role")`` and
            ``.get("content")``; may be ``None`` or empty.
        images: Optional list of images accepted by ``_image_to_part``.

    Returns:
        A list of ``types.Content`` objects; empty when there is no system
        prompt, no history and no usable image.
    """
    from google.genai import types

    contents = []
    images = images or []
    system_len = len(system) if isinstance(system, str) else 0
    history_len = len(history) if history else 0
    images_len = len(images)
    logging.info(f"[GeminiCV] _form_history called: system_len={system_len} history_len={history_len} images_len={images_len}")

    image_parts = []
    for index, img in enumerate(images):
        try:
            image_parts.append(self._image_to_part(img))
        except Exception as e:
            # Best effort: one unreadable image must not abort the request,
            # but the drop is logged so it does not go unnoticed.
            logging.warning(f"[GeminiCV] _form_history skipped image {index}: {e}")

    remaining_history = history or []
    if system or remaining_history:
        parts = []
        if system:
            parts.append(types.Part(text=system))
        if remaining_history:
            # The first message is folded into the opening user turn.
            first = remaining_history[0]
            parts.append(types.Part(text=first.get("content", "")))
            remaining_history = remaining_history[1:]
        parts.extend(image_parts)
        contents.append(types.Content(role="user", parts=parts))
    elif image_parts:
        contents.append(types.Content(role="user", parts=image_parts))

    role_map = {"user": "user", "assistant": "model", "system": "user"}
    for h in remaining_history:
        # Unknown or missing roles fall back to "user".
        role = role_map.get(h.get("role"), "user")
        contents.append(
            types.Content(
                role=role,
                parts=[types.Part(text=h.get("content", ""))],
            )
        )
    return contents
def describe(self, image): def describe(self, image):
from PIL.Image import open from google.genai import types
prompt = ( prompt = (
"请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
@ -753,74 +804,106 @@ class GeminiCV(Base):
else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out." else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out."
) )
if image is bytes: contents = [
with BytesIO(image) as bio: types.Content(
with open(bio) as img: role="user",
input = [prompt, img] parts=[
res = self.model.generate_content(input) types.Part(text=prompt),
return res.text, total_token_count_from_response(res) self._image_to_part(image),
else: ],
b64 = self.image2base64_rawvalue(image) )
with BytesIO(base64.b64decode(b64)) as bio: ]
with open(bio) as img:
input = [prompt, img] res = self.client.models.generate_content(
res = self.model.generate_content(input) model=self.model_name,
return res.text, total_token_count_from_response(res) contents=contents,
)
return res.text, total_token_count_from_response(res)
def describe_with_prompt(self, image, prompt=None):
    """Ask the model to describe an image using a caller-supplied prompt.

    Args:
        image: Image accepted by ``_image_to_part``.
        prompt: Prompt text; when falsy, ``vision_llm_describe_prompt()``
            supplies the prompt instead.

    Returns:
        A ``(text, token_count)`` tuple: the response text and the value
        returned by ``total_token_count_from_response`` for that response.
    """
    from google.genai import types

    vision_prompt = prompt or vision_llm_describe_prompt()

    # A single user turn: the prompt text followed by the inline image.
    text_part = types.Part(text=vision_prompt)
    image_part = self._image_to_part(image)
    user_turn = types.Content(role="user", parts=[text_part, image_part])

    res = self.client.models.generate_content(model=self.model_name, contents=[user_turn])
    return res.text, total_token_count_from_response(res)
def chat(self, system, history, gen_conf, images=None, video_bytes=None, filename="", **kwargs):
    """Run one non-streaming chat turn, or summarise a video when one is given.

    Args:
        system: System prompt text.
        history: List of message dicts, passed to ``_form_history``.
        gen_conf: Dict read for ``temperature`` (default 0.3) and ``top_p``
            (default 0.7).
        images: Optional list of images, passed to ``_form_history``.
        video_bytes: When truthy, the call is delegated to
            ``_process_video`` and the chat inputs are ignored.
        filename: File name forwarded to ``_process_video``.
        **kwargs: Accepted and ignored.

    Returns:
        A ``(text, token_count)`` tuple. On failure the text is
        ``"**ERROR**: <message>"`` and the count is 0; nothing is raised
        from the guarded calls.
    """
    if video_bytes:
        try:
            logging.info(f"[GeminiCV] chat called with video: filename={filename} size={len(video_bytes)}")
            summary, summary_num_tokens = self._process_video(video_bytes, filename)
            return summary, summary_num_tokens
        except Exception as e:
            # Logged at WARNING, like the other error paths in this class.
            logging.warning(f"[GeminiCV] chat video error: {e}")
            return "**ERROR**: " + str(e), 0

    from google.genai import types

    history_len = len(history) if history else 0
    images_len = len(images) if images else 0
    logging.info(f"[GeminiCV] chat called: history_len={history_len} images_len={images_len} gen_conf={gen_conf}")

    generation_config = types.GenerateContentConfig(
        temperature=gen_conf.get("temperature", 0.3),
        top_p=gen_conf.get("top_p", 0.7),
    )

    try:
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=self._form_history(system, history, images),
            config=generation_config,
        )
        ans = response.text
        logging.info("[GeminiCV] chat completed")
        return ans, total_token_count_from_response(response)
    except Exception as e:
        logging.warning(f"[GeminiCV] chat error: {e}")
        return "**ERROR**: " + str(e), 0
def chat_streamly(self, system, history, gen_conf, images=None, **kwargs):
    """Stream one chat turn as text deltas, then yield a token count.

    Args:
        system: System prompt text.
        history: List of message dicts, passed to ``_form_history``.
        gen_conf: Dict read for ``temperature`` (default 0.3) and ``top_p``
            (default 0.7).
        images: Optional list of images, passed to ``_form_history``.
        **kwargs: Accepted and ignored.

    Yields:
        Each non-empty text chunk as it arrives. If an exception occurs,
        one string made of the text accumulated so far plus
        ``"\\n**ERROR**: <message>"``. The last item yielded is always the
        result of ``total_token_count_from_response``.
    """
    ans = ""
    response = None
    try:
        from google.genai import types

        generation_config = types.GenerateContentConfig(
            temperature=gen_conf.get("temperature", 0.3),
            top_p=gen_conf.get("top_p", 0.7),
        )
        history_len = len(history) if history else 0
        images_len = len(images) if images else 0
        logging.info(f"[GeminiCV] chat_streamly called: history_len={history_len} images_len={images_len} gen_conf={gen_conf}")

        response_stream = self.client.models.generate_content_stream(
            model=self.model_name,
            contents=self._form_history(system, history, images),
            config=generation_config,
        )

        for chunk in response_stream:
            # Keep the latest chunk so the final token count is computed from
            # a real response object instead of None.
            # NOTE(review): assumes the last streamed chunk carries the usage
            # metadata - confirm against the SDK.
            response = chunk
            text = chunk.text
            if text:
                ans += text
                yield text
        logging.info("[GeminiCV] chat_streamly completed")
    except Exception as e:
        logging.warning(f"[GeminiCV] chat_streamly error: {e}")
        yield ans + "\n**ERROR**: " + str(e)

    yield total_token_count_from_response(response)
@ -830,7 +913,8 @@ class GeminiCV(Base):
from google.genai import types from google.genai import types
video_size_mb = len(video_bytes) / (1024 * 1024) video_size_mb = len(video_bytes) / (1024 * 1024)
client = genai.Client(api_key=self.api_key) client = self.client if hasattr(self, "client") else genai.Client(api_key=self.api_key)
logging.info(f"[GeminiCV] _process_video called: filename={filename} size_mb={video_size_mb:.2f}")
tmp_path = None tmp_path = None
try: try:
@ -856,10 +940,10 @@ class GeminiCV(Base):
) )
summary = response.text or "" summary = response.text or ""
logging.info(f"Video summarized: {summary[:32]}...") logging.info(f"[GeminiCV] Video summarized: {summary[:32]}...")
return summary, num_tokens_from_string(summary) return summary, num_tokens_from_string(summary)
except Exception as e: except Exception as e:
logging.error(f"Video processing failed: {e}") logging.warning(f"[GeminiCV] Video processing failed: {e}")
raise raise
finally: finally:
if tmp_path and tmp_path.exists(): if tmp_path and tmp_path.exists():