extended to use gemini, switched to use gemini-flash-latest

parent c0f69395c7
commit 0b3d31507e

10 changed files with 429 additions and 5 deletions
@@ -120,6 +120,8 @@ cp env.example .env
 docker compose up
 ```
 
+> Tip: When targeting Google Gemini, set `LLM_BINDING=gemini`, choose a model such as `LLM_MODEL=gemini-flash-latest`, and provide your Gemini key via `LLM_BINDING_API_KEY` (or `GEMINI_API_KEY`). The server now understands this binding out of the box.
+
 > Historical versions of LightRAG docker images can be found here: [LightRAG Docker Images]( https://github.com/HKUDS/LightRAG/pkgs/container/lightrag)
 
 ### Install LightRAG Core
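If you want to sanity-check the new binding end to end, the sketch below sends one query to a locally running server started with the Gemini settings above. It is a minimal sketch only: the port (9621) and the `/query` payload follow the commonly documented LightRAG API defaults and may differ in your deployment.

```python
# Hedged sketch: assumes a LightRAG server configured with LLM_BINDING=gemini
# is already running on localhost:9621 (adjust host, port, and payload as needed).
import json
import urllib.request

payload = {"query": "What does the ingested corpus say about Gemini?", "mode": "hybrid"}
request = urllib.request.Request(
    "http://localhost:9621/query",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as response:
    print(json.loads(response.read().decode("utf-8")))
```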
env.example (10 changes)

@@ -154,7 +154,7 @@ MAX_PARALLEL_INSERT=2
 ###########################################################
 ### LLM Configuration
-### LLM_BINDING type: openai, ollama, lollms, azure_openai, aws_bedrock
+### LLM_BINDING type: openai, ollama, lollms, azure_openai, aws_bedrock, gemini
 ###########################################################
 ### LLM request timeout setting for all llm (0 means no timeout for Ollma)
 # LLM_TIMEOUT=180

@@ -174,6 +174,14 @@ LLM_BINDING_API_KEY=your_api_key
 # LLM_BINDING_API_KEY=your_api_key
 # LLM_BINDING=openai
 
+### Gemini example
+# LLM_BINDING=gemini
+# LLM_MODEL=gemini-flash-latest
+# LLM_BINDING_HOST=https://generativelanguage.googleapis.com
+# LLM_BINDING_API_KEY=your_gemini_api_key
+# GEMINI_LLM_MAX_OUTPUT_TOKENS=8192
+# GEMINI_LLM_TEMPERATURE=0.7
+
 ### OpenAI Compatible API Specific Parameters
 ### Increased temperature values may mitigate infinite inference loops in certain LLM, such as Qwen3-30B.
 # OPENAI_LLM_TEMPERATURE=0.9
@@ -8,6 +8,7 @@ import logging
 from dotenv import load_dotenv
 from lightrag.utils import get_env_value
 from lightrag.llm.binding_options import (
+    GeminiLLMOptions,
     OllamaEmbeddingOptions,
     OllamaLLMOptions,
     OpenAILLMOptions,

@@ -63,6 +64,9 @@ def get_default_host(binding_type: str) -> str:
         "lollms": os.getenv("LLM_BINDING_HOST", "http://localhost:9600"),
         "azure_openai": os.getenv("AZURE_OPENAI_ENDPOINT", "https://api.openai.com/v1"),
         "openai": os.getenv("LLM_BINDING_HOST", "https://api.openai.com/v1"),
+        "gemini": os.getenv(
+            "LLM_BINDING_HOST", "https://generativelanguage.googleapis.com"
+        ),
     }
     return default_hosts.get(
         binding_type, os.getenv("LLM_BINDING_HOST", "http://localhost:11434")

@@ -226,6 +230,7 @@ def parse_args() -> argparse.Namespace:
             "openai-ollama",
             "azure_openai",
             "aws_bedrock",
+            "gemini",
         ],
         help="LLM binding type (default: from env or ollama)",
     )

@@ -281,6 +286,16 @@ def parse_args() -> argparse.Namespace:
     elif os.environ.get("LLM_BINDING") in ["openai", "azure_openai"]:
         OpenAILLMOptions.add_args(parser)
 
+    if "--llm-binding" in sys.argv:
+        try:
+            idx = sys.argv.index("--llm-binding")
+            if idx + 1 < len(sys.argv) and sys.argv[idx + 1] == "gemini":
+                GeminiLLMOptions.add_args(parser)
+        except IndexError:
+            pass
+    elif os.environ.get("LLM_BINDING") == "gemini":
+        GeminiLLMOptions.add_args(parser)
+
     args = parser.parse_args()
 
     # convert relative path to absolute path
@@ -104,6 +104,7 @@ class LLMConfigCache:
 
         # Initialize configurations based on binding conditions
         self.openai_llm_options = None
+        self.gemini_llm_options = None
         self.ollama_llm_options = None
         self.ollama_embedding_options = None

@@ -114,6 +115,12 @@ class LLMConfigCache:
             self.openai_llm_options = OpenAILLMOptions.options_dict(args)
             logger.info(f"OpenAI LLM Options: {self.openai_llm_options}")
 
+        if args.llm_binding == "gemini":
+            from lightrag.llm.binding_options import GeminiLLMOptions
+
+            self.gemini_llm_options = GeminiLLMOptions.options_dict(args)
+            logger.info(f"Gemini LLM Options: {self.gemini_llm_options}")
+
         # Only initialize and log Ollama LLM options when using Ollama LLM binding
         if args.llm_binding == "ollama":
             try:

@@ -282,6 +289,7 @@ def create_app(args):
         "openai",
         "azure_openai",
         "aws_bedrock",
+        "gemini",
     ]:
         raise Exception("llm binding not supported")

@@ -500,6 +508,42 @@ def create_app(args):
 
         return optimized_azure_openai_model_complete
 
+    def create_optimized_gemini_llm_func(
+        config_cache: LLMConfigCache, args
+    ):
+        """Create optimized Gemini LLM function with cached configuration"""
+
+        async def optimized_gemini_model_complete(
+            prompt,
+            system_prompt=None,
+            history_messages=None,
+            keyword_extraction=False,
+            **kwargs,
+        ) -> str:
+            from lightrag.llm.gemini import gemini_complete_if_cache
+
+            if history_messages is None:
+                history_messages = []
+
+            if (
+                config_cache.gemini_llm_options is not None
+                and "generation_config" not in kwargs
+            ):
+                kwargs["generation_config"] = dict(config_cache.gemini_llm_options)
+
+            return await gemini_complete_if_cache(
+                args.llm_model,
+                prompt,
+                system_prompt=system_prompt,
+                history_messages=history_messages,
+                api_key=args.llm_binding_api_key,
+                base_url=args.llm_binding_host,
+                keyword_extraction=keyword_extraction,
+                **kwargs,
+            )
+
+        return optimized_gemini_model_complete
+
     def create_llm_model_func(binding: str):
         """
         Create LLM model function based on binding type.

@@ -521,6 +565,8 @@ def create_app(args):
             return create_optimized_azure_openai_llm_func(
                 config_cache, args, llm_timeout
             )
+        elif binding == "gemini":
+            return create_optimized_gemini_llm_func(config_cache, args)
         else: # openai and compatible
             # Use optimized function with pre-processed configuration
             return create_optimized_openai_llm_func(config_cache, args, llm_timeout)
@@ -9,12 +9,26 @@ from argparse import ArgumentParser, Namespace
 import argparse
 import json
 from dataclasses import asdict, dataclass, field
-from typing import Any, ClassVar, List
+from typing import Any, ClassVar, List, get_args, get_origin
 
 from lightrag.utils import get_env_value
 from lightrag.constants import DEFAULT_TEMPERATURE
 
+
+def _resolve_optional_type(field_type: Any) -> Any:
+    """Return the concrete type for Optional/Union annotations."""
+    origin = get_origin(field_type)
+    if origin in (list, dict, tuple):
+        return field_type
+
+    args = get_args(field_type)
+    if args:
+        non_none_args = [arg for arg in args if arg is not type(None)]
+        if len(non_none_args) == 1:
+            return non_none_args[0]
+    return field_type
+
 
 # =============================================================================
 # BindingOptions Base Class
 # =============================================================================

@@ -177,9 +191,13 @@ class BindingOptions:
                 help=arg_item["help"],
             )
         else:
+            resolved_type = arg_item["type"]
+            if resolved_type is not None:
+                resolved_type = _resolve_optional_type(resolved_type)
+
             group.add_argument(
                 f"--{arg_item['argname']}",
-                type=arg_item["type"],
+                type=resolved_type,
                 default=get_env_value(f"{arg_item['env_name']}", argparse.SUPPRESS),
                 help=arg_item["help"],
             )

@@ -210,7 +228,7 @@ class BindingOptions:
             argdef = {
                 "argname": f"{args_prefix}-{field.name}",
                 "env_name": f"{env_var_prefix}{field.name.upper()}",
-                "type": field.type,
+                "type": _resolve_optional_type(field.type),
                 "default": default_value,
                 "help": f"{cls._binding_name} -- " + help.get(field.name, ""),
             }

@@ -454,6 +472,39 @@ class OllamaLLMOptions(_OllamaOptionsMixin, BindingOptions):
     _binding_name: ClassVar[str] = "ollama_llm"
 
 
+@dataclass
+class GeminiLLMOptions(BindingOptions):
+    """Options for Google Gemini models."""
+
+    _binding_name: ClassVar[str] = "gemini_llm"
+
+    temperature: float = DEFAULT_TEMPERATURE
+    top_p: float = 0.95
+    top_k: int = 40
+    max_output_tokens: int | None = None
+    candidate_count: int = 1
+    presence_penalty: float = 0.0
+    frequency_penalty: float = 0.0
+    stop_sequences: List[str] = field(default_factory=list)
+    response_mime_type: str | None = None
+    safety_settings: dict | None = None
+    system_instruction: str | None = None
+
+    _help: ClassVar[dict[str, str]] = {
+        "temperature": "Controls randomness (0.0-2.0, higher = more creative)",
+        "top_p": "Nucleus sampling parameter (0.0-1.0)",
+        "top_k": "Limits sampling to the top K tokens (1 disables the limit)",
+        "max_output_tokens": "Maximum tokens generated in the response",
+        "candidate_count": "Number of candidates returned per request",
+        "presence_penalty": "Penalty for token presence (-2.0 to 2.0)",
+        "frequency_penalty": "Penalty for token frequency (-2.0 to 2.0)",
+        "stop_sequences": 'Stop sequences (JSON array of strings, e.g., \'["END"]\')',
+        "response_mime_type": "Desired MIME type for the response (e.g., application/json)",
+        "safety_settings": "JSON object with Gemini safety settings overrides",
+        "system_instruction": "Default system instruction applied to every request",
+    }
+
+
 # =============================================================================
 # Binding Options for OpenAI
 # =============================================================================
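As a quick orientation for the new dataclass, here is a minimal sketch that exercises only the `add_args`/`options_dict` interface already used elsewhere in this commit; the environment variable comes from the `env.example` hunk above, everything else is illustrative.

```python
# Minimal sketch: GEMINI_LLM_* environment variables feed the gemini_llm option
# group, and options_dict() turns the parsed args back into the plain dict the
# server later passes to the Gemini binding as generation_config.
import argparse
import os

from lightrag.llm.binding_options import GeminiLLMOptions

os.environ.setdefault("GEMINI_LLM_TEMPERATURE", "0.7")  # same key as env.example

parser = argparse.ArgumentParser()
GeminiLLMOptions.add_args(parser)   # registers the gemini_llm option group
args = parser.parse_args([])        # no CLI flags; env values and defaults apply
print(GeminiLLMOptions.options_dict(args))
```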
lightrag/llm/gemini.py (new file, 297 additions)

@@ -0,0 +1,297 @@
"""
Gemini LLM binding for LightRAG.

This module provides asynchronous helpers that adapt Google's Gemini models
to the same interface used by the rest of the LightRAG LLM bindings. The
implementation mirrors the OpenAI helpers while relying on the official
``google-genai`` client under the hood.
"""

from __future__ import annotations

import asyncio
import logging
import os
from collections.abc import AsyncIterator
from functools import lru_cache
from typing import Any

from lightrag.utils import logger, remove_think_tags, safe_unicode_decode

import pipmaster as pm

# Install the Google Gemini client on demand
if not pm.is_installed("google-genai"):
    pm.install("google-genai")

from google import genai  # type: ignore
from google.genai import types  # type: ignore

DEFAULT_GEMINI_ENDPOINT = "https://generativelanguage.googleapis.com"

LOG = logging.getLogger(__name__)


@lru_cache(maxsize=8)
def _get_gemini_client(api_key: str, base_url: str | None) -> genai.Client:
    """
    Create (or fetch cached) Gemini client.

    Args:
        api_key: Google Gemini API key.
        base_url: Optional custom API endpoint.

    Returns:
        genai.Client: Configured Gemini client instance.
    """
    client_kwargs: dict[str, Any] = {"api_key": api_key}

    if base_url and base_url != DEFAULT_GEMINI_ENDPOINT:
        try:
            client_kwargs["http_options"] = types.HttpOptions(api_endpoint=base_url)
        except Exception as exc:  # pragma: no cover - defensive
            LOG.warning("Failed to apply custom Gemini endpoint %s: %s", base_url, exc)

    try:
        return genai.Client(**client_kwargs)
    except TypeError:
        # Older google-genai releases don't accept http_options; retry without it.
        client_kwargs.pop("http_options", None)
        return genai.Client(**client_kwargs)


def _ensure_api_key(api_key: str | None) -> str:
    key = api_key or os.getenv("LLM_BINDING_API_KEY") or os.getenv("GEMINI_API_KEY")
    if not key:
        raise ValueError(
            "Gemini API key not provided. "
            "Set LLM_BINDING_API_KEY or GEMINI_API_KEY in the environment."
        )
    return key


def _build_generation_config(
    base_config: dict[str, Any] | None,
    system_prompt: str | None,
    keyword_extraction: bool,
) -> types.GenerateContentConfig | None:
    config_data = dict(base_config or {})

    if system_prompt:
        if config_data.get("system_instruction"):
            config_data["system_instruction"] = (
                f"{config_data['system_instruction']}\n{system_prompt}"
            )
        else:
            config_data["system_instruction"] = system_prompt

    if keyword_extraction and not config_data.get("response_mime_type"):
        config_data["response_mime_type"] = "application/json"

    # Remove entries that are explicitly set to None to avoid type errors
    sanitized = {
        key: value
        for key, value in config_data.items()
        if value is not None and value != ""
    }

    if not sanitized:
        return None

    return types.GenerateContentConfig(**sanitized)


def _format_history_messages(history_messages: list[dict[str, Any]] | None) -> str:
    if not history_messages:
        return ""

    history_lines: list[str] = []
    for message in history_messages:
        role = message.get("role", "user")
        content = message.get("content", "")
        history_lines.append(f"[{role}] {content}")

    return "\n".join(history_lines)


def _extract_response_text(response: Any) -> str:
    if getattr(response, "text", None):
        return response.text

    candidates = getattr(response, "candidates", None)
    if not candidates:
        return ""

    parts: list[str] = []
    for candidate in candidates:
        if not getattr(candidate, "content", None):
            continue
        for part in getattr(candidate.content, "parts", []):
            text = getattr(part, "text", None)
            if text:
                parts.append(text)

    return "\n".join(parts)


async def gemini_complete_if_cache(
    model: str,
    prompt: str,
    system_prompt: str | None = None,
    history_messages: list[dict[str, Any]] | None = None,
    *,
    api_key: str | None = None,
    base_url: str | None = None,
    generation_config: dict[str, Any] | None = None,
    keyword_extraction: bool = False,
    token_tracker: Any | None = None,
    hashing_kv: Any | None = None,  # noqa: ARG001 - present for interface parity
    stream: bool | None = None,
    enable_cot: bool = False,  # noqa: ARG001 - not supported by Gemini currently
    timeout: float | None = None,  # noqa: ARG001 - handled by caller if needed
    **_: Any,
) -> str | AsyncIterator[str]:
    loop = asyncio.get_running_loop()

    key = _ensure_api_key(api_key)
    client = _get_gemini_client(key, base_url)

    history_block = _format_history_messages(history_messages)
    prompt_sections = []
    if history_block:
        prompt_sections.append(history_block)
    prompt_sections.append(f"[user] {prompt}")
    combined_prompt = "\n".join(prompt_sections)

    config_obj = _build_generation_config(
        generation_config,
        system_prompt=system_prompt,
        keyword_extraction=keyword_extraction,
    )

    request_kwargs: dict[str, Any] = {
        "model": model,
        "contents": [combined_prompt],
    }
    if config_obj is not None:
        request_kwargs["config"] = config_obj

    def _call_model():
        return client.models.generate_content(**request_kwargs)

    if stream:
        queue: asyncio.Queue[Any] = asyncio.Queue()
        usage_container: dict[str, Any] = {}

        def _stream_model() -> None:
            try:
                stream_kwargs = dict(request_kwargs)
                stream_iterator = client.models.generate_content_stream(**stream_kwargs)
                for chunk in stream_iterator:
                    usage = getattr(chunk, "usage_metadata", None)
                    if usage is not None:
                        usage_container["usage"] = usage
                    text_piece = getattr(chunk, "text", None) or _extract_response_text(chunk)
                    if text_piece:
                        loop.call_soon_threadsafe(queue.put_nowait, text_piece)
                loop.call_soon_threadsafe(queue.put_nowait, None)
            except Exception as exc:  # pragma: no cover - surface runtime issues
                loop.call_soon_threadsafe(queue.put_nowait, exc)

        loop.run_in_executor(None, _stream_model)

        async def _async_stream() -> AsyncIterator[str]:
            accumulated = ""
            emitted = ""
            try:
                while True:
                    item = await queue.get()
                    if item is None:
                        break
                    if isinstance(item, Exception):
                        raise item

                    chunk_text = str(item)
                    if "\\u" in chunk_text:
                        chunk_text = safe_unicode_decode(chunk_text.encode("utf-8"))

                    accumulated += chunk_text
                    sanitized = remove_think_tags(accumulated)
                    if sanitized.startswith(emitted):
                        delta = sanitized[len(emitted) :]
                    else:
                        delta = sanitized
                    emitted = sanitized

                    if delta:
                        yield delta
            finally:
                usage = usage_container.get("usage")
                if token_tracker and usage:
                    token_tracker.add_usage(
                        {
                            "prompt_tokens": getattr(usage, "prompt_token_count", 0),
                            "completion_tokens": getattr(
                                usage, "candidates_token_count", 0
                            ),
                            "total_tokens": getattr(usage, "total_token_count", 0),
                        }
                    )

        return _async_stream()

    response = await asyncio.to_thread(_call_model)

    text = _extract_response_text(response)
    if not text:
        raise RuntimeError("Gemini response did not contain any text content.")

    if "\\u" in text:
        text = safe_unicode_decode(text.encode("utf-8"))

    text = remove_think_tags(text)

    usage = getattr(response, "usage_metadata", None)
    if token_tracker and usage:
        token_tracker.add_usage(
            {
                "prompt_tokens": getattr(usage, "prompt_token_count", 0),
                "completion_tokens": getattr(usage, "candidates_token_count", 0),
                "total_tokens": getattr(usage, "total_token_count", 0),
            }
        )

    logger.debug("Gemini response length: %s", len(text))
    return text


async def gemini_model_complete(
    prompt: str,
    system_prompt: str | None = None,
    history_messages: list[dict[str, Any]] | None = None,
    keyword_extraction: bool = False,
    **kwargs: Any,
) -> str | AsyncIterator[str]:
    hashing_kv = kwargs.get("hashing_kv")
    model_name = None
    if hashing_kv is not None:
        model_name = hashing_kv.global_config.get("llm_model_name")
    if model_name is None:
        model_name = kwargs.pop("model_name", None)
    if model_name is None:
        raise ValueError("Gemini model name not provided in configuration.")

    return await gemini_complete_if_cache(
        model_name,
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        keyword_extraction=keyword_extraction,
        **kwargs,
    )


__all__ = [
    "gemini_complete_if_cache",
    "gemini_model_complete",
]
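For completeness, a small hedged sketch of calling the helper above directly (outside the server); the model name and prompt are placeholders, and the API key is resolved from `GEMINI_API_KEY` or `LLM_BINDING_API_KEY` exactly as `_ensure_api_key` does.

```python
# Hedged sketch: requires GEMINI_API_KEY (or LLM_BINDING_API_KEY) in the environment.
import asyncio

from lightrag.llm.gemini import gemini_complete_if_cache


async def main() -> None:
    answer = await gemini_complete_if_cache(
        "gemini-flash-latest",                  # model used elsewhere in this commit
        "Summarize LightRAG in one sentence.",  # placeholder prompt
        system_prompt="You are a concise assistant.",
        generation_config={"temperature": 0.7, "max_output_tokens": 256},
    )
    print(answer)


asyncio.run(main())
```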
@@ -1783,7 +1783,7 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
     - Filter out short numeric-only text (length < 3 and only digits/dots)
     - remove_inner_quotes = True
         remove Chinese quotes
-        remove English queotes in and around chinese
+        remove English quotes in and around chinese
         Convert non-breaking spaces to regular spaces
         Convert narrow non-breaking spaces after non-digits to regular spaces
@@ -24,6 +24,7 @@ dependencies = [
     "aiohttp",
     "configparser",
     "future",
+    "google-genai>=1.0.0,<2.0.0",
     "json_repair",
     "nano-vectordb",
     "networkx",

@@ -59,6 +60,7 @@ api = [
     "tenacity",
     "tiktoken",
     "xlsxwriter>=3.1.0",
+    "google-genai>=1.0.0,<2.0.0",
     # API-specific dependencies
     "aiofiles",
     "ascii_colors",

@@ -105,6 +107,7 @@ offline-llm = [
     "aioboto3>=12.0.0,<16.0.0",
     "voyageai>=0.2.0,<1.0.0",
     "llama-index>=0.9.0,<1.0.0",
+    "google-genai>=1.0.0,<2.0.0",
 ]
 
 offline = [
@@ -13,5 +13,6 @@ anthropic>=0.18.0,<1.0.0
 llama-index>=0.9.0,<1.0.0
 ollama>=0.1.0,<1.0.0
 openai>=1.0.0,<2.0.0
+google-genai>=1.0.0,<2.0.0
 voyageai>=0.2.0,<1.0.0
 zhipuai>=2.0.0,<3.0.0
@@ -19,6 +19,7 @@ llama-index>=0.9.0,<1.0.0
 neo4j>=5.0.0,<7.0.0
 ollama>=0.1.0,<1.0.0
 openai>=1.0.0,<2.0.0
+google-genai>=1.0.0,<2.0.0
 openpyxl>=3.0.0,<4.0.0
 pymilvus>=2.6.2,<3.0.0
 pymongo>=4.0.0,<5.0.0