Merge pull request #468 from topoteretes/COG-970-refactor-tokenizing

COG-970: Refactor tokenizing
This commit is contained in:
Igor Ilic 2025-01-29 09:02:23 +01:00 committed by GitHub
commit d900060e2b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
44 changed files with 633 additions and 134 deletions

View file

@ -1,12 +1,28 @@
ENV="local"
TOKENIZERS_PARALLELISM="false"
LLM_API_KEY=
# LLM settings
LLM_API_KEY=""
LLM_MODEL="openai/gpt-4o-mini"
LLM_PROVIDER="openai"
LLM_ENDPOINT=""
LLM_API_VERSION=""
LLM_MAX_TOKENS="16384"
GRAPHISTRY_USERNAME=
GRAPHISTRY_PASSWORD=
SENTRY_REPORTING_URL=
# Embedding settings
EMBEDDING_PROVIDER="openai"
EMBEDDING_API_KEY=""
EMBEDDING_MODEL="openai/text-embedding-3-large"
EMBEDDING_ENDPOINT=""
EMBEDDING_API_VERSION=""
EMBEDDING_DIMENSIONS=3072
EMBEDDING_MAX_TOKENS=8191
# "neo4j" or "networkx"
GRAPH_DATABASE_PROVIDER="networkx"
# Not needed if using networkx

View file

@ -20,6 +20,7 @@ from cognee.tasks.repo_processor import (
from cognee.tasks.repo_processor.get_source_code_chunks import get_source_code_chunks
from cognee.tasks.storage import add_data_points
from cognee.tasks.summarization import summarize_code, summarize_text
from cognee.infrastructure.llm import get_max_chunk_tokens
monitoring = get_base_config().monitoring_tool
if monitoring == MonitoringTool.LANGFUSE:
@ -57,7 +58,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
Task(ingest_data, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
Task(extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()),
Task(
extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}
),

View file

@ -4,6 +4,7 @@ from typing import Union
from pydantic import BaseModel
from cognee.infrastructure.llm import get_max_chunk_tokens
from cognee.modules.cognify.config import get_cognify_config
from cognee.modules.data.methods import get_datasets, get_datasets_by_name
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
@ -151,7 +152,9 @@ async def get_default_tasks(
default_tasks = [
Task(classify_documents),
Task(check_permissions_on_documents, user=user, permissions=["write"]),
Task(extract_chunks_from_documents), # Extract text chunks based on the document type.
Task(
extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()
), # Extract text chunks based on the document type.
Task(
extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
), # Generate knowledge graphs from the document chunks.

View file

@ -6,6 +6,9 @@ import litellm
import os
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
from cognee.infrastructure.llm.tokenizer.Gemini import GeminiTokenizer
from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer
litellm.set_verbose = False
logger = logging.getLogger("LiteLLMEmbeddingEngine")
@ -15,23 +18,29 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
api_key: str
endpoint: str
api_version: str
provider: str
model: str
dimensions: int
mock: bool
def __init__(
self,
provider: str = "openai",
model: Optional[str] = "text-embedding-3-large",
dimensions: Optional[int] = 3072,
api_key: str = None,
endpoint: str = None,
api_version: str = None,
max_tokens: int = 512,
):
self.api_key = api_key
self.endpoint = endpoint
self.api_version = api_version
self.provider = provider
self.model = model
self.dimensions = dimensions
self.max_tokens = max_tokens
self.tokenizer = self.get_tokenizer()
enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
if isinstance(enable_mocking, bool):
@ -104,3 +113,18 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
def get_vector_size(self) -> int:
return self.dimensions
def get_tokenizer(self):
logger.debug(f"Loading tokenizer for model {self.model}...")
# If model also contains provider information, extract only model information
model = self.model.split("/")[-1]
if "openai" in self.provider.lower():
tokenizer = TikTokenTokenizer(model=model, max_tokens=self.max_tokens)
elif "gemini" in self.provider.lower():
tokenizer = GeminiTokenizer(model=model, max_tokens=self.max_tokens)
else:
tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
logger.debug(f"Tokenizer loaded for model: {self.model}")
return tokenizer
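
A minimal usage sketch of the refactored engine resolving its tokenizer from the provider string (assumes cognee and tiktoken are installed; the import path is inferred from the module layout above and the API key is a placeholder):

from cognee.infrastructure.databases.vector.embeddings.LiteLLMEmbeddingEngine import LiteLLMEmbeddingEngine

# provider "openai" -> TikTokenTokenizer, "gemini" -> GeminiTokenizer, anything else -> HuggingFaceTokenizer
engine = LiteLLMEmbeddingEngine(
    provider="openai",
    model="openai/text-embedding-3-large",
    dimensions=3072,
    api_key="sk-placeholder",  # illustrative only
    max_tokens=8191,
)
print(type(engine.tokenizer).__name__)               # TikTokenTokenizer
print(engine.tokenizer.count_tokens("hello world"))  # token count via tiktoken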

View file

@ -4,12 +4,13 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
class EmbeddingConfig(BaseSettings):
embedding_model: Optional[str] = "text-embedding-3-large"
embedding_provider: Optional[str] = "openai"
embedding_model: Optional[str] = "openai/text-embedding-3-large"
embedding_dimensions: Optional[int] = 3072
embedding_endpoint: Optional[str] = None
embedding_api_key: Optional[str] = None
embedding_api_version: Optional[str] = None
embedding_max_tokens: Optional[int] = 8191
model_config = SettingsConfigDict(env_file=".env", extra="allow")

View file

@ -10,9 +10,11 @@ def get_embedding_engine() -> EmbeddingEngine:
return LiteLLMEmbeddingEngine(
# If OpenAI API is used for embeddings, litellm needs only the api_key.
provider=config.embedding_provider,
api_key=config.embedding_api_key or llm_config.llm_api_key,
endpoint=config.embedding_endpoint,
api_version=config.embedding_api_version,
model=config.embedding_model,
dimensions=config.embedding_dimensions,
max_tokens=config.embedding_max_tokens,
)

View file

@ -1 +1,2 @@
from .config import get_llm_config
from .utils import get_max_chunk_tokens

View file

@ -14,11 +14,12 @@ class AnthropicAdapter(LLMInterface):
name = "Anthropic"
model: str
def __init__(self, model: str = None):
def __init__(self, max_tokens: int, model: str = None):
self.aclient = instructor.patch(
create=anthropic.Anthropic().messages.create, mode=instructor.Mode.ANTHROPIC_TOOLS
)
self.model = model
self.max_tokens = max_tokens
async def acreate_structured_output(
self, text_input: str, system_prompt: str, response_model: Type[BaseModel]

View file

@ -11,6 +11,7 @@ class LLMConfig(BaseSettings):
llm_api_version: Optional[str] = None
llm_temperature: float = 0.0
llm_streaming: bool = False
llm_max_tokens: int = 16384
transcription_model: str = "whisper-1"
model_config = SettingsConfigDict(env_file=".env", extra="allow")
@ -24,6 +25,7 @@ class LLMConfig(BaseSettings):
"api_version": self.llm_api_version,
"temperature": self.llm_temperature,
"streaming": self.llm_streaming,
"max_tokens": self.llm_max_tokens,
"transcription_model": self.transcription_model,
}

View file

@ -2,6 +2,7 @@
import asyncio
from typing import List, Type
from pydantic import BaseModel
import instructor
from cognee.infrastructure.llm.llm_interface import LLMInterface
@ -16,11 +17,12 @@ class GenericAPIAdapter(LLMInterface):
model: str
api_key: str
def __init__(self, endpoint, api_key: str, model: str, name: str):
def __init__(self, endpoint, api_key: str, model: str, name: str, max_tokens: int):
self.name = name
self.model = model
self.api_key = api_key
self.endpoint = endpoint
self.max_tokens = max_tokens
llm_config = get_llm_config()

View file

@ -20,6 +20,15 @@ def get_llm_client():
provider = LLMProvider(llm_config.llm_provider)
# Check if a max_tokens value is defined in LiteLLM for the given model;
# if not, use the value from the cognee configuration
from cognee.infrastructure.llm.utils import (
get_model_max_tokens,
) # imported here to avoid circular imports
model_max_tokens = get_model_max_tokens(llm_config.llm_model)
max_tokens = model_max_tokens if model_max_tokens else llm_config.llm_max_tokens
if provider == LLMProvider.OPENAI:
if llm_config.llm_api_key is None:
raise InvalidValueError(message="LLM API key is not set.")
@ -32,6 +41,7 @@ def get_llm_client():
api_version=llm_config.llm_api_version,
model=llm_config.llm_model,
transcription_model=llm_config.transcription_model,
max_tokens=max_tokens,
streaming=llm_config.llm_streaming,
)
@ -42,13 +52,17 @@ def get_llm_client():
from .generic_llm_api.adapter import GenericAPIAdapter
return GenericAPIAdapter(
llm_config.llm_endpoint, llm_config.llm_api_key, llm_config.llm_model, "Ollama"
llm_config.llm_endpoint,
llm_config.llm_api_key,
llm_config.llm_model,
"Ollama",
max_tokens=max_tokens,
)
elif provider == LLMProvider.ANTHROPIC:
from .anthropic.adapter import AnthropicAdapter
return AnthropicAdapter(llm_config.llm_model)
return AnthropicAdapter(max_tokens=max_tokens, model=llm_config.llm_model)
elif provider == LLMProvider.CUSTOM:
if llm_config.llm_api_key is None:
@ -57,7 +71,11 @@ def get_llm_client():
from .generic_llm_api.adapter import GenericAPIAdapter
return GenericAPIAdapter(
llm_config.llm_endpoint, llm_config.llm_api_key, llm_config.llm_model, "Custom"
llm_config.llm_endpoint,
llm_config.llm_api_key,
llm_config.llm_model,
"Custom",
max_tokens=max_tokens,
)
else:
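
The max_tokens fallback above, shown in isolation (a rough sketch; assumes litellm is installed and that the custom model name is illustrative and absent from model_cost):

import litellm

def resolve_max_tokens(model_name: str, configured_default: int = 16384) -> int:
    # Prefer the limit LiteLLM knows for the model, otherwise fall back to the cognee config value
    known = litellm.model_cost.get(model_name, {}).get("max_tokens")
    return known if known else configured_default

print(resolve_max_tokens("my-org/custom-model"))  # 16384 (falls back to LLM_MAX_TOKENS)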

View file

@ -32,6 +32,7 @@ class OpenAIAdapter(LLMInterface):
api_version: str,
model: str,
transcription_model: str,
max_tokens: int,
streaming: bool = False,
):
self.aclient = instructor.from_litellm(litellm.acompletion)
@ -41,6 +42,7 @@ class OpenAIAdapter(LLMInterface):
self.api_key = api_key
self.endpoint = endpoint
self.api_version = api_version
self.max_tokens = max_tokens
self.streaming = streaming
@observe(as_type="generation")

View file

@ -0,0 +1 @@
from .adapter import GeminiTokenizer

View file

@ -0,0 +1,44 @@
from typing import List, Any
from ..tokenizer_interface import TokenizerInterface
class GeminiTokenizer(TokenizerInterface):
def __init__(
self,
model: str,
max_tokens: int = 3072,
):
self.model = model
self.max_tokens = max_tokens
# Get LLM API key from config
from cognee.infrastructure.databases.vector.embeddings.config import get_embedding_config
from cognee.infrastructure.llm.config import get_llm_config
config = get_embedding_config()
llm_config = get_llm_config()
import google.generativeai as genai
genai.configure(api_key=config.embedding_api_key or llm_config.llm_api_key)
def extract_tokens(self, text: str) -> List[Any]:
raise NotImplementedError
def count_tokens(self, text: str) -> int:
"""
Returns the number of tokens in the given text.
Args:
text: str
Returns:
number of tokens in the given text
"""
import google.generativeai as genai
return len(genai.embed_content(model=f"models/{self.model}", content=text))
def trim_text_to_max_tokens(self, text: str) -> str:
raise NotImplementedError

View file

@ -0,0 +1 @@
from .adapter import HuggingFaceTokenizer

View file

@ -0,0 +1,36 @@
from typing import List, Any
from transformers import AutoTokenizer
from ..tokenizer_interface import TokenizerInterface
class HuggingFaceTokenizer(TokenizerInterface):
def __init__(
self,
model: str,
max_tokens: int = 512,
):
self.model = model
self.max_tokens = max_tokens
self.tokenizer = AutoTokenizer.from_pretrained(model)
def extract_tokens(self, text: str) -> List[Any]:
tokens = self.tokenizer.tokenize(text)
return tokens
def count_tokens(self, text: str) -> int:
"""
Returns the number of tokens in the given text.
Args:
text: str
Returns:
number of tokens in the given text
"""
return len(self.tokenizer.tokenize(text))
def trim_text_to_max_tokens(self, text: str) -> str:
raise NotImplementedError
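
A short usage sketch (assumes the transformers package and network access to download tokenizer files; the model name is only an example):

from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer

tokenizer = HuggingFaceTokenizer(model="bert-base-uncased", max_tokens=512)
print(tokenizer.count_tokens("Cognee splits documents into token-bounded chunks."))
print(tokenizer.extract_tokens("hello world"))  # subword tokens, e.g. ["hello", "world"]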

View file

@ -0,0 +1 @@
from .adapter import TikTokenTokenizer

View file

@ -0,0 +1,69 @@
from typing import List, Any
import tiktoken
from ..tokenizer_interface import TokenizerInterface
class TikTokenTokenizer(TokenizerInterface):
"""
Tokenizer adapter for OpenAI.
Intended to be used as part of the LLM embedding and LLM adapter classes
"""
def __init__(
self,
model: str,
max_tokens: int = 8191,
):
self.model = model
self.max_tokens = max_tokens
# Initialize TikToken for GPT based on model
self.tokenizer = tiktoken.encoding_for_model(self.model)
def extract_tokens(self, text: str) -> List[Any]:
tokens = []
# Using TikToken's method to tokenize text
token_ids = self.tokenizer.encode(text)
# Go through tokens and decode them to text value
for token_id in token_ids:
token = self.tokenizer.decode([token_id])
tokens.append(token)
return tokens
def count_tokens(self, text: str) -> int:
"""
Returns the number of tokens in the given text.
Args:
text: str
Returns:
number of tokens in the given text
"""
num_tokens = len(self.tokenizer.encode(text))
return num_tokens
def trim_text_to_max_tokens(self, text: str) -> str:
"""
Trims the text so that the number of tokens does not exceed max_tokens.
Args:
text (str): Original text string to be trimmed.
Returns:
str: Trimmed version of text or original text if under the limit.
"""
# First check the number of tokens
num_tokens = self.count_tokens(text)
# If the number of tokens is within the limit, return the text as is
if num_tokens <= self.max_tokens:
return text
# If the number exceeds the limit, trim the text
# This is a simple trim, it may cut words in half; consider using word boundaries for a cleaner cut
encoded_text = self.tokenizer.encode(text)
trimmed_encoded_text = encoded_text[: self.max_tokens]
# Decoding the trimmed text
trimmed_text = self.tokenizer.decode(trimmed_encoded_text)
return trimmed_text
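
The OpenAI path, for comparison (a minimal sketch; assumes tiktoken recognizes the model name):

from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer

tokenizer = TikTokenTokenizer(model="text-embedding-3-large", max_tokens=8191)
text = "Some document text " * 5000
print(tokenizer.count_tokens(text))     # well above the 8191 limit
trimmed = tokenizer.trim_text_to_max_tokens(text)
print(tokenizer.count_tokens(trimmed))  # at or just under the 8191 limit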

View file

@ -0,0 +1 @@
from .tokenizer_interface import TokenizerInterface

View file

@ -0,0 +1,18 @@
from typing import List, Protocol, Any
from abc import abstractmethod
class TokenizerInterface(Protocol):
"""Tokenizer interface"""
@abstractmethod
def extract_tokens(self, text: str) -> List[Any]:
raise NotImplementedError
@abstractmethod
def count_tokens(self, text: str) -> int:
raise NotImplementedError
@abstractmethod
def trim_text_to_max_tokens(self, text: str) -> str:
raise NotImplementedError
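
Because this is a Protocol, any class that provides these three methods satisfies it without inheriting; a toy whitespace tokenizer, purely illustrative (the import assumes the package __init__ above re-exports TokenizerInterface):

from typing import Any, List

from cognee.infrastructure.llm.tokenizer import TokenizerInterface

class WhitespaceTokenizer:
    def __init__(self, max_tokens: int = 512):
        self.max_tokens = max_tokens

    def extract_tokens(self, text: str) -> List[Any]:
        return text.split()

    def count_tokens(self, text: str) -> int:
        return len(text.split())

    def trim_text_to_max_tokens(self, text: str) -> str:
        return " ".join(text.split()[: self.max_tokens])

tokenizer: TokenizerInterface = WhitespaceTokenizer()
print(tokenizer.count_tokens("three word example"))  # 3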

View file

@ -0,0 +1,38 @@
import logging
import litellm
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.llm.get_llm_client import get_llm_client
logger = logging.getLogger(__name__)
def get_max_chunk_tokens():
# Calculate max chunk size based on the following formula
embedding_engine = get_vector_engine().embedding_engine
llm_client = get_llm_client()
# Make sure the chunk size won't take more than half of the LLM's max context token size,
# but also can't exceed the embedding engine's max token size
llm_cutoff_point = llm_client.max_tokens // 2 # Round down the division
max_chunk_tokens = min(embedding_engine.max_tokens, llm_cutoff_point)
return max_chunk_tokens
def get_model_max_tokens(model_name: str):
"""
Args:
model_name: name of the LLM or embedding model
Returns: the model's maximum token count, or None if the model is unknown
"""
max_tokens = None
if model_name in litellm.model_cost:
max_tokens = litellm.model_cost[model_name]["max_tokens"]
logger.debug(f"Max input tokens for {model_name}: {max_tokens}")
else:
logger.info("Model not found in LiteLLM's model_cost.")
return max_tokens
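
A worked example of the formula with illustrative numbers (an LLM reporting 16384 max tokens and the default 8191-token embedding limit):

llm_max_tokens = 16384         # from litellm.model_cost or LLM_MAX_TOKENS
embedding_max_tokens = 8191    # EMBEDDING_MAX_TOKENS

llm_cutoff_point = llm_max_tokens // 2                          # 8192, half the LLM context
max_chunk_tokens = min(embedding_max_tokens, llm_cutoff_point)  # 8191, bounded by the embedding model
print(max_chunk_tokens)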

View file

@ -14,17 +14,15 @@ class TextChunker:
chunk_size = 0
token_count = 0
def __init__(
self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024
):
def __init__(self, document, get_text: callable, max_chunk_tokens: int, chunk_size: int = 1024):
self.document = document
self.max_chunk_size = chunk_size
self.get_text = get_text
self.max_tokens = max_tokens if max_tokens else float("inf")
self.max_chunk_tokens = max_chunk_tokens
def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_chunk_tokens
return word_count_fits and token_count_fits
def read(self):
@ -32,7 +30,7 @@ class TextChunker:
for content_text in self.get_text():
for chunk_data in chunk_by_paragraph(
content_text,
self.max_tokens,
self.max_chunk_tokens,
self.max_chunk_size,
batch_paragraphs=True,
):

View file

@ -8,7 +8,6 @@ import os
class CognifyConfig(BaseSettings):
classification_model: object = DefaultContentPrediction
summarization_model: object = SummarizedContent
max_tokens: Optional[int] = os.getenv("MAX_TOKENS")
model_config = SettingsConfigDict(env_file=".env", extra="allow")
def to_dict(self) -> dict:

View file

@ -13,14 +13,14 @@ class AudioDocument(Document):
result = get_llm_client().create_transcript(self.raw_data_location)
return result.text
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int):
# Transcribe the audio file
text = self.create_transcript()
chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
)
yield from chunker.read()

View file

@ -11,5 +11,5 @@ class Document(DataPoint):
mime_type: str
_metadata: dict = {"index_fields": ["name"], "type": "Document"}
def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str:
def read(self, chunk_size: int, chunker=str, max_chunk_tokens: Optional[int] = None) -> str:
pass

View file

@ -13,13 +13,13 @@ class ImageDocument(Document):
result = get_llm_client().transcribe_image(self.raw_data_location)
return result.choices[0].message.content
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int):
# Transcribe the image file
text = self.transcribe_image()
chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=lambda: [text], max_tokens=max_tokens
self, chunk_size=chunk_size, get_text=lambda: [text], max_chunk_tokens=max_chunk_tokens
)
yield from chunker.read()

View file

@ -9,7 +9,7 @@ from .Document import Document
class PdfDocument(Document):
type: str = "pdf"
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int):
file = PdfReader(self.raw_data_location)
def get_text():
@ -19,7 +19,7 @@ class PdfDocument(Document):
chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
)
yield from chunker.read()

View file

@ -7,7 +7,7 @@ from .Document import Document
class TextDocument(Document):
type: str = "text"
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int):
def get_text():
with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
while True:
@ -21,7 +21,7 @@ class TextDocument(Document):
chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(
self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens
self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
)
yield from chunker.read()

View file

@ -10,7 +10,7 @@ from .Document import Document
class UnstructuredDocument(Document):
type: str = "unstructured"
def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str:
def read(self, chunk_size: int, chunker: str, max_chunk_tokens: int) -> str:
def get_text():
try:
from unstructured.partition.auto import partition
@ -29,6 +29,8 @@ class UnstructuredDocument(Document):
yield text
chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text, max_tokens=max_tokens)
chunker = TextChunker(
self, chunk_size=chunk_size, get_text=get_text, max_chunk_tokens=max_chunk_tokens
)
yield from chunker.read()

View file

@ -10,8 +10,6 @@ import graphistry
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import tiktoken
import time
import logging
import sys
@ -100,15 +98,6 @@ def send_telemetry(event_name: str, user_id, additional_properties: dict = {}):
print(f"Error sending telemetry through proxy: {response.status_code}")
def num_tokens_from_string(string: str, encoding_name: str) -> int:
"""Returns the number of tokens in a text string."""
# tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model(encoding_name)
num_tokens = len(encoding.encode(string))
return num_tokens
def get_file_content_hash(file_obj: Union[str, BinaryIO]) -> str:
h = hashlib.md5()
@ -134,34 +123,6 @@ def get_file_content_hash(file_obj: Union[str, BinaryIO]) -> str:
raise IngestionError(message=f"Failed to load data from {file}: {e}")
def trim_text_to_max_tokens(text: str, max_tokens: int, encoding_name: str) -> str:
"""
Trims the text so that the number of tokens does not exceed max_tokens.
Args:
text (str): Original text string to be trimmed.
max_tokens (int): Maximum number of tokens allowed.
encoding_name (str): The name of the token encoding to use.
Returns:
str: Trimmed version of text or original text if under the limit.
"""
# First check the number of tokens
num_tokens = num_tokens_from_string(text, encoding_name)
# If the number of tokens is within the limit, return the text as is
if num_tokens <= max_tokens:
return text
# If the number exceeds the limit, trim the text
# This is a simple trim, it may cut words in half; consider using word boundaries for a cleaner cut
encoded_text = tiktoken.get_encoding(encoding_name).encode(text)
trimmed_encoded_text = encoded_text[:max_tokens]
# Decoding the trimmed text
trimmed_text = tiktoken.get_encoding(encoding_name).decode(trimmed_encoded_text)
return trimmed_text
def generate_color_palette(unique_layers):
colormap = plt.cm.get_cmap("viridis", len(unique_layers))
colors = [colormap(i) for i in range(len(unique_layers))]

View file

@ -10,7 +10,7 @@ from .chunk_by_sentence import chunk_by_sentence
def chunk_by_paragraph(
data: str,
max_tokens: Optional[Union[int, float]] = None,
max_chunk_tokens,
paragraph_length: int = 1024,
batch_paragraphs: bool = True,
) -> Iterator[Dict[str, Any]]:
@ -30,8 +30,6 @@ def chunk_by_paragraph(
paragraph_ids = []
last_cut_type = None
current_token_count = 0
if not max_tokens:
max_tokens = float("inf")
vector_engine = get_vector_engine()
embedding_model = vector_engine.embedding_engine.model
@ -47,7 +45,7 @@ def chunk_by_paragraph(
if current_word_count > 0 and (
current_word_count + word_count > paragraph_length
or current_token_count + token_count > max_tokens
or current_token_count + token_count > max_chunk_tokens
):
# Yield current chunk
chunk_dict = {
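
A minimal call sketch for the new signature (assumes cognee's vector engine is already configured, since the function looks up the embedding model internally; the import path is assumed):

from cognee.tasks.chunks import chunk_by_paragraph

for chunk in chunk_by_paragraph(
    data="First paragraph.\n\nSecond paragraph.",
    max_chunk_tokens=512,
    paragraph_length=1024,
    batch_paragraphs=False,
):
    print(chunk["chunk_index"], chunk["word_count"], chunk["token_count"], chunk["text"])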

View file

@ -5,9 +5,9 @@ from cognee.modules.data.processing.document_types.Document import Document
async def extract_chunks_from_documents(
documents: list[Document],
max_chunk_tokens: int,
chunk_size: int = 1024,
chunker="text_chunker",
max_tokens: Optional[int] = None,
) -> AsyncGenerator:
"""
Extracts chunks of data from a list of documents based on the specified chunking parameters.
@ -18,6 +18,6 @@ async def extract_chunks_from_documents(
"""
for document in documents:
for document_chunk in document.read(
chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens
chunk_size=chunk_size, chunker=chunker, max_chunk_tokens=max_chunk_tokens
):
yield document_chunk

View file

@ -89,26 +89,29 @@ def _get_subchunk_token_counts(
def _get_chunk_source_code(
code_token_counts: list[tuple[str, int]], overlap: float, max_tokens: int
code_token_counts: list[tuple[str, int]], overlap: float
) -> tuple[list[tuple[str, int]], str]:
"""Generates a chunk of source code from tokenized subchunks with overlap handling."""
current_count = 0
cumulative_counts = []
current_source_code = ""
# Get embedding engine used in vector database
embedding_engine = get_vector_engine().embedding_engine
for i, (child_code, token_count) in enumerate(code_token_counts):
current_count += token_count
cumulative_counts.append(current_count)
if current_count > max_tokens:
if current_count > embedding_engine.max_tokens:
break
current_source_code += f"\n{child_code}"
if current_count <= max_tokens:
if current_count <= embedding_engine.max_tokens:
return [], current_source_code.strip()
cutoff = 1
for i, cum_count in enumerate(cumulative_counts):
if cum_count > (1 - overlap) * max_tokens:
if cum_count > (1 - overlap) * embedding_engine.max_tokens:
break
cutoff = i
@ -117,21 +120,18 @@ def _get_chunk_source_code(
def get_source_code_chunks_from_code_part(
code_file_part: CodePart,
max_tokens: int = 8192,
overlap: float = 0.25,
granularity: float = 0.1,
model_name: str = "text-embedding-3-large",
) -> Generator[SourceCodeChunk, None, None]:
"""Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
if not code_file_part.source_code:
logger.error(f"No source code in CodeFile {code_file_part.id}")
return
vector_engine = get_vector_engine()
embedding_model = vector_engine.embedding_engine.model
model_name = embedding_model.split("/")[-1]
tokenizer = tiktoken.encoding_for_model(model_name)
max_subchunk_tokens = max(1, int(granularity * max_tokens))
embedding_engine = get_vector_engine().embedding_engine
tokenizer = embedding_engine.tokenizer
max_subchunk_tokens = max(1, int(granularity * embedding_engine.max_tokens))
subchunk_token_counts = _get_subchunk_token_counts(
tokenizer, code_file_part.source_code, max_subchunk_tokens
)
@ -139,7 +139,7 @@ def get_source_code_chunks_from_code_part(
previous_chunk = None
while subchunk_token_counts:
subchunk_token_counts, chunk_source_code = _get_chunk_source_code(
subchunk_token_counts, overlap, max_tokens
subchunk_token_counts, overlap
)
if not chunk_source_code:
continue
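
The cutoff loop above, traced with illustrative numbers (embedding max_tokens taken as 100 and overlap as 0.25):

cumulative_counts = [40, 80, 120]  # running token totals of the subchunks
max_tokens = 100                   # stand-in for embedding_engine.max_tokens
overlap = 0.25

cutoff = 1
for i, cum_count in enumerate(cumulative_counts):
    if cum_count > (1 - overlap) * max_tokens:  # 75-token threshold
        break
    cutoff = i
print(cutoff)  # 0 -- only the first subchunk stays below the overlap threshold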

View file

@ -34,7 +34,7 @@ def test_AudioDocument():
)
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker", max_chunk_tokens=512)
):
assert ground_truth["word_count"] == paragraph_data.word_count, (
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'

View file

@ -23,7 +23,7 @@ def test_ImageDocument():
)
with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker")
GROUND_TRUTH, document.read(chunk_size=64, chunker="text_chunker", max_chunk_tokens=512)
):
assert ground_truth["word_count"] == paragraph_data.word_count, (
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'

View file

@ -25,7 +25,7 @@ def test_PdfDocument():
)
for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker")
GROUND_TRUTH, document.read(chunk_size=1024, chunker="text_chunker", max_chunk_tokens=2048)
):
assert ground_truth["word_count"] == paragraph_data.word_count, (
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'

View file

@ -37,7 +37,8 @@ def test_TextDocument(input_file, chunk_size):
)
for ground_truth, paragraph_data in zip(
GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker="text_chunker")
GROUND_TRUTH[input_file],
document.read(chunk_size=chunk_size, chunker="text_chunker", max_chunk_tokens=1024),
):
assert ground_truth["word_count"] == paragraph_data.word_count, (
f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'

View file

@ -68,7 +68,9 @@ def test_UnstructuredDocument():
)
# Test PPTX
for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
for paragraph_data in pptx_document.read(
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
):
assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
assert "sentence_cut" == paragraph_data.cut_type, (
@ -76,7 +78,9 @@ def test_UnstructuredDocument():
)
# Test DOCX
for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
for paragraph_data in docx_document.read(
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
):
assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
assert "sentence_end" == paragraph_data.cut_type, (
@ -84,7 +88,9 @@ def test_UnstructuredDocument():
)
# TEST CSV
for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
for paragraph_data in csv_document.read(
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
):
assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
assert "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text, (
f"Read text doesn't match expected text: {paragraph_data.text}"
@ -94,7 +100,9 @@ def test_UnstructuredDocument():
)
# Test XLSX
for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
for paragraph_data in xlsx_document.read(
chunk_size=1024, chunker="text_chunker", max_chunk_tokens=1024
):
assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
assert "sentence_cut" == paragraph_data.cut_type, (

View file

@ -8,14 +8,24 @@ from cognee.tests.unit.processing.chunks.test_input import INPUT_TEXTS
paragraph_lengths = [64, 256, 1024]
batch_paragraphs_vals = [True, False]
max_chunk_tokens_vals = [512, 1024, 4096]
@pytest.mark.parametrize(
"input_text,paragraph_length,batch_paragraphs",
list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
"input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
list(
product(
list(INPUT_TEXTS.values()),
max_chunk_tokens_vals,
paragraph_lengths,
batch_paragraphs_vals,
)
),
)
def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_paragraphs):
chunks = chunk_by_paragraph(input_text, paragraph_length, batch_paragraphs)
def test_chunk_by_paragraph_isomorphism(
input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
):
chunks = chunk_by_paragraph(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs)
reconstructed_text = "".join([chunk["text"] for chunk in chunks])
assert reconstructed_text == input_text, (
f"texts are not identical: {len(input_text) = }, {len(reconstructed_text) = }"
@ -23,13 +33,23 @@ def test_chunk_by_paragraph_isomorphism(input_text, paragraph_length, batch_para
@pytest.mark.parametrize(
"input_text,paragraph_length,batch_paragraphs",
list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
"input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
list(
product(
list(INPUT_TEXTS.values()),
max_chunk_tokens_vals,
paragraph_lengths,
batch_paragraphs_vals,
)
),
)
def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
def test_paragraph_chunk_length(input_text, max_chunk_tokens, paragraph_length, batch_paragraphs):
chunks = list(
chunk_by_paragraph(
data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs
data=input_text,
max_chunk_tokens=max_chunk_tokens,
paragraph_length=paragraph_length,
batch_paragraphs=batch_paragraphs,
)
)
@ -42,12 +62,24 @@ def test_paragraph_chunk_length(input_text, paragraph_length, batch_paragraphs):
@pytest.mark.parametrize(
"input_text,paragraph_length,batch_paragraphs",
list(product(list(INPUT_TEXTS.values()), paragraph_lengths, batch_paragraphs_vals)),
"input_text,max_chunk_tokens,paragraph_length,batch_paragraphs",
list(
product(
list(INPUT_TEXTS.values()),
max_chunk_tokens_vals,
paragraph_lengths,
batch_paragraphs_vals,
)
),
)
def test_chunk_by_paragraph_chunk_numbering(input_text, paragraph_length, batch_paragraphs):
def test_chunk_by_paragraph_chunk_numbering(
input_text, max_chunk_tokens, paragraph_length, batch_paragraphs
):
chunks = chunk_by_paragraph(
data=input_text, paragraph_length=paragraph_length, batch_paragraphs=batch_paragraphs
data=input_text,
max_chunk_tokens=max_chunk_tokens,
paragraph_length=paragraph_length,
batch_paragraphs=batch_paragraphs,
)
chunk_indices = np.array([chunk["chunk_index"] for chunk in chunks])
assert np.all(chunk_indices == np.arange(len(chunk_indices))), (

View file

@ -50,7 +50,7 @@ Third paragraph is cut and is missing the dot at the end""",
def run_chunking_test(test_text, expected_chunks):
chunks = []
for chunk_data in chunk_by_paragraph(
data=test_text, paragraph_length=12, batch_paragraphs=False
data=test_text, paragraph_length=12, batch_paragraphs=False, max_chunk_tokens=512
):
chunks.append(chunk_data)

View file

@ -11,9 +11,7 @@ from cognee.shared.exceptions import IngestionError
from cognee.shared.utils import (
get_anonymous_id,
send_telemetry,
num_tokens_from_string,
get_file_content_hash,
trim_text_to_max_tokens,
prepare_edges,
prepare_nodes,
create_cognee_style_network_with_logo,
@ -45,15 +43,6 @@ def test_get_anonymous_id(mock_open_file, mock_makedirs, temp_dir):
# args, kwargs = mock_post.call_args
# assert kwargs["json"]["event_name"] == "test_event"
#
# @patch("tiktoken.encoding_for_model")
# def test_num_tokens_from_string(mock_encoding):
# mock_encoding.return_value.encode = lambda x: list(x)
#
# assert num_tokens_from_string("hello", "test_encoding") == 5
# assert num_tokens_from_string("world", "test_encoding") == 5
#
@patch("builtins.open", new_callable=mock_open, read_data=b"test_data")
def test_get_file_content_hash_file(mock_open_file):
@ -73,18 +62,6 @@ def test_get_file_content_hash_stream():
assert result == expected_hash
# def test_trim_text_to_max_tokens():
# text = "This is a test string with multiple words."
# encoding_name = "test_encoding"
#
# with patch("tiktoken.get_encoding") as mock_get_encoding:
# mock_get_encoding.return_value.encode = lambda x: list(x)
# mock_get_encoding.return_value.decode = lambda x: "".join(x)
#
# result = trim_text_to_max_tokens(text, 5, encoding_name)
# assert result == text[:5]
def test_prepare_edges():
graph = nx.MultiDiGraph()
graph.add_edge("A", "B", key="AB", weight=1)

View file

@ -650,6 +650,7 @@
"from cognee.modules.pipelines import run_tasks\n",
"from cognee.modules.users.models import User\n",
"from cognee.tasks.documents import check_permissions_on_documents, classify_documents, extract_chunks_from_documents\n",
"from cognee.infrastructure.llm import get_max_chunk_tokens\n",
"from cognee.tasks.graph import extract_graph_from_data\n",
"from cognee.tasks.storage import add_data_points\n",
"from cognee.tasks.summarization import summarize_text\n",
@ -663,7 +664,7 @@
" tasks = [\n",
" Task(classify_documents),\n",
" Task(check_permissions_on_documents, user = user, permissions = [\"write\"]),\n",
" Task(extract_chunks_from_documents), # Extract text chunks based on the document type.\n",
" Task(extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()), # Extract text chunks based on the document type.\n",
" Task(extract_graph_from_data, graph_model = KnowledgeGraph, task_config = { \"batch_size\": 10 }), # Generate knowledge graphs from the document chunks.\n",
" Task(\n",
" summarize_text,\n",

poetry.lock generated
View file

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
[[package]]
name = "aiofiles"
@ -645,6 +645,17 @@ urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >
[package.extras]
crt = ["awscrt (==0.23.4)"]
[[package]]
name = "cachetools"
version = "5.5.1"
description = "Extensible memoizing collections and decorators"
optional = true
python-versions = ">=3.7"
files = [
{file = "cachetools-5.5.1-py3-none-any.whl", hash = "sha256:b76651fdc3b24ead3c648bbdeeb940c1b04d365b38b4af66788f9ec4a81d42bb"},
{file = "cachetools-5.5.1.tar.gz", hash = "sha256:70f238fbba50383ef62e55c6aff6d9673175fe59f7c6782c7a0b9e38f4a9df95"},
]
[[package]]
name = "certifi"
version = "2024.12.14"
@ -1995,6 +2006,135 @@ files = [
{file = "giturlparse-0.12.0.tar.gz", hash = "sha256:c0fff7c21acc435491b1779566e038757a205c1ffdcb47e4f81ea52ad8c3859a"},
]
[[package]]
name = "google-ai-generativelanguage"
version = "0.6.15"
description = "Google Ai Generativelanguage API client library"
optional = true
python-versions = ">=3.7"
files = [
{file = "google_ai_generativelanguage-0.6.15-py3-none-any.whl", hash = "sha256:5a03ef86377aa184ffef3662ca28f19eeee158733e45d7947982eb953c6ebb6c"},
{file = "google_ai_generativelanguage-0.6.15.tar.gz", hash = "sha256:8f6d9dc4c12b065fe2d0289026171acea5183ebf2d0b11cefe12f3821e159ec3"},
]
[package.dependencies]
google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]}
google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev"
proto-plus = ">=1.22.3,<2.0.0dev"
protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev"
[[package]]
name = "google-api-core"
version = "2.24.0"
description = "Google API client core library"
optional = true
python-versions = ">=3.7"
files = [
{file = "google_api_core-2.24.0-py3-none-any.whl", hash = "sha256:10d82ac0fca69c82a25b3efdeefccf6f28e02ebb97925a8cce8edbfe379929d9"},
{file = "google_api_core-2.24.0.tar.gz", hash = "sha256:e255640547a597a4da010876d333208ddac417d60add22b6851a0c66a831fcaf"},
]
[package.dependencies]
google-auth = ">=2.14.1,<3.0.dev0"
googleapis-common-protos = ">=1.56.2,<2.0.dev0"
grpcio = [
{version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
{version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
]
grpcio-status = [
{version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
{version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
]
proto-plus = ">=1.22.3,<2.0.0dev"
protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
requests = ">=2.18.0,<3.0.0.dev0"
[package.extras]
async-rest = ["google-auth[aiohttp] (>=2.35.0,<3.0.dev0)"]
grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0)"]
grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
[[package]]
name = "google-api-python-client"
version = "2.159.0"
description = "Google API Client Library for Python"
optional = true
python-versions = ">=3.7"
files = [
{file = "google_api_python_client-2.159.0-py2.py3-none-any.whl", hash = "sha256:baef0bb631a60a0bd7c0bf12a5499e3a40cd4388484de7ee55c1950bf820a0cf"},
{file = "google_api_python_client-2.159.0.tar.gz", hash = "sha256:55197f430f25c907394b44fa078545ffef89d33fd4dca501b7db9f0d8e224bd6"},
]
[package.dependencies]
google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0.dev0"
google-auth = ">=1.32.0,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0.dev0"
google-auth-httplib2 = ">=0.2.0,<1.0.0"
httplib2 = ">=0.19.0,<1.dev0"
uritemplate = ">=3.0.1,<5"
[[package]]
name = "google-auth"
version = "2.38.0"
description = "Google Authentication Library"
optional = true
python-versions = ">=3.7"
files = [
{file = "google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a"},
{file = "google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4"},
]
[package.dependencies]
cachetools = ">=2.0.0,<6.0"
pyasn1-modules = ">=0.2.1"
rsa = ">=3.1.4,<5"
[package.extras]
aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"]
enterprise-cert = ["cryptography", "pyopenssl"]
pyjwt = ["cryptography (>=38.0.3)", "pyjwt (>=2.0)"]
pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"]
reauth = ["pyu2f (>=0.1.5)"]
requests = ["requests (>=2.20.0,<3.0.0.dev0)"]
[[package]]
name = "google-auth-httplib2"
version = "0.2.0"
description = "Google Authentication Library: httplib2 transport"
optional = true
python-versions = "*"
files = [
{file = "google-auth-httplib2-0.2.0.tar.gz", hash = "sha256:38aa7badf48f974f1eb9861794e9c0cb2a0511a4ec0679b1f886d108f5640e05"},
{file = "google_auth_httplib2-0.2.0-py2.py3-none-any.whl", hash = "sha256:b65a0a2123300dd71281a7bf6e64d65a0759287df52729bdd1ae2e47dc311a3d"},
]
[package.dependencies]
google-auth = "*"
httplib2 = ">=0.19.0"
[[package]]
name = "google-generativeai"
version = "0.8.4"
description = "Google Generative AI High level API client library and tools."
optional = true
python-versions = ">=3.9"
files = [
{file = "google_generativeai-0.8.4-py3-none-any.whl", hash = "sha256:e987b33ea6decde1e69191ddcaec6ef974458864d243de7191db50c21a7c5b82"},
]
[package.dependencies]
google-ai-generativelanguage = "0.6.15"
google-api-core = "*"
google-api-python-client = "*"
google-auth = ">=2.15.0"
protobuf = "*"
pydantic = "*"
tqdm = "*"
typing-extensions = "*"
[package.extras]
dev = ["Pillow", "absl-py", "black", "ipython", "nose2", "pandas", "pytype", "pyyaml"]
[[package]]
name = "googleapis-common-protos"
version = "1.66.0"
@ -2251,6 +2391,22 @@ files = [
grpcio = ">=1.67.1"
protobuf = ">=5.26.1,<6.0dev"
[[package]]
name = "grpcio-status"
version = "1.67.1"
description = "Status proto mapping for gRPC"
optional = true
python-versions = ">=3.8"
files = [
{file = "grpcio_status-1.67.1-py3-none-any.whl", hash = "sha256:16e6c085950bdacac97c779e6a502ea671232385e6e37f258884d6883392c2bd"},
{file = "grpcio_status-1.67.1.tar.gz", hash = "sha256:2bf38395e028ceeecfd8866b081f61628114b384da7d51ae064ddc8d766a5d11"},
]
[package.dependencies]
googleapis-common-protos = ">=1.5.5"
grpcio = ">=1.67.1"
protobuf = ">=5.26.1,<6.0dev"
[[package]]
name = "grpcio-tools"
version = "1.67.1"
@ -2445,6 +2601,20 @@ http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"]
trio = ["trio (>=0.22.0,<1.0)"]
[[package]]
name = "httplib2"
version = "0.22.0"
description = "A comprehensive HTTP client library."
optional = true
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "httplib2-0.22.0-py3-none-any.whl", hash = "sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc"},
{file = "httplib2-0.22.0.tar.gz", hash = "sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81"},
]
[package.dependencies]
pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""}
[[package]]
name = "httpx"
version = "0.27.0"
@ -4998,8 +5168,8 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@ -5520,6 +5690,23 @@ files = [
{file = "propcache-0.2.1.tar.gz", hash = "sha256:3f77ce728b19cb537714499928fe800c3dda29e8d9428778fc7c186da4c09a64"},
]
[[package]]
name = "proto-plus"
version = "1.25.0"
description = "Beautiful, Pythonic protocol buffers."
optional = true
python-versions = ">=3.7"
files = [
{file = "proto_plus-1.25.0-py3-none-any.whl", hash = "sha256:c91fc4a65074ade8e458e95ef8bac34d4008daa7cce4a12d6707066fca648961"},
{file = "proto_plus-1.25.0.tar.gz", hash = "sha256:fbb17f57f7bd05a68b7707e745e26528b0b3c34e378db91eef93912c54982d91"},
]
[package.dependencies]
protobuf = ">=3.19.0,<6.0.0dev"
[package.extras]
testing = ["google-api-core (>=1.31.5)"]
[[package]]
name = "protobuf"
version = "5.29.3"
@ -5686,6 +5873,31 @@ files = [
[package.extras]
test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
[[package]]
name = "pyasn1"
version = "0.6.1"
description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)"
optional = true
python-versions = ">=3.8"
files = [
{file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"},
{file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"},
]
[[package]]
name = "pyasn1-modules"
version = "0.4.1"
description = "A collection of ASN.1-based protocols modules"
optional = true
python-versions = ">=3.8"
files = [
{file = "pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd"},
{file = "pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c"},
]
[package.dependencies]
pyasn1 = ">=0.4.6,<0.7.0"
[[package]]
name = "pycparser"
version = "2.22"
@ -5927,8 +6139,8 @@ astroid = ">=3.3.8,<=3.4.0-dev0"
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
dill = [
{version = ">=0.2", markers = "python_version < \"3.11\""},
{version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=0.3.7", markers = "python_version >= \"3.12\""},
{version = ">=0.3.6", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
]
isort = ">=4.2.5,<5.13.0 || >5.13.0,<6"
mccabe = ">=0.6,<0.8"
@ -6967,6 +7179,20 @@ files = [
{file = "rpds_py-0.22.3.tar.gz", hash = "sha256:e32fee8ab45d3c2db6da19a5323bc3362237c8b653c70194414b892fd06a080d"},
]
[[package]]
name = "rsa"
version = "4.9"
description = "Pure-Python RSA implementation"
optional = true
python-versions = ">=3.6,<4"
files = [
{file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"},
{file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"},
]
[package.dependencies]
pyasn1 = ">=0.1.3"
[[package]]
name = "ruff"
version = "0.9.3"
@ -8218,6 +8444,17 @@ files = [
[package.extras]
dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake8-commas", "flake8-comprehensions", "flake8-continuation", "flake8-datetimez", "flake8-docstrings", "flake8-import-order", "flake8-literal", "flake8-modern-annotations", "flake8-noqa", "flake8-pyproject", "flake8-requirements", "flake8-typechecking-import", "flake8-use-fstring", "mypy", "pep8-naming", "types-PyYAML"]
[[package]]
name = "uritemplate"
version = "4.1.1"
description = "Implementation of RFC 6570 URI Templates"
optional = true
python-versions = ">=3.6"
files = [
{file = "uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"},
{file = "uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0"},
]
[[package]]
name = "urllib3"
version = "2.3.0"
@ -8801,6 +9038,7 @@ deepeval = ["deepeval"]
docs = ["unstructured"]
falkordb = ["falkordb"]
filesystem = ["botocore"]
gemini = ["google-generativeai"]
groq = ["groq"]
langchain = ["langchain_text_splitters", "langsmith"]
llama-index = ["llama-index-core"]
@ -8815,4 +9053,4 @@ weaviate = ["weaviate-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.10.0,<3.13"
content-hash = "1cc352109264d0e3add524cdc15c9b2e6153e1bab20d968b40e42a4d5138967f"
content-hash = "480675c274cd85a76a95bf03af865b1a0b462f25bbc21d7427b0a0b8e21c13db"

View file

@ -77,6 +77,7 @@ pre-commit = "^4.0.1"
httpx = "0.27.0"
bokeh="^3.6.2"
nltk = "3.9.1"
google-generativeai = {version = "^0.8.4", optional = true}
parso = {version = "^0.8.4", optional = true}
jedi = {version = "^0.19.2", optional = true}
@ -90,6 +91,7 @@ postgres = ["psycopg2", "pgvector", "asyncpg"]
notebook = ["notebook", "ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]
langchain = ["langsmith", "langchain_text_splitters"]
llama-index = ["llama-index-core"]
gemini = ["google-generativeai"]
deepeval = ["deepeval"]
posthog = ["posthog"]
falkordb = ["falkordb"]