From 1bd40f1401ffcfb4c1ca4256ecaef45e76ad1383 Mon Sep 17 00:00:00 2001
From: vasilije
Date: Sun, 17 Aug 2025 12:39:12 +0200
Subject: [PATCH] Rename max_tokens to max_completion_tokens

---
 cognee/api/v1/cognify/cognify.py | 2 +-
 cognee/api/v1/responses/models.py | 2 +-
 .../embeddings/FastembedEmbeddingEngine.py | 8 ++++---
 .../embeddings/LiteLLMEmbeddingEngine.py | 23 +++++++++++++------
 .../embeddings/OllamaEmbeddingEngine.py | 10 ++++----
 .../databases/vector/embeddings/config.py | 4 ++--
 .../vector/embeddings/get_embedding_engine.py | 12 +++++-----
 cognee/infrastructure/llm/config.py | 6 ++---
 .../llm/anthropic/adapter.py | 6 ++---
 .../litellm_instructor/llm/gemini/adapter.py | 6 ++---
 .../llm/generic_llm_api/adapter.py | 4 ++--
 .../litellm_instructor/llm/get_llm_client.py | 22 +++++++++++-------
 .../litellm_instructor/llm/ollama/adapter.py | 10 ++++----
 .../litellm_instructor/llm/openai/adapter.py | 6 ++---
 .../llm/tokenizer/Gemini/adapter.py | 4 ++--
 .../llm/tokenizer/HuggingFace/adapter.py | 6 ++---
 .../llm/tokenizer/Mistral/adapter.py | 6 ++---
 .../llm/tokenizer/TikToken/adapter.py | 12 +++++-----
 cognee/infrastructure/llm/utils.py | 14 +++++------
 evals/src/qa/qa_benchmark_graphiti.py | 2 +-
 20 files changed, 92 insertions(+), 73 deletions(-)

diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py
index 23984b9a6..90c3c469e 100644
--- a/cognee/api/v1/cognify/cognify.py
+++ b/cognee/api/v1/cognify/cognify.py
@@ -91,7 +91,7 @@ async def cognify(
             - LangchainChunker: Recursive character splitting with overlap
             Determines how documents are segmented for processing.
         chunk_size: Maximum tokens per chunk. Auto-calculated based on LLM if None.
-            Formula: min(embedding_max_tokens, llm_max_tokens // 2)
+            Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2)
             Default limits: ~512-8192 tokens depending on models.
             Smaller chunks = more granular but potentially fragmented knowledge.
         ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types.
diff --git a/cognee/api/v1/responses/models.py b/cognee/api/v1/responses/models.py
index f23b105ee..328c8db7e 100644
--- a/cognee/api/v1/responses/models.py
+++ b/cognee/api/v1/responses/models.py
@@ -70,7 +70,7 @@ class ResponseRequest(InDTO):
     tool_choice: Optional[Union[str, Dict[str, Any]]] = "auto"
     user: Optional[str] = None
     temperature: Optional[float] = 1.0
-    max_tokens: Optional[int] = None
+    max_completion_tokens: Optional[int] = None
 
 
 class ToolCallOutput(BaseModel):
diff --git a/cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py
index f7925a3d2..dc8443459 100644
--- a/cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py
@@ -41,11 +41,11 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
         self,
         model: Optional[str] = "openai/text-embedding-3-large",
         dimensions: Optional[int] = 3072,
-        max_tokens: int = 512,
+        max_completion_tokens: int = 512,
     ):
         self.model = model
         self.dimensions = dimensions
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
         self.tokenizer = self.get_tokenizer()
         # self.retry_count = 0
         self.embedding_model = TextEmbedding(model_name=model)
@@ -112,7 +112,9 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
         """
         logger.debug("Loading tokenizer for FastembedEmbeddingEngine...")
 
-        tokenizer = TikTokenTokenizer(model="gpt-4o", max_tokens=self.max_tokens)
+        tokenizer = TikTokenTokenizer(
+            model="gpt-4o", max_completion_tokens=self.max_completion_tokens
+        )
         logger.debug("Tokenizer loaded for for FastembedEmbeddingEngine")
 
         return tokenizer
diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
index dae664907..0cb8286fc 100644
--- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@@ -57,7 +57,7 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         api_key: str = None,
         endpoint: str = None,
         api_version: str = None,
-        max_tokens: int = 512,
+        max_completion_tokens: int = 512,
     ):
         self.api_key = api_key
         self.endpoint = endpoint
@@ -65,7 +65,7 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         self.provider = provider
         self.model = model
         self.dimensions = dimensions
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
         self.tokenizer = self.get_tokenizer()
         self.retry_count = 0
 
@@ -179,20 +179,29 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         model = self.model.split("/")[-1]
 
         if "openai" in self.provider.lower():
-            tokenizer = TikTokenTokenizer(model=model, max_tokens=self.max_tokens)
+            tokenizer = TikTokenTokenizer(
+                model=model, max_completion_tokens=self.max_completion_tokens
+            )
         elif "gemini" in self.provider.lower():
-            tokenizer = GeminiTokenizer(model=model, max_tokens=self.max_tokens)
+            tokenizer = GeminiTokenizer(
+                model=model, max_completion_tokens=self.max_completion_tokens
+            )
         elif "mistral" in self.provider.lower():
-            tokenizer = MistralTokenizer(model=model, max_tokens=self.max_tokens)
+            tokenizer = MistralTokenizer(
+                model=model, max_completion_tokens=self.max_completion_tokens
+            )
         else:
             try:
                 tokenizer = HuggingFaceTokenizer(
-                    model=self.model.replace("hosted_vllm/", ""), max_tokens=self.max_tokens
+                    model=self.model.replace("hosted_vllm/", ""),
+                    max_completion_tokens=self.max_completion_tokens,
                 )
             except Exception as e:
                 logger.warning(f"Could not get tokenizer from HuggingFace due to: {e}")
                 logger.info("Switching to TikToken default tokenizer.")
-                tokenizer = TikTokenTokenizer(model=None, max_tokens=self.max_tokens)
+                tokenizer = TikTokenTokenizer(
+                    model=None, max_completion_tokens=self.max_completion_tokens
+                )
 
         logger.debug(f"Tokenizer loaded for model: {self.model}")
         return tokenizer
diff --git a/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
index bfb24a2d3..e1237a657 100644
--- a/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
@@ -30,7 +30,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
     Instance variables:
     - model
     - dimensions
-    - max_tokens
+    - max_completion_tokens
     - endpoint
     - mock
     - huggingface_tokenizer_name
@@ -39,7 +39,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
 
     model: str
     dimensions: int
-    max_tokens: int
+    max_completion_tokens: int
     endpoint: str
     mock: bool
     huggingface_tokenizer_name: str
@@ -50,13 +50,13 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
         self,
         model: Optional[str] = "avr/sfr-embedding-mistral:latest",
         dimensions: Optional[int] = 1024,
-        max_tokens: int = 512,
+        max_completion_tokens: int = 512,
         endpoint: Optional[str] = "http://localhost:11434/api/embeddings",
         huggingface_tokenizer: str = "Salesforce/SFR-Embedding-Mistral",
     ):
         self.model = model
         self.dimensions = dimensions
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
         self.endpoint = endpoint
         self.huggingface_tokenizer_name = huggingface_tokenizer
         self.tokenizer = self.get_tokenizer()
@@ -132,7 +132,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
         """
         logger.debug("Loading HuggingfaceTokenizer for OllamaEmbeddingEngine...")
         tokenizer = HuggingFaceTokenizer(
-            model=self.huggingface_tokenizer_name, max_tokens=self.max_tokens
+            model=self.huggingface_tokenizer_name, max_completion_tokens=self.max_completion_tokens
         )
         logger.debug("Tokenizer loaded for OllamaEmbeddingEngine")
         return tokenizer
diff --git a/cognee/infrastructure/databases/vector/embeddings/config.py b/cognee/infrastructure/databases/vector/embeddings/config.py
index 2ae60d64a..04a1f18f2 100644
--- a/cognee/infrastructure/databases/vector/embeddings/config.py
+++ b/cognee/infrastructure/databases/vector/embeddings/config.py
@@ -18,7 +18,7 @@ class EmbeddingConfig(BaseSettings):
     embedding_endpoint: Optional[str] = None
     embedding_api_key: Optional[str] = None
     embedding_api_version: Optional[str] = None
-    embedding_max_tokens: Optional[int] = 8191
+    embedding_max_completion_tokens: Optional[int] = 8191
     huggingface_tokenizer: Optional[str] = None
 
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
@@ -38,7 +38,7 @@ class EmbeddingConfig(BaseSettings):
             "embedding_endpoint": self.embedding_endpoint,
             "embedding_api_key": self.embedding_api_key,
             "embedding_api_version": self.embedding_api_version,
-            "embedding_max_tokens": self.embedding_max_tokens,
+            "embedding_max_completion_tokens": self.embedding_max_completion_tokens,
             "huggingface_tokenizer": self.huggingface_tokenizer,
         }
 
diff --git a/cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py b/cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py
index d250525a3..ae15b6c6e 100644
--- a/cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py
@@ -27,7 +27,7 @@ def get_embedding_engine() -> EmbeddingEngine:
         config.embedding_provider,
         config.embedding_model,
         config.embedding_dimensions,
-        config.embedding_max_tokens,
+        config.embedding_max_completion_tokens,
         config.embedding_endpoint,
         config.embedding_api_key,
         config.embedding_api_version,
@@ -41,7 +41,7 @@ def create_embedding_engine(
     embedding_provider,
     embedding_model,
     embedding_dimensions,
-    embedding_max_tokens,
+    embedding_max_completion_tokens,
     embedding_endpoint,
     embedding_api_key,
     embedding_api_version,
@@ -58,7 +58,7 @@ def create_embedding_engine(
           'ollama', or another supported provider.
         - embedding_model: The model to be used for the embedding engine.
         - embedding_dimensions: The number of dimensions for the embeddings.
-        - embedding_max_tokens: The maximum number of tokens for the embeddings.
+        - embedding_max_completion_tokens: The maximum number of tokens for the embeddings.
         - embedding_endpoint: The endpoint for the embedding service, relevant for certain
           providers.
         - embedding_api_key: API key to authenticate with the embedding service, if
@@ -81,7 +81,7 @@ def create_embedding_engine(
         return FastembedEmbeddingEngine(
             model=embedding_model,
             dimensions=embedding_dimensions,
-            max_tokens=embedding_max_tokens,
+            max_completion_tokens=embedding_max_completion_tokens,
         )
 
     if embedding_provider == "ollama":
@@ -90,7 +90,7 @@ def create_embedding_engine(
         return OllamaEmbeddingEngine(
             model=embedding_model,
             dimensions=embedding_dimensions,
-            max_tokens=embedding_max_tokens,
+            max_completion_tokens=embedding_max_completion_tokens,
             endpoint=embedding_endpoint,
             huggingface_tokenizer=huggingface_tokenizer,
         )
@@ -104,5 +104,5 @@ def create_embedding_engine(
         api_version=embedding_api_version,
         model=embedding_model,
         dimensions=embedding_dimensions,
-        max_tokens=embedding_max_tokens,
+        max_completion_tokens=embedding_max_completion_tokens,
     )
diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py
index 0bd0beb64..de2e2168e 100644
--- a/cognee/infrastructure/llm/config.py
+++ b/cognee/infrastructure/llm/config.py
@@ -18,7 +18,7 @@ class LLMConfig(BaseSettings):
     - llm_api_version
     - llm_temperature
     - llm_streaming
-    - llm_max_tokens
+    - llm_max_completion_tokens
     - transcription_model
     - graph_prompt_path
     - llm_rate_limit_enabled
@@ -41,7 +41,7 @@ class LLMConfig(BaseSettings):
     llm_api_version: Optional[str] = None
     llm_temperature: float = 0.0
     llm_streaming: bool = False
-    llm_max_tokens: int = 16384
+    llm_max_completion_tokens: int = 16384
 
     baml_llm_provider: str = "openai"
     baml_llm_model: str = "gpt-5-mini"
@@ -171,7 +171,7 @@ class LLMConfig(BaseSettings):
             "api_version": self.llm_api_version,
             "temperature": self.llm_temperature,
             "streaming": self.llm_streaming,
-            "max_tokens": self.llm_max_tokens,
+            "max_completion_tokens": self.llm_max_completion_tokens,
             "transcription_model": self.transcription_model,
             "graph_prompt_path": self.graph_prompt_path,
             "rate_limit_enabled": self.llm_rate_limit_enabled,
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py
index 6845fb6aa..103ac5e08 100644
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py
@@ -23,7 +23,7 @@ class AnthropicAdapter(LLMInterface):
     name = "Anthropic"
     model: str
 
-    def __init__(self, max_tokens: int, model: str = None):
+    def __init__(self, max_completion_tokens: int, model: str = None):
         import anthropic
 
         self.aclient = instructor.patch(
@@ -31,7 +31,7 @@ class AnthropicAdapter(LLMInterface):
         )
 
         self.model = model
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
 
     @sleep_and_retry_async()
     @rate_limit_async
@@ -57,7 +57,7 @@ class AnthropicAdapter(LLMInterface):
 
         return await self.aclient(
             model=self.model,
-            max_tokens=4096,
+            max_completion_tokens=4096,
             max_retries=5,
             messages=[
                 {
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py
index 3cde1fdc4..2ba94e06f 100644
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py
@@ -34,7 +34,7 @@ class GeminiAdapter(LLMInterface):
         self,
         api_key: str,
         model: str,
-        max_tokens: int,
+        max_completion_tokens: int,
         endpoint: Optional[str] = None,
         api_version: Optional[str] = None,
         streaming: bool = False,
@@ -44,7 +44,7 @@ class GeminiAdapter(LLMInterface):
         self.endpoint = endpoint
         self.api_version = api_version
         self.streaming = streaming
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
 
     @observe(as_type="generation")
     @sleep_and_retry_async()
@@ -90,7 +90,7 @@ class GeminiAdapter(LLMInterface):
                 model=f"{self.model}",
                 messages=messages,
                 api_key=self.api_key,
-                max_tokens=self.max_tokens,
+                max_completion_tokens=self.max_completion_tokens,
                 temperature=0.1,
                 response_format=response_schema,
                 timeout=100,
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py
index c62ab1b2b..86adac25a 100644
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py
@@ -41,7 +41,7 @@ class GenericAPIAdapter(LLMInterface):
         api_key: str,
         model: str,
         name: str,
-        max_tokens: int,
+        max_completion_tokens: int,
         fallback_model: str = None,
         fallback_api_key: str = None,
         fallback_endpoint: str = None,
@@ -50,7 +50,7 @@ class GenericAPIAdapter(LLMInterface):
         self.model = model
         self.api_key = api_key
         self.endpoint = endpoint
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
 
         self.fallback_model = fallback_model
         self.fallback_api_key = fallback_api_key
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py
index fd347aef3..44679dcf7 100644
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py
@@ -54,11 +54,15 @@ def get_llm_client():
     # Check if max_token value is defined in liteLLM for given model
     # if not use value from cognee configuration
     from cognee.infrastructure.llm.utils import (
-        get_model_max_tokens,
+        get_model_max_completion_tokens,
     )  # imported here to avoid circular imports
 
-    model_max_tokens = get_model_max_tokens(llm_config.llm_model)
-    max_tokens = model_max_tokens if model_max_tokens else llm_config.llm_max_tokens
+    model_max_completion_tokens = get_model_max_completion_tokens(llm_config.llm_model)
+    max_completion_tokens = (
+        model_max_completion_tokens
+        if model_max_completion_tokens
+        else llm_config.llm_max_completion_tokens
+    )
 
     if provider == LLMProvider.OPENAI:
         if llm_config.llm_api_key is None:
@@ -74,7 +78,7 @@ def get_llm_client():
             api_version=llm_config.llm_api_version,
             model=llm_config.llm_model,
             transcription_model=llm_config.transcription_model,
-            max_tokens=max_tokens,
+            max_completion_tokens=max_completion_tokens,
             streaming=llm_config.llm_streaming,
             fallback_api_key=llm_config.fallback_api_key,
             fallback_endpoint=llm_config.fallback_endpoint,
@@ -94,7 +98,7 @@ def get_llm_client():
             llm_config.llm_api_key,
             llm_config.llm_model,
             "Ollama",
-            max_tokens=max_tokens,
+            max_completion_tokens=max_completion_tokens,
         )
 
     elif provider == LLMProvider.ANTHROPIC:
@@ -102,7 +106,9 @@ def get_llm_client():
             AnthropicAdapter,
         )
 
-        return AnthropicAdapter(max_tokens=max_tokens, model=llm_config.llm_model)
+        return AnthropicAdapter(
+            max_completion_tokens=max_completion_tokens, model=llm_config.llm_model
+        )
 
     elif provider == LLMProvider.CUSTOM:
         if llm_config.llm_api_key is None:
@@ -117,7 +123,7 @@ def get_llm_client():
             llm_config.llm_api_key,
             llm_config.llm_model,
             "Custom",
-            max_tokens=max_tokens,
+            max_completion_tokens=max_completion_tokens,
             fallback_api_key=llm_config.fallback_api_key,
             fallback_endpoint=llm_config.fallback_endpoint,
             fallback_model=llm_config.fallback_model,
@@ -134,7 +140,7 @@ def get_llm_client():
         return GeminiAdapter(
             api_key=llm_config.llm_api_key,
             model=llm_config.llm_model,
-            max_tokens=max_tokens,
+            max_completion_tokens=max_completion_tokens,
             endpoint=llm_config.llm_endpoint,
             api_version=llm_config.llm_api_version,
             streaming=llm_config.llm_streaming,
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py
index cd7276694..314cb79d8 100644
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py
@@ -30,16 +30,18 @@ class OllamaAPIAdapter(LLMInterface):
     - model
     - api_key
     - endpoint
-    - max_tokens
+    - max_completion_tokens
     - aclient
     """
 
-    def __init__(self, endpoint: str, api_key: str, model: str, name: str, max_tokens: int):
+    def __init__(
+        self, endpoint: str, api_key: str, model: str, name: str, max_completion_tokens: int
+    ):
         self.name = name
         self.model = model
         self.api_key = api_key
         self.endpoint = endpoint
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
 
         self.aclient = instructor.from_openai(
             OpenAI(base_url=self.endpoint, api_key=self.api_key), mode=instructor.Mode.JSON
@@ -159,7 +161,7 @@ class OllamaAPIAdapter(LLMInterface):
                     ],
                 }
             ],
-            max_tokens=300,
+            max_completion_tokens=300,
         )
 
         # Ensure response is valid before accessing .choices[0].message.content
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py
index c3c215896..95c14f1bc 100644
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py
@@ -64,7 +64,7 @@ class OpenAIAdapter(LLMInterface):
         api_version: str,
         model: str,
         transcription_model: str,
-        max_tokens: int,
+        max_completion_tokens: int,
         streaming: bool = False,
         fallback_model: str = None,
         fallback_api_key: str = None,
@@ -77,7 +77,7 @@ class OpenAIAdapter(LLMInterface):
         self.api_key = api_key
         self.endpoint = endpoint
         self.api_version = api_version
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
         self.streaming = streaming
 
         self.fallback_model = fallback_model
@@ -301,7 +301,7 @@ class OpenAIAdapter(LLMInterface):
             api_key=self.api_key,
             api_base=self.endpoint,
             api_version=self.api_version,
-            max_tokens=300,
+            max_completion_tokens=300,
             max_retries=self.MAX_RETRIES,
         )
 
diff --git a/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py b/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
index 5e1b48b66..a57cff3f7 100644
--- a/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/Gemini/adapter.py
@@ -17,10 +17,10 @@ class GeminiTokenizer(TokenizerInterface):
     def __init__(
         self,
         model: str,
-        max_tokens: int = 3072,
+        max_completion_tokens: int = 3072,
     ):
         self.model = model
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
 
         # Get LLM API key from config
         from cognee.infrastructure.databases.vector.embeddings.config import get_embedding_config
diff --git a/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py b/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
index 4ac68ff20..26aef4b1a 100644
--- a/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
@@ -14,17 +14,17 @@ class HuggingFaceTokenizer(TokenizerInterface):
 
     Instance variables include:
     - model: str
-    - max_tokens: int
+    - max_completion_tokens: int
     - tokenizer: AutoTokenizer
     """
 
     def __init__(
         self,
         model: str,
-        max_tokens: int = 512,
+        max_completion_tokens: int = 512,
     ):
         self.model = model
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
 
         # Import here to make it an optional dependency
         from transformers import AutoTokenizer
diff --git a/cognee/infrastructure/llm/tokenizer/Mistral/adapter.py b/cognee/infrastructure/llm/tokenizer/Mistral/adapter.py
index 5f23046ca..a0bdfd222 100644
--- a/cognee/infrastructure/llm/tokenizer/Mistral/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/Mistral/adapter.py
@@ -16,17 +16,17 @@ class MistralTokenizer(TokenizerInterface):
 
     Instance variables include:
     - model: str
-    - max_tokens: int
+    - max_completion_tokens: int
     - tokenizer: MistralTokenizer
     """
 
     def __init__(
         self,
         model: str,
-        max_tokens: int = 3072,
+        max_completion_tokens: int = 3072,
     ):
         self.model = model
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
 
         # Import here to make it an optional dependency
         from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
diff --git a/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
index 8806112c3..e51747612 100644
--- a/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
@@ -13,10 +13,10 @@ class TikTokenTokenizer(TokenizerInterface):
     def __init__(
         self,
         model: Optional[str] = None,
-        max_tokens: int = 8191,
+        max_completion_tokens: int = 8191,
     ):
         self.model = model
-        self.max_tokens = max_tokens
+        self.max_completion_tokens = max_completion_tokens
         # Initialize TikToken for GPT based on model
         if model:
             self.tokenizer = tiktoken.encoding_for_model(self.model)
@@ -93,9 +93,9 @@ class TikTokenTokenizer(TokenizerInterface):
         num_tokens = len(self.tokenizer.encode(text))
         return num_tokens
 
-    def trim_text_to_max_tokens(self, text: str) -> str:
+    def trim_text_to_max_completion_tokens(self, text: str) -> str:
         """
-        Trim the text so that the number of tokens does not exceed max_tokens.
+        Trim the text so that the number of tokens does not exceed max_completion_tokens.
 
         Parameters:
         -----------
@@ -111,13 +111,13 @@ class TikTokenTokenizer(TokenizerInterface):
         num_tokens = self.count_tokens(text)
 
         # If the number of tokens is within the limit, return the text as is
-        if num_tokens <= self.max_tokens:
+        if num_tokens <= self.max_completion_tokens:
             return text
 
         # If the number exceeds the limit, trim the text
         # This is a simple trim, it may cut words in half; consider using word boundaries for a cleaner cut
         encoded_text = self.tokenizer.encode(text)
-        trimmed_encoded_text = encoded_text[: self.max_tokens]
+        trimmed_encoded_text = encoded_text[: self.max_completion_tokens]
         # Decoding the trimmed text
         trimmed_text = self.tokenizer.decode(trimmed_encoded_text)
         return trimmed_text
diff --git a/cognee/infrastructure/llm/utils.py b/cognee/infrastructure/llm/utils.py
index 7190db654..cb88fa85e 100644
--- a/cognee/infrastructure/llm/utils.py
+++ b/cognee/infrastructure/llm/utils.py
@@ -32,13 +32,13 @@ def get_max_chunk_tokens():
 
     # We need to make sure chunk size won't take more than half of LLM max context token size
     # but it also can't be bigger than the embedding engine max token size
-    llm_cutoff_point = llm_client.max_tokens // 2  # Round down the division
-    max_chunk_tokens = min(embedding_engine.max_tokens, llm_cutoff_point)
+    llm_cutoff_point = llm_client.max_completion_tokens // 2  # Round down the division
+    max_chunk_tokens = min(embedding_engine.max_completion_tokens, llm_cutoff_point)
 
     return max_chunk_tokens
 
 
-def get_model_max_tokens(model_name: str):
+def get_model_max_completion_tokens(model_name: str):
     """
     Retrieve the maximum token limit for a specified model name if it exists.
 
@@ -56,15 +56,15 @@ def get_model_max_completion_tokens(model_name: str):
         Number of max tokens of model, or None if model is unknown
     """
 
-    max_tokens = None
+    max_completion_tokens = None
 
     if model_name in litellm.model_cost:
-        max_tokens = litellm.model_cost[model_name]["max_tokens"]
-        logger.debug(f"Max input tokens for {model_name}: {max_tokens}")
+        max_completion_tokens = litellm.model_cost[model_name]["max_tokens"]
+        logger.debug(f"Max input tokens for {model_name}: {max_completion_tokens}")
     else:
         logger.info("Model not found in LiteLLM's model_cost.")
 
-    return max_tokens
+    return max_completion_tokens
 
 
 async def test_llm_connection():
diff --git a/evals/src/qa/qa_benchmark_graphiti.py b/evals/src/qa/qa_benchmark_graphiti.py
index 17fc87338..474a756a0 100644
--- a/evals/src/qa/qa_benchmark_graphiti.py
+++ b/evals/src/qa/qa_benchmark_graphiti.py
@@ -43,7 +43,7 @@ class QABenchmarkGraphiti(QABenchmarkRAG):
 
     async def initialize_rag(self) -> Any:
         """Initialize Graphiti and LLM."""
-        llm_config = LLMConfig(model=self.config.model_name, max_tokens=65536)
+        llm_config = LLMConfig(model=self.config.model_name, max_completion_tokens=65536)
         llm_client = OpenAIClient(config=llm_config)
         graphiti = Graphiti(
            self.config.db_url,
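
Note (not part of the patch): the chunk-budget formula touched in cognify.py and utils.py above combines the two renamed settings. A minimal illustrative sketch, using the default limits from EmbeddingConfig (8191) and LLMConfig (16384) shown in this diff rather than values from any live configuration:

# Illustrative only: mirrors min(embedding_max_completion_tokens, llm_max_completion_tokens // 2)
# as documented in cognify.py and implemented in get_max_chunk_tokens().
embedding_max_completion_tokens = 8191   # EmbeddingConfig default in this patch
llm_max_completion_tokens = 16384        # LLMConfig default in this patch

llm_cutoff_point = llm_max_completion_tokens // 2  # 8192, half of the LLM budget
max_chunk_tokens = min(embedding_max_completion_tokens, llm_cutoff_point)
print(max_chunk_tokens)  # 8191 -> here the embedding engine limit is the binding constraint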