diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
index b51d397ed..54f319be3 100644
--- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@@ -177,7 +177,12 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         elif "mistral" in self.provider.lower():
             tokenizer = MistralTokenizer(model=model, max_tokens=self.max_tokens)
         else:
-            tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
+            try:
+                tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
+            except Exception as e:
+                logger.warning(f"Could not get tokenizer from HuggingFace due to: {e}")
+                logger.info("Switching to TikToken default tokenizer.")
+                tokenizer = TikTokenTokenizer(model=None, max_tokens=self.max_tokens)
 
         logger.debug(f"Tokenizer loaded for model: {self.model}")
         return tokenizer
diff --git a/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
index 881ffaba7..8806112c3 100644
--- a/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
+++ b/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
@@ -1,4 +1,4 @@
-from typing import List, Any
+from typing import List, Any, Optional
 
 import tiktoken
 from ..tokenizer_interface import TokenizerInterface
@@ -12,13 +12,17 @@ class TikTokenTokenizer(TokenizerInterface):
     def __init__(
         self,
-        model: str,
+        model: Optional[str] = None,
         max_tokens: int = 8191,
     ):
         self.model = model
         self.max_tokens = max_tokens
 
         # Initialize TikToken for GPT based on model
-        self.tokenizer = tiktoken.encoding_for_model(self.model)
+        if model:
+            self.tokenizer = tiktoken.encoding_for_model(self.model)
+        else:
+            # Use default if model not provided
+            self.tokenizer = tiktoken.get_encoding("cl100k_base")
 
     def extract_tokens(self, text: str) -> List[Any]:
         """
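
Below is a minimal, self-contained sketch of the fallback behavior this PR introduces, assuming only the `tiktoken` package. The class mirrors the patched adapter for illustration and is not the repository's actual module; the key point is that `tiktoken.get_encoding("cl100k_base")` serves as the default encoding whenever no model name is available.

```python
from typing import Any, List, Optional

import tiktoken


class TikTokenTokenizer:
    """Sketch of the patched adapter: fall back to cl100k_base when no model is given."""

    def __init__(self, model: Optional[str] = None, max_tokens: int = 8191):
        self.model = model
        self.max_tokens = max_tokens
        if model:
            # Resolve the encoding registered for this specific model name.
            self.tokenizer = tiktoken.encoding_for_model(model)
        else:
            # No model provided: use tiktoken's general-purpose default encoding.
            self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def extract_tokens(self, text: str) -> List[Any]:
        return self.tokenizer.encode(text)


# Usage: both paths yield a working tokenizer, so the embedding engine's
# HuggingFace failure path (tokenizer = TikTokenTokenizer(model=None, ...))
# no longer crashes in tiktoken.encoding_for_model(None).
default_tok = TikTokenTokenizer()            # falls back to cl100k_base
gpt4_tok = TikTokenTokenizer(model="gpt-4")  # model-specific encoding
print(default_tok.extract_tokens("hello world"))
```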