feat: add default tokenizer in case HuggingFace is not available (#1177)


## Description
Add a default TikToken tokenizer (`cl100k_base`) as a fallback for custom models that are not available on HuggingFace.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
Authored by Igor Ilic on 2025-08-01 16:37:53 +02:00, committed by GitHub
parent 5b6e946c43
commit 9faa47fc5a
2 changed files with 13 additions and 4 deletions


@@ -177,7 +177,12 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         elif "mistral" in self.provider.lower():
             tokenizer = MistralTokenizer(model=model, max_tokens=self.max_tokens)
         else:
-            tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
+            try:
+                tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
+            except Exception as e:
+                logger.warning(f"Could not get tokenizer from HuggingFace due to: {e}")
+                logger.info("Switching to TikToken default tokenizer.")
+                tokenizer = TikTokenTokenizer(model=None, max_tokens=self.max_tokens)
 
         logger.debug(f"Tokenizer loaded for model: {self.model}")
         return tokenizer


@@ -1,4 +1,4 @@
-from typing import List, Any
+from typing import List, Any, Optional
 
 import tiktoken
 from ..tokenizer_interface import TokenizerInterface
@@ -12,13 +12,17 @@ class TikTokenTokenizer(TokenizerInterface):
     def __init__(
         self,
-        model: str,
+        model: Optional[str] = None,
         max_tokens: int = 8191,
     ):
         self.model = model
         self.max_tokens = max_tokens
 
         # Initialize TikToken for GPT based on model
-        self.tokenizer = tiktoken.encoding_for_model(self.model)
+        if model:
+            self.tokenizer = tiktoken.encoding_for_model(self.model)
+        else:
+            # Use default if model not provided
+            self.tokenizer = tiktoken.get_encoding("cl100k_base")
 
     def extract_tokens(self, text: str) -> List[Any]:
         """