feat: add default tokenizer in case Hugging Face is not available (#1177)
<!-- .github/pull_request_template.md -->
## Description
Add a default tokenizer for custom models that are not available on Hugging Face.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
parent
5b6e946c43
commit
9faa47fc5a
2 changed files with 13 additions and 4 deletions
|
|
@ -177,7 +177,12 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
|
|||
elif "mistral" in self.provider.lower():
|
||||
tokenizer = MistralTokenizer(model=model, max_tokens=self.max_tokens)
|
||||
else:
|
||||
tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
|
||||
try:
|
||||
tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not get tokenizer from HuggingFace due to: {e}")
|
||||
logger.info("Switching to TikToken default tokenizer.")
|
||||
tokenizer = TikTokenTokenizer(model=None, max_tokens=self.max_tokens)
|
||||
|
||||
logger.debug(f"Tokenizer loaded for model: {self.model}")
|
||||
return tokenizer
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
from typing import List, Any
|
||||
from typing import List, Any, Optional
|
||||
import tiktoken
|
||||
|
||||
from ..tokenizer_interface import TokenizerInterface
|
||||
|
|
@ -12,13 +12,17 @@ class TikTokenTokenizer(TokenizerInterface):
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
model: str,
|
||||
model: Optional[str] = None,
|
||||
max_tokens: int = 8191,
|
||||
):
|
||||
self.model = model
|
||||
self.max_tokens = max_tokens
|
||||
# Initialize TikToken for GPT based on model
|
||||
self.tokenizer = tiktoken.encoding_for_model(self.model)
|
||||
if model:
|
||||
self.tokenizer = tiktoken.encoding_for_model(self.model)
|
||||
else:
|
||||
# Use default if model not provided
|
||||
self.tokenizer = tiktoken.get_encoding("cl100k_base")
|
||||
|
||||
def extract_tokens(self, text: str) -> List[Any]:
|
||||
"""
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue