feat: add default tokenizer in case Hugging Face is not available (#1177)
<!-- .github/pull_request_template.md --> ## Description Add default tokenizer for custom models not available on HuggingFace ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
parent
5b6e946c43
commit
9faa47fc5a
2 changed files with 13 additions and 4 deletions
|
|
@ -177,7 +177,12 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
|
||||||
elif "mistral" in self.provider.lower():
|
elif "mistral" in self.provider.lower():
|
||||||
tokenizer = MistralTokenizer(model=model, max_tokens=self.max_tokens)
|
tokenizer = MistralTokenizer(model=model, max_tokens=self.max_tokens)
|
||||||
else:
|
else:
|
||||||
tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
|
try:
|
||||||
|
tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Could not get tokenizer from HuggingFace due to: {e}")
|
||||||
|
logger.info("Switching to TikToken default tokenizer.")
|
||||||
|
tokenizer = TikTokenTokenizer(model=None, max_tokens=self.max_tokens)
|
||||||
|
|
||||||
logger.debug(f"Tokenizer loaded for model: {self.model}")
|
logger.debug(f"Tokenizer loaded for model: {self.model}")
|
||||||
return tokenizer
|
return tokenizer
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import List, Any
|
from typing import List, Any, Optional
|
||||||
import tiktoken
|
import tiktoken
|
||||||
|
|
||||||
from ..tokenizer_interface import TokenizerInterface
|
from ..tokenizer_interface import TokenizerInterface
|
||||||
|
|
@ -12,13 +12,17 @@ class TikTokenTokenizer(TokenizerInterface):
|
||||||
|
|
||||||
def __init__(
    self,
    model: Optional[str] = None,
    max_tokens: int = 8191,
):
    """Initialize the TikToken tokenizer.

    Args:
        model: Name of the model whose encoding should be used. When
            omitted (None), the generic ``cl100k_base`` encoding is used
            as a default.
        max_tokens: Maximum number of tokens this tokenizer allows.
    """
    self.model = model
    self.max_tokens = max_tokens
    # Pick the model-specific encoding when a model name was given,
    # otherwise fall back to TikToken's general-purpose default.
    self.tokenizer = (
        tiktoken.encoding_for_model(model)
        if model
        else tiktoken.get_encoding("cl100k_base")
    )
|
||||||
|
|
||||||
def extract_tokens(self, text: str) -> List[Any]:
|
def extract_tokens(self, text: str) -> List[Any]:
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue