<!-- .github/pull_request_template.md -->

## Description

Add a default tokenizer for custom models that are not available on HuggingFace.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
from typing import List, Any, Optional

import tiktoken

from ..tokenizer_interface import TokenizerInterface


class TikTokenTokenizer(TokenizerInterface):
    """
    Tokenizer adapter for OpenAI's tiktoken. Intended to be used as part of the LLM and
    embedding adapter classes.
    """

    def __init__(
        self,
        model: Optional[str] = None,
        max_tokens: int = 8191,
    ):
        self.model = model
        self.max_tokens = max_tokens
        # Initialize TikToken based on the model name.
        if model:
            try:
                self.tokenizer = tiktoken.encoding_for_model(self.model)
            except KeyError:
                # The model is unknown to tiktoken (e.g. a custom model), so
                # fall back to the default encoding.
                self.tokenizer = tiktoken.get_encoding("cl100k_base")
        else:
            # Use the default encoding if no model is provided.
            self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def extract_tokens(self, text: str) -> List[Any]:
        """
        Extract tokens from the given text.

        Parameters:
        -----------

        - text (str): The text to be tokenized.

        Returns:
        --------

        - List[Any]: A list of token IDs representing the encoded text.
        """
        # Use TikToken's encoder to tokenize the text.
        token_ids = self.tokenizer.encode(text)
        return token_ids

    def decode_token_list(self, tokens: List[Any]) -> List[Any]:
        """
        Decode a list of token IDs back into their corresponding text representations.

        Parameters:
        -----------

        - tokens (List[Any]): A list of token IDs to be decoded.

        Returns:
        --------

        - List[Any]: A list of decoded text representations of the tokens.
        """
        if not isinstance(tokens, list):
            tokens = [tokens]
        # Decode each token ID individually; tiktoken's decode expects a list of IDs,
        # so each ID is wrapped in a single-element list.
        return [self.tokenizer.decode([token]) for token in tokens]

    def decode_single_token(self, token: int):
        """
        Decode a single token ID into its corresponding text representation.

        Parameters:
        -----------

        - token (int): A single token ID to be decoded.

        Returns:
        --------

        The decoded text representation of the token.
        """
        return self.tokenizer.decode_single_token_bytes(token).decode("utf-8", errors="replace")

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the given text.

        Parameters:
        -----------

        - text (str): The text for which to count the tokens.

        Returns:
        --------

        - int: The number of tokens in the given text.
        """
        num_tokens = len(self.tokenizer.encode(text))
        return num_tokens

    def trim_text_to_max_tokens(self, text: str) -> str:
        """
        Trim the text so that the number of tokens does not exceed max_tokens.

        Parameters:
        -----------

        - text (str): Original text string to be trimmed.

        Returns:
        --------

        - str: Trimmed version of the text, or the original text if under the limit.
        """
        # First check the number of tokens.
        num_tokens = self.count_tokens(text)

        # If the number of tokens is within the limit, return the text as is.
        if num_tokens <= self.max_tokens:
            return text

        # If the number exceeds the limit, trim the text.
        # This is a simple trim and may cut words in half; consider using word
        # boundaries for a cleaner cut (see the sketch below).
        encoded_text = self.tokenizer.encode(text)
        trimmed_encoded_text = encoded_text[: self.max_tokens]
        # Decode the trimmed token IDs back into text.
        trimmed_text = self.tokenizer.decode(trimmed_encoded_text)
        return trimmed_text
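

# A hedged sketch (not part of the original class) of the word-boundary trimming
# suggested in the comment inside trim_text_to_max_tokens: trim at the token
# level first, then backtrack to the last whitespace so no word is cut in half.
# The helper name and free-function form are illustrative assumptions.
def trim_text_on_word_boundary(tokenizer: TikTokenTokenizer, text: str) -> str:
    """Trim text to tokenizer.max_tokens without splitting the final word."""
    if tokenizer.count_tokens(text) <= tokenizer.max_tokens:
        return text
    # Hard token-level cut, identical to trim_text_to_max_tokens.
    token_ids = tokenizer.extract_tokens(text)[: tokenizer.max_tokens]
    trimmed = tokenizer.tokenizer.decode(token_ids)
    # Backtrack to the last whitespace; keep the hard cut if there is none.
    boundary = trimmed.rstrip().rfind(" ")
    return trimmed[:boundary] if boundary > 0 else trimmed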
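

# A minimal usage sketch, assuming the surrounding package is importable (the
# relative import above means this module must be run as part of its package,
# e.g. via `python -m ...`). The model name below is only an example; omitting
# it falls back to the cl100k_base encoding.
if __name__ == "__main__":
    tokenizer = TikTokenTokenizer(model="gpt-4", max_tokens=8191)

    sample = "Tokenizers map text to integer IDs and back."
    token_ids = tokenizer.extract_tokens(sample)
    print(f"{tokenizer.count_tokens(sample)} tokens: {token_ids[:5]}...")

    # Round-trip a single token ID back to its text fragment.
    print(tokenizer.decode_single_token(token_ids[0]))

    # Text under the limit is returned unchanged.
    assert tokenizer.trim_text_to_max_tokens(sample) == sample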