cognee/cognee/infrastructure/llm/tokenizer/TikToken/adapter.py
Igor Ilic 9faa47fc5a
feat: add default tokenizer in case hugging face is not available (#1177)

## Description
Add a default tokenizer for custom models that are not available on HuggingFace.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
2025-08-01 16:37:53 +02:00
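As a rough usage sketch of the fallback described above (the import path is inferred from the file location and may differ in practice):

```python
from cognee.infrastructure.llm.tokenizer.TikToken.adapter import TikTokenTokenizer

# A known OpenAI model name resolves to its model-specific encoding
tokenizer = TikTokenTokenizer(model="gpt-4")

# With no model given, the adapter falls back to the cl100k_base encoding
default_tokenizer = TikTokenTokenizer()
print(default_tokenizer.count_tokens("hello world"))  # 2 tokens under cl100k_base
```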


from typing import Any, List, Optional

import tiktoken

from ..tokenizer_interface import TokenizerInterface


class TikTokenTokenizer(TokenizerInterface):
"""
Tokenizer adapter for OpenAI. Intended to be used as part of LLM Embedding and LLM
Adapters classes.
"""
    def __init__(
        self,
        model: Optional[str] = None,
        max_tokens: int = 8191,
    ):
        self.model = model
        self.max_tokens = max_tokens

        # Resolve the tiktoken encoding for the given model name
        if model:
            self.tokenizer = tiktoken.encoding_for_model(self.model)
        else:
            # Fall back to the general-purpose cl100k_base encoding when no model is given
            self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def extract_tokens(self, text: str) -> List[Any]:
        """
        Extract tokens from the given text.

        Parameters:
        -----------
            - text (str): The text to be tokenized.

        Returns:
        --------
            - List[Any]: A list of token IDs representing the encoded text.
        """
        # Use tiktoken's encoder to turn the text into token IDs
        token_ids = self.tokenizer.encode(text)
        return token_ids

    def decode_token_list(self, tokens: List[Any]) -> List[Any]:
        """
        Decode a list of token IDs back into their corresponding text representations.

        Parameters:
        -----------
            - tokens (List[Any]): A list of token IDs to be decoded.

        Returns:
        --------
            - List[Any]: A list of decoded text representations of the tokens.
        """
        if not isinstance(tokens, list):
            tokens = [tokens]
        # Encoding.decode expects a list of token IDs, so wrap each ID in a
        # single-element list to decode the tokens one at a time
        return [self.tokenizer.decode([token]) for token in tokens]

    def decode_single_token(self, token: int):
        """
        Decode a single token ID into its corresponding text representation.

        Parameters:
        -----------
            - token (int): A single token ID to be decoded.

        Returns:
        --------
            - str: The decoded text representation of the token.
        """
        # decode_single_token_bytes returns raw bytes; decode them as UTF-8,
        # replacing any bytes that do not form a valid character
        return self.tokenizer.decode_single_token_bytes(token).decode("utf-8", errors="replace")

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the given text.

        Parameters:
        -----------
            - text (str): The text for which to count the tokens.

        Returns:
        --------
            - int: The number of tokens in the given text.
        """
        return len(self.tokenizer.encode(text))

    def trim_text_to_max_tokens(self, text: str) -> str:
        """
        Trim the text so that the number of tokens does not exceed max_tokens.

        Parameters:
        -----------
            - text (str): Original text string to be trimmed.

        Returns:
        --------
            - str: Trimmed version of the text, or the original text if it is under the limit.
        """
        # If the text is already within the token limit, return it unchanged
        if self.count_tokens(text) <= self.max_tokens:
            return text

        # Otherwise, encode the text, keep only the first max_tokens token IDs,
        # and decode them back into a string. This is a simple trim that may cut
        # words in half; consider using word boundaries for a cleaner cut.
        encoded_text = self.tokenizer.encode(text)
        trimmed_encoded_text = encoded_text[: self.max_tokens]
        return self.tokenizer.decode(trimmed_encoded_text)
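
A quick sanity check of the trimming and round-trip behavior, as a sketch (values are illustrative, assuming the cl100k_base fallback):

```python
tokenizer = TikTokenTokenizer(max_tokens=5)

text = "one two three four five six seven eight"
trimmed = tokenizer.trim_text_to_max_tokens(text)
# The trimmed text decodes from at most max_tokens token IDs
assert tokenizer.count_tokens(trimmed) <= 5

# Each extracted token decodes back to a fragment of the original text
tokens = tokenizer.extract_tokens("hello world")
assert "".join(tokenizer.decode_single_token(t) for t in tokens) == "hello world"
```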