cognee/cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py
Igor Ilic 0a9f1349f2 refactor: Change variable and function names based on PR comments
Change variable and function names based on PR comments
2025-01-28 10:10:29 +01:00

36 lines
885 B
Python

from typing import List, Any
from transformers import AutoTokenizer
from ..tokenizer_interface import TokenizerInterface
class HuggingFaceTokenizer(TokenizerInterface):
    """
    Tokenizer adapter backed by a HuggingFace ``AutoTokenizer``.

    Instance attributes (all assigned in ``__init__``):
        model: the value that was passed to ``AutoTokenizer.from_pretrained``
        max_tokens: token limit kept on the instance; no method of this
            class reads it
        tokenizer: the object returned by ``AutoTokenizer.from_pretrained``
    """

    def __init__(
        self,
        model: str,
        max_tokens: int = 512,
    ):
        """
        Stores the configuration and loads the tokenizer for the given model.
        Args:
            model: str, handed unchanged to ``AutoTokenizer.from_pretrained``
            max_tokens: int, stored on the instance (defaults to 512)
        """
        self.model = model
        self.max_tokens = max_tokens
        # Loading happens eagerly, so a failure in from_pretrained surfaces
        # at construction time rather than on first use.
        self.tokenizer = AutoTokenizer.from_pretrained(model)

    def extract_tokens(self, text: str) -> List[Any]:
        """
        Splits the given text into tokens.
        Args:
            text: str
        Returns:
            whatever ``self.tokenizer.tokenize(text)`` returns
        """
        return self.tokenizer.tokenize(text)

    def count_tokens(self, text: str) -> int:
        """
        Returns the number of tokens in the given text.
        Args:
            text: str
        Returns:
            length of the result of ``self.tokenizer.tokenize(text)``
        """
        tokenized_text = self.tokenizer.tokenize(text)
        return len(tokenized_text)

    def trim_text_to_max_tokens(self, text: str) -> str:
        """
        Trimming is not provided by this adapter.
        Args:
            text: str (unused)
        Raises:
            NotImplementedError: on every call
        """
        raise NotImplementedError