LightRAG/tests/simple_tokenizer.py

"""
Simple tokenizer implementation for offline integration testing.
This tokenizer doesn't require internet access and provides a basic
word-based tokenization suitable for testing purposes.
"""

import re
import zlib

from typing import List


class SimpleTokenizerImpl:
"""
A simple word-based tokenizer that works offline.
This tokenizer:
- Splits text into words and punctuation
- Doesn't require downloading any external files
- Provides deterministic token IDs based on a vocabulary
"""
def __init__(self):
# Build a simple vocabulary for common tokens
# This is a simplified approach - real tokenizers have much larger vocabularies
self.vocab = self._build_vocab()
self.inverse_vocab = {v: k for k, v in self.vocab.items()}
self.unk_token_id = len(self.vocab)

    def _build_vocab(self) -> dict:
        """Build a basic vocabulary of common tokens."""
        vocab = {}
        current_id = 0
        # Add common words and symbols
        common_tokens = [
            # Whitespace and punctuation
            " ",
            "\n",
            "\t",
            ".",
            ",",
            "!",
            "?",
            ";",
            ":",
            "(",
            ")",
            "[",
            "]",
            "{",
            "}",
            '"',
            "'",
            "-",
            "_",
            "/",
            "\\",
            "@",
            "#",
            "$",
            "%",
            "&",
            "*",
            "+",
            "=",
            # Common programming keywords (for C++ code)
            "class",
            "struct",
            "public",
            "private",
            "protected",
            "void",
            "int",
            "double",
            "float",
            "char",
            "bool",
            "if",
            "else",
            "for",
            "while",
            "return",
            "include",
            "namespace",
            "using",
            "const",
            "static",
            "virtual",
            "new",
            "delete",
            "this",
            "nullptr",
            "true",
            "false",
            # Common English words
            "the",
            "a",
            "an",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "from",
            "with",
            "by",
            "for",
            "of",
            "is",
            "are",
            "was",
            "were",
            "be",
            "been",
            "being",
            "have",
            "has",
            "had",
            "do",
            "does",
            "did",
            "will",
            "would",
            "should",
            "could",
            "can",
            "may",
            "might",
            "must",
            "not",
            "no",
            "yes",
            "this",
            "that",
            "these",
            "those",
            "what",
            "which",
            "who",
            "when",
            "where",
            "why",
            "how",
        ]
        for token in common_tokens:
            token = token.lower()
            # Skip duplicates (e.g. "for" and "this" appear in both the
            # keyword and English-word groups) so token IDs stay contiguous.
            if token not in vocab:
                vocab[token] = current_id
                current_id += 1
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Split text into tokens (words and punctuation)."""
        # Simple pattern: runs of word characters, or single punctuation marks,
        # e.g. "Hello, world!" -> ["Hello", ",", "world", "!"]
        pattern = r"\w+|[^\w\s]"
        tokens = re.findall(pattern, text)
        return tokens

    def encode(self, content: str) -> List[int]:
        """
        Encode a string into a list of token IDs.

        Args:
            content: The string to encode.

        Returns:
            A list of integer token IDs.
        """
        if not content:
            return []
        tokens = self._tokenize(content)
        token_ids = []
        for token in tokens:
            token_lower = token.lower()
            if token_lower in self.vocab:
                token_ids.append(self.vocab[token_lower])
            else:
                # For unknown tokens, derive a stable ID from a CRC32 checksum
                # (the built-in hash() is salted per process, so it is not
                # deterministic across runs). Offset by the vocab size to
                # avoid collisions with known token IDs.
                hash_id = zlib.crc32(token.encode("utf-8")) % 10000 + len(self.vocab)
                token_ids.append(hash_id)
        return token_ids

    def decode(self, tokens: List[int]) -> str:
        """
        Decode a list of token IDs into a string.

        Args:
            tokens: The list of token IDs to decode.

        Returns:
            The decoded string.
        """
        if not tokens:
            return ""
        words = []
        for token_id in tokens:
            if token_id in self.inverse_vocab:
                words.append(self.inverse_vocab[token_id])
            else:
                # For unknown IDs, use a placeholder
                words.append(f"<unk_{token_id}>")
        # Simple reconstruction - join words with spaces.
        # This is a simplification; real tokenizers preserve exact spacing.
        return " ".join(words)


def create_simple_tokenizer():
    """
    Create a simple tokenizer for offline use.

    Returns:
        A Tokenizer instance using SimpleTokenizerImpl.
    """
    from lightrag.utils import Tokenizer

    return Tokenizer("simple-tokenizer", SimpleTokenizerImpl())
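

# A minimal round-trip sketch (not part of the LightRAG test suite itself):
# it exercises SimpleTokenizerImpl directly so it runs without lightrag
# installed, and assumes only the encode()/decode() interface defined above.
# Note that decoding is whitespace-normalized and unknown words come back as
# <unk_N> placeholders.
if __name__ == "__main__":
    impl = SimpleTokenizerImpl()

    text = "class Foo { return true; }"
    ids = impl.encode(text)
    # CRC32-based fallback IDs keep encoding stable across runs and processes.
    assert ids == impl.encode(text)

    print(f"{text!r} -> {ids} -> {impl.decode(ids)!r}")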