Refactor text cleaning to use sanitize_text_for_encoding consistently
• Replace clean_text with sanitize_text • Remove deprecated clean_text function • Add whitespace trimming to sanitizer • Improve UTF-8 encoding safety • Consolidate text cleaning logic
This commit is contained in:
parent
f9cf544805
commit
806081645f
2 changed files with 17 additions and 21 deletions
|
|
@@ -85,7 +85,7 @@ from .utils import (
|
||||||
lazy_external_import,
|
lazy_external_import,
|
||||||
priority_limit_async_func_call,
|
priority_limit_async_func_call,
|
||||||
get_content_summary,
|
get_content_summary,
|
||||||
clean_text,
|
sanitize_text_for_encoding,
|
||||||
check_storage_env_vars,
|
check_storage_env_vars,
|
||||||
generate_track_id,
|
generate_track_id,
|
||||||
logger,
|
logger,
|
||||||
|
|
@@ -908,8 +908,8 @@ class LightRAG:
|
||||||
update_storage = False
|
update_storage = False
|
||||||
try:
|
try:
|
||||||
# Clean input texts
|
# Clean input texts
|
||||||
full_text = clean_text(full_text)
|
full_text = sanitize_text_for_encoding(full_text)
|
||||||
text_chunks = [clean_text(chunk) for chunk in text_chunks]
|
text_chunks = [sanitize_text_for_encoding(chunk) for chunk in text_chunks]
|
||||||
file_path = ""
|
file_path = ""
|
||||||
|
|
||||||
# Process cleaned texts
|
# Process cleaned texts
|
||||||
|
|
@@ -1020,7 +1020,7 @@ class LightRAG:
|
||||||
# Generate contents dict and remove duplicates in one pass
|
# Generate contents dict and remove duplicates in one pass
|
||||||
unique_contents = {}
|
unique_contents = {}
|
||||||
for id_, doc, path in zip(ids, input, file_paths):
|
for id_, doc, path in zip(ids, input, file_paths):
|
||||||
cleaned_content = clean_text(doc)
|
cleaned_content = sanitize_text_for_encoding(doc)
|
||||||
if cleaned_content not in unique_contents:
|
if cleaned_content not in unique_contents:
|
||||||
unique_contents[cleaned_content] = (id_, path)
|
unique_contents[cleaned_content] = (id_, path)
|
||||||
|
|
||||||
|
|
@@ -1033,7 +1033,7 @@ class LightRAG:
|
||||||
# Clean input text and remove duplicates in one pass
|
# Clean input text and remove duplicates in one pass
|
||||||
unique_content_with_paths = {}
|
unique_content_with_paths = {}
|
||||||
for doc, path in zip(input, file_paths):
|
for doc, path in zip(input, file_paths):
|
||||||
cleaned_content = clean_text(doc)
|
cleaned_content = sanitize_text_for_encoding(doc)
|
||||||
if cleaned_content not in unique_content_with_paths:
|
if cleaned_content not in unique_content_with_paths:
|
||||||
unique_content_with_paths[cleaned_content] = path
|
unique_content_with_paths[cleaned_content] = path
|
||||||
|
|
||||||
|
|
@@ -1817,7 +1817,7 @@ class LightRAG:
|
||||||
all_chunks_data: dict[str, dict[str, str]] = {}
|
all_chunks_data: dict[str, dict[str, str]] = {}
|
||||||
chunk_to_source_map: dict[str, str] = {}
|
chunk_to_source_map: dict[str, str] = {}
|
||||||
for chunk_data in custom_kg.get("chunks", []):
|
for chunk_data in custom_kg.get("chunks", []):
|
||||||
chunk_content = clean_text(chunk_data["content"])
|
chunk_content = sanitize_text_for_encoding(chunk_data["content"])
|
||||||
source_id = chunk_data["source_id"]
|
source_id = chunk_data["source_id"]
|
||||||
file_path = chunk_data.get("file_path", "custom_kg")
|
file_path = chunk_data.get("file_path", "custom_kg")
|
||||||
tokens = len(self.tokenizer.encode(chunk_content))
|
tokens = len(self.tokenizer.encode(chunk_content))
|
||||||
|
|
|
||||||
|
|
@@ -1575,18 +1575,6 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
|
||||||
return name
|
return name
|
||||||
|
|
||||||
|
|
||||||
def clean_text(text: str) -> str:
|
|
||||||
"""Clean text by removing null bytes (0x00) and whitespace
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: Input text to clean
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Cleaned text
|
|
||||||
"""
|
|
||||||
return text.strip().replace("\x00", "")
|
|
||||||
|
|
||||||
|
|
||||||
def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
||||||
"""Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters.
|
"""Sanitize text to ensure safe UTF-8 encoding by removing or replacing problematic characters.
|
||||||
|
|
||||||
|
|
@@ -1594,6 +1582,7 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
||||||
- Surrogate characters (the main cause of the encoding error)
|
- Surrogate characters (the main cause of the encoding error)
|
||||||
- Other invalid Unicode sequences
|
- Other invalid Unicode sequences
|
||||||
- Control characters that might cause issues
|
- Control characters that might cause issues
|
||||||
|
- Whitespace trimming
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Input text to sanitize
|
text: Input text to sanitize
|
||||||
|
|
@@ -1609,7 +1598,14 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# First, try to encode/decode to catch any encoding issues early
|
# First, strip whitespace
|
||||||
|
text = text.strip()
|
||||||
|
|
||||||
|
# Early return if text is empty after basic cleaning
|
||||||
|
if not text:
|
||||||
|
return text
|
||||||
|
|
||||||
|
# Try to encode/decode to catch any encoding issues early
|
||||||
text.encode("utf-8")
|
text.encode("utf-8")
|
||||||
|
|
||||||
# Remove or replace surrogate characters (U+D800 to U+DFFF)
|
# Remove or replace surrogate characters (U+D800 to U+DFFF)
|
||||||
|
|
@@ -1630,8 +1626,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
||||||
else:
|
else:
|
||||||
sanitized += char
|
sanitized += char
|
||||||
|
|
||||||
# Additional cleanup: remove null bytes and other control characters
|
# Additional cleanup: remove null bytes and other control characters that might cause issues
|
||||||
# that might cause issues (but preserve common whitespace)
|
# (but preserve common whitespace like \t, \n, \r)
|
||||||
sanitized = re.sub(
|
sanitized = re.sub(
|
||||||
r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized
|
r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", replacement_char, sanitized
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue