diff --git a/lightrag/utils.py b/lightrag/utils.py index 8ebdf9a2..c696c110 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1995,8 +1995,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str: # Unescape HTML escapes sanitized = html.unescape(sanitized) - # Remove control characters - sanitized = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", sanitized) + # Remove control characters but preserve common whitespace (\t, \n, \r) + sanitized = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]", "", sanitized) return sanitized.strip()