From 9b516a8a53e3b692f6738a452e8a228704f41886 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 4 Sep 2025 10:58:29 +0800 Subject: [PATCH] Hot Fix: Preserve whitespace chars in text sanitization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Keep \t, \n, \r in control char removal --- lightrag/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 8ebdf9a2..c696c110 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1995,8 +1995,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str: # Unescape HTML escapes sanitized = html.unescape(sanitized) - # Remove control characters - sanitized = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", sanitized) + # Remove control characters but preserve common whitespace (\t, \n, \r) + sanitized = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]", "", sanitized) return sanitized.strip()