From 5885637ebf6df86d6b0fb9c2ce99207a552dbdf7 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Wed, 12 Nov 2025 00:38:47 +0800
Subject: [PATCH] Add specialized JSON string sanitizer to prevent UTF-8 encoding errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Remove surrogate characters (U+D800-DFFF)
• Filter Unicode non-characters
• Direct char-by-char filtering
---
 lightrag/utils.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 064e4804..7232a91c 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -927,6 +927,33 @@ def load_json(file_name):
         return json.load(f)
 
 
+def _sanitize_string_for_json(text: str) -> str:
+    """Remove characters that are unsafe for UTF-8 JSON serialization.
+
+    Drops lone surrogates (U+D800-U+DFFF), which cannot be encoded as UTF-8
+    and are the main cause of encoding errors, together with the
+    non-characters U+FFFE and U+FFFF. Characters are filtered directly,
+    without attempting to encode the text first.
+
+    Args:
+        text: String to sanitize
+
+    Returns:
+        Sanitized string safe for UTF-8 encoding in JSON. Empty input is
+        returned unchanged.
+    """
+    if not text:
+        return text
+
+    # Join the kept characters in one pass; building the result with
+    # repeated ``+=`` can degrade to quadratic time on large strings.
+    return "".join(
+        char
+        for char in text
+        if not (0xD800 <= ord(char) <= 0xDFFF or ord(char) in (0xFFFE, 0xFFFF))
+    )
+
+
 def _sanitize_json_data(data: Any) -> Any:
     """Recursively sanitize all string values in data structure for safe UTF-8 encoding
 
@@ -941,7 +968,7 @@ def _sanitize_json_data(data: Any) -> Any:
     elif isinstance(data, list):
         return [_sanitize_json_data(item) for item in data]
     elif isinstance(data, str):
-        return sanitize_text_for_encoding(data, replacement_char="")
+        return _sanitize_string_for_json(data)
     else:
         return data
 