From 23cbb9c9b2d74450eb6910a2cba7a48e7944e0c7 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Wed, 12 Nov 2025 00:11:13 +0800
Subject: [PATCH] Add data sanitization to JSON writing to prevent UTF-8 encoding errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Add _sanitize_json_data helper function
• Recursively clean strings in data
• Sanitize before JSON serialization
• Prevent encoding-related crashes
• Use existing sanitize_text_for_encoding
---
 lightrag/utils.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 460ede3c..064e4804 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -927,9 +927,30 @@ def load_json(file_name):
         return json.load(f)
 
 
+def _sanitize_json_data(data: Any) -> Any:
+    """Recursively sanitize all string values in data structure for safe UTF-8 encoding
+
+    Args:
+        data: Data to sanitize (dict, list, str, or other types)
+
+    Returns:
+        Sanitized data with all strings cleaned of problematic characters
+    """
+    if isinstance(data, dict):
+        return {k: _sanitize_json_data(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [_sanitize_json_data(item) for item in data]
+    elif isinstance(data, str):
+        return sanitize_text_for_encoding(data, replacement_char="")
+    else:
+        return data
+
+
 def write_json(json_obj, file_name):
+    # Sanitize data before writing to prevent UTF-8 encoding errors
+    sanitized_obj = _sanitize_json_data(json_obj)
     with open(file_name, "w", encoding="utf-8") as f:
-        json.dump(json_obj, f, indent=2, ensure_ascii=False)
+        json.dump(sanitized_obj, f, indent=2, ensure_ascii=False)
 
 
 class TokenizerInterface(Protocol):