From 1368d3a1febf537b93d22ad89aadad3bc7d1f55b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20MANSUY?=
Date: Thu, 4 Dec 2025 19:15:03 +0800
Subject: [PATCH] cherry-pick abeaac84

---
 lightrag/utils.py | 156 ++++++++++++++++------------------------------
 1 file changed, 53 insertions(+), 103 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index b78b7523..4bfd20f2 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -56,9 +56,6 @@ if not logger.handlers:
 # Set httpx logging level to WARNING
 logging.getLogger("httpx").setLevel(logging.WARNING)
 
-# Precompile regex pattern for JSON sanitization (module-level, compiled once)
-_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")
-
 # Global import for pypinyin with startup-time logging
 try:
     import pypinyin
@@ -933,120 +930,73 @@ def load_json(file_name):
 
 def _sanitize_string_for_json(text: str) -> str:
     """Remove characters that cannot be encoded in UTF-8 for JSON serialization.
 
-    Uses regex for optimal performance with zero-copy optimization for clean strings.
-    Fast detection path for clean strings (99% of cases) with efficient removal for dirty strings.
+    This is a simpler sanitizer specifically for JSON that directly removes
+    problematic characters without attempting to encode first.
 
     Args:
         text: String to sanitize
 
     Returns:
-        Original string if clean (zero-copy), sanitized string if dirty
+        Sanitized string safe for UTF-8 encoding in JSON
     """
     if not text:
         return text
 
-    # Fast path: Check if sanitization is needed using C-level regex search
-    if not _SURROGATE_PATTERN.search(text):
-        return text  # Zero-copy for clean strings - most common case
-
-    # Slow path: Remove problematic characters using C-level regex substitution
-    return _SURROGATE_PATTERN.sub("", text)
-
-
-class SanitizingJSONEncoder(json.JSONEncoder):
-    """
-    Custom JSON encoder that sanitizes data during serialization.
-
-    This encoder cleans strings during the encoding process without creating
-    a full copy of the data structure, making it memory-efficient for large datasets.
-    """
-
-    def encode(self, o):
-        """Override encode method to handle simple string cases"""
-        if isinstance(o, str):
-            return json.encoder.encode_basestring(_sanitize_string_for_json(o))
-        return super().encode(o)
-
-    def iterencode(self, o, _one_shot=False):
-        """
-        Override iterencode to sanitize strings during serialization.
-        This is the core method that handles complex nested structures.
-        """
-        # Preprocess: sanitize all strings in the object
-        sanitized = self._sanitize_for_encoding(o)
-
-        # Call parent's iterencode with sanitized data
-        for chunk in super().iterencode(sanitized, _one_shot):
-            yield chunk
-
-    def _sanitize_for_encoding(self, obj):
-        """
-        Recursively sanitize strings in an object.
-        Creates new objects only when necessary to avoid deep copies.
-
-        Args:
-            obj: Object to sanitize
-
-        Returns:
-            Sanitized object with cleaned strings
-        """
-        if isinstance(obj, str):
-            return _sanitize_string_for_json(obj)
-
-        elif isinstance(obj, dict):
-            # Create new dict with sanitized keys and values
-            new_dict = {}
-            for k, v in obj.items():
-                clean_k = _sanitize_string_for_json(k) if isinstance(k, str) else k
-                clean_v = self._sanitize_for_encoding(v)
-                new_dict[clean_k] = clean_v
-            return new_dict
-
-        elif isinstance(obj, (list, tuple)):
-            # Sanitize list/tuple elements
-            cleaned = [self._sanitize_for_encoding(item) for item in obj]
-            return type(obj)(cleaned) if isinstance(obj, tuple) else cleaned
-
+    # Directly filter out problematic characters without pre-validation
+    sanitized = ""
+    for char in text:
+        code_point = ord(char)
+        # Skip surrogate code points (U+D800 to U+DFFF) - the main cause of encoding errors
+        if 0xD800 <= code_point <= 0xDFFF:
+            continue
+        # Skip the Unicode noncharacters U+FFFE and U+FFFF
+        elif code_point == 0xFFFE or code_point == 0xFFFF:
+            continue
         else:
-            # Numbers, booleans, None, etc. remain unchanged
-            return obj
+            sanitized += char
+
+    return sanitized
+
+
+def _sanitize_json_data(data: Any) -> Any:
+    """Recursively sanitize all string values in a data structure for safe UTF-8 encoding.
+
+    Handles all JSON-serializable types, including:
+    - Dictionary keys and values
+    - Lists and tuples (preserves the original type)
+    - Nested structures
+    - Strings at any level
+
+    Args:
+        data: Data to sanitize (dict, list, tuple, str, or other types)
+
+    Returns:
+        Sanitized data with all strings cleaned of problematic characters
+    """
+    if isinstance(data, dict):
+        # Sanitize both keys and values
+        return {
+            _sanitize_string_for_json(k)
+            if isinstance(k, str)
+            else k: _sanitize_json_data(v)
+            for k, v in data.items()
+        }
+    elif isinstance(data, (list, tuple)):
+        # Handle both lists and tuples, preserving the original type
+        sanitized = [_sanitize_json_data(item) for item in data]
+        return type(data)(sanitized)
+    elif isinstance(data, str):
+        return _sanitize_string_for_json(data)
+    else:
+        # Numbers, booleans, None, etc. - return as-is
+        return data
 
 
 def write_json(json_obj, file_name):
-    """
-    Write JSON data to file with optimized sanitization strategy.
-
-    This function uses a two-stage approach:
-    1. Fast path: Try direct serialization (works for clean data ~99% of time)
-    2. Slow path: Use custom encoder that sanitizes during serialization
-
-    The custom encoder approach avoids creating a deep copy of the data,
-    making it memory-efficient. When sanitization occurs, the caller should
-    reload the cleaned data from the file to update shared memory.
-
-    Args:
-        json_obj: Object to serialize (may be a shallow copy from shared memory)
-        file_name: Output file path
-
-    Returns:
-        bool: True if sanitization was applied (caller should reload data),
-              False if direct write succeeded (no reload needed)
-    """
-    try:
-        # Strategy 1: Fast path - try direct serialization
-        with open(file_name, "w", encoding="utf-8") as f:
-            json.dump(json_obj, f, indent=2, ensure_ascii=False)
-        return False  # No sanitization needed, no reload required
-
-    except (UnicodeEncodeError, UnicodeDecodeError) as e:
-        logger.debug(f"Direct JSON write failed, using sanitizing encoder: {e}")
-
-        # Strategy 2: Use custom encoder (sanitizes during serialization, zero memory copy)
+    # Sanitize data before writing to prevent UTF-8 encoding errors
+    sanitized_obj = _sanitize_json_data(json_obj)
     with open(file_name, "w", encoding="utf-8") as f:
-        json.dump(json_obj, f, indent=2, ensure_ascii=False, cls=SanitizingJSONEncoder)
-
-    logger.info(f"JSON sanitization applied during write: {file_name}")
-    return True  # Sanitization applied, reload recommended
+        json.dump(sanitized_obj, f, indent=2, ensure_ascii=False)
 
 
 class TokenizerInterface(Protocol):
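
Reviewer note, not part of the patch: a quick sanity check of the replacement sanitizer. A minimal sketch, assuming the patched module is importable as lightrag.utils; the sample string is made up. Lone UTF-16 surrogates are what made the old fast-path write raise UnicodeEncodeError; U+FFFE and U+FFFF are noncharacters the sanitizer strips as well.

    # Sketch: _sanitize_string_for_json drops surrogates and the two BMP noncharacters.
    from lightrag.utils import _sanitize_string_for_json

    dirty = "ok\ud800\ufffetext"  # lone surrogate U+D800 plus noncharacter U+FFFE
    clean = _sanitize_string_for_json(dirty)
    assert clean == "oktext"
    clean.encode("utf-8")  # encodes cleanly; dirty.encode("utf-8") would raise UnicodeEncodeError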
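
Reviewer note, not part of the patch: the recursive path end to end, under the same assumption (importable lightrag.utils); the nested payload is hypothetical. _sanitize_json_data cleans dict keys, dict values, and tuple items while preserving the tuple type, so the json.dump inside the new write_json can encode the result to UTF-8 without errors.

    import json

    from lightrag.utils import _sanitize_json_data

    data = {"na\ud800me": ["v1", ("v2", "v\udfff3")], "count": 2}
    safe = _sanitize_json_data(data)

    assert safe == {"name": ["v1", ("v2", "v3")], "count": 2}
    # The sanitized copy survives the write path; the raw data would fail at encode time.
    json.dumps(safe, ensure_ascii=False).encode("utf-8")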