diff --git a/lightrag/utils.py b/lightrag/utils.py index da27926c..b78b7523 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -56,6 +56,9 @@ if not logger.handlers: # Set httpx logging level to WARNING logging.getLogger("httpx").setLevel(logging.WARNING) +# Precompile regex pattern for JSON sanitization (module-level, compiled once) +_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]") + # Global import for pypinyin with startup-time logging try: import pypinyin @@ -930,70 +933,24 @@ def load_json(file_name): def _sanitize_string_for_json(text: str) -> str: """Remove characters that cannot be encoded in UTF-8 for JSON serialization. - This is a simpler sanitizer specifically for JSON that directly removes - problematic characters without attempting to encode first. + Uses regex for optimal performance with zero-copy optimization for clean strings. + Fast detection path for clean strings (99% of cases) with efficient removal for dirty strings. Args: text: String to sanitize Returns: - Sanitized string safe for UTF-8 encoding in JSON + Original string if clean (zero-copy), sanitized string if dirty """ if not text: return text - # Directly filter out problematic characters without pre-validation - sanitized = "" - for char in text: - code_point = ord(char) - # Skip surrogate characters (U+D800 to U+DFFF) - main cause of encoding errors - if 0xD800 <= code_point <= 0xDFFF: - continue - # Skip other non-characters in Unicode - elif code_point == 0xFFFE or code_point == 0xFFFF: - continue - else: - sanitized += char + # Fast path: Check if sanitization is needed using C-level regex search + if not _SURROGATE_PATTERN.search(text): + return text # Zero-copy for clean strings - most common case - return sanitized - - -def _sanitize_json_data(data: Any) -> Any: - """Recursively sanitize all string values in data structure for safe UTF-8 encoding - - DEPRECATED: This function creates a deep copy of the data which can be memory-intensive. - For new code, prefer using write_json with SanitizingJSONEncoder which sanitizes during - serialization without creating copies. - - Handles all JSON-serializable types including: - - Dictionary keys and values - - Lists and tuples (preserves type) - - Nested structures - - Strings at any level - - Args: - data: Data to sanitize (dict, list, tuple, str, or other types) - - Returns: - Sanitized data with all strings cleaned of problematic characters - """ - if isinstance(data, dict): - # Sanitize both keys and values - return { - _sanitize_string_for_json(k) - if isinstance(k, str) - else k: _sanitize_json_data(v) - for k, v in data.items() - } - elif isinstance(data, (list, tuple)): - # Handle both lists and tuples, preserve original type - sanitized = [_sanitize_json_data(item) for item in data] - return type(data)(sanitized) - elif isinstance(data, str): - return _sanitize_string_for_json(data) - else: - # Numbers, booleans, None, etc. - return as-is - return data + # Slow path: Remove problematic characters using C-level regex substitution + return _SURROGATE_PATTERN.sub("", text) class SanitizingJSONEncoder(json.JSONEncoder):