Optimize JSON string sanitization with precompiled regex and zero-copy
- Precompile regex pattern at module level
- Zero-copy path for clean strings
- Use C-level regex for performance
- Remove deprecated _sanitize_json_data
- Fast detection for common case
This commit is contained in:
parent
777c987371
commit
6de4123f74
1 changed file with 11 additions and 54 deletions
|
|
@@ -56,6 +56,9 @@ if not logger.handlers:
|
||||||
# Set httpx logging level to WARNING
|
# Set httpx logging level to WARNING
|
||||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
# Precompile regex pattern for JSON sanitization (module-level, compiled once)
|
||||||
|
_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")
|
||||||
|
|
||||||
# Global import for pypinyin with startup-time logging
|
# Global import for pypinyin with startup-time logging
|
||||||
try:
|
try:
|
||||||
import pypinyin
|
import pypinyin
|
||||||
|
|
@@ -930,70 +933,24 @@ def load_json(file_name):
|
||||||
def _sanitize_string_for_json(text: str) -> str:
|
def _sanitize_string_for_json(text: str) -> str:
|
||||||
"""Remove characters that cannot be encoded in UTF-8 for JSON serialization.
|
"""Remove characters that cannot be encoded in UTF-8 for JSON serialization.
|
||||||
|
|
||||||
This is a simpler sanitizer specifically for JSON that directly removes
|
Uses regex for optimal performance with zero-copy optimization for clean strings.
|
||||||
problematic characters without attempting to encode first.
|
Fast detection path for clean strings (99% of cases) with efficient removal for dirty strings.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: String to sanitize
|
text: String to sanitize
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Sanitized string safe for UTF-8 encoding in JSON
|
Original string if clean (zero-copy), sanitized string if dirty
|
||||||
"""
|
"""
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
# Directly filter out problematic characters without pre-validation
|
# Fast path: Check if sanitization is needed using C-level regex search
|
||||||
sanitized = ""
|
if not _SURROGATE_PATTERN.search(text):
|
||||||
for char in text:
|
return text # Zero-copy for clean strings - most common case
|
||||||
code_point = ord(char)
|
|
||||||
# Skip surrogate characters (U+D800 to U+DFFF) - main cause of encoding errors
|
|
||||||
if 0xD800 <= code_point <= 0xDFFF:
|
|
||||||
continue
|
|
||||||
# Skip other non-characters in Unicode
|
|
||||||
elif code_point == 0xFFFE or code_point == 0xFFFF:
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
sanitized += char
|
|
||||||
|
|
||||||
return sanitized
|
# Slow path: Remove problematic characters using C-level regex substitution
|
||||||
|
return _SURROGATE_PATTERN.sub("", text)
|
||||||
|
|
||||||
def _sanitize_json_data(data: Any) -> Any:
|
|
||||||
"""Recursively sanitize all string values in data structure for safe UTF-8 encoding
|
|
||||||
|
|
||||||
DEPRECATED: This function creates a deep copy of the data which can be memory-intensive.
|
|
||||||
For new code, prefer using write_json with SanitizingJSONEncoder which sanitizes during
|
|
||||||
serialization without creating copies.
|
|
||||||
|
|
||||||
Handles all JSON-serializable types including:
|
|
||||||
- Dictionary keys and values
|
|
||||||
- Lists and tuples (preserves type)
|
|
||||||
- Nested structures
|
|
||||||
- Strings at any level
|
|
||||||
|
|
||||||
Args:
|
|
||||||
data: Data to sanitize (dict, list, tuple, str, or other types)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Sanitized data with all strings cleaned of problematic characters
|
|
||||||
"""
|
|
||||||
if isinstance(data, dict):
|
|
||||||
# Sanitize both keys and values
|
|
||||||
return {
|
|
||||||
_sanitize_string_for_json(k)
|
|
||||||
if isinstance(k, str)
|
|
||||||
else k: _sanitize_json_data(v)
|
|
||||||
for k, v in data.items()
|
|
||||||
}
|
|
||||||
elif isinstance(data, (list, tuple)):
|
|
||||||
# Handle both lists and tuples, preserve original type
|
|
||||||
sanitized = [_sanitize_json_data(item) for item in data]
|
|
||||||
return type(data)(sanitized)
|
|
||||||
elif isinstance(data, str):
|
|
||||||
return _sanitize_string_for_json(data)
|
|
||||||
else:
|
|
||||||
# Numbers, booleans, None, etc. - return as-is
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
class SanitizingJSONEncoder(json.JSONEncoder):
|
class SanitizingJSONEncoder(json.JSONEncoder):
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue