This commit is contained in:
Raphaël MANSUY 2025-12-04 19:15:03 +08:00
parent 60b6b6bbae
commit 1368d3a1fe

View file

@ -56,9 +56,6 @@ if not logger.handlers:
# Set httpx logging level to WARNING
logging.getLogger("httpx").setLevel(logging.WARNING)
# Precompile regex pattern for JSON sanitization (module-level, compiled once)
_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")
# Global import for pypinyin with startup-time logging
try:
import pypinyin
@ -933,120 +930,73 @@ def load_json(file_name):
def _sanitize_string_for_json(text: str) -> str:
"""Remove characters that cannot be encoded in UTF-8 for JSON serialization.
Uses regex for optimal performance with zero-copy optimization for clean strings.
Fast detection path for clean strings (99% of cases) with efficient removal for dirty strings.
This is a simpler sanitizer specifically for JSON that directly removes
problematic characters without attempting to encode first.
Args:
text: String to sanitize
Returns:
Original string if clean (zero-copy), sanitized string if dirty
Sanitized string safe for UTF-8 encoding in JSON
"""
if not text:
return text
# Fast path: Check if sanitization is needed using C-level regex search
if not _SURROGATE_PATTERN.search(text):
return text # Zero-copy for clean strings - most common case
# Slow path: Remove problematic characters using C-level regex substitution
return _SURROGATE_PATTERN.sub("", text)
class SanitizingJSONEncoder(json.JSONEncoder):
"""
Custom JSON encoder that sanitizes data during serialization.
This encoder cleans strings during the encoding process without creating
a full copy of the data structure, making it memory-efficient for large datasets.
"""
def encode(self, o):
"""Override encode method to handle simple string cases"""
if isinstance(o, str):
return json.encoder.encode_basestring(_sanitize_string_for_json(o))
return super().encode(o)
def iterencode(self, o, _one_shot=False):
"""
Override iterencode to sanitize strings during serialization.
This is the core method that handles complex nested structures.
"""
# Preprocess: sanitize all strings in the object
sanitized = self._sanitize_for_encoding(o)
# Call parent's iterencode with sanitized data
for chunk in super().iterencode(sanitized, _one_shot):
yield chunk
def _sanitize_for_encoding(self, obj):
"""
Recursively sanitize strings in an object.
Creates new objects only when necessary to avoid deep copies.
Args:
obj: Object to sanitize
Returns:
Sanitized object with cleaned strings
"""
if isinstance(obj, str):
return _sanitize_string_for_json(obj)
elif isinstance(obj, dict):
# Create new dict with sanitized keys and values
new_dict = {}
for k, v in obj.items():
clean_k = _sanitize_string_for_json(k) if isinstance(k, str) else k
clean_v = self._sanitize_for_encoding(v)
new_dict[clean_k] = clean_v
return new_dict
elif isinstance(obj, (list, tuple)):
# Sanitize list/tuple elements
cleaned = [self._sanitize_for_encoding(item) for item in obj]
return type(obj)(cleaned) if isinstance(obj, tuple) else cleaned
# Directly filter out problematic characters without pre-validation
sanitized = ""
for char in text:
code_point = ord(char)
# Skip surrogate characters (U+D800 to U+DFFF) - main cause of encoding errors
if 0xD800 <= code_point <= 0xDFFF:
continue
# Skip other non-characters in Unicode
elif code_point == 0xFFFE or code_point == 0xFFFF:
continue
else:
# Numbers, booleans, None, etc. remain unchanged
return obj
sanitized += char
return sanitized
def _sanitize_json_data(data: Any) -> Any:
"""Recursively sanitize all string values in data structure for safe UTF-8 encoding
Handles all JSON-serializable types including:
- Dictionary keys and values
- Lists and tuples (preserves type)
- Nested structures
- Strings at any level
Args:
data: Data to sanitize (dict, list, tuple, str, or other types)
Returns:
Sanitized data with all strings cleaned of problematic characters
"""
if isinstance(data, dict):
# Sanitize both keys and values
return {
_sanitize_string_for_json(k)
if isinstance(k, str)
else k: _sanitize_json_data(v)
for k, v in data.items()
}
elif isinstance(data, (list, tuple)):
# Handle both lists and tuples, preserve original type
sanitized = [_sanitize_json_data(item) for item in data]
return type(data)(sanitized)
elif isinstance(data, str):
return _sanitize_string_for_json(data)
else:
# Numbers, booleans, None, etc. - return as-is
return data
def write_json(json_obj, file_name):
"""
Write JSON data to file with optimized sanitization strategy.
This function uses a two-stage approach:
1. Fast path: Try direct serialization (works for clean data ~99% of time)
2. Slow path: Use custom encoder that sanitizes during serialization
The custom encoder approach avoids creating a deep copy of the data,
making it memory-efficient. When sanitization occurs, the caller should
reload the cleaned data from the file to update shared memory.
Args:
json_obj: Object to serialize (may be a shallow copy from shared memory)
file_name: Output file path
Returns:
bool: True if sanitization was applied (caller should reload data),
False if direct write succeeded (no reload needed)
"""
try:
# Strategy 1: Fast path - try direct serialization
with open(file_name, "w", encoding="utf-8") as f:
json.dump(json_obj, f, indent=2, ensure_ascii=False)
return False # No sanitization needed, no reload required
except (UnicodeEncodeError, UnicodeDecodeError) as e:
logger.debug(f"Direct JSON write failed, using sanitizing encoder: {e}")
# Strategy 2: Use custom encoder (sanitizes during serialization, zero memory copy)
# Sanitize data before writing to prevent UTF-8 encoding errors
sanitized_obj = _sanitize_json_data(json_obj)
with open(file_name, "w", encoding="utf-8") as f:
json.dump(json_obj, f, indent=2, ensure_ascii=False, cls=SanitizingJSONEncoder)
logger.info(f"JSON sanitization applied during write: {file_name}")
return True # Sanitization applied, reload recommended
json.dump(sanitized_obj, f, indent=2, ensure_ascii=False)
class TokenizerInterface(Protocol):