Hot Fix: Preserve whitespace chars in text sanitization

• Keep \t, \n, and \r when removing control characters
This commit is contained in:
yangdx 2025-09-04 10:58:29 +08:00
parent 7b35657e32
commit 9b516a8a53

View file

@@ -1995,8 +1995,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
# Unescape HTML escapes
sanitized = html.unescape(sanitized)
-# Remove control characters
-sanitized = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", sanitized)
+# Remove control characters but preserve common whitespace (\t, \n, \r)
+sanitized = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]", "", sanitized)
return sanitized.strip()