Hot Fix: Preserve whitespace chars in text sanitization
• Preserve \t, \n, and \r when removing control characters
This commit is contained in:
parent
7b35657e32
commit
9b516a8a53
1 changed file with 2 additions and 2 deletions
|
|
@@ -1995,8 +1995,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
|||
# Unescape HTML escapes
|
||||
sanitized = html.unescape(sanitized)
|
||||
|
||||
- # Remove control characters
|
||||
- sanitized = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", sanitized)
|
||||
+ # Remove control characters but preserve common whitespace (\t, \n, \r)
|
||||
+ sanitized = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]", "", sanitized)
|
||||
|
||||
return sanitized.strip()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue