Merge branch 'optimize-extraction' into return-data-only

This commit is contained in:
yangdx 2025-09-12 18:03:58 +08:00
commit 2eddd1d46d

View file

@ -2584,6 +2584,18 @@ def fix_tuple_delimiter_corruption(
"entity<|S|>name"
>>> fix_tuple_delimiter_corruption("entity|S|>name", "S", "<|S|>")
"entity<|S|>name"
Regex samples (S stands for the delimiter core):
<\|S\|+S\|>
<\|\\S\|>
<\|+>
<.?\|S\|.?>
<\|?S\|?>
<[^|]S\|>|<\|S[^|]>
<\|S\|(?!>)
<\|\|(?!>)
(?<!<)\|S\|>
<\|S\|>\|
\|\|S\|\|
"""
if not record or not delimiter_core or not tuple_delimiter:
return record
@ -2591,13 +2603,6 @@ def fix_tuple_delimiter_corruption(
# Escape the delimiter core for regex use
escaped_delimiter_core = re.escape(delimiter_core)
# Fix: <||S||>
record = re.sub(
rf"<\|+{escaped_delimiter_core}\|+>",
tuple_delimiter,
record,
)
# Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|>
record = re.sub(
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
@ -2619,7 +2624,7 @@ def fix_tuple_delimiter_corruption(
record,
)
# Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|> (one extra characters outside pipes)
# Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|>, <||S||> -> <|S|> (at most one extra character on each side, outside the pipes)
record = re.sub(
rf"<.?\|{escaped_delimiter_core}\|.?>",
tuple_delimiter,