Merge branch 'optimize-extraction' into return-data-only
This commit is contained in:
commit
2eddd1d46d
1 changed file with 13 additions and 8 deletions
|
|
@ -2584,6 +2584,18 @@ def fix_tuple_delimiter_corruption(
|
|||
"entity<|S|>name"
|
||||
>>> fix_tuple_delimiter_corruption("entity|S|>name", "S", "<|S|>")
|
||||
"entity<|S|>name"
|
||||
Regex Sample:
|
||||
<\|S\|+S\|>
|
||||
<\|\\S\|>
|
||||
<\|+>
|
||||
<.?\|S\|.?>
|
||||
<\|?S\|?>
|
||||
<[^|]S\|>|<\|S[^|]>
|
||||
<\|S\|(?!>)
|
||||
<\|\|(?!>)
|
||||
(?<!<)\|S\|>
|
||||
<\|S\|>\|
|
||||
\|\|S\|\|
|
||||
"""
|
||||
if not record or not delimiter_core or not tuple_delimiter:
|
||||
return record
|
||||
|
|
@ -2591,13 +2603,6 @@ def fix_tuple_delimiter_corruption(
|
|||
# Escape the delimiter core for regex use
|
||||
escaped_delimiter_core = re.escape(delimiter_core)
|
||||
|
||||
# Fix: <||S||>
|
||||
record = re.sub(
|
||||
rf"<\|+{escaped_delimiter_core}\|+>",
|
||||
tuple_delimiter,
|
||||
record,
|
||||
)
|
||||
|
||||
# Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|>
|
||||
record = re.sub(
|
||||
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
|
||||
|
|
@ -2619,7 +2624,7 @@ def fix_tuple_delimiter_corruption(
|
|||
record,
|
||||
)
|
||||
|
||||
# Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|> (one extra characters outside pipes)
|
||||
# Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|>, <||S||> -> <|S|> (at most one extra character on either side of the pipes)
|
||||
record = re.sub(
|
||||
rf"<.?\|{escaped_delimiter_core}\|.?>",
|
||||
tuple_delimiter,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue