Merge branch 'optimize-extraction' into return-data-only
This commit is contained in:
commit
2eddd1d46d
1 changed file with 13 additions and 8 deletions
|
|
@ -2584,6 +2584,18 @@ def fix_tuple_delimiter_corruption(
|
||||||
"entity<|S|>name"
|
"entity<|S|>name"
|
||||||
>>> fix_tuple_delimiter_corruption("entity|S|>name", "SEP", "<|S|>")
|
>>> fix_tuple_delimiter_corruption("entity|S|>name", "SEP", "<|S|>")
|
||||||
"entity<|S|>name"
|
"entity<|S|>name"
|
||||||
|
Regex Sample:
|
||||||
|
<\|S\|+S\|>
|
||||||
|
<\|\\S\|>
|
||||||
|
<\|+>
|
||||||
|
<.?\|S\|.?>
|
||||||
|
<\|?S\|?>
|
||||||
|
<[^|]S\|>|<\|S[^|]>
|
||||||
|
<\|S\|(?!>)
|
||||||
|
<\|\|(?!>)
|
||||||
|
(?<!<)\|S\|>
|
||||||
|
<\|S\|>\|
|
||||||
|
\|\|S\|\|
|
||||||
"""
|
"""
|
||||||
if not record or not delimiter_core or not tuple_delimiter:
|
if not record or not delimiter_core or not tuple_delimiter:
|
||||||
return record
|
return record
|
||||||
|
|
@ -2591,13 +2603,6 @@ def fix_tuple_delimiter_corruption(
|
||||||
# Escape the delimiter core for regex use
|
# Escape the delimiter core for regex use
|
||||||
escaped_delimiter_core = re.escape(delimiter_core)
|
escaped_delimiter_core = re.escape(delimiter_core)
|
||||||
|
|
||||||
# Fix: <||S||>
|
|
||||||
record = re.sub(
|
|
||||||
rf"<\|+{escaped_delimiter_core}\|+>",
|
|
||||||
tuple_delimiter,
|
|
||||||
record,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|>
|
# Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|>
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
|
rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>",
|
||||||
|
|
@ -2619,7 +2624,7 @@ def fix_tuple_delimiter_corruption(
|
||||||
record,
|
record,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|> (one extra characters outside pipes)
|
# Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|>, <||S||> -> <|S|> (at most one extra character on each side, outside the pipes)
|
||||||
record = re.sub(
|
record = re.sub(
|
||||||
rf"<.?\|{escaped_delimiter_core}\|.?>",
|
rf"<.?\|{escaped_delimiter_core}\|.?>",
|
||||||
tuple_delimiter,
|
tuple_delimiter,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue