diff --git a/lightrag/utils.py b/lightrag/utils.py index 163fb013..50b86f98 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2584,6 +2584,18 @@ def fix_tuple_delimiter_corruption( "entity<|S|>name" >>> fix_tuple_delimiter_corruption("entity|S|>name", "SEP", "<|S|>") "entity<|S|>name" + Regex Sample: + <\|S\|+S\|> + <\|\\S\|> + <\|+> + <.?\|S\|.?> + <\|?S\|?> + <[^|]S\|>|<\|S[^|]> + <\|S\|(?!>) + <\|\|(?!>) + (? + <\|S\|>\| + \|\|S\|\| """ if not record or not delimiter_core or not tuple_delimiter: return record @@ -2591,13 +2603,6 @@ def fix_tuple_delimiter_corruption( # Escape the delimiter core for regex use escaped_delimiter_core = re.escape(delimiter_core) - # Fix: <||S||> - record = re.sub( - rf"<\|+{escaped_delimiter_core}\|+>", - tuple_delimiter, - record, - ) - # Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|> record = re.sub( rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>", @@ -2619,7 +2624,7 @@ def fix_tuple_delimiter_corruption( record, ) - # Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|> (one extra characters outside pipes) + # Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|>, <||S||> -> <|S|> (one extra character outside pipes) record = re.sub( rf"<.?\|{escaped_delimiter_core}\|.?>", tuple_delimiter,