From 8088b7e07a5bcb814c8e928e327a13e5b148c650 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 12 Sep 2025 18:03:37 +0800 Subject: [PATCH] Fix tuple delimiter corruption handling and update documentation --- lightrag/utils.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 163fb013..50b86f98 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2584,6 +2584,18 @@ def fix_tuple_delimiter_corruption( "entity<|S|>name" >>> fix_tuple_delimiter_corruption("entity|S|>name", "SEP", "<|S|>") "entity<|S|>name" + Regex Sample: + <\|S\|+S\|> + <\|\\S\|> + <\|+> + <.?\|S\|.?> + <\|?S\|?> + <[^|]S\|>|<\|S[^|]> + <\|S\|(?!>) + <\|\|(?!>) + (?<!<)\|S\|> + <\|S\|>\| + \|\|S\|\| """ if not record or not delimiter_core or not tuple_delimiter: return record @@ -2591,13 +2603,6 @@ def fix_tuple_delimiter_corruption( # Escape the delimiter core for regex use escaped_delimiter_core = re.escape(delimiter_core) - # Fix: <||S||> - record = re.sub( - rf"<\|+{escaped_delimiter_core}\|+>", - tuple_delimiter, - record, - ) - # Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|> record = re.sub( rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>", @@ -2619,7 +2624,7 @@ def fix_tuple_delimiter_corruption( record, ) - # Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|> (one extra characters outside pipes) + # Fix: <X|S|> -> <|S|>, <|S|Y> -> <|S|>, <X|S|Y> -> <|S|>, <||S||> -> <|S|> (one extra characters outside pipes) record = re.sub( rf"<.?\|{escaped_delimiter_core}\|.?>", tuple_delimiter,