From 87bb8a023bdfa8cb6f9f1e3ebc381b2a04c2a6d8 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 14 Sep 2025 17:29:27 +0800 Subject: [PATCH] Fix tuple delimiter regex patterns and add debug logging - Add debug logs for malformed records - Fix regex for consecutive delimiters - Handle missing closing brackets --- lightrag/operate.py | 2 ++ lightrag/utils.py | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 4cdc1c19..7821ae38 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -326,6 +326,7 @@ async def _handle_single_entity_extraction( logger.warning( f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`" ) + logger.debug(record_attributes) return None try: @@ -399,6 +400,7 @@ async def _handle_single_relationship_extraction( logger.warning( f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`~`{record_attributes[2] if len(record_attributes) >2 else 'N/A'}`" ) + logger.debug(record_attributes) return None try: diff --git a/lightrag/utils.py b/lightrag/utils.py index 177d3fbf..77bd9c47 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2583,9 +2583,9 @@ def fix_tuple_delimiter_corruption( # Escape the delimiter core for regex use escaped_delimiter_core = re.escape(delimiter_core) - # Fix: <|#||#|> -> <|#|>, <|#|||#|> -> <|#|> + # Fix: <|##|> -> <|#|>, <|#||#|> -> <|#|>, <|#|||#|> -> <|#|> record = re.sub( - rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>", + rf"<\|{escaped_delimiter_core}\|*?{escaped_delimiter_core}\|>", tuple_delimiter, record, ) @@ -2604,9 +2604,9 @@ def fix_tuple_delimiter_corruption( record, ) - # Fix: -> <|#|>, <|#|Y> -> <|#|>, -> <|#|>, <||#||> -> <|#|> (one extra characters outside pipes) + # Fix: -> <|#|>, <|#|Y> -> <|#|>, -> <|#|>, <||#||> -> <|#|>, <||#> -> <|#|> (one extra characters outside pipes) record = re.sub( - rf"<.?\|{escaped_delimiter_core}\|.?>", + rf"<.?\|{escaped_delimiter_core}\|*?>", tuple_delimiter, record, ) @@ -2625,9 +2625,10 @@ def fix_tuple_delimiter_corruption( record, ) - # Fix: <|#| -> <|#|> (missing closing >) + # Fix: <|#| -> <|#|>, <|#|| -> <|#|> (missing closing >) + # record = re.sub( - rf"<\|{escaped_delimiter_core}\|(?!>)", + rf"<\|{escaped_delimiter_core}\|+(?!>)", tuple_delimiter, record, )