From 40688def20f64211b344057261b9892aa1d45efc Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Fri, 12 Sep 2025 04:10:14 +0800
Subject: [PATCH] Refactor tuple delimiter corruption fix into reusable utility
 function

- Extract regex fixes to utils module
- Add case-insensitive delimiter handling
---
 lightrag/operate.py | 57 +++++-------------------------------
 lightrag/prompt.py  |  2 +-
 lightrag/utils.py   | 70 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 51 deletions(-)
diff --git a/lightrag/operate.py b/lightrag/operate.py
index 1e11293e..69cd2205 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -3,7 +3,6 @@ from functools import partial
 
 import asyncio
 import json
-import re
 import json_repair
 from typing import Any, AsyncIterator
 from collections import Counter, defaultdict
@@ -30,6 +29,7 @@ from .utils import (
     build_file_path,
     safe_vdb_operation_with_exception,
     create_prefixed_exception,
+    fix_tuple_delimiter_corruption,
 )
 from .base import (
     BaseGraphStorage,
@@ -875,55 +875,12 @@ async def _process_extraction_result(
         if record is None:
             continue
 
-        # Fix various forms of tuple_delimiter corruption from the LLM output.
-        # It handles missing or replaced characters around the core delimiter.
-        # 1. There might be extra characters inserted between the bracket and pipeline.
-        # 2. `|` may be missing or replaced by another character.
-        # 3. Missing opening `<` or closing `>`
-        # Example transformations:
-        #   <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|>  ((one extra characters outside pipes)
-        #   <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes)
-        #   <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|>  (where one | is replace by other charater)
-        #   |SEP|> -> <|SEP|>, <|SEP| -> <|SEP|> (where one | is missing)
-
-        escaped_delimiter_core = re.escape(
-            tuple_delimiter[2:-2]
-        )  # Extract "SEP" from "<|SEP|>"
-
-        # Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|>  (one extra characters outside pipes)
-        record = re.sub(
-            rf"<.?\|{escaped_delimiter_core}\|.?>",
-            tuple_delimiter,
-            record,
-        )
-
-        # Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes)
-        record = re.sub(
-            rf"<\|?{escaped_delimiter_core}\|?>",
-            tuple_delimiter,
-            record,
-        )
-
-        # Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe is replaced by other character)
-        record = re.sub(
-            rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>",
-            tuple_delimiter,
-            record,
-        )
-
-        # Fix: |SEP|> -> <|SEP|> (missing opening <)
-        record = re.sub(
-            rf"(?<!<)\|{escaped_delimiter_core}\|>",
-            tuple_delimiter,
-            record,
-        )
-
-        # Fix: <|SEP| -> <|SEP|> (missing closing >)
-        record = re.sub(
-            rf"<\|{escaped_delimiter_core}\|(?!>)",
-            tuple_delimiter,
-            record,
-        )
+        # Fix various forms of tuple_delimiter corruption from the LLM output using the dedicated function
+        delimiter_core = tuple_delimiter[2:-2]  # Extract "SEP" from "<|SEP|>"
+        record = fix_tuple_delimiter_corruption(record, delimiter_core, tuple_delimiter)
+        # Second pass with the lower-cased core, so variants like "<|sep|>" are also rewritten to tuple_delimiter
+        delimiter_core = delimiter_core.lower()
+        record = fix_tuple_delimiter_corruption(record, delimiter_core, tuple_delimiter)
 
         record_attributes = split_string_by_multi_markers(record, [tuple_delimiter])
 
diff --git a/lightrag/prompt.py b/lightrag/prompt.py
index f3939e01..b3bd6c6b 100644
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@@ -4,7 +4,7 @@ from typing import Any
 
 PROMPTS: dict[str, Any] = {}
 
-# Delimiter must be bracketed in "<|...|>"
+# All delimiters must be formatted as "<|UPPER_CASE_STRING|>"
 PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|SEP|>"
 PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
 
diff --git a/lightrag/utils.py b/lightrag/utils.py
index 4952d98a..cf39e64e 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -2544,6 +2544,76 @@ def get_pinyin_sort_key(text: str) -> str:
         return text.lower()
 
 
+def fix_tuple_delimiter_corruption(
+    record: str, delimiter_core: str, tuple_delimiter: str
+) -> str:
+    """
+    Fix various forms of tuple_delimiter corruption from LLM output.
+
+    Five regex substitutions run in sequence; each rewrites one family of damaged
+    delimiters (extra, missing or replaced characters around the core) to
+    tuple_delimiter. Matching is case-sensitive with respect to delimiter_core.
+
+    Args:
+        record: The text record to fix
+        delimiter_core: The core delimiter (e.g., "SEP" from "<|SEP|>"); regex-escaped before use
+        tuple_delimiter: The complete tuple delimiter (e.g., "<|SEP|>"), used as the re.sub replacement
+
+    Returns:
+        The corrected record; record is returned unchanged if any argument is empty or None
+
+    Examples:
+        >>> fix_tuple_delimiter_corruption("entity<X|SEP|>name", "SEP", "<|SEP|>")
+        'entity<|SEP|>name'
+        >>> fix_tuple_delimiter_corruption("entity<SEP>name", "SEP", "<|SEP|>")
+        'entity<|SEP|>name'
+        >>> fix_tuple_delimiter_corruption("entity|SEP|>name", "SEP", "<|SEP|>")
+        'entity<|SEP|>name'
+    """
+    if not record or not delimiter_core or not tuple_delimiter:
+        return record
+
+    # Escape the delimiter core so any regex metacharacters in it match literally
+    escaped_delimiter_core = re.escape(delimiter_core)
+
+    # Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|>  (one extra character outside the pipes)
+    record = re.sub(
+        rf"<.?\|{escaped_delimiter_core}\|.?>",
+        tuple_delimiter,
+        record,
+    )
+
+    # Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes)
+    record = re.sub(
+        rf"<\|?{escaped_delimiter_core}\|?>",
+        tuple_delimiter,
+        record,
+    )
+
+    # Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe replaced by another character)
+    record = re.sub(
+        rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>",
+        tuple_delimiter,
+        record,
+    )
+
+    # Fix: |SEP|> -> <|SEP|> (missing opening <); the lookbehind skips delimiters that already start with <
+    record = re.sub(
+        rf"(?<!<)\|{escaped_delimiter_core}\|>",
+        tuple_delimiter,
+        record,
+    )
+
+    # Fix: <|SEP| -> <|SEP|> (missing closing >); the lookahead skips delimiters that already end with >
+    record = re.sub(
+        rf"<\|{escaped_delimiter_core}\|(?!>)",
+        tuple_delimiter,
+        record,
+    )
+
+    return record
+
+
 def create_prefixed_exception(original_exception: Exception, prefix: str) -> Exception:
     """
     Safely create a prefixed exception that adapts to all error types.