From 40688def20f64211b344057261b9892aa1d45efc Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 12 Sep 2025 04:10:14 +0800 Subject: [PATCH] Refactor tuple delimiter corruption fix into reusable utility function - Extract regex fixes to utils module - Add case-insensitive delimiter handling --- lightrag/operate.py | 57 +++++------------------------------- lightrag/prompt.py | 2 +- lightrag/utils.py | 70 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 51 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 1e11293e..69cd2205 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -3,7 +3,6 @@ from functools import partial import asyncio import json -import re import json_repair from typing import Any, AsyncIterator from collections import Counter, defaultdict @@ -30,6 +29,7 @@ from .utils import ( build_file_path, safe_vdb_operation_with_exception, create_prefixed_exception, + fix_tuple_delimiter_corruption, ) from .base import ( BaseGraphStorage, @@ -875,55 +875,12 @@ async def _process_extraction_result( if record is None: continue - # Fix various forms of tuple_delimiter corruption from the LLM output. - # It handles missing or replaced characters around the core delimiter. - # 1. There might be extra characters inserted between the bracket and pipeline. - # 2. `|` may be missing or replaced by another character. - # 3. 
Missing opening `<` or closing `>` - # Example transformations: - # <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> ((one extra characters outside pipes) - # <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes) - # <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (where one | is replace by other charater) - # |SEP|> -> <|SEP|>, <|SEP| -> <|SEP|> (where one | is missing) - - escaped_delimiter_core = re.escape( - tuple_delimiter[2:-2] - ) # Extract "SEP" from "<|SEP|>" - - # Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> (one extra characters outside pipes) - record = re.sub( - rf"<.?\|{escaped_delimiter_core}\|.?>", - tuple_delimiter, - record, - ) - - # Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes) - record = re.sub( - rf"<\|?{escaped_delimiter_core}\|?>", - tuple_delimiter, - record, - ) - - # Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe is replaced by other character) - record = re.sub( - rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>", - tuple_delimiter, - record, - ) - - # Fix: |SEP|> -> <|SEP|> (missing opening <) - record = re.sub( - rf"(?<!<)\|{escaped_delimiter_core}\|>", - tuple_delimiter, - record, - ) - - # Fix: <|SEP| -> <|SEP|> (missing closing >) - record = re.sub( - rf"<\|{escaped_delimiter_core}\|(?!>)", - tuple_delimiter, - record, - ) + # Fix various forms of tuple_delimiter corruption from the LLM output using the dedicated function + delimiter_core = tuple_delimiter[2:-2] # Extract "SEP" from "<|SEP|>" + record = fix_tuple_delimiter_corruption(record, delimiter_core, tuple_delimiter) + # change delimiter_core to lower case, and fix again + delimiter_core = delimiter_core.lower() + record = fix_tuple_delimiter_corruption(record, delimiter_core, tuple_delimiter) record_attributes = split_string_by_multi_markers(record, [tuple_delimiter]) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index f3939e01..b3bd6c6b 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -4,7 +4,7 @@ from typing import Any PROMPTS: dict[str, Any] = {} -# Delimiter must be bracketed in 
"<|...|>" +# All delimiters must be formatted as "<|UPPER_CASE_STRING|>" PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|SEP|>" PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>" diff --git a/lightrag/utils.py b/lightrag/utils.py index 4952d98a..cf39e64e 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2544,6 +2544,76 @@ def get_pinyin_sort_key(text: str) -> str: return text.lower() +def fix_tuple_delimiter_corruption( + record: str, delimiter_core: str, tuple_delimiter: str +) -> str: + """ + Fix various forms of tuple_delimiter corruption from LLM output. + + This function handles missing or replaced characters around the core delimiter. + It fixes common corruption patterns where the LLM output doesn't match the expected + tuple_delimiter format. + + Args: + record: The text record to fix + delimiter_core: The core delimiter (e.g., "SEP" from "<|SEP|>") + tuple_delimiter: The complete tuple delimiter (e.g., "<|SEP|>") + + Returns: + The corrected record with proper tuple_delimiter format + + Examples: + >>> fix_tuple_delimiter_corruption("entity<X|SEP|>name", "SEP", "<|SEP|>") + "entity<|SEP|>name" + >>> fix_tuple_delimiter_corruption("entity<SEP>name", "SEP", "<|SEP|>") + "entity<|SEP|>name" + >>> fix_tuple_delimiter_corruption("entity|SEP|>name", "SEP", "<|SEP|>") + "entity<|SEP|>name" + """ + if not record or not delimiter_core or not tuple_delimiter: + return record + + # Escape the delimiter core for regex use + escaped_delimiter_core = re.escape(delimiter_core) + + # Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> (one extra characters outside pipes) + record = re.sub( + rf"<.?\|{escaped_delimiter_core}\|.?>", + tuple_delimiter, + record, + ) + + # Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes) + record = re.sub( + rf"<\|?{escaped_delimiter_core}\|?>", + tuple_delimiter, + record, + ) + + # Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe is replaced by other character) + record = re.sub( + 
rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>", + tuple_delimiter, + record, + ) + + # Fix: |SEP|> -> <|SEP|> (missing opening <) + record = re.sub( + rf"(?<!<)\|{escaped_delimiter_core}\|>", + tuple_delimiter, + record, + ) + + # Fix: <|SEP| -> <|SEP|> (missing closing >) + record = re.sub( + rf"<\|{escaped_delimiter_core}\|(?!>)", + tuple_delimiter, + record, + ) + + return record + + def create_prefixed_exception(original_exception: Exception, prefix: str) -> Exception: """ Safely create a prefixed exception that adapts to all error types.