From b9f80263b87ec148cfd26f1fbafcc331cc16f558 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 12 Sep 2025 00:56:40 +0800 Subject: [PATCH] Simplify tuple delimiter regex patterns for LLM output fixing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Consolidate 6 regex patterns into 3 • More efficient pattern matching • Clearer comments and examples • Same functionality, less code • Better maintainability --- lightrag/operate.py | 51 +++++++++++---------------------------------- 1 file changed, 12 insertions(+), 39 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index c29156b7..1e11293e 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -877,63 +877,36 @@ async def _process_extraction_result( # Fix various forms of tuple_delimiter corruption from the LLM output. # It handles missing or replaced characters around the core delimiter. - # 1. `<` or `>` may be missing. + # 1. There might be extra characters inserted between the bracket and pipeline. # 2. `|` may be missing or replaced by another character. - # 3. There might be extra characters inserted. - # 4. Missing opening `<` or closing `>` + # 3. 
Missing opening `<` or closing `>` # Example transformations: - # <SEP> -> <|SEP|> - # <SEP|> -> <|SEP|> (where left | is missing) - # <|SEP> -> <|SEP|> (where right | is missing) - # <XSEP|> -> <|SEP|> (where left | is replace by other charater) - # <|SEPX> -> <|SEP|> (where right | is replace by other charater) - # <|SEP|X> -> <|SEP|> (where X is not '>') - # <XXX|SEP|YYY> -> <|SEP|> (handles extra characters) - # |SEP|> -> <|SEP|> (where left | is missing) - # <|SEP| -> <|SEP|> (where right | is missing) + # <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> (one extra character outside pipes) + # <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes) + # <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (where one | is replaced by another character) + # |SEP|> -> <|SEP|>, <|SEP| -> <|SEP|> (where one | is missing) escaped_delimiter_core = re.escape( tuple_delimiter[2:-2] ) # Extract "SEP" from "<|SEP|>" - # Fix: <SEP> -> <|SEP|> (missing pipes) + # Fix: <X|SEP|> -> <|SEP|>, <|SEP|Y> -> <|SEP|>, <X|SEP|Y> -> <|SEP|> (one extra character outside pipes) record = re.sub( - rf"<{escaped_delimiter_core}>", + rf"<.?\|{escaped_delimiter_core}\|.?>", tuple_delimiter, record, ) - # Fix: <SEP|> -> <|SEP|> (missing left pipe only) + # Fix: <SEP>, <SEP|>, <|SEP> -> <|SEP|> (missing one or both pipes) record = re.sub( - rf"<{escaped_delimiter_core}\|>", + rf"<\|?{escaped_delimiter_core}\|?>", tuple_delimiter, record, ) - # Fix: <|SEP> -> <|SEP|> (missing right pipe only) + # Fix: <XSEP|> -> <|SEP|>, <|SEPX> -> <|SEP|> (one pipe is replaced by another character) record = re.sub( - rf"<\|{escaped_delimiter_core}>", - tuple_delimiter, - record, - ) - - # Fix: <XSEP|> -> <|SEP|> (character X replacing first pipe) - record = re.sub( - rf"<[^|]+{escaped_delimiter_core}\|>", - tuple_delimiter, - record, - ) - - # Fix: <|SEPX> -> <|SEP|> (character X replacing second pipe) - record = re.sub( - rf"<\|{escaped_delimiter_core}[^|]+>", - tuple_delimiter, - record, - ) - - # Fix: <XXX|SEP|YYY> -> <|SEP|> (extra characters around, but preserve correct delimiters) - record = re.sub( - 
rf"<[^<>]+\|{escaped_delimiter_core}\|[^<>]+>", + rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>", tuple_delimiter, record, )