From 3cdc98f366db6f5307ed816dd496e624d23b6b8a Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 2 Sep 2025 00:26:04 +0800 Subject: [PATCH] Improve extraction parsing with better bracket handling and delimiter fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Standardize Chinese/English brackets • Fix incomplete tuple delimiters • Remove duplicate delimiter fix code • Support mixed bracket formats • Enhance record parsing robustness --- lightrag/operate.py | 63 +++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 607143a5..1d0821c3 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -822,23 +822,39 @@ async def _parse_extraction_result( maybe_nodes = defaultdict(list) maybe_edges = defaultdict(list) - # Preventive fix: when tuple_delimiter is <|>, fix LLM output instability issues - if context_base["tuple_delimiter"] == "<|>": - # 1. Convert <||> to <|> - extraction_result = extraction_result.replace("<||>", "<|>") - # 2. 
Convert < | > to <|> - extraction_result = extraction_result.replace("< | >", "<|>") + # Standardize Chinese brackets around record_delimiter to English brackets + record_delimiter = context_base["record_delimiter"] + bracket_pattern = f"[))](\\s*{re.escape(record_delimiter)}\\s*)[((]" + extraction_result = re.sub(bracket_pattern, ")\\1(", extraction_result) # Parse the extraction result using the same logic as in extract_entities records = split_string_by_multi_markers( extraction_result, [context_base["record_delimiter"], context_base["completion_delimiter"]], ) + for record in records: - record = re.search(r"\((.*)\)", record, re.DOTALL) + # Remove outer brackets + record = record.strip() + if record.startswith("(") or record.startswith("("): + record = record[1:] + if record.endswith(")") or record.endswith(")"): + record = record[:-1] + + record = record.strip() if record is None: continue - record = record.group(1) + + if context_base["tuple_delimiter"] == "<|>": + # fix entity<| with entity<|> + record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record) + # fix relationship<| with relationship<|> + record = re.sub(r"^relationship<\|(?!>)", r"relationship<|>", record) + # fix <||> with <|> + record = record.replace("<||>", "<|>") + # fix < | > with <|> + record = record.replace("< | >", "<|>") + record_attributes = split_string_by_multi_markers( record, [context_base["tuple_delimiter"]] ) @@ -1736,12 +1752,10 @@ async def extract_entities( maybe_nodes = defaultdict(list) maybe_edges = defaultdict(list) - # Preventive fix: when tuple_delimiter is <|>, fix LLM output instability issues - if context_base["tuple_delimiter"] == "<|>": - # 1. Convert <||> to <|> - result = result.replace("<||>", "<|>") - # 2. 
Convert < | > to <|> - result = result.replace("< | >", "<|>") + # Standardize Chinese brackets around record_delimiter to English brackets + record_delimiter = context_base["record_delimiter"] + bracket_pattern = f"[))](\\s*{re.escape(record_delimiter)}\\s*)[((]" + result = re.sub(bracket_pattern, ")\\1(", result) records = split_string_by_multi_markers( result, @@ -1749,10 +1763,27 @@ async def extract_entities( ) for record in records: - record = re.search(r"\((.*)\)", record, re.DOTALL) + # Remove outer brackets (support English and Chinese brackets) + record = record.strip() + if record.startswith("(") or record.startswith("("): + record = record[1:] + if record.endswith(")") or record.endswith(")"): + record = record[:-1] + + record = record.strip() if record is None: continue - record = record.group(1) + + if context_base["tuple_delimiter"] == "<|>": + # fix entity<| with entity<|> + record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record) + # fix relationship<| with relationship<|> + record = re.sub(r"^relationship<\|(?!>)", r"relationship<|>", record) + # fix <||> with <|> + record = record.replace("<||>", "<|>") + # fix < | > with <|> + record = record.replace("< | >", "<|>") + record_attributes = split_string_by_multi_markers( record, [context_base["tuple_delimiter"]] )