From 02e7462645632f0cd648e48cc38a7811fb0f5ce9 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 10 Sep 2025 18:10:06 +0800 Subject: [PATCH] feat: enhance LLM output format tolerance for bracket processing - Expand bracket tolerance to support additional characters: < > " ' - Implement symmetric handling for both leading and trailing characters - Replace simple string matching with robust regex-based pattern detection - Maintain full backward compatibility with existing bracket formats --- lightrag/operate.py | 27 ++++++++++++++++++++++----- lightrag/prompt.py | 6 +++--- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 745b042c..304b8298 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -876,22 +876,39 @@ async def _process_extraction_result( ) for record in records: - # Remove outer brackets (support English and Chinese brackets) + # Remove outer brackets (support English and Chinese brackets with enhanced tolerance) record = record.strip() + + # Define allowed leading and trailing characters + leading_trailing_chars = r'[`<>"\']*' + + # Handle leading characters before left bracket if record.startswith("(") or record.startswith("("): record = record[1:] else: - if record.startswith("`(") or record.startswith("`("): - record = record[2:] + # Check for leading characters + left bracket pattern + leading_bracket_pattern = r'^' + leading_trailing_chars + r'([((])' + match = re.search(leading_bracket_pattern, record) + if match: + # Extract content from the left bracket position + bracket_pos = match.start(1) + record = record[bracket_pos + 1:] else: logger.warning( f"{chunk_key}: Record starting bracket can not be found in extraction result" ) + + # Handle trailing characters after right bracket if record.endswith(")") or record.endswith(")"): record = record[:-1] else: - if record.endswith(")`") or record.endswith(")`"): - record = record[:-2] + # Check for right bracket + trailing characters pattern + trailing_bracket_pattern = r'([))])' + leading_trailing_chars + r'$' + match = re.search(trailing_bracket_pattern, record) + if match: + # Extract content up to the right bracket position + bracket_pos = match.start(1) + record = record[:bracket_pos] else: logger.warning( f"{chunk_key}: Record ending bracket can not be found in extraction result" diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 9b44edb0..a2e87957 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -28,8 +28,8 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel 5. **Relationship Order:** Prioritize relationships based on their significance to the intended meaning of input text, and output more crucial relationships first. 6. **Avoid Pronouns:** For entity names and all descriptions, explicitly name the subject or object instead of using pronouns; avoid pronouns such as `this document`, `our company`, `I`, `you`, and `he/she`. 7. **Undirectional Relationship:** Treat relationships as undirected; swapping the source and target entities does not constitute a new relationship. Avoid outputting duplicate relationships. -8. **Language:** Output entity names, keywords and descriptions in {language}. -9. **Delimiter:** Use `{record_delimiter}` as the entity or relationship list delimiter; output `{completion_delimiter}` when all the entities and relationships are extracted. +8. **Language:** Output entity names, keywords and descriptions in {language}. Proper nouns, such as personal names, should not be translated. Please keep them in their original language. +9. **Delimiter:** Use {record_delimiter} as the entity or relationship list delimiter; output {completion_delimiter} when all the entities and relationships are extracted. ---Examples--- {examples} @@ -49,7 +49,7 @@ Extract entities and relationships from the input text to be Processed. ---Instructions--- 1. Output entities and relationships, prioritized by their relevance to the input text's core meaning. 2. Output `{completion_delimiter}` when all the entities and relationships are extracted. -3. Ensure the output language is {language}. +3. Ensure the output language is {language}. Proper nouns, such as personal names, should not be translated. Please keep them in their original language. """