From cbdc386c9a71ba818fcd9e24bb3c06c2197cdcb6 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 14 Sep 2025 12:37:21 +0800 Subject: [PATCH 1/6] Fix syntax warning by removing examples from fix_tuple_delimiter_corruption docstring --- lightrag/utils.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 4eb54571..9cb25f4c 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2576,26 +2576,6 @@ def fix_tuple_delimiter_corruption( Returns: The corrected record with proper tuple_delimiter format - - Examples: - >>> fix_tuple_delimiter_corruption("entityname", "SEP", "<|#|>") - "entity<|#|>name" - >>> fix_tuple_delimiter_corruption("entityname", "SEP", "<|#|>") - "entity<|#|>name" - >>> fix_tuple_delimiter_corruption("entity|#|>name", "SEP", "<|#|>") - "entity<|#|>name" - Regex Sample: - <\|S\|+S\|> - <\|\\S\|> - <\|+> - <.?\|S\|.?> - <\|?S\|?> - <[^|]S\|>|<\|S[^|]> - <\|S\|(?!>) - <\|\|(?!>) - (? - <\|S\|>\| - \|\|S\|\| """ if not record or not delimiter_core or not tuple_delimiter: return record From 3792f86de31b68fa86e5b61509ed6cba553e29f7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 14 Sep 2025 13:45:59 +0800 Subject: [PATCH 2/6] Improve entity extraction prompts and error message formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Fix typo in error log message • Clarify format requirements in prompts • Make extraction instructions clearer • Improve user prompt consistency --- lightrag/operate.py | 2 +- lightrag/prompt.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index b87ac37f..a31eaf9a 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -323,7 +323,7 @@ async def _handle_single_entity_extraction( if len(record_attributes) != 4 or "entity" not in record_attributes[0]: if len(record_attributes) > 1 and "entity" in record_attributes[0]: logger.warning( - f"{chunk_key}: 
LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` of type {record_attributes[2] if len(record_attributes) > 2 else 'N/A'}" + f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`" ) return None diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 31318b40..b3b313c3 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -74,7 +74,7 @@ PROMPTS["entity_extraction_user_prompt"] = """---Task--- Extract entities and relationships from the input text to be processed. ---Instructions--- -1. Output each entity and relationship on a single line; use `{tuple_delimiter}` as the field separator within each extracted item. +1. Adhere strictly to the format requirements for entity and relationship lists as specified in the system prompts. 2. Output `{completion_delimiter}` only after all relevant entities and relationships have been extracted. 3. Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated. @@ -85,7 +85,7 @@ PROMPTS["entity_continue_extraction_user_prompt"] = """---Task--- Identify any missed entities or relationships from the input text to be processed based on the last extraction task. ---Instructions--- -1. Output entities and relationships in the same format as the previous extraction task. +1. Adhere strictly to the format requirements for entity and relationship lists as specified in the system prompts. 2. Do not include entities and relationships that were correctly extracted in the last extraction task. 3. If an entity or relationship output was truncated or had missing fields in the last extraction task, please re-output it in the correct format. 4. 
Output each entity and relationship on a single line; use `{tuple_delimiter}` as the field separator within each extracted item. From 4dafec8884403fde7305995f2f5446145f01efbc Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 14 Sep 2025 17:29:27 +0800 Subject: [PATCH 3/6] Fix tuple delimiter regex patterns and add debug logging - Add debug logs for malformed records - Fix regex for consecutive delimiters - Handle missing closing brackets --- lightrag/operate.py | 2 ++ lightrag/utils.py | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index a31eaf9a..7f0a7299 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -325,6 +325,7 @@ async def _handle_single_entity_extraction( logger.warning( f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}` @ `{record_attributes[2] if len(record_attributes) > 2 else 'N/A'}`" ) + logger.debug(record_attributes) return None try: @@ -398,6 +399,7 @@ async def _handle_single_relationship_extraction( logger.warning( f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`~`{record_attributes[2] if len(record_attributes) >2 else 'N/A'}`" ) + logger.debug(record_attributes) return None try: diff --git a/lightrag/utils.py b/lightrag/utils.py index 9cb25f4c..568be0a9 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2583,9 +2583,9 @@ def fix_tuple_delimiter_corruption( # Escape the delimiter core for regex use escaped_delimiter_core = re.escape(delimiter_core) - # Fix: <|#||#|> -> <|#|>, <|#|||#|> -> <|#|> + # Fix: <|##|> -> <|#|>, <|#||#|> -> <|#|>, <|#|||#|> -> <|#|> record = re.sub( - rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>", + rf"<\|{escaped_delimiter_core}\|*?{escaped_delimiter_core}\|>", tuple_delimiter, record, ) @@ -2604,9 +2604,9 @@ def fix_tuple_delimiter_corruption( record, ) - # Fix: -> <|#|>, <|#|Y> 
-> <|#|>, -> <|#|>, <||#||> -> <|#|> (one extra characters outside pipes) + # Fix: -> <|#|>, <|#|Y> -> <|#|>, -> <|#|>, <||#||> -> <|#|>, <||#> -> <|#|> (one extra characters outside pipes) record = re.sub( - rf"<.?\|{escaped_delimiter_core}\|.?>", + rf"<.?\|{escaped_delimiter_core}\|*?>", tuple_delimiter, record, ) @@ -2625,9 +2625,10 @@ def fix_tuple_delimiter_corruption( record, ) - # Fix: <|#| -> <|#|> (missing closing >) + # Fix: <|#| -> <|#|>, <|#|| -> <|#|> (missing closing >) + # record = re.sub( - rf"<\|{escaped_delimiter_core}\|(?!>)", + rf"<\|{escaped_delimiter_core}\|+(?!>)", tuple_delimiter, record, ) From 92f8fc6fbf8310579fd71589cd9b1494668ce07f Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 14 Sep 2025 17:50:56 +0800 Subject: [PATCH 4/6] Improve entity extraction prompt clarity and make sure LLM output content only --- lightrag/prompt.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index b3b313c3..6f927d6d 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -74,23 +74,24 @@ PROMPTS["entity_extraction_user_prompt"] = """---Task--- Extract entities and relationships from the input text to be processed. ---Instructions--- -1. Adhere strictly to the format requirements for entity and relationship lists as specified in the system prompts. -2. Output `{completion_delimiter}` only after all relevant entities and relationships have been extracted. -3. Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated. +1. **Strict Adherence to Format:** Strictly adhere to all format requirements for entity and relationship lists, including output order, field delimiters, and proper noun handling, as specified in the system prompt. +2. **Output Content Only:** Output *only* the extracted list of entities and relationships. 
Do not include any introductory or concluding remarks, explanations, or additional text before or after the list. +3. **Completion Signal:** Output `{completion_delimiter}` as the final line after all relevant entities and relationships have been extracted and presented. """ PROMPTS["entity_continue_extraction_user_prompt"] = """---Task--- -Identify any missed entities or relationships from the input text to be processed based on the last extraction task. +Based on the last extraction task, identify and extract any **missed or incorrectly formatted** entities and relationships from the input text. ---Instructions--- -1. Adhere strictly to the format requirements for entity and relationship lists as specified in the system prompts. -2. Do not include entities and relationships that were correctly extracted in the last extraction task. -3. If an entity or relationship output was truncated or had missing fields in the last extraction task, please re-output it in the correct format. -4. Output each entity and relationship on a single line; use `{tuple_delimiter}` as the field separator within each extracted item. -5. Output `{completion_delimiter}` only after all relevant entities and relationships have been extracted. -6. Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) may in their original language if proper translation is not available. +1. **Strict Adherence to System Format:** Strictly adhere to all format requirements for entity and relationship lists, including output order, field delimiters, and proper noun handling, as specified in the system instructions. +2. **Focus on Corrections/Additions:** + * **Do NOT** re-output entities and relationships that were **correctly and fully** extracted in the last task. + * If an entity or relationship was **missed** in the last task, extract and output it now according to the system format. 
+ * If an entity or relationship was **truncated, had missing fields, or was otherwise incorrectly formatted** in the last task, re-output the *corrected and complete* version in the specified format. +3. **Output Content Only:** Output *only* the extracted list of entities and relationships. Do not include any introductory or concluding remarks, explanations, or additional text before or after the list. +4. **Completion Signal:** Output `{completion_delimiter}` as the final line after all relevant missing or corrected entities and relationships have been extracted and presented. """ From 27f1eef616e7db78ba061c9a783d6fc7376cfb4e Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 14 Sep 2025 18:26:41 +0800 Subject: [PATCH 5/6] Add language control and format clarity to extraction prompts --- lightrag/prompt.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 6f927d6d..f6842700 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -77,6 +77,7 @@ Extract entities and relationships from the input text to be processed. 1. **Strict Adherence to Format:** Strictly adhere to all format requirements for entity and relationship lists, including output order, field delimiters, and proper noun handling, as specified in the system prompt. 2. **Output Content Only:** Output *only* the extracted list of entities and relationships. Do not include any introductory or concluding remarks, explanations, or additional text before or after the list. 3. **Completion Signal:** Output `{completion_delimiter}` as the final line after all relevant entities and relationships have been extracted and presented. +4. **Output Language:** Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated. 
""" @@ -90,8 +91,11 @@ Based on the last extraction task, identify and extract any **missed or incorrec * **Do NOT** re-output entities and relationships that were **correctly and fully** extracted in the last task. * If an entity or relationship was **missed** in the last task, extract and output it now according to the system format. * If an entity or relationship was **truncated, had missing fields, or was otherwise incorrectly formatted** in the last task, re-output the *corrected and complete* version in the specified format. -3. **Output Content Only:** Output *only* the extracted list of entities and relationships. Do not include any introductory or concluding remarks, explanations, or additional text before or after the list. -4. **Completion Signal:** Output `{completion_delimiter}` as the final line after all relevant missing or corrected entities and relationships have been extracted and presented. +3. **Output Format - Entities:** Output a total of 4 fields for each entity, delimited by `{tuple_delimiter}`, on a single line. The first field *must* be the literal string `entity`. +4. **Output Format - Relationships:** Output a total of 5 fields for each relationship, delimited by `{tuple_delimiter}`, on a single line. The first field *must* be the literal string `relation`. +5. **Output Content Only:** Output *only* the extracted list of entities and relationships. Do not include any introductory or concluding remarks, explanations, or additional text before or after the list. +6. **Completion Signal:** Output `{completion_delimiter}` as the final line after all relevant missing or corrected entities and relationships have been extracted and presented. +7. **Output Language:** Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated. 
""" From 965d94210360d428907ff8c63d694e38309a6d58 Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 14 Sep 2025 18:31:33 +0800 Subject: [PATCH 6/6] Fix linting --- lightrag/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 568be0a9..c7a048fb 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2626,7 +2626,6 @@ def fix_tuple_delimiter_corruption( ) # Fix: <|#| -> <|#|>, <|#|| -> <|#|> (missing closing >) - # record = re.sub( rf"<\|{escaped_delimiter_core}\|+(?!>)", tuple_delimiter,