diff --git a/lightrag/operate.py b/lightrag/operate.py index b83790ab..4fa5ddb1 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -319,7 +319,7 @@ async def _handle_single_entity_extraction( try: entity_name = sanitize_and_normalize_extracted_text( - record_attributes[1], is_entity=True + record_attributes[1], remove_inner_quotes=True ) # Validate entity name after all cleaning steps @@ -330,9 +330,13 @@ async def _handle_single_entity_extraction( return None # Process entity type with same cleaning pipeline - entity_type = sanitize_and_normalize_extracted_text(record_attributes[2]) + entity_type = sanitize_and_normalize_extracted_text( + record_attributes[2], remove_inner_quotes=True + ) - if not entity_type.strip() or entity_type.startswith('("'): + if not entity_type.strip() or any( + char in entity_type for char in ["'", "(", ")", "<", ">", "|", "/", "\\"] + ): logger.warning( f"Entity extraction error: invalid entity type in: {record_attributes}" ) @@ -376,8 +380,12 @@ async def _handle_single_relationship_extraction( return None try: - source = sanitize_and_normalize_extracted_text(record_attributes[1]) - target = sanitize_and_normalize_extracted_text(record_attributes[2]) + source = sanitize_and_normalize_extracted_text( + record_attributes[1], remove_inner_quotes=True + ) + target = sanitize_and_normalize_extracted_text( + record_attributes[2], remove_inner_quotes=True + ) # Validate entity names after all cleaning steps if not source: @@ -402,7 +410,9 @@ async def _handle_single_relationship_extraction( edge_description = sanitize_and_normalize_extracted_text(record_attributes[3]) # Process keywords with same cleaning pipeline - edge_keywords = sanitize_and_normalize_extracted_text(record_attributes[4]) + edge_keywords = sanitize_and_normalize_extracted_text( + record_attributes[4], remove_inner_quotes=True + ) edge_keywords = edge_keywords.replace(",", ",") edge_source_id = chunk_key diff --git a/lightrag/utils.py b/lightrag/utils.py index 
82a7cce4..9dc5a5b2 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1715,7 +1715,9 @@ def get_content_summary(content: str, max_length: int = 250) -> str: return content[:max_length] + "..." -def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str: +def sanitize_and_normalize_extracted_text( + input_text: str, remove_inner_quotes=False +) -> str: """Santitize and normalize extracted text Args: input_text: text string to be processed @@ -1725,33 +1727,66 @@ def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str Santitized and normalized text string """ safe_input_text = sanitize_text_for_encoding(input_text) - normalized_text = normalize_extracted_info(safe_input_text, is_name) - return normalized_text + if safe_input_text: + normalized_text = normalize_extracted_info( + safe_input_text, remove_inner_quotes=remove_inner_quotes + ) + return normalized_text + return "" -def normalize_extracted_info(name: str, is_entity=False) -> str: +def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str: """Normalize entity/relation names and description with the following rules: - 1. Remove spaces between Chinese characters - 2. Remove spaces between Chinese characters and English letters/numbers - 3. Preserve spaces within English text and numbers - 4. Replace Chinese parentheses with English parentheses - 5. Replace Chinese dash with English dash - 6. Remove English quotation marks from the beginning and end of the text - 7. Remove English quotation marks in and around chinese - 8. Remove Chinese quotation marks + 1. Clean HTML tags (paragraph and line break tags) + 2. Convert Chinese symbols to English symbols + 3. Remove spaces between Chinese characters + 4. Remove spaces between Chinese characters and English letters/numbers + 5. Preserve spaces within English text and numbers + 6. Replace Chinese parentheses with English parentheses + 7. Replace Chinese dash with English dash + 8. 
Remove English quotation marks from the beginning and end of the text + 9. Remove English quotation marks in and around Chinese + 10. Remove Chinese quotation marks + 11. Filter out short numeric-only text (length < 3 and only digits/dots) Args: name: Entity name to normalize + remove_inner_quotes: Whether to remove inner quotation marks (affects quote handling) Returns: Normalized entity name """ + # 1. Clean HTML tags - remove paragraph and line break tags + name = re.sub(r"
||
", "", name, flags=re.IGNORECASE) + name = re.sub(r"|