From b7474179612a1e7a574b0bf0a64367f36436985e Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 31 Aug 2025 13:17:20 +0800 Subject: [PATCH] feat: enhance text extraction text sanitization and normalization - Improve reduntant quotes in entity and relation name, type and keywords - Add HTML tag cleaning and Chinese symbol conversion - Filter out short numeric content and malformed text - Enhance entity type validation with character filtering --- lightrag/operate.py | 22 ++++++--- lightrag/utils.py | 111 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 108 insertions(+), 25 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index b83790ab..4fa5ddb1 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -319,7 +319,7 @@ async def _handle_single_entity_extraction( try: entity_name = sanitize_and_normalize_extracted_text( - record_attributes[1], is_entity=True + record_attributes[1], remove_inner_quotes=True ) # Validate entity name after all cleaning steps @@ -330,9 +330,13 @@ async def _handle_single_entity_extraction( return None # Process entity type with same cleaning pipeline - entity_type = sanitize_and_normalize_extracted_text(record_attributes[2]) + entity_type = sanitize_and_normalize_extracted_text( + record_attributes[2], remove_inner_quotes=True + ) - if not entity_type.strip() or entity_type.startswith('("'): + if not entity_type.strip() or any( + char in entity_type for char in ["'", "(", ")", "<", ">", "|", "/", "\\"] + ): logger.warning( f"Entity extraction error: invalid entity type in: {record_attributes}" ) @@ -376,8 +380,12 @@ async def _handle_single_relationship_extraction( return None try: - source = sanitize_and_normalize_extracted_text(record_attributes[1]) - target = sanitize_and_normalize_extracted_text(record_attributes[2]) + source = sanitize_and_normalize_extracted_text( + record_attributes[1], remove_inner_quotes=True + ) + target = sanitize_and_normalize_extracted_text( + record_attributes[2], 
remove_inner_quotes=True + ) # Validate entity names after all cleaning steps if not source: @@ -402,7 +410,9 @@ async def _handle_single_relationship_extraction( edge_description = sanitize_and_normalize_extracted_text(record_attributes[3]) # Process keywords with same cleaning pipeline - edge_keywords = sanitize_and_normalize_extracted_text(record_attributes[4]) + edge_keywords = sanitize_and_normalize_extracted_text( + record_attributes[4], remove_inner_quotes=True + ) edge_keywords = edge_keywords.replace("，", ",") edge_source_id = chunk_key diff --git a/lightrag/utils.py b/lightrag/utils.py index 82a7cce4..9dc5a5b2 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1715,7 +1715,9 @@ def get_content_summary(content: str, max_length: int = 250) -> str: return content[:max_length] + "..." -def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str: +def sanitize_and_normalize_extracted_text( + input_text: str, remove_inner_quotes=False +) -> str: """Santitize and normalize extracted text Args: input_text: text string to be processed @@ -1725,33 +1727,66 @@ def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str Santitized and normalized text string """ safe_input_text = sanitize_text_for_encoding(input_text) - normalized_text = normalize_extracted_info(safe_input_text, is_name) - return normalized_text + if safe_input_text: + normalized_text = normalize_extracted_info( + safe_input_text, remove_inner_quotes=remove_inner_quotes + ) + return normalized_text + return "" -def normalize_extracted_info(name: str, is_entity=False) -> str: +def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str: """Normalize entity/relation names and description with the following rules: - 1. Remove spaces between Chinese characters - 2. Remove spaces between Chinese characters and English letters/numbers - 3. Preserve spaces within English text and numbers - 4.
Replace Chinese parentheses with English parentheses - 5. Replace Chinese dash with English dash - 6. Remove English quotation marks from the beginning and end of the text - 7. Remove English quotation marks in and around chinese - 8. Remove Chinese quotation marks + 1. Clean HTML tags (paragraph and line break tags) + 2. Convert Chinese symbols to English symbols + 3. Remove spaces between Chinese characters + 4. Remove spaces between Chinese characters and English letters/numbers + 5. Preserve spaces within English text and numbers + 6. Replace Chinese parentheses with English parentheses + 7. Replace Chinese dash with English dash + 8. Remove English quotation marks from the beginning and end of the text + 9. Remove English quotation marks in and around chinese + 10. Remove Chinese quotation marks + 11. Filter out short numeric-only text (length < 3 and only digits/dots) Args: name: Entity name to normalize + is_entity: Whether this is an entity name (affects quote handling) Returns: Normalized entity name """ + # 1. Clean HTML tags - remove paragraph and line break tags + name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE) + name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE) + + # 2. Convert Chinese symbols to English symbols + # Chinese full-width letters to half-width (A-Z, a-z) + name = name.translate( + str.maketrans( + "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", + ) + ) + + # Chinese full-width numbers to half-width + name = name.translate(str.maketrans("０１２３４５６７８９", "0123456789")) + + # Chinese full-width symbols to half-width + name = name.replace("－", "-") # Chinese minus + name = name.replace("＋", "+") # Chinese plus + name = name.replace("／", "/") # Chinese slash + name = name.replace("＊", "*") # Chinese asterisk + # Replace Chinese parentheses with English parentheses name = name.replace("（", "(").replace("）", ")") - # Replace Chinese dash with English dash + # Replace Chinese dash with English dash (additional patterns) name = name.replace("—", "-").replace("－", "-") + # Chinese full-width space to regular space (after other replacements) + name = name.replace("　", " ") + # Use regex to remove spaces between Chinese characters # Regex explanation: # (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character @@ -1767,19 +1802,57 @@ def normalize_extracted_info(name: str, is_entity=False) -> str: r"(?<=[a-zA-Z0-9\(\)\[\]@#$%!&\*\-=+_])\s+(?=[\u4e00-\u9fa5])", "", name ) - # Remove English quotation marks from the beginning and end - if len(name) >= 2 and name.startswith('"') and name.endswith('"'): - name = name[1:-1] - if len(name) >= 2 and name.startswith("'") and name.endswith("'"): - name = name[1:-1] + # Remove outer quotes + if len(name) >= 2: + # Handle double quotes + if name.startswith('"') and name.endswith('"'): + inner_content = name[1:-1] + if '"' not in inner_content: # No double quotes inside + name = inner_content - if is_entity: + # Handle single quotes + if name.startswith("'") and name.endswith("'"): + inner_content = name[1:-1] + if "'" not in inner_content: # No single quotes inside + name =
inner_content + + # Handle Chinese-style double quotes + if name.startswith("“") and name.endswith("”"): + inner_content = name[1:-1] + if "“" not in inner_content and "”" not in inner_content: + name = inner_content + if name.startswith("‘") and name.endswith("’"): + inner_content = name[1:-1] + if "‘" not in inner_content and "’" not in inner_content: + name = inner_content + + if remove_inner_quotes: # remove Chinese quotes name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "") # remove English queotes in and around chinese name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name) name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name) + # Remove spaces from the beginning and end of the text + name = name.strip() + + # Filter out pure numeric content with length < 3 + if len(name) < 3 and re.match(r"^[0-9]+$", name): + return "" + + def should_filter_by_dots(text): + """ + Check if the string consists only of dots and digits, with at least one dot + Filter cases include: 1.2.3, 12.3, .123, 123., 12.3., .1.23 etc. + """ + return all(c.isdigit() or c == "." for c in text) and "." in text + + if len(name) < 6 and should_filter_by_dots(name): + # Filter out mixed numeric and dot content with length < 6 + return "" + # Filter out mixed numeric and dot content with length < 6, requiring at least one dot + return "" + return name