Improve text normalization and add entity type capitalization

- Capitalize entity types with .title() - Add non-breaking space handling - Add narrow non-breaking space regex
2025-09-02 02:51:41 +08:00 · 2025-09-02 02:51:41 +08:00 · 5b2deccbef
commit 5b2deccbef
parent 29f0ecc88c
2 changed files with 26 additions and 15 deletions
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@ -347,6 +347,9 @@ async def _handle_single_entity_extraction(
            )
            return None
        # Title-case entity_type: capitalize the first letter of each word and lowercase the rest
        entity_type = entity_type.title()
        # Process entity description with same cleaning pipeline
        entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@ -1759,17 +1759,22 @@ def sanitize_and_normalize_extracted_text(
 def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
    """Normalize entity/relation names and description with the following rules:
-    1. Clean HTML tags (paragraph and line break tags)
+    - Clean HTML tags (paragraph and line break tags)
-    2. Convert Chinese symbols to English symbols
+    - Convert Chinese symbols to English symbols
-    3. Remove spaces between Chinese characters
+    - Remove spaces between Chinese characters
-    4. Remove spaces between Chinese characters and English letters/numbers
+    - Remove spaces between Chinese characters and English letters/numbers
-    5. Preserve spaces within English text and numbers
+    - Preserve spaces within English text and numbers
-    6. Replace Chinese parentheses with English parentheses
+    - Replace Chinese parentheses with English parentheses
-    7. Replace Chinese dash with English dash
+    - Replace Chinese dash with English dash
-    8. Remove English quotation marks from the beginning and end of the text
+    - Remove English quotation marks from the beginning and end of the text
-    9. Remove English quotation marks in and around chinese
+    - Remove English quotation marks in and around chinese
-    10. Remove Chinese quotation marks
+    - Remove Chinese quotation marks
-    11. Filter out short numeric-only text (length < 3 and only digits/dots)
+    - Filter out short numeric-only text (length < 3 and only digits/dots)
    - remove_inner_quotes = True
        remove Chinese quotes
        remove English quotes in and around Chinese
        Convert non-breaking spaces to regular spaces
        Convert narrow non-breaking spaces after non-digits to regular spaces
    Args:
        name: Entity name to normalize
@ -1778,11 +1783,10 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
    Returns:
        Normalized entity name
    """
-    # 1. Clean HTML tags - remove paragraph and line break tags
+    # Clean HTML tags - remove paragraph and line break tags
    name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
    name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
    # 2. Convert Chinese symbols to English symbols
    # Chinese full-width letters to half-width (A-Z, a-z)
    name = name.translate(
        str.maketrans(
@ -1849,11 +1853,15 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
                name = inner_content
    if remove_inner_quotes:
-        # remove Chinese quotes
+        # Remove Chinese quotes
        name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
-        # remove English queotes in and around chinese
+        # Remove English quotes in and around Chinese
        name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
        name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
        # Convert non-breaking space to regular space
        name = name.replace("\u00a0", " ")
        # Convert narrow non-breaking space to regular space when after non-digits
        name = re.sub(r"(?<=[^\d])\u202F", " ", name)
    # Remove spaces from the beginning and end of the text
    name = name.strip()