From 5b2deccbef2fde022d567862500d5689db8bd475 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 2 Sep 2025 02:51:41 +0800
Subject: [PATCH] Improve text normalization and add entity type capitalization
- Capitalize entity types with .title()
- Add non-breaking space handling
- Add narrow non-breaking space regex
---
lightrag/operate.py | 3 +++
lightrag/utils.py | 38 +++++++++++++++++++++++---------------
2 files changed, 26 insertions(+), 15 deletions(-)
diff --git a/lightrag/operate.py b/lightrag/operate.py
index ccf2550d..6c9a5538 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -347,6 +347,9 @@ async def _handle_single_entity_extraction(
)
return None
+ # Title-case entity_type (capitalize the first letter of each word)
+ entity_type = entity_type.title()
+
# Process entity description with same cleaning pipeline
entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])
diff --git a/lightrag/utils.py b/lightrag/utils.py
index c215d52b..f2d56282 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1759,17 +1759,22 @@ def sanitize_and_normalize_extracted_text(
def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
"""Normalize entity/relation names and description with the following rules:
- 1. Clean HTML tags (paragraph and line break tags)
- 2. Convert Chinese symbols to English symbols
- 3. Remove spaces between Chinese characters
- 4. Remove spaces between Chinese characters and English letters/numbers
- 5. Preserve spaces within English text and numbers
- 6. Replace Chinese parentheses with English parentheses
- 7. Replace Chinese dash with English dash
- 8. Remove English quotation marks from the beginning and end of the text
- 9. Remove English quotation marks in and around chinese
- 10. Remove Chinese quotation marks
- 11. Filter out short numeric-only text (length < 3 and only digits/dots)
+ - Clean HTML tags (paragraph and line break tags)
+ - Convert Chinese symbols to English symbols
+ - Remove spaces between Chinese characters
+ - Remove spaces between Chinese characters and English letters/numbers
+ - Preserve spaces within English text and numbers
+ - Replace Chinese parentheses with English parentheses
+ - Replace Chinese dash with English dash
+ - Remove English quotation marks from the beginning and end of the text
+ - Remove English quotation marks in and around Chinese
+ - Remove Chinese quotation marks
+ - Filter out short numeric-only text (length < 3 and only digits/dots)
+ - remove_inner_quotes = True
+ remove Chinese quotes
+ remove English quotes in and around Chinese
+ Convert non-breaking spaces to regular spaces
+ Convert narrow non-breaking spaces after non-digits to regular spaces
Args:
name: Entity name to normalize
@@ -1778,11 +1783,10 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
Returns:
Normalized entity name
"""
- # 1. Clean HTML tags - remove paragraph and line break tags
+ # Clean HTML tags - remove paragraph and line break tags
name = re.sub(r"
||
", "", name, flags=re.IGNORECASE)
name = re.sub(r"|
|
", "", name, flags=re.IGNORECASE)
- # 2. Convert Chinese symbols to English symbols
# Chinese full-width letters to half-width (A-Z, a-z)
name = name.translate(
str.maketrans(
@@ -1849,11 +1853,15 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
name = inner_content
if remove_inner_quotes:
- # remove Chinese quotes
+ # Remove Chinese quotes
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
- # remove English queotes in and around chinese
+ # Remove English quotes in and around Chinese
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
+ # Convert non-breaking space to regular space
+ name = name.replace("\u00a0", " ")
+ # Convert narrow non-breaking space to regular space when after non-digits
+ name = re.sub(r"(?<=[^\d])\u202F", " ", name)
# Remove spaces from the beginning and end of the text
name = name.strip()