diff --git a/lightrag/operate.py b/lightrag/operate.py index ccf2550d..6c9a5538 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -347,6 +347,9 @@ async def _handle_single_entity_extraction( ) return None + # Title-case entity_type (capitalize the first letter of each word) + entity_type = entity_type.title() + # Process entity description with same cleaning pipeline entity_description = sanitize_and_normalize_extracted_text(record_attributes[3]) diff --git a/lightrag/utils.py b/lightrag/utils.py index c215d52b..f2d56282 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -1759,17 +1759,22 @@ def sanitize_and_normalize_extracted_text( def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str: """Normalize entity/relation names and description with the following rules: - 1. Clean HTML tags (paragraph and line break tags) - 2. Convert Chinese symbols to English symbols - 3. Remove spaces between Chinese characters - 4. Remove spaces between Chinese characters and English letters/numbers - 5. Preserve spaces within English text and numbers - 6. Replace Chinese parentheses with English parentheses - 7. Replace Chinese dash with English dash - 8. Remove English quotation marks from the beginning and end of the text - 9. Remove English quotation marks in and around chinese - 10. Remove Chinese quotation marks - 11. 
Filter out short numeric-only text (length < 3 and only digits/dots) + - Clean HTML tags (paragraph and line break tags) + - Convert Chinese symbols to English symbols + - Remove spaces between Chinese characters + - Remove spaces between Chinese characters and English letters/numbers + - Preserve spaces within English text and numbers + - Replace Chinese parentheses with English parentheses + - Replace Chinese dash with English dash + - Remove English quotation marks from the beginning and end of the text + - Remove English quotation marks in and around Chinese + - Remove Chinese quotation marks + - Filter out short numeric-only text (length < 3 and only digits/dots) + - remove_inner_quotes = True + remove Chinese quotes + remove English quotes in and around Chinese + Convert non-breaking spaces to regular spaces + Convert narrow non-breaking spaces after non-digits to regular spaces Args: name: Entity name to normalize @@ -1778,11 +1783,10 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str: Returns: Normalized entity name """ - # 1. Clean HTML tags - remove paragraph and line break tags + # Clean HTML tags - remove paragraph and line break tags name = re.sub(r"||

", "", name, flags=re.IGNORECASE) name = re.sub(r"||
", "", name, flags=re.IGNORECASE) - # 2. Convert Chinese symbols to English symbols # Chinese full-width letters to half-width (A-Z, a-z) name = name.translate( str.maketrans( @@ -1849,11 +1853,15 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str: name = inner_content if remove_inner_quotes: - # remove Chinese quotes + # Remove Chinese quotes name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "") - # remove English queotes in and around chinese + # Remove English quotes in and around Chinese name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name) name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name) + # Convert non-breaking space to regular space + name = name.replace("\u00a0", " ") + # Convert narrow non-breaking space to regular space when it follows a non-digit + name = re.sub(r"(?<=[^\d])\u202F", " ", name) # Remove spaces from the beginning and end of the text name = name.strip()