Improve text normalization and add entity type capitalization

- Capitalize entity types with .title()
- Add non-breaking space handling
- Add narrow non-breaking space regex
This commit is contained in:
yangdx 2025-09-02 02:51:41 +08:00
parent 29f0ecc88c
commit 5b2deccbef
2 changed files with 26 additions and 15 deletions

View file

@ -347,6 +347,9 @@ async def _handle_single_entity_extraction(
) )
return None return None
# Capitalize the first letter of each word in entity_type
entity_type = entity_type.title()
# Process entity description with same cleaning pipeline # Process entity description with same cleaning pipeline
entity_description = sanitize_and_normalize_extracted_text(record_attributes[3]) entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])

View file

@ -1759,17 +1759,22 @@ def sanitize_and_normalize_extracted_text(
def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str: def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
"""Normalize entity/relation names and description with the following rules: """Normalize entity/relation names and description with the following rules:
1. Clean HTML tags (paragraph and line break tags) - Clean HTML tags (paragraph and line break tags)
2. Convert Chinese symbols to English symbols - Convert Chinese symbols to English symbols
3. Remove spaces between Chinese characters - Remove spaces between Chinese characters
4. Remove spaces between Chinese characters and English letters/numbers - Remove spaces between Chinese characters and English letters/numbers
5. Preserve spaces within English text and numbers - Preserve spaces within English text and numbers
6. Replace Chinese parentheses with English parentheses - Replace Chinese parentheses with English parentheses
7. Replace Chinese dash with English dash - Replace Chinese dash with English dash
8. Remove English quotation marks from the beginning and end of the text - Remove English quotation marks from the beginning and end of the text
9. Remove English quotation marks in and around chinese - Remove English quotation marks in and around chinese
10. Remove Chinese quotation marks - Remove Chinese quotation marks
11. Filter out short numeric-only text (length < 3 and only digits/dots) - Filter out short numeric-only text (length < 3 and only digits/dots)
- remove_inner_quotes = True
remove Chinese quotes
remove English quotes in and around Chinese
Convert non-breaking spaces to regular spaces
Convert narrow non-breaking spaces after non-digits to regular spaces
Args: Args:
name: Entity name to normalize name: Entity name to normalize
@ -1778,11 +1783,10 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
Returns: Returns:
Normalized entity name Normalized entity name
""" """
# 1. Clean HTML tags - remove paragraph and line break tags # Clean HTML tags - remove paragraph and line break tags
name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE) name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE) name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
# 2. Convert Chinese symbols to English symbols
# Chinese full-width letters to half-width (A-Z, a-z) # Chinese full-width letters to half-width (A-Z, a-z)
name = name.translate( name = name.translate(
str.maketrans( str.maketrans(
@ -1849,11 +1853,15 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
name = inner_content name = inner_content
if remove_inner_quotes: if remove_inner_quotes:
# remove Chinese quotes # Remove Chinese quotes
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "") name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
# remove English quotes in and around Chinese # Remove English quotes in and around Chinese
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name) name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name) name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
# Convert non-breaking space to regular space
name = name.replace("\u00a0", " ")
# Convert narrow non-breaking space to regular space when after non-digits
name = re.sub(r"(?<=[^\d])\u202F", " ", name)
# Remove spaces from the beginning and end of the text # Remove spaces from the beginning and end of the text
name = name.strip() name = name.strip()