Improve text normalization and add entity type capitalization

- Capitalize entity types with .title()
- Add non-breaking space handling
- Add narrow non-breaking space regex
This commit is contained in:
yangdx 2025-09-02 02:51:41 +08:00
parent 29f0ecc88c
commit 5b2deccbef
2 changed files with 26 additions and 15 deletions

View file

@ -347,6 +347,9 @@ async def _handle_single_entity_extraction(
)
return None
# Capitalize the first letter of each word in entity_type
entity_type = entity_type.title()
# Process entity description with same cleaning pipeline
entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])

View file

@ -1759,17 +1759,22 @@ def sanitize_and_normalize_extracted_text(
def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
"""Normalize entity/relation names and description with the following rules:
1. Clean HTML tags (paragraph and line break tags)
2. Convert Chinese symbols to English symbols
3. Remove spaces between Chinese characters
4. Remove spaces between Chinese characters and English letters/numbers
5. Preserve spaces within English text and numbers
6. Replace Chinese parentheses with English parentheses
7. Replace Chinese dash with English dash
8. Remove English quotation marks from the beginning and end of the text
9. Remove English quotation marks in and around chinese
10. Remove Chinese quotation marks
11. Filter out short numeric-only text (length < 3 and only digits/dots)
- Clean HTML tags (paragraph and line break tags)
- Convert Chinese symbols to English symbols
- Remove spaces between Chinese characters
- Remove spaces between Chinese characters and English letters/numbers
- Preserve spaces within English text and numbers
- Replace Chinese parentheses with English parentheses
- Replace Chinese dash with English dash
- Remove English quotation marks from the beginning and end of the text
- Remove English quotation marks in and around chinese
- Remove Chinese quotation marks
- Filter out short numeric-only text (length < 3 and only digits/dots)
- remove_inner_quotes = True
remove Chinese quotes
remove English quotes in and around Chinese
Convert non-breaking spaces to regular spaces
Convert narrow non-breaking spaces after non-digits to regular spaces
Args:
name: Entity name to normalize
@ -1778,11 +1783,10 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
Returns:
Normalized entity name
"""
# 1. Clean HTML tags - remove paragraph and line break tags
# Clean HTML tags - remove paragraph and line break tags
name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
# 2. Convert Chinese symbols to English symbols
# Chinese full-width letters to half-width (A-Z, a-z)
name = name.translate(
str.maketrans(
@ -1849,11 +1853,15 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
name = inner_content
if remove_inner_quotes:
# remove Chinese quotes
# Remove Chinese quotes
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
# remove English quotes in and around chinese
# Remove English quotes in and around Chinese
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
# Convert non-breaking space to regular space
name = name.replace("\u00a0", " ")
# Convert a narrow non-breaking space to a regular space when it follows a non-digit character
name = re.sub(r"(?<=[^\d])\u202F", " ", name)
# Remove spaces from the beginning and end of the text
name = name.strip()