Improve text normalization and add entity type capitalization
- Capitalize entity types with .title() - Add non-breaking space handling - Add narrow non-breaking space regex
This commit is contained in:
parent
29f0ecc88c
commit
5b2deccbef
2 changed files with 26 additions and 15 deletions
|
|
@ -347,6 +347,9 @@ async def _handle_single_entity_extraction(
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Title-case entity_type (capitalize the first letter of each word)
|
||||||
|
entity_type = entity_type.title()
|
||||||
|
|
||||||
# Process entity description with same cleaning pipeline
|
# Process entity description with same cleaning pipeline
|
||||||
entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])
|
entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1759,17 +1759,22 @@ def sanitize_and_normalize_extracted_text(
|
||||||
|
|
||||||
def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
|
def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
|
||||||
"""Normalize entity/relation names and description with the following rules:
|
"""Normalize entity/relation names and description with the following rules:
|
||||||
1. Clean HTML tags (paragraph and line break tags)
|
- Clean HTML tags (paragraph and line break tags)
|
||||||
2. Convert Chinese symbols to English symbols
|
- Convert Chinese symbols to English symbols
|
||||||
3. Remove spaces between Chinese characters
|
- Remove spaces between Chinese characters
|
||||||
4. Remove spaces between Chinese characters and English letters/numbers
|
- Remove spaces between Chinese characters and English letters/numbers
|
||||||
5. Preserve spaces within English text and numbers
|
- Preserve spaces within English text and numbers
|
||||||
6. Replace Chinese parentheses with English parentheses
|
- Replace Chinese parentheses with English parentheses
|
||||||
7. Replace Chinese dash with English dash
|
- Replace Chinese dash with English dash
|
||||||
8. Remove English quotation marks from the beginning and end of the text
|
- Remove English quotation marks from the beginning and end of the text
|
||||||
9. Remove English quotation marks in and around chinese
|
- Remove English quotation marks in and around chinese
|
||||||
10. Remove Chinese quotation marks
|
- Remove Chinese quotation marks
|
||||||
11. Filter out short numeric-only text (length < 3 and only digits/dots)
|
- Filter out short numeric-only text (length < 3 and only digits/dots)
|
||||||
|
- remove_inner_quotes = True
|
||||||
|
remove Chinese quotes
|
||||||
|
remove English quotes in and around Chinese
|
||||||
|
Convert non-breaking spaces to regular spaces
|
||||||
|
Convert narrow non-breaking spaces after non-digits to regular spaces
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
name: Entity name to normalize
|
name: Entity name to normalize
|
||||||
|
|
@ -1778,11 +1783,10 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
|
||||||
Returns:
|
Returns:
|
||||||
Normalized entity name
|
Normalized entity name
|
||||||
"""
|
"""
|
||||||
# 1. Clean HTML tags - remove paragraph and line break tags
|
# Clean HTML tags - remove paragraph and line break tags
|
||||||
name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
|
name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
|
||||||
name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
|
name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
|
||||||
|
|
||||||
# 2. Convert Chinese symbols to English symbols
|
|
||||||
# Chinese full-width letters to half-width (A-Z, a-z)
|
# Chinese full-width letters to half-width (A-Z, a-z)
|
||||||
name = name.translate(
|
name = name.translate(
|
||||||
str.maketrans(
|
str.maketrans(
|
||||||
|
|
@ -1849,11 +1853,15 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
|
||||||
name = inner_content
|
name = inner_content
|
||||||
|
|
||||||
if remove_inner_quotes:
|
if remove_inner_quotes:
|
||||||
# remove Chinese quotes
|
# Remove Chinese quotes
|
||||||
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
|
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
|
||||||
# remove English queotes in and around chinese
|
# Remove English quotes in and around Chinese
|
||||||
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
|
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
|
||||||
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
|
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
|
||||||
|
# Convert non-breaking space to regular space
|
||||||
|
name = name.replace("\u00a0", " ")
|
||||||
|
# Convert narrow non-breaking space to regular space when after non-digits
|
||||||
|
name = re.sub(r"(?<=[^\d])\u202F", " ", name)
|
||||||
|
|
||||||
# Remove spaces from the beginning and end of the text
|
# Remove spaces from the beginning and end of the text
|
||||||
name = name.strip()
|
name = name.strip()
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue