feat: enhance extracted text sanitization and normalization

- Improve removal of redundant quotes in entity and relation names, types and keywords
- Add HTML tag cleaning and Chinese symbol conversion
- Filter out short numeric content and malformed text
- Enhance entity type validation with character filtering
This commit is contained in:
yangdx 2025-08-31 13:17:20 +08:00
parent d4bbc5dea9
commit b747417961
2 changed files with 108 additions and 25 deletions

View file

@ -319,7 +319,7 @@ async def _handle_single_entity_extraction(
try:
entity_name = sanitize_and_normalize_extracted_text(
record_attributes[1], is_entity=True
record_attributes[1], remove_inner_quotes=True
)
# Validate entity name after all cleaning steps
@ -330,9 +330,13 @@ async def _handle_single_entity_extraction(
return None
# Process entity type with same cleaning pipeline
entity_type = sanitize_and_normalize_extracted_text(record_attributes[2])
entity_type = sanitize_and_normalize_extracted_text(
record_attributes[2], remove_inner_quotes=True
)
if not entity_type.strip() or entity_type.startswith('("'):
if not entity_type.strip() or any(
char in entity_type for char in ["'", "(", ")", "<", ">", "|", "/", "\\"]
):
logger.warning(
f"Entity extraction error: invalid entity type in: {record_attributes}"
)
@ -376,8 +380,12 @@ async def _handle_single_relationship_extraction(
return None
try:
source = sanitize_and_normalize_extracted_text(record_attributes[1])
target = sanitize_and_normalize_extracted_text(record_attributes[2])
source = sanitize_and_normalize_extracted_text(
record_attributes[1], remove_inner_quotes=True
)
target = sanitize_and_normalize_extracted_text(
record_attributes[2], remove_inner_quotes=True
)
# Validate entity names after all cleaning steps
if not source:
@ -402,7 +410,9 @@ async def _handle_single_relationship_extraction(
edge_description = sanitize_and_normalize_extracted_text(record_attributes[3])
# Process keywords with same cleaning pipeline
edge_keywords = sanitize_and_normalize_extracted_text(record_attributes[4])
edge_keywords = sanitize_and_normalize_extracted_text(
record_attributes[4], remove_inner_quotes=True
)
edge_keywords = edge_keywords.replace(",", ",")
edge_source_id = chunk_key

View file

@ -1715,7 +1715,9 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
return content[:max_length] + "..."
def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str:
def sanitize_and_normalize_extracted_text(
input_text: str, remove_inner_quotes=False
) -> str:
"""Sanitize and normalize extracted text
Args:
input_text: text string to be processed
@ -1725,33 +1727,66 @@ def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str
Sanitized and normalized text string
"""
safe_input_text = sanitize_text_for_encoding(input_text)
normalized_text = normalize_extracted_info(safe_input_text, is_name)
return normalized_text
if safe_input_text:
normalized_text = normalize_extracted_info(
safe_input_text, remove_inner_quotes=remove_inner_quotes
)
return normalized_text
return ""
def normalize_extracted_info(name: str, is_entity=False) -> str:
def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
"""Normalize entity/relation names and description with the following rules:
1. Remove spaces between Chinese characters
2. Remove spaces between Chinese characters and English letters/numbers
3. Preserve spaces within English text and numbers
4. Replace Chinese parentheses with English parentheses
5. Replace Chinese dash with English dash
6. Remove English quotation marks from the beginning and end of the text
7. Remove English quotation marks in and around chinese
8. Remove Chinese quotation marks
1. Clean HTML tags (paragraph and line break tags)
2. Convert Chinese symbols to English symbols
3. Remove spaces between Chinese characters
4. Remove spaces between Chinese characters and English letters/numbers
5. Preserve spaces within English text and numbers
6. Replace Chinese parentheses with English parentheses
7. Replace Chinese dash with English dash
8. Remove English quotation marks from the beginning and end of the text
9. Remove English quotation marks in and around Chinese
10. Remove Chinese quotation marks
11. Filter out short numeric-only text (length < 3 and only digits/dots)
Args:
name: Entity name to normalize
remove_inner_quotes: Whether to also remove quotes inside the text (all Chinese quotation marks, and English quotation marks adjacent to Chinese characters)
Returns:
Normalized entity name
"""
# 1. Clean HTML tags - remove paragraph and line break tags
name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
# 2. Convert Chinese symbols to English symbols
# Chinese full-width letters to half-width (A-Z, a-z)
name = name.translate(
str.maketrans(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
)
)
# Chinese full-width numbers to half-width
name = name.translate(str.maketrans("0123456789", "0123456789"))
# Chinese full-width symbols to half-width
name = name.replace("-", "-") # Chinese minus
name = name.replace("+", "+") # Chinese plus
name = name.replace("/", "/") # Chinese slash
name = name.replace("*", "*") # Chinese asterisk
# Replace Chinese parentheses with English parentheses
name = name.replace("(", "(").replace(")", ")")
# Replace Chinese dash with English dash
# Replace Chinese dash with English dash (additional patterns)
name = name.replace("—", "-").replace("-", "-")
# Chinese full-width space to regular space (after other replacements)
name = name.replace("\u3000", " ")
# Use regex to remove spaces between Chinese characters
# Regex explanation:
# (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
@ -1767,19 +1802,57 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
r"(?<=[a-zA-Z0-9\(\)\[\]@#$%!&\*\-=+_])\s+(?=[\u4e00-\u9fa5])", "", name
)
# Remove English quotation marks from the beginning and end
if len(name) >= 2 and name.startswith('"') and name.endswith('"'):
name = name[1:-1]
if len(name) >= 2 and name.startswith("'") and name.endswith("'"):
name = name[1:-1]
# Remove outer quotes
if len(name) >= 2:
# Handle double quotes
if name.startswith('"') and name.endswith('"'):
inner_content = name[1:-1]
if '"' not in inner_content: # No double quotes inside
name = inner_content
if is_entity:
# Handle single quotes
if name.startswith("'") and name.endswith("'"):
inner_content = name[1:-1]
if "'" not in inner_content: # No single quotes inside
name = inner_content
# Handle Chinese-style double quotes
if name.startswith("“") and name.endswith("”"):
inner_content = name[1:-1]
if "“" not in inner_content and "”" not in inner_content:
name = inner_content
if name.startswith("‘") and name.endswith("’"):
inner_content = name[1:-1]
if "‘" not in inner_content and "’" not in inner_content:
name = inner_content
if remove_inner_quotes:
# remove Chinese quotes
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
# remove English quotes in and around Chinese
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
# Remove spaces from the beginning and end of the text
name = name.strip()
# Filter out pure numeric content with length < 3
if len(name) < 3 and re.match(r"^[0-9]+$", name):
return ""
def should_filter_by_dots(text):
"""
Check if the string consists only of dots and digits, with at least one dot
Filter cases include: 1.2.3, 12.3, .123, 123., 12.3., .1.23 etc.
"""
return all(c.isdigit() or c == "." for c in text) and "." in text
if len(name) < 6 and should_filter_by_dots(name):
# Filter out mixed numeric and dot content with length < 6
return ""
# Filter out mixed numeric and dot content with length < 6, requiring at least one dot
return ""
return name