feat: enhance sanitization and normalization of extracted text

- Improve removal of redundant quotes in entity and relation names, types and keywords
- Add HTML tag cleaning and Chinese symbol conversion
- Filter out short numeric content and malformed text
- Enhance entity type validation with character filtering
This commit is contained in:
yangdx 2025-08-31 13:17:20 +08:00
parent d4bbc5dea9
commit b747417961
2 changed files with 108 additions and 25 deletions

View file

@ -319,7 +319,7 @@ async def _handle_single_entity_extraction(
try: try:
entity_name = sanitize_and_normalize_extracted_text( entity_name = sanitize_and_normalize_extracted_text(
record_attributes[1], is_entity=True record_attributes[1], remove_inner_quotes=True
) )
# Validate entity name after all cleaning steps # Validate entity name after all cleaning steps
@ -330,9 +330,13 @@ async def _handle_single_entity_extraction(
return None return None
# Process entity type with same cleaning pipeline # Process entity type with same cleaning pipeline
entity_type = sanitize_and_normalize_extracted_text(record_attributes[2]) entity_type = sanitize_and_normalize_extracted_text(
record_attributes[2], remove_inner_quotes=True
)
if not entity_type.strip() or entity_type.startswith('("'): if not entity_type.strip() or any(
char in entity_type for char in ["'", "(", ")", "<", ">", "|", "/", "\\"]
):
logger.warning( logger.warning(
f"Entity extraction error: invalid entity type in: {record_attributes}" f"Entity extraction error: invalid entity type in: {record_attributes}"
) )
@ -376,8 +380,12 @@ async def _handle_single_relationship_extraction(
return None return None
try: try:
source = sanitize_and_normalize_extracted_text(record_attributes[1]) source = sanitize_and_normalize_extracted_text(
target = sanitize_and_normalize_extracted_text(record_attributes[2]) record_attributes[1], remove_inner_quotes=True
)
target = sanitize_and_normalize_extracted_text(
record_attributes[2], remove_inner_quotes=True
)
# Validate entity names after all cleaning steps # Validate entity names after all cleaning steps
if not source: if not source:
@ -402,7 +410,9 @@ async def _handle_single_relationship_extraction(
edge_description = sanitize_and_normalize_extracted_text(record_attributes[3]) edge_description = sanitize_and_normalize_extracted_text(record_attributes[3])
# Process keywords with same cleaning pipeline # Process keywords with same cleaning pipeline
edge_keywords = sanitize_and_normalize_extracted_text(record_attributes[4]) edge_keywords = sanitize_and_normalize_extracted_text(
record_attributes[4], remove_inner_quotes=True
)
edge_keywords = edge_keywords.replace("，", ",") edge_keywords = edge_keywords.replace("，", ",")
edge_source_id = chunk_key edge_source_id = chunk_key

View file

@ -1715,7 +1715,9 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
return content[:max_length] + "..." return content[:max_length] + "..."
def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str: def sanitize_and_normalize_extracted_text(
input_text: str, remove_inner_quotes=False
) -> str:
"""Santitize and normalize extracted text """Santitize and normalize extracted text
Args: Args:
input_text: text string to be processed input_text: text string to be processed
@ -1725,33 +1727,66 @@ def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str
Santitized and normalized text string Santitized and normalized text string
""" """
safe_input_text = sanitize_text_for_encoding(input_text) safe_input_text = sanitize_text_for_encoding(input_text)
normalized_text = normalize_extracted_info(safe_input_text, is_name) if safe_input_text:
return normalized_text normalized_text = normalize_extracted_info(
safe_input_text, remove_inner_quotes=remove_inner_quotes
)
return normalized_text
return ""
def normalize_extracted_info(name: str, is_entity=False) -> str: def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
"""Normalize entity/relation names and description with the following rules: """Normalize entity/relation names and description with the following rules:
1. Remove spaces between Chinese characters 1. Clean HTML tags (paragraph and line break tags)
2. Remove spaces between Chinese characters and English letters/numbers 2. Convert Chinese symbols to English symbols
3. Preserve spaces within English text and numbers 3. Remove spaces between Chinese characters
4. Replace Chinese parentheses with English parentheses 4. Remove spaces between Chinese characters and English letters/numbers
5. Replace Chinese dash with English dash 5. Preserve spaces within English text and numbers
6. Remove English quotation marks from the beginning and end of the text 6. Replace Chinese parentheses with English parentheses
7. Remove English quotation marks in and around chinese 7. Replace Chinese dash with English dash
8. Remove Chinese quotation marks 8. Remove English quotation marks from the beginning and end of the text
9. Remove English quotation marks in and around chinese
10. Remove Chinese quotation marks
11. Filter out short numeric-only text (length < 3 and only digits/dots)
Args: Args:
name: Entity name to normalize name: Entity name to normalize
is_entity: Whether this is an entity name (affects quote handling)
Returns: Returns:
Normalized entity name Normalized entity name
""" """
# 1. Clean HTML tags - remove paragraph and line break tags
name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
# 2. Convert Chinese symbols to English symbols
# Chinese full-width letters to half-width (A-Z, a-z)
name = name.translate(
str.maketrans(
"ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ",
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
)
)
# Chinese full-width numbers to half-width
name = name.translate(str.maketrans("０１２３４５６７８９", "0123456789"))
# Chinese full-width symbols to half-width
name = name.replace("－", "-") # Chinese minus
name = name.replace("＋", "+") # Chinese plus
name = name.replace("／", "/") # Chinese slash
name = name.replace("＊", "*") # Chinese asterisk
# Replace Chinese parentheses with English parentheses # Replace Chinese parentheses with English parentheses
name = name.replace("（", "(").replace("）", ")") name = name.replace("（", "(").replace("）", ")")
# Replace Chinese dash with English dash # Replace Chinese dash with English dash (additional patterns)
name = name.replace("—", "-").replace("－", "-") name = name.replace("—", "-").replace("－", "-")
# Chinese full-width space to regular space (after other replacements)
name = name.replace("　", " ")
# Use regex to remove spaces between Chinese characters # Use regex to remove spaces between Chinese characters
# Regex explanation: # Regex explanation:
# (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character # (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
@ -1767,19 +1802,57 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
r"(?<=[a-zA-Z0-9\(\)\[\]@#$%!&\*\-=+_])\s+(?=[\u4e00-\u9fa5])", "", name r"(?<=[a-zA-Z0-9\(\)\[\]@#$%!&\*\-=+_])\s+(?=[\u4e00-\u9fa5])", "", name
) )
# Remove English quotation marks from the beginning and end # Remove outer quotes
if len(name) >= 2 and name.startswith('"') and name.endswith('"'): if len(name) >= 2:
name = name[1:-1] # Handle double quotes
if len(name) >= 2 and name.startswith("'") and name.endswith("'"): if name.startswith('"') and name.endswith('"'):
name = name[1:-1] inner_content = name[1:-1]
if '"' not in inner_content: # No double quotes inside
name = inner_content
if is_entity: # Handle single quotes
if name.startswith("'") and name.endswith("'"):
inner_content = name[1:-1]
if "'" not in inner_content: # No single quotes inside
name = inner_content
# Handle Chinese-style double quotes
if name.startswith("“") and name.endswith("”"):
inner_content = name[1:-1]
if "“" not in inner_content and "”" not in inner_content:
name = inner_content
if name.startswith("‘") and name.endswith("’"):
inner_content = name[1:-1]
if "‘" not in inner_content and "’" not in inner_content:
name = inner_content
if remove_inner_quotes:
# remove Chinese quotes # remove Chinese quotes
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "") name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
# remove English queotes in and around chinese # remove English queotes in and around chinese
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name) name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name) name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
# Remove spaces from the beginning and end of the text
name = name.strip()
# Filter out pure numeric content with length < 3
if len(name) < 3 and re.match(r"^[0-9]+$", name):
return ""
def should_filter_by_dots(text):
"""
Check if the string consists only of dots and digits, with at least one dot
Filter cases include: 1.2.3, 12.3, .123, 123., 12.3., .1.23 etc.
"""
return all(c.isdigit() or c == "." for c in text) and "." in text
if len(name) < 6 and should_filter_by_dots(name):
# Filter out mixed numeric and dot content with length < 6
return ""
# Filter out mixed numeric and dot content with length < 6, requiring at least one dot
return ""
return name return name