feat: enhance text extraction text sanitization and normalization
- Remove redundant quotes in entity and relation names, types and keywords - Add HTML tag cleaning and Chinese symbol conversion - Filter out short numeric content and malformed text - Enhance entity type validation with character filtering
This commit is contained in:
parent
d4bbc5dea9
commit
b747417961
2 changed files with 108 additions and 25 deletions
|
|
@ -319,7 +319,7 @@ async def _handle_single_entity_extraction(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
entity_name = sanitize_and_normalize_extracted_text(
|
entity_name = sanitize_and_normalize_extracted_text(
|
||||||
record_attributes[1], is_entity=True
|
record_attributes[1], remove_inner_quotes=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Validate entity name after all cleaning steps
|
# Validate entity name after all cleaning steps
|
||||||
|
|
@ -330,9 +330,13 @@ async def _handle_single_entity_extraction(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Process entity type with same cleaning pipeline
|
# Process entity type with same cleaning pipeline
|
||||||
entity_type = sanitize_and_normalize_extracted_text(record_attributes[2])
|
entity_type = sanitize_and_normalize_extracted_text(
|
||||||
|
record_attributes[2], remove_inner_quotes=True
|
||||||
|
)
|
||||||
|
|
||||||
if not entity_type.strip() or entity_type.startswith('("'):
|
if not entity_type.strip() or any(
|
||||||
|
char in entity_type for char in ["'", "(", ")", "<", ">", "|", "/", "\\"]
|
||||||
|
):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Entity extraction error: invalid entity type in: {record_attributes}"
|
f"Entity extraction error: invalid entity type in: {record_attributes}"
|
||||||
)
|
)
|
||||||
|
|
@ -376,8 +380,12 @@ async def _handle_single_relationship_extraction(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
source = sanitize_and_normalize_extracted_text(record_attributes[1])
|
source = sanitize_and_normalize_extracted_text(
|
||||||
target = sanitize_and_normalize_extracted_text(record_attributes[2])
|
record_attributes[1], remove_inner_quotes=True
|
||||||
|
)
|
||||||
|
target = sanitize_and_normalize_extracted_text(
|
||||||
|
record_attributes[2], remove_inner_quotes=True
|
||||||
|
)
|
||||||
|
|
||||||
# Validate entity names after all cleaning steps
|
# Validate entity names after all cleaning steps
|
||||||
if not source:
|
if not source:
|
||||||
|
|
@ -402,7 +410,9 @@ async def _handle_single_relationship_extraction(
|
||||||
edge_description = sanitize_and_normalize_extracted_text(record_attributes[3])
|
edge_description = sanitize_and_normalize_extracted_text(record_attributes[3])
|
||||||
|
|
||||||
# Process keywords with same cleaning pipeline
|
# Process keywords with same cleaning pipeline
|
||||||
edge_keywords = sanitize_and_normalize_extracted_text(record_attributes[4])
|
edge_keywords = sanitize_and_normalize_extracted_text(
|
||||||
|
record_attributes[4], remove_inner_quotes=True
|
||||||
|
)
|
||||||
edge_keywords = edge_keywords.replace(",", ",")
|
edge_keywords = edge_keywords.replace(",", ",")
|
||||||
|
|
||||||
edge_source_id = chunk_key
|
edge_source_id = chunk_key
|
||||||
|
|
|
||||||
|
|
@ -1715,7 +1715,9 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
|
||||||
return content[:max_length] + "..."
|
return content[:max_length] + "..."
|
||||||
|
|
||||||
|
|
||||||
def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str:
|
def sanitize_and_normalize_extracted_text(
|
||||||
|
input_text: str, remove_inner_quotes=False
|
||||||
|
) -> str:
|
||||||
"""Santitize and normalize extracted text
|
"""Santitize and normalize extracted text
|
||||||
Args:
|
Args:
|
||||||
input_text: text string to be processed
|
input_text: text string to be processed
|
||||||
|
|
@ -1725,33 +1727,66 @@ def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str
|
||||||
Sanitized and normalized text string
|
Sanitized and normalized text string
|
||||||
"""
|
"""
|
||||||
safe_input_text = sanitize_text_for_encoding(input_text)
|
safe_input_text = sanitize_text_for_encoding(input_text)
|
||||||
normalized_text = normalize_extracted_info(safe_input_text, is_name)
|
if safe_input_text:
|
||||||
return normalized_text
|
normalized_text = normalize_extracted_info(
|
||||||
|
safe_input_text, remove_inner_quotes=remove_inner_quotes
|
||||||
|
)
|
||||||
|
return normalized_text
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def normalize_extracted_info(name: str, is_entity=False) -> str:
|
def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
|
||||||
"""Normalize entity/relation names and description with the following rules:
|
"""Normalize entity/relation names and description with the following rules:
|
||||||
1. Remove spaces between Chinese characters
|
1. Clean HTML tags (paragraph and line break tags)
|
||||||
2. Remove spaces between Chinese characters and English letters/numbers
|
2. Convert Chinese symbols to English symbols
|
||||||
3. Preserve spaces within English text and numbers
|
3. Remove spaces between Chinese characters
|
||||||
4. Replace Chinese parentheses with English parentheses
|
4. Remove spaces between Chinese characters and English letters/numbers
|
||||||
5. Replace Chinese dash with English dash
|
5. Preserve spaces within English text and numbers
|
||||||
6. Remove English quotation marks from the beginning and end of the text
|
6. Replace Chinese parentheses with English parentheses
|
||||||
7. Remove English quotation marks in and around Chinese
|
7. Replace Chinese dash with English dash
|
||||||
8. Remove Chinese quotation marks
|
8. Remove English quotation marks from the beginning and end of the text
|
||||||
|
9. Remove English quotation marks in and around Chinese
|
||||||
|
10. Remove Chinese quotation marks
|
||||||
|
11. Filter out short numeric-only text (length < 3 and only digits/dots)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
name: Entity name to normalize
|
name: Entity name to normalize
|
||||||
|
is_entity: Whether this is an entity name (affects quote handling)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Normalized entity name
|
Normalized entity name
|
||||||
"""
|
"""
|
||||||
|
# 1. Clean HTML tags - remove paragraph and line break tags
|
||||||
|
name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
|
||||||
|
name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
# 2. Convert Chinese symbols to English symbols
|
||||||
|
# Chinese full-width letters to half-width (A-Z, a-z)
|
||||||
|
name = name.translate(
|
||||||
|
str.maketrans(
|
||||||
|
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
|
||||||
|
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Chinese full-width numbers to half-width
|
||||||
|
name = name.translate(str.maketrans("0123456789", "0123456789"))
|
||||||
|
|
||||||
|
# Chinese full-width symbols to half-width
|
||||||
|
name = name.replace("-", "-") # Chinese minus
|
||||||
|
name = name.replace("+", "+") # Chinese plus
|
||||||
|
name = name.replace("/", "/") # Chinese slash
|
||||||
|
name = name.replace("*", "*") # Chinese asterisk
|
||||||
|
|
||||||
# Replace Chinese parentheses with English parentheses
|
# Replace Chinese parentheses with English parentheses
|
||||||
name = name.replace("(", "(").replace(")", ")")
|
name = name.replace("(", "(").replace(")", ")")
|
||||||
|
|
||||||
# Replace Chinese dash with English dash
|
# Replace Chinese dash with English dash (additional patterns)
|
||||||
name = name.replace("—", "-").replace("-", "-")
|
name = name.replace("—", "-").replace("-", "-")
|
||||||
|
|
||||||
|
# Chinese full-width space to regular space (after other replacements)
|
||||||
|
name = name.replace(" ", " ")
|
||||||
|
|
||||||
# Use regex to remove spaces between Chinese characters
|
# Use regex to remove spaces between Chinese characters
|
||||||
# Regex explanation:
|
# Regex explanation:
|
||||||
# (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
|
# (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
|
||||||
|
|
@ -1767,19 +1802,57 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
|
||||||
r"(?<=[a-zA-Z0-9\(\)\[\]@#$%!&\*\-=+_])\s+(?=[\u4e00-\u9fa5])", "", name
|
r"(?<=[a-zA-Z0-9\(\)\[\]@#$%!&\*\-=+_])\s+(?=[\u4e00-\u9fa5])", "", name
|
||||||
)
|
)
|
||||||
|
|
||||||
# Remove English quotation marks from the beginning and end
|
# Remove outer quotes
|
||||||
if len(name) >= 2 and name.startswith('"') and name.endswith('"'):
|
if len(name) >= 2:
|
||||||
name = name[1:-1]
|
# Handle double quotes
|
||||||
if len(name) >= 2 and name.startswith("'") and name.endswith("'"):
|
if name.startswith('"') and name.endswith('"'):
|
||||||
name = name[1:-1]
|
inner_content = name[1:-1]
|
||||||
|
if '"' not in inner_content: # No double quotes inside
|
||||||
|
name = inner_content
|
||||||
|
|
||||||
if is_entity:
|
# Handle single quotes
|
||||||
|
if name.startswith("'") and name.endswith("'"):
|
||||||
|
inner_content = name[1:-1]
|
||||||
|
if "'" not in inner_content: # No single quotes inside
|
||||||
|
name = inner_content
|
||||||
|
|
||||||
|
# Handle Chinese-style double quotes
|
||||||
|
if name.startswith("“") and name.endswith("”"):
|
||||||
|
inner_content = name[1:-1]
|
||||||
|
if "“" not in inner_content and "”" not in inner_content:
|
||||||
|
name = inner_content
|
||||||
|
if name.startswith("‘") and name.endswith("’"):
|
||||||
|
inner_content = name[1:-1]
|
||||||
|
if "‘" not in inner_content and "’" not in inner_content:
|
||||||
|
name = inner_content
|
||||||
|
|
||||||
|
if remove_inner_quotes:
|
||||||
# remove Chinese quotes
|
# remove Chinese quotes
|
||||||
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
|
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
|
||||||
# remove English quotes in and around Chinese
|
# remove English quotes in and around Chinese
|
||||||
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
|
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
|
||||||
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
|
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
|
||||||
|
|
||||||
|
# Remove spaces from the beginning and end of the text
|
||||||
|
name = name.strip()
|
||||||
|
|
||||||
|
# Filter out pure numeric content with length < 3
|
||||||
|
if len(name) < 3 and re.match(r"^[0-9]+$", name):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def should_filter_by_dots(text):
|
||||||
|
"""
|
||||||
|
Check if the string consists only of dots and digits, with at least one dot
|
||||||
|
Filter cases include: 1.2.3, 12.3, .123, 123., 12.3., .1.23 etc.
|
||||||
|
"""
|
||||||
|
return all(c.isdigit() or c == "." for c in text) and "." in text
|
||||||
|
|
||||||
|
if len(name) < 6 and should_filter_by_dots(name):
|
||||||
|
# Filter out mixed numeric and dot content with length < 6
|
||||||
|
return ""
|
||||||
|
# Filter out mixed numeric and dot content with length < 6, requiring at least one dot
|
||||||
|
return ""
|
||||||
|
|
||||||
return name
|
return name
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue