refactor: Merge multi-step text sanitization into single function
This commit is contained in:
parent
68f18eacf8
commit
d4bbc5dea9
2 changed files with 36 additions and 53 deletions
|
|
@ -11,11 +11,10 @@ from collections import Counter, defaultdict
|
||||||
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
logger,
|
logger,
|
||||||
clean_str,
|
|
||||||
compute_mdhash_id,
|
compute_mdhash_id,
|
||||||
Tokenizer,
|
Tokenizer,
|
||||||
is_float_regex,
|
is_float_regex,
|
||||||
normalize_extracted_info,
|
sanitize_and_normalize_extracted_text,
|
||||||
pack_user_ass_to_openai_messages,
|
pack_user_ass_to_openai_messages,
|
||||||
split_string_by_multi_markers,
|
split_string_by_multi_markers,
|
||||||
truncate_list_by_token_size,
|
truncate_list_by_token_size,
|
||||||
|
|
@ -31,7 +30,6 @@ from .utils import (
|
||||||
pick_by_vector_similarity,
|
pick_by_vector_similarity,
|
||||||
process_chunks_unified,
|
process_chunks_unified,
|
||||||
build_file_path,
|
build_file_path,
|
||||||
sanitize_text_for_encoding,
|
|
||||||
)
|
)
|
||||||
from .base import (
|
from .base import (
|
||||||
BaseGraphStorage,
|
BaseGraphStorage,
|
||||||
|
|
@ -320,14 +318,9 @@ async def _handle_single_entity_extraction(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Step 1: Strict UTF-8 encoding sanitization (fail-fast approach)
|
entity_name = sanitize_and_normalize_extracted_text(
|
||||||
entity_name = sanitize_text_for_encoding(record_attributes[1])
|
record_attributes[1], is_name=True
|
||||||
|
)
|
||||||
# Step 2: HTML and control character cleaning
|
|
||||||
entity_name = clean_str(entity_name).strip()
|
|
||||||
|
|
||||||
# Step 3: Business logic normalization
|
|
||||||
entity_name = normalize_extracted_info(entity_name, is_entity=True)
|
|
||||||
|
|
||||||
# Validate entity name after all cleaning steps
|
# Validate entity name after all cleaning steps
|
||||||
if not entity_name or not entity_name.strip():
|
if not entity_name or not entity_name.strip():
|
||||||
|
|
@ -337,8 +330,8 @@ async def _handle_single_entity_extraction(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Process entity type with same cleaning pipeline
|
# Process entity type with same cleaning pipeline
|
||||||
entity_type = sanitize_text_for_encoding(record_attributes[2])
|
entity_type = sanitize_and_normalize_extracted_text(record_attributes[2])
|
||||||
entity_type = clean_str(entity_type).strip('"')
|
|
||||||
if not entity_type.strip() or entity_type.startswith('("'):
|
if not entity_type.strip() or entity_type.startswith('("'):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Entity extraction error: invalid entity type in: {record_attributes}"
|
f"Entity extraction error: invalid entity type in: {record_attributes}"
|
||||||
|
|
@ -346,9 +339,7 @@ async def _handle_single_entity_extraction(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Process entity description with same cleaning pipeline
|
# Process entity description with same cleaning pipeline
|
||||||
entity_description = sanitize_text_for_encoding(record_attributes[3])
|
entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])
|
||||||
entity_description = clean_str(entity_description)
|
|
||||||
entity_description = normalize_extracted_info(entity_description)
|
|
||||||
|
|
||||||
if not entity_description.strip():
|
if not entity_description.strip():
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
|
@ -385,27 +376,17 @@ async def _handle_single_relationship_extraction(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Process source and target entities with strict cleaning pipeline
|
source = sanitize_and_normalize_extracted_text(record_attributes[1])
|
||||||
# Step 1: Strict UTF-8 encoding sanitization (fail-fast approach)
|
target = sanitize_and_normalize_extracted_text(record_attributes[2])
|
||||||
source = sanitize_text_for_encoding(record_attributes[1])
|
|
||||||
# Step 2: HTML and control character cleaning
|
|
||||||
source = clean_str(source)
|
|
||||||
# Step 3: Business logic normalization
|
|
||||||
source = normalize_extracted_info(source, is_entity=True)
|
|
||||||
|
|
||||||
# Same pipeline for target entity
|
|
||||||
target = sanitize_text_for_encoding(record_attributes[2])
|
|
||||||
target = clean_str(target)
|
|
||||||
target = normalize_extracted_info(target, is_entity=True)
|
|
||||||
|
|
||||||
# Validate entity names after all cleaning steps
|
# Validate entity names after all cleaning steps
|
||||||
if not source or not source.strip():
|
if not source:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Relationship extraction error: source entity became empty after cleaning. Original: '{record_attributes[1]}'"
|
f"Relationship extraction error: source entity became empty after cleaning. Original: '{record_attributes[1]}'"
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not target or not target.strip():
|
if not target:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Relationship extraction error: target entity became empty after cleaning. Original: '{record_attributes[2]}'"
|
f"Relationship extraction error: target entity became empty after cleaning. Original: '{record_attributes[2]}'"
|
||||||
)
|
)
|
||||||
|
|
@ -418,14 +399,10 @@ async def _handle_single_relationship_extraction(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Process relationship description with same cleaning pipeline
|
# Process relationship description with same cleaning pipeline
|
||||||
edge_description = sanitize_text_for_encoding(record_attributes[3])
|
edge_description = sanitize_and_normalize_extracted_text(record_attributes[3])
|
||||||
edge_description = clean_str(edge_description)
|
|
||||||
edge_description = normalize_extracted_info(edge_description)
|
|
||||||
|
|
||||||
# Process keywords with same cleaning pipeline
|
# Process keywords with same cleaning pipeline
|
||||||
edge_keywords = sanitize_text_for_encoding(record_attributes[4])
|
edge_keywords = sanitize_and_normalize_extracted_text(record_attributes[4])
|
||||||
edge_keywords = clean_str(edge_keywords)
|
|
||||||
edge_keywords = normalize_extracted_info(edge_keywords, is_entity=True)
|
|
||||||
edge_keywords = edge_keywords.replace("，", ",")
|
edge_keywords = edge_keywords.replace("，", ",")
|
||||||
|
|
||||||
edge_source_id = chunk_key
|
edge_source_id = chunk_key
|
||||||
|
|
|
||||||
|
|
@ -931,19 +931,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
|
||||||
return [r.strip() for r in results if r.strip()]
|
return [r.strip() for r in results if r.strip()]
|
||||||
|
|
||||||
|
|
||||||
# Refer the utils functions of the official GraphRAG implementation:
|
|
||||||
# https://github.com/microsoft/graphrag
|
|
||||||
def clean_str(input: Any) -> str:
|
|
||||||
"""Clean an input string by removing HTML escapes, control characters, and other unwanted characters."""
|
|
||||||
# If we get non-string input, just give it back
|
|
||||||
if not isinstance(input, str):
|
|
||||||
return input
|
|
||||||
|
|
||||||
result = html.unescape(input.strip())
|
|
||||||
# https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python
|
|
||||||
return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result)
|
|
||||||
|
|
||||||
|
|
||||||
def is_float_regex(value: str) -> bool:
|
def is_float_regex(value: str) -> bool:
|
||||||
return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value))
|
return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value))
|
||||||
|
|
||||||
|
|
@ -1728,6 +1715,20 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
|
||||||
return content[:max_length] + "..."
|
return content[:max_length] + "..."
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str:
|
||||||
|
"""Sanitize and normalize extracted text
|
||||||
|
Args:
|
||||||
|
input_text: text string to be processed
|
||||||
|
is_name: whether the input text is an entity or relation name
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Sanitized and normalized text string
|
||||||
|
"""
|
||||||
|
safe_input_text = sanitize_text_for_encoding(input_text)
|
||||||
|
normalized_text = normalize_extracted_info(safe_input_text, is_name)
|
||||||
|
return normalized_text
|
||||||
|
|
||||||
|
|
||||||
def normalize_extracted_info(name: str, is_entity=False) -> str:
|
def normalize_extracted_info(name: str, is_entity=False) -> str:
|
||||||
"""Normalize entity/relation names and description with the following rules:
|
"""Normalize entity/relation names and description with the following rules:
|
||||||
1. Remove spaces between Chinese characters
|
1. Remove spaces between Chinese characters
|
||||||
|
|
@ -1789,6 +1790,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
||||||
- Surrogate characters (the main cause of encoding errors)
|
- Surrogate characters (the main cause of encoding errors)
|
||||||
- Other invalid Unicode sequences
|
- Other invalid Unicode sequences
|
||||||
- Control characters that might cause issues
|
- Control characters that might cause issues
|
||||||
|
- Unescape HTML escapes
|
||||||
|
- Remove control characters
|
||||||
- Whitespace trimming
|
- Whitespace trimming
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -1801,9 +1804,6 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
||||||
Raises:
|
Raises:
|
||||||
ValueError: When text contains uncleanable encoding issues that cannot be safely processed
|
ValueError: When text contains uncleanable encoding issues that cannot be safely processed
|
||||||
"""
|
"""
|
||||||
if not isinstance(text, str):
|
|
||||||
return str(text)
|
|
||||||
|
|
||||||
if not text:
|
if not text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
@ -1845,7 +1845,13 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
||||||
# Test final encoding to ensure it's safe
|
# Test final encoding to ensure it's safe
|
||||||
sanitized.encode("utf-8")
|
sanitized.encode("utf-8")
|
||||||
|
|
||||||
return sanitized
|
# Unescape HTML escapes
|
||||||
|
sanitized = html.unescape(sanitized)
|
||||||
|
|
||||||
|
# Remove control characters
|
||||||
|
sanitized = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", sanitized)
|
||||||
|
|
||||||
|
return sanitized.strip()
|
||||||
|
|
||||||
except UnicodeEncodeError as e:
|
except UnicodeEncodeError as e:
|
||||||
# Critical change: Don't return placeholder, raise exception for caller to handle
|
# Critical change: Don't return placeholder, raise exception for caller to handle
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue