From d4bbc5dea9f0774c9662d7d14d9443f96508f8ef Mon Sep 17 00:00:00 2001 From: yangdx Date: Sun, 31 Aug 2025 10:36:56 +0800 Subject: [PATCH] refactor: Merge multi-step text sanitization into single function --- lightrag/operate.py | 49 ++++++++++++--------------------------------- lightrag/utils.py | 40 ++++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 53 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index afa8205f..b83790ab 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -11,11 +11,10 @@ from collections import Counter, defaultdict from .utils import ( logger, - clean_str, compute_mdhash_id, Tokenizer, is_float_regex, - normalize_extracted_info, + sanitize_and_normalize_extracted_text, pack_user_ass_to_openai_messages, split_string_by_multi_markers, truncate_list_by_token_size, @@ -31,7 +30,6 @@ from .utils import ( pick_by_vector_similarity, process_chunks_unified, build_file_path, - sanitize_text_for_encoding, ) from .base import ( BaseGraphStorage, @@ -320,14 +318,9 @@ async def _handle_single_entity_extraction( return None try: - # Step 1: Strict UTF-8 encoding sanitization (fail-fast approach) - entity_name = sanitize_text_for_encoding(record_attributes[1]) - - # Step 2: HTML and control character cleaning - entity_name = clean_str(entity_name).strip() - - # Step 3: Business logic normalization - entity_name = normalize_extracted_info(entity_name, is_entity=True) + entity_name = sanitize_and_normalize_extracted_text( + record_attributes[1], is_entity=True + ) # Validate entity name after all cleaning steps if not entity_name or not entity_name.strip(): @@ -337,8 +330,8 @@ async def _handle_single_entity_extraction( return None # Process entity type with same cleaning pipeline - entity_type = sanitize_text_for_encoding(record_attributes[2]) - entity_type = clean_str(entity_type).strip('"') + entity_type = sanitize_and_normalize_extracted_text(record_attributes[2]) + if not entity_type.strip() or 
entity_type.startswith('("'): logger.warning( f"Entity extraction error: invalid entity type in: {record_attributes}" @@ -346,9 +339,7 @@ async def _handle_single_entity_extraction( return None # Process entity description with same cleaning pipeline - entity_description = sanitize_text_for_encoding(record_attributes[3]) - entity_description = clean_str(entity_description) - entity_description = normalize_extracted_info(entity_description) + entity_description = sanitize_and_normalize_extracted_text(record_attributes[3]) if not entity_description.strip(): logger.warning( @@ -385,27 +376,17 @@ async def _handle_single_relationship_extraction( return None try: - # Process source and target entities with strict cleaning pipeline - # Step 1: Strict UTF-8 encoding sanitization (fail-fast approach) - source = sanitize_text_for_encoding(record_attributes[1]) - # Step 2: HTML and control character cleaning - source = clean_str(source) - # Step 3: Business logic normalization - source = normalize_extracted_info(source, is_entity=True) - - # Same pipeline for target entity - target = sanitize_text_for_encoding(record_attributes[2]) - target = clean_str(target) - target = normalize_extracted_info(target, is_entity=True) + source = sanitize_and_normalize_extracted_text(record_attributes[1]) + target = sanitize_and_normalize_extracted_text(record_attributes[2]) # Validate entity names after all cleaning steps - if not source or not source.strip(): + if not source: logger.warning( f"Relationship extraction error: source entity became empty after cleaning. Original: '{record_attributes[1]}'" ) return None - if not target or not target.strip(): + if not target: logger.warning( f"Relationship extraction error: target entity became empty after cleaning. 
Original: '{record_attributes[2]}'" ) @@ -418,14 +399,10 @@ async def _handle_single_relationship_extraction( return None # Process relationship description with same cleaning pipeline - edge_description = sanitize_text_for_encoding(record_attributes[3]) - edge_description = clean_str(edge_description) - edge_description = normalize_extracted_info(edge_description) + edge_description = sanitize_and_normalize_extracted_text(record_attributes[3]) # Process keywords with same cleaning pipeline - edge_keywords = sanitize_text_for_encoding(record_attributes[4]) - edge_keywords = clean_str(edge_keywords) - edge_keywords = normalize_extracted_info(edge_keywords, is_entity=True) + edge_keywords = sanitize_and_normalize_extracted_text(record_attributes[4]) edge_keywords = edge_keywords.replace(",", ",") edge_source_id = chunk_key diff --git a/lightrag/utils.py b/lightrag/utils.py index 87ce5b6a..82a7cce4 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -931,19 +931,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str] return [r.strip() for r in results if r.strip()] -# Refer the utils functions of the official GraphRAG implementation: -# https://github.com/microsoft/graphrag -def clean_str(input: Any) -> str: - """Clean an input string by removing HTML escapes, control characters, and other unwanted characters.""" - # If we get non-string input, just give it back - if not isinstance(input, str): - return input - - result = html.unescape(input.strip()) - # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python - return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) - - def is_float_regex(value: str) -> bool: return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) @@ -1728,6 +1715,20 @@ def get_content_summary(content: str, max_length: int = 250) -> str: return content[:max_length] + "..." 
+def sanitize_and_normalize_extracted_text(input_text: str, is_entity=False) -> str: + """Sanitize and normalize extracted text + Args: + input_text: text string to be processed + is_entity: whether the input text is an entity or relation name + + Returns: + Sanitized and normalized text string + """ + safe_input_text = sanitize_text_for_encoding(input_text) + normalized_text = normalize_extracted_info(safe_input_text, is_entity) + return normalized_text + + def normalize_extracted_info(name: str, is_entity=False) -> str: """Normalize entity/relation names and description with the following rules: 1. Remove spaces between Chinese characters @@ -1789,6 +1790,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str: - Surrogate characters (the main cause of encoding errors) - Other invalid Unicode sequences - Control characters that might cause issues + - Unescape HTML escapes + - Remove control characters - Whitespace trimming Args: @@ -1801,9 +1804,6 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str: Raises: ValueError: When text contains uncleanable encoding issues that cannot be safely processed """ - if not isinstance(text, str): - return str(text) - if not text: return text @@ -1845,7 +1845,13 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str: # Test final encoding to ensure it's safe sanitized.encode("utf-8") - return sanitized + # Unescape HTML escapes + sanitized = html.unescape(sanitized) + + # Remove control characters + sanitized = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", sanitized) + + return sanitized.strip() except UnicodeEncodeError as e: # Critical change: Don't return placeholder, raise exception for caller to handle