From b7474179612a1e7a574b0bf0a64367f36436985e Mon Sep 17 00:00:00 2001
From: yangdx
Date: Sun, 31 Aug 2025 13:17:20 +0800
Subject: [PATCH] feat: enhance text extraction text sanitization and
normalization
- Improve reduntant quotes in entity and relation name, type and keywords
- Add HTML tag cleaning and Chinese symbol conversion
- Filter out short numeric content and malformed text
- Enhance entity type validation with character filtering
---
lightrag/operate.py | 22 ++++++---
lightrag/utils.py | 111 ++++++++++++++++++++++++++++++++++++--------
2 files changed, 108 insertions(+), 25 deletions(-)
diff --git a/lightrag/operate.py b/lightrag/operate.py
index b83790ab..4fa5ddb1 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -319,7 +319,7 @@ async def _handle_single_entity_extraction(
try:
entity_name = sanitize_and_normalize_extracted_text(
- record_attributes[1], is_entity=True
+ record_attributes[1], remove_inner_quotes=True
)
# Validate entity name after all cleaning steps
@@ -330,9 +330,13 @@ async def _handle_single_entity_extraction(
return None
# Process entity type with same cleaning pipeline
- entity_type = sanitize_and_normalize_extracted_text(record_attributes[2])
+ entity_type = sanitize_and_normalize_extracted_text(
+ record_attributes[2], remove_inner_quotes=True
+ )
- if not entity_type.strip() or entity_type.startswith('("'):
+ if not entity_type.strip() or any(
+ char in entity_type for char in ["'", "(", ")", "<", ">", "|", "/", "\\"]
+ ):
logger.warning(
f"Entity extraction error: invalid entity type in: {record_attributes}"
)
@@ -376,8 +380,12 @@ async def _handle_single_relationship_extraction(
return None
try:
- source = sanitize_and_normalize_extracted_text(record_attributes[1])
- target = sanitize_and_normalize_extracted_text(record_attributes[2])
+ source = sanitize_and_normalize_extracted_text(
+ record_attributes[1], remove_inner_quotes=True
+ )
+ target = sanitize_and_normalize_extracted_text(
+ record_attributes[2], remove_inner_quotes=True
+ )
# Validate entity names after all cleaning steps
if not source:
@@ -402,7 +410,9 @@ async def _handle_single_relationship_extraction(
edge_description = sanitize_and_normalize_extracted_text(record_attributes[3])
# Process keywords with same cleaning pipeline
- edge_keywords = sanitize_and_normalize_extracted_text(record_attributes[4])
+ edge_keywords = sanitize_and_normalize_extracted_text(
+ record_attributes[4], remove_inner_quotes=True
+ )
edge_keywords = edge_keywords.replace(",", ",")
edge_source_id = chunk_key
diff --git a/lightrag/utils.py b/lightrag/utils.py
index 82a7cce4..9dc5a5b2 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -1715,7 +1715,9 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
return content[:max_length] + "..."
-def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str:
+def sanitize_and_normalize_extracted_text(
+ input_text: str, remove_inner_quotes=False
+) -> str:
"""Santitize and normalize extracted text
Args:
input_text: text string to be processed
@@ -1725,33 +1727,66 @@ def sanitize_and_normalize_extracted_text(input_text: str, is_name=False) -> str
Santitized and normalized text string
"""
safe_input_text = sanitize_text_for_encoding(input_text)
- normalized_text = normalize_extracted_info(safe_input_text, is_name)
- return normalized_text
+ if safe_input_text:
+ normalized_text = normalize_extracted_info(
+ safe_input_text, remove_inner_quotes=remove_inner_quotes
+ )
+ return normalized_text
+ return ""
-def normalize_extracted_info(name: str, is_entity=False) -> str:
+def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
"""Normalize entity/relation names and description with the following rules:
- 1. Remove spaces between Chinese characters
- 2. Remove spaces between Chinese characters and English letters/numbers
- 3. Preserve spaces within English text and numbers
- 4. Replace Chinese parentheses with English parentheses
- 5. Replace Chinese dash with English dash
- 6. Remove English quotation marks from the beginning and end of the text
- 7. Remove English quotation marks in and around chinese
- 8. Remove Chinese quotation marks
+ 1. Clean HTML tags (paragraph and line break tags)
+ 2. Convert Chinese symbols to English symbols
+ 3. Remove spaces between Chinese characters
+ 4. Remove spaces between Chinese characters and English letters/numbers
+ 5. Preserve spaces within English text and numbers
+ 6. Replace Chinese parentheses with English parentheses
+ 7. Replace Chinese dash with English dash
+ 8. Remove English quotation marks from the beginning and end of the text
+ 9. Remove English quotation marks in and around chinese
+ 10. Remove Chinese quotation marks
+ 11. Filter out short numeric-only text (length < 3 and only digits/dots)
Args:
name: Entity name to normalize
+ is_entity: Whether this is an entity name (affects quote handling)
Returns:
Normalized entity name
"""
+ # 1. Clean HTML tags - remove paragraph and line break tags
+ name = re.sub(r"
||
", "", name, flags=re.IGNORECASE)
+ name = re.sub(r"|
|
", "", name, flags=re.IGNORECASE)
+
+ # 2. Convert Chinese symbols to English symbols
+ # Chinese full-width letters to half-width (A-Z, a-z)
+ name = name.translate(
+ str.maketrans(
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+ )
+ )
+
+ # Chinese full-width numbers to half-width
+ name = name.translate(str.maketrans("0123456789", "0123456789"))
+
+ # Chinese full-width symbols to half-width
+ name = name.replace("-", "-") # Chinese minus
+ name = name.replace("+", "+") # Chinese plus
+ name = name.replace("/", "/") # Chinese slash
+ name = name.replace("*", "*") # Chinese asterisk
+
# Replace Chinese parentheses with English parentheses
name = name.replace("(", "(").replace(")", ")")
- # Replace Chinese dash with English dash
+ # Replace Chinese dash with English dash (additional patterns)
name = name.replace("—", "-").replace("-", "-")
+ # Chinese full-width space to regular space (after other replacements)
+ name = name.replace(" ", " ")
+
# Use regex to remove spaces between Chinese characters
# Regex explanation:
# (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
@@ -1767,19 +1802,57 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
r"(?<=[a-zA-Z0-9\(\)\[\]@#$%!&\*\-=+_])\s+(?=[\u4e00-\u9fa5])", "", name
)
- # Remove English quotation marks from the beginning and end
- if len(name) >= 2 and name.startswith('"') and name.endswith('"'):
- name = name[1:-1]
- if len(name) >= 2 and name.startswith("'") and name.endswith("'"):
- name = name[1:-1]
+ # Remove outer quotes
+ if len(name) >= 2:
+ # Handle double quotes
+ if name.startswith('"') and name.endswith('"'):
+ inner_content = name[1:-1]
+ if '"' not in inner_content: # No double quotes inside
+ name = inner_content
- if is_entity:
+ # Handle single quotes
+ if name.startswith("'") and name.endswith("'"):
+ inner_content = name[1:-1]
+ if "'" not in inner_content: # No single quotes inside
+ name = inner_content
+
+ # Handle Chinese-style double quotes
+ if name.startswith("“") and name.endswith("”"):
+ inner_content = name[1:-1]
+ if "“" not in inner_content and "”" not in inner_content:
+ name = inner_content
+ if name.startswith("‘") and name.endswith("’"):
+ inner_content = name[1:-1]
+ if "‘" not in inner_content and "’" not in inner_content:
+ name = inner_content
+
+ if remove_inner_quotes:
# remove Chinese quotes
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
# remove English queotes in and around chinese
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
+ # Remove spaces from the beginning and end of the text
+ name = name.strip()
+
+ # Filter out pure numeric content with length < 3
+ if len(name) < 3 and re.match(r"^[0-9]+$", name):
+ return ""
+
+ def should_filter_by_dots(text):
+ """
+ Check if the string consists only of dots and digits, with at least one dot
+ Filter cases include: 1.2.3, 12.3, .123, 123., 12.3., .1.23 etc.
+ """
+ return all(c.isdigit() or c == "." for c in text) and "." in text
+
+ if len(name) < 6 and should_filter_by_dots(name):
+ # Filter out mixed numeric and dot content with length < 6
+ return ""
+ # Filter out mixed numeric and dot content with length < 6, requiring at least one dot
+ return ""
+
return name