Merge pull request #2034 from danielaskdd/fix-entity-type-env

Fix ENTITY_TYPES Environment Variable Handling
This commit is contained in:
Daniel.y 2025-09-01 00:43:46 +08:00 committed by GitHub
commit cdc4570cfe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 317 additions and 187 deletions

README-zh.md View file

@@ -275,7 +275,7 @@ if __name__ == "__main__":
| **vector_db_storage_cls_kwargs** | `dict` | 向量数据库的附加参数,如设置节点和关系检索的阈值 | cosine_better_than_threshold: 0.2（默认值由环境变量COSINE_THRESHOLD更改） |
| **enable_llm_cache** | `bool` | 如果为`TRUE`将LLM结果存储在缓存中重复的提示返回缓存的响应 | `TRUE` |
| **enable_llm_cache_for_entity_extract** | `bool` | 如果为`TRUE`将实体提取的LLM结果存储在缓存中适合初学者调试应用程序 | `TRUE` |
| **addon_params** | `dict` | 附加参数,例如`{"example_number": 1, "language": "Simplified Chinese", "entity_types": ["organization", "person", "geo", "event"]}`:设置示例限制、输出语言和文档处理的批量大小 | `example_number: 所有示例, language: English` |
| **addon_params** | `dict` | 附加参数,例如`{"language": "Simplified Chinese", "entity_types": ["organization", "person", "location", "event"]}`:设置输出语言和实体提取类型 | `language: English` |
| **embedding_cache_config** | `dict` | 问答缓存的配置。包含三个参数:`enabled`:布尔值,启用/禁用缓存查找功能。启用时,系统将在生成新答案之前检查缓存的响应。`similarity_threshold`浮点值0-1相似度阈值。当新问题与缓存问题的相似度超过此阈值时将直接返回缓存的答案而不调用LLM。`use_llm_check`:布尔值,启用/禁用LLM相似度验证。启用时在返回缓存答案之前将使用LLM作为二次检查来验证问题之间的相似度。 | 默认:`{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` |
</details>

README.md View file

@@ -282,7 +282,7 @@ A full list of LightRAG init parameters:
| **vector_db_storage_cls_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval | cosine_better_than_threshold: 0.2 (default value changed by env var COSINE_THRESHOLD) |
| **enable_llm_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
| **enable_llm_cache_for_entity_extract** | `bool` | If `TRUE`, stores LLM results in cache for entity extraction; Good for beginners to debug your application | `TRUE` |
| **addon_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese", "entity_types": ["organization", "person", "geo", "event"]}`: sets example limit, entity/relation extraction output language | `example_number: all examples, language: English` |
| **addon_params** | `dict` | Additional parameters, e.g., `{"language": "Simplified Chinese", "entity_types": ["organization", "person", "location", "event"]}`: sets entity/relation extraction output language and entity types | `language: English` |
| **embedding_cache_config** | `dict` | Configuration for question-answer caching. Contains three parameters: `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers. `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM. `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default: `{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` |
</details>

env.example View file

@@ -125,7 +125,7 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true
SUMMARY_LANGUAGE=English
### Entity types that the LLM will attempt to recognize
# ENTITY_TYPES=["person", "organization", "location", "event", "concept"]
# ENTITY_TYPES='["Organization", "Person", "Equiment", "Product", "Technology", "Location", "Event", "Category"]'
### Chunk size for document splitting, 500~1500 is recommended
# CHUNK_SIZE=1200

lightrag/api/config.py View file

@@ -352,7 +352,7 @@ def parse_args() -> argparse.Namespace:
# Add environment variables that were previously read directly
args.cors_origins = get_env_value("CORS_ORIGINS", "*")
args.summary_language = get_env_value("SUMMARY_LANGUAGE", DEFAULT_SUMMARY_LANGUAGE)
args.entity_types = get_env_value("ENTITY_TYPES", DEFAULT_ENTITY_TYPES)
args.entity_types = get_env_value("ENTITY_TYPES", DEFAULT_ENTITY_TYPES, list)
args.whitelist_paths = get_env_value("WHITELIST_PATHS", "/health,/api/*")
# For JWT Auth
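For context, a minimal sketch of what passing `list` as the value type now does (the helper `read_entity_types` is hypothetical; the real parsing lives in `get_env_value` in lightrag/utils.py, shown further down in this diff):

```python
import json
import os

DEFAULT_ENTITY_TYPES = ["Organization", "Person", "Equipment", "Product",
                        "Technology", "Location", "Event", "Category"]

def read_entity_types() -> list:
    """Read ENTITY_TYPES from the environment as a JSON list,
    falling back to the defaults on any parse failure."""
    raw = os.environ.get("ENTITY_TYPES")
    if raw is None:
        return DEFAULT_ENTITY_TYPES
    try:
        parsed = json.loads(raw)
    except (json.JSONDecodeError, ValueError):
        return DEFAULT_ENTITY_TYPES
    # A JSON scalar like '123' parses fine but is not a list
    return parsed if isinstance(parsed, list) else DEFAULT_ENTITY_TYPES

# ENTITY_TYPES='["Person", "Organization"]'  -> ["Person", "Organization"]
# ENTITY_TYPES='person, organization'       -> defaults (not valid JSON)
```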

lightrag/constants.py View file

@@ -24,11 +24,14 @@ DEFAULT_SUMMARY_LENGTH_RECOMMENDED = 600
DEFAULT_SUMMARY_CONTEXT_SIZE = 12000
# Default entities to extract if ENTITY_TYPES is not specified in .env
DEFAULT_ENTITY_TYPES = [
"organization",
"person",
"geo",
"event",
"category",
"Organization",
"Person",
"Equiment",
"Product",
"Technology",
"Location",
"Event",
"Category",
]
# Separator for graph fields

lightrag/lightrag.py View file

@@ -469,6 +469,7 @@ class LightRAG:
self.embedding_func = priority_limit_async_func_call(
self.embedding_func_max_async,
llm_timeout=self.default_embedding_timeout,
queue_name="Embedding func:",
)(self.embedding_func)
# Initialize all storages
@@ -565,6 +566,7 @@ class LightRAG:
self.llm_model_func = priority_limit_async_func_call(
self.llm_model_max_async,
llm_timeout=self.default_llm_timeout,
queue_name="LLM func:",
)(
partial(
self.llm_model_func, # type: ignore
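The new `queue_name` argument only labels log output so the shared LLM and embedding queues can be told apart. A rough usage sketch under that assumption (`limit_with_label` is a hypothetical stand-in; the real decorator in lightrag/utils.py also enforces concurrency limits, timeouts, and priority queueing):

```python
import asyncio

def limit_with_label(queue_name: str):
    """Hypothetical decorator that only demonstrates the log labeling."""
    def decorator(func):
        async def wrapper(*args, **kwargs):
            print(f"{queue_name} scheduling {func.__name__}")
            return await func(*args, **kwargs)
        return wrapper
    return decorator

@limit_with_label(queue_name="LLM func:")
async def llm_model_func(prompt: str) -> str:
    return f"answer to {prompt!r}"

@limit_with_label(queue_name="Embedding func:")
async def embedding_func(text: str) -> list[float]:
    return [0.0] * 8

async def main():
    await llm_model_func("hi")   # logs "LLM func: scheduling llm_model_func"
    await embedding_func("hi")   # logs "Embedding func: scheduling embedding_func"

asyncio.run(main())
```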

lightrag/operate.py View file

@@ -11,11 +11,10 @@ from collections import Counter, defaultdict
from .utils import (
logger,
clean_str,
compute_mdhash_id,
Tokenizer,
is_float_regex,
normalize_extracted_info,
sanitize_and_normalize_extracted_text,
pack_user_ass_to_openai_messages,
split_string_by_multi_markers,
truncate_list_by_token_size,
@@ -31,7 +30,6 @@ from .utils import (
pick_by_vector_similarity,
process_chunks_unified,
build_file_path,
sanitize_text_for_encoding,
)
from .base import (
BaseGraphStorage,
@@ -316,18 +314,18 @@ async def _handle_single_entity_extraction(
chunk_key: str,
file_path: str = "unknown_source",
):
if len(record_attributes) < 4 or '"entity"' not in record_attributes[0]:
if len(record_attributes) < 4 or "entity" not in record_attributes[0]:
if len(record_attributes) > 1 and "entity" in record_attributes[0]:
logger.warning(
f"Entity extraction failed in {chunk_key}: expecting 4 fields but got {len(record_attributes)}"
)
logger.warning(f"Entity extracted: {record_attributes[1]}")
return None
try:
# Step 1: Strict UTF-8 encoding sanitization (fail-fast approach)
entity_name = sanitize_text_for_encoding(record_attributes[1])
# Step 2: HTML and control character cleaning
entity_name = clean_str(entity_name).strip()
# Step 3: Business logic normalization
entity_name = normalize_extracted_info(entity_name, is_entity=True)
entity_name = sanitize_and_normalize_extracted_text(
record_attributes[1], remove_inner_quotes=True
)
# Validate entity name after all cleaning steps
if not entity_name or not entity_name.strip():
@@ -337,18 +335,20 @@ async def _handle_single_entity_extraction(
return None
# Process entity type with same cleaning pipeline
entity_type = sanitize_text_for_encoding(record_attributes[2])
entity_type = clean_str(entity_type).strip('"')
if not entity_type.strip() or entity_type.startswith('("'):
entity_type = sanitize_and_normalize_extracted_text(
record_attributes[2], remove_inner_quotes=True
)
if not entity_type.strip() or any(
char in entity_type for char in ["'", "(", ")", "<", ">", "|", "/", "\\"]
):
logger.warning(
f"Entity extraction error: invalid entity type in: {record_attributes}"
)
return None
# Process entity description with same cleaning pipeline
entity_description = sanitize_text_for_encoding(record_attributes[3])
entity_description = clean_str(entity_description)
entity_description = normalize_extracted_info(entity_description)
entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])
if not entity_description.strip():
logger.warning(
@@ -381,31 +381,30 @@ async def _handle_single_relationship_extraction(
chunk_key: str,
file_path: str = "unknown_source",
):
if len(record_attributes) < 5 or '"relationship"' not in record_attributes[0]:
if len(record_attributes) < 5 or "relationship" not in record_attributes[0]:
if len(record_attributes) > 1 and "relationship" in record_attributes[0]:
logger.warning(
f"Relation extraction failed in {chunk_key}: expecting 5 fields but got {len(record_attributes)}"
)
logger.warning(f"Relation extracted: {record_attributes[1]}")
return None
try:
# Process source and target entities with strict cleaning pipeline
# Step 1: Strict UTF-8 encoding sanitization (fail-fast approach)
source = sanitize_text_for_encoding(record_attributes[1])
# Step 2: HTML and control character cleaning
source = clean_str(source)
# Step 3: Business logic normalization
source = normalize_extracted_info(source, is_entity=True)
# Same pipeline for target entity
target = sanitize_text_for_encoding(record_attributes[2])
target = clean_str(target)
target = normalize_extracted_info(target, is_entity=True)
source = sanitize_and_normalize_extracted_text(
record_attributes[1], remove_inner_quotes=True
)
target = sanitize_and_normalize_extracted_text(
record_attributes[2], remove_inner_quotes=True
)
# Validate entity names after all cleaning steps
if not source or not source.strip():
if not source:
logger.warning(
f"Relationship extraction error: source entity became empty after cleaning. Original: '{record_attributes[1]}'"
)
return None
if not target or not target.strip():
if not target:
logger.warning(
f"Relationship extraction error: target entity became empty after cleaning. Original: '{record_attributes[2]}'"
)
@@ -417,17 +416,15 @@ async def _handle_single_relationship_extraction(
)
return None
# Process relationship description with same cleaning pipeline
edge_description = sanitize_text_for_encoding(record_attributes[3])
edge_description = clean_str(edge_description)
edge_description = normalize_extracted_info(edge_description)
# Process keywords with same cleaning pipeline
edge_keywords = sanitize_text_for_encoding(record_attributes[4])
edge_keywords = clean_str(edge_keywords)
edge_keywords = normalize_extracted_info(edge_keywords, is_entity=True)
edge_keywords = sanitize_and_normalize_extracted_text(
record_attributes[3], remove_inner_quotes=True
)
edge_keywords = edge_keywords.replace("，", ",")
# Process relationship description with same cleaning pipeline
edge_description = sanitize_and_normalize_extracted_text(record_attributes[4])
edge_source_id = chunk_key
weight = (
float(record_attributes[-1].strip('"').strip("'"))
@@ -446,12 +443,12 @@ async def _handle_single_relationship_extraction(
)
except ValueError as e:
logger.error(
logger.warning(
f"Relationship extraction failed due to encoding issues in chunk {chunk_key}: {e}"
)
return None
except Exception as e:
logger.error(
logger.warning(
f"Relationship extraction failed with unexpected error in chunk {chunk_key}: {e}"
)
return None
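To see what these handlers receive: each record emitted by the LLM is split on the tuple delimiter into `record_attributes`, and the first field is now matched as a bare substring rather than with surrounding quotes. A hedged sketch (`split_record` is a simplified stand-in for the splitting done via `split_string_by_multi_markers`; the `<|>` delimiter value is an assumed default):

```python
TUPLE_DELIMITER = "<|>"  # assumed default; LightRAG reads it from PROMPTS

def split_record(record: str) -> list[str]:
    """Strip the surrounding parentheses and split one extraction
    record into its fields (simplified stand-in)."""
    record = record.strip().removeprefix("(").removesuffix(")")
    return [field.strip() for field in record.split(TUPLE_DELIMITER)]

record = "(relationship<|>Alex<|>Taylor<|>power dynamics<|>Alex observes Taylor.)"
attrs = split_record(record)

# New check: "relationship" matched unquoted, not as '"relationship"'
if len(attrs) >= 5 and "relationship" in attrs[0]:
    src, tgt, keywords, description = attrs[1], attrs[2], attrs[3], attrs[4]
    print(src, tgt, keywords, description)
```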
@@ -1689,13 +1686,8 @@ async def extract_entities(
entity_types = global_config["addon_params"].get(
"entity_types", DEFAULT_ENTITY_TYPES
)
example_number = global_config["addon_params"].get("example_number", None)
if example_number and example_number < len(PROMPTS["entity_extraction_examples"]):
examples = "\n".join(
PROMPTS["entity_extraction_examples"][: int(example_number)]
)
else:
examples = "\n".join(PROMPTS["entity_extraction_examples"])
examples = "\n".join(PROMPTS["entity_extraction_examples"])
example_context_base = dict(
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
@@ -2140,13 +2132,8 @@ async def extract_keywords_only(
)
# 2. Build the examples
example_number = global_config["addon_params"].get("example_number", None)
if example_number and example_number < len(PROMPTS["keywords_extraction_examples"]):
examples = "\n".join(
PROMPTS["keywords_extraction_examples"][: int(example_number)]
)
else:
examples = "\n".join(PROMPTS["keywords_extraction_examples"])
examples = "\n".join(PROMPTS["keywords_extraction_examples"])
language = global_config["addon_params"].get("language", DEFAULT_SUMMARY_LANGUAGE)
# 3. Process conversation history

lightrag/prompt.py View file

@@ -15,27 +15,35 @@ Given a text document that is potentially relevant to this activity and a list o
Use {language} as output language.
---Steps---
1. Identify all entities. For each identified entity, extract the following information:
1. Identify definitively conceptualized entities in the text. For each identified entity, extract the following information:
- entity_name: Name of the entity, using the same language as the input text. If English, capitalize the name
- entity_type: One of the following types: [{entity_types}]
- entity_description: Provide a comprehensive description of the entity's attributes and activities *based solely on the information present in the input text*. **Do not infer or hallucinate information not explicitly stated.** If the text provides insufficient information to create a comprehensive description, state "Description not available in text."
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
- entity_type: One of the following types: [{entity_types}]. If the entity doesn't clearly fit any category, classify it as "Other".
- entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. Do not add external knowledge.
2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
2. Format each entity as:
("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
3. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are directly and clearly related based on the text. Unsubstantiated relationships must be excluded from the output.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
- relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection
3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
4. Format each relationship as:
("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_description>)
4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
5. Use `{tuple_delimiter}` as field delimiter. Use `{record_delimiter}` as the list delimiter. Ensure no spaces are added around the delimiters.
5. When finished, output {completion_delimiter}
6. When finished, output `{completion_delimiter}`
7. Return identified entities and relationships in {language}.
---Quality Guidelines---
- Only extract entities that are clearly defined and meaningful in the context
- Avoid over-interpretation; stick to what is explicitly stated in the text
- Include specific numerical data in entity name when relevant
- Ensure entity names are consistent throughout the extraction
---Examples---
{examples}
@@ -43,15 +51,18 @@ Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_
---Real Data---
Entity_types: [{entity_types}]
Text:
```
{input_text}
```
---Output---
Output:"""
Output:
"""
PROMPTS["entity_extraction_examples"] = [
"""------Example 1------
Entity_types: [person, technology, mission, organization, location]
Entity_types: [organization,person,equipment,product,technology,location,event,category]
Text:
```
while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
@@ -64,22 +75,22 @@ It was a small transformation, barely perceptible, but one that Alex noted with
```
Output:
("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter}
("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter}
("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter}
("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter}
("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter}
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter}
("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter}
("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter}
("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter}
("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter}
("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter}
(entity{tuple_delimiter}Alex{tuple_delimiter}person{tuple_delimiter}Alex is a character who experiences frustration and is observant of the dynamics among other characters.){record_delimiter}
(entity{tuple_delimiter}Taylor{tuple_delimiter}person{tuple_delimiter}Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective.){record_delimiter}
(entity{tuple_delimiter}Jordan{tuple_delimiter}person{tuple_delimiter}Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device.){record_delimiter}
(entity{tuple_delimiter}Cruz{tuple_delimiter}person{tuple_delimiter}Cruz is associated with a vision of control and order, influencing the dynamics among other characters.){record_delimiter}
(entity{tuple_delimiter}The Device{tuple_delimiter}equipment{tuple_delimiter}The Device is central to the story, with potential game-changing implications, and is revered by Taylor.){record_delimiter}
(relationship{tuple_delimiter}Alex{tuple_delimiter}Taylor{tuple_delimiter}power dynamics, observation{tuple_delimiter}Alex observes Taylor's authoritarian behavior and notes changes in Taylor's attitude toward the device.){record_delimiter}
(relationship{tuple_delimiter}Alex{tuple_delimiter}Jordan{tuple_delimiter}shared goals, rebellion{tuple_delimiter}Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision.){record_delimiter}
(relationship{tuple_delimiter}Taylor{tuple_delimiter}Jordan{tuple_delimiter}conflict resolution, mutual respect{tuple_delimiter}Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce.){record_delimiter}
(relationship{tuple_delimiter}Jordan{tuple_delimiter}Cruz{tuple_delimiter}ideological conflict, rebellion{tuple_delimiter}Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order.){record_delimiter}
(relationship{tuple_delimiter}Taylor{tuple_delimiter}The Device{tuple_delimiter}reverence, technological significance{tuple_delimiter}Taylor shows reverence towards the device, indicating its importance and potential impact.){record_delimiter}
{completion_delimiter}
""",
"""------Example 2------
Entity_types: [company, index, commodity, market_trend, economic_policy, biological]
Entity_types: [organization,person,equipment,product,technology,location,event,category]
Text:
```
Stock markets faced a sharp downturn today as tech giants saw significant declines, with the Global Tech Index dropping by 3.4% in midday trading. Analysts attribute the selloff to investor concerns over rising interest rates and regulatory uncertainty.
@@ -92,40 +103,64 @@ Financial experts are closely watching the Federal Reserve's next move, as specu
```
Output:
("entity"{tuple_delimiter}"Global Tech Index"{tuple_delimiter}"index"{tuple_delimiter}"The Global Tech Index tracks the performance of major technology stocks and experienced a 3.4% decline today."){record_delimiter}
("entity"{tuple_delimiter}"Nexon Technologies"{tuple_delimiter}"company"{tuple_delimiter}"Nexon Technologies is a tech company that saw its stock decline by 7.8% after disappointing earnings."){record_delimiter}
("entity"{tuple_delimiter}"Omega Energy"{tuple_delimiter}"company"{tuple_delimiter}"Omega Energy is an energy company that gained 2.1% in stock value due to rising oil prices."){record_delimiter}
("entity"{tuple_delimiter}"Gold Futures"{tuple_delimiter}"commodity"{tuple_delimiter}"Gold futures rose by 1.5%, indicating increased investor interest in safe-haven assets."){record_delimiter}
("entity"{tuple_delimiter}"Crude Oil"{tuple_delimiter}"commodity"{tuple_delimiter}"Crude oil prices rose to $87.60 per barrel due to supply constraints and strong demand."){record_delimiter}
("entity"{tuple_delimiter}"Market Selloff"{tuple_delimiter}"market_trend"{tuple_delimiter}"Market selloff refers to the significant decline in stock values due to investor concerns over interest rates and regulations."){record_delimiter}
("entity"{tuple_delimiter}"Federal Reserve Policy Announcement"{tuple_delimiter}"economic_policy"{tuple_delimiter}"The Federal Reserve's upcoming policy announcement is expected to impact investor confidence and market stability."){record_delimiter}
("relationship"{tuple_delimiter}"Global Tech Index"{tuple_delimiter}"Market Selloff"{tuple_delimiter}"The decline in the Global Tech Index is part of the broader market selloff driven by investor concerns."{tuple_delimiter}"market performance, investor sentiment"{tuple_delimiter}9){record_delimiter}
("relationship"{tuple_delimiter}"Nexon Technologies"{tuple_delimiter}"Global Tech Index"{tuple_delimiter}"Nexon Technologies' stock decline contributed to the overall drop in the Global Tech Index."{tuple_delimiter}"company impact, index movement"{tuple_delimiter}8){record_delimiter}
("relationship"{tuple_delimiter}"Gold Futures"{tuple_delimiter}"Market Selloff"{tuple_delimiter}"Gold prices rose as investors sought safe-haven assets during the market selloff."{tuple_delimiter}"market reaction, safe-haven investment"{tuple_delimiter}10){record_delimiter}
("relationship"{tuple_delimiter}"Federal Reserve Policy Announcement"{tuple_delimiter}"Market Selloff"{tuple_delimiter}"Speculation over Federal Reserve policy changes contributed to market volatility and investor selloff."{tuple_delimiter}"interest rate impact, financial regulation"{tuple_delimiter}7){record_delimiter}
("content_keywords"{tuple_delimiter}"market downturn, investor sentiment, commodities, Federal Reserve, stock performance"){completion_delimiter}
(entity{tuple_delimiter}Global Tech Index{tuple_delimiter}category{tuple_delimiter}The Global Tech Index tracks the performance of major technology stocks and experienced a 3.4% decline today.){record_delimiter}
(entity{tuple_delimiter}Nexon Technologies{tuple_delimiter}organization{tuple_delimiter}Nexon Technologies is a tech company that saw its stock decline by 7.8% after disappointing earnings.){record_delimiter}
(entity{tuple_delimiter}Omega Energy{tuple_delimiter}organization{tuple_delimiter}Omega Energy is an energy company that gained 2.1% in stock value due to rising oil prices.){record_delimiter}
(entity{tuple_delimiter}Gold Futures{tuple_delimiter}product{tuple_delimiter}Gold futures rose by 1.5%, indicating increased investor interest in safe-haven assets.){record_delimiter}
(entity{tuple_delimiter}Crude Oil{tuple_delimiter}product{tuple_delimiter}Crude oil prices rose to $87.60 per barrel due to supply constraints and strong demand.){record_delimiter}
(entity{tuple_delimiter}Market Selloff{tuple_delimiter}category{tuple_delimiter}Market selloff refers to the significant decline in stock values due to investor concerns over interest rates and regulations.){record_delimiter}
(entity{tuple_delimiter}Federal Reserve Policy Announcement{tuple_delimiter}category{tuple_delimiter}The Federal Reserve's upcoming policy announcement is expected to impact investor confidence and market stability.){record_delimiter}
(entity{tuple_delimiter}3.4% Decline{tuple_delimiter}category{tuple_delimiter}The Global Tech Index experienced a 3.4% decline in midday trading.){record_delimiter}
(relationship{tuple_delimiter}Global Tech Index{tuple_delimiter}Market Selloff{tuple_delimiter}market performance, investor sentiment{tuple_delimiter}The decline in the Global Tech Index is part of the broader market selloff driven by investor concerns.){record_delimiter}
(relationship{tuple_delimiter}Nexon Technologies{tuple_delimiter}Global Tech Index{tuple_delimiter}company impact, index movement{tuple_delimiter}Nexon Technologies' stock decline contributed to the overall drop in the Global Tech Index.){record_delimiter}
(relationship{tuple_delimiter}Gold Futures{tuple_delimiter}Market Selloff{tuple_delimiter}market reaction, safe-haven investment{tuple_delimiter}Gold prices rose as investors sought safe-haven assets during the market selloff.){record_delimiter}
(relationship{tuple_delimiter}Federal Reserve Policy Announcement{tuple_delimiter}Market Selloff{tuple_delimiter}interest rate impact, financial regulation{tuple_delimiter}Speculation over Federal Reserve policy changes contributed to market volatility and investor selloff.){record_delimiter}
{completion_delimiter}
""",
"""------Example 3------
Entity_types: [economic_policy, athlete, event, location, record, organization, equipment]
Entity_types: [organization,person,equipment,product,technology,location,event,category]
Text:
```
At the World Athletics Championship in Tokyo, Noah Carter broke the 100m sprint record using cutting-edge carbon-fiber spikes.
```
Output:
("entity"{tuple_delimiter}"World Athletics Championship"{tuple_delimiter}"event"{tuple_delimiter}"The World Athletics Championship is a global sports competition featuring top athletes in track and field."){record_delimiter}
("entity"{tuple_delimiter}"Tokyo"{tuple_delimiter}"location"{tuple_delimiter}"Tokyo is the host city of the World Athletics Championship."){record_delimiter}
("entity"{tuple_delimiter}"Noah Carter"{tuple_delimiter}"athlete"{tuple_delimiter}"Noah Carter is a sprinter who set a new record in the 100m sprint at the World Athletics Championship."){record_delimiter}
("entity"{tuple_delimiter}"100m Sprint Record"{tuple_delimiter}"record"{tuple_delimiter}"The 100m sprint record is a benchmark in athletics, recently broken by Noah Carter."){record_delimiter}
("entity"{tuple_delimiter}"Carbon-Fiber Spikes"{tuple_delimiter}"equipment"{tuple_delimiter}"Carbon-fiber spikes are advanced sprinting shoes that provide enhanced speed and traction."){record_delimiter}
("entity"{tuple_delimiter}"World Athletics Federation"{tuple_delimiter}"organization"{tuple_delimiter}"The World Athletics Federation is the governing body overseeing the World Athletics Championship and record validations."){record_delimiter}
("relationship"{tuple_delimiter}"World Athletics Championship"{tuple_delimiter}"Tokyo"{tuple_delimiter}"The World Athletics Championship is being hosted in Tokyo."{tuple_delimiter}"event location, international competition"{tuple_delimiter}8){record_delimiter}
("relationship"{tuple_delimiter}"Noah Carter"{tuple_delimiter}"100m Sprint Record"{tuple_delimiter}"Noah Carter set a new 100m sprint record at the championship."{tuple_delimiter}"athlete achievement, record-breaking"{tuple_delimiter}10){record_delimiter}
("relationship"{tuple_delimiter}"Noah Carter"{tuple_delimiter}"Carbon-Fiber Spikes"{tuple_delimiter}"Noah Carter used carbon-fiber spikes to enhance performance during the race."{tuple_delimiter}"athletic equipment, performance boost"{tuple_delimiter}7){record_delimiter}
("relationship"{tuple_delimiter}"World Athletics Federation"{tuple_delimiter}"100m Sprint Record"{tuple_delimiter}"The World Athletics Federation is responsible for validating and recognizing new sprint records."{tuple_delimiter}"sports regulation, record certification"{tuple_delimiter}9){record_delimiter}
("content_keywords"{tuple_delimiter}"athletics, sprinting, record-breaking, sports technology, competition"){completion_delimiter}
(entity{tuple_delimiter}World Athletics Championship{tuple_delimiter}event{tuple_delimiter}The World Athletics Championship is a global sports competition featuring top athletes in track and field.){record_delimiter}
(entity{tuple_delimiter}Tokyo{tuple_delimiter}location{tuple_delimiter}Tokyo is the host city of the World Athletics Championship.){record_delimiter}
(entity{tuple_delimiter}Noah Carter{tuple_delimiter}person{tuple_delimiter}Noah Carter is a sprinter who set a new record in the 100m sprint at the World Athletics Championship.){record_delimiter}
(entity{tuple_delimiter}100m Sprint Record{tuple_delimiter}category{tuple_delimiter}The 100m sprint record is a benchmark in athletics, recently broken by Noah Carter.){record_delimiter}
(entity{tuple_delimiter}Carbon-Fiber Spikes{tuple_delimiter}equipment{tuple_delimiter}Carbon-fiber spikes are advanced sprinting shoes that provide enhanced speed and traction.){record_delimiter}
(entity{tuple_delimiter}World Athletics Federation{tuple_delimiter}organization{tuple_delimiter}The World Athletics Federation is the governing body overseeing the World Athletics Championship and record validations.){record_delimiter}
(relationship{tuple_delimiter}World Athletics Championship{tuple_delimiter}Tokyo{tuple_delimiter}event location, international competition{tuple_delimiter}The World Athletics Championship is being hosted in Tokyo.){record_delimiter}
(relationship{tuple_delimiter}Noah Carter{tuple_delimiter}100m Sprint Record{tuple_delimiter}athlete achievement, record-breaking{tuple_delimiter}Noah Carter set a new 100m sprint record at the championship.){record_delimiter}
(relationship{tuple_delimiter}Noah Carter{tuple_delimiter}Carbon-Fiber Spikes{tuple_delimiter}athletic equipment, performance boost{tuple_delimiter}Noah Carter used carbon-fiber spikes to enhance performance during the race.){record_delimiter}
(relationship{tuple_delimiter}Noah Carter{tuple_delimiter}World Athletics Championship{tuple_delimiter}athlete participation, competition{tuple_delimiter}Noah Carter is competing at the World Athletics Championship.){record_delimiter}
{completion_delimiter}
""",
"""------Example 4------
Entity_types: [organization,person,equipment,product,technology,location,event,category]
Text:
```
在北京举行的人工智能大会上腾讯公司的首席技术官张伟发布了最新的大语言模型"腾讯智言"该模型在自然语言处理方面取得了重大突破
```
Output:
(entity{tuple_delimiter}人工智能大会{tuple_delimiter}event{tuple_delimiter}人工智能大会是在北京举行的技术会议专注于人工智能领域的最新发展){record_delimiter}
(entity{tuple_delimiter}北京{tuple_delimiter}location{tuple_delimiter}北京是人工智能大会的举办城市){record_delimiter}
(entity{tuple_delimiter}腾讯公司{tuple_delimiter}organization{tuple_delimiter}腾讯公司是参与人工智能大会的科技企业发布了新的语言模型产品){record_delimiter}
(entity{tuple_delimiter}张伟{tuple_delimiter}person{tuple_delimiter}张伟是腾讯公司的首席技术官在大会上发布了新产品){record_delimiter}
(entity{tuple_delimiter}腾讯智言{tuple_delimiter}product{tuple_delimiter}腾讯智言是腾讯公司发布的大语言模型产品在自然语言处理方面有重大突破){record_delimiter}
(entity{tuple_delimiter}自然语言处理技术{tuple_delimiter}technology{tuple_delimiter}自然语言处理技术是腾讯智言模型取得重大突破的技术领域){record_delimiter}
(relationship{tuple_delimiter}人工智能大会{tuple_delimiter}北京{tuple_delimiter}会议地点, 举办关系{tuple_delimiter}人工智能大会在北京举行){record_delimiter}
(relationship{tuple_delimiter}张伟{tuple_delimiter}腾讯公司{tuple_delimiter}雇佣关系, 高管职位{tuple_delimiter}张伟担任腾讯公司的首席技术官){record_delimiter}
(relationship{tuple_delimiter}张伟{tuple_delimiter}腾讯智言{tuple_delimiter}产品发布, 技术展示{tuple_delimiter}张伟在大会上发布了腾讯智言大语言模型){record_delimiter}
(relationship{tuple_delimiter}腾讯智言{tuple_delimiter}自然语言处理技术{tuple_delimiter}技术应用, 突破创新{tuple_delimiter}腾讯智言在自然语言处理技术方面取得了重大突破){record_delimiter}
{completion_delimiter}
""",
]
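How these templates are consumed, roughly: the placeholders are filled via `str.format` with the configured entity types and delimiters before the prompt is sent to the LLM. A minimal sketch; the condensed template below and the delimiter values `<|>`, `##`, `<|COMPLETE|>` are assumptions standing in for the real `PROMPTS` entries:

```python
entity_extraction_template = (
    "Use {language} as output language.\n"
    "Entity_types: [{entity_types}]\n"
    "Format: (entity{tuple_delimiter}<name>{tuple_delimiter}<type>"
    "{tuple_delimiter}<description>){record_delimiter}\n"
    "When finished, output {completion_delimiter}\n"
    "Text:\n{input_text}"
)

entity_types = ["Organization", "Person", "Equipment", "Product",
                "Technology", "Location", "Event", "Category"]

prompt = entity_extraction_template.format(
    language="English",
    entity_types=",".join(entity_types),
    tuple_delimiter="<|>",                # assumed DEFAULT_TUPLE_DELIMITER
    record_delimiter="##",                # assumed DEFAULT_RECORD_DELIMITER
    completion_delimiter="<|COMPLETE|>",  # assumed DEFAULT_COMPLETION_DELIMITER
    input_text="Nexon Technologies saw its stock decline by 7.8%.",
)
print(prompt)
```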
@@ -149,40 +184,39 @@ Description List:
{description_list}
---Output---
Output:
"""
Output:"""
PROMPTS["entity_continue_extraction"] = """
MANY entities and relationships were missed in the last extraction. Please find only the missing entities and relationships from previous text.
MANY entities and relationships were missed in the last extraction. Please find only the missing entities and relationships from the previous text. Do not include entities and relations that have already been extracted.
---Remember Steps---
1. Identify all entities. For each identified entity, extract the following information:
1. Identify definitively conceptualized entities in the text. For each identified entity, extract the following information:
- entity_name: Name of the entity, using the same language as the input text. If English, capitalize the name
- entity_type: One of the following types: [{entity_types}]
- entity_description: Provide a comprehensive description of the entity's attributes and activities *based solely on the information present in the input text*. **Do not infer or hallucinate information not explicitly stated.** If the text provides insufficient information to create a comprehensive description, state "Description not available in text."
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
- entity_type: One of the following types: [{entity_types}]. If the entity doesn't clearly fit any category, classify it as "Other".
- entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. Do not add external knowledge.
2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
2. Format each entity as:
("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
3. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are directly and clearly related based on the text. Unsubstantiated relationships must be excluded from the output.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
- relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection
3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
4. Format each relationship as:
("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_description>)
4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
5. Use `{tuple_delimiter}` as field delimiter. Use `{record_delimiter}` as the list delimiter. Ensure no spaces are added around the delimiters.
5. When finished, output {completion_delimiter}
6. When finished, output `{completion_delimiter}`
7. Return identified entities and relationships in {language}.
---Output---
Add new entities and relations below using the same format, and do not include entities and relations that have been previously extracted. :\n
""".strip()
Output:
"""
PROMPTS["entity_if_loop_extraction"] = """
---Goal---

lightrag/utils.py View file

@@ -82,6 +82,27 @@ def get_env_value(
if value_type is bool:
return value.lower() in ("true", "1", "yes", "t", "on")
# Handle list type with JSON parsing
if value_type is list:
try:
import json
parsed_value = json.loads(value)
# Ensure the parsed value is actually a list
if isinstance(parsed_value, list):
return parsed_value
else:
logger.warning(
f"Environment variable {env_key} is not a valid JSON list, using default"
)
return default
except (json.JSONDecodeError, ValueError) as e:
logger.warning(
f"Failed to parse {env_key} as JSON list: {e}, using default"
)
return default
try:
return value_type(value)
except (ValueError, TypeError):
@@ -374,6 +395,7 @@
max_task_duration: float = None,
max_queue_size: int = 1000,
cleanup_timeout: float = 2.0,
queue_name: str = "limit_async",
):
"""
Enhanced priority-limited asynchronous function call decorator with robust timeout handling
@@ -391,6 +413,7 @@
max_execution_timeout: Maximum time for worker to execute function (defaults to llm_timeout + 30s)
max_task_duration: Maximum time before health check intervenes (defaults to llm_timeout + 60s)
cleanup_timeout: Maximum time to wait for cleanup operations (defaults to 2.0s)
queue_name: Optional queue name for logging identification (defaults to "limit_async")
Returns:
Decorator function
@@ -482,7 +505,7 @@
except asyncio.TimeoutError:
# Worker-level timeout (max_execution_timeout exceeded)
logger.warning(
f"limit_async: Worker timeout for task {task_id} after {max_execution_timeout}s"
f"{queue_name}: Worker timeout for task {task_id} after {max_execution_timeout}s"
)
if not task_state.future.done():
task_state.future.set_exception(
@@ -495,12 +518,12 @@
if not task_state.future.done():
task_state.future.cancel()
logger.debug(
f"limit_async: Task {task_id} cancelled during execution"
f"{queue_name}: Task {task_id} cancelled during execution"
)
except Exception as e:
# Function execution error
logger.error(
f"limit_async: Error in decorated function for task {task_id}: {str(e)}"
f"{queue_name}: Error in decorated function for task {task_id}: {str(e)}"
)
if not task_state.future.done():
task_state.future.set_exception(e)
@@ -512,10 +535,12 @@
except Exception as e:
# Critical error in worker loop
logger.error(f"limit_async: Critical error in worker: {str(e)}")
logger.error(
f"{queue_name}: Critical error in worker: {str(e)}"
)
await asyncio.sleep(0.1)
finally:
logger.debug("limit_async: Worker exiting")
logger.debug(f"{queue_name}: Worker exiting")
async def enhanced_health_check():
"""Enhanced health check with stuck task detection and recovery"""
@@ -549,7 +574,7 @@
# Force cleanup of stuck tasks
for task_id, execution_duration in stuck_tasks:
logger.warning(
f"limit_async: Detected stuck task {task_id} (execution time: {execution_duration:.1f}s), forcing cleanup"
f"{queue_name}: Detected stuck task {task_id} (execution time: {execution_duration:.1f}s), forcing cleanup"
)
async with task_states_lock:
if task_id in task_states:
@@ -572,7 +597,7 @@
if workers_needed > 0:
logger.info(
f"limit_async: Creating {workers_needed} new workers"
f"{queue_name}: Creating {workers_needed} new workers"
)
new_tasks = set()
for _ in range(workers_needed):
@@ -582,9 +607,9 @@
tasks.update(new_tasks)
except Exception as e:
logger.error(f"limit_async: Error in enhanced health check: {str(e)}")
logger.error(f"{queue_name}: Error in enhanced health check: {str(e)}")
finally:
logger.debug("limit_async: Enhanced health check task exiting")
logger.debug(f"{queue_name}: Enhanced health check task exiting")
initialized = False
async def ensure_workers():
@@ -601,7 +626,7 @@
if reinit_count > 0:
reinit_count += 1
logger.warning(
f"limit_async: Reinitializing system (count: {reinit_count})"
f"{queue_name}: Reinitializing system (count: {reinit_count})"
)
else:
reinit_count = 1
@@ -614,7 +639,7 @@
active_tasks_count = len(tasks)
if active_tasks_count > 0 and reinit_count > 1:
logger.warning(
f"limit_async: {active_tasks_count} tasks still running during reinitialization"
f"{queue_name}: {active_tasks_count} tasks still running during reinitialization"
)
# Create worker tasks
@@ -641,12 +666,12 @@
f" (Timeouts: {', '.join(timeout_info)})" if timeout_info else ""
)
logger.info(
f"limit_async: {workers_needed} new workers initialized {timeout_str}"
f"{queue_name}: {workers_needed} new workers initialized {timeout_str}"
)
async def shutdown():
"""Gracefully shut down all workers and cleanup resources"""
logger.info("limit_async: Shutting down priority queue workers")
logger.info(f"{queue_name}: Shutting down priority queue workers")
shutdown_event.set()
@@ -667,7 +692,7 @@
await asyncio.wait_for(queue.join(), timeout=5.0)
except asyncio.TimeoutError:
logger.warning(
"limit_async: Timeout waiting for queue to empty during shutdown"
f"{queue_name}: Timeout waiting for queue to empty during shutdown"
)
# Cancel worker tasks
@@ -687,7 +712,7 @@
except asyncio.CancelledError:
pass
logger.info("limit_async: Priority queue workers shutdown complete")
logger.info(f"{queue_name}: Priority queue workers shutdown complete")
@wraps(func)
async def wait_func(
@@ -750,7 +775,7 @@
)
except asyncio.TimeoutError:
raise QueueFullError(
f"Queue full, timeout after {_queue_timeout} seconds"
f"{queue_name}: Queue full, timeout after {_queue_timeout} seconds"
)
except Exception as e:
# Clean up on queue error
@@ -785,14 +810,14 @@
await asyncio.sleep(0.1)
raise TimeoutError(
f"limit_async: User timeout after {_timeout} seconds"
f"{queue_name}: User timeout after {_timeout} seconds"
)
except WorkerTimeoutError as e:
# This is Worker-level timeout, directly propagate exception information
raise TimeoutError(f"limit_async: {str(e)}")
raise TimeoutError(f"{queue_name}: {str(e)}")
except HealthCheckTimeoutError as e:
# This is Health Check-level timeout, directly propagate exception information
raise TimeoutError(f"limit_async: {str(e)}")
raise TimeoutError(f"{queue_name}: {str(e)}")
finally:
# Ensure cleanup
@@ -931,19 +956,6 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]
return [r.strip() for r in results if r.strip()]
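For reference, the splitter the extraction handlers rely on behaves roughly like this; a hedged sketch matching the `return` line above, not necessarily the exact implementation:

```python
import re

def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]:
    """Split content on any of the given markers and drop empty pieces."""
    if not markers:
        return [content]
    pattern = "|".join(re.escape(marker) for marker in markers)
    results = re.split(pattern, content)
    return [r.strip() for r in results if r.strip()]

records = split_string_by_multi_markers(
    "(entity<|>Tokyo<|>location<|>Host city.)##(entity<|>Noah Carter<|>person<|>Sprinter.)",
    ["##"],
)
print(records)  # two record strings, one per extracted entity
```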
# Refer the utils functions of the official GraphRAG implementation:
# https://github.com/microsoft/graphrag
def clean_str(input: Any) -> str:
"""Clean an input string by removing HTML escapes, control characters, and other unwanted characters."""
# If we get non-string input, just give it back
if not isinstance(input, str):
return input
result = html.unescape(input.strip())
# https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python
return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result)
def is_float_regex(value: str) -> bool:
return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value))
@@ -1728,29 +1740,78 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
return content[:max_length] + "..."
def normalize_extracted_info(name: str, is_entity=False) -> str:
def sanitize_and_normalize_extracted_text(
input_text: str, remove_inner_quotes=False
) -> str:
"""Santitize and normalize extracted text
Args:
input_text: text string to be processed
is_name: whether the input text is a entity or relation name
Returns:
Santitized and normalized text string
"""
safe_input_text = sanitize_text_for_encoding(input_text)
if safe_input_text:
normalized_text = normalize_extracted_info(
safe_input_text, remove_inner_quotes=remove_inner_quotes
)
return normalized_text
return ""
def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
"""Normalize entity/relation names and description with the following rules:
1. Remove spaces between Chinese characters
2. Remove spaces between Chinese characters and English letters/numbers
3. Preserve spaces within English text and numbers
4. Replace Chinese parentheses with English parentheses
5. Replace Chinese dash with English dash
6. Remove English quotation marks from the beginning and end of the text
7. Remove English quotation marks in and around chinese
8. Remove Chinese quotation marks
1. Clean HTML tags (paragraph and line break tags)
2. Convert Chinese symbols to English symbols
3. Remove spaces between Chinese characters
4. Remove spaces between Chinese characters and English letters/numbers
5. Preserve spaces within English text and numbers
6. Replace Chinese parentheses with English parentheses
7. Replace Chinese dash with English dash
8. Remove English quotation marks from the beginning and end of the text
9. Remove English quotation marks in and around chinese
10. Remove Chinese quotation marks
11. Filter out short numeric-only text (length < 3 and only digits/dots)
Args:
name: Text to normalize
remove_inner_quotes: Whether to strip quotation marks in and around names (affects quote handling)
Returns:
Normalized text
"""
# 1. Clean HTML tags - remove paragraph and line break tags
name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
# 2. Convert Chinese symbols to English symbols
# Chinese full-width letters to half-width (A-Z, a-z)
name = name.translate(
str.maketrans(
"ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ",
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
)
)
# Chinese full-width numbers to half-width
name = name.translate(str.maketrans("０１２３４５６７８９", "0123456789"))
# Chinese full-width symbols to half-width
name = name.replace("－", "-")  # Chinese minus
name = name.replace("＋", "+")  # Chinese plus
name = name.replace("／", "/")  # Chinese slash
name = name.replace("＊", "*")  # Chinese asterisk
# Replace Chinese parentheses with English parentheses
name = name.replace("（", "(").replace("）", ")")
# Replace Chinese dash with English dash
# Replace Chinese dash with English dash (additional patterns)
name = name.replace("—", "-").replace("–", "-")
# Chinese full-width space to regular space (after other replacements)
name = name.replace("　", " ")
# Use regex to remove spaces between Chinese characters
# Regex explanation:
# (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
@@ -1766,19 +1827,57 @@ def normalize_extracted_info(name: str, is_entity=False) -> str:
r"(?<=[a-zA-Z0-9\(\)\[\]@#$%!&\*\-=+_])\s+(?=[\u4e00-\u9fa5])", "", name
)
# Remove English quotation marks from the beginning and end
if len(name) >= 2 and name.startswith('"') and name.endswith('"'):
name = name[1:-1]
if len(name) >= 2 and name.startswith("'") and name.endswith("'"):
name = name[1:-1]
# Remove outer quotes
if len(name) >= 2:
# Handle double quotes
if name.startswith('"') and name.endswith('"'):
inner_content = name[1:-1]
if '"' not in inner_content: # No double quotes inside
name = inner_content
if is_entity:
# Handle single quotes
if name.startswith("'") and name.endswith("'"):
inner_content = name[1:-1]
if "'" not in inner_content: # No single quotes inside
name = inner_content
# Handle Chinese-style double quotes
if name.startswith("“") and name.endswith("”"):
inner_content = name[1:-1]
if "“" not in inner_content and "”" not in inner_content:
name = inner_content
if name.startswith("「") and name.endswith("」"):
inner_content = name[1:-1]
if "「" not in inner_content and "」" not in inner_content:
name = inner_content
if remove_inner_quotes:
# remove Chinese quotes
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
# remove English quotes in and around chinese
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
# Remove spaces from the beginning and end of the text
name = name.strip()
# Filter out pure numeric content with length < 3
if len(name) < 3 and re.match(r"^[0-9]+$", name):
return ""
def should_filter_by_dots(text):
"""
Check if the string consists only of dots and digits, with at least one dot
Filter cases include: 1.2.3, 12.3, .123, 123., 12.3., .1.23 etc.
"""
return all(c.isdigit() or c == "." for c in text) and "." in text
if len(name) < 6 and should_filter_by_dots(name):
# Filter out mixed numeric and dot content with length < 6
return ""
# Filter out mixed numeric and dot content with length < 6, requiring at least one dot
return ""
return name
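A quick illustration of the combined rules, assuming the quote and full-width mappings reconstructed above (import path assumed):

```python
from lightrag.utils import normalize_extracted_info  # assumed import path

print(normalize_extracted_info("腾讯 公司"))             # 腾讯公司 (space between Chinese chars removed)
print(normalize_extracted_info("“腾讯智言”", remove_inner_quotes=True))  # 腾讯智言
print(normalize_extracted_info("（北京）ＡＩ大会"))       # (北京)AI大会
print(normalize_extracted_info("3.4"))                   # "" (short digits-and-dots string filtered)
```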
@@ -1789,6 +1888,8 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
- Surrogate characters (the main cause of encoding errors)
- Other invalid Unicode sequences
- Control characters that might cause issues
- Unescape HTML escapes
- Remove control characters
- Whitespace trimming
Args:
@@ -1801,9 +1902,6 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
Raises:
ValueError: When text contains uncleanable encoding issues that cannot be safely processed
"""
if not isinstance(text, str):
return str(text)
if not text:
return text
@@ -1845,7 +1943,13 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
# Test final encoding to ensure it's safe
sanitized.encode("utf-8")
return sanitized
# Unescape HTML escapes
sanitized = html.unescape(sanitized)
# Remove control characters
sanitized = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", sanitized)
return sanitized.strip()
except UnicodeEncodeError as e:
# Critical change: Don't return placeholder, raise exception for caller to handle