Merge pull request #2051 from danielaskdd/extract-result-process

Enhance KG Extraction for LLMs with Small Parameter Counts
Daniel.y 2025-09-03 17:59:09 +08:00 committed by GitHub
commit 61fb2444f0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 214 additions and 226 deletions

View file

@ -125,7 +125,7 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true
SUMMARY_LANGUAGE=English
### Entity types that the LLM will attempt to recognize
# ENTITY_TYPES='["Organization", "Person", "Equiment", "Product", "Technology", "Location", "Event", "Category"]'
# ENTITY_TYPES='["Organization", "Person", "Location", "Event", "Technology", "Equipment", "Product", "Document", "Category"]'
### Chunk size for document splitting, 500~1500 is recommended
# CHUNK_SIZE=1200
@ -175,11 +175,9 @@ LLM_BINDING_API_KEY=your_api_key
# LLM_BINDING=openai
### OpenAI Specific Parameters
### To mitigate endless output loops and prevent greedy decoding for Qwen3, set the temperature parameter to a value between 0.8 and 1.0
# OPENAI_LLM_TEMPERATURE=1.0
# OPENAI_LLM_REASONING_EFFORT=low
### If the presence penalty still cannot stop the model from generating repetitive or unconstrained output
# OPENAI_LLM_MAX_COMPLETION_TOKENS=16384
### To mitigate endless output loops and prevent greedy decoding for Qwen3, set the temperature and frequency penalty parameter to a highter value
# OPENAI_LLM_TEMPERATURE=1.2
# OPENAI_FREQUENCY_PENALTY=1.5
### OpenRouter Specific Parameters
# OPENAI_LLM_EXTRA_BODY='{"reasoning": {"enabled": false}}'
@ -194,7 +192,7 @@ LLM_BINDING_API_KEY=your_api_key
OLLAMA_LLM_NUM_CTX=32768
# OLLAMA_LLM_TEMPERATURE=1.0
### Stop sequences for Ollama LLM
# OLLAMA_LLM_STOP='["</s>", "Assistant:", "\n\n"]'
# OLLAMA_LLM_STOP='["</s>", "<|EOT|>"]'
### use the following command to see all support options for Ollama LLM
### lightrag-server --llm-binding ollama --help
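
Values such as OLLAMA_LLM_STOP and OPENAI_LLM_EXTRA_BODY above are written as JSON; a minimal sketch of reading one such value, assuming a plain os.environ lookup rather than the project's actual loader:

import json
import os

# Illustrative only: a JSON-array style .env value parses into a Python list of stop strings.
stop_raw = os.environ.get("OLLAMA_LLM_STOP", '["</s>", "<|EOT|>"]')
stop_sequences = json.loads(stop_raw)
print(stop_sequences)  # ['</s>', '<|EOT|>']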

View file

@ -1 +1 @@
__api_version__ = "0211"
__api_version__ = "0213"

File diff suppressed because one or more lines are too long

View file

@ -8,7 +8,7 @@
<link rel="icon" type="image/png" href="favicon.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Lightrag</title>
<script type="module" crossorigin src="/webui/assets/index-CIdJpUuC.js"></script>
<script type="module" crossorigin src="/webui/assets/index-BOxJ2b27.js"></script>
<link rel="modulepreload" crossorigin href="/webui/assets/react-vendor-DEwriMA6.js">
<link rel="modulepreload" crossorigin href="/webui/assets/ui-vendor-CeCm8EER.js">
<link rel="modulepreload" crossorigin href="/webui/assets/graph-vendor-B-X5JegA.js">

View file

@ -26,11 +26,12 @@ DEFAULT_SUMMARY_CONTEXT_SIZE = 12000
DEFAULT_ENTITY_TYPES = [
"Organization",
"Person",
"Equiment",
"Product",
"Technology",
"Location",
"Event",
"Technology",
"Equipment",
"Product",
"Document",
"Category",
]
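
The ENTITY_TYPES variable in env.example above is a JSON array; a hedged sketch of how such an override could fall back to DEFAULT_ENTITY_TYPES in the constants module shown here (hypothetical helper, not necessarily how the project reads it):

import json
import os

def resolve_entity_types(default=DEFAULT_ENTITY_TYPES):
    # Prefer the ENTITY_TYPES env var (a JSON array); fall back to the default list above.
    raw = os.environ.get("ENTITY_TYPES")
    if not raw:
        return list(default)
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return list(default)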

View file

@ -347,6 +347,9 @@ async def _handle_single_entity_extraction(
)
return None
# Capitalize the first letter of entity_type
entity_type = entity_type.title()
# Process entity description with same cleaning pipeline
entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])
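
The entity_type.title() call above canonicalizes casing so variant spellings from the LLM map to one label; a small illustration with made-up inputs:

for raw in ("PERSON", "person", "organization", "equipment"):
    print(raw.title())  # Person, Person, Organization, Equipment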
@ -793,6 +796,87 @@ async def _get_cached_extraction_results(
return sorted_cached_results
async def _process_extraction_result(
result: str,
chunk_key: str,
file_path: str = "unknown_source",
tuple_delimiter: str = "<|>",
record_delimiter: str = "##",
completion_delimiter: str = "<|COMPLETE|>",
) -> tuple[dict, dict]:
"""Process a single extraction result (either initial or gleaning)
Args:
result (str): The extraction result to process
chunk_key (str): The chunk key for source tracking
file_path (str): The file path for citation
tuple_delimiter (str): Delimiter for tuple fields
record_delimiter (str): Delimiter for records
completion_delimiter (str): Delimiter for completion
Returns:
tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships
"""
maybe_nodes = defaultdict(list)
maybe_edges = defaultdict(list)
# Standardize Chinese brackets around record_delimiter to English brackets
bracket_pattern = f"[)）](\\s*{re.escape(record_delimiter)}\\s*)[(（]"
result = re.sub(bracket_pattern, ")\\1(", result)
records = split_string_by_multi_markers(
result,
[record_delimiter, completion_delimiter],
)
for record in records:
# Remove outer brackets (support English and Chinese brackets)
record = record.strip()
if record.startswith("(") or record.startswith("（"):
record = record[1:]
if record.endswith(")") or record.endswith("）"):
record = record[:-1]
record = record.strip()
if record is None:
continue
if tuple_delimiter == "<|>":
# fix entity<| with entity<|>
record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record)
# fix relationship<| with relationship<|>
record = re.sub(r"^relationship<\|(?!>)", r"relationship<|>", record)
# fix <||> with <|>
record = record.replace("<||>", "<|>")
# fix < | > with <|>
record = record.replace("< | >", "<|>")
# fix <<|>> with <|>
record = record.replace("<<|>>", "<|>")
# fix <|>> with <|>
record = record.replace("<|>>", "<|>")
# fix <<|> with <|>
record = record.replace("<<|>", "<|>")
record_attributes = split_string_by_multi_markers(record, [tuple_delimiter])
# Try to parse as entity
entity_data = await _handle_single_entity_extraction(
record_attributes, chunk_key, file_path
)
if entity_data is not None:
maybe_nodes[entity_data["entity_name"]].append(entity_data)
continue
# Try to parse as relationship
relationship_data = await _handle_single_relationship_extraction(
record_attributes, chunk_key, file_path
)
if relationship_data is not None:
maybe_edges[
(relationship_data["src_id"], relationship_data["tgt_id"])
].append(relationship_data)
return dict(maybe_nodes), dict(maybe_edges)
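
A usage sketch for the helper above (illustrative chunk key, file path, and records; the default <|>, ## and <|COMPLETE|> delimiters are assumed): the first record is well formed, while the second starts with a truncated entity<| that the repair rewrites to entity<|> before parsing.

import asyncio

raw = (
    "(entity<|>Tokyo<|>location<|>Tokyo is the host city of the championship.)##"
    "(entity<|Noah Carter<|>person<|>Noah Carter set a new 100m record in Tokyo.)<|COMPLETE|>"
)
nodes, edges = asyncio.run(
    _process_extraction_result(raw, chunk_key="chunk-demo", file_path="demo.txt")
)
# Expected under the module's default parsing: two entity lists keyed by name, no relationships.
# nodes -> {"Tokyo": [...], "Noah Carter": [...]}, edges -> {}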
async def _parse_extraction_result(
text_chunks_storage: BaseKVStorage, extraction_result: str, chunk_id: str
) -> tuple[dict, dict]:
@ -814,53 +898,16 @@ async def _parse_extraction_result(
if chunk_data
else "unknown_source"
)
context_base = dict(
# Call the shared processing function
return await _process_extraction_result(
extraction_result,
chunk_id,
file_path,
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
)
maybe_nodes = defaultdict(list)
maybe_edges = defaultdict(list)
# Preventive fix: when tuple_delimiter is <|>, fix LLM output instability issues
if context_base["tuple_delimiter"] == "<|>":
# 1. Convert <||> to <|>
extraction_result = extraction_result.replace("<||>", "<|>")
# 2. Convert < | > to <|>
extraction_result = extraction_result.replace("< | >", "<|>")
# Parse the extraction result using the same logic as in extract_entities
records = split_string_by_multi_markers(
extraction_result,
[context_base["record_delimiter"], context_base["completion_delimiter"]],
)
for record in records:
record = re.search(r"\((.*)\)", record, re.DOTALL)
if record is None:
continue
record = record.group(1)
record_attributes = split_string_by_multi_markers(
record, [context_base["tuple_delimiter"]]
)
# Try to parse as entity
entity_data = await _handle_single_entity_extraction(
record_attributes, chunk_id, file_path
)
if entity_data is not None:
maybe_nodes[entity_data["entity_name"]].append(entity_data)
continue
# Try to parse as relationship
relationship_data = await _handle_single_relationship_extraction(
record_attributes, chunk_id, file_path
)
if relationship_data is not None:
maybe_edges[
(relationship_data["src_id"], relationship_data["tgt_id"])
].append(relationship_data)
return dict(maybe_nodes), dict(maybe_edges)
async def _rebuild_single_entity(
@ -1717,63 +1764,10 @@ async def extract_entities(
)
continue_prompt = PROMPTS["entity_continue_extraction"].format(**context_base)
if_loop_prompt = PROMPTS["entity_if_loop_extraction"]
processed_chunks = 0
total_chunks = len(ordered_chunks)
async def _process_extraction_result(
result: str, chunk_key: str, file_path: str = "unknown_source"
):
"""Process a single extraction result (either initial or gleaning)
Args:
result (str): The extraction result to process
chunk_key (str): The chunk key for source tracking
file_path (str): The file path for citation
Returns:
tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships
"""
maybe_nodes = defaultdict(list)
maybe_edges = defaultdict(list)
# Preventive fix: when tuple_delimiter is <|>, fix LLM output instability issues
if context_base["tuple_delimiter"] == "<|>":
# 1. Convert <||> to <|>
result = result.replace("<||>", "<|>")
# 2. Convert < | > to <|>
result = result.replace("< | >", "<|>")
records = split_string_by_multi_markers(
result,
[context_base["record_delimiter"], context_base["completion_delimiter"]],
)
for record in records:
record = re.search(r"\((.*)\)", record, re.DOTALL)
if record is None:
continue
record = record.group(1)
record_attributes = split_string_by_multi_markers(
record, [context_base["tuple_delimiter"]]
)
if_entities = await _handle_single_entity_extraction(
record_attributes, chunk_key, file_path
)
if if_entities is not None:
maybe_nodes[if_entities["entity_name"]].append(if_entities)
continue
if_relation = await _handle_single_relationship_extraction(
record_attributes, chunk_key, file_path
)
if if_relation is not None:
maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
if_relation
)
return maybe_nodes, maybe_edges
async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
"""Process a single chunk
Args:
@ -1811,11 +1805,16 @@ async def extract_entities(
# Process initial extraction with file path
maybe_nodes, maybe_edges = await _process_extraction_result(
final_result, chunk_key, file_path
final_result,
chunk_key,
file_path,
tuple_delimiter=context_base["tuple_delimiter"],
record_delimiter=context_base["record_delimiter"],
completion_delimiter=context_base["completion_delimiter"],
)
# Process additional gleaning results
for now_glean_index in range(entity_extract_max_gleaning):
if entity_extract_max_gleaning > 0:
glean_result = await use_llm_func_with_cache(
continue_prompt,
use_llm_func,
@ -1830,7 +1829,12 @@ async def extract_entities(
# Process gleaning result separately with file path
glean_nodes, glean_edges = await _process_extraction_result(
glean_result, chunk_key, file_path
glean_result,
chunk_key,
file_path,
tuple_delimiter=context_base["tuple_delimiter"],
record_delimiter=context_base["record_delimiter"],
completion_delimiter=context_base["completion_delimiter"],
)
# Merge results - only add entities and edges with new names
@ -1838,28 +1842,15 @@ async def extract_entities(
if (
entity_name not in maybe_nodes
): # Only accept entities with a new name in the gleaning stage
maybe_nodes[entity_name] = [] # Explicitly create the list
maybe_nodes[entity_name].extend(entities)
for edge_key, edges in glean_edges.items():
if (
edge_key not in maybe_edges
): # Only accept edges with a new name in the gleaning stage
maybe_edges[edge_key] = [] # Explicitly create the list
maybe_edges[edge_key].extend(edges)
if now_glean_index == entity_extract_max_gleaning - 1:
break
if_loop_result: str = await use_llm_func_with_cache(
if_loop_prompt,
use_llm_func,
llm_response_cache=llm_response_cache,
history_messages=history,
cache_type="extract",
cache_keys_collector=cache_keys_collector,
)
if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
if if_loop_result != "yes":
break
# Batch update chunk's llm_cache_list with all collected cache keys
if cache_keys_collector and text_chunks_storage:
await update_chunk_cache_list(
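
The gleaning merge above only admits entity names and edge keys that the initial pass did not produce; a self-contained illustration of that rule with made-up data:

def merge_only_new(initial: dict, gleaned: dict) -> dict:
    merged = dict(initial)
    for key, items in gleaned.items():
        if key not in merged:  # only accept new names/edge keys from the gleaning stage
            merged[key] = list(items)
    return merged

print(merge_only_new({"Tokyo": ["initial"]}, {"Tokyo": ["gleaned"], "Noah Carter": ["gleaned"]}))
# -> {'Tokyo': ['initial'], 'Noah Carter': ['gleaned']}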

View file

@ -10,45 +10,37 @@ PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
PROMPTS["DEFAULT_USER_PROMPT"] = "n/a"
PROMPTS["entity_extraction"] = """---Goal---
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
Use {language} as output language.
PROMPTS["entity_extraction"] = """---Task---
Given a text document and a list of entity types, identify all entities of those types and all relationships among the identified entities.
---Steps---
---Instructions---
1. Recognize definitively conceptualized entities in the text. For each identified entity, extract the following information:
- entity_name: Name of the entity, use same language as input text. If English, capitalized the name
- entity_type: One of the following types: [{entity_types}]. If the entity doesn't clearly fit any category, classify it as "Other".
- entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. Do not add external knowledge.
2. Format each entity as:
("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
- entity_name: Name of the entity, use the same language as the input text. If English, capitalize the name
- entity_type: Categorize the entity using the provided `Entity_types` list. If a suitable category cannot be determined, classify it as "Other".
- entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. To ensure clarity and precision, all descriptions must replace pronouns and referential terms (e.g., "this document," "our company," "I," "you," "he/she") with the specific nouns they represent.
2. Format each entity as: ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
3. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are directly and clearly related based on the text. Unsubstantiated relationships must be excluded from the output.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
- relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection
4. Format each relationship as:
("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_description>)
5. Use `{tuple_delimiter}` as field delimiter. Use `{record_delimiter}` as the list delimiter. Ensure no spaces are added around the delimiters.
6. When finished, output `{completion_delimiter}`
7. Return identified entities and relationships in {language}.
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
- relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection
4. Format each relationship as: ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_description>)
5. Use `{tuple_delimiter}` as field delimiter. Use `{record_delimiter}` as the entity or relation list delimiter.
6. Return identified entities and relationships in {language}.
7. Output `{completion_delimiter}` when all the entities and relationships are extracted.
---Quality Guidelines---
- Only extract entities that are clearly defined and meaningful in the context
- Avoid over-interpretation; stick to what is explicitly stated in the text
- For all output content, explicitly name the subject or object rather than using pronouns
- Include specific numerical data in entity name when relevant
- Ensure entity names are consistent throughout the extraction
---Examples---
{examples}
---Real Data---
---Input---
Entity_types: [{entity_types}]
Text:
```
@ -56,13 +48,13 @@ Text:
```
---Output---
Output:
"""
PROMPTS["entity_extraction_examples"] = [
"""------Example 1------
"""[Example 1]
Entity_types: [organization,person,equiment,product,technology,location,event,category]
---Input---
Entity_types: [organization,person,location,event,technology,equipment,product,document,category]
Text:
```
while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
@ -74,7 +66,7 @@ The underlying dismissal earlier seemed to falter, replaced by a glimpse of relu
It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths
```
Output:
---Output---
(entity{tuple_delimiter}Alex{tuple_delimiter}person{tuple_delimiter}Alex is a character who experiences frustration and is observant of the dynamics among other characters.){record_delimiter}
(entity{tuple_delimiter}Taylor{tuple_delimiter}person{tuple_delimiter}Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective.){record_delimiter}
(entity{tuple_delimiter}Jordan{tuple_delimiter}person{tuple_delimiter}Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device.){record_delimiter}
@ -88,9 +80,10 @@ Output:
{completion_delimiter}
""",
"""------Example 2------
"""[Example 2]
Entity_types: [organization,person,equiment,product,technology,location,event,category]
---Input---
Entity_types: [organization,person,location,event,technology,equipment,product,document,category]
Text:
```
Stock markets faced a sharp downturn today as tech giants saw significant declines, with the Global Tech Index dropping by 3.4% in midday trading. Analysts attribute the selloff to investor concerns over rising interest rates and regulatory uncertainty.
@ -102,7 +95,7 @@ Meanwhile, commodity markets reflected a mixed sentiment. Gold futures rose by 1
Financial experts are closely watching the Federal Reserve's next move, as speculation grows over potential rate hikes. The upcoming policy announcement is expected to influence investor confidence and overall market stability.
```
Output:
---Output---
(entity{tuple_delimiter}Global Tech Index{tuple_delimiter}category{tuple_delimiter}The Global Tech Index tracks the performance of major technology stocks and experienced a 3.4% decline today.){record_delimiter}
(entity{tuple_delimiter}Nexon Technologies{tuple_delimiter}organization{tuple_delimiter}Nexon Technologies is a tech company that saw its stock decline by 7.8% after disappointing earnings.){record_delimiter}
(entity{tuple_delimiter}Omega Energy{tuple_delimiter}organization{tuple_delimiter}Omega Energy is an energy company that gained 2.1% in stock value due to rising oil prices.){record_delimiter}
@ -118,15 +111,16 @@ Output:
{completion_delimiter}
""",
"""------Example 3------
"""[Example 3]
Entity_types: [organization,person,equiment,product,technology,location,event,category]
---Input---
Entity_types: [organization,person,location,event,technology,equipment,product,document,category]
Text:
```
At the World Athletics Championship in Tokyo, Noah Carter broke the 100m sprint record using cutting-edge carbon-fiber spikes.
```
Output:
---Output---
(entity{tuple_delimiter}World Athletics Championship{tuple_delimiter}event{tuple_delimiter}The World Athletics Championship is a global sports competition featuring top athletes in track and field.){record_delimiter}
(entity{tuple_delimiter}Tokyo{tuple_delimiter}location{tuple_delimiter}Tokyo is the host city of the World Athletics Championship.){record_delimiter}
(entity{tuple_delimiter}Noah Carter{tuple_delimiter}person{tuple_delimiter}Noah Carter is a sprinter who set a new record in the 100m sprint at the World Athletics Championship.){record_delimiter}
@ -140,16 +134,16 @@ Output:
{completion_delimiter}
""",
"""------Example 4------
"""[Example 4]
Entity_types: [organization,person,equiment,product,technology,location,event,category]
---Input---
Entity_types: [organization,person,location,event,technology,equipment,product,document,category]
Text:
```
在北京举行的人工智能大会上，腾讯公司的首席技术官张伟发布了最新的大语言模型"腾讯智言"，该模型在自然语言处理方面取得了重大突破。
```
Output:
---Output---
(entity{tuple_delimiter}人工智能大会{tuple_delimiter}event{tuple_delimiter}人工智能大会是在北京举行的技术会议，专注于人工智能领域的最新发展。){record_delimiter}
(entity{tuple_delimiter}北京{tuple_delimiter}location{tuple_delimiter}北京是人工智能大会的举办城市。){record_delimiter}
(entity{tuple_delimiter}腾讯公司{tuple_delimiter}organization{tuple_delimiter}腾讯公司是参与人工智能大会的科技企业，发布了新的语言模型产品。){record_delimiter}
@ -185,48 +179,29 @@ Description List:
{description_list}
---Output---
Output:"""
PROMPTS["entity_continue_extraction"] = """
MANY entities and relationships were missed in the last extraction. Please find only the missing entities and relationships from previous text. Do not include entities and relations that have been previously extracted. :\n
---Remember Steps---
1. Recognizing definitively conceptualized entities in text. For each identified entity, extract the following information:
- entity_name: Name of the entity, use same language as input text. If English, capitalized the name
- entity_type: One of the following types: [{entity_types}]. If the entity doesn't clearly fit any category, classify it as "Other".
- entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. Do not add external knowledge.
2. Format each entity as:
("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
3. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are directly and clearly related based on the text. Unsubstantiated relationships must be excluded from the output.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
- relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection
4. Format each relationship as:
("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_description>)
5. Use `{tuple_delimiter}` as field delimiter. Use `{record_delimiter}` as the list delimiter. Ensure no spaces are added around the delimiters.
6. When finished, output `{completion_delimiter}`
7. Return identified entities and relationships in {language}.
---Output---
Output:
"""
PROMPTS["entity_continue_extraction"] = """---Task---
Identify any missed entities or relationships in the last extraction task.
---Instructions---
1. Output the entities and relationships in the same format as the previous extraction task.
2. Do not include entities and relations that have been previously extracted.
3. If the entity doesn't clearly fit in any of the `Entity_types` provided, classify it as "Other".
4. Return identified entities and relationships in {language}.
5. Output `{completion_delimiter}` when all the entities and relationships are extracted.
---Output---
"""
# TODO: Deprecated
PROMPTS["entity_if_loop_extraction"] = """
---Goal---
It appears some entities may have still been missed.
Check if it appears some entities may have still been missed. Output "Yes" if so, otherwise "No".
---Output---
Output:
""".strip()
Output:"""
PROMPTS["fail_response"] = (
"Sorry, I'm not able to provide an answer to that question.[no-context]"
@ -270,7 +245,7 @@ Generate a concise response based on Knowledge Base and follow Response Rules, c
- Additional user prompt: {user_prompt}
---Response---
Output:"""
"""
PROMPTS["keywords_extraction"] = """---Role---
You are an expert keyword extractor, specializing in analyzing user queries for a Retrieval-Augmented Generation (RAG) system. Your purpose is to identify both high-level and low-level keywords in the user's query that will be used for effective document retrieval.

View file

@ -1759,17 +1759,22 @@ def sanitize_and_normalize_extracted_text(
def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
"""Normalize entity/relation names and description with the following rules:
1. Clean HTML tags (paragraph and line break tags)
2. Convert Chinese symbols to English symbols
3. Remove spaces between Chinese characters
4. Remove spaces between Chinese characters and English letters/numbers
5. Preserve spaces within English text and numbers
6. Replace Chinese parentheses with English parentheses
7. Replace Chinese dash with English dash
8. Remove English quotation marks from the beginning and end of the text
9. Remove English quotation marks in and around chinese
10. Remove Chinese quotation marks
11. Filter out short numeric-only text (length < 3 and only digits/dots)
- Clean HTML tags (paragraph and line break tags)
- Convert Chinese symbols to English symbols
- Remove spaces between Chinese characters
- Remove spaces between Chinese characters and English letters/numbers
- Preserve spaces within English text and numbers
- Replace Chinese parentheses with English parentheses
- Replace Chinese dash with English dash
- Remove English quotation marks from the beginning and end of the text
- Remove English quotation marks in and around Chinese
- Remove Chinese quotation marks
- Filter out short numeric-only text (length < 3 and only digits/dots)
- When remove_inner_quotes is True:
  - Remove Chinese quotes
  - Remove English quotes in and around Chinese
- Convert non-breaking spaces to regular spaces
- Convert narrow non-breaking spaces after non-digits to regular spaces
Args:
name: Entity name to normalize
@ -1778,11 +1783,10 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
Returns:
Normalized entity name
"""
# 1. Clean HTML tags - remove paragraph and line break tags
# Clean HTML tags - remove paragraph and line break tags
name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)
# 2. Convert Chinese symbols to English symbols
# Chinese full-width letters to half-width (A-Z, a-z)
name = name.translate(
str.maketrans(
@ -1849,11 +1853,15 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
name = inner_content
if remove_inner_quotes:
# remove Chinese quotes
# Remove Chinese quotes
name = name.replace(""", "").replace(""", "").replace("'", "").replace("'", "")
# remove English queotes in and around chinese
# Remove English quotes in and around Chinese
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
# Convert non-breaking space to regular space
name = name.replace("\u00a0", " ")
# Convert narrow non-breaking space to regular space when after non-digits
name = re.sub(r"(?<=[^\d])\u202F", " ", name)
# Remove spaces from the beginning and end of the text
name = name.strip()
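
A usage sketch for the new whitespace handling above (illustrative inputs; expected results assume the default normalization path):

print(normalize_extracted_info("Global\u00a0Tech\u202fIndex"))
# expected: "Global Tech Index" (non-breaking and narrow non-breaking spaces become regular spaces)
print(normalize_extracted_info("腾讯智言", remove_inner_quotes=True))
# expected: "腾讯智言" (Chinese quotes removed)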

View file

@ -187,7 +187,10 @@
"unknown": "غير معروف",
"object": "مصنوع",
"group": "مجموعة",
"technology": "العلوم"
"technology": "العلوم",
"product": "منتج",
"document": "وثيقة",
"other": "أخرى"
},
"sideBar": {
"settings": {

View file

@ -187,7 +187,10 @@
"unknown": "Unknown",
"object": "Object",
"group": "Group",
"technology": "Technology"
"technology": "Technology",
"product": "Product",
"document": "Document",
"other": "Other"
},
"sideBar": {
"settings": {

View file

@ -187,7 +187,10 @@
"unknown": "Inconnu",
"object": "Objet",
"group": "Groupe",
"technology": "Technologie"
"technology": "Technologie",
"product": "Produit",
"document": "Document",
"other": "Autre"
},
"sideBar": {
"settings": {

View file

@ -187,7 +187,10 @@
"unknown": "未知",
"object": "物品",
"group": "群组",
"technology": "技术"
"technology": "技术",
"product": "产品",
"document": "文档",
"other": "其他"
},
"sideBar": {
"settings": {

View file

@ -187,7 +187,10 @@
"unknown": "未知",
"object": "物品",
"group": "群組",
"technology": "技術"
"technology": "技術",
"product": "產品",
"document": "文檔",
"other": "其他"
},
"sideBar": {
"settings": {