Merge pull request #2051 from danielaskdd/extract-result-process
Enhance KG Extraction for LLM with Small Parameters
Commit 61fb2444f0
13 changed files with 214 additions and 226 deletions
env.example (12)
@@ -125,7 +125,7 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true
SUMMARY_LANGUAGE=English

### Entity types that the LLM will attempt to recognize
# ENTITY_TYPES='["Organization", "Person", "Equiment", "Product", "Technology", "Location", "Event", "Category"]'
# ENTITY_TYPES='["Organization", "Person", "Location", "Event", "Technology", "Equipment", "Product", "Document", "Category"]'

### Chunk size for document splitting, 500~1500 is recommended
# CHUNK_SIZE=1200
@@ -175,11 +175,9 @@ LLM_BINDING_API_KEY=your_api_key
# LLM_BINDING=openai

### OpenAI Specific Parameters
### To mitigate endless output loops and prevent greedy decoding for Qwen3, set the temperature parameter to a value between 0.8 and 1.0
# OPENAI_LLM_TEMPERATURE=1.0
# OPENAI_LLM_REASONING_EFFORT=low
### If the presence penalty still can not stop the model from generates repetitive or unconstrained output
# OPENAI_LLM_MAX_COMPLETION_TOKENS=16384
### To mitigate endless output loops and prevent greedy decoding for Qwen3, set the temperature and frequency penalty parameter to a highter value
# OPENAI_LLM_TEMPERATURE=1.2
# OPENAI_FREQUENCY_PENALTY=1.5

### OpenRouter Specific Parameters
# OPENAI_LLM_EXTRA_BODY='{"reasoning": {"enabled": false}}'
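As a side note, here is a minimal sketch of how the OpenAI-specific knobs above could be passed through to an OpenAI-compatible chat client; the client wiring and model name are illustrative assumptions, not lightrag's actual binding code.

import os
from openai import OpenAI  # assumes the openai>=1.x Python SDK is installed

# Read the tuning knobs documented above; defaults mirror the commented values.
temperature = float(os.getenv("OPENAI_LLM_TEMPERATURE", "1.0"))
max_completion_tokens = int(os.getenv("OPENAI_LLM_MAX_COMPLETION_TOKENS", "16384"))

client = OpenAI(api_key=os.getenv("LLM_BINDING_API_KEY"))
response = client.chat.completions.create(
    model="qwen3-8b",  # hypothetical model name, for illustration only
    messages=[{"role": "user", "content": "Extract entities from the text below ..."}],
    temperature=temperature,  # 0.8-1.0 helps avoid greedy-decoding loops on Qwen3
    max_completion_tokens=max_completion_tokens,
)
print(response.choices[0].message.content)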
@@ -194,7 +192,7 @@ LLM_BINDING_API_KEY=your_api_key
OLLAMA_LLM_NUM_CTX=32768
# OLLAMA_LLM_TEMPERATURE=1.0
### Stop sequences for Ollama LLM
# OLLAMA_LLM_STOP='["</s>", "Assistant:", "\n\n"]'
# OLLAMA_LLM_STOP='["</s>", "<|EOT|>"]'
### use the following command to see all support options for Ollama LLM
### lightrag-server --llm-binding ollama --help
@@ -1 +1 @@
__api_version__ = "0211"
__api_version__ = "0213"
File diff suppressed because one or more lines are too long
lightrag/api/webui/index.html (generated, 2)
@@ -8,7 +8,7 @@
<link rel="icon" type="image/png" href="favicon.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Lightrag</title>
<script type="module" crossorigin src="/webui/assets/index-CIdJpUuC.js"></script>
<script type="module" crossorigin src="/webui/assets/index-BOxJ2b27.js"></script>
<link rel="modulepreload" crossorigin href="/webui/assets/react-vendor-DEwriMA6.js">
<link rel="modulepreload" crossorigin href="/webui/assets/ui-vendor-CeCm8EER.js">
<link rel="modulepreload" crossorigin href="/webui/assets/graph-vendor-B-X5JegA.js">
@@ -26,11 +26,12 @@ DEFAULT_SUMMARY_CONTEXT_SIZE = 12000
DEFAULT_ENTITY_TYPES = [
    "Organization",
    "Person",
    "Equiment",
    "Product",
    "Technology",
    "Location",
    "Event",
    "Technology",
    "Equipment",
    "Product",
    "Document",
    "Category",
]
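A small illustrative sketch of how the ENTITY_TYPES override from env.example could map onto this default list; the parsing below is an assumption for illustration, not necessarily how lightrag actually reads the variable.

import json
import os

DEFAULT_ENTITY_TYPES = [
    "Organization", "Person", "Location", "Event", "Technology",
    "Equipment", "Product", "Document", "Category",
]

# env.example documents ENTITY_TYPES as a JSON array, e.g.
# ENTITY_TYPES='["Organization", "Person", "Location"]'
raw = os.getenv("ENTITY_TYPES")
entity_types = json.loads(raw) if raw else DEFAULT_ENTITY_TYPES
print(entity_types)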
@@ -347,6 +347,9 @@ async def _handle_single_entity_extraction(
        )
        return None

    # Captitalize first letter of entity_type
    entity_type = entity_type.title()

    # Process entity description with same cleaning pipeline
    entity_description = sanitize_and_normalize_extracted_text(record_attributes[3])
@@ -793,6 +796,87 @@ async def _get_cached_extraction_results(
    return sorted_cached_results


async def _process_extraction_result(
    result: str,
    chunk_key: str,
    file_path: str = "unknown_source",
    tuple_delimiter: str = "<|>",
    record_delimiter: str = "##",
    completion_delimiter: str = "<|COMPLETE|>",
) -> tuple[dict, dict]:
    """Process a single extraction result (either initial or gleaning)
    Args:
        result (str): The extraction result to process
        chunk_key (str): The chunk key for source tracking
        file_path (str): The file path for citation
        tuple_delimiter (str): Delimiter for tuple fields
        record_delimiter (str): Delimiter for records
        completion_delimiter (str): Delimiter for completion
    Returns:
        tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships
    """
    maybe_nodes = defaultdict(list)
    maybe_edges = defaultdict(list)

    # Standardize Chinese brackets around record_delimiter to English brackets
    bracket_pattern = f"[))](\\s*{re.escape(record_delimiter)}\\s*)[((]"
    result = re.sub(bracket_pattern, ")\\1(", result)

    records = split_string_by_multi_markers(
        result,
        [record_delimiter, completion_delimiter],
    )

    for record in records:
        # Remove outer brackets (support English and Chinese brackets)
        record = record.strip()
        if record.startswith("(") or record.startswith("("):
            record = record[1:]
        if record.endswith(")") or record.endswith(")"):
            record = record[:-1]

        record = record.strip()
        if record is None:
            continue

        if tuple_delimiter == "<|>":
            # fix entity<| with entity<|>
            record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record)
            # fix relationship<| with relationship<|>
            record = re.sub(r"^relationship<\|(?!>)", r"relationship<|>", record)
            # fix <||> with <|>
            record = record.replace("<||>", "<|>")
            # fix < | > with <|>
            record = record.replace("< | >", "<|>")
            # fix <<|>> with <|>
            record = record.replace("<<|>>", "<|>")
            # fix <|>> with <|>
            record = record.replace("<|>>", "<|>")
            # fix <<|> with <|>
            record = record.replace("<<|>", "<|>")

        record_attributes = split_string_by_multi_markers(record, [tuple_delimiter])

        # Try to parse as entity
        entity_data = await _handle_single_entity_extraction(
            record_attributes, chunk_key, file_path
        )
        if entity_data is not None:
            maybe_nodes[entity_data["entity_name"]].append(entity_data)
            continue

        # Try to parse as relationship
        relationship_data = await _handle_single_relationship_extraction(
            record_attributes, chunk_key, file_path
        )
        if relationship_data is not None:
            maybe_edges[
                (relationship_data["src_id"], relationship_data["tgt_id"])
            ].append(relationship_data)

    return dict(maybe_nodes), dict(maybe_edges)


async def _parse_extraction_result(
    text_chunks_storage: BaseKVStorage, extraction_result: str, chunk_id: str
) -> tuple[dict, dict]:
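To make the delimiter-repair rules in _process_extraction_result concrete, here is a standalone sketch using only the standard re module; the malformed record below is invented and this is not a call into lightrag itself.

import re

# A hypothetical malformed record, as a small-parameter LLM might emit it.
record = "entity<|Tokyo<||>location< | >Tokyo hosts the World Athletics Championship."

# The same normalizations the function applies when tuple_delimiter is "<|>".
record = re.sub(r"^entity<\|(?!>)", r"entity<|>", record)  # entity<|  -> entity<|>
record = record.replace("<||>", "<|>")                     # <||>      -> <|>
record = record.replace("< | >", "<|>")                    # < | >     -> <|>

print(record.split("<|>"))
# ['entity', 'Tokyo', 'location', 'Tokyo hosts the World Athletics Championship.']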
@@ -814,53 +898,16 @@ async def _parse_extraction_result(
        if chunk_data
        else "unknown_source"
    )
    context_base = dict(

    # Call the shared processing function
    return await _process_extraction_result(
        extraction_result,
        chunk_id,
        file_path,
        tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
        record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
        completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
    )
    maybe_nodes = defaultdict(list)
    maybe_edges = defaultdict(list)

    # Preventive fix: when tuple_delimiter is <|>, fix LLM output instability issues
    if context_base["tuple_delimiter"] == "<|>":
        # 1. Convert <||> to <|>
        extraction_result = extraction_result.replace("<||>", "<|>")
        # 2. Convert < | > to <|>
        extraction_result = extraction_result.replace("< | >", "<|>")

    # Parse the extraction result using the same logic as in extract_entities
    records = split_string_by_multi_markers(
        extraction_result,
        [context_base["record_delimiter"], context_base["completion_delimiter"]],
    )
    for record in records:
        record = re.search(r"\((.*)\)", record, re.DOTALL)
        if record is None:
            continue
        record = record.group(1)
        record_attributes = split_string_by_multi_markers(
            record, [context_base["tuple_delimiter"]]
        )

        # Try to parse as entity
        entity_data = await _handle_single_entity_extraction(
            record_attributes, chunk_id, file_path
        )
        if entity_data is not None:
            maybe_nodes[entity_data["entity_name"]].append(entity_data)
            continue

        # Try to parse as relationship
        relationship_data = await _handle_single_relationship_extraction(
            record_attributes, chunk_id, file_path
        )
        if relationship_data is not None:
            maybe_edges[
                (relationship_data["src_id"], relationship_data["tgt_id"])
            ].append(relationship_data)

    return dict(maybe_nodes), dict(maybe_edges)


async def _rebuild_single_entity(
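split_string_by_multi_markers is lightrag's own helper; as a rough mental model only (an assumption, not its real implementation), it behaves like splitting on any of several markers and dropping empty pieces:

import re

def split_on_any(text: str, markers: list[str]) -> list[str]:
    # Rough stand-in for split_string_by_multi_markers: split on any marker,
    # strip whitespace, and drop empty fragments.
    pattern = "|".join(re.escape(m) for m in markers)
    return [part.strip() for part in re.split(pattern, text) if part.strip()]

print(split_on_any("(record one)##(record two)<|COMPLETE|>", ["##", "<|COMPLETE|>"]))
# ['(record one)', '(record two)']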
@@ -1717,63 +1764,10 @@ async def extract_entities(
    )

    continue_prompt = PROMPTS["entity_continue_extraction"].format(**context_base)
    if_loop_prompt = PROMPTS["entity_if_loop_extraction"]

    processed_chunks = 0
    total_chunks = len(ordered_chunks)

    async def _process_extraction_result(
        result: str, chunk_key: str, file_path: str = "unknown_source"
    ):
        """Process a single extraction result (either initial or gleaning)
        Args:
            result (str): The extraction result to process
            chunk_key (str): The chunk key for source tracking
            file_path (str): The file path for citation
        Returns:
            tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships
        """
        maybe_nodes = defaultdict(list)
        maybe_edges = defaultdict(list)

        # Preventive fix: when tuple_delimiter is <|>, fix LLM output instability issues
        if context_base["tuple_delimiter"] == "<|>":
            # 1. Convert <||> to <|>
            result = result.replace("<||>", "<|>")
            # 2. Convert < | > to <|>
            result = result.replace("< | >", "<|>")

        records = split_string_by_multi_markers(
            result,
            [context_base["record_delimiter"], context_base["completion_delimiter"]],
        )

        for record in records:
            record = re.search(r"\((.*)\)", record, re.DOTALL)
            if record is None:
                continue
            record = record.group(1)
            record_attributes = split_string_by_multi_markers(
                record, [context_base["tuple_delimiter"]]
            )

            if_entities = await _handle_single_entity_extraction(
                record_attributes, chunk_key, file_path
            )
            if if_entities is not None:
                maybe_nodes[if_entities["entity_name"]].append(if_entities)
                continue

            if_relation = await _handle_single_relationship_extraction(
                record_attributes, chunk_key, file_path
            )
            if if_relation is not None:
                maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
                    if_relation
                )

        return maybe_nodes, maybe_edges

    async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
        """Process a single chunk
        Args:
@@ -1811,11 +1805,16 @@

        # Process initial extraction with file path
        maybe_nodes, maybe_edges = await _process_extraction_result(
            final_result, chunk_key, file_path
            final_result,
            chunk_key,
            file_path,
            tuple_delimiter=context_base["tuple_delimiter"],
            record_delimiter=context_base["record_delimiter"],
            completion_delimiter=context_base["completion_delimiter"],
        )

        # Process additional gleaning results
        for now_glean_index in range(entity_extract_max_gleaning):
            if entity_extract_max_gleaning > 0:
                glean_result = await use_llm_func_with_cache(
                    continue_prompt,
                    use_llm_func,

@@ -1830,7 +1829,12 @@

            # Process gleaning result separately with file path
            glean_nodes, glean_edges = await _process_extraction_result(
                glean_result, chunk_key, file_path
                glean_result,
                chunk_key,
                file_path,
                tuple_delimiter=context_base["tuple_delimiter"],
                record_delimiter=context_base["record_delimiter"],
                completion_delimiter=context_base["completion_delimiter"],
            )

            # Merge results - only add entities and edges with new names
@@ -1838,28 +1842,15 @@
                if (
                    entity_name not in maybe_nodes
                ):  # Only accetp entities with new name in gleaning stage
                    maybe_nodes[entity_name] = []  # Explicitly create the list
                    maybe_nodes[entity_name].extend(entities)
            for edge_key, edges in glean_edges.items():
                if (
                    edge_key not in maybe_edges
                ):  # Only accetp edges with new name in gleaning stage
                    maybe_edges[edge_key] = []  # Explicitly create the list
                    maybe_edges[edge_key].extend(edges)

            if now_glean_index == entity_extract_max_gleaning - 1:
                break

            if_loop_result: str = await use_llm_func_with_cache(
                if_loop_prompt,
                use_llm_func,
                llm_response_cache=llm_response_cache,
                history_messages=history,
                cache_type="extract",
                cache_keys_collector=cache_keys_collector,
            )
            if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
            if if_loop_result != "yes":
                break

        # Batch update chunk's llm_cache_list with all collected cache keys
        if cache_keys_collector and text_chunks_storage:
            await update_chunk_cache_list(
@@ -10,45 +10,37 @@ PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"

PROMPTS["DEFAULT_USER_PROMPT"] = "n/a"

PROMPTS["entity_extraction"] = """---Goal---
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
Use {language} as output language.
PROMPTS["entity_extraction"] = """---Task---
Given a text document and a list of entity types, identify all entities of those types and all relationships among the identified entities.

---Steps---
---Instructions---
1. Recognizing definitively conceptualized entities in text. For each identified entity, extract the following information:
- entity_name: Name of the entity, use same language as input text. If English, capitalized the name
- entity_type: One of the following types: [{entity_types}]. If the entity doesn't clearly fit any category, classify it as "Other".
- entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. Do not add external knowledge.

2. Format each entity as:
("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)

- entity_name: Name of the entity, use same language as input text. If English, capitalized the name
- entity_type: Categorize the entity using the provided `Entity_types` list. If a suitable category cannot be determined, classify it as "Other".
- entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. To ensure clarity and precision, all descriptions must replace pronouns and referential terms (e.g., "this document," "our company," "I," "you," "he/she") with the specific nouns they represent.
2. Format each entity as: ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
3. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are directly and clearly related based on the text. Unsubstantiated relationships must be excluded from the output.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
- relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection

4. Format each relationship as:
("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_description>)

5. Use `{tuple_delimiter}` as field delimiter. Use `{record_delimiter}` as the list delimiter. Ensure no spaces are added around the delimiters.

6. When finished, output `{completion_delimiter}`

7. Return identified entities and relationships in {language}.
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
- relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection
4. Format each relationship as: ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_description>)
5. Use `{tuple_delimiter}` as field delimiter. Use `{record_delimiter}` as the entity or relation list delimiter.
6. Return identified entities and relationships in {language}.
7. Output `{completion_delimiter}` when all the entities and relationships are extracted.

---Quality Guidelines---
- Only extract entities that are clearly defined and meaningful in the context
- Avoid over-interpretation; stick to what is explicitly stated in the text
- For all output content, explicitly name the subject or object rather than using pronouns
- Include specific numerical data in entity name when relevant
- Ensure entity names are consistent throughout the extraction

---Examples---
{examples}

---Real Data---
---Input---
Entity_types: [{entity_types}]
Text:
```
@@ -56,13 +48,13 @@ Text:
```

---Output---
Output:
"""

PROMPTS["entity_extraction_examples"] = [
    """------Example 1------
    """[Example 1]

Entity_types: [organization,person,equiment,product,technology,location,event,category]
---Input---
Entity_types: [organization,person,location,event,technology,equiment,product,Document,category]
Text:
```
while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.

@@ -74,7 +66,7 @@ The underlying dismissal earlier seemed to falter, replaced by a glimpse of relu
It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths
```

Output:
---Output---
(entity{tuple_delimiter}Alex{tuple_delimiter}person{tuple_delimiter}Alex is a character who experiences frustration and is observant of the dynamics among other characters.){record_delimiter}
(entity{tuple_delimiter}Taylor{tuple_delimiter}person{tuple_delimiter}Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective.){record_delimiter}
(entity{tuple_delimiter}Jordan{tuple_delimiter}person{tuple_delimiter}Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device.){record_delimiter}

@@ -88,9 +80,10 @@ Output:
{completion_delimiter}

""",
    """------Example 2------
    """[Example 2]

Entity_types: [organization,person,equiment,product,technology,location,event,category]
---Input---
Entity_types: [organization,person,location,event,technology,equiment,product,Document,category]
Text:
```
Stock markets faced a sharp downturn today as tech giants saw significant declines, with the Global Tech Index dropping by 3.4% in midday trading. Analysts attribute the selloff to investor concerns over rising interest rates and regulatory uncertainty.

@@ -102,7 +95,7 @@ Meanwhile, commodity markets reflected a mixed sentiment. Gold futures rose by 1
Financial experts are closely watching the Federal Reserve's next move, as speculation grows over potential rate hikes. The upcoming policy announcement is expected to influence investor confidence and overall market stability.
```

Output:
---Output---
(entity{tuple_delimiter}Global Tech Index{tuple_delimiter}category{tuple_delimiter}The Global Tech Index tracks the performance of major technology stocks and experienced a 3.4% decline today.){record_delimiter}
(entity{tuple_delimiter}Nexon Technologies{tuple_delimiter}organization{tuple_delimiter}Nexon Technologies is a tech company that saw its stock decline by 7.8% after disappointing earnings.){record_delimiter}
(entity{tuple_delimiter}Omega Energy{tuple_delimiter}organization{tuple_delimiter}Omega Energy is an energy company that gained 2.1% in stock value due to rising oil prices.){record_delimiter}

@@ -118,15 +111,16 @@ Output:
{completion_delimiter}

""",
    """------Example 3------
    """[Example 3]

Entity_types: [organization,person,equiment,product,technology,location,event,category]
---Input---
Entity_types: [organization,person,location,event,technology,equiment,product,Document,category]
Text:
```
At the World Athletics Championship in Tokyo, Noah Carter broke the 100m sprint record using cutting-edge carbon-fiber spikes.
```

Output:
---Output---
(entity{tuple_delimiter}World Athletics Championship{tuple_delimiter}event{tuple_delimiter}The World Athletics Championship is a global sports competition featuring top athletes in track and field.){record_delimiter}
(entity{tuple_delimiter}Tokyo{tuple_delimiter}location{tuple_delimiter}Tokyo is the host city of the World Athletics Championship.){record_delimiter}
(entity{tuple_delimiter}Noah Carter{tuple_delimiter}person{tuple_delimiter}Noah Carter is a sprinter who set a new record in the 100m sprint at the World Athletics Championship.){record_delimiter}

@@ -140,16 +134,16 @@ Output:
{completion_delimiter}

""",
    """------Example 4------
    """[Example 4]

Entity_types: [organization,person,equiment,product,technology,location,event,category]
---Input---
Entity_types: [organization,person,location,event,technology,equiment,product,Document,category]
Text:
```
在北京举行的人工智能大会上,腾讯公司的首席技术官张伟发布了最新的大语言模型"腾讯智言",该模型在自然语言处理方面取得了重大突破。

```

Output:
---Output---
(entity{tuple_delimiter}人工智能大会{tuple_delimiter}event{tuple_delimiter}人工智能大会是在北京举行的技术会议,专注于人工智能领域的最新发展。){record_delimiter}
(entity{tuple_delimiter}北京{tuple_delimiter}location{tuple_delimiter}北京是人工智能大会的举办城市。){record_delimiter}
(entity{tuple_delimiter}腾讯公司{tuple_delimiter}organization{tuple_delimiter}腾讯公司是参与人工智能大会的科技企业,发布了新的语言模型产品。){record_delimiter}
@@ -185,48 +179,29 @@ Description List:
{description_list}

---Output---
Output:"""

PROMPTS["entity_continue_extraction"] = """
MANY entities and relationships were missed in the last extraction. Please find only the missing entities and relationships from previous text. Do not include entities and relations that have been previously extracted. :\n

---Remember Steps---
1. Recognizing definitively conceptualized entities in text. For each identified entity, extract the following information:
- entity_name: Name of the entity, use same language as input text. If English, capitalized the name
- entity_type: One of the following types: [{entity_types}]. If the entity doesn't clearly fit any category, classify it as "Other".
- entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. Do not add external knowledge.

2. Format each entity as:
("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)

3. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are directly and clearly related based on the text. Unsubstantiated relationships must be excluded from the output.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
- relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection

4. Format each relationship as:
("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_description>)

5. Use `{tuple_delimiter}` as field delimiter. Use `{record_delimiter}` as the list delimiter. Ensure no spaces are added around the delimiters.

6. When finished, output `{completion_delimiter}`

7. Return identified entities and relationships in {language}.

---Output---
Output:
"""

PROMPTS["entity_continue_extraction"] = """---Task---
Identify any missed entities or relationships in the last extraction task.

---Instructions---
1. Output the entities and realtionships in the same format as previous extraction task.
2. Do not include entities and relations that have been previously extracted.
3. If the entity doesn't clearly fit in any of`Entity_types` provided, classify it as "Other".
4. Return identified entities and relationships in {language}.
5. Output `{completion_delimiter}` when all the entities and relationships are extracted.

---Output---
"""

# TODO: Deprecated
PROMPTS["entity_if_loop_extraction"] = """
---Goal---'

It appears some entities may have still been missed.
Check if it appears some entities may have still been missed. Output "Yes" if so, otherwise "No".

---Output---
Output:
""".strip()
Output:"""

PROMPTS["fail_response"] = (
    "Sorry, I'm not able to provide an answer to that question.[no-context]"

@@ -270,7 +245,7 @@ Generate a concise response based on Knowledge Base and follow Response Rules, c
- Additional user prompt: {user_prompt}

---Response---
Output:"""
"""

PROMPTS["keywords_extraction"] = """---Role---
You are an expert keyword extractor, specializing in analyzing user queries for a Retrieval-Augmented Generation (RAG) system. Your purpose is to identify both high-level and low-level keywords in the user's query that will be used for effective document retrieval.
@@ -1759,17 +1759,22 @@ def sanitize_and_normalize_extracted_text(

def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
    """Normalize entity/relation names and description with the following rules:
    1. Clean HTML tags (paragraph and line break tags)
    2. Convert Chinese symbols to English symbols
    3. Remove spaces between Chinese characters
    4. Remove spaces between Chinese characters and English letters/numbers
    5. Preserve spaces within English text and numbers
    6. Replace Chinese parentheses with English parentheses
    7. Replace Chinese dash with English dash
    8. Remove English quotation marks from the beginning and end of the text
    9. Remove English quotation marks in and around chinese
    10. Remove Chinese quotation marks
    11. Filter out short numeric-only text (length < 3 and only digits/dots)
    - Clean HTML tags (paragraph and line break tags)
    - Convert Chinese symbols to English symbols
    - Remove spaces between Chinese characters
    - Remove spaces between Chinese characters and English letters/numbers
    - Preserve spaces within English text and numbers
    - Replace Chinese parentheses with English parentheses
    - Replace Chinese dash with English dash
    - Remove English quotation marks from the beginning and end of the text
    - Remove English quotation marks in and around chinese
    - Remove Chinese quotation marks
    - Filter out short numeric-only text (length < 3 and only digits/dots)
    - remove_inner_quotes = True
        remove Chinese quotes
        remove English queotes in and around chinese
        Convert non-breaking spaces to regular spaces
        Convert narrow non-breaking spaces after non-digits to regular spaces

    Args:
        name: Entity name to normalize

@@ -1778,11 +1783,10 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
    Returns:
        Normalized entity name
    """
    # 1. Clean HTML tags - remove paragraph and line break tags
    # Clean HTML tags - remove paragraph and line break tags
    name = re.sub(r"</p\s*>|<p\s*>|<p/>", "", name, flags=re.IGNORECASE)
    name = re.sub(r"</br\s*>|<br\s*>|<br/>", "", name, flags=re.IGNORECASE)

    # 2. Convert Chinese symbols to English symbols
    # Chinese full-width letters to half-width (A-Z, a-z)
    name = name.translate(
        str.maketrans(

@@ -1849,11 +1853,15 @@ def normalize_extracted_info(name: str, remove_inner_quotes=False) -> str:
        name = inner_content

    if remove_inner_quotes:
        # remove Chinese quotes
        # Remove Chinese quotes
        name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
        # remove English queotes in and around chinese
        # Remove English queotes in and around chinese
        name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
        name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
        # Convert non-breaking space to regular space
        name = name.replace("\u00a0", " ")
        # Convert narrow non-breaking space to regular space when after non-digits
        name = re.sub(r"(?<=[^\d])\u202F", " ", name)

    # Remove spaces from the beginning and end of the text
    name = name.strip()
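As a quick illustration of the quote-stripping rules above, here is a standalone snippet using plain re only; the input string is invented for the example.

import re

name = '"腾讯智言" 1.0'
# Remove Chinese quotation marks (none present in this sample)
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
# Remove English quotes that touch Chinese characters
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
print(name)  # 腾讯智言 1.0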
@@ -187,7 +187,10 @@
"unknown": "غير معروف",
"object": "مصنوع",
"group": "مجموعة",
"technology": "العلوم"
"technology": "العلوم",
"product": "منتج",
"document": "وثيقة",
"other": "أخرى"
},
"sideBar": {
"settings": {
@@ -187,7 +187,10 @@
"unknown": "Unknown",
"object": "Object",
"group": "Group",
"technology": "Technology"
"technology": "Technology",
"product": "Product",
"document": "Document",
"other": "Other"
},
"sideBar": {
"settings": {
@@ -187,7 +187,10 @@
"unknown": "Inconnu",
"object": "Objet",
"group": "Groupe",
"technology": "Technologie"
"technology": "Technologie",
"product": "Produit",
"document": "Document",
"other": "Autre"
},
"sideBar": {
"settings": {
@@ -187,7 +187,10 @@
"unknown": "未知",
"object": "物品",
"group": "群组",
"technology": "技术"
"technology": "技术",
"product": "产品",
"document": "文档",
"other": "其他"
},
"sideBar": {
"settings": {
@@ -187,7 +187,10 @@
"unknown": "未知",
"object": "物品",
"group": "群組",
"technology": "技術"
"technology": "技術",
"product": "產品",
"document": "文檔",
"other": "其他"
},
"sideBar": {
"settings": {