From d218f15a62cc7388a4516a9713f48f7fd2adc15e Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 8 Sep 2025 15:20:45 +0800 Subject: [PATCH] Refactor entity extraction with system prompts and output limits - Add system/user prompt separation - Set max tokens for endless output fix - Improve extraction error logging - Update cache type from extract to summary --- env.example | 7 +-- lightrag/operate.py | 36 ++++++++++------ lightrag/prompt.py | 103 +++++++++++++++----------------------------- lightrag/utils.py | 42 ++++++++++++------ 4 files changed, 89 insertions(+), 99 deletions(-) diff --git a/env.example b/env.example index b7dcaacc..c0a9e3ba 100644 --- a/env.example +++ b/env.example @@ -175,8 +175,8 @@ LLM_BINDING_API_KEY=your_api_key # LLM_BINDING=openai ### OpenAI Specific Parameters -### To mitigate endless output, set the temperature to a highter value -# OPENAI_LLM_TEMPERATURE=0.95 +### Set the max_output_tokens to mitigate endless output of some LLMs (less than LLM_TIMEOUT * llm_output_tokens/second, e.g. 9000 = 180s * 50 tokens/s) +# OPENAI_LLM_MAX_TOKENS=9000 ### OpenRouter Specific Parameters # OPENAI_LLM_EXTRA_BODY='{"reasoning": {"enabled": false}}' @@ -189,7 +189,8 @@ LLM_BINDING_API_KEY=your_api_key ### Ollama Server Specific Parameters ### OLLAMA_LLM_NUM_CTX must be provided, and should at least larger than MAX_TOTAL_TOKENS + 2000 OLLAMA_LLM_NUM_CTX=32768 -# OLLAMA_LLM_TEMPERATURE=1.0 +### Set the max_output_tokens to mitigate endless output of some LLMs (less than LLM_TIMEOUT * llm_output_tokens/second, e.g.
9000 = 180s * 50 tokens/s) +# OLLAMA_LLM_NUM_PREDICT=9000 ### Stop sequences for Ollama LLM # OLLAMA_LLM_STOP='["", "<|EOT|>"]' ### use the following command to see all support options for Ollama LLM diff --git a/lightrag/operate.py b/lightrag/operate.py index c7dc5814..022ad17e 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -306,7 +306,7 @@ async def _summarize_descriptions( use_prompt, use_llm_func, llm_response_cache=llm_response_cache, - cache_type="extract", + cache_type="summary", ) return summary @@ -319,9 +319,8 @@ async def _handle_single_entity_extraction( if len(record_attributes) < 4 or "entity" not in record_attributes[0]: if len(record_attributes) > 1 and "entity" in record_attributes[0]: logger.warning( - f"Entity extraction failed in {chunk_key}: expecting 4 fields but got {len(record_attributes)}" + f"{chunk_key}: Entity `{record_attributes[1]}` extraction failed -- expecting 4 fields but got {len(record_attributes)}" ) - logger.warning(f"Entity extracted: {record_attributes[1]}") return None try: @@ -389,9 +388,8 @@ async def _handle_single_relationship_extraction( if len(record_attributes) < 5 or "relationship" not in record_attributes[0]: if len(record_attributes) > 1 and "relationship" in record_attributes[0]: logger.warning( - f"Relation extraction failed in {chunk_key}: expecting 5 fields but got {len(record_attributes)}" + f"{chunk_key}: Relation `{record_attributes[1]}` extraction failed -- expecting 5 fields but got {len(record_attributes)}" ) - logger.warning(f"Relation extracted: {record_attributes[1]}") return None try: @@ -839,6 +837,11 @@ async def _process_extraction_result( bracket_pattern = f"[))](\\s*{re.escape(record_delimiter)}\\s*)[((]" result = re.sub(bracket_pattern, ")\\1(", result) + if completion_delimiter not in result: + logger.warning( + f"{chunk_key}: Complete delimiter can not be found in extraction result" + ) + records = split_string_by_multi_markers( result, [record_delimiter, completion_delimiter], 
@@ -1914,7 +1917,6 @@ async def extract_entities( # add example's format examples = examples.format(**example_context_base) - entity_extract_prompt = PROMPTS["entity_extraction"] context_base = dict( tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"], record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"], @@ -1924,8 +1926,6 @@ async def extract_entities( language=language, ) - continue_prompt = PROMPTS["entity_continue_extraction"].format(**context_base) - processed_chunks = 0 total_chunks = len(ordered_chunks) @@ -1948,13 +1948,20 @@ async def extract_entities( cache_keys_collector = [] # Get initial extraction - hint_prompt = entity_extract_prompt.format( + entity_extraction_system_prompt = PROMPTS[ + "entity_extraction_system_prompt" + ].format(**{**context_base, "input_text": content}) + entity_extraction_user_prompt = PROMPTS["entity_extraction_user_prompt"].format( **{**context_base, "input_text": content} ) + entity_continue_extraction_user_prompt = PROMPTS[ + "entity_continue_extraction_user_prompt" + ].format(**{**context_base, "input_text": content}) final_result = await use_llm_func_with_cache( - hint_prompt, + entity_extraction_user_prompt, use_llm_func, + system_prompt=entity_extraction_system_prompt, llm_response_cache=llm_response_cache, cache_type="extract", chunk_id=chunk_key, @@ -1962,7 +1969,9 @@ async def extract_entities( ) # Store LLM cache reference in chunk (will be handled by use_llm_func_with_cache) - history = pack_user_ass_to_openai_messages(hint_prompt, final_result) + history = pack_user_ass_to_openai_messages( + entity_extraction_user_prompt, final_result + ) # Process initial extraction with file path maybe_nodes, maybe_edges = await _process_extraction_result( @@ -1977,8 +1986,9 @@ async def extract_entities( # Process additional gleaning results if entity_extract_max_gleaning > 0: glean_result = await use_llm_func_with_cache( - continue_prompt, + entity_continue_extraction_user_prompt, use_llm_func, + 
system_prompt=entity_extraction_system_prompt, llm_response_cache=llm_response_cache, history_messages=history, cache_type="extract", @@ -1986,8 +1996,6 @@ async def extract_entities( cache_keys_collector=cache_keys_collector, ) - history += pack_user_ass_to_openai_messages(continue_prompt, glean_result) - # Process gleaning result separately with file path glean_nodes, glean_edges = await _process_extraction_result( glean_result, diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 5e43962f..c8895b37 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -6,59 +6,61 @@ PROMPTS: dict[str, Any] = {} PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>" PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##" - -# TODO: Deprecated PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>" PROMPTS["DEFAULT_USER_PROMPT"] = "n/a" -PROMPTS["entity_extraction"] = """---Task--- -For a given input text and entity types in the provided real data, extract all entities and their relationships, then return them in the specified language and format described below. +PROMPTS["entity_extraction_system_prompt"] = """---Role--- +You are a Knowledge Graph Specialist responsible for extracting entities and relationships from the input text. ---Instructions--- -1. Recognizing definitively conceptualized entities in text. For each identified entity, extract the following information: - - entity_name: Name of the entity, use same language as input text. If English, capitalized the name - - entity_type: Categorize the entity using the provided entity types. If a suitable category cannot be determined, classify it as `Other`. - - entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. To ensure clarity and precision, all descriptions must replace pronouns and referential terms (e.g., "this document," "our company," "I," "you," "he/she") with the specific nouns they represent. -2. 
Format each entity as: (entity{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) -3. From the entities identified, identify all pairs of (source_entity, target_entity) that are directly and clearly related, and extract the following information: - - source_entity: name of the source entity - - target_entity: name of the target entity - - relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details - - relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection -4. Format each relationship as: (relationship{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) -5. Use `{tuple_delimiter}` as field delimiter. Use `{record_delimiter}` as the entity or relation list delimiter. -6. Output `{completion_delimiter}` when all the entities and relationships are extracted. -7. Ensure the output language is {language}. - ----Quality Guidelines--- -- Only extract entities and relationships that are clearly defined and meaningful in the context -- Avoid over-interpretation; stick to what is explicitly stated in the text -- For all output content, explicitly name the subject or object rather than using pronouns -- Include specific numerical data in entity name when relevant -- Ensure entity names are consistent throughout the extraction +1. **Entity Extraction:** Identify clearly defined and meaningful entities in the input text, and extract the following information: + - entity_name: Name of the entity, ensure entity names are consistent throughout the extraction. + - entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided types are suitable, classify it as `Other`. 
+ - entity_description: Provide a comprehensive description of the entity's attributes and activities based on the information present in the input text. +2. **Entity Output Format:** (entity{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) +3. **Relationship Extraction:** Identify direct, clearly-stated and meaningful relationships between extracted entities within the input text, and extract the following information: + - source_entity: name of the source entity. + - target_entity: name of the target entity. + - relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details. + - relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection. +4. **Relationship Output Format:** (relationship{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) +5. **Relationship Order:** Prioritize relationships based on their significance to the intended meaning of input text, and output more crucial relationships first. +6. **Avoid Pronouns:** For entity names and all descriptions, explicitly name the subject or object instead of using pronouns; avoid pronouns such as `this document`, `our company`, `I`, `you`, and `he/she`. +7. **Undirectional Relationship:** Treat relationships as undirected; swapping the source and target entities does not constitute a new relationship. Avoid outputting duplicate relationships. +8. **Language:** Output entity names, keywords and descriptions in {language}. +9. **Delimiter:** Use `{record_delimiter}` as the entity or relationship list delimiter; output `{completion_delimiter}` when all the entities and relationships are extracted. 
---Examples--- {examples} ----Real Data--- +---Real Data to be Processed--- Entity_types: [{entity_types}] Text: ``` {input_text} ``` +""" + +PROMPTS["entity_extraction_user_prompt"] = """---Task--- +Extract entities and relationships from the input text to be Processed. + +---Instructions--- +1. Output entities and relationships, prioritized by their relevance to the input text's core meaning. +2. Output `{completion_delimiter}` when all the entities and relationships are extracted. +3. Ensure the output language is {language}. """ -PROMPTS["entity_continue_extraction"] = """---Task--- -Identify any missed entities or relationships in the last extraction task. +PROMPTS["entity_continue_extraction_user_prompt"] = """---Task--- +Identify any missed entities or relationships from the input text to be Processed of last extraction task. ---Instructions--- 1. Output the entities and realtionships in the same format as previous extraction task. -2. Do not include entities and relations that have been previously extracted. -3. If the entity doesn't clearly fit in any of entity types provided, classify it as "Other". +2. Do not include entities and relations that have been correctly extracted in last extraction task. +3. If the entity or relation output is truncated or has missing fields in last extraction task, please re-output it in the correct format. 4. Output `{completion_delimiter}` when all the entities and relationships are extracted. 5. Ensure the output language is {language}. @@ -66,11 +68,7 @@ Identify any missed entities or relationships in the last extraction task. """ PROMPTS["entity_extraction_examples"] = [ - """[Example 1] - - -Entity_types: [organization,person,location,event,technology,equiment,product,Document,category] -Text: + """ ``` while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. 
It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order. @@ -95,11 +93,7 @@ It was a small transformation, barely perceptible, but one that Alex noted with {completion_delimiter} """, - """[Example 2] - - -Entity_types: [organization,person,location,event,technology,equiment,product,Document,category] -Text: + """ ``` Stock markets faced a sharp downturn today as tech giants saw significant declines, with the Global Tech Index dropping by 3.4% in midday trading. Analysts attribute the selloff to investor concerns over rising interest rates and regulatory uncertainty. @@ -126,11 +120,7 @@ Financial experts are closely watching the Federal Reserve's next move, as specu {completion_delimiter} """, - """[Example 3] - - -Entity_types: [organization,person,location,event,technology,equiment,product,Document,category] -Text: + """ ``` At the World Athletics Championship in Tokyo, Noah Carter broke the 100m sprint record using cutting-edge carbon-fiber spikes. 
``` @@ -148,29 +138,6 @@ At the World Athletics Championship in Tokyo, Noah Carter broke the 100m sprint (relationship{tuple_delimiter}Noah Carter{tuple_delimiter}World Athletics Championship{tuple_delimiter}athlete participation, competition{tuple_delimiter}Noah Carter is competing at the World Athletics Championship.){record_delimiter} {completion_delimiter} -""", - """[Example 4] - - -Entity_types: [organization,person,location,event,technology,equiment,product,Document,category] -Text: -``` -在北京举行的人工智能大会上,腾讯公司的首席技术官张伟发布了最新的大语言模型"腾讯智言",该模型在自然语言处理方面取得了重大突破。 -``` - - -(entity{tuple_delimiter}人工智能大会{tuple_delimiter}event{tuple_delimiter}人工智能大会是在北京举行的技术会议,专注于人工智能领域的最新发展。){record_delimiter} -(entity{tuple_delimiter}北京{tuple_delimiter}location{tuple_delimiter}北京是人工智能大会的举办城市。){record_delimiter} -(entity{tuple_delimiter}腾讯公司{tuple_delimiter}organization{tuple_delimiter}腾讯公司是参与人工智能大会的科技企业,发布了新的语言模型产品。){record_delimiter} -(entity{tuple_delimiter}张伟{tuple_delimiter}person{tuple_delimiter}张伟是腾讯公司的首席技术官,在大会上发布了新产品。){record_delimiter} -(entity{tuple_delimiter}腾讯智言{tuple_delimiter}product{tuple_delimiter}腾讯智言是腾讯公司发布的大语言模型产品,在自然语言处理方面有重大突破。){record_delimiter} -(entity{tuple_delimiter}自然语言处理技术{tuple_delimiter}technology{tuple_delimiter}自然语言处理技术是腾讯智言模型取得重大突破的技术领域。){record_delimiter} -(relationship{tuple_delimiter}人工智能大会{tuple_delimiter}北京{tuple_delimiter}会议地点, 举办关系{tuple_delimiter}人工智能大会在北京举行。){record_delimiter} -(relationship{tuple_delimiter}张伟{tuple_delimiter}腾讯公司{tuple_delimiter}雇佣关系, 高管职位{tuple_delimiter}张伟担任腾讯公司的首席技术官。){record_delimiter} -(relationship{tuple_delimiter}张伟{tuple_delimiter}腾讯智言{tuple_delimiter}产品发布, 技术展示{tuple_delimiter}张伟在大会上发布了腾讯智言大语言模型。){record_delimiter} -(relationship{tuple_delimiter}腾讯智言{tuple_delimiter}自然语言处理技术{tuple_delimiter}技术应用, 突破创新{tuple_delimiter}腾讯智言在自然语言处理技术方面取得了重大突破。){record_delimiter} -{completion_delimiter} - """, ] diff --git a/lightrag/utils.py b/lightrag/utils.py index cea1ee75..f7675664 100644 --- a/lightrag/utils.py +++ 
b/lightrag/utils.py @@ -473,12 +473,12 @@ def priority_limit_async_func_call( nonlocal max_execution_timeout, max_task_duration if max_execution_timeout is None: max_execution_timeout = ( - llm_timeout + 150 - ) # LLM timeout + 150s buffer for low-level retry + llm_timeout * 2 + ) # Reserved timeout buffer for low-level retry if max_task_duration is None: max_task_duration = ( - llm_timeout + 180 - ) # LLM timeout + 180s buffer for health check phase + llm_timeout * 2 + 15 + ) # Reserved timeout buffer for health check phase queue = asyncio.PriorityQueue(maxsize=max_queue_size) tasks = set() @@ -1034,7 +1034,7 @@ async def handle_cache( args_hash, prompt, mode="default", - cache_type=None, + cache_type="unknown", ) -> str | None: """Generic cache handling function with flattened cache keys""" if hashing_kv is None: @@ -1646,9 +1646,10 @@ def remove_think_tags(text: str) -> str: async def use_llm_func_with_cache( - input_text: str, + user_prompt: str, use_llm_func: callable, llm_response_cache: "BaseKVStorage | None" = None, + system_prompt: str | None = None, max_tokens: int = None, history_messages: list[dict[str, str]] = None, cache_type: str = "extract", @@ -1677,7 +1678,10 @@ async def use_llm_func_with_cache( LLM response text """ # Sanitize input text to prevent UTF-8 encoding errors for all LLM providers - safe_input_text = sanitize_text_for_encoding(input_text) + safe_user_prompt = sanitize_text_for_encoding(user_prompt) + safe_system_prompt = ( + sanitize_text_for_encoding(system_prompt) if system_prompt else None + ) # Sanitize history messages if provided safe_history_messages = None @@ -1688,13 +1692,19 @@ async def use_llm_func_with_cache( if "content" in safe_msg: safe_msg["content"] = sanitize_text_for_encoding(safe_msg["content"]) safe_history_messages.append(safe_msg) + history = json.dumps(safe_history_messages, ensure_ascii=False) + else: + history = None if llm_response_cache: - if safe_history_messages: - history = 
json.dumps(safe_history_messages, ensure_ascii=False) - _prompt = history + "\n" + safe_input_text - else: - _prompt = safe_input_text + prompt_parts = [] + if safe_user_prompt: + prompt_parts.append(safe_user_prompt) + if safe_system_prompt: + prompt_parts.append(safe_system_prompt) + if history: + prompt_parts.append(history) + _prompt = "\n".join(prompt_parts) arg_hash = compute_args_hash(_prompt) # Generate cache key for this LLM call @@ -1725,7 +1735,9 @@ async def use_llm_func_with_cache( if max_tokens is not None: kwargs["max_tokens"] = max_tokens - res: str = await use_llm_func(safe_input_text, **kwargs) + res: str = await use_llm_func( + safe_user_prompt, system_prompt=safe_system_prompt, **kwargs + ) res = remove_think_tags(res) @@ -1755,7 +1767,9 @@ async def use_llm_func_with_cache( kwargs["max_tokens"] = max_tokens try: - res = await use_llm_func(safe_input_text, **kwargs) + res = await use_llm_func( + safe_user_prompt, system_prompt=safe_system_prompt, **kwargs + ) except Exception as e: # Add [LLM func] prefix to error message error_msg = f"[LLM func] {str(e)}"