diff --git a/env.example b/env.example index 501035da..6492f456 100644 --- a/env.example +++ b/env.example @@ -125,7 +125,7 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true SUMMARY_LANGUAGE=English ### Entity types that the LLM will attempt to recognize -# ENTITY_TYPES='["Person", "Organization", "Location", "Event", "Concept", "Artifact", "CreativeWork", "NaturalObject"]' +# ENTITY_TYPES='["Person", "Organization", "Location", "Event", "Concept", "Content", "Artifact", "NaturalObject"]' ### Chunk size for document splitting, 500~1500 is recommended # CHUNK_SIZE=1200 diff --git a/lightrag/constants.py b/lightrag/constants.py index 3e2945d8..fc3ec79f 100644 --- a/lightrag/constants.py +++ b/lightrag/constants.py @@ -29,8 +29,8 @@ DEFAULT_ENTITY_TYPES = [ "Location", "Event", "Concept", + "Content", "Artifact", - "CreativeWork", "NaturalObject", ] diff --git a/lightrag/operate.py b/lightrag/operate.py index 42093a3d..3c61faa2 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -323,7 +323,7 @@ async def _handle_single_entity_extraction( if len(record_attributes) != 4 or "entity" not in record_attributes[0]: if len(record_attributes) > 1 and "entity" in record_attributes[0]: logger.warning( - f"{chunk_key}: extraction failed! Found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}`" + f"{chunk_key}: LLM output format error; found {len(record_attributes)}/4 feilds on ENTITY `{record_attributes[1]}`" ) return None @@ -394,7 +394,7 @@ async def _handle_single_relationship_extraction( if len(record_attributes) != 5 or "relationship" not in record_attributes[0]: if len(record_attributes) > 1 and "relationship" in record_attributes[0]: logger.warning( - f"{chunk_key}: extraction failed! 
Found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`" + f"{chunk_key}: LLM output format error; found {len(record_attributes)}/5 fields on REALTION `{record_attributes[1]}`" ) return None @@ -852,7 +852,7 @@ async def _process_extraction_result( chunk_key: str, timestamp: int, file_path: str = "unknown_source", - tuple_delimiter: str = "<|S|>", + tuple_delimiter: str = "<|#|>", completion_delimiter: str = "<|COMPLETE|>", ) -> tuple[dict, dict]: """Process a single extraction result (either initial or gleaning) @@ -885,7 +885,7 @@ async def _process_extraction_result( continue # Fix various forms of tuple_delimiter corruption from the LLM output using the dedicated function - delimiter_core = tuple_delimiter[2:-2] # Extract "S" from "<|S|>" + delimiter_core = tuple_delimiter[2:-2] # Extract "#" from "<|#|>" record = fix_tuple_delimiter_corruption(record, delimiter_core, tuple_delimiter) # change delimiter_core to lower case, and fix again delimiter_core = delimiter_core.lower() diff --git a/lightrag/prompt.py b/lightrag/prompt.py index 23f2d75d..622e30ba 100644 --- a/lightrag/prompt.py +++ b/lightrag/prompt.py @@ -18,19 +18,20 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel - entity_name: entity_name: The name of the entity. If entity name is case-insensitive, capitalize the first letter of each word in the entity name. Entity names must be consistently applied across the entire extraction. - entity_type: Categorize the entity using the following entity types: {entity_types}; if none of the provided entity types are suitable, classify it as `Other`. - entity_description: Provide a concise yet comprehensive description of the entity's attributes and activities based on the information present in the input text. -2. Relationship Extraction: Identify direct, clearly stated and meaningful relationships between extracted entities, and extract the following information: +2. 
Relationship Extraction: Identify direct, clearly stated and meaningful relationships between extracted entities. For a relationship involving 3 or more entities, decompose it into multiple binary (two-entity) relationships and describe each one separately. For each binary relationship, extract the following information: - source_entity: Name of the source entity. If the entity name is case-insensitive, capitalize the first letter of each word in the entity name. Use consistency names in entity extraction stage. - target_entity: Name of the target entity. If the entity name is case-insensitive, capitalize the first letter of each word in the entity name. Use consistency names in entity extraction stage. - - relationship_keywords: one or more high-level keywords that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details. + - relationship_keywords: one or more high-level keywords that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details. Output multiple keywords in one field, separated by commas. - relationship_description: Explain the nature of the relationship between the source and target entities, providing a clear rationale for their connection. -3. Output Each Entity On A Single Line: Output 4 fields delimited by `{tuple_delimiter}`, adhering to the following format: entity{tuple_delimiter}entity_name{tuple_delimiter}entity_type{tuple_delimiter}entity_description -4. Output Each Relationship On A Single Line: Output 5 fields delimited by `{tuple_delimiter}`, adhering to the following format: relationship{tuple_delimiter}source_entity{tuple_delimiter}target_entity{tuple_delimiter}relationship_keywords{tuple_delimiter}relationship_description +3. 
Output Each Entity On A Single Line: Output 4 fields delimited by `{tuple_delimiter}`, starting with `entity` as 1st field, adhering to the following format: entity{tuple_delimiter}entity_name{tuple_delimiter}entity_type{tuple_delimiter}entity_description +4. Output Each Relationship On A Single Line: Output 5 fields delimited by `{tuple_delimiter}`, starting with `relationship` as 1st field, adhering to the following format: relationship{tuple_delimiter}source_entity{tuple_delimiter}target_entity{tuple_delimiter}relationship_keywords{tuple_delimiter}relationship_description 5. Crucial Delimiter Rule: The `{tuple_delimiter}` is a complete, atomic marker and must not be filled with content. For example, do NOT output `entity{tuple_delimiter}Tokyo<|location|>Tokyo is the capital of Japan.`. The correct format is `entity{tuple_delimiter}Tokyo{tuple_delimiter}location{tuple_delimiter}Tokyo is the capital of Japan.` -6. Undirected Relationship: Treat relationships as undirected; swapping the source and target entities does not constitute a new relationship. Avoid outputting duplicate relationships. -7. Output Order: Output the entity list first, followed by the relationship list. Within the relationship list, prioritize relationships based on their significance to the intended meaning of the input text, outputting more crucial relationships first. -8. Keep Full Context: Ensure the entity name and description are writtenin third person, explicitly name the subject or object instead of using pronouns; avoid pronouns such as `this article`, `this paper`, `our company`, `I`, `you`, and `he/she`. -9. Language: Ensure the output language of entity names, keywords, and descriptions is {language}. Proper nouns (e.g., personal names, place names, organization names) may in their original language if proper translation is not available. -10. Output `{completion_delimiter}` when all the entities and relationships have been extracted. +6. 
Multiple Keywords Separation: Use a comma `,` to separate multiple relationship keywords. Do not use `{tuple_delimiter}` for separating multiple relationship keywords. +7. Undirected Relationship: Treat relationships as undirected; swapping the source and target entities does not constitute a new relationship. Avoid outputting duplicate relationships. +8. Output Order: Output the entity list first, followed by the relationship list. Within the relationship list, prioritize relationships based on their significance to the intended meaning of the input text, outputting more crucial relationships first. +9. Keep Full Context: Ensure the entity name and description are written in third person, explicitly name the subject or object instead of using pronouns; avoid pronouns such as `this article`, `this paper`, `our company`, `I`, `you`, and `he/she`. +10. Language: Ensure the output language of entity names, keywords, and descriptions is {language}. Proper nouns (e.g., personal names, place names, organization names) may remain in their original language if a proper translation is not available. +11. Output `{completion_delimiter}` when all the entities and relationships have been extracted. 
---Examples--- {examples} diff --git a/lightrag/utils.py b/lightrag/utils.py index 50b86f98..4e10c393 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -2571,19 +2571,19 @@ def fix_tuple_delimiter_corruption( Args: record: The text record to fix - delimiter_core: The core delimiter (e.g., "S" from "<|S|>") - tuple_delimiter: The complete tuple delimiter (e.g., "<|S|>") + delimiter_core: The core delimiter (e.g., "#" from "<|#|>") + tuple_delimiter: The complete tuple delimiter (e.g., "<|#|>") Returns: The corrected record with proper tuple_delimiter format Examples: - >>> fix_tuple_delimiter_corruption("entityname", "SEP", "<|S|>") - "entity<|S|>name" - >>> fix_tuple_delimiter_corruption("entityname", "SEP", "<|S|>") - "entity<|S|>name" - >>> fix_tuple_delimiter_corruption("entity|S|>name", "SEP", "<|S|>") - "entity<|S|>name" + >>> fix_tuple_delimiter_corruption("entityname", "#", "<|#|>") + "entity<|#|>name" + >>> fix_tuple_delimiter_corruption("entityname", "#", "<|#|>") + "entity<|#|>name" + >>> fix_tuple_delimiter_corruption("entity|#|>name", "#", "<|#|>") + "entity<|#|>name" Regex Sample: <\|S\|+S\|> <\|\\S\|> @@ -2603,77 +2603,77 @@ def fix_tuple_delimiter_corruption( # Escape the delimiter core for regex use escaped_delimiter_core = re.escape(delimiter_core) - # Fix: <|S||S|> -> <|S|>, <|S|||S|> -> <|S|> + # Fix: <|#||#|> -> <|#|>, <|#|||#|> -> <|#|> record = re.sub( rf"<\|{escaped_delimiter_core}\|+{escaped_delimiter_core}\|>", tuple_delimiter, record, ) - # Fix: <|\S|> -> <|S|> + # Fix: <|\#|> -> <|#|> record = re.sub( rf"<\|\\{escaped_delimiter_core}\|>", tuple_delimiter, record, ) - # Fix: <|> -> <|S|>, <||> -> <|S|> + # Fix: <|> -> <|#|>, <||> -> <|#|> record = re.sub( r"<\|+>", tuple_delimiter, record, ) - # Fix: -> <|S|>, <|S|Y> -> <|S|>, -> <|S|>, <||S||> -> <|S|> (one extra characters outside pipes) + # Fix: -> <|#|>, <|#|Y> -> <|#|>, -> <|#|>, <||#||> -> <|#|> (one extra character outside pipes) record = re.sub( 
rf"<.?\|{escaped_delimiter_core}\|.?>", tuple_delimiter, record, ) - # Fix: , , <|S> -> <|S|> (missing one or both pipes) + # Fix: <#>, <#|>, <|#> -> <|#|> (missing one or both pipes) record = re.sub( rf"<\|?{escaped_delimiter_core}\|?>", tuple_delimiter, record, ) - # Fix: -> <|S|>, <|SX> -> <|S|> (one pipe is replaced by other character) + # Fix: -> <|#|>, <|#X> -> <|#|> (one pipe is replaced by other character) record = re.sub( rf"<[^|]{escaped_delimiter_core}\|>|<\|{escaped_delimiter_core}[^|]>", tuple_delimiter, record, ) - # Fix: <|S| -> <|S|> (missing closing >) + # Fix: <|#| -> <|#|> (missing closing >) record = re.sub( rf"<\|{escaped_delimiter_core}\|(?!>)", tuple_delimiter, record, ) - # Fix: <|| -> <|S|> + # Fix: <|| -> <|#|> record = re.sub( r"<\|\|(?!>)", tuple_delimiter, record, ) - # Fix: |S|> -> <|S|> (missing opening <) + # Fix: |#|> -> <|#|> (missing opening <) record = re.sub( rf"(?", tuple_delimiter, record, ) - # Fix: <|S|>| -> <|S|> ( this is a fix for: <|S|| -> <|S|> ) + # Fix: <|#|>| -> <|#|> ( this is a fix for: <|#|| -> <|#|> ) record = re.sub( rf"<\|{escaped_delimiter_core}\|>\|", tuple_delimiter, record, ) - # Fix: ||S|| -> <|S|> (double pipes on both sides without angle brackets) + # Fix: ||#|| -> <|#|> (double pipes on both sides without angle brackets) record = re.sub( rf"\|\|{escaped_delimiter_core}\|\|", tuple_delimiter,