feat: adds event entity extraction

2025-08-27 18:03:38 +02:00 · 2025-08-27 18:03:38 +02:00 · 7468ef6e53
commit 7468ef6e53
parent 3482f353a9
5 changed files with 69 additions and 0 deletions
--- a/cognee/infrastructure/llm/LLMGateway.py
+++ b/cognee/infrastructure/llm/LLMGateway.py
@ -144,3 +144,12 @@ class LLMGateway:
        )

        return extract_event_graph(content=content, response_model=response_model)
+
+    @staticmethod
+    def extract_event_entities(content: str, response_model: Type[BaseModel]) -> Coroutine:
+        # TODO: Add BAML version of category and extraction and update function (consulted with Igor)
+        from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction import (
+            extract_event_entities,
+        )
+
+        return extract_event_entities(content=content, response_model=response_model)
--- a/cognee/infrastructure/llm/config.py
+++ b/cognee/infrastructure/llm/config.py
@ -53,6 +53,7 @@ class LLMConfig(BaseSettings):
    transcription_model: str = "whisper-1"
    graph_prompt_path: str = "generate_graph_prompt.txt"
    temporal_graph_prompt_path: str = "generate_event_graph_prompt.txt"
+    event_entity_prompt_path: str = "generate_event_entity_prompt.txt"
    llm_rate_limit_enabled: bool = False
    llm_rate_limit_requests: int = 60
    llm_rate_limit_interval: int = 60  # in seconds (default is 60 requests per minute)
--- a/cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt
+++ b/cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt
@ -0,0 +1,25 @@
+For the purposes of building event-based knowledge graphs, you are tasked with extracting highly granular entities from event text. An entity is any distinct, identifiable thing, person, place, object, organization, concept, or phenomenon that can be named, referenced, or described in the event context. This includes but is not limited to: people, places, objects, organizations, concepts, events, processes, states, conditions, properties, attributes, roles, functions, and any other meaningful referents that contribute to understanding the event.
+**Temporal Entity Exclusion**: Do not extract timestamp-like entities (dates, times, durations) as these are handled separately. However, extract named temporal periods, eras, historical epochs, and culturally significant time references.
+## Input Format
+The input will be a list of dictionaries, each containing:
+- `event_name`: The name of the event
+- `description`: The description of the event
+## Task
+For each event, extract all entities mentioned in the event description and determine their relationship to the event.
+## Output Format
+Return the same enriched JSON with an additional key in each dictionary: `attributes`.
+The `attributes` should be a list of dictionaries, each containing:
+- `entity`: The name of the entity
+- `entity_type`: The type/category of the entity (person, place, organization, object, concept, etc.)
+- `relationship`: A concise description of how the entity relates to the event
+## Requirements
+- **Be extremely thorough** - extract EVERY non-temporal entity mentioned, no matter how small, obvious, or seemingly insignificant
+- **After you are done with the obvious entities, treat every noun, pronoun, proper noun, and named reference as one entity**
+- We expect rich entity networks from any event, easily reaching dozens of entities per event
+- The granularity and richness of the entity extraction are key to our success and are of utmost importance
+- **Do not skip any entities** - if you're unsure whether something is an entity, extract it anyway
+- Use the event name for context when determining relationships
+- Relationships should be technical, with one or at most two words. If two words, write them in lowercase joined by an underscore (snake_case)
+- Relationships could imply general meaning like: subject, object, participant, recipient, agent, instrument, tool, source, cause, effect, purpose, manner, resource, etc.
+- You can combine two words to form a relationship name: subject_role, previous_owner, etc.
+- Focus on how the entity specifically relates to the event
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py
@ -2,3 +2,4 @@ from .knowledge_graph.extract_content_graph import extract_content_graph
 from .knowledge_graph.extract_event_graph import extract_event_graph
 from .extract_categories import extract_categories
 from .extract_summary import extract_summary, extract_code_summary
+from .extract_event_entities import extract_event_entities
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_event_entities.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_event_entities.py
@ -0,0 +1,33 @@
+import os
+from typing import List, Type
+from pydantic import BaseModel
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
+from cognee.infrastructure.llm.config import (
+    get_llm_config,
+)
+
+
+async def extract_event_entities(
+    content: str, response_model: Type[BaseModel]
+):
+    """Extract event entities from content using LLM."""
+    llm_config = get_llm_config()
+
+    prompt_path = llm_config.event_entity_prompt_path
+
+    # Check if the prompt path is an absolute path or just a filename
+    if os.path.isabs(prompt_path):
+        # directory containing the file
+        base_directory = os.path.dirname(prompt_path)
+        # just the filename itself
+        prompt_path = os.path.basename(prompt_path)
+    else:
+        base_directory = None
+
+    system_prompt = LLMGateway.render_prompt(prompt_path, {}, base_directory=base_directory)
+
+    content_graph = await LLMGateway.acreate_structured_output(
+        content, system_prompt, response_model
+    )
+
+    return content_graph