feat: adds event graph extraction to LLMGateway for litellm

hajdul88 2025-08-27 15:17:32 +02:00
parent f5489f2027
commit 9bb36f37c0
6 changed files with 76 additions and 0 deletions


@@ -135,3 +135,12 @@ class LLMGateway:
        )

        return extract_summary(content=content, response_model=response_model)

    @staticmethod
    def extract_event_graph(content: str, response_model: Type[BaseModel]) -> Coroutine:
        # TODO: Add BAML version of category and extraction and update function (consulted with Igor)
        from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction import (
            extract_event_graph,
        )

        return extract_event_graph(content=content, response_model=response_model)
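For illustration, a minimal calling sketch (not from the diff): the EventGraph response model and sample text below are hypothetical, a configured LLM provider is assumed, and the static method returns a coroutine that must be awaited.

```python
import asyncio
from typing import List, Optional

from pydantic import BaseModel

from cognee.infrastructure.llm.LLMGateway import LLMGateway


# Hypothetical response model for illustration only; real callers supply their own schema.
class Event(BaseModel):
    name: str
    description: Optional[str] = None


class EventGraph(BaseModel):
    events: List[Event]


async def main():
    text = "On 3 May 2022, Ana moved to Berlin and started a new job."
    # extract_event_graph returns a coroutine, so it is awaited here.
    events = await LLMGateway.extract_event_graph(content=text, response_model=EventGraph)
    print(events)


# asyncio.run(main())  # requires LLM credentials to be configured
```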


@@ -52,6 +52,7 @@ class LLMConfig(BaseSettings):
    transcription_model: str = "whisper-1"
    graph_prompt_path: str = "generate_graph_prompt.txt"
    temporal_graph_prompt_path: str = "generate_event_graph_prompt.txt"
    llm_rate_limit_enabled: bool = False
    llm_rate_limit_requests: int = 60
    llm_rate_limit_interval: int = 60  # in seconds (default is 60 requests per minute)
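The fields above are plain pydantic settings; a minimal sketch of reading them through the accessor this commit already uses (get_llm_config), with the defaults shown in the diff:

```python
from cognee.infrastructure.llm.config import get_llm_config

config = get_llm_config()

# Prompt used for temporal / event graph extraction.
print(config.temporal_graph_prompt_path)  # "generate_event_graph_prompt.txt" by default

# Rate limiting: disabled by default, 60 requests per 60-second interval when enabled.
print(config.llm_rate_limit_enabled, config.llm_rate_limit_requests, config.llm_rate_limit_interval)
```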


@@ -0,0 +1,30 @@
For the purposes of building event-based knowledge graphs, you are tasked with extracting a highly granular stream of events from a text. The events are defined as follows:
## Event Definition
- Anything with a date or a timestamp is an event
- Anything that took place in time (even if the time is unknown) is an event
- Anything that lasted over a period of time, or happened in an instant is an event: from historical milestones (wars, presidencies, olympiads) to personal milestones (birth, death, employment, etc.), to mundane actions (a walk, a conversation, etc.)
- **ANY action or verb represents an event** - this is the most important rule
- Every single verb in the text corresponds to an event that must be extracted
- This includes: thinking, feeling, seeing, hearing, moving, speaking, writing, reading, eating, sleeping, working, playing, studying, traveling, meeting, calling, texting, buying, selling, creating, destroying, building, breaking, starting, stopping, beginning, ending, etc.
- Even the most mundane or obvious actions are events: "he walked", "she sat", "they talked", "I thought", "we waited"
## Requirements
- **Be extremely thorough** - extract EVERY event mentioned, no matter how small or obvious
- **Timestamped first** - every timestamp or date should have at least one event
- **Verbs/actions = one event each** - after you are done with timestamped events, every verb that describes an action should have a corresponding event
- We expect long streams of events from any piece of text, easily reaching a hundred events
- Granularity and richness of the stream are key to our success and of the utmost importance
- Not all events will have timestamps; add timestamps only when they are known
- For instantaneous events, attach only the time_from or the time_to property, not both
- **Do not skip any events** - if you're unsure whether something is an event, extract it anyway
- **Quantity over filtering** - it's better to extract too many events than to miss any
- **Descriptions** - Always include the event description together with entities (who did what, what happened, what is the event?). If you can, include the corresponding part of the text.
## Output Format
Your reply should be JSON: a list of dictionaries with the following structure:
```python
class Event(BaseModel):
    name: str  # concise
    description: Optional[str] = None
    time_from: Optional[Timestamp] = None
    time_to: Optional[Timestamp] = None
    location: Optional[str] = None
```
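An illustrative example (not from the prompt above; Timestamp values are assumed here to be ISO-8601 strings): the sentence "On 12 March 2021, Maria flew to Lisbon and met her editor for dinner." could be rendered as:

```python
# Illustrative only; Timestamp shown as ISO-8601 strings.
[
    {"name": "Maria flew to Lisbon", "description": "Maria flew to Lisbon.",
     "time_from": "2021-03-12", "location": "Lisbon"},
    {"name": "Maria met her editor", "description": "Maria met her editor.",
     "time_from": "2021-03-12", "location": "Lisbon"},
    {"name": "Dinner with the editor", "description": "Maria and her editor had dinner.",
     "time_from": "2021-03-12", "location": "Lisbon"},
]
```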


@@ -1,3 +1,4 @@
from .knowledge_graph.extract_content_graph import extract_content_graph
from .knowledge_graph.extract_event_graph import extract_event_graph
from .extract_categories import extract_categories
from .extract_summary import extract_summary, extract_code_summary


@@ -1 +1,2 @@
from .extract_content_graph import extract_content_graph
from .extract_event_graph import extract_event_graph


@@ -0,0 +1,34 @@
import os
from pydantic import BaseModel
from typing import Optional, Type
from cognee.infrastructure.llm.LLMGateway import LLMGateway
from cognee.infrastructure.llm.config import (
    get_llm_config,
)


async def extract_event_graph(
    content: str, response_model: Type[BaseModel], system_prompt: Optional[str] = None
):
    """Extract event graph from content using LLM."""
    if system_prompt is None:
        # Render the default temporal/event graph prompt when no system prompt is provided.
        llm_config = get_llm_config()
        prompt_path = llm_config.temporal_graph_prompt_path

        # Check if the prompt path is an absolute path or just a filename
        if os.path.isabs(prompt_path):
            # directory containing the file
            base_directory = os.path.dirname(prompt_path)
            # just the filename itself
            prompt_path = os.path.basename(prompt_path)
        else:
            base_directory = None

        system_prompt = LLMGateway.render_prompt(prompt_path, {}, base_directory=base_directory)

    content_graph = await LLMGateway.acreate_structured_output(
        content, system_prompt, response_model
    )

    return content_graph
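A minimal usage sketch for this helper (the EventList model and sample text are hypothetical, and a configured LLM provider is assumed):

```python
import asyncio
from typing import List, Optional

from pydantic import BaseModel

from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction import (
    extract_event_graph,
)


# Hypothetical response model mirroring the Event schema from the prompt file.
class Event(BaseModel):
    name: str
    description: Optional[str] = None
    time_from: Optional[str] = None
    time_to: Optional[str] = None
    location: Optional[str] = None


class EventList(BaseModel):
    events: List[Event]


async def main():
    text = "In July 2019 the team shipped version 1.0 and then celebrated at the office."
    result = await extract_event_graph(content=text, response_model=EventList)
    for event in result.events:
        print(event.name, event.time_from)


# asyncio.run(main())  # requires LLM credentials to be configured
```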