From 9bb36f37c0edb1a89b359cdb87ac142994840654 Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Wed, 27 Aug 2025 15:17:32 +0200
Subject: [PATCH] feat: adds event graph extraction to LLMGateway for litellm

---
 cognee/infrastructure/llm/LLMGateway.py       |  9 +++++
 cognee/infrastructure/llm/config.py           |  1 +
 .../prompts/generate_event_graph_prompt.txt   | 30 ++++++++++++++++
 .../litellm_instructor/extraction/__init__.py |  1 +
 .../extraction/knowledge_graph/__init__.py    |  1 +
 .../knowledge_graph/extract_event_graph.py    | 34 +++++++++++++++++++
 6 files changed, 76 insertions(+)
 create mode 100644 cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt
 create mode 100644 cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py

diff --git a/cognee/infrastructure/llm/LLMGateway.py b/cognee/infrastructure/llm/LLMGateway.py
index a88cfb85d..d8364e9ef 100644
--- a/cognee/infrastructure/llm/LLMGateway.py
+++ b/cognee/infrastructure/llm/LLMGateway.py
@@ -135,3 +135,12 @@ class LLMGateway:
         )
 
         return extract_summary(content=content, response_model=response_model)
+
+    @staticmethod
+    def extract_event_graph(content: str, response_model: Type[BaseModel]) -> Coroutine:
+        # TODO: Add BAML version of event graph extraction and update this function (consulted with Igor)
+        from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction import (
+            extract_event_graph,
+        )
+
+        return extract_event_graph(content=content, response_model=response_model)
diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py
index de2e2168e..199ede986 100644
--- a/cognee/infrastructure/llm/config.py
+++ b/cognee/infrastructure/llm/config.py
@@ -52,6 +52,7 @@ class LLMConfig(BaseSettings):
     transcription_model: str = "whisper-1"
 
     graph_prompt_path: str = "generate_graph_prompt.txt"
+    temporal_graph_prompt_path: str = "generate_event_graph_prompt.txt"
     llm_rate_limit_enabled: bool = False
     llm_rate_limit_requests: int = 60
     llm_rate_limit_interval: int = 60  # in seconds (default is 60 requests per minute)
diff --git a/cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt b/cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt
new file mode 100644
index 000000000..c81ae6d3d
--- /dev/null
+++ b/cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt
@@ -0,0 +1,30 @@
+For the purposes of building event-based knowledge graphs, you are tasked with extracting a highly granular stream of events from a text. The events are defined as follows:
+## Event Definition
+- Anything with a date or a timestamp is an event
+- Anything that took place in time (even if the time is unknown) is an event
+- Anything that lasted over a period of time, or happened in an instant, is an event: from historical milestones (wars, presidencies, olympiads) to personal milestones (birth, death, employment, etc.), to mundane actions (a walk, a conversation, etc.)
+- **ANY action or verb represents an event** - this is the most important rule
+- Every single verb in the text corresponds to an event that must be extracted
+- This includes: thinking, feeling, seeing, hearing, moving, speaking, writing, reading, eating, sleeping, working, playing, studying, traveling, meeting, calling, texting, buying, selling, creating, destroying, building, breaking, starting, stopping, beginning, ending, etc.
+- Even the most mundane or obvious actions are events: "he walked", "she sat", "they talked", "I thought", "we waited"
+## Requirements
+- **Be extremely thorough** - extract EVERY event mentioned, no matter how small or obvious
+- **Timestamped events first** - every timestamp or date should have at least one event
+- **Verbs/actions = one event** - once you are done with the timestamped events, every verb that describes an action should have a corresponding event
+- We expect long streams of events from any piece of text, easily reaching a hundred events
+- Granularity and richness of the stream is key to our success and is of utmost importance
+- Not all events will have timestamps; add timestamps only to events whose time is known
+- For events that were instantaneous, attach just the time_from or the time_to property; don't create both
+- **Do not skip any events** - if you're unsure whether something is an event, extract it anyway
+- **Quantity over filtering** - it's better to extract too many events than to miss any
+- **Descriptions** - always include the event description together with the entities involved (who did what? what happened? what is the event?). If you can, include the corresponding part of the text.
+## Output Format
+Your reply should be JSON: a list of dictionaries with the following structure:
+```python
+class Event(BaseModel):
+    name: str  # concise
+    description: Optional[str] = None
+    time_from: Optional[Timestamp] = None
+    time_to: Optional[Timestamp] = None
+    location: Optional[str] = None
+```
\ No newline at end of file
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py
index 3d4edab27..002246a77 100644
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py
@@ -1,3 +1,4 @@
 from .knowledge_graph.extract_content_graph import extract_content_graph
+from .knowledge_graph.extract_event_graph import extract_event_graph
 from .extract_categories import extract_categories
 from .extract_summary import extract_summary, extract_code_summary
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py
index 0939b2b34..f758b8909 100644
--- a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py
@@ -1 +1,2 @@
 from .extract_content_graph import extract_content_graph
+from .extract_event_graph import extract_event_graph
diff --git a/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py
new file mode 100644
index 000000000..2a0c0cab8
--- /dev/null
+++ b/cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py
@@ -0,0 +1,34 @@
+import os
+from pydantic import BaseModel
+from typing import Optional, Type
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
+from cognee.infrastructure.llm.config import get_llm_config
+
+
+async def extract_event_graph(
+    content: str, response_model: Type[BaseModel], system_prompt: Optional[str] = None
+):
+    """Extract an event graph from content using the configured LLM."""
+
+    llm_config = get_llm_config()
+
+    prompt_path = llm_config.temporal_graph_prompt_path
+
+    # Check if the prompt path is an absolute path or just a filename
+    if os.path.isabs(prompt_path):
+        # directory containing the file
+        base_directory = os.path.dirname(prompt_path)
+        # just the filename itself
+        prompt_path = os.path.basename(prompt_path)
+    else:
+        base_directory = None
+
+    # Render the default event graph prompt only when no system prompt was provided
+    if system_prompt is None:
+        system_prompt = LLMGateway.render_prompt(prompt_path, {}, base_directory=base_directory)
+
+    content_graph = await LLMGateway.acreate_structured_output(
+        content, system_prompt, response_model
+    )
+
+    return content_graph
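
Hypothetical usage sketch (not part of the patch): the snippet below shows how the new LLMGateway.extract_event_graph entry point might be exercised once this change lands. The EventList wrapper model and the string-typed timestamp fields are illustrative assumptions; the patch only sketches the Event shape inside the prompt text, so the real response model may differ.

```python
# Hypothetical usage sketch for LLMGateway.extract_event_graph (added in this patch).
# EventList and the string-typed timestamps are illustrative assumptions; the patch
# only describes the Event shape inside the prompt, so real models may differ.
import asyncio
from typing import List, Optional

from pydantic import BaseModel

from cognee.infrastructure.llm.LLMGateway import LLMGateway


class Event(BaseModel):
    name: str  # concise event name
    description: Optional[str] = None
    time_from: Optional[str] = None  # stand-in for the prompt's Timestamp type
    time_to: Optional[str] = None
    location: Optional[str] = None


class EventList(BaseModel):
    events: List[Event]


async def main():
    text = "On 27 August 2025 the patch was merged, and afterwards the team celebrated."
    # extract_event_graph returns a coroutine, so it must be awaited
    result = await LLMGateway.extract_event_graph(content=text, response_model=EventList)
    for event in result.events:
        print(event.name, event.time_from)


if __name__ == "__main__":
    asyncio.run(main())
```

Passing a wrapper model rather than List[Event] directly mirrors how structured-output frameworks such as instructor typically expect a single BaseModel as the response type.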