cognee/cognee/temporal_poc/event_knowledge_graph.py

from typing import List, Type
from pydantic import BaseModel

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.modules.engine.models.Entity import Entity
from cognee.modules.engine.models.EntityType import EntityType
from cognee.temporal_poc.models.models import EventEntityList

# Default system prompt for event entity extraction. It is used by
# extract_event_entities() whenever the caller does not pass its own system_prompt,
# and is handed to the LLM client as-is.
# The prompt describes the expected input (a list of {event_name, description} dicts)
# and asks for the same list enriched with an `attributes` list whose items carry
# `entity`, `entity_type` and `relationship`.
# NOTE(review): the "underscore camelcase style" wording in the requirements conflicts
# with the snake_case examples given right after it (subject_role, previous_owner);
# consider rewording. The text is left unchanged here because it is runtime input
# to the model.
ENTITY_EXTRACTION_SYSTEM_PROMPT = """For the purposes of building event-based knowledge graphs, you are tasked with extracting highly granular entities from events text. An entity is any distinct, identifiable thing, person, place, object, organization, concept, or phenomenon that can be named, referenced, or described in the event context. This includes but is not limited to: people, places, objects, organizations, concepts, events, processes, states, conditions, properties, attributes, roles, functions, and any other meaningful referents that contribute to understanding the event.
**Temporal Entity Exclusion**: Do not extract timestamp-like entities (dates, times, durations) as these are handled separately. However, extract named temporal periods, eras, historical epochs, and culturally significant time references
## Input Format
The input will be a list of dictionaries, each containing:
- `event_name`: The name of the event
- `description`: The description of the event

## Task
For each event, extract all entities mentioned in the event description and determine their relationship to the event.

## Output Format
Return the same enriched JSON with an additional key in each dictionary: `attributes`.

The `attributes` should be a list of dictionaries, each containing:
- `entity`: The name of the entity
- `entity_type`: The type/category of the entity (person, place, organization, object, concept, etc.)
- `relationship`: A concise description of how the entity relates to the event

## Requirements
- **Be extremely thorough** - extract EVERY non-temporal entity mentioned, no matter how small, obvious, or seemingly insignificant
- **After you are done with obvious entities, every noun, pronoun, proper noun, and named reference =  one entity**
- We expect rich entity networks from any event, easily reaching a dozens of entities per event
- Granularity and richness of the entity extraction is key to our success and is of utmost importance
- **Do not skip any entities** - if you're unsure whether something is an entity, extract it anyway
- Use the event name for context when determining relationships
- Relationships should be technical with one or at most two words. If two words, use underscore camelcase style
- Relationships could imply general meaning like: subject, object, participant, recipient, agent, instrument, tool, source, cause, effect, purpose, manner, resource, etc.
- You can combine two words to form a relationship name: subject_role, previous_owner, etc.
- Focus on how the entity specifically relates to the event
"""


async def extract_event_entities(
    content: str, response_model: Type[BaseModel], system_prompt: str = None
):
    """Request structured entity data for the given content from the LLM.

    Args:
        content: Text sent to the LLM as the input payload.
        response_model: Model class forwarded to the LLM client as the
            structured-output schema.
        system_prompt: Optional prompt override. When ``None``, the module-level
            ``ENTITY_EXTRACTION_SYSTEM_PROMPT`` is used.

    Returns:
        The value produced by the client's ``acreate_structured_output`` call.
    """
    client = get_llm_client()
    prompt = ENTITY_EXTRACTION_SYSTEM_PROMPT if system_prompt is None else system_prompt
    return await client.acreate_structured_output(content, prompt, response_model)


async def extract_event_knowledge_graph(data_chunks: List[DocumentChunk]) -> List[DocumentChunk]:
    """Extract entities for the events found in each chunk and attach them to that chunk.

    For every chunk, the ``Event`` items in ``chunk.contains`` are serialized to a JSON
    list of ``{"event_name", "description"}`` dictionaries and sent to the LLM. The
    returned entities are appended to the same chunk via ``add_entities_to_chunk``.

    Chunks that contain no events are left untouched and do not trigger an LLM call:
    there is nothing to extract from an empty event list, and querying the model with
    ``"[]"`` only costs a request and risks made-up entities.

    Args:
        data_chunks: Chunks to enrich. They are modified in place.

    Returns:
        The same ``data_chunks`` list that was passed in.
    """
    # Kept as function-level imports, as in the original implementation
    # (presumably to avoid a circular import for Event — TODO confirm).
    from cognee.temporal_poc.datapoints.datapoints import Event
    import asyncio
    import json

    pending_chunks = []
    events_jsons = []
    for chunk in data_chunks:
        events = [
            {"event_name": item.name, "description": item.description or ""}
            # `or []` tolerates a chunk whose `contains` was never initialised
            # (assumes the field may be None — verify against DocumentChunk).
            for item in (chunk.contains or [])
            if isinstance(item, Event)
        ]
        if events:
            pending_chunks.append(chunk)
            events_jsons.append(json.dumps(events))

    # Nothing to extract: skip the LLM round-trips entirely.
    if not pending_chunks:
        return data_chunks

    # One LLM request per chunk with events, issued concurrently. gather() returns
    # results in the order of its arguments, so they line up with pending_chunks.
    entity_results = await asyncio.gather(
        *(extract_event_entities(events_json, EventEntityList) for events_json in events_jsons)
    )

    for chunk, entity_result in zip(pending_chunks, entity_results):
        add_entities_to_chunk(chunk, entity_result)

    return data_chunks


def add_entities_to_chunk(chunk: DocumentChunk, entity_result) -> None:
    """Attach an entity to ``chunk`` for every attribute in ``entity_result``.

    Walks ``entity_result.events`` and each event's ``attributes`` in order. One
    ``EntityType`` instance is shared per distinct ``entity_type`` name within this
    call; every attribute produces its own entity via ``create_entity``.
    """
    type_cache = {}  # entity type name -> EntityType, shared across this chunk's events
    all_attributes = (
        attr for event in entity_result.events for attr in event.attributes
    )
    for attr in all_attributes:
        resolved_type = get_or_create_entity_type(type_cache, attr.entity_type)
        create_entity(chunk, attr, resolved_type)


def get_or_create_entity_type(entity_types: dict, entity_type_name: str) -> EntityType:
    """Return the cached ``EntityType`` for a name, creating and caching it on a miss.

    ``entity_types`` is the caller-owned cache keyed by type name; it is updated in
    place when a new ``EntityType`` has to be built.
    """
    # Cache hit: reuse the instance created earlier for this name.
    if entity_type_name in entity_types:
        return entity_types[entity_type_name]

    new_type = EntityType(name=entity_type_name, description=f"Type for {entity_type_name}")
    entity_types[entity_type_name] = new_type
    return new_type


def create_entity(chunk: DocumentChunk, attribute, entity_type: EntityType) -> None:
    """Build an ``Entity`` from an extracted attribute and append it to ``chunk.contains``.

    The entity takes its name from ``attribute.entity``, is linked to ``entity_type``
    through ``is_a``, and gets a generated description mentioning both the entity
    name and ``attribute.entity_type``.
    """
    entity_fields = {
        "name": attribute.entity,
        "is_a": entity_type,
        "description": f"Entity {attribute.entity} of type {attribute.entity_type}",
    }
    # Construct the entity first, then touch the chunk.
    new_entity = Entity(**entity_fields)
    chunk.contains.append(new_entity)


async def process_event_knowledge_graph(data_chunks: List[DocumentChunk]) -> List[DocumentChunk]:
    """Entry point for building the event knowledge graph from document chunks.

    Delegates directly to ``extract_event_knowledge_graph`` and returns its result.
    """
    enriched_chunks = await extract_event_knowledge_graph(data_chunks)
    return enriched_chunks