move summary out of attribute extraction (#792)

* move summary out of attribute extraction

* linter

* linter

* fix db query
Preston Rasmussen 2025-07-31 12:15:21 -04:00 committed by GitHub
parent e5112244e5
commit ab8106cb4f
11 changed files with 93 additions and 57 deletions

View file

@@ -359,10 +359,10 @@ class Graphiti:
         group_id: str | None = None,
         uuid: str | None = None,
         update_communities: bool = False,
-        entity_types: dict[str, BaseModel] | None = None,
+        entity_types: dict[str, type[BaseModel]] | None = None,
         excluded_entity_types: list[str] | None = None,
         previous_episode_uuids: list[str] | None = None,
-        edge_types: dict[str, BaseModel] | None = None,
+        edge_types: dict[str, type[BaseModel]] | None = None,
         edge_type_map: dict[tuple[str, str], list[str]] | None = None,
     ) -> AddEpisodeResults:
         """
@@ -555,9 +555,9 @@ class Graphiti:
         self,
         bulk_episodes: list[RawEpisode],
         group_id: str | None = None,
-        entity_types: dict[str, BaseModel] | None = None,
+        entity_types: dict[str, type[BaseModel]] | None = None,
         excluded_entity_types: list[str] | None = None,
-        edge_types: dict[str, BaseModel] | None = None,
+        edge_types: dict[str, type[BaseModel]] | None = None,
         edge_type_map: dict[tuple[str, str], list[str]] | None = None,
     ):
         """

View file

@@ -148,7 +148,7 @@ def validate_group_id(group_id: str) -> bool:


 def validate_excluded_entity_types(
-    excluded_entity_types: list[str] | None, entity_types: dict[str, BaseModel] | None = None
+    excluded_entity_types: list[str] | None, entity_types: dict[str, type[BaseModel]] | None = None
 ) -> bool:
     """
     Validate that excluded entity types are valid type names.
View file

@@ -52,6 +52,13 @@ class EntityClassification(BaseModel):
     )


+class EntitySummary(BaseModel):
+    summary: str = Field(
+        ...,
+        description='Summary containing the important information about the entity. Under 250 words',
+    )
+
+
 class Prompt(Protocol):
     extract_message: PromptVersion
     extract_json: PromptVersion
@@ -59,6 +66,7 @@ class Prompt(Protocol):
     reflexion: PromptVersion
     classify_nodes: PromptVersion
     extract_attributes: PromptVersion
+    extract_summary: PromptVersion


 class Versions(TypedDict):
@@ -68,6 +76,7 @@ class Versions(TypedDict):
     reflexion: PromptFunction
     classify_nodes: PromptFunction
     extract_attributes: PromptFunction
+    extract_summary: PromptFunction


 def extract_message(context: dict[str, Any]) -> list[Message]:
@@ -259,9 +268,39 @@ def extract_attributes(context: dict[str, Any]) -> list[Message]:
         Guidelines:
         1. Do not hallucinate entity property values if they cannot be found in the current context.
         2. Only use the provided MESSAGES and ENTITY to set attribute values.
+
+        <ENTITY>
+        {context['node']}
+        </ENTITY>
+        """,
+        ),
+    ]
+
+
+def extract_summary(context: dict[str, Any]) -> list[Message]:
+    return [
+        Message(
+            role='system',
+            content='You are a helpful assistant that extracts entity summaries from the provided text.',
+        ),
+        Message(
+            role='user',
+            content=f"""
+        <MESSAGES>
+        {json.dumps(context['previous_episodes'], indent=2)}
+        {json.dumps(context['episode_content'], indent=2)}
+        </MESSAGES>
+
+        Given the above MESSAGES and the following ENTITY, update the summary that combines relevant information about the entity
+        from the messages and relevant information from the existing summary.
+
+        Guidelines:
+        1. Do not hallucinate entity summary information if they cannot be found in the current context.
+        2. Only use the provided MESSAGES and ENTITY to set attribute values.
         3. The summary attribute represents a summary of the ENTITY, and should be updated with new information about the Entity from the MESSAGES.
         Summaries must be no longer than 250 words.

         <ENTITY>
         {context['node']}
         </ENTITY>
@@ -275,6 +314,7 @@ versions: Versions = {
     'extract_json': extract_json,
     'extract_text': extract_text,
     'reflexion': reflexion,
+    'extract_summary': extract_summary,
     'classify_nodes': classify_nodes,
     'extract_attributes': extract_attributes,
 }
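Taken together, this file adds a dedicated prompt function and response model for summaries. A short sketch of how the pair fits together after this change; the context payload mirrors the keys read by the f-string above, and the commented-out call mirrors the one made in node_operations.py later in this diff:

from graphiti_core.prompts.extract_nodes import EntitySummary, extract_summary

context = {
    'node': {'name': 'Ada Lovelace', 'summary': '', 'attributes': {}},
    'episode_content': 'Ada wrote the first published algorithm.',
    'previous_episodes': [],
}

messages = extract_summary(context)  # system + user Message pair
# The structured LLM response is then validated against EntitySummary, e.g.:
# summary = await llm_client.generate_response(
#     messages, response_model=EntitySummary, model_size=ModelSize.small
# )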

View file

@ -314,17 +314,15 @@ async def node_fulltext_search(
+ """ + """
YIELD node AS n, score YIELD node AS n, score
WHERE n:Entity AND n.group_id IN $group_ids WHERE n:Entity AND n.group_id IN $group_ids
WITH n, score
LIMIT $limit
""" """
+ filter_query + filter_query
+ """ + """
WITH n, score
ORDER BY score DESC
LIMIT $limit
RETURN RETURN
""" """
+ ENTITY_NODE_RETURN + ENTITY_NODE_RETURN
+ """
ORDER BY score DESC
"""
) )
records, _, _ = await driver.execute_query( records, _, _ = await driver.execute_query(
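This is the "fix db query" part of the commit. Previously, LIMIT $limit ran before filter_query, so the limit was applied to the raw fulltext hits and any rows the filter later removed silently shrank the final result set; the rewrite filters first, then applies ORDER BY score DESC LIMIT $limit to the surviving rows. A toy illustration of the difference, with plain Python standing in for Cypher:

hits = [('a', 0.9), ('b', 0.8), ('c', 0.7), ('d', 0.6)]  # (node, score), best first


def passes_filter(node: str) -> bool:
    return node != 'b'  # stand-in for filter_query


limit = 2

# Old shape: LIMIT before the filter; 'b' consumes a slot, then is filtered out.
old = [n for n, _ in hits[:limit] if passes_filter(n)]  # ['a']

# New shape: filter first, then order by score and limit; a full result set.
new = sorted(
    ((n, s) for n, s in hits if passes_filter(n)),
    key=lambda pair: pair[1],
    reverse=True,
)[:limit]  # [('a', 0.9), ('c', 0.7)]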

View file

@@ -169,9 +169,9 @@ async def extract_nodes_and_edges_bulk(
     clients: GraphitiClients,
     episode_tuples: list[tuple[EpisodicNode, list[EpisodicNode]]],
     edge_type_map: dict[tuple[str, str], list[str]],
-    entity_types: dict[str, BaseModel] | None = None,
+    entity_types: dict[str, type[BaseModel]] | None = None,
     excluded_entity_types: list[str] | None = None,
-    edge_types: dict[str, BaseModel] | None = None,
+    edge_types: dict[str, type[BaseModel]] | None = None,
 ) -> tuple[list[list[EntityNode]], list[list[EntityEdge]]]:
     extracted_nodes_bulk: list[list[EntityNode]] = await semaphore_gather(
         *[
@@ -202,7 +202,7 @@ async def dedupe_nodes_bulk(
     clients: GraphitiClients,
     extracted_nodes: list[list[EntityNode]],
     episode_tuples: list[tuple[EpisodicNode, list[EpisodicNode]]],
-    entity_types: dict[str, BaseModel] | None = None,
+    entity_types: dict[str, type[BaseModel]] | None = None,
 ) -> tuple[dict[str, list[EntityNode]], dict[str, str]]:
     embedder = clients.embedder
     min_score = 0.8
@@ -290,7 +290,7 @@ async def dedupe_edges_bulk(
     extracted_edges: list[list[EntityEdge]],
     episode_tuples: list[tuple[EpisodicNode, list[EpisodicNode]]],
     _entities: list[EntityNode],
-    edge_types: dict[str, BaseModel],
+    edge_types: dict[str, type[BaseModel]],
     _edge_type_map: dict[tuple[str, str], list[str]],
 ) -> dict[str, list[EntityEdge]]:
     embedder = clients.embedder

View file

@@ -114,7 +114,7 @@ async def extract_edges(
     previous_episodes: list[EpisodicNode],
     edge_type_map: dict[tuple[str, str], list[str]],
     group_id: str = '',
-    edge_types: dict[str, BaseModel] | None = None,
+    edge_types: dict[str, type[BaseModel]] | None = None,
 ) -> list[EntityEdge]:
     start = time()
@@ -249,7 +249,7 @@ async def resolve_extracted_edges(
     extracted_edges: list[EntityEdge],
     episode: EpisodicNode,
     entities: list[EntityNode],
-    edge_types: dict[str, BaseModel],
+    edge_types: dict[str, type[BaseModel]],
     edge_type_map: dict[tuple[str, str], list[str]],
 ) -> tuple[list[EntityEdge], list[EntityEdge]]:
     driver = clients.driver
@@ -272,7 +272,7 @@ async def resolve_extracted_edges(
     uuid_entity_map: dict[str, EntityNode] = {entity.uuid: entity for entity in entities}

     # Determine which edge types are relevant for each edge
-    edge_types_lst: list[dict[str, BaseModel]] = []
+    edge_types_lst: list[dict[str, type[BaseModel]]] = []
     for extracted_edge in extracted_edges:
         source_node = uuid_entity_map.get(extracted_edge.source_node_uuid)
         target_node = uuid_entity_map.get(extracted_edge.target_node_uuid)
@@ -381,7 +381,7 @@ async def resolve_extracted_edge(
     related_edges: list[EntityEdge],
     existing_edges: list[EntityEdge],
     episode: EpisodicNode,
-    edge_types: dict[str, BaseModel] | None = None,
+    edge_types: dict[str, type[BaseModel]] | None = None,
 ) -> tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]:
     if len(related_edges) == 0 and len(existing_edges) == 0:
         return extracted_edge, [], []
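These signatures feed the per-edge type resolution visible in the -272 hunk: for each extracted edge, the source and target node labels are matched against edge_type_map, and the matching model classes are collected into one dict per edge (edge_types_lst). A sketch of that lookup under hypothetical labels and types, mirroring only the annotations in this diff:

from pydantic import BaseModel


class WorksAt(BaseModel):
    role: str | None = None


edge_types: dict[str, type[BaseModel]] = {'WORKS_AT': WorksAt}
edge_type_map: dict[tuple[str, str], list[str]] = {
    ('Person', 'Organization'): ['WORKS_AT'],
}

source_labels, target_labels = ['Person'], ['Organization']
relevant: dict[str, type[BaseModel]] = {
    name: edge_types[name]
    for (src, tgt), names in edge_type_map.items()
    if src in source_labels and tgt in target_labels
    for name in names
    if name in edge_types
}
print(relevant)  # {'WORKS_AT': <class 'WorksAt'>}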

View file

@@ -15,13 +15,10 @@ limitations under the License.
 """

 import logging
-from contextlib import suppress
 from time import time
 from typing import Any
-from uuid import uuid4

-import pydantic
-from pydantic import BaseModel, Field
+from pydantic import BaseModel

 from graphiti_core.graphiti_types import GraphitiClients
 from graphiti_core.helpers import MAX_REFLEXION_ITERATIONS, semaphore_gather
@@ -31,6 +28,7 @@ from graphiti_core.nodes import EntityNode, EpisodeType, EpisodicNode, create_entity_node_embeddings
 from graphiti_core.prompts import prompt_library
 from graphiti_core.prompts.dedupe_nodes import NodeDuplicate, NodeResolutions
 from graphiti_core.prompts.extract_nodes import (
+    EntitySummary,
     ExtractedEntities,
     ExtractedEntity,
     MissedEntities,
@@ -70,7 +68,7 @@ async def extract_nodes(
     clients: GraphitiClients,
     episode: EpisodicNode,
     previous_episodes: list[EpisodicNode],
-    entity_types: dict[str, BaseModel] | None = None,
+    entity_types: dict[str, type[BaseModel]] | None = None,
     excluded_entity_types: list[str] | None = None,
 ) -> list[EntityNode]:
     start = time()
@@ -180,7 +178,7 @@ async def resolve_extracted_nodes(
     extracted_nodes: list[EntityNode],
     episode: EpisodicNode | None = None,
     previous_episodes: list[EpisodicNode] | None = None,
-    entity_types: dict[str, BaseModel] | None = None,
+    entity_types: dict[str, type[BaseModel]] | None = None,
     existing_nodes_override: list[EntityNode] | None = None,
 ) -> tuple[list[EntityNode], dict[str, str], list[tuple[EntityNode, EntityNode]]]:
     llm_client = clients.llm_client
@@ -223,7 +221,7 @@ async def resolve_extracted_nodes(
         ],
     )

-    entity_types_dict: dict[str, BaseModel] = entity_types if entity_types is not None else {}
+    entity_types_dict: dict[str, type[BaseModel]] = entity_types if entity_types is not None else {}

     # Prepare context for LLM
     extracted_nodes_context = [
@@ -297,7 +295,7 @@ async def extract_attributes_from_nodes(
     nodes: list[EntityNode],
     episode: EpisodicNode | None = None,
     previous_episodes: list[EpisodicNode] | None = None,
-    entity_types: dict[str, BaseModel] | None = None,
+    entity_types: dict[str, type[BaseModel]] | None = None,
 ) -> list[EntityNode]:
     llm_client = clients.llm_client
     embedder = clients.embedder
@@ -326,7 +324,7 @@ async def extract_attributes_from_node(
     node: EntityNode,
     episode: EpisodicNode | None = None,
     previous_episodes: list[EpisodicNode] | None = None,
-    entity_type: BaseModel | None = None,
+    entity_type: type[BaseModel] | None = None,
 ) -> EntityNode:
     node_context: dict[str, Any] = {
         'name': node.name,
@@ -335,25 +333,14 @@ async def extract_attributes_from_node(
         'attributes': node.attributes,
     }

-    attributes_definitions: dict[str, Any] = {
-        'summary': (
-            str,
-            Field(
-                description='Summary containing the important information about the entity. Under 250 words',
-            ),
-        )
-    }
-
-    if entity_type is not None:
-        for field_name, field_info in entity_type.model_fields.items():
-            attributes_definitions[field_name] = (
-                field_info.annotation,
-                Field(description=field_info.description),
-            )
-
-    unique_model_name = f'EntityAttributes_{uuid4().hex}'
-    entity_attributes_model = pydantic.create_model(unique_model_name, **attributes_definitions)
+    attributes_context: dict[str, Any] = {
+        'node': node_context,
+        'episode_content': episode.content if episode is not None else '',
+        'previous_episodes': [ep.content for ep in previous_episodes]
+        if previous_episodes is not None
+        else [],
+    }

     summary_context: dict[str, Any] = {
         'node': node_context,
         'episode_content': episode.content if episode is not None else '',
@@ -362,20 +349,30 @@ async def extract_attributes_from_node(
         else [],
     }

-    llm_response = await llm_client.generate_response(
-        prompt_library.extract_nodes.extract_attributes(summary_context),
-        response_model=entity_attributes_model,
+    llm_response = (
+        (
+            await llm_client.generate_response(
+                prompt_library.extract_nodes.extract_attributes(attributes_context),
+                response_model=entity_type,
+                model_size=ModelSize.small,
+            )
+        )
+        if entity_type is not None
+        else {}
+    )
+
+    summary_response = await llm_client.generate_response(
+        prompt_library.extract_nodes.extract_summary(summary_context),
+        response_model=EntitySummary,
         model_size=ModelSize.small,
     )

-    entity_attributes_model(**llm_response)
+    if entity_type is not None:
+        entity_type(**llm_response)

-    node.summary = llm_response.get('summary', '')
+    node.summary = summary_response.get('summary', '')
     node_attributes = {key: value for key, value in llm_response.items()}

-    with suppress(KeyError):
-        del node_attributes['summary']
-
     node.attributes.update(node_attributes)

     return node
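The net effect of the refactor above: the old code built a throwaway pydantic.create_model(...) that bundled a summary field in with the typed attributes and made a single LLM call, then had to strip 'summary' back out; the new code makes two independent structured-output calls, one validated against the caller's entity_type class (skipped entirely for untyped entities) and one against the fixed EntitySummary model. A condensed, self-contained sketch of the new control flow; FakeLLMClient, Person, and the prompt strings are illustrative stand-ins, not graphiti APIs:

import asyncio
from typing import Any

from pydantic import BaseModel, Field


class EntitySummary(BaseModel):
    summary: str = Field(..., description='Under 250 words')


class Person(BaseModel):
    occupation: str | None = None


class FakeLLMClient:
    # Stand-in for graphiti's llm_client: returns canned structured payloads.
    async def generate_response(self, prompt: Any, response_model: type[BaseModel]) -> dict:
        if response_model is EntitySummary:
            return {'summary': 'Ada Lovelace wrote the first published algorithm.'}
        return {'occupation': 'mathematician'}


async def extract(attributes: dict, entity_type: type[BaseModel] | None) -> dict:
    client = FakeLLMClient()
    # 1. Typed attributes: only when a custom entity type was supplied.
    llm_response = (
        await client.generate_response('attributes prompt', response_model=entity_type)
        if entity_type is not None
        else {}
    )
    # 2. Summary: always extracted, against the fixed EntitySummary model.
    summary_response = await client.generate_response('summary prompt', response_model=EntitySummary)
    if entity_type is not None:
        entity_type(**llm_response)  # validate the attribute payload
    attributes.update(llm_response)  # no 'summary' key left to strip
    return {'summary': summary_response['summary'], 'attributes': attributes}


print(asyncio.run(extract({}, Person)))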

View file

@@ -21,7 +21,7 @@ from graphiti_core.nodes import EntityNode


 def validate_entity_types(
-    entity_types: dict[str, BaseModel] | None,
+    entity_types: dict[str, type[BaseModel]] | None,
 ) -> bool:
     if entity_types is None:
         return True

View file

@@ -1,7 +1,7 @@
 [project]
 name = "graphiti-core"
 description = "A temporal graph building library"
-version = "0.18.1"
+version = "0.18.2"
 authors = [
     { name = "Paul Paliychuk", email = "paul@getzep.com" },
     { name = "Preston Rasmussen", email = "preston@getzep.com" },

View file

@@ -64,7 +64,8 @@ async def test_graphiti_init(driver):
     await graphiti.build_indices_and_constraints()

     search_filter = SearchFilters(
-        created_at=[[DateFilter(date=utc_now(), comparison_operator=ComparisonOperator.less_than)]]
+        node_labels=['Person'],
+        created_at=[[DateFilter(date=utc_now(), comparison_operator=ComparisonOperator.less_than)]],
     )

     results = await graphiti.search_(

uv.lock generated
View file

@@ -746,7 +746,7 @@ wheels = [

 [[package]]
 name = "graphiti-core"
-version = "0.18.1"
+version = "0.18.2"
 source = { editable = "." }
 dependencies = [
     { name = "diskcache" },