From bec3f0203625cd2c1ba2f5683b56afe2d129f47d Mon Sep 17 00:00:00 2001 From: Preston Rasmussen <109292228+prasmussen15@users.noreply.github.com> Date: Thu, 2 Oct 2025 15:26:51 -0400 Subject: [PATCH 1/4] filter out falsey values before creating embeddings (#966) * filter out falsey values * update * early return --- graphiti_core/edges.py | 9 ++++++--- graphiti_core/nodes.py | 9 ++++++--- pyproject.toml | 2 +- uv.lock | 4 ++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/graphiti_core/edges.py b/graphiti_core/edges.py index 165dee53..88d2a472 100644 --- a/graphiti_core/edges.py +++ b/graphiti_core/edges.py @@ -644,8 +644,11 @@ def get_community_edge_from_record(record: Any): async def create_entity_edge_embeddings(embedder: EmbedderClient, edges: list[EntityEdge]): - if len(edges) == 0: + # filter out falsey values from edges + filtered_edges = [edge for edge in edges if edge.fact] + + if len(filtered_edges) == 0: return - fact_embeddings = await embedder.create_batch([edge.fact for edge in edges]) - for edge, fact_embedding in zip(edges, fact_embeddings, strict=True): + fact_embeddings = await embedder.create_batch([edge.fact for edge in filtered_edges]) + for edge, fact_embedding in zip(filtered_edges, fact_embeddings, strict=True): edge.fact_embedding = fact_embedding diff --git a/graphiti_core/nodes.py b/graphiti_core/nodes.py index 7fafbe4f..4105c88e 100644 --- a/graphiti_core/nodes.py +++ b/graphiti_core/nodes.py @@ -868,9 +868,12 @@ def get_community_node_from_record(record: Any) -> CommunityNode: async def create_entity_node_embeddings(embedder: EmbedderClient, nodes: list[EntityNode]): - if not nodes: # Handle empty list case + # filter out falsey values from nodes + filtered_nodes = [node for node in nodes if node.name] + + if not filtered_nodes: return - name_embeddings = await embedder.create_batch([node.name for node in nodes]) - for node, name_embedding in zip(nodes, name_embeddings, strict=True): + name_embeddings = await embedder.create_batch([node.name for node in filtered_nodes]) + for node, name_embedding in zip(filtered_nodes, name_embeddings, strict=True): node.name_embedding = name_embedding diff --git a/pyproject.toml b/pyproject.toml index 5dacc78c..939eb0ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "graphiti-core" description = "A temporal graph building library" -version = "0.21.0pre10" +version = "0.21.0pre11" authors = [ { name = "Paul Paliychuk", email = "paul@getzep.com" }, { name = "Preston Rasmussen", email = "preston@getzep.com" }, diff --git a/uv.lock b/uv.lock index a67aa561..7f362eec 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10, <4" resolution-markers = [ "python_full_version >= '3.14'", @@ -783,7 +783,7 @@ wheels = [ [[package]] name = "graphiti-core" -version = "0.21.0rc8" +version = "0.21.0rc11" source = { editable = "." } dependencies = [ { name = "diskcache" }, From b28bd92c167e2f29bc3e1ebe8da4d37f11034df0 Mon Sep 17 00:00:00 2001 From: Daniel Chalef <131175+danielchalef@users.noreply.github.com> Date: Thu, 2 Oct 2025 15:10:57 -0700 Subject: [PATCH 2/4] Remove ensure_ascii configuration parameter (#969) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove ensure_ascii configuration parameter - Changed to_prompt_json default from ensure_ascii=True to False - Removed ensure_ascii parameter from Graphiti.__init__ and GraphitiClients - Removed ensure_ascii from all function signatures and context dictionaries - Removed ensure_ascii from all test files - All JSON serialization now preserves Unicode characters by default 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude * format --------- Co-authored-by: Claude --- graphiti_core/graphiti.py | 12 +------- graphiti_core/graphiti_types.py | 1 - graphiti_core/prompts/dedupe_edges.py | 6 ++-- graphiti_core/prompts/dedupe_nodes.py | 16 +++++------ graphiti_core/prompts/eval.py | 6 ++-- graphiti_core/prompts/extract_edges.py | 6 ++-- graphiti_core/prompts/extract_nodes.py | 14 +++++----- graphiti_core/prompts/prompt_helpers.py | 6 ++-- graphiti_core/prompts/summarize_nodes.py | 10 +++---- graphiti_core/search/search_helpers.py | 12 ++++---- graphiti_core/utils/bulk_utils.py | 1 - .../utils/maintenance/community_operations.py | 28 ++++++------------- .../utils/maintenance/edge_operations.py | 7 ----- .../utils/maintenance/node_operations.py | 11 -------- .../utils/maintenance/temporal_operations.py | 4 --- tests/utils/maintenance/test_bulk_utils.py | 2 -- .../utils/maintenance/test_edge_operations.py | 8 ------ .../utils/maintenance/test_node_operations.py | 10 ------- 18 files changed, 46 insertions(+), 114 deletions(-) diff --git a/graphiti_core/graphiti.py b/graphiti_core/graphiti.py index 3eed0488..f296f15d 100644 --- a/graphiti_core/graphiti.py +++ b/graphiti_core/graphiti.py @@ -136,7 +136,6 @@ class Graphiti: store_raw_episode_content: bool = True, graph_driver: GraphDriver | None = None, max_coroutines: int | None = None, - ensure_ascii: bool = False, ): """ Initialize a Graphiti instance. @@ -169,10 +168,6 @@ class Graphiti: max_coroutines : int | None, optional The maximum number of concurrent operations allowed. Overrides SEMAPHORE_LIMIT set in the environment. If not set, the Graphiti default is used. - ensure_ascii : bool, optional - Whether to escape non-ASCII characters in JSON serialization for prompts. Defaults to False. - Set as False to preserve non-ASCII characters (e.g., Korean, Japanese, Chinese) in their - original form, making them readable in LLM logs and improving model understanding. Returns ------- @@ -202,7 +197,6 @@ class Graphiti: self.store_raw_episode_content = store_raw_episode_content self.max_coroutines = max_coroutines - self.ensure_ascii = ensure_ascii if llm_client: self.llm_client = llm_client else: @@ -221,7 +215,6 @@ class Graphiti: llm_client=self.llm_client, embedder=self.embedder, cross_encoder=self.cross_encoder, - ensure_ascii=self.ensure_ascii, ) # Capture telemetry event @@ -559,9 +552,7 @@ class Graphiti: if update_communities: communities, community_edges = await semaphore_gather( *[ - update_community( - self.driver, self.llm_client, self.embedder, node, self.ensure_ascii - ) + update_community(self.driver, self.llm_client, self.embedder, node) for node in nodes ], max_coroutines=self.max_coroutines, @@ -1071,7 +1062,6 @@ class Graphiti: ), None, None, - self.ensure_ascii, ) edges: list[EntityEdge] = [resolved_edge] + invalidated_edges diff --git a/graphiti_core/graphiti_types.py b/graphiti_core/graphiti_types.py index 8d140597..decdf027 100644 --- a/graphiti_core/graphiti_types.py +++ b/graphiti_core/graphiti_types.py @@ -27,6 +27,5 @@ class GraphitiClients(BaseModel): llm_client: LLMClient embedder: EmbedderClient cross_encoder: CrossEncoderClient - ensure_ascii: bool = False model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/graphiti_core/prompts/dedupe_edges.py b/graphiti_core/prompts/dedupe_edges.py index 84d25f4e..35acb31f 100644 --- a/graphiti_core/prompts/dedupe_edges.py +++ b/graphiti_core/prompts/dedupe_edges.py @@ -67,11 +67,11 @@ def edge(context: dict[str, Any]) -> list[Message]: Given the following context, determine whether the New Edge represents any of the edges in the list of Existing Edges. - {to_prompt_json(context['related_edges'], ensure_ascii=context.get('ensure_ascii', False), indent=2)} + {to_prompt_json(context['related_edges'], indent=2)} - {to_prompt_json(context['extracted_edges'], ensure_ascii=context.get('ensure_ascii', False), indent=2)} + {to_prompt_json(context['extracted_edges'], indent=2)} Task: @@ -98,7 +98,7 @@ def edge_list(context: dict[str, Any]) -> list[Message]: Given the following context, find all of the duplicates in a list of facts: Facts: - {to_prompt_json(context['edges'], ensure_ascii=context.get('ensure_ascii', False), indent=2)} + {to_prompt_json(context['edges'], indent=2)} Task: If any facts in Facts is a duplicate of another fact, return a new fact with one of their uuid's. diff --git a/graphiti_core/prompts/dedupe_nodes.py b/graphiti_core/prompts/dedupe_nodes.py index 84ea1317..9ecc926a 100644 --- a/graphiti_core/prompts/dedupe_nodes.py +++ b/graphiti_core/prompts/dedupe_nodes.py @@ -64,20 +64,20 @@ def node(context: dict[str, Any]) -> list[Message]: role='user', content=f""" - {to_prompt_json([ep for ep in context['previous_episodes']], ensure_ascii=context.get('ensure_ascii', False), indent=2)} + {to_prompt_json([ep for ep in context['previous_episodes']], indent=2)} {context['episode_content']} - {to_prompt_json(context['extracted_node'], ensure_ascii=context.get('ensure_ascii', False), indent=2)} + {to_prompt_json(context['extracted_node'], indent=2)} - {to_prompt_json(context['entity_type_description'], ensure_ascii=context.get('ensure_ascii', False), indent=2)} + {to_prompt_json(context['entity_type_description'], indent=2)} - {to_prompt_json(context['existing_nodes'], ensure_ascii=context.get('ensure_ascii', False), indent=2)} + {to_prompt_json(context['existing_nodes'], indent=2)} Given the above EXISTING ENTITIES and their attributes, MESSAGE, and PREVIOUS MESSAGES; Determine if the NEW ENTITY extracted from the conversation @@ -125,7 +125,7 @@ def nodes(context: dict[str, Any]) -> list[Message]: role='user', content=f""" - {to_prompt_json([ep for ep in context['previous_episodes']], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json([ep for ep in context['previous_episodes']], indent=2)} {context['episode_content']} @@ -142,11 +142,11 @@ def nodes(context: dict[str, Any]) -> list[Message]: }} - {to_prompt_json(context['extracted_nodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json(context['extracted_nodes'], indent=2)} - {to_prompt_json(context['existing_nodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json(context['existing_nodes'], indent=2)} Each entry in EXISTING ENTITIES is an object with the following structure: @@ -197,7 +197,7 @@ def node_list(context: dict[str, Any]) -> list[Message]: Given the following context, deduplicate a list of nodes: Nodes: - {to_prompt_json(context['nodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json(context['nodes'], indent=2)} Task: 1. Group nodes together such that all duplicate nodes are in the same list of uuids diff --git a/graphiti_core/prompts/eval.py b/graphiti_core/prompts/eval.py index 5c3fa64e..ed79cfb8 100644 --- a/graphiti_core/prompts/eval.py +++ b/graphiti_core/prompts/eval.py @@ -68,7 +68,7 @@ def query_expansion(context: dict[str, Any]) -> list[Message]: Bob is asking Alice a question, are you able to rephrase the question into a simpler one about Alice in the third person that maintains the relevant context? - {to_prompt_json(context['query'], ensure_ascii=context.get('ensure_ascii', False))} + {to_prompt_json(context['query'])} """ return [ @@ -84,10 +84,10 @@ def qa_prompt(context: dict[str, Any]) -> list[Message]: Your task is to briefly answer the question in the way that you think Alice would answer the question. You are given the following entity summaries and facts to help you determine the answer to your question. - {to_prompt_json(context['entity_summaries'], ensure_ascii=context.get('ensure_ascii', False))} + {to_prompt_json(context['entity_summaries'])} - {to_prompt_json(context['facts'], ensure_ascii=context.get('ensure_ascii', False))} + {to_prompt_json(context['facts'])} {context['query']} diff --git a/graphiti_core/prompts/extract_edges.py b/graphiti_core/prompts/extract_edges.py index 9ae3f6f9..21f68709 100644 --- a/graphiti_core/prompts/extract_edges.py +++ b/graphiti_core/prompts/extract_edges.py @@ -73,7 +73,7 @@ def edge(context: dict[str, Any]) -> list[Message]: -{to_prompt_json([ep for ep in context['previous_episodes']], ensure_ascii=context.get('ensure_ascii', False), indent=2)} +{to_prompt_json([ep for ep in context['previous_episodes']], indent=2)} @@ -133,7 +133,7 @@ def reflexion(context: dict[str, Any]) -> list[Message]: user_prompt = f""" -{to_prompt_json([ep for ep in context['previous_episodes']], ensure_ascii=context.get('ensure_ascii', False), indent=2)} +{to_prompt_json([ep for ep in context['previous_episodes']], indent=2)} {context['episode_content']} @@ -167,7 +167,7 @@ def extract_attributes(context: dict[str, Any]) -> list[Message]: content=f""" - {to_prompt_json(context['episode_content'], ensure_ascii=context.get('ensure_ascii', False), indent=2)} + {to_prompt_json(context['episode_content'], indent=2)} {context['reference_time']} diff --git a/graphiti_core/prompts/extract_nodes.py b/graphiti_core/prompts/extract_nodes.py index 9d774ab8..3ee88afc 100644 --- a/graphiti_core/prompts/extract_nodes.py +++ b/graphiti_core/prompts/extract_nodes.py @@ -89,7 +89,7 @@ def extract_message(context: dict[str, Any]) -> list[Message]: -{to_prompt_json([ep for ep in context['previous_episodes']], ensure_ascii=context.get('ensure_ascii', True), indent=2)} +{to_prompt_json([ep for ep in context['previous_episodes']], indent=2)} @@ -197,7 +197,7 @@ def reflexion(context: dict[str, Any]) -> list[Message]: user_prompt = f""" -{to_prompt_json([ep for ep in context['previous_episodes']], ensure_ascii=context.get('ensure_ascii', True), indent=2)} +{to_prompt_json([ep for ep in context['previous_episodes']], indent=2)} {context['episode_content']} @@ -221,7 +221,7 @@ def classify_nodes(context: dict[str, Any]) -> list[Message]: user_prompt = f""" - {to_prompt_json([ep for ep in context['previous_episodes']], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json([ep for ep in context['previous_episodes']], indent=2)} {context['episode_content']} @@ -259,8 +259,8 @@ def extract_attributes(context: dict[str, Any]) -> list[Message]: content=f""" - {to_prompt_json(context['previous_episodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} - {to_prompt_json(context['episode_content'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json(context['previous_episodes'], indent=2)} + {to_prompt_json(context['episode_content'], indent=2)} Given the above MESSAGES and the following ENTITY, update any of its attributes based on the information provided @@ -289,8 +289,8 @@ def extract_summary(context: dict[str, Any]) -> list[Message]: content=f""" - {to_prompt_json(context['previous_episodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} - {to_prompt_json(context['episode_content'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json(context['previous_episodes'], indent=2)} + {to_prompt_json(context['episode_content'], indent=2)} Given the above MESSAGES and the following ENTITY, update the summary that combines relevant information about the entity diff --git a/graphiti_core/prompts/prompt_helpers.py b/graphiti_core/prompts/prompt_helpers.py index 91db423d..aa506547 100644 --- a/graphiti_core/prompts/prompt_helpers.py +++ b/graphiti_core/prompts/prompt_helpers.py @@ -4,20 +4,20 @@ from typing import Any DO_NOT_ESCAPE_UNICODE = '\nDo not escape unicode characters.\n' -def to_prompt_json(data: Any, ensure_ascii: bool = True, indent: int = 2) -> str: +def to_prompt_json(data: Any, ensure_ascii: bool = False, indent: int = 2) -> str: """ Serialize data to JSON for use in prompts. Args: data: The data to serialize - ensure_ascii: If True, escape non-ASCII characters. If False, preserve them. + ensure_ascii: If True, escape non-ASCII characters. If False (default), preserve them. indent: Number of spaces for indentation Returns: JSON string representation of the data Notes: - When ensure_ascii=False, non-ASCII characters (e.g., Korean, Japanese, Chinese) + By default (ensure_ascii=False), non-ASCII characters (e.g., Korean, Japanese, Chinese) are preserved in their original form in the prompt, making them readable in LLM logs and improving model understanding. """ diff --git a/graphiti_core/prompts/summarize_nodes.py b/graphiti_core/prompts/summarize_nodes.py index c9dcea34..5b595dc2 100644 --- a/graphiti_core/prompts/summarize_nodes.py +++ b/graphiti_core/prompts/summarize_nodes.py @@ -59,7 +59,7 @@ def summarize_pair(context: dict[str, Any]) -> list[Message]: Summaries must be under 250 words. Summaries: - {to_prompt_json(context['node_summaries'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json(context['node_summaries'], indent=2)} """, ), ] @@ -76,8 +76,8 @@ def summarize_context(context: dict[str, Any]) -> list[Message]: content=f""" - {to_prompt_json(context['previous_episodes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} - {to_prompt_json(context['episode_content'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json(context['previous_episodes'], indent=2)} + {to_prompt_json(context['episode_content'], indent=2)} Given the above MESSAGES and the following ENTITY name, create a summary for the ENTITY. Your summary must only use @@ -100,7 +100,7 @@ def summarize_context(context: dict[str, Any]) -> list[Message]: - {to_prompt_json(context['attributes'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json(context['attributes'], indent=2)} """, ), @@ -120,7 +120,7 @@ def summary_description(context: dict[str, Any]) -> list[Message]: Summaries must be under 250 words. Summary: - {to_prompt_json(context['summary'], ensure_ascii=context.get('ensure_ascii', True), indent=2)} + {to_prompt_json(context['summary'], indent=2)} """, ), ] diff --git a/graphiti_core/search/search_helpers.py b/graphiti_core/search/search_helpers.py index 2e3a1c50..620f8ceb 100644 --- a/graphiti_core/search/search_helpers.py +++ b/graphiti_core/search/search_helpers.py @@ -24,9 +24,7 @@ def format_edge_date_range(edge: EntityEdge) -> str: return f'{edge.valid_at if edge.valid_at else "date unknown"} - {(edge.invalid_at if edge.invalid_at else "present")}' -def search_results_to_context_string( - search_results: SearchResults, ensure_ascii: bool = False -) -> str: +def search_results_to_context_string(search_results: SearchResults) -> str: """Reformats a set of SearchResults into a single string to pass directly to an LLM as context""" fact_json = [ { @@ -58,16 +56,16 @@ def search_results_to_context_string( These are the most relevant facts and their valid and invalid dates. Facts are considered valid between their valid_at and invalid_at dates. Facts with an invalid_at date of "Present" are considered valid. - {to_prompt_json(fact_json, ensure_ascii=ensure_ascii, indent=12)} + {to_prompt_json(fact_json, indent=12)} - {to_prompt_json(entity_json, ensure_ascii=ensure_ascii, indent=12)} + {to_prompt_json(entity_json, indent=12)} - {to_prompt_json(episode_json, ensure_ascii=ensure_ascii, indent=12)} + {to_prompt_json(episode_json, indent=12)} - {to_prompt_json(community_json, ensure_ascii=ensure_ascii, indent=12)} + {to_prompt_json(community_json, indent=12)} """ diff --git a/graphiti_core/utils/bulk_utils.py b/graphiti_core/utils/bulk_utils.py index 321cbd33..dfbcb109 100644 --- a/graphiti_core/utils/bulk_utils.py +++ b/graphiti_core/utils/bulk_utils.py @@ -479,7 +479,6 @@ async def dedupe_edges_bulk( episode, edge_types, set(edge_types), - clients.ensure_ascii, ) for episode, edge, candidates in dedupe_tuples ] diff --git a/graphiti_core/utils/maintenance/community_operations.py b/graphiti_core/utils/maintenance/community_operations.py index 260870bc..f9ebf9a6 100644 --- a/graphiti_core/utils/maintenance/community_operations.py +++ b/graphiti_core/utils/maintenance/community_operations.py @@ -131,13 +131,10 @@ def label_propagation(projection: dict[str, list[Neighbor]]) -> list[list[str]]: return clusters -async def summarize_pair( - llm_client: LLMClient, summary_pair: tuple[str, str], ensure_ascii: bool = True -) -> str: +async def summarize_pair(llm_client: LLMClient, summary_pair: tuple[str, str]) -> str: # Prepare context for LLM context = { 'node_summaries': [{'summary': summary} for summary in summary_pair], - 'ensure_ascii': ensure_ascii, } llm_response = await llm_client.generate_response( @@ -149,12 +146,9 @@ async def summarize_pair( return pair_summary -async def generate_summary_description( - llm_client: LLMClient, summary: str, ensure_ascii: bool = True -) -> str: +async def generate_summary_description(llm_client: LLMClient, summary: str) -> str: context = { 'summary': summary, - 'ensure_ascii': ensure_ascii, } llm_response = await llm_client.generate_response( @@ -168,7 +162,7 @@ async def generate_summary_description( async def build_community( - llm_client: LLMClient, community_cluster: list[EntityNode], ensure_ascii: bool = True + llm_client: LLMClient, community_cluster: list[EntityNode] ) -> tuple[CommunityNode, list[CommunityEdge]]: summaries = [entity.summary for entity in community_cluster] length = len(summaries) @@ -180,9 +174,7 @@ async def build_community( new_summaries: list[str] = list( await semaphore_gather( *[ - summarize_pair( - llm_client, (str(left_summary), str(right_summary)), ensure_ascii - ) + summarize_pair(llm_client, (str(left_summary), str(right_summary))) for left_summary, right_summary in zip( summaries[: int(length / 2)], summaries[int(length / 2) :], strict=False ) @@ -195,7 +187,7 @@ async def build_community( length = len(summaries) summary = summaries[0] - name = await generate_summary_description(llm_client, summary, ensure_ascii) + name = await generate_summary_description(llm_client, summary) now = utc_now() community_node = CommunityNode( name=name, @@ -215,7 +207,6 @@ async def build_communities( driver: GraphDriver, llm_client: LLMClient, group_ids: list[str] | None, - ensure_ascii: bool = True, ) -> tuple[list[CommunityNode], list[CommunityEdge]]: community_clusters = await get_community_clusters(driver, group_ids) @@ -223,7 +214,7 @@ async def build_communities( async def limited_build_community(cluster): async with semaphore: - return await build_community(llm_client, cluster, ensure_ascii) + return await build_community(llm_client, cluster) communities: list[tuple[CommunityNode, list[CommunityEdge]]] = list( await semaphore_gather( @@ -312,17 +303,14 @@ async def update_community( llm_client: LLMClient, embedder: EmbedderClient, entity: EntityNode, - ensure_ascii: bool = True, ) -> tuple[list[CommunityNode], list[CommunityEdge]]: community, is_new = await determine_entity_community(driver, entity) if community is None: return [], [] - new_summary = await summarize_pair( - llm_client, (entity.summary, community.summary), ensure_ascii - ) - new_name = await generate_summary_description(llm_client, new_summary, ensure_ascii) + new_summary = await summarize_pair(llm_client, (entity.summary, community.summary)) + new_name = await generate_summary_description(llm_client, new_summary) community.summary = new_summary community.name = new_name diff --git a/graphiti_core/utils/maintenance/edge_operations.py b/graphiti_core/utils/maintenance/edge_operations.py index 60a35357..a6760a40 100644 --- a/graphiti_core/utils/maintenance/edge_operations.py +++ b/graphiti_core/utils/maintenance/edge_operations.py @@ -130,7 +130,6 @@ async def extract_edges( 'reference_time': episode.valid_at, 'edge_types': edge_types_context, 'custom_prompt': '', - 'ensure_ascii': clients.ensure_ascii, } facts_missed = True @@ -358,7 +357,6 @@ async def resolve_extracted_edges( episode, extracted_edge_types, custom_type_names, - clients.ensure_ascii, ) for extracted_edge, related_edges, existing_edges, extracted_edge_types in zip( extracted_edges, @@ -431,7 +429,6 @@ async def resolve_extracted_edge( episode: EpisodicNode, edge_type_candidates: dict[str, type[BaseModel]] | None = None, custom_edge_type_names: set[str] | None = None, - ensure_ascii: bool = True, ) -> tuple[EntityEdge, list[EntityEdge], list[EntityEdge]]: """Resolve an extracted edge against existing graph context. @@ -453,8 +450,6 @@ async def resolve_extracted_edge( Full catalog of registered custom edge names. Used to distinguish between disallowed custom types (which fall back to the default label) and ad-hoc labels emitted by the LLM. - ensure_ascii : bool - Whether prompt payloads should coerce ASCII output. Returns ------- @@ -504,7 +499,6 @@ async def resolve_extracted_edge( 'new_edge': extracted_edge.fact, 'edge_invalidation_candidates': invalidation_edge_candidates_context, 'edge_types': edge_types_context, - 'ensure_ascii': ensure_ascii, } llm_response = await llm_client.generate_response( @@ -548,7 +542,6 @@ async def resolve_extracted_edge( 'episode_content': episode.content, 'reference_time': episode.valid_at, 'fact': resolved_edge.fact, - 'ensure_ascii': ensure_ascii, } edge_model = edge_type_candidates.get(fact_type) if edge_type_candidates else None diff --git a/graphiti_core/utils/maintenance/node_operations.py b/graphiti_core/utils/maintenance/node_operations.py index 758349c6..7f85b52c 100644 --- a/graphiti_core/utils/maintenance/node_operations.py +++ b/graphiti_core/utils/maintenance/node_operations.py @@ -64,14 +64,12 @@ async def extract_nodes_reflexion( episode: EpisodicNode, previous_episodes: list[EpisodicNode], node_names: list[str], - ensure_ascii: bool = False, ) -> list[str]: # Prepare context for LLM context = { 'episode_content': episode.content, 'previous_episodes': [ep.content for ep in previous_episodes], 'extracted_entities': node_names, - 'ensure_ascii': ensure_ascii, } llm_response = await llm_client.generate_response( @@ -124,7 +122,6 @@ async def extract_nodes( 'custom_prompt': custom_prompt, 'entity_types': entity_types_context, 'source_description': episode.source_description, - 'ensure_ascii': clients.ensure_ascii, } while entities_missed and reflexion_iterations <= MAX_REFLEXION_ITERATIONS: @@ -155,7 +152,6 @@ async def extract_nodes( episode, previous_episodes, [entity.name for entity in extracted_entities], - clients.ensure_ascii, ) entities_missed = len(missing_entities) != 0 @@ -239,7 +235,6 @@ async def _resolve_with_llm( extracted_nodes: list[EntityNode], indexes: DedupCandidateIndexes, state: DedupResolutionState, - ensure_ascii: bool, episode: EpisodicNode | None, previous_episodes: list[EpisodicNode] | None, entity_types: dict[str, type[BaseModel]] | None, @@ -309,7 +304,6 @@ async def _resolve_with_llm( 'previous_episodes': ( [ep.content for ep in previous_episodes] if previous_episodes is not None else [] ), - 'ensure_ascii': ensure_ascii, } llm_response = await llm_client.generate_response( @@ -416,7 +410,6 @@ async def resolve_extracted_nodes( extracted_nodes, indexes, state, - clients.ensure_ascii, episode, previous_episodes, entity_types, @@ -465,7 +458,6 @@ async def extract_attributes_from_nodes( if entity_types is not None else None ), - clients.ensure_ascii, should_summarize_node, ) for node in nodes @@ -483,7 +475,6 @@ async def extract_attributes_from_node( episode: EpisodicNode | None = None, previous_episodes: list[EpisodicNode] | None = None, entity_type: type[BaseModel] | None = None, - ensure_ascii: bool = False, should_summarize_node: NodeSummaryFilter | None = None, ) -> EntityNode: node_context: dict[str, Any] = { @@ -499,7 +490,6 @@ async def extract_attributes_from_node( 'previous_episodes': ( [ep.content for ep in previous_episodes] if previous_episodes is not None else [] ), - 'ensure_ascii': ensure_ascii, } summary_context: dict[str, Any] = { @@ -508,7 +498,6 @@ async def extract_attributes_from_node( 'previous_episodes': ( [ep.content for ep in previous_episodes] if previous_episodes is not None else [] ), - 'ensure_ascii': ensure_ascii, } has_entity_attributes: bool = bool( diff --git a/graphiti_core/utils/maintenance/temporal_operations.py b/graphiti_core/utils/maintenance/temporal_operations.py index b53ec2f8..1f64decb 100644 --- a/graphiti_core/utils/maintenance/temporal_operations.py +++ b/graphiti_core/utils/maintenance/temporal_operations.py @@ -35,14 +35,12 @@ async def extract_edge_dates( edge: EntityEdge, current_episode: EpisodicNode, previous_episodes: list[EpisodicNode], - ensure_ascii: bool = False, ) -> tuple[datetime | None, datetime | None]: context = { 'edge_fact': edge.fact, 'current_episode': current_episode.content, 'previous_episodes': [ep.content for ep in previous_episodes], 'reference_timestamp': current_episode.valid_at.isoformat(), - 'ensure_ascii': ensure_ascii, } llm_response = await llm_client.generate_response( prompt_library.extract_edge_dates.v1(context), response_model=EdgeDates @@ -75,7 +73,6 @@ async def get_edge_contradictions( llm_client: LLMClient, new_edge: EntityEdge, existing_edges: list[EntityEdge], - ensure_ascii: bool = False, ) -> list[EntityEdge]: start = time() @@ -87,7 +84,6 @@ async def get_edge_contradictions( context = { 'new_edge': new_edge_context, 'existing_edges': existing_edge_context, - 'ensure_ascii': ensure_ascii, } llm_response = await llm_client.generate_response( diff --git a/tests/utils/maintenance/test_bulk_utils.py b/tests/utils/maintenance/test_bulk_utils.py index 2616c79a..643071ae 100644 --- a/tests/utils/maintenance/test_bulk_utils.py +++ b/tests/utils/maintenance/test_bulk_utils.py @@ -34,7 +34,6 @@ def _make_clients() -> GraphitiClients: embedder=embedder, cross_encoder=cross_encoder, llm_client=llm_client, - ensure_ascii=False, ) @@ -260,7 +259,6 @@ async def test_dedupe_edges_bulk_deduplicates_within_episode(monkeypatch): episode, edge_type_candidates=None, custom_edge_type_names=None, - ensure_ascii=False, ): # Track that this edge was compared against the related_edges comparisons_made.append((extracted_edge.uuid, [r.uuid for r in related_edges])) diff --git a/tests/utils/maintenance/test_edge_operations.py b/tests/utils/maintenance/test_edge_operations.py index 0395e7a3..b5d01e54 100644 --- a/tests/utils/maintenance/test_edge_operations.py +++ b/tests/utils/maintenance/test_edge_operations.py @@ -143,7 +143,6 @@ async def test_resolve_extracted_edge_exact_fact_short_circuit( mock_existing_edges, mock_current_episode, edge_type_candidates=None, - ensure_ascii=True, ) assert resolved_edge is related_edges[0] @@ -184,7 +183,6 @@ async def test_resolve_extracted_edges_resets_unmapped_names(monkeypatch): llm_client=llm_client, embedder=MagicMock(), cross_encoder=MagicMock(), - ensure_ascii=True, ) source_node = EntityNode( @@ -265,7 +263,6 @@ async def test_resolve_extracted_edges_keeps_unknown_names(monkeypatch): llm_client=llm_client, embedder=MagicMock(), cross_encoder=MagicMock(), - ensure_ascii=True, ) source_node = EntityNode( @@ -369,7 +366,6 @@ async def test_resolve_extracted_edge_rejects_unmapped_fact_type(mock_llm_client episode, edge_type_candidates={}, custom_edge_type_names={'OCCURRED_AT'}, - ensure_ascii=True, ) assert resolved_edge.name == DEFAULT_EDGE_NAME @@ -427,7 +423,6 @@ async def test_resolve_extracted_edge_accepts_unknown_fact_type(mock_llm_client) episode, edge_type_candidates={'OCCURRED_AT': OccurredAtEdge}, custom_edge_type_names={'OCCURRED_AT'}, - ensure_ascii=True, ) assert resolved_edge.name == 'INTERACTED_WITH' @@ -515,7 +510,6 @@ async def test_resolve_extracted_edge_uses_integer_indices_for_duplicates(mock_l episode, edge_type_candidates=None, custom_edge_type_names=set(), - ensure_ascii=True, ) # Verify LLM was called @@ -553,7 +547,6 @@ async def test_resolve_extracted_edges_fast_path_deduplication(monkeypatch): episode, edge_type_candidates=None, custom_edge_type_names=None, - ensure_ascii=False, ): nonlocal resolve_call_count resolve_call_count += 1 @@ -576,7 +569,6 @@ async def test_resolve_extracted_edges_fast_path_deduplication(monkeypatch): llm_client=llm_client, embedder=MagicMock(), cross_encoder=MagicMock(), - ensure_ascii=True, ) source_node = EntityNode( diff --git a/tests/utils/maintenance/test_node_operations.py b/tests/utils/maintenance/test_node_operations.py index c144e1d2..927a287d 100644 --- a/tests/utils/maintenance/test_node_operations.py +++ b/tests/utils/maintenance/test_node_operations.py @@ -46,7 +46,6 @@ def _make_clients(): embedder=embedder, cross_encoder=cross_encoder, llm_client=llm_client, - ensure_ascii=False, ) return clients, llm_generate @@ -335,7 +334,6 @@ async def test_resolve_with_llm_updates_unresolved(monkeypatch): [extracted], indexes, state, - ensure_ascii=False, episode=_make_episode(), previous_episodes=[], entity_types=None, @@ -380,7 +378,6 @@ async def test_resolve_with_llm_ignores_out_of_range_relative_ids(monkeypatch, c [extracted], indexes, state, - ensure_ascii=False, episode=_make_episode(), previous_episodes=[], entity_types=None, @@ -428,7 +425,6 @@ async def test_resolve_with_llm_ignores_duplicate_relative_ids(monkeypatch): [extracted], indexes, state, - ensure_ascii=False, episode=_make_episode(), previous_episodes=[], entity_types=None, @@ -470,7 +466,6 @@ async def test_resolve_with_llm_invalid_duplicate_idx_defaults_to_extracted(monk [extracted], indexes, state, - ensure_ascii=False, episode=_make_episode(), previous_episodes=[], entity_types=None, @@ -498,7 +493,6 @@ async def test_extract_attributes_without_callback_generates_summary(): episode=episode, previous_episodes=[], entity_type=None, - ensure_ascii=False, should_summarize_node=None, # No callback provided ) @@ -529,7 +523,6 @@ async def test_extract_attributes_with_callback_skip_summary(): episode=episode, previous_episodes=[], entity_type=None, - ensure_ascii=False, should_summarize_node=skip_summary_filter, ) @@ -560,7 +553,6 @@ async def test_extract_attributes_with_callback_generate_summary(): episode=episode, previous_episodes=[], entity_type=None, - ensure_ascii=False, should_summarize_node=generate_summary_filter, ) @@ -595,7 +587,6 @@ async def test_extract_attributes_with_selective_callback(): episode=episode, previous_episodes=[], entity_type=None, - ensure_ascii=False, should_summarize_node=selective_filter, ) @@ -605,7 +596,6 @@ async def test_extract_attributes_with_selective_callback(): episode=episode, previous_episodes=[], entity_type=None, - ensure_ascii=False, should_summarize_node=selective_filter, ) From 4a307dbf10b8de20b0f8429785ef8ce2956c452c Mon Sep 17 00:00:00 2001 From: Daniel Chalef <131175+danielchalef@users.noreply.github.com> Date: Thu, 2 Oct 2025 17:07:43 -0700 Subject: [PATCH 3/4] Optimize edge deduplication prompt for caching and clarity (#970) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Optimize edge deduplication prompt for caching and clarity - Restructure prompt to place invariant instructions at top and dynamic context at bottom for better LLM caching - Change 'id' to 'idx' in edge context lists to avoid confusion with other identifiers - Remove 'fact_type_id' from edge types context as LLM only needs fact_type_name - Remove dynamic range values from prompt instructions (e.g., "range 0-N") - Add debug logging before LLM call to track input sizes - Add validation logging after LLM response to catch invalid idx values - Clarify that duplicate_facts uses EXISTING FACTS idx and contradicted_facts uses INVALIDATION CANDIDATES idx 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude * Address terminology consistency and edge case logging - Update Pydantic field descriptions to use 'idx' instead of 'ids' for consistency - Fix debug logging to handle empty list edge case (avoid 'idx 0--1' display) Note on review feedback: - Validation is intentionally non-redundant: warnings provide visibility, list comprehensions ensure robustness - WARNING level is appropriate for LLM output issues (not system errors) - Existing test coverage is sufficient for this defensive logging addition 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --------- Co-authored-by: Claude --- graphiti_core/prompts/dedupe_edges.py | 67 +++++++++++-------- .../utils/maintenance/edge_operations.py | 34 ++++++++-- 2 files changed, 69 insertions(+), 32 deletions(-) diff --git a/graphiti_core/prompts/dedupe_edges.py b/graphiti_core/prompts/dedupe_edges.py index 35acb31f..c5b55427 100644 --- a/graphiti_core/prompts/dedupe_edges.py +++ b/graphiti_core/prompts/dedupe_edges.py @@ -25,11 +25,11 @@ from .prompt_helpers import to_prompt_json class EdgeDuplicate(BaseModel): duplicate_facts: list[int] = Field( ..., - description='List of ids of any duplicate facts. If no duplicate facts are found, default to empty list.', + description='List of idx values of any duplicate facts. If no duplicate facts are found, default to empty list.', ) contradicted_facts: list[int] = Field( ..., - description='List of ids of facts that should be invalidated. If no facts should be invalidated, the list should be empty.', + description='List of idx values of facts that should be invalidated. If no facts should be invalidated, the list should be empty.', ) fact_type: str = Field(..., description='One of the provided fact types or DEFAULT') @@ -124,37 +124,48 @@ def resolve_edge(context: dict[str, Any]) -> list[Message]: Message( role='user', content=f""" - - {context['new_edge']} - - - - {context['existing_edges']} - - - {context['edge_invalidation_candidates']} - - - - {context['edge_types']} - - - Task: - If the NEW FACT represents identical factual information of one or more in EXISTING FACTS, return the idx of the duplicate facts. - Facts with similar information that contain key differences should not be marked as duplicates. - If the NEW FACT is not a duplicate of any of the EXISTING FACTS, return an empty list. - - Given the predefined FACT TYPES, determine if the NEW FACT should be classified as one of these types. - Return the fact type as fact_type or DEFAULT if NEW FACT is not one of the FACT TYPES. - - Based on the provided FACT INVALIDATION CANDIDATES and NEW FACT, determine which existing facts the new fact contradicts. - Return a list containing all idx's of the facts that are contradicted by the NEW FACT. - If there are no contradicted facts, return an empty list. + You will receive TWO separate lists of facts. Each list uses 'idx' as its index field, starting from 0. + + 1. DUPLICATE DETECTION: + - If the NEW FACT represents identical factual information as any fact in EXISTING FACTS, return those idx values in duplicate_facts. + - Facts with similar information that contain key differences should NOT be marked as duplicates. + - Return idx values from EXISTING FACTS. + - If no duplicates, return an empty list for duplicate_facts. + + 2. FACT TYPE CLASSIFICATION: + - Given the predefined FACT TYPES, determine if the NEW FACT should be classified as one of these types. + - Return the fact type as fact_type or DEFAULT if NEW FACT is not one of the FACT TYPES. + + 3. CONTRADICTION DETECTION: + - Based on FACT INVALIDATION CANDIDATES and NEW FACT, determine which facts the new fact contradicts. + - Return idx values from FACT INVALIDATION CANDIDATES. + - If no contradictions, return an empty list for contradicted_facts. + + IMPORTANT: + - duplicate_facts: Use ONLY 'idx' values from EXISTING FACTS + - contradicted_facts: Use ONLY 'idx' values from FACT INVALIDATION CANDIDATES + - These are two separate lists with independent idx ranges starting from 0 Guidelines: 1. Some facts may be very similar but will have key differences, particularly around numeric values in the facts. Do not mark these facts as duplicates. + + + {context['edge_types']} + + + + {context['existing_edges']} + + + + {context['edge_invalidation_candidates']} + + + + {context['new_edge']} + """, ), ] diff --git a/graphiti_core/utils/maintenance/edge_operations.py b/graphiti_core/utils/maintenance/edge_operations.py index a6760a40..3e8f6990 100644 --- a/graphiti_core/utils/maintenance/edge_operations.py +++ b/graphiti_core/utils/maintenance/edge_operations.py @@ -475,20 +475,19 @@ async def resolve_extracted_edge( start = time() # Prepare context for LLM - related_edges_context = [{'id': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)] + related_edges_context = [{'idx': i, 'fact': edge.fact} for i, edge in enumerate(related_edges)] invalidation_edge_candidates_context = [ - {'id': i, 'fact': existing_edge.fact} for i, existing_edge in enumerate(existing_edges) + {'idx': i, 'fact': existing_edge.fact} for i, existing_edge in enumerate(existing_edges) ] edge_types_context = ( [ { - 'fact_type_id': i, 'fact_type_name': type_name, 'fact_type_description': type_model.__doc__, } - for i, (type_name, type_model) in enumerate(edge_type_candidates.items()) + for type_name, type_model in edge_type_candidates.items() ] if edge_type_candidates is not None else [] @@ -501,6 +500,15 @@ async def resolve_extracted_edge( 'edge_types': edge_types_context, } + if related_edges or existing_edges: + logger.debug( + 'Resolving edge: sent %d EXISTING FACTS%s and %d INVALIDATION CANDIDATES%s', + len(related_edges), + f' (idx 0-{len(related_edges) - 1})' if related_edges else '', + len(existing_edges), + f' (idx 0-{len(existing_edges) - 1})' if existing_edges else '', + ) + llm_response = await llm_client.generate_response( prompt_library.dedupe_edges.resolve_edge(context), response_model=EdgeDuplicate, @@ -509,6 +517,15 @@ async def resolve_extracted_edge( response_object = EdgeDuplicate(**llm_response) duplicate_facts = response_object.duplicate_facts + # Validate duplicate_facts are in valid range for EXISTING FACTS + invalid_duplicates = [i for i in duplicate_facts if i < 0 or i >= len(related_edges)] + if invalid_duplicates: + logger.warning( + 'LLM returned invalid duplicate_facts idx values %s (valid range: 0-%d for EXISTING FACTS)', + invalid_duplicates, + len(related_edges) - 1, + ) + duplicate_fact_ids: list[int] = [i for i in duplicate_facts if 0 <= i < len(related_edges)] resolved_edge = extracted_edge @@ -521,6 +538,15 @@ async def resolve_extracted_edge( contradicted_facts: list[int] = response_object.contradicted_facts + # Validate contradicted_facts are in valid range for INVALIDATION CANDIDATES + invalid_contradictions = [i for i in contradicted_facts if i < 0 or i >= len(existing_edges)] + if invalid_contradictions: + logger.warning( + 'LLM returned invalid contradicted_facts idx values %s (valid range: 0-%d for INVALIDATION CANDIDATES)', + invalid_contradictions, + len(existing_edges) - 1, + ) + invalidation_candidates: list[EntityEdge] = [ existing_edges[i] for i in contradicted_facts if 0 <= i < len(existing_edges) ] From 590282524a26eaa77685cc2f4929737bfdbb6ddb Mon Sep 17 00:00:00 2001 From: Daniel Chalef <131175+danielchalef@users.noreply.github.com> Date: Thu, 2 Oct 2025 22:45:11 -0700 Subject: [PATCH 4/4] fix: Improve edge extraction entity ID validation (#968) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: Improve edge extraction entity ID validation Fixes invalid entity ID references in edge extraction that caused warnings like: "WARNING: source or target node not filled WILL_FIND. source_node_uuid: 23 and target_node_uuid: 3" Changes: - Format ENTITIES list as proper JSON in prompt for better LLM parsing - Clarify field descriptions to reference entity id from ENTITIES list - Add explicit entity ID validation as #1 extraction rule with examples - Improve error logging (removed PII, added entity count and valid range) These changes follow patterns from extract_nodes.py and dedupe_nodes.py where entity referencing works reliably. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude * wip * fix: Align fact field naming and add description - Change extraction rule to reference 'fact' instead of 'fact_text' - Add descriptive text for fact field in Edge model * fix: Remove ensure_ascii parameter from to_prompt_json call Align with other to_prompt_json calls that don't use ensure_ascii * fix: Use validated target_node_idx variable consistently Line 190 was using raw edge_data.target_entity_id instead of the validated target_node_idx variable, creating inconsistency with line 189 * fix: Improve edge extraction validation checks - Add explicit check for empty nodes list - Use more explicit 0 <= idx comparison instead of -1 < idx - Prevents nonsensical error message when no entities provided * chore: Restore uv.lock from main branch Previously deleted in commit 7e4464b, now restored to match main branch state * Update uv.lock --------- Co-authored-by: Claude --- graphiti_core/prompts/extract_edges.py | 20 +++++++++++++------ .../utils/maintenance/edge_operations.py | 13 +++++++++--- uv.lock | 6 ------ 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/graphiti_core/prompts/extract_edges.py b/graphiti_core/prompts/extract_edges.py index 21f68709..28d9bddc 100644 --- a/graphiti_core/prompts/extract_edges.py +++ b/graphiti_core/prompts/extract_edges.py @@ -24,9 +24,16 @@ from .prompt_helpers import to_prompt_json class Edge(BaseModel): relation_type: str = Field(..., description='FACT_PREDICATE_IN_SCREAMING_SNAKE_CASE') - source_entity_id: int = Field(..., description='The id of the source entity of the fact.') - target_entity_id: int = Field(..., description='The id of the target entity of the fact.') - fact: str = Field(..., description='') + source_entity_id: int = Field( + ..., description='The id of the source entity from the ENTITIES list' + ) + target_entity_id: int = Field( + ..., description='The id of the target entity from the ENTITIES list' + ) + fact: str = Field( + ..., + description='A natural language description of the relationship between the entities, paraphrased from the source text', + ) valid_at: str | None = Field( None, description='The date and time when the relationship described by the edge fact became true or was established. Use ISO 8601 format (YYYY-MM-DDTHH:MM:SS.SSSSSSZ)', @@ -81,7 +88,7 @@ def edge(context: dict[str, Any]) -> list[Message]: -{context['nodes']} +{to_prompt_json(context['nodes'], indent=2)} @@ -107,11 +114,12 @@ You may use information from the PREVIOUS MESSAGES only to disambiguate referenc # EXTRACTION RULES -1. Only emit facts where both the subject and object match IDs in ENTITIES. +1. **Entity ID Validation**: `source_entity_id` and `target_entity_id` must use only the `id` values from the ENTITIES list provided above. + - **CRITICAL**: Using IDs not in the list will cause the edge to be rejected 2. Each fact must involve two **distinct** entities. 3. Use a SCREAMING_SNAKE_CASE string as the `relation_type` (e.g., FOUNDED, WORKS_AT). 4. Do not emit duplicate or semantically redundant facts. -5. The `fact_text` should closely paraphrase the original source sentence(s). Do not verbatim quote the original text. +5. The `fact` should closely paraphrase the original source sentence(s). Do not verbatim quote the original text. 6. Use `REFERENCE_TIME` to resolve vague or relative temporal expressions (e.g., "last week"). 7. Do **not** hallucinate or infer temporal bounds from unrelated events. diff --git a/graphiti_core/utils/maintenance/edge_operations.py b/graphiti_core/utils/maintenance/edge_operations.py index 3e8f6990..70300041 100644 --- a/graphiti_core/utils/maintenance/edge_operations.py +++ b/graphiti_core/utils/maintenance/edge_operations.py @@ -179,13 +179,20 @@ async def extract_edges( source_node_idx = edge_data.source_entity_id target_node_idx = edge_data.target_entity_id - if not (-1 < source_node_idx < len(nodes) and -1 < target_node_idx < len(nodes)): + + if len(nodes) == 0: + logger.warning('No entities provided for edge extraction') + continue + + if not (0 <= source_node_idx < len(nodes) and 0 <= target_node_idx < len(nodes)): logger.warning( - f'WARNING: source or target node not filled {edge_data.relation_type}. source_node_uuid: {source_node_idx} and target_node_uuid: {target_node_idx} ' + f'Invalid entity IDs in edge extraction for {edge_data.relation_type}. ' + f'source_entity_id: {source_node_idx}, target_entity_id: {target_node_idx}, ' + f'but only {len(nodes)} entities available (valid range: 0-{len(nodes) - 1})' ) continue source_node_uuid = nodes[source_node_idx].uuid - target_node_uuid = nodes[edge_data.target_entity_id].uuid + target_node_uuid = nodes[target_node_idx].uuid if valid_at: try: diff --git a/uv.lock b/uv.lock index 7f362eec..8f181dc9 100644 --- a/uv.lock +++ b/uv.lock @@ -803,7 +803,6 @@ anthropic = [ ] dev = [ { name = "anthropic" }, - { name = "boto3" }, { name = "diskcache-stubs" }, { name = "falkordb" }, { name = "google-genai" }, @@ -812,11 +811,9 @@ dev = [ { name = "jupyterlab" }, { name = "kuzu" }, { name = "langchain-anthropic" }, - { name = "langchain-aws" }, { name = "langchain-openai" }, { name = "langgraph" }, { name = "langsmith" }, - { name = "opensearch-py" }, { name = "pyright" }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -858,7 +855,6 @@ voyageai = [ requires-dist = [ { name = "anthropic", marker = "extra == 'anthropic'", specifier = ">=0.49.0" }, { name = "anthropic", marker = "extra == 'dev'", specifier = ">=0.49.0" }, - { name = "boto3", marker = "extra == 'dev'", specifier = ">=1.39.16" }, { name = "boto3", marker = "extra == 'neo4j-opensearch'", specifier = ">=1.39.16" }, { name = "boto3", marker = "extra == 'neptune'", specifier = ">=1.39.16" }, { name = "diskcache", specifier = ">=5.6.3" }, @@ -874,7 +870,6 @@ requires-dist = [ { name = "kuzu", marker = "extra == 'dev'", specifier = ">=0.11.2" }, { name = "kuzu", marker = "extra == 'kuzu'", specifier = ">=0.11.2" }, { name = "langchain-anthropic", marker = "extra == 'dev'", specifier = ">=0.2.4" }, - { name = "langchain-aws", marker = "extra == 'dev'", specifier = ">=0.2.29" }, { name = "langchain-aws", marker = "extra == 'neptune'", specifier = ">=0.2.29" }, { name = "langchain-openai", marker = "extra == 'dev'", specifier = ">=0.2.6" }, { name = "langgraph", marker = "extra == 'dev'", specifier = ">=0.2.15" }, @@ -882,7 +877,6 @@ requires-dist = [ { name = "neo4j", specifier = ">=5.26.0" }, { name = "numpy", specifier = ">=1.0.0" }, { name = "openai", specifier = ">=1.91.0" }, - { name = "opensearch-py", marker = "extra == 'dev'", specifier = ">=3.0.0" }, { name = "opensearch-py", marker = "extra == 'neo4j-opensearch'", specifier = ">=3.0.0" }, { name = "opensearch-py", marker = "extra == 'neptune'", specifier = ">=3.0.0" }, { name = "posthog", specifier = ">=3.0.0" },