From 8770012745bc0b3bcfe4bd22c8497f118e099f2e Mon Sep 17 00:00:00 2001 From: Daniel Chalef <131175+danielchalef@users.noreply.github.com> Date: Sat, 4 Oct 2025 19:06:32 -0700 Subject: [PATCH] Refactor prompt structure: move MESSAGES after instructions (#980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Refactor prompt structure: move MESSAGES after instructions Reordered prompt structure in extract_nodes.py to place MESSAGES section after instructions/guidelines in both extract_attributes and extract_summary functions for improved prompt clarity. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude * Add sentence-aware text truncator for entity summaries - Created truncate_at_sentence() utility function that truncates text at sentence boundaries while respecting max character limits - Added MAX_SUMMARY_CHARS constant (250 chars) for entity summaries - Applied truncator to entity summaries in prompts (extract_nodes.py) - Applied truncator to LLM-generated summaries (node_operations.py) - Added comprehensive test suite for truncation logic 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude * Clean up formatting in extract_attributes prompt - Remove extra blank lines - Fix indentation of MESSAGES tag 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude * Bump version to 0.22.0pre3 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --------- Co-authored-by: Claude --- graphiti_core/prompts/extract_nodes.py | 26 ++--- .../utils/maintenance/node_operations.py | 5 +- graphiti_core/utils/text_utils.py | 53 +++++++++ pyproject.toml | 2 +- tests/test_text_utils.py | 106 ++++++++++++++++++ 5 files changed, 176 insertions(+), 16 deletions(-) create mode 100644 graphiti_core/utils/text_utils.py create mode 100644 tests/test_text_utils.py diff --git a/graphiti_core/prompts/extract_nodes.py b/graphiti_core/prompts/extract_nodes.py index 29e99978..8e85c7a6 100644 --- a/graphiti_core/prompts/extract_nodes.py +++ b/graphiti_core/prompts/extract_nodes.py @@ -18,6 +18,8 @@ from typing import Any, Protocol, TypedDict from pydantic import BaseModel, Field +from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS + from .models import Message, PromptFunction, PromptVersion from .prompt_helpers import to_prompt_json from .snippets import summary_instructions @@ -57,7 +59,7 @@ class EntityClassification(BaseModel): class EntitySummary(BaseModel): summary: str = Field( ..., - description='Summary containing the important information about the entity. Under 250 characters.', + description=f'Summary containing the important information about the entity. Under {MAX_SUMMARY_CHARS} characters.', ) @@ -259,18 +261,17 @@ def extract_attributes(context: dict[str, Any]) -> list[Message]: Message( role='user', content=f""" - - - {to_prompt_json(context['previous_episodes'], indent=2)} - {to_prompt_json(context['episode_content'], indent=2)} - - - Given the above MESSAGES and the following ENTITY, update any of its attributes based on the information provided + Given the MESSAGES and the following ENTITY, update any of its attributes based on the information provided in MESSAGES. Use the provided attribute descriptions to better understand how each attribute should be determined. Guidelines: 1. Do not hallucinate entity property values if they cannot be found in the current context. 2. Only use the provided MESSAGES and ENTITY to set attribute values. + + + {to_prompt_json(context['previous_episodes'], indent=2)} + {to_prompt_json(context['episode_content'], indent=2)} + {context['node']} @@ -289,17 +290,16 @@ def extract_summary(context: dict[str, Any]) -> list[Message]: Message( role='user', content=f""" + Given the MESSAGES and the ENTITY, update the summary that combines relevant information about the entity + from the messages and relevant information from the existing summary. + + {summary_instructions} {to_prompt_json(context['previous_episodes'], indent=2)} {to_prompt_json(context['episode_content'], indent=2)} - Given the above MESSAGES and the following ENTITY, update the summary that combines relevant information about the entity - from the messages and relevant information from the existing summary. - - {summary_instructions} - {context['node']} diff --git a/graphiti_core/utils/maintenance/node_operations.py b/graphiti_core/utils/maintenance/node_operations.py index 8db44218..a23f30e7 100644 --- a/graphiti_core/utils/maintenance/node_operations.py +++ b/graphiti_core/utils/maintenance/node_operations.py @@ -53,6 +53,7 @@ from graphiti_core.utils.maintenance.dedup_helpers import ( from graphiti_core.utils.maintenance.edge_operations import ( filter_existing_duplicate_of_edges, ) +from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS, truncate_at_sentence logger = logging.getLogger(__name__) @@ -547,7 +548,7 @@ async def _extract_entity_summary( summary_context = _build_episode_context( node_data={ 'name': node.name, - 'summary': node.summary, + 'summary': truncate_at_sentence(node.summary, MAX_SUMMARY_CHARS), 'entity_types': node.labels, 'attributes': node.attributes, }, @@ -562,7 +563,7 @@ async def _extract_entity_summary( group_id=node.group_id, ) - node.summary = summary_response.get('summary', '') + node.summary = truncate_at_sentence(summary_response.get('summary', ''), MAX_SUMMARY_CHARS) def _build_episode_context( diff --git a/graphiti_core/utils/text_utils.py b/graphiti_core/utils/text_utils.py new file mode 100644 index 00000000..fc5240f0 --- /dev/null +++ b/graphiti_core/utils/text_utils.py @@ -0,0 +1,53 @@ +""" +Copyright 2024, Zep Software, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import re + +# Maximum length for entity/node summaries +MAX_SUMMARY_CHARS = 250 + + +def truncate_at_sentence(text: str, max_chars: int) -> str: + """ + Truncate text at or about max_chars while respecting sentence boundaries. + + Attempts to truncate at the last complete sentence before max_chars. + If no sentence boundary is found before max_chars, truncates at max_chars. + + Args: + text: The text to truncate + max_chars: Maximum number of characters + + Returns: + Truncated text + """ + if not text or len(text) <= max_chars: + return text + + # Find all sentence boundaries (., !, ?) up to max_chars + truncated = text[:max_chars] + + # Look for sentence boundaries: period, exclamation, or question mark followed by space or end + sentence_pattern = r'[.!?](?:\s|$)' + matches = list(re.finditer(sentence_pattern, truncated)) + + if matches: + # Truncate at the last sentence boundary found + last_match = matches[-1] + return text[: last_match.end()].rstrip() + + # No sentence boundary found, truncate at max_chars + return truncated.rstrip() diff --git a/pyproject.toml b/pyproject.toml index b9048692..ce7f2ab4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "graphiti-core" description = "A temporal graph building library" -version = "0.22.0pre2" +version = "0.22.0pre3" authors = [ { name = "Paul Paliychuk", email = "paul@getzep.com" }, { name = "Preston Rasmussen", email = "preston@getzep.com" }, diff --git a/tests/test_text_utils.py b/tests/test_text_utils.py new file mode 100644 index 00000000..38231d93 --- /dev/null +++ b/tests/test_text_utils.py @@ -0,0 +1,106 @@ +""" +Copyright 2024, Zep Software, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS, truncate_at_sentence + + +def test_truncate_at_sentence_short_text(): + """Test that short text is returned unchanged.""" + text = 'This is a short sentence.' + result = truncate_at_sentence(text, 100) + assert result == text + + +def test_truncate_at_sentence_empty(): + """Test that empty text is handled correctly.""" + assert truncate_at_sentence('', 100) == '' + assert truncate_at_sentence(None, 100) is None + + +def test_truncate_at_sentence_exact_length(): + """Test text at exactly max_chars.""" + text = 'A' * 100 + result = truncate_at_sentence(text, 100) + assert result == text + + +def test_truncate_at_sentence_with_period(): + """Test truncation at sentence boundary with period.""" + text = 'First sentence. Second sentence. Third sentence. Fourth sentence.' + result = truncate_at_sentence(text, 40) + assert result == 'First sentence. Second sentence.' + assert len(result) <= 40 + + +def test_truncate_at_sentence_with_question(): + """Test truncation at sentence boundary with question mark.""" + text = 'What is this? This is a test. More text here.' + result = truncate_at_sentence(text, 30) + assert result == 'What is this? This is a test.' + assert len(result) <= 32 + + +def test_truncate_at_sentence_with_exclamation(): + """Test truncation at sentence boundary with exclamation mark.""" + text = 'Hello world! This is exciting. And more text.' + result = truncate_at_sentence(text, 30) + assert result == 'Hello world! This is exciting.' + assert len(result) <= 32 + + +def test_truncate_at_sentence_no_boundary(): + """Test truncation when no sentence boundary exists before max_chars.""" + text = 'This is a very long sentence without any punctuation marks near the beginning' + result = truncate_at_sentence(text, 30) + assert len(result) <= 30 + assert result.startswith('This is a very long sentence') + + +def test_truncate_at_sentence_multiple_periods(): + """Test with multiple sentence endings.""" + text = 'A. B. C. D. E. F. G. H.' + result = truncate_at_sentence(text, 10) + assert result == 'A. B. C.' + assert len(result) <= 10 + + +def test_truncate_at_sentence_strips_trailing_whitespace(): + """Test that trailing whitespace is stripped.""" + text = 'First sentence. Second sentence.' + result = truncate_at_sentence(text, 20) + assert result == 'First sentence.' + assert not result.endswith(' ') + + +def test_max_summary_chars_constant(): + """Test that MAX_SUMMARY_CHARS is set to expected value.""" + assert MAX_SUMMARY_CHARS == 250 + + +def test_truncate_at_sentence_realistic_summary(): + """Test with a realistic entity summary.""" + text = ( + 'John is a software engineer who works at a tech company in San Francisco. ' + 'He has been programming for over 10 years and specializes in Python and distributed systems. ' + 'John enjoys hiking on weekends and is learning to play guitar. ' + 'He graduated from MIT with a degree in computer science.' + ) + result = truncate_at_sentence(text, MAX_SUMMARY_CHARS) + assert len(result) <= MAX_SUMMARY_CHARS + # Should keep complete sentences + assert result.endswith('.') + # Should include at least the first sentence + assert 'John is a software engineer' in result