From 9647c0b6f3e43740e3f3b3c9d9cc84907197e5ba Mon Sep 17 00:00:00 2001 From: Daniel Chalef <131175+danielchalef@users.noreply.github.com> Date: Sat, 4 Oct 2025 18:57:26 -0700 Subject: [PATCH] Add sentence-aware text truncator for entity summaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Created truncate_at_sentence() utility function that truncates text at sentence boundaries while respecting max character limits - Added MAX_SUMMARY_CHARS constant (250 chars) for entity summaries - Applied truncator to entity summaries in prompts (extract_nodes.py) - Applied truncator to LLM-generated summaries (node_operations.py) - Added comprehensive test suite for truncation logic 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- graphiti_core/prompts/extract_nodes.py | 4 +- .../utils/maintenance/node_operations.py | 5 +- graphiti_core/utils/text_utils.py | 53 +++++++++ tests/test_text_utils.py | 106 ++++++++++++++++++ 4 files changed, 165 insertions(+), 3 deletions(-) create mode 100644 graphiti_core/utils/text_utils.py create mode 100644 tests/test_text_utils.py diff --git a/graphiti_core/prompts/extract_nodes.py b/graphiti_core/prompts/extract_nodes.py index f9d6eabd..f8c90b75 100644 --- a/graphiti_core/prompts/extract_nodes.py +++ b/graphiti_core/prompts/extract_nodes.py @@ -18,6 +18,8 @@ from typing import Any, Protocol, TypedDict from pydantic import BaseModel, Field +from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS + from .models import Message, PromptFunction, PromptVersion from .prompt_helpers import to_prompt_json from .snippets import summary_instructions @@ -57,7 +59,7 @@ class EntityClassification(BaseModel): class EntitySummary(BaseModel): summary: str = Field( ..., - description='Summary containing the important information about the entity. Under 250 characters.', + description=f'Summary containing the important information about the entity. 
Under {MAX_SUMMARY_CHARS} characters.', ) diff --git a/graphiti_core/utils/maintenance/node_operations.py b/graphiti_core/utils/maintenance/node_operations.py index 8db44218..a23f30e7 100644 --- a/graphiti_core/utils/maintenance/node_operations.py +++ b/graphiti_core/utils/maintenance/node_operations.py @@ -53,6 +53,7 @@ from graphiti_core.utils.maintenance.dedup_helpers import ( from graphiti_core.utils.maintenance.edge_operations import ( filter_existing_duplicate_of_edges, ) +from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS, truncate_at_sentence logger = logging.getLogger(__name__) @@ -547,7 +548,7 @@ async def _extract_entity_summary( summary_context = _build_episode_context( node_data={ 'name': node.name, - 'summary': node.summary, + 'summary': truncate_at_sentence(node.summary, MAX_SUMMARY_CHARS), 'entity_types': node.labels, 'attributes': node.attributes, }, @@ -562,7 +563,7 @@ async def _extract_entity_summary( group_id=node.group_id, ) - node.summary = summary_response.get('summary', '') + node.summary = truncate_at_sentence(summary_response.get('summary', ''), MAX_SUMMARY_CHARS) def _build_episode_context( diff --git a/graphiti_core/utils/text_utils.py b/graphiti_core/utils/text_utils.py new file mode 100644 index 00000000..fc5240f0 --- /dev/null +++ b/graphiti_core/utils/text_utils.py @@ -0,0 +1,53 @@ +""" +Copyright 2024, Zep Software, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
import re

# Maximum length for entity/node summaries
MAX_SUMMARY_CHARS = 250

# Sentence-ending punctuation that is immediately followed by whitespace.
# The lookahead keeps the whitespace out of the match, so match.end() is the
# position right after the punctuation mark.
_SENTENCE_END = re.compile(r'[.!?](?=\s)')


def truncate_at_sentence(text: str, max_chars: int) -> str:
    """
    Truncate text to at most max_chars, preferring a sentence boundary.

    The cut is made after the last '.', '!' or '?' that lies within the first
    max_chars characters and is followed by whitespace in the original text.
    If there is no such boundary, the text is cut hard at max_chars and
    trailing whitespace is stripped.

    Args:
        text: The text to truncate. Falsy values ('' or None) are returned as-is.
        max_chars: Maximum number of characters in the result.

    Returns:
        The input unchanged when it already fits, otherwise a prefix of it that
        is no longer than max_chars. A non-positive max_chars yields ''.
    """
    if not text or len(text) <= max_chars:
        return text

    # A negative limit would otherwise slice from the end of the string and
    # return almost all of the text.
    if max_chars <= 0:
        return ''

    # Look one character past the limit so that punctuation sitting exactly on
    # the last allowed position is judged by the character that really follows
    # it, not by the artificial end of the slice. Text reaching this point is
    # longer than max_chars, so that extra character always exists.
    window = text[: max_chars + 1]

    cut = None
    for match in _SENTENCE_END.finditer(window):
        cut = match.end()

    if cut is not None:
        # The lookahead needs a character after the punctuation inside the
        # window, so cut can never exceed max_chars.
        return text[:cut]

    # No sentence boundary found, truncate at max_chars
    return text[:max_chars].rstrip()
from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS, truncate_at_sentence


def test_truncate_at_sentence_short_text():
    """Text shorter than the limit is returned unchanged."""
    text = 'This is a short sentence.'
    result = truncate_at_sentence(text, 100)
    assert result == text


def test_truncate_at_sentence_empty():
    """Falsy input ('' or None) is passed through untouched."""
    assert truncate_at_sentence('', 100) == ''
    assert truncate_at_sentence(None, 100) is None


def test_truncate_at_sentence_exact_length():
    """Text whose length equals max_chars is returned unchanged."""
    text = 'A' * 100
    result = truncate_at_sentence(text, 100)
    assert result == text


def test_truncate_at_sentence_with_period():
    """Truncation stops at the last period-terminated sentence that fits."""
    text = 'First sentence. Second sentence. Third sentence. Fourth sentence.'
    result = truncate_at_sentence(text, 40)
    assert result == 'First sentence. Second sentence.'
    assert len(result) <= 40


def test_truncate_at_sentence_with_question():
    """A question mark counts as a sentence boundary."""
    text = 'What is this? This is a test. More text here.'
    result = truncate_at_sentence(text, 30)
    assert result == 'What is this? This is a test.'
    # Bound by the limit actually passed in, so an overshoot cannot slip through.
    assert len(result) <= 30


def test_truncate_at_sentence_with_exclamation():
    """An exclamation mark counts as a sentence boundary."""
    text = 'Hello world! This is exciting. And more text.'
    result = truncate_at_sentence(text, 30)
    assert result == 'Hello world! This is exciting.'
    # Bound by the limit actually passed in, so an overshoot cannot slip through.
    assert len(result) <= 30


def test_truncate_at_sentence_no_boundary():
    """Without any sentence boundary the text is cut at max_chars."""
    text = 'This is a very long sentence without any punctuation marks near the beginning'
    result = truncate_at_sentence(text, 30)
    assert len(result) <= 30
    assert result.startswith('This is a very long sentence')


def test_truncate_at_sentence_multiple_periods():
    """With many short sentences, the last one that fits is kept."""
    text = 'A. B. C. D. E. F. G. H.'
    result = truncate_at_sentence(text, 10)
    assert result == 'A. B. C.'
    assert len(result) <= 10


def test_truncate_at_sentence_strips_trailing_whitespace():
    """The result does not end with the whitespace that followed the sentence."""
    text = 'First sentence. Second sentence.'
    result = truncate_at_sentence(text, 20)
    assert result == 'First sentence.'
    assert not result.endswith(' ')


def test_max_summary_chars_constant():
    """MAX_SUMMARY_CHARS keeps its expected value."""
    assert MAX_SUMMARY_CHARS == 250


def test_truncate_at_sentence_realistic_summary():
    """A multi-sentence entity summary is cut to whole sentences within the limit."""
    text = (
        'John is a software engineer who works at a tech company in San Francisco. '
        'He has been programming for over 10 years and specializes in Python and distributed systems. '
        'John enjoys hiking on weekends and is learning to play guitar. '
        'He graduated from MIT with a degree in computer science.'
    )
    result = truncate_at_sentence(text, MAX_SUMMARY_CHARS)
    assert len(result) <= MAX_SUMMARY_CHARS
    # Should keep complete sentences
    assert result.endswith('.')
    # Should include at least the first sentence
    assert 'John is a software engineer' in result