Add sentence-aware text truncator for entity summaries

- Created truncate_at_sentence() utility function that truncates text at sentence boundaries while respecting max character limits
- Added MAX_SUMMARY_CHARS constant (250 chars) for entity summaries
- Used MAX_SUMMARY_CHARS in the EntitySummary field description in prompts (extract_nodes.py)
- Applied truncator to existing entity summaries passed as context and to LLM-generated summaries (node_operations.py)
- Added comprehensive test suite for truncation logic

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-04 18:57:26 -07:00 · 2025-10-04 18:57:26 -07:00 · 9647c0b6f3
commit 9647c0b6f3
parent 78699b0139
4 changed files with 165 additions and 3 deletions
--- a/graphiti_core/prompts/extract_nodes.py
+++ b/graphiti_core/prompts/extract_nodes.py
@ -18,6 +18,8 @@ from typing import Any, Protocol, TypedDict

 from pydantic import BaseModel, Field

+from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS
+
 from .models import Message, PromptFunction, PromptVersion
 from .prompt_helpers import to_prompt_json
 from .snippets import summary_instructions
@ -57,7 +59,7 @@ class EntityClassification(BaseModel):
 class EntitySummary(BaseModel):
    summary: str = Field(
        ...,
-        description='Summary containing the important information about the entity. Under 250 characters.',
+        description=f'Summary containing the important information about the entity. Under {MAX_SUMMARY_CHARS} characters.',
    )


--- a/graphiti_core/utils/maintenance/node_operations.py
+++ b/graphiti_core/utils/maintenance/node_operations.py
@ -53,6 +53,7 @@ from graphiti_core.utils.maintenance.dedup_helpers import (
 from graphiti_core.utils.maintenance.edge_operations import (
    filter_existing_duplicate_of_edges,
 )
+from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS, truncate_at_sentence

 logger = logging.getLogger(__name__)

@ -547,7 +548,7 @@ async def _extract_entity_summary(
    summary_context = _build_episode_context(
        node_data={
            'name': node.name,
-            'summary': node.summary,
+            'summary': truncate_at_sentence(node.summary, MAX_SUMMARY_CHARS),
            'entity_types': node.labels,
            'attributes': node.attributes,
        },
@ -562,7 +563,7 @@ async def _extract_entity_summary(
        group_id=node.group_id,
    )

-    node.summary = summary_response.get('summary', '')
+    node.summary = truncate_at_sentence(summary_response.get('summary', ''), MAX_SUMMARY_CHARS)


 def _build_episode_context(
--- a/graphiti_core/utils/text_utils.py
+++ b/graphiti_core/utils/text_utils.py
@ -0,0 +1,53 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import re
+
+# Maximum length for entity/node summaries
+MAX_SUMMARY_CHARS = 250
+
+
+def truncate_at_sentence(text: str, max_chars: int) -> str:
+    """
+    Truncate text to at most max_chars, preferring to cut at a sentence boundary.
+
+    A sentence boundary is '.', '!' or '?' followed by whitespace in the ORIGINAL
+    text. Punctuation that merely ends up last after slicing (e.g. the '.' in
+    '3.14' cut after '3.') is not treated as a boundary.
+
+    Args:
+        text: The text to truncate. Falsy values ('' or None) are returned as-is.
+        max_chars: Maximum number of characters in the result
+
+    Returns:
+        text unchanged if it already fits; otherwise the text up to the last
+        sentence boundary within max_chars, or a hard cut at max_chars (trailing
+        whitespace stripped) when no boundary exists. '' if max_chars <= 0.
+    """
+    if not text or len(text) <= max_chars:
+        return text
+    if max_chars <= 0:
+        # A negative slice bound would count from the end and keep almost everything.
+        return ''
+
+    # Look one character past the limit so the lookahead can see what follows
+    # punctuation at index max_chars - 1 (text is longer than max_chars here).
+    boundaries = list(re.finditer(r'[.!?](?=\s)', text[: max_chars + 1]))
+    if boundaries:
+        return text[: boundaries[-1].end()]
+
+    # No sentence boundary found, truncate at max_chars
+    return text[:max_chars].rstrip()
--- a/tests/test_text_utils.py
+++ b/tests/test_text_utils.py
@ -0,0 +1,106 @@
+"""
+Copyright 2024, Zep Software, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS, truncate_at_sentence
+
+
+def test_truncate_at_sentence_short_text():
+    """Text shorter than the limit comes back untouched."""
+    original = 'This is a short sentence.'
+    outcome = truncate_at_sentence(original, 100)
+    assert outcome == original
+
+
+def test_truncate_at_sentence_empty():
+    """Falsy inputs ('' and None) are handed back as-is."""
+    assert truncate_at_sentence(None, 100) is None
+    assert truncate_at_sentence('', 100) == ''
+
+
+def test_truncate_at_sentence_exact_length():
+    """A string whose length equals max_chars is not shortened."""
+    limit = 100
+    filler = 'A' * limit
+    assert truncate_at_sentence(filler, limit) == filler
+
+
+def test_truncate_at_sentence_with_period():
+    """Cuts after the last period-terminated sentence that fits in the limit."""
+    paragraph = 'First sentence. Second sentence. Third sentence. Fourth sentence.'
+    shortened = truncate_at_sentence(paragraph, 40)
+    assert len(shortened) <= 40
+    assert shortened == 'First sentence. Second sentence.'
+
+
+def test_truncate_at_sentence_with_question():
+    """A '?' counts as a sentence end; the result must stay within the 30-char limit."""
+    text = 'What is this? This is a test. More text here.'
+    result = truncate_at_sentence(text, 30)
+    assert result == 'What is this? This is a test.'
+    assert len(result) <= 30
+
+
+def test_truncate_at_sentence_with_exclamation():
+    """A '!' counts as a sentence end; the result must stay within the 30-char limit."""
+    text = 'Hello world! This is exciting. And more text.'
+    result = truncate_at_sentence(text, 30)
+    assert result == 'Hello world! This is exciting.'
+    assert len(result) <= 30
+
+
+def test_truncate_at_sentence_no_boundary():
+    """With no sentence end inside the limit, the result is still capped at max_chars."""
+    prose = 'This is a very long sentence without any punctuation marks near the beginning'
+    clipped = truncate_at_sentence(prose, 30)
+    assert clipped.startswith('This is a very long sentence')
+    assert len(clipped) <= 30
+
+
+def test_truncate_at_sentence_multiple_periods():
+    """With many short sentences, the cut lands on the last one that fits."""
+    letters = 'A. B. C. D. E. F. G. H.'
+    kept = truncate_at_sentence(letters, 10)
+    assert len(kept) <= 10
+    assert kept == 'A. B. C.'
+
+
+def test_truncate_at_sentence_strips_trailing_whitespace():
+    """Whitespace after the final kept sentence does not leak into the result."""
+    spaced = 'First sentence.   Second sentence.'
+    trimmed = truncate_at_sentence(spaced, 20)
+    assert not trimmed.endswith(' ')
+    assert trimmed == 'First sentence.'
+
+
+def test_max_summary_chars_constant():
+    """Pin the summary length limit so a change to it is a deliberate one."""
+    assert MAX_SUMMARY_CHARS == 250
+
+
+def test_truncate_at_sentence_realistic_summary():
+    """Exercise the truncator on a multi-sentence, summary-like paragraph."""
+    bio = (
+        'John is a software engineer who works at a tech company in San Francisco. '
+        'He has been programming for over 10 years and specializes in Python and distributed systems. '
+        'John enjoys hiking on weekends and is learning to play guitar. '
+        'He graduated from MIT with a degree in computer science.'
+    )
+    summary = truncate_at_sentence(bio, MAX_SUMMARY_CHARS)
+    # The opening sentence must survive
+    assert 'John is a software engineer' in summary
+    # The cut must land on a complete sentence
+    assert summary.endswith('.')
+    assert len(summary) <= MAX_SUMMARY_CHARS