Add sentence-aware text truncator for entity summaries

- Created truncate_at_sentence() utility function that truncates text at
  sentence boundaries while respecting max character limits
- Added MAX_SUMMARY_CHARS constant (250 chars) for entity summaries
- Referenced MAX_SUMMARY_CHARS in the EntitySummary field description (extract_nodes.py)
- Applied truncator to summaries sent in prompt context and to LLM-generated summaries (node_operations.py)
- Added comprehensive test suite for truncation logic

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Daniel Chalef 2025-10-04 18:57:26 -07:00
parent 78699b0139
commit 9647c0b6f3
4 changed files with 165 additions and 3 deletions

View file

@ -18,6 +18,8 @@ from typing import Any, Protocol, TypedDict
from pydantic import BaseModel, Field
from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS
from .models import Message, PromptFunction, PromptVersion
from .prompt_helpers import to_prompt_json
from .snippets import summary_instructions
@ -57,7 +59,7 @@ class EntityClassification(BaseModel):
class EntitySummary(BaseModel):
summary: str = Field(
...,
description='Summary containing the important information about the entity. Under 250 characters.',
description=f'Summary containing the important information about the entity. Under {MAX_SUMMARY_CHARS} characters.',
)

View file

@ -53,6 +53,7 @@ from graphiti_core.utils.maintenance.dedup_helpers import (
from graphiti_core.utils.maintenance.edge_operations import (
filter_existing_duplicate_of_edges,
)
from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS, truncate_at_sentence
logger = logging.getLogger(__name__)
@ -547,7 +548,7 @@ async def _extract_entity_summary(
summary_context = _build_episode_context(
node_data={
'name': node.name,
'summary': node.summary,
'summary': truncate_at_sentence(node.summary, MAX_SUMMARY_CHARS),
'entity_types': node.labels,
'attributes': node.attributes,
},
@ -562,7 +563,7 @@ async def _extract_entity_summary(
group_id=node.group_id,
)
node.summary = summary_response.get('summary', '')
node.summary = truncate_at_sentence(summary_response.get('summary', ''), MAX_SUMMARY_CHARS)
def _build_episode_context(

View file

@ -0,0 +1,53 @@
"""
Copyright 2024, Zep Software, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import re
# Maximum length, in characters, for entity/node summaries.
# Callers pass this as the max_chars limit to truncate_at_sentence().
MAX_SUMMARY_CHARS = 250
def truncate_at_sentence(text: str, max_chars: int) -> str:
    """
    Truncate text to at most max_chars, preferring a sentence boundary.

    The text is cut right after the last sentence-ending punctuation mark
    (., ! or ?) that is followed by whitespace and still fits within
    max_chars. If no such boundary exists, the text is hard-truncated at
    max_chars and trailing whitespace is stripped.

    Args:
        text: The text to truncate. Falsy values ('' or None) are returned unchanged.
        max_chars: Maximum number of characters in the result. A value <= 0
            yields an empty string for any non-empty text.

    Returns:
        The original text if it already fits, otherwise a truncated copy that
        is never longer than max_chars.
    """
    if not text or len(text) <= max_chars:
        return text

    # A non-positive limit leaves no room for content; without this guard a
    # negative value would slice from the end of the string instead.
    if max_chars <= 0:
        return ''

    # Keep one character of look-ahead past the limit so punctuation sitting
    # exactly on the cut point is judged by what really follows it in the
    # original text. Otherwise a cut inside "v1.2" or "3.14" would be mistaken
    # for the end of a sentence.
    window = text[: max_chars + 1]

    boundary_end = 0
    for match in re.finditer(r'[.!?](?=\s)', window):
        boundary_end = match.end()

    if boundary_end:
        # The slice ends on the punctuation mark itself, so there is no
        # trailing whitespace to strip.
        return text[:boundary_end]

    # No sentence boundary found, truncate at max_chars
    return text[:max_chars].rstrip()

106
tests/test_text_utils.py Normal file
View file

@ -0,0 +1,106 @@
"""
Copyright 2024, Zep Software, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from graphiti_core.utils.text_utils import MAX_SUMMARY_CHARS, truncate_at_sentence
def test_truncate_at_sentence_short_text():
    """Text already within the limit comes back untouched."""
    original = 'This is a short sentence.'
    assert truncate_at_sentence(original, 100) == original
def test_truncate_at_sentence_empty():
    """Falsy inputs (empty string and None) are passed straight through."""
    empty_result = truncate_at_sentence('', 100)
    none_result = truncate_at_sentence(None, 100)
    assert empty_result == ''
    assert none_result is None
def test_truncate_at_sentence_exact_length():
    """Text whose length equals max_chars is not truncated."""
    limit = 100
    filler = 'A' * limit
    assert truncate_at_sentence(filler, limit) == filler
def test_truncate_at_sentence_with_period():
    """A period followed by a space is used as the cut point."""
    limit = 40
    shortened = truncate_at_sentence(
        'First sentence. Second sentence. Third sentence. Fourth sentence.', limit
    )
    assert shortened == 'First sentence. Second sentence.'
    assert len(shortened) <= limit
def test_truncate_at_sentence_with_question():
    """Test truncation at sentence boundary with question mark."""
    max_chars = 30
    text = 'What is this? This is a test. More text here.'
    result = truncate_at_sentence(text, max_chars)
    assert result == 'What is this? This is a test.'
    # Check against the limit actually passed in; a looser bound would not
    # catch a result that overshoots max_chars.
    assert len(result) <= max_chars
def test_truncate_at_sentence_with_exclamation():
    """Test truncation at sentence boundary with exclamation mark."""
    max_chars = 30
    text = 'Hello world! This is exciting. And more text.'
    result = truncate_at_sentence(text, max_chars)
    assert result == 'Hello world! This is exciting.'
    # Check against the limit actually passed in; a looser bound would not
    # catch a result that overshoots max_chars.
    assert len(result) <= max_chars
def test_truncate_at_sentence_no_boundary():
    """Without any sentence-ending punctuation in range, the text is hard-cut at the limit."""
    limit = 30
    shortened = truncate_at_sentence(
        'This is a very long sentence without any punctuation marks near the beginning',
        limit,
    )
    assert len(shortened) <= limit
    assert shortened.startswith('This is a very long sentence')
def test_truncate_at_sentence_multiple_periods():
    """With many boundaries in range, the last one that fits is chosen."""
    limit = 10
    shortened = truncate_at_sentence('A. B. C. D. E. F. G. H.', limit)
    assert shortened == 'A. B. C.'
    assert len(shortened) <= limit
def test_truncate_at_sentence_strips_trailing_whitespace():
    """The truncated result never ends with the space that followed the boundary."""
    shortened = truncate_at_sentence('First sentence. Second sentence.', 20)
    assert shortened == 'First sentence.'
    assert not shortened.endswith(' ')
def test_max_summary_chars_constant():
    """Pin the summary length limit so accidental changes are noticed."""
    expected_limit = 250
    assert MAX_SUMMARY_CHARS == expected_limit
def test_truncate_at_sentence_realistic_summary():
    """A multi-sentence entity summary longer than the limit is cut at a sentence end."""
    summary = (
        'John is a software engineer who works at a tech company in San Francisco. '
        'He has been programming for over 10 years and specializes in Python and distributed systems. '
        'John enjoys hiking on weekends and is learning to play guitar. '
        'He graduated from MIT with a degree in computer science.'
    )
    shortened = truncate_at_sentence(summary, MAX_SUMMARY_CHARS)

    # The limit is respected.
    assert len(shortened) <= MAX_SUMMARY_CHARS
    # Only whole sentences are kept.
    assert shortened.endswith('.')
    # The opening sentence survives truncation.
    assert 'John is a software engineer' in shortened