53 lines
1.6 KiB
Python
53 lines
1.6 KiB
Python
"""
|
|
Copyright 2024, Zep Software, Inc.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
"""
|
|
|
|
import re
|
|
|
|
# Maximum length for entity/node summaries
|
|
MAX_SUMMARY_CHARS = 500
|
|
|
|
|
|
def truncate_at_sentence(text: str, max_chars: int) -> str:
|
|
"""
|
|
Truncate text at or about max_chars while respecting sentence boundaries.
|
|
|
|
Attempts to truncate at the last complete sentence before max_chars.
|
|
If no sentence boundary is found before max_chars, truncates at max_chars.
|
|
|
|
Args:
|
|
text: The text to truncate
|
|
max_chars: Maximum number of characters
|
|
|
|
Returns:
|
|
Truncated text
|
|
"""
|
|
if not text or len(text) <= max_chars:
|
|
return text
|
|
|
|
# Find all sentence boundaries (., !, ?) up to max_chars
|
|
truncated = text[:max_chars]
|
|
|
|
# Look for sentence boundaries: period, exclamation, or question mark followed by space or end
|
|
sentence_pattern = r'[.!?](?:\s|$)'
|
|
matches = list(re.finditer(sentence_pattern, truncated))
|
|
|
|
if matches:
|
|
# Truncate at the last sentence boundary found
|
|
last_match = matches[-1]
|
|
return text[: last_match.end()].rstrip()
|
|
|
|
# No sentence boundary found, truncate at max_chars
|
|
return truncated.rstrip()
|