* chore: Add romeo runner * fix: Linter * wip * wip dump * chore: Update romeo parser * chore: Anthropic model fix * wip * allbirds * allbirds runner * format * wip * wip * mypy updates * update * remove r * update tests * format * wip * chore: Strategically update the message * rebase and fix import issues * Update package imports for graphiti_core in examples and utils * nits * chore: Update OpenAI GPT-4o model to gpt-4o-2024-08-06 * implement groq * improvments & linting * cleanup and nits * Refactor package imports for graphiti_core in examples and utils * Refactor package imports for graphiti_core in examples and utils * implement diskcache * remove debug stuff * log cache hit when debugging only * Improve LLM config. Fix bugs (#41) Refactor LLMConfig class to allow None values for model and base_url * chore: Resolve mc --------- Co-authored-by: paulpaliychuk <pavlo.paliychuk.ca@gmail.com> Co-authored-by: prestonrasmussen <prasmuss15@gmail.com>
49 lines
1.4 KiB
Python
49 lines
1.4 KiB
Python
import os
|
|
import re
|
|
import unicodedata
|
|
|
|
|
|
def sanitize_string(text):
    """Return *text* as stripped, ASCII-only text.

    Processing steps, in order:

    1. Apply Unicode NFKD normalization, so that accented letters are split
       into a base letter plus combining marks (``'é'`` -> ``'e'`` + mark).
    2. Convert typographic quotes to their straight ASCII equivalents:
       U+2019 (right single) -> ``'``, U+201C (left double) and
       U+201D (right double) -> ``"``.
    3. Drop every character that is still outside ASCII.
    4. Strip leading and trailing whitespace.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned ASCII string.
    """
    normalized = unicodedata.normalize('NFKD', text)

    # Escape sequences are used instead of literal curly quotes so that a
    # mis-encoded copy of this file cannot silently corrupt the mapping.
    # NFKD leaves these quote characters untouched, so they are still
    # present at this point and can be translated in a single pass.
    straight_quotes = str.maketrans(
        {
            '\u2019': "'",
            '\u201c': '"',
            '\u201d': '"',
        }
    )
    normalized = normalized.translate(straight_quotes)

    # Anything that has no ASCII representation by now is discarded.
    ascii_only = normalized.encode('ascii', 'ignore').decode('ascii')
    return ascii_only.strip()
|
|
|
|
|
|
def parse_debate(file_path):
    """Read a debate transcript and split it into speaker turns.

    The file is read as UTF-8. A new episode starts at every line that opens
    with a run of uppercase ASCII letters followed by a colon (for example
    ``MODERATOR:``). Everything up to the next such line belongs to the same
    episode.

    Args:
        file_path: Path of the transcript file to read.

    Returns:
        A list of dicts, one per episode, in file order. Each dict has the
        keys ``'role'`` (the label before the colon) and ``'statement'``
        (the text after it), both passed through ``sanitize_string``.
        Chunks that do not start with such a label are skipped.
    """
    with open(file_path, encoding='utf-8') as transcript:
        raw_text = transcript.read()

    turns = []
    # The lookahead keeps the speaker label attached to the chunk it starts.
    for chunk in re.split(r'\n(?=(?:[A-Z]+):)', raw_text):
        # DOTALL lets the statement span several lines.
        parsed = re.match(r'([A-Z]+):\s*(.*)', chunk.strip(), re.DOTALL)
        if parsed is None:
            # No speaker label at the start of this chunk; nothing to record.
            continue

        speaker = parsed.group(1)
        words = parsed.group(2)
        turns.append(
            {'role': sanitize_string(speaker), 'statement': sanitize_string(words)}
        )

    return turns
|
|
|
|
|
|
def get_debate_messages():
    """Parse ``bush_gore_debate.txt`` from the directory containing this module.

    Returns:
        Whatever ``parse_debate`` returns for that file.
    """
    # Resolve the transcript relative to this file, not the working directory.
    module_dir = os.path.dirname(__file__)
    return parse_debate(os.path.join(module_dir, 'bush_gore_debate.txt'))
|
|
|
|
|
|
def get_kennedy_debate_messages():
    """Parse ``kennedy_nixon_debate.txt`` from the directory containing this module.

    Returns:
        Whatever ``parse_debate`` returns for that file.
    """
    # Resolve the transcript relative to this file, not the working directory.
    module_dir = os.path.dirname(__file__)
    return parse_debate(os.path.join(module_dir, 'kennedy_nixon_debate.txt'))
|