Format entire codebase with ruff and add type hints across all modules: - Apply ruff formatting to all Python files (121 files, 17K insertions) - Add type hints to function signatures throughout lightrag core and API - Update test suite with improved type annotations and docstrings - Add pyrightconfig.json for static type checking configuration - Create prompt_optimized.py and test_extraction_prompt_ab.py test files - Update ruff.toml and .gitignore for improved linting configuration - Standardize code style across examples, reproduce scripts, and utilities
83 lines
2.4 KiB
Python
83 lines
2.4 KiB
Python
import json
|
|
|
|
from openai import OpenAI
|
|
from transformers import GPT2Tokenizer
|
|
|
|
|
|
def openai_complete_if_cache(
|
|
model='gpt-4o',
|
|
prompt=None,
|
|
system_prompt=None,
|
|
history_messages=None,
|
|
**kwargs,
|
|
) -> str:
|
|
if history_messages is None:
|
|
history_messages = []
|
|
openai_client = OpenAI()
|
|
|
|
messages = []
|
|
if system_prompt:
|
|
messages.append({'role': 'system', 'content': system_prompt})
|
|
messages.extend(history_messages)
|
|
messages.append({'role': 'user', 'content': prompt})
|
|
|
|
response = openai_client.chat.completions.create(model=model, messages=messages, **kwargs)
|
|
return response.choices[0].message.content
|
|
|
|
|
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
|
|
|
|
|
def get_summary(context, tot_tokens=2000):
|
|
tokens = tokenizer.tokenize(context)
|
|
half_tokens = tot_tokens // 2
|
|
|
|
start_tokens = tokens[1000 : 1000 + half_tokens]
|
|
end_tokens = tokens[-(1000 + half_tokens) : 1000]
|
|
|
|
summary_tokens = start_tokens + end_tokens
|
|
summary = tokenizer.convert_tokens_to_string(summary_tokens)
|
|
|
|
return summary
|
|
|
|
|
|
clses = ['agriculture']
|
|
for cls in clses:
|
|
with open(f'../datasets/unique_contexts/{cls}_unique_contexts.json') as f:
|
|
unique_contexts = json.load(f)
|
|
|
|
summaries = [get_summary(context) for context in unique_contexts]
|
|
|
|
total_description = '\n\n'.join(summaries)
|
|
|
|
prompt = f"""
|
|
Given the following description of a dataset:
|
|
|
|
{total_description}
|
|
|
|
Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset.
|
|
|
|
Output the results in the following structure:
|
|
- User 1: [user description]
|
|
- Task 1: [task description]
|
|
- Question 1:
|
|
- Question 2:
|
|
- Question 3:
|
|
- Question 4:
|
|
- Question 5:
|
|
- Task 2: [task description]
|
|
...
|
|
- Task 5: [task description]
|
|
- User 2: [user description]
|
|
...
|
|
- User 5: [user description]
|
|
...
|
|
"""
|
|
|
|
result = openai_complete_if_cache(model='gpt-4o', prompt=prompt)
|
|
|
|
file_path = f'../datasets/questions/{cls}_questions.txt'
|
|
with open(file_path, 'w') as file:
|
|
file.write(result)
|
|
|
|
print(f'{cls}_questions written to {file_path}')
|