Add comprehensive test suites for prompt evaluation: - test_prompt_accuracy.py: 365 lines testing prompt extraction accuracy - test_prompt_quality_deep.py: 672 lines for deep quality analysis - Refactor prompt.py to consolidate optimized variants (removed prompt_optimized.py) - Apply ruff formatting and type hints across 30 files - Update pyrightconfig.json for static type checking - Modernize reproduce scripts and examples with improved type annotations - Sync uv.lock dependencies
65 lines
2.5 KiB
Python
65 lines
2.5 KiB
Python
import argparse
|
|
import glob
|
|
import json
|
|
import os
|
|
|
|
from lightrag.utils import logger
|
|
|
|
|
|
def extract_unique_contexts(input_directory: str, output_directory: str) -> None:
    """Deduplicate the `context` field across every JSONL file in a directory.

    For each ``*.jsonl`` file in *input_directory*, collect the distinct
    non-empty ``context`` values (first occurrence wins, insertion order is
    preserved) and write them as a JSON array to
    ``<output_directory>/<name>_unique_contexts.json``.

    Files that are missing or unreadable are logged and skipped; malformed
    JSON lines are logged individually and skipped without aborting the file.

    Args:
        input_directory: Directory scanned (non-recursively) for ``*.jsonl``.
        output_directory: Destination directory; created if it does not exist.
    """
    os.makedirs(output_directory, exist_ok=True)

    jsonl_files = glob.glob(os.path.join(input_directory, '*.jsonl'))
    logger.info(f'Found {len(jsonl_files)} JSONL files.')

    for file_path in jsonl_files:
        filename = os.path.basename(file_path)
        name, _ext = os.path.splitext(filename)
        output_filename = f'{name}_unique_contexts.json'
        output_path = os.path.join(output_directory, output_filename)

        # dict used as an ordered set: O(1) membership, insertion order kept.
        unique_contexts_dict = {}

        # Fixed: log messages previously contained the literal placeholder
        # '(unknown)' instead of interpolating the file name.
        logger.info(f'Processing file: {filename}')

        try:
            with open(file_path, encoding='utf-8') as infile:
                for line_number, line in enumerate(infile, start=1):
                    line = line.strip()
                    if not line:
                        # Skip blank lines rather than treating them as bad JSON.
                        continue
                    try:
                        json_obj = json.loads(line)
                        context = json_obj.get('context')
                        # Only keep truthy contexts; ignore repeats.
                        if context and context not in unique_contexts_dict:
                            unique_contexts_dict[context] = None
                    except json.JSONDecodeError as e:
                        logger.error(
                            f'JSON decoding error in file {filename} at line {line_number}: {e}'
                        )
        except FileNotFoundError:
            logger.error(f'File not found: {filename}')
            continue
        except Exception as e:
            # Best-effort batch processing: log and move on to the next file.
            logger.error(f'An error occurred while processing file {filename}: {e}')
            continue

        unique_contexts_list = list(unique_contexts_dict.keys())
        logger.info(
            f'There are {len(unique_contexts_list)} unique `context` entries in the file {filename}.'
        )

        try:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4)
            logger.info(f'Unique `context` entries have been saved to: {output_filename}')
        except Exception as e:
            logger.error(f'An error occurred while saving to the file {output_filename}: {e}')

    logger.info('All files have been processed.')
|
|
|
|
|
|
if __name__ == '__main__':
    # CLI entry point: choose source and destination directories, then run
    # the extraction over every JSONL file found in the source.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_dir', type=str, default='../datasets')
    parser.add_argument(
        '-o', '--output_dir', type=str, default='../datasets/unique_contexts'
    )
    cli_args = parser.parse_args()

    extract_unique_contexts(cli_args.input_dir, cli_args.output_dir)
|