LightRAG/reproduce/Step_0.py
clssck 69358d830d test(lightrag,examples,api): comprehensive ruff formatting and type hints
Format entire codebase with ruff and add type hints across all modules:
- Apply ruff formatting to all Python files (121 files, 17K insertions)
- Add type hints to function signatures throughout lightrag core and API
- Update test suite with improved type annotations and docstrings
- Add pyrightconfig.json for static type checking configuration
- Create prompt_optimized.py and test_extraction_prompt_ab.py test files
- Update ruff.toml and .gitignore for improved linting configuration
- Standardize code style across examples, reproduce scripts, and utilities
2025-12-05 15:17:06 +01:00

63 lines
2.4 KiB
Python

import argparse
import glob
import json
import os
def _collect_unique_contexts(file_path: str, filename: str) -> list:
    """Return the distinct truthy `context` values of one JSONL file in first-seen order.

    Each non-blank line is parsed as JSON. Lines that cannot be used are
    reported with ``print`` and skipped individually, so a single bad record
    never discards the rest of the file.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8).
        filename: Display name used in the printed messages.

    Returns:
        The unique `context` values, ordered by first appearance.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # A dict serves as an insertion-ordered set, so the output keeps first-seen order.
    seen = {}
    with open(file_path, encoding='utf-8') as infile:
        for line_number, raw_line in enumerate(infile, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f'JSON decoding error in file {filename} at line {line_number}: {e}')
                continue
            # Valid JSON is not necessarily an object: arrays, strings and numbers have no `.get`.
            if not isinstance(record, dict):
                print(f'Skipping non-object JSON value in file {filename} at line {line_number}.')
                continue
            context = record.get('context')
            if not context:
                # Missing, null or empty contexts are ignored silently.
                continue
            try:
                seen.setdefault(context, None)
            except TypeError:
                # A list- or dict-valued context cannot be used as a dict key.
                print(f'Skipping unhashable `context` in file {filename} at line {line_number}.')
    return list(seen)


def extract_unique_contexts(input_directory: str, output_directory: str) -> None:
    """Write the unique `context` values of every JSONL file in a directory to JSON files.

    For each ``*.jsonl`` file directly inside ``input_directory``, the distinct
    truthy values stored under the ``context`` key are collected in first-seen
    order and dumped as a JSON array to
    ``<output_directory>/<name>_unique_contexts.json``. Progress and errors are
    reported with ``print``; a file that cannot be read or written is skipped
    and the remaining files are still processed.

    Args:
        input_directory: Directory searched (non-recursively) for ``*.jsonl`` files.
        output_directory: Directory that receives the output files; it is
            created if it does not exist.
    """
    os.makedirs(output_directory, exist_ok=True)
    # glob returns matches in arbitrary order; sort so runs are reproducible.
    jsonl_files = sorted(glob.glob(os.path.join(input_directory, '*.jsonl')))
    print(f'Found {len(jsonl_files)} JSONL files.')
    for file_path in jsonl_files:
        filename = os.path.basename(file_path)
        name, _ext = os.path.splitext(filename)
        output_filename = f'{name}_unique_contexts.json'
        output_path = os.path.join(output_directory, output_filename)
        print(f'Processing file: {filename}')
        try:
            unique_contexts_list = _collect_unique_contexts(file_path, filename)
        except FileNotFoundError:
            # The file may have disappeared between the glob and the open.
            print(f'File not found: {filename}')
            continue
        except Exception as e:
            # Batch boundary: report the failure and move on to the next file.
            print(f'An error occurred while processing file {filename}: {e}')
            continue
        print(f'There are {len(unique_contexts_list)} unique `context` entries in the file {filename}.')
        try:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4)
            print(f'Unique `context` entries have been saved to: {output_filename}')
        except Exception as e:
            print(f'An error occurred while saving to the file {output_filename}: {e}')
    print('All files have been processed.')
if __name__ == '__main__':
    # Script entry point: read the input/output directories from the command
    # line (both default to relative paths) and run the extraction.
    cli = argparse.ArgumentParser()
    cli.add_argument('-i', '--input_dir', default='../datasets', type=str)
    cli.add_argument('-o', '--output_dir', default='../datasets/unique_contexts', type=str)
    options = cli.parse_args()
    extract_unique_contexts(
        input_directory=options.input_dir,
        output_directory=options.output_dir,
    )