Add comprehensive test suites for prompt evaluation: - test_prompt_accuracy.py: 365 lines testing prompt extraction accuracy - test_prompt_quality_deep.py: 672 lines for deep quality analysis - Refactor prompt.py to consolidate optimized variants (removed prompt_optimized.py) - Apply ruff formatting and type hints across 30 files - Update pyrightconfig.json for static type checking - Modernize reproduce scripts and examples with improved type annotations - Sync uv.lock dependencies
65 lines
2.5 KiB
Python
65 lines
2.5 KiB
Python
import argparse
|
|
import glob
|
|
import json
|
|
import os
|
|
|
|
from lightrag.utils import logger
|
|
|
|
|
|
def extract_unique_contexts(input_directory: str, output_directory: str) -> None:
    """Deduplicate the `context` field across every JSONL file in a directory.

    For each ``*.jsonl`` file in *input_directory*, collect the distinct
    non-empty ``context`` values (first occurrence wins, insertion order is
    preserved) and write them as a JSON array to
    ``<output_directory>/<name>_unique_contexts.json``.

    Files that are missing or unreadable are logged and skipped; malformed
    JSON lines are logged individually and skipped without aborting the file.

    Args:
        input_directory: Directory scanned (non-recursively) for ``*.jsonl``.
        output_directory: Destination directory; created if it does not exist.
    """
    os.makedirs(output_directory, exist_ok=True)

    jsonl_files = glob.glob(os.path.join(input_directory, '*.jsonl'))
    logger.info(f'Found {len(jsonl_files)} JSONL files.')

    for file_path in jsonl_files:
        filename = os.path.basename(file_path)
        name, _ext = os.path.splitext(filename)
        output_filename = f'{name}_unique_contexts.json'
        output_path = os.path.join(output_directory, output_filename)

        # dict used as an ordered set: O(1) membership, insertion order kept.
        unique_contexts_dict = {}

        # Fixed: log messages previously contained the literal placeholder
        # '(unknown)' instead of interpolating the file name.
        logger.info(f'Processing file: {filename}')

        try:
            with open(file_path, encoding='utf-8') as infile:
                for line_number, line in enumerate(infile, start=1):
                    line = line.strip()
                    if not line:
                        # Skip blank lines rather than treating them as bad JSON.
                        continue
                    try:
                        json_obj = json.loads(line)
                        context = json_obj.get('context')
                        # Only keep truthy contexts; ignore repeats.
                        if context and context not in unique_contexts_dict:
                            unique_contexts_dict[context] = None
                    except json.JSONDecodeError as e:
                        logger.error(
                            f'JSON decoding error in file {filename} at line {line_number}: {e}'
                        )
        except FileNotFoundError:
            logger.error(f'File not found: {filename}')
            continue
        except Exception as e:
            # Best-effort batch processing: log and move on to the next file.
            logger.error(f'An error occurred while processing file {filename}: {e}')
            continue

        unique_contexts_list = list(unique_contexts_dict.keys())
        logger.info(
            f'There are {len(unique_contexts_list)} unique `context` entries in the file {filename}.'
        )

        try:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4)
            logger.info(f'Unique `context` entries have been saved to: {output_filename}')
        except Exception as e:
            logger.error(f'An error occurred while saving to the file {output_filename}: {e}')

    logger.info('All files have been processed.')
|
|
|
|
|
|
if __name__ == '__main__':
    # CLI entry point: choose source and destination directories, then run
    # the extraction over every JSONL file found in the source.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_dir', type=str, default='../datasets')
    parser.add_argument(
        '-o', '--output_dir', type=str, default='../datasets/unique_contexts'
    )
    cli_args = parser.parse_args()

    extract_unique_contexts(cli_args.input_dir, cli_args.output_dir)
|