# LightRAG/scripts/repo_chunking.py

import os
from collections import deque
from urllib.parse import urlparse

import requests
from dotenv import load_dotenv
from github import Github


# File extensions (lower-case, leading dot) that are treated as source code.
_CODE_EXTENSIONS = frozenset({
    '.py', '.js', '.ts', '.java', '.c', '.cpp', '.cs', '.go', '.rb', '.php',
    '.swift', '.kt', '.rs', '.html', '.css', '.scss', '.sql',
})

# Seconds to wait on a single raw-file download before requests gives up;
# without a timeout a stalled connection would hang the whole traversal.
_DOWNLOAD_TIMEOUT = 30


def _parse_repo_url(repo_url):
    """Return ``(owner, repo_name)`` extracted from a repository URL.

    The first two path segments of the URL are taken as owner and repository
    name, so trailing slashes, a ``.git`` suffix, and deeper paths such as
    ``/tree/main`` are all tolerated.

    Raises:
        ValueError: if the URL has no host or fewer than two path segments.
    """
    parsed = urlparse(repo_url.strip())
    segments = [part for part in parsed.path.split('/') if part]
    if not parsed.netloc or len(segments) < 2:
        raise ValueError(
            f"Expected a URL like https://github.com/<owner>/<repo>, got: {repo_url!r}"
        )

    owner, repo_name = segments[:2]
    # Clone URLs end in ".git", which is not part of the repository name.
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        raise ValueError(f"Could not determine repository name from: {repo_url!r}")
    return owner, repo_name


def chunk_repo(repo_url, output_dir='scripts/output'):
    """Dump every source-code file of a GitHub repository into text files.

    Each file whose extension is in ``_CODE_EXTENSIONS`` is downloaded via its
    ``download_url`` and written to ``<output_dir>/<path with '/' -> '_'>.txt``,
    prefixed with a small header containing the original path.

    Args:
        repo_url: URL of the repository, e.g.
            ``https://github.com/<owner>/<repo>``.
        output_dir: Directory that receives the generated ``.txt`` files; it
            is created if missing. Defaults to ``scripts/output``.

    Raises:
        ValueError: if ``repo_url`` cannot be split into owner and repo name.
    """
    # Validate the URL before any I/O so malformed input fails fast.
    owner, repo_name = _parse_repo_url(repo_url)
    print(f"Owner: {owner}, Repo: {repo_name}")

    # The API token is read from the environment, optionally populated from
    # a .env file.
    load_dotenv()
    g = Github(os.getenv('GITHUB_TOKEN'))
    repo = g.get_repo(f"{owner}/{repo_name}")

    os.makedirs(output_dir, exist_ok=True)

    # Breadth-first walk of the repository tree; a deque gives O(1) pops from
    # the front, unlike list.pop(0).
    pending = deque(repo.get_contents(""))
    while pending:
        entry = pending.popleft()
        if entry.type == "dir":
            pending.extend(repo.get_contents(entry.path))
            continue

        extension = os.path.splitext(entry.name)[1].lower()
        if extension not in _CODE_EXTENSIONS:
            continue

        # Entries without a download URL cannot be fetched as raw text.
        if not entry.download_url:
            print(f"Skipped (no download URL): {entry.path}")
            continue

        response = requests.get(entry.download_url, timeout=_DOWNLOAD_TIMEOUT)
        if not response.ok:
            # Do not store an HTTP error body as if it were the file's content.
            print(f"Skipped (HTTP {response.status_code}): {entry.path}")
            continue

        # NOTE: flattening '/' to '_' keeps the output directory flat, but two
        # paths that differ only in '/' vs '_' map to the same output file.
        output_filename = os.path.join(
            output_dir, f"{entry.path.replace('/', '_')}.txt"
        )

        # Write metadata header followed by the file contents.
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(f"File Path: {entry.path}\n")
            f.write("\n--- File Contents ---\n\n")
            f.write(response.text)
        print(f"Processed: {entry.path}")

    print("Repository chunking completed.")
# Script entry point: chunk the repository whose URL is given as the first
# command-line argument, falling back to the example repository when no
# argument is supplied.
if __name__ == "__main__":
    import sys

    repo_url = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "https://github.com/palmier-io/palmier-vscode-extension"
    )
    chunk_repo(repo_url)