LightRAG/scripts/repo_chunking.py
2024-10-16 20:59:08 -07:00

45 lines
No EOL
2 KiB
Python

import os
from collections import deque
from urllib.parse import urlparse

import requests
from dotenv import load_dotenv
from github import Github
# File extensions (with leading dot) that mark a file as source code to export.
_CODE_EXTENSIONS = frozenset({
    '.py', '.js', '.ts', '.java', '.c', '.cpp', '.cs', '.go', '.rb', '.php',
    '.swift', '.kt', '.rs', '.html', '.css', '.scss', '.sql',
})

# Seconds to wait on each raw-file download before giving up.
_DOWNLOAD_TIMEOUT = 30


def _parse_repo_url(repo_url):
    """Return ``(owner, repo_name)`` extracted from a repository URL.

    Only the first two path segments are used, so a trailing slash, a
    ``.git`` suffix, or extra segments such as ``/tree/main`` are tolerated.

    Raises:
        ValueError: If the URL has no scheme/host or fewer than two path
            segments.
    """
    parsed = urlparse(repo_url.strip())
    segments = [part for part in parsed.path.split('/') if part]
    if not parsed.scheme or not parsed.netloc or len(segments) < 2:
        raise ValueError(
            f"Expected a URL like https://github.com/<owner>/<repo>, got: {repo_url!r}"
        )

    owner, repo_name = segments[:2]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        raise ValueError(f"Could not determine repository name from: {repo_url!r}")
    return owner, repo_name


def chunk_repo(repo_url, output_dir='scripts/output'):
    """Dump every source-code file of a GitHub repository to its own text file.

    The repository tree is walked through the GitHub API. Each file whose
    extension is in ``_CODE_EXTENSIONS`` is downloaded and written to
    ``<output_dir>/<path with '/' replaced by '_'>.txt``, prefixed with a
    small header holding the original path.

    Args:
        repo_url: Repository URL, e.g. ``https://github.com/<owner>/<repo>``.
        output_dir: Directory receiving the ``.txt`` files; created if missing.

    Raises:
        ValueError: If ``repo_url`` cannot be parsed into owner and repo name.
        requests.RequestException: If a file download fails, times out, or
            returns an HTTP error status.
    """
    # Validate the URL before doing any I/O so bad input fails fast.
    owner, repo_name = _parse_repo_url(repo_url)
    print(f"Owner: {owner}, Repo: {repo_name}")

    # Pull GITHUB_TOKEN (if any) from a local .env file into the environment.
    load_dotenv()
    # os.getenv returns None when the variable is unset; the value is passed
    # to the client as-is.
    client = Github(os.getenv('GITHUB_TOKEN'))
    repo = client.get_repo(f"{owner}/{repo_name}")

    os.makedirs(output_dir, exist_ok=True)

    # Breadth-first walk of the tree; deque gives O(1) pops from the front.
    pending = deque(repo.get_contents(""))
    while pending:
        entry = pending.popleft()
        if entry.type == "dir":
            pending.extend(repo.get_contents(entry.path))
            continue

        if os.path.splitext(entry.name)[1] not in _CODE_EXTENSIONS:
            continue

        # Some entries expose no raw download URL; there is nothing to fetch.
        if not entry.download_url:
            print(f"Skipped (no download URL): {entry.path}")
            continue

        response = requests.get(entry.download_url, timeout=_DOWNLOAD_TIMEOUT)
        # Without this check an HTTP error page would be saved as file contents.
        response.raise_for_status()

        # Flatten the repo path into a single file name inside output_dir.
        output_filename = os.path.join(
            output_dir, f"{entry.path.replace('/', '_')}.txt"
        )
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(
                f"File Path: {entry.path}\n"
                "\n--- File Contents ---\n\n"
                f"{response.text}"
            )
        print(f"Processed: {entry.path}")

    print("Repository chunking completed.")
# Demo run: chunk a sample public repository when executed as a script.
if __name__ == "__main__":
    repo_url = "https://github.com/palmier-io/palmier-vscode-extension"
    chunk_repo(repo_url=repo_url)