45 lines
No EOL
2 KiB
Python
45 lines
No EOL
2 KiB
Python
import os
|
|
import requests
|
|
from github import Github
|
|
from dotenv import load_dotenv
|
|
def chunk_repo(repo_url):
|
|
# Load environment variables from .env file
|
|
load_dotenv()
|
|
# Extract owner and repo name from the URL
|
|
_, _, _, owner, repo_name = repo_url.rstrip('/').split('/')
|
|
print(f"Owner: {owner}, Repo: {repo_name}")
|
|
# Initialize GitHub API client using the token from .env
|
|
g = Github(os.getenv('GITHUB_TOKEN'))
|
|
# Get the repository
|
|
repo = g.get_repo(f"{owner}/{repo_name}")
|
|
# Create output directory if it doesn't exist
|
|
output_dir = 'scripts/output'
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
# List of common code file extensions
|
|
code_extensions = ['.py', '.js', '.ts', '.java', '.c', '.cpp', '.cs', '.go', '.rb', '.php', '.swift', '.kt', '.rs', '.html', '.css', '.scss', '.sql']
|
|
# Traverse through all files in the repository
|
|
contents = repo.get_contents("")
|
|
while contents:
|
|
file_content = contents.pop(0)
|
|
if file_content.type == "dir":
|
|
contents.extend(repo.get_contents(file_content.path))
|
|
else:
|
|
file_extension = os.path.splitext(file_content.name)[1]
|
|
if file_extension in code_extensions:
|
|
# Get the raw content of the file
|
|
raw_content = requests.get(file_content.download_url).text
|
|
|
|
# Create a unique filename for the output
|
|
output_filename = f"{output_dir}/{file_content.path.replace('/', '_')}.txt"
|
|
|
|
# Write metadata and file contents to the output file
|
|
with open(output_filename, 'w', encoding='utf-8') as f:
|
|
f.write(f"File Path: {file_content.path}\n")
|
|
f.write("\n--- File Contents ---\n\n")
|
|
f.write(raw_content)
|
|
print(f"Processed: {file_content.path}")
|
|
print("Repository chunking completed.")
|
|
# Example usage: running this module directly chunks the sample repository.
if __name__ == "__main__":
    chunk_repo("https://github.com/palmier-io/palmier-vscode-extension")