From 4cee9a16ce16048aedf4678201ed16b5ce22273b Mon Sep 17 00:00:00 2001
From: lxobr <122801072+lxobr@users.noreply.github.com>
Date: Mon, 6 Jan 2025 11:22:45 +0100
Subject: [PATCH] fix: add allowed extensions

---
 .../repo_processor/get_non_code_files.py      | 21 ++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py
index 671b998d9..f060782b6 100644
--- a/cognee/tasks/repo_processor/get_non_code_files.py
+++ b/cognee/tasks/repo_processor/get_non_code_files.py
@@ -25,8 +25,27 @@ async def get_non_py_files(repo_path):
         'node_modules', '*.egg-info'
     }
 
+    ALLOWED_EXTENSIONS = {
+        '.txt', '.md', '.csv', '.json', '.xml', '.yaml', '.yml', '.html',
+        '.css', '.js', '.ts', '.jsx', '.tsx', '.sql', '.log', '.ini',
+        '.toml', '.properties', '.sh', '.bash', '.dockerfile', '.gitignore',
+        '.gitattributes', '.makefile', '.pyproject', '.requirements',
+        '.env', '.pdf', '.doc', '.docx', '.dot', '.dotx', '.rtf',
+        '.wps', '.wpd', '.odt', '.ott', '.ottx', '.txt', '.wp',
+        '.sdw', '.sdx', '.docm', '.dotm',
+        # Additional extensions for other programming languages
+        '.java', '.c', '.cpp', '.h', '.cs', '.go', '.php', '.rb',
+        '.swift', '.pl', '.lua', '.rs', '.scala', '.kt', '.sh',
+        '.sql', '.v', '.asm', '.pas', '.d', '.ml', '.clj', '.cljs',
+        '.erl', '.ex', '.exs', '.f', '.fs', '.r', '.pyi',
+        '.pdb', '.ipynb', '.rmd', '.cabal', '.hs', '.nim',
+        '.vhdl', '.verilog', '.svelte', '.html', '.css', '.scss',
+        '.less', '.json5', '.yaml', '.yml'
+    }
+
     def should_process(path):
-        return not any(pattern in path for pattern in IGNORED_PATTERNS)
+        _, ext = os.path.splitext(path)
+        return ext in ALLOWED_EXTENSIONS and not any(pattern in path for pattern in IGNORED_PATTERNS)
 
     non_py_files_paths = [
         os.path.join(root, file)