From 4ca1de266e344ec5b914f80407760b72ebfc76b3 Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Thu, 31 Jul 2025 05:15:41 -0700 Subject: [PATCH] feat/configurable-path-exclusion --- cognee/api/v1/cognify/code_graph_pipeline.py | 20 +++- .../get_repo_file_dependencies.py | 106 ++++++++---------- cognee/tests/test_repo_processor.py | 45 ++++++++ 3 files changed, 109 insertions(+), 62 deletions(-) create mode 100644 cognee/tests/test_repo_processor.py diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 00a0d3dc9..d7faab6b5 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -28,7 +28,7 @@ logger = get_logger("code_graph_pipeline") @observe -async def run_code_graph_pipeline(repo_path, include_docs=False): +async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None): import cognee from cognee.low_level import setup @@ -40,14 +40,25 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): user = await get_default_user() detailed_extraction = True + # Default exclusion patterns + if excluded_paths is None: + excluded_paths = [ + ".venv/", "venv/", "__pycache__/", ".pytest_cache/", + "build/", "dist/", "node_modules/", ".npm/", ".git/", + ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", + "*.pyc", "*.pyo", "*.log", "*.tmp" + ] + tasks = [ - Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction), - # Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete + Task( + get_repo_file_dependencies, + detailed_extraction=detailed_extraction, + excluded_paths=excluded_paths + ), Task(add_data_points, task_config={"batch_size": 30}), ] if include_docs: - # This tasks take a long time to complete non_code_tasks = [ Task(get_non_py_files, task_config={"batch_size": 50}), Task(ingest_data, dataset_name="repo_docs", user=user), @@ -67,7 
# Patterns that are always excluded, in addition to any caller-supplied ones.
# Entries containing glob metacharacters are matched with fnmatch against the
# repo-relative path; all other entries are path fragments that must line up
# with whole path components.
_DEFAULT_EXCLUDED_PATTERNS = (
    ".venv/",
    "venv/",
    "__pycache__/",
    ".pytest_cache/",
    "build/",
    "dist/",
    "node_modules/",
    ".npm/",
    ".git/",
    ".svn/",
    ".idea/",
    ".vscode/",
    "tmp/",
    "temp/",
    "*.pyc",
    "*.pyo",
    "*.log",
    "*.tmp",
)
_GLOB_CHARS = frozenset("*?[")


def _split_exclusion_patterns(patterns):
    """Split patterns into (separator-anchored literal fragments, glob patterns)."""
    fragments = []
    globs = []
    for pattern in patterns:
        if _GLOB_CHARS.intersection(pattern):
            globs.append(pattern)
            continue
        fragment = pattern.replace("\\", "/").strip("/")
        # An empty fragment would be contained in every path, so it is ignored.
        if fragment:
            fragments.append(f"/{fragment}/")
    return fragments, globs


def _has_excluded_fragment(rel_posix_path, fragments):
    """Return True if any fragment occurs in the path on component boundaries."""
    anchored = f"/{rel_posix_path}/"
    return any(fragment in anchored for fragment in fragments)


async def get_source_code_files(
    repo_path: str, excluded_paths: Optional[List[str]] = None
) -> List[str]:
    """
    Retrieve non-empty Python source files below the given repository path.

    Files named ``test_*`` or ``*_test.py`` are skipped, as is anything matching
    the built-in exclusion patterns or the caller-supplied ``excluded_paths``.
    Matching is done on the path relative to ``repo_path`` using ``/`` as the
    separator, so the same patterns behave identically on every platform.

    Parameters:
    -----------
        - repo_path: Root path of the repository to search.
        - excluded_paths: Optional extra exclusions, applied on top of the built-in
          ones. An entry containing ``*``, ``?`` or ``[`` is an fnmatch-style glob
          tested against the relative file path; any other entry is a path fragment
          (e.g. ``"build/"`` or ``"src/generated"``) that must match whole path
          components, so ``"build/"`` does not exclude ``rebuild/``.

    Returns:
    --------
        Sorted list of absolute paths to the selected ``.py`` files. An empty list
        is returned when ``repo_path`` does not exist.
    """
    if not os.path.exists(repo_path):
        return []

    fragments, glob_patterns = _split_exclusion_patterns(
        [*_DEFAULT_EXCLUDED_PATTERNS, *(excluded_paths or [])]
    )

    source_code_files = []
    for root, dirs, files in os.walk(repo_path):
        rel_root = os.path.relpath(root, repo_path)
        prefix = "" if rel_root == os.curdir else rel_root.replace(os.sep, "/") + "/"

        # Prune excluded directories in place so os.walk never descends into them
        # (avoids traversing e.g. node_modules or .git just to discard the files).
        dirs[:] = [
            name for name in dirs if not _has_excluded_fragment(prefix + name, fragments)
        ]

        for file_name in files:
            if not file_name.endswith(".py"):
                continue
            if file_name.startswith("test_") or file_name.endswith("_test.py"):
                continue

            rel_path = prefix + file_name
            if _has_excluded_fragment(rel_path, fragments) or any(
                fnmatch.fnmatch(rel_path, pattern) for pattern in glob_patterns
            ):
                continue

            full_path = os.path.abspath(os.path.join(root, file_name))
            try:
                if os.path.getsize(full_path) == 0:
                    continue
            except OSError:
                # Unreadable entry (e.g. a dangling symlink): nothing to analyse.
                continue

            source_code_files.append(full_path)

    # Sorted so that the result order is stable between runs.
    return sorted(source_code_files)
- - *args: Positional arguments to pass to the coroutine function. - - **kwargs: Keyword arguments to pass to the coroutine function. - - Returns: - -------- - - The result returned by the coroutine after completion. + completes, the loop is closed. """ loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) @@ -85,28 +84,24 @@ def run_coroutine(coroutine_func, *args, **kwargs): async def get_repo_file_dependencies( - repo_path: str, detailed_extraction: bool = False + repo_path: str, + detailed_extraction: bool = False, + excluded_paths: Optional[List[str]] = None ) -> AsyncGenerator[DataPoint, None]: """ Generate a dependency graph for Python files in the given repository path. - Check the validity of the repository path and yield a repository object followed by the - dependencies of Python files within that repository. Raise a FileNotFoundError if the - provided path does not exist. The extraction of detailed dependencies can be controlled - via the `detailed_extraction` argument. - Parameters: ----------- - - - repo_path (str): The file path to the repository where Python files are located. - - detailed_extraction (bool): A flag indicating whether to perform a detailed - extraction of dependencies (default is False). (default False) + - repo_path: Path to local repository + - detailed_extraction: Whether to extract fine-grained dependencies + - excluded_paths: Optional custom exclusion list """ if not os.path.exists(repo_path): raise FileNotFoundError(f"Repository path {repo_path} does not exist.") - source_code_files = await get_source_code_files(repo_path) + source_code_files = await get_source_code_files(repo_path, excluded_paths=excluded_paths) repo = Repository( id=uuid5(NAMESPACE_OID, repo_path), @@ -125,11 +120,9 @@ async def get_repo_file_dependencies( for chunk_number in range(number_of_chunks) ] - # Codegraph dependencies are not installed by default, so we import where we use them. 
def test_get_source_code_files_excludes_common_dirs_and_files():
    """
    Check that get_source_code_files skips default-excluded directories and files.

    A temporary repository is built with one regular ``main.py``, a Python file
    inside each of several excluded directories, and a few excluded non-Python
    files in the root. Only ``main.py`` must be returned.
    """
    # NOTE(review): this module imports get_source_code_files from
    # cognee.tasks.repo_processor.code_graph_repo, while the function is defined in
    # cognee/tasks/repo_processor/get_repo_file_dependencies.py - confirm the
    # import path resolves.

    # Imported locally so the fix does not rely on the module's import block.
    import asyncio

    excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"]
    excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"]

    # TemporaryDirectory removes the tree even when an assertion below fails.
    with tempfile.TemporaryDirectory() as test_repo:
        included_file = os.path.join(test_repo, "main.py")
        with open(included_file, "w") as f:
            f.write("print('Hello world')")

        for folder in excluded_dirs:
            folder_path = os.path.join(test_repo, folder)
            os.makedirs(folder_path)
            # A .py file is used on purpose: a non-Python file is dropped by the
            # extension filter and would never exercise the directory exclusion.
            with open(os.path.join(folder_path, "ignored.py"), "w") as f:
                f.write("print('ignore this')")

        for file_name in excluded_files:
            with open(os.path.join(test_repo, file_name), "w") as f:
                f.write("dummy")

        # get_source_code_files is declared with `async def`, so calling it only
        # creates a coroutine; it has to be run to obtain the list of paths.
        results = asyncio.run(get_source_code_files(test_repo))

        assert sorted(results) == [os.path.abspath(included_file)]