feat/configurable-path-exclusion

This commit is contained in:
Hassan 2025-07-31 05:15:41 -07:00
parent 9907e6fe5b
commit 4ca1de266e
3 changed files with 109 additions and 62 deletions

View file

@@ -28,7 +28,7 @@ logger = get_logger("code_graph_pipeline")
@observe
async def run_code_graph_pipeline(repo_path, include_docs=False):
async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None):
import cognee
from cognee.low_level import setup
@@ -40,14 +40,25 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
user = await get_default_user()
detailed_extraction = True
# Default exclusion patterns
if excluded_paths is None:
excluded_paths = [
".venv/", "venv/", "__pycache__/", ".pytest_cache/",
"build/", "dist/", "node_modules/", ".npm/", ".git/",
".svn/", ".idea/", ".vscode/", "tmp/", "temp/",
"*.pyc", "*.pyo", "*.log", "*.tmp"
]
tasks = [
Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction),
# Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete
Task(
get_repo_file_dependencies,
detailed_extraction=detailed_extraction,
excluded_paths=excluded_paths
),
Task(add_data_points, task_config={"batch_size": 30}),
]
if include_docs:
# These tasks take a long time to complete
non_code_tasks = [
Task(get_non_py_files, task_config={"batch_size": 50}),
Task(ingest_data, dataset_name="repo_docs", user=user),
@@ -67,7 +78,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
dataset_name = "codebase"
# Save dataset to database
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
dataset = await create_dataset(dataset_name, user, session)
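With this change the exclusion list is caller-configurable. A minimal usage sketch, assuming run_code_graph_pipeline is consumed as an async generator (the repository path and the extra patterns are illustrative; they are merged with the defaults applied in get_source_code_files):

import asyncio

async def main():
    # Hypothetical repository path; "docs/" and "*.ipynb" are example extra patterns.
    async for status in run_code_graph_pipeline(
        "/path/to/repo",
        include_docs=False,
        excluded_paths=["docs/", "*.ipynb"],
    ):
        print(status)

asyncio.run(main())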

View file

@@ -1,56 +1,68 @@
import asyncio
import math
import os
# from concurrent.futures import ProcessPoolExecutor
from typing import AsyncGenerator
import fnmatch
from typing import AsyncGenerator, Optional, List
from uuid import NAMESPACE_OID, uuid5
from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodeFile, Repository
async def get_source_code_files(repo_path):
async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[str]] = None):
"""
Retrieve Python source code files from the specified repository path.
This function scans the given repository path for files that have the .py extension
while excluding test files and files within a virtual environment. It returns a list of
absolute paths to the source code files that are not empty.
Retrieve Python source code files from the specified repository path,
excluding paths and file patterns commonly irrelevant to code analysis.
Parameters:
-----------
- repo_path: The file path to the repository to search for Python source files.
- repo_path: Root path of the repository to search
- excluded_paths: Optional list of path fragments or glob patterns to exclude
Returns:
--------
A list of absolute paths to .py files that contain source code, excluding empty
files, test files, and files from a virtual environment.
List of absolute file paths for .py files, excluding test files,
empty files, and files under ignored directories or matching ignore patterns.
"""
if not os.path.exists(repo_path):
return {}
py_files_paths = (
os.path.join(root, file)
for root, _, files in os.walk(repo_path)
for file in files
if (
file.endswith(".py")
and not file.startswith("test_")
and not file.endswith("_test")
and ".venv" not in file
)
)
if not os.path.exists(repo_path):
return []
# Default exclusions
default_excluded_patterns = [
".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/",
"node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/",
"*.pyc", "*.pyo", "*.log", "*.tmp"
]
excluded_patterns = default_excluded_patterns + (excluded_paths or [])
py_files_paths = []
for root, _, files in os.walk(repo_path):
for file in files:
full_path = os.path.join(root, file)
rel_path = os.path.relpath(full_path, repo_path)
# Check for exclusion
should_exclude = any(
pattern in rel_path or fnmatch.fnmatch(rel_path, pattern)
for pattern in excluded_patterns
)
if should_exclude:
continue
if (
file.endswith(".py")
and not file.startswith("test_")
and not file.endswith("_test")
):
py_files_paths.append(full_path)
source_code_files = set()
for file_path in py_files_paths:
file_path = os.path.abspath(file_path)
if os.path.getsize(file_path) == 0:
continue
source_code_files.add(file_path)
return list(source_code_files)
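The exclusion check is deliberately loose: a pattern excludes a file if it appears as a substring of the repository-relative path or if the whole relative path matches it as a glob. A small self-contained sketch of that behaviour (paths are illustrative):

import fnmatch

patterns = [".venv/", "*.pyc"]

def is_excluded(rel_path: str) -> bool:
    # Mirrors the check above: substring match or glob match against the relative path.
    return any(p in rel_path or fnmatch.fnmatch(rel_path, p) for p in patterns)

assert is_excluded(".venv/lib/site.py")   # ".venv/" appears in the relative path
assert is_excluded("pkg/cached.pyc")      # "*.pyc" matches; fnmatch's "*" also spans "/"
assert not is_excluded("src/app.py")      # no pattern applies, so the file is kept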
@@ -62,20 +74,7 @@ def run_coroutine(coroutine_func, *args, **kwargs):
This function creates a new asyncio event loop, sets it as the current loop, and
executes the given coroutine function with the provided arguments. Once the coroutine
completes, the loop is closed. Intended for use in environments where an existing event
loop is not available or desirable.
Parameters:
-----------
- coroutine_func: The coroutine function to be run.
- *args: Positional arguments to pass to the coroutine function.
- **kwargs: Keyword arguments to pass to the coroutine function.
Returns:
--------
The result returned by the coroutine after completion.
completes, the loop is closed.
"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
@@ -85,28 +84,24 @@ def run_coroutine(coroutine_func, *args, **kwargs):
async def get_repo_file_dependencies(
repo_path: str, detailed_extraction: bool = False
repo_path: str,
detailed_extraction: bool = False,
excluded_paths: Optional[List[str]] = None
) -> AsyncGenerator[DataPoint, None]:
"""
Generate a dependency graph for Python files in the given repository path.
Check the validity of the repository path and yield a repository object followed by the
dependencies of Python files within that repository. Raise a FileNotFoundError if the
provided path does not exist. The extraction of detailed dependencies can be controlled
via the `detailed_extraction` argument.
Parameters:
-----------
- repo_path (str): The file path to the repository where Python files are located.
- detailed_extraction (bool): A flag indicating whether to perform a detailed
extraction of dependencies (default is False). (default False)
- repo_path: Path to local repository
- detailed_extraction: Whether to extract fine-grained dependencies
- excluded_paths: Optional custom exclusion list
"""
if not os.path.exists(repo_path):
raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
source_code_files = await get_source_code_files(repo_path)
source_code_files = await get_source_code_files(repo_path, excluded_paths=excluded_paths)
repo = Repository(
id=uuid5(NAMESPACE_OID, repo_path),
@@ -125,11 +120,9 @@ async def get_repo_file_dependencies(
for chunk_number in range(number_of_chunks)
]
# Codegraph dependencies are not installed by default, so we import where we use them.
from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
for start_range, end_range in chunk_ranges:
# with ProcessPoolExecutor(max_workers=12) as executor:
tasks = [
get_local_script_dependencies(repo_path, file_path, detailed_extraction)
for file_path in source_code_files[start_range : end_range + 1]
@@ -139,5 +132,4 @@ async def get_repo_file_dependencies(
for source_code_file in results:
source_code_file.part_of = repo
yield source_code_file
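For callers that use this task directly rather than through the pipeline, a sketch of driving the generator with a custom exclusion list (the repository path and extra pattern are illustrative):

import asyncio

async def collect_code_files():
    nodes = []
    async for data_point in get_repo_file_dependencies(
        "/path/to/repo",
        detailed_extraction=True,
        excluded_paths=["examples/"],  # added on top of the built-in defaults
    ):
        nodes.append(data_point)
    return nodes

nodes = asyncio.run(collect_code_files())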

View file

@@ -0,0 +1,45 @@
import asyncio
import os
import shutil
import tempfile
from cognee.tasks.repo_processor.code_graph_repo import get_source_code_files
def test_get_source_code_files_excludes_common_dirs_and_files():
# Create a temporary test directory
test_repo = tempfile.mkdtemp()
# Create files and folders to include/exclude
included_file = os.path.join(test_repo, "main.py")
excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"]
excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"]
# Create included file
with open(included_file, "w") as f:
f.write("print('Hello world')")
# Create excluded directories and files inside them
for folder in excluded_dirs:
folder_path = os.path.join(test_repo, folder)
os.makedirs(folder_path)
file_path = os.path.join(folder_path, "ignored.js")
with open(file_path, "w") as f:
f.write("// ignore this")
# Create excluded files in root
for file_name in excluded_files:
file_path = os.path.join(test_repo, file_name)
with open(file_path, "w") as f:
f.write("dummy")
# Run the coroutine to completion (get_source_code_files is async)
results = asyncio.run(get_source_code_files(test_repo))
# Assert only included file is present
assert included_file in results
for root, dirs, files in os.walk(test_repo):
for name in files:
full_path = os.path.join(root, name)
if full_path != included_file:
assert full_path not in results, f"{full_path} should have been excluded"
# Cleanup
shutil.rmtree(test_repo)