From 4ca1de266e344ec5b914f80407760b72ebfc76b3 Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Thu, 31 Jul 2025 05:15:41 -0700 Subject: [PATCH 1/5] feat/configurable-path-exclusion --- cognee/api/v1/cognify/code_graph_pipeline.py | 20 +++- .../get_repo_file_dependencies.py | 106 ++++++++---------- cognee/tests/test_repo_processor.py | 45 ++++++++ 3 files changed, 109 insertions(+), 62 deletions(-) create mode 100644 cognee/tests/test_repo_processor.py diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 00a0d3dc9..d7faab6b5 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -28,7 +28,7 @@ logger = get_logger("code_graph_pipeline") @observe -async def run_code_graph_pipeline(repo_path, include_docs=False): +async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None): import cognee from cognee.low_level import setup @@ -40,14 +40,25 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): user = await get_default_user() detailed_extraction = True + # Default exclusion patterns + if excluded_paths is None: + excluded_paths = [ + ".venv/", "venv/", "__pycache__/", ".pytest_cache/", + "build/", "dist/", "node_modules/", ".npm/", ".git/", + ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", + "*.pyc", "*.pyo", "*.log", "*.tmp" + ] + tasks = [ - Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction), - # Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete + Task( + get_repo_file_dependencies, + detailed_extraction=detailed_extraction, + excluded_paths=excluded_paths + ), Task(add_data_points, task_config={"batch_size": 30}), ] if include_docs: - # This tasks take a long time to complete non_code_tasks = [ Task(get_non_py_files, task_config={"batch_size": 50}), Task(ingest_data, dataset_name="repo_docs", user=user), @@ -67,7 +78,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False): dataset_name = "codebase" - # Save dataset to database db_engine = get_relational_engine() async with db_engine.get_async_session() as session: dataset = await create_dataset(dataset_name, user, session) diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 232850936..2567a44cd 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -1,56 +1,68 @@ import asyncio import math import os - -# from concurrent.futures import ProcessPoolExecutor -from typing import AsyncGenerator +import fnmatch +from typing import AsyncGenerator, Optional, List from uuid import NAMESPACE_OID, uuid5 from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, Repository -async def get_source_code_files(repo_path): +async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[str]] = None): """ - Retrieve Python source code files from the specified repository path. - - This function scans the given repository path for files that have the .py extension - while excluding test files and files within a virtual environment. It returns a list of - absolute paths to the source code files that are not empty. + Retrieve Python source code files from the specified repository path, + excluding paths and file patterns commonly irrelevant to code analysis. Parameters: ----------- - - - repo_path: The file path to the repository to search for Python source files. + - repo_path: Root path of the repository to search + - excluded_paths: Optional list of path fragments or glob patterns to exclude Returns: -------- - - A list of absolute paths to .py files that contain source code, excluding empty - files, test files, and files from a virtual environment. + List of absolute file paths for .py files, excluding test files, + empty files, and files under ignored directories or matching ignore patterns. """ - if not os.path.exists(repo_path): - return {} - py_files_paths = ( - os.path.join(root, file) - for root, _, files in os.walk(repo_path) - for file in files - if ( - file.endswith(".py") - and not file.startswith("test_") - and not file.endswith("_test") - and ".venv" not in file - ) - ) + if not os.path.exists(repo_path): + return [] + + # Default exclusions + default_excluded_patterns = [ + ".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/", + "node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", + "*.pyc", "*.pyo", "*.log", "*.tmp" + ] + + excluded_patterns = default_excluded_patterns + (excluded_paths or []) + + py_files_paths = [] + for root, _, files in os.walk(repo_path): + for file in files: + full_path = os.path.join(root, file) + rel_path = os.path.relpath(full_path, repo_path) + + # Check for exclusion + should_exclude = any( + pattern in rel_path or fnmatch.fnmatch(rel_path, pattern) + for pattern in excluded_patterns + ) + if should_exclude: + continue + + if ( + file.endswith(".py") + and not file.startswith("test_") + and not file.endswith("_test") + ): + py_files_paths.append(full_path) source_code_files = set() for file_path in py_files_paths: file_path = os.path.abspath(file_path) - if os.path.getsize(file_path) == 0: continue - source_code_files.add(file_path) return list(source_code_files) @@ -62,20 +74,7 @@ def run_coroutine(coroutine_func, *args, **kwargs): This function creates a new asyncio event loop, sets it as the current loop, and executes the given coroutine function with the provided arguments. Once the coroutine - completes, the loop is closed. Intended for use in environments where an existing event - loop is not available or desirable. - - Parameters: - ----------- - - - coroutine_func: The coroutine function to be run. - - *args: Positional arguments to pass to the coroutine function. - - **kwargs: Keyword arguments to pass to the coroutine function. - - Returns: - -------- - - The result returned by the coroutine after completion. + completes, the loop is closed. """ loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) @@ -85,28 +84,24 @@ def run_coroutine(coroutine_func, *args, **kwargs): async def get_repo_file_dependencies( - repo_path: str, detailed_extraction: bool = False + repo_path: str, + detailed_extraction: bool = False, + excluded_paths: Optional[List[str]] = None ) -> AsyncGenerator[DataPoint, None]: """ Generate a dependency graph for Python files in the given repository path. - Check the validity of the repository path and yield a repository object followed by the - dependencies of Python files within that repository. Raise a FileNotFoundError if the - provided path does not exist. The extraction of detailed dependencies can be controlled - via the `detailed_extraction` argument. - Parameters: ----------- - - - repo_path (str): The file path to the repository where Python files are located. - - detailed_extraction (bool): A flag indicating whether to perform a detailed - extraction of dependencies (default is False). (default False) + - repo_path: Path to local repository + - detailed_extraction: Whether to extract fine-grained dependencies + - excluded_paths: Optional custom exclusion list """ if not os.path.exists(repo_path): raise FileNotFoundError(f"Repository path {repo_path} does not exist.") - source_code_files = await get_source_code_files(repo_path) + source_code_files = await get_source_code_files(repo_path, excluded_paths=excluded_paths) repo = Repository( id=uuid5(NAMESPACE_OID, repo_path), @@ -125,11 +120,9 @@ async def get_repo_file_dependencies( for chunk_number in range(number_of_chunks) ] - # Codegraph dependencies are not installed by default, so we import where we use them. from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies for start_range, end_range in chunk_ranges: - # with ProcessPoolExecutor(max_workers=12) as executor: tasks = [ get_local_script_dependencies(repo_path, file_path, detailed_extraction) for file_path in source_code_files[start_range : end_range + 1] @@ -139,5 +132,4 @@ async def get_repo_file_dependencies( for source_code_file in results: source_code_file.part_of = repo - yield source_code_file diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py new file mode 100644 index 000000000..4de102da6 --- /dev/null +++ b/cognee/tests/test_repo_processor.py @@ -0,0 +1,45 @@ +import os +import shutil +import tempfile +from cognee.tasks.repo_processor.code_graph_repo import get_source_code_files + +def test_get_source_code_files_excludes_common_dirs_and_files(): + # Create a temporary test directory + test_repo = tempfile.mkdtemp() + + # Create files and folders to include/exclude + included_file = os.path.join(test_repo, "main.py") + excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"] + excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"] + + # Create included file + with open(included_file, "w") as f: + f.write("print('Hello world')") + + # Create excluded directories and files inside them + for folder in excluded_dirs: + folder_path = os.path.join(test_repo, folder) + os.makedirs(folder_path) + file_path = os.path.join(folder_path, "ignored.js") + with open(file_path, "w") as f: + f.write("// ignore this") + + # Create excluded files in root + for file_name in excluded_files: + file_path = os.path.join(test_repo, file_name) + with open(file_path, "w") as f: + f.write("dummy") + + # Run function + results = get_source_code_files(test_repo) + + # Assert only included file is present + assert included_file in results + for root, dirs, files in os.walk(test_repo): + for name in files: + full_path = os.path.join(root, name) + if full_path != included_file: + assert full_path not in results, f"{full_path} should have been excluded" + + # Cleanup + shutil.rmtree(test_repo) From c898895f2229f851127a977411abb6b9cc6a4f74 Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Thu, 31 Jul 2025 07:00:11 -0700 Subject: [PATCH 2/5] feat/configurable-path-exclusion --- cognee/tests/test_repo_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py index 4de102da6..fc3c26b05 100644 --- a/cognee/tests/test_repo_processor.py +++ b/cognee/tests/test_repo_processor.py @@ -1,7 +1,7 @@ import os import shutil import tempfile -from cognee.tasks.repo_processor.code_graph_repo import get_source_code_files +from cognee.tasks.repo_processor.get_repo_file_dependencies import get_source_code_files def test_get_source_code_files_excludes_common_dirs_and_files(): # Create a temporary test directory From 8f26a01b3ab744a818bfeaeae932a41921f92ccc Mon Sep 17 00:00:00 2001 From: Hassan <261925524@formanite.fccollege.edu.pk> Date: Sat, 2 Aug 2025 10:33:07 -0700 Subject: [PATCH 3/5] style: run ruff format and fix lint issues --- cognee/api/v1/cognify/code_graph_pipeline.py | 24 +++++++++++--- .../get_repo_file_dependencies.py | 31 ++++++++++++------- cognee/tests/test_repo_processor.py | 1 + 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index d7faab6b5..ae1c8b0ac 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -43,17 +43,31 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths= # Default exclusion patterns if excluded_paths is None: excluded_paths = [ - ".venv/", "venv/", "__pycache__/", ".pytest_cache/", - "build/", "dist/", "node_modules/", ".npm/", ".git/", - ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", - "*.pyc", "*.pyo", "*.log", "*.tmp" + ".venv/", + "venv/", + "__pycache__/", + ".pytest_cache/", + "build/", + "dist/", + "node_modules/", + ".npm/", + ".git/", + ".svn/", + ".idea/", + ".vscode/", + "tmp/", + "temp/", + "*.pyc", + "*.pyo", + "*.log", + "*.tmp", ] tasks = [ Task( get_repo_file_dependencies, detailed_extraction=detailed_extraction, - excluded_paths=excluded_paths + excluded_paths=excluded_paths, ), Task(add_data_points, task_config={"batch_size": 30}), ] diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 2567a44cd..f1435a9e2 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -30,9 +30,24 @@ async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[st # Default exclusions default_excluded_patterns = [ - ".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/", - "node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/", - "*.pyc", "*.pyo", "*.log", "*.tmp" + ".venv/", + "venv/", + "__pycache__/", + ".pytest_cache/", + "build/", + "dist/", + "node_modules/", + ".npm/", + ".git/", + ".svn/", + ".idea/", + ".vscode/", + "tmp/", + "temp/", + "*.pyc", + "*.pyo", + "*.log", + "*.tmp", ] excluded_patterns = default_excluded_patterns + (excluded_paths or []) @@ -51,11 +66,7 @@ async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[st if should_exclude: continue - if ( - file.endswith(".py") - and not file.startswith("test_") - and not file.endswith("_test") - ): + if file.endswith(".py") and not file.startswith("test_") and not file.endswith("_test"): py_files_paths.append(full_path) source_code_files = set() @@ -84,9 +95,7 @@ def run_coroutine(coroutine_func, *args, **kwargs): async def get_repo_file_dependencies( - repo_path: str, - detailed_extraction: bool = False, - excluded_paths: Optional[List[str]] = None + repo_path: str, detailed_extraction: bool = False, excluded_paths: Optional[List[str]] = None ) -> AsyncGenerator[DataPoint, None]: """ Generate a dependency graph for Python files in the given repository path. diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py index fc3c26b05..2d5868f36 100644 --- a/cognee/tests/test_repo_processor.py +++ b/cognee/tests/test_repo_processor.py @@ -3,6 +3,7 @@ import shutil import tempfile from cognee.tasks.repo_processor.get_repo_file_dependencies import get_source_code_files + def test_get_source_code_files_excludes_common_dirs_and_files(): # Create a temporary test directory test_repo = tempfile.mkdtemp() From 4159846bb39c2197b460f28d28b205953bf8ed39 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 29 Aug 2025 16:04:14 +0200 Subject: [PATCH 4/5] fix: Make exluded paths use absolute path --- cognee/api/v1/cognify/code_graph_pipeline.py | 12 ++++++++---- cognee/modules/retrieval/code_retriever.py | 8 ++++++++ .../get_repo_file_dependencies.py | 17 +++++++++++++---- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 66b8568fa..fb3612857 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -1,6 +1,7 @@ import os import pathlib import asyncio +from typing import Optional from cognee.shared.logging_utils import get_logger, setup_logging from cognee.modules.observability.get_observe import get_observe @@ -28,7 +29,12 @@ logger = get_logger("code_graph_pipeline") @observe -async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None): +async def run_code_graph_pipeline( + repo_path, + include_docs=False, + excluded_paths: Optional[list[str]] = None, + supported_languages: Optional[list[str]] = None, +): import cognee from cognee.low_level import setup @@ -40,8 +46,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths= user = await get_default_user() detailed_extraction = True - # Multi-language support: allow passing supported_languages - supported_languages = None # defer to task defaults tasks = [ Task( get_repo_file_dependencies, @@ -95,7 +99,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths= if __name__ == "__main__": async def main(): - async for run_status in run_code_graph_pipeline("/Users/igorilic/Desktop/cognee/examples"): + async for run_status in run_code_graph_pipeline("REPO_PATH"): print(f"{run_status.pipeline_run_id}: {run_status.status}") file_path = os.path.join( diff --git a/cognee/modules/retrieval/code_retriever.py b/cognee/modules/retrieval/code_retriever.py index 6e819d8a7..76b5e758c 100644 --- a/cognee/modules/retrieval/code_retriever.py +++ b/cognee/modules/retrieval/code_retriever.py @@ -94,7 +94,15 @@ class CodeRetriever(BaseRetriever): {"id": res.id, "score": res.score, "payload": res.payload} ) + existing_collection = [] for collection in self.classes_and_functions_collections: + if await vector_engine.has_collection(collection): + existing_collection.append(collection) + + if not existing_collection: + raise RuntimeError("No collection found for code retriever") + + for collection in existing_collection: logger.debug(f"Searching {collection} collection with general query") search_results_code = await vector_engine.search( collection, query, limit=self.top_k diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py index 3ebf1fcb1..06cc3bddb 100644 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py @@ -1,6 +1,7 @@ import asyncio import math import os +from pathlib import Path from typing import Set from typing import AsyncGenerator, Optional, List from uuid import NAMESPACE_OID, uuid5 @@ -78,15 +79,22 @@ async def get_source_code_files( if lang is None: continue # Exclude tests, common build/venv directories and files provided in exclude_paths - excluded_dirs = EXCLUDED_DIRS | set(excluded_paths or []) - root_parts = set(os.path.normpath(root).split(os.sep)) + excluded_dirs = EXCLUDED_DIRS + excluded_paths = {Path(p).resolve() for p in (excluded_paths or [])} # full paths + + root_path = Path(root).resolve() + root_parts = set(root_path.parts) # same as before base_name, _ext = os.path.splitext(file) if ( base_name.startswith("test_") - or base_name.endswith("_test") # catches Go's *_test.go and similar + or base_name.endswith("_test") or ".test." in file or ".spec." in file - or (excluded_dirs & root_parts) + or (excluded_dirs & root_parts) # name match + or any( + root_path.is_relative_to(p) # full-path match + for p in excluded_paths + ) ): continue file_path = os.path.abspath(os.path.join(root, file)) @@ -164,6 +172,7 @@ async def get_repo_file_dependencies( "go": [".go"], "rust": [".rs"], "cpp": [".cpp", ".c", ".h", ".hpp"], + "c": [".c", ".h"], } if supported_languages is not None: language_config = { From 0ecea42c2ccc0a12cf69b5dc23b51ae5196f0da5 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Fri, 29 Aug 2025 16:12:25 +0200 Subject: [PATCH 5/5] test: Remove repo path test --- cognee/tests/test_repo_processor.py | 46 ----------------------------- 1 file changed, 46 deletions(-) delete mode 100644 cognee/tests/test_repo_processor.py diff --git a/cognee/tests/test_repo_processor.py b/cognee/tests/test_repo_processor.py deleted file mode 100644 index 2d5868f36..000000000 --- a/cognee/tests/test_repo_processor.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -import shutil -import tempfile -from cognee.tasks.repo_processor.get_repo_file_dependencies import get_source_code_files - - -def test_get_source_code_files_excludes_common_dirs_and_files(): - # Create a temporary test directory - test_repo = tempfile.mkdtemp() - - # Create files and folders to include/exclude - included_file = os.path.join(test_repo, "main.py") - excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"] - excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"] - - # Create included file - with open(included_file, "w") as f: - f.write("print('Hello world')") - - # Create excluded directories and files inside them - for folder in excluded_dirs: - folder_path = os.path.join(test_repo, folder) - os.makedirs(folder_path) - file_path = os.path.join(folder_path, "ignored.js") - with open(file_path, "w") as f: - f.write("// ignore this") - - # Create excluded files in root - for file_name in excluded_files: - file_path = os.path.join(test_repo, file_name) - with open(file_path, "w") as f: - f.write("dummy") - - # Run function - results = get_source_code_files(test_repo) - - # Assert only included file is present - assert included_file in results - for root, dirs, files in os.walk(test_repo): - for name in files: - full_path = os.path.join(root, name) - if full_path != included_file: - assert full_path not in results, f"{full_path} should have been excluded" - - # Cleanup - shutil.rmtree(test_repo)