feat/configurable-path-exclusion

This commit is contained in:
Hassan 2025-07-31 05:15:41 -07:00
parent 9907e6fe5b
commit 4ca1de266e
3 changed files with 109 additions and 62 deletions

View file

@ -28,7 +28,7 @@ logger = get_logger("code_graph_pipeline")
@observe @observe
async def run_code_graph_pipeline(repo_path, include_docs=False): async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None):
import cognee import cognee
from cognee.low_level import setup from cognee.low_level import setup
@ -40,14 +40,25 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
user = await get_default_user() user = await get_default_user()
detailed_extraction = True detailed_extraction = True
# Default exclusion patterns
if excluded_paths is None:
excluded_paths = [
".venv/", "venv/", "__pycache__/", ".pytest_cache/",
"build/", "dist/", "node_modules/", ".npm/", ".git/",
".svn/", ".idea/", ".vscode/", "tmp/", "temp/",
"*.pyc", "*.pyo", "*.log", "*.tmp"
]
tasks = [ tasks = [
Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction), Task(
# Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete get_repo_file_dependencies,
detailed_extraction=detailed_extraction,
excluded_paths=excluded_paths
),
Task(add_data_points, task_config={"batch_size": 30}), Task(add_data_points, task_config={"batch_size": 30}),
] ]
if include_docs: if include_docs:
# These tasks take a long time to complete
non_code_tasks = [ non_code_tasks = [
Task(get_non_py_files, task_config={"batch_size": 50}), Task(get_non_py_files, task_config={"batch_size": 50}),
Task(ingest_data, dataset_name="repo_docs", user=user), Task(ingest_data, dataset_name="repo_docs", user=user),
@ -67,7 +78,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
dataset_name = "codebase" dataset_name = "codebase"
# Save dataset to database
db_engine = get_relational_engine() db_engine = get_relational_engine()
async with db_engine.get_async_session() as session: async with db_engine.get_async_session() as session:
dataset = await create_dataset(dataset_name, user, session) dataset = await create_dataset(dataset_name, user, session)

View file

@ -1,56 +1,68 @@
import asyncio import asyncio
import math import math
import os import os
import fnmatch
# from concurrent.futures import ProcessPoolExecutor from typing import AsyncGenerator, Optional, List
from typing import AsyncGenerator
from uuid import NAMESPACE_OID, uuid5 from uuid import NAMESPACE_OID, uuid5
from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodeFile, Repository from cognee.shared.CodeGraphEntities import CodeFile, Repository
async def get_source_code_files(repo_path): async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[str]] = None):
""" """
Retrieve Python source code files from the specified repository path. Retrieve Python source code files from the specified repository path,
excluding paths and file patterns commonly irrelevant to code analysis.
This function scans the given repository path for files that have the .py extension
while excluding test files and files within a virtual environment. It returns a list of
absolute paths to the source code files that are not empty.
Parameters: Parameters:
----------- -----------
- repo_path: Root path of the repository to search
- repo_path: The file path to the repository to search for Python source files. - excluded_paths: Optional list of path fragments or glob patterns to exclude
Returns: Returns:
-------- --------
List of absolute file paths for .py files, excluding test files,
A list of absolute paths to .py files that contain source code, excluding empty empty files, and files under ignored directories or matching ignore patterns.
files, test files, and files from a virtual environment.
""" """
if not os.path.exists(repo_path):
return {}
py_files_paths = ( if not os.path.exists(repo_path):
os.path.join(root, file) return []
for root, _, files in os.walk(repo_path)
for file in files # Default exclusions
if ( default_excluded_patterns = [
file.endswith(".py") ".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/",
and not file.startswith("test_") "node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/",
and not file.endswith("_test") "*.pyc", "*.pyo", "*.log", "*.tmp"
and ".venv" not in file ]
)
) excluded_patterns = default_excluded_patterns + (excluded_paths or [])
py_files_paths = []
for root, _, files in os.walk(repo_path):
for file in files:
full_path = os.path.join(root, file)
rel_path = os.path.relpath(full_path, repo_path)
# Check for exclusion
should_exclude = any(
pattern in rel_path or fnmatch.fnmatch(rel_path, pattern)
for pattern in excluded_patterns
)
if should_exclude:
continue
if (
file.endswith(".py")
and not file.startswith("test_")
and not file.endswith("_test")
):
py_files_paths.append(full_path)
source_code_files = set() source_code_files = set()
for file_path in py_files_paths: for file_path in py_files_paths:
file_path = os.path.abspath(file_path) file_path = os.path.abspath(file_path)
if os.path.getsize(file_path) == 0: if os.path.getsize(file_path) == 0:
continue continue
source_code_files.add(file_path) source_code_files.add(file_path)
return list(source_code_files) return list(source_code_files)
@ -62,20 +74,7 @@ def run_coroutine(coroutine_func, *args, **kwargs):
This function creates a new asyncio event loop, sets it as the current loop, and This function creates a new asyncio event loop, sets it as the current loop, and
executes the given coroutine function with the provided arguments. Once the coroutine executes the given coroutine function with the provided arguments. Once the coroutine
completes, the loop is closed. Intended for use in environments where an existing event completes, the loop is closed.
loop is not available or desirable.
Parameters:
-----------
- coroutine_func: The coroutine function to be run.
- *args: Positional arguments to pass to the coroutine function.
- **kwargs: Keyword arguments to pass to the coroutine function.
Returns:
--------
The result returned by the coroutine after completion.
""" """
loop = asyncio.new_event_loop() loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop) asyncio.set_event_loop(loop)
@ -85,28 +84,24 @@ def run_coroutine(coroutine_func, *args, **kwargs):
async def get_repo_file_dependencies( async def get_repo_file_dependencies(
repo_path: str, detailed_extraction: bool = False repo_path: str,
detailed_extraction: bool = False,
excluded_paths: Optional[List[str]] = None
) -> AsyncGenerator[DataPoint, None]: ) -> AsyncGenerator[DataPoint, None]:
""" """
Generate a dependency graph for Python files in the given repository path. Generate a dependency graph for Python files in the given repository path.
Check the validity of the repository path and yield a repository object followed by the
dependencies of Python files within that repository. Raise a FileNotFoundError if the
provided path does not exist. The extraction of detailed dependencies can be controlled
via the `detailed_extraction` argument.
Parameters: Parameters:
----------- -----------
- repo_path: Path to local repository
- repo_path (str): The file path to the repository where Python files are located. - detailed_extraction: Whether to extract fine-grained dependencies
- detailed_extraction (bool): A flag indicating whether to perform a detailed - excluded_paths: Optional custom exclusion list
extraction of dependencies (default is False). (default False)
""" """
if not os.path.exists(repo_path): if not os.path.exists(repo_path):
raise FileNotFoundError(f"Repository path {repo_path} does not exist.") raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
source_code_files = await get_source_code_files(repo_path) source_code_files = await get_source_code_files(repo_path, excluded_paths=excluded_paths)
repo = Repository( repo = Repository(
id=uuid5(NAMESPACE_OID, repo_path), id=uuid5(NAMESPACE_OID, repo_path),
@ -125,11 +120,9 @@ async def get_repo_file_dependencies(
for chunk_number in range(number_of_chunks) for chunk_number in range(number_of_chunks)
] ]
# Codegraph dependencies are not installed by default, so we import where we use them.
from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
for start_range, end_range in chunk_ranges: for start_range, end_range in chunk_ranges:
# with ProcessPoolExecutor(max_workers=12) as executor:
tasks = [ tasks = [
get_local_script_dependencies(repo_path, file_path, detailed_extraction) get_local_script_dependencies(repo_path, file_path, detailed_extraction)
for file_path in source_code_files[start_range : end_range + 1] for file_path in source_code_files[start_range : end_range + 1]
@ -139,5 +132,4 @@ async def get_repo_file_dependencies(
for source_code_file in results: for source_code_file in results:
source_code_file.part_of = repo source_code_file.part_of = repo
yield source_code_file yield source_code_file

View file

@ -0,0 +1,45 @@
import asyncio
import os
import shutil
import tempfile

from cognee.tasks.repo_processor.code_graph_repo import get_source_code_files
def test_get_source_code_files_excludes_common_dirs_and_files():
    """
    Verify that get_source_code_files skips commonly-ignored directories
    (.venv, node_modules, __pycache__, .git) and ignore-pattern files
    (*.pyc, *.log, *.tmp) while still returning a regular .py source file.
    """
    test_repo = tempfile.mkdtemp()
    try:
        included_file = os.path.join(test_repo, "main.py")
        excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"]
        excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"]

        # A real source file that must survive filtering.
        with open(included_file, "w") as f:
            f.write("print('Hello world')")

        # Populate each excluded directory with a file so os.walk visits it
        # and the exclusion (not just emptiness) is what keeps it out.
        for folder in excluded_dirs:
            folder_path = os.path.join(test_repo, folder)
            os.makedirs(folder_path)
            with open(os.path.join(folder_path, "ignored.js"), "w") as f:
                f.write("// ignore this")

        # Root-level files matching ignore glob patterns.
        for file_name in excluded_files:
            with open(os.path.join(test_repo, file_name), "w") as f:
                f.write("dummy")

        # get_source_code_files is a coroutine function (async def) — calling
        # it bare returns a coroutine object, so membership checks below would
        # be meaningless; drive it to completion with asyncio.run.
        results = asyncio.run(get_source_code_files(test_repo))

        # Only the included file may appear; everything else on disk must be
        # absent from the result list.
        assert included_file in results
        for root, _, files in os.walk(test_repo):
            for name in files:
                full_path = os.path.join(root, name)
                if full_path != included_file:
                    assert full_path not in results, f"{full_path} should have been excluded"
    finally:
        # Clean up even when an assertion above fails, so temp dirs don't leak.
        shutil.rmtree(test_repo)