feat/configurable-path-exclusion

This commit is contained in:
Hassan 2025-07-31 05:15:41 -07:00
parent 9907e6fe5b
commit 4ca1de266e
3 changed files with 109 additions and 62 deletions

View file

@@ -28,7 +28,7 @@ logger = get_logger("code_graph_pipeline")
@observe
async def run_code_graph_pipeline(repo_path, include_docs=False):
async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None):
import cognee
from cognee.low_level import setup
@@ -40,14 +40,25 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
user = await get_default_user()
detailed_extraction = True
# Default exclusion patterns
if excluded_paths is None:
excluded_paths = [
".venv/", "venv/", "__pycache__/", ".pytest_cache/",
"build/", "dist/", "node_modules/", ".npm/", ".git/",
".svn/", ".idea/", ".vscode/", "tmp/", "temp/",
"*.pyc", "*.pyo", "*.log", "*.tmp"
]
tasks = [
Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction),
# Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete
Task(
get_repo_file_dependencies,
detailed_extraction=detailed_extraction,
excluded_paths=excluded_paths
),
Task(add_data_points, task_config={"batch_size": 30}),
]
if include_docs:
# These tasks take a long time to complete
non_code_tasks = [
Task(get_non_py_files, task_config={"batch_size": 50}),
Task(ingest_data, dataset_name="repo_docs", user=user),
@@ -67,7 +78,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
dataset_name = "codebase"
# Save dataset to database
db_engine = get_relational_engine()
async with db_engine.get_async_session() as session:
dataset = await create_dataset(dataset_name, user, session)
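With this change the exclusion list is caller-configurable. A minimal usage sketch, assuming run_code_graph_pipeline is consumed as an async generator (the repository path and the extra patterns are illustrative; they are merged with the defaults applied in get_source_code_files):

import asyncio

async def main():
    # Hypothetical repository path; "docs/" and "*.ipynb" are example extra patterns.
    async for status in run_code_graph_pipeline(
        "/path/to/repo",
        include_docs=False,
        excluded_paths=["docs/", "*.ipynb"],
    ):
        print(status)

asyncio.run(main())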

View file

@@ -1,56 +1,68 @@
import asyncio
import math
import os
# from concurrent.futures import ProcessPoolExecutor
from typing import AsyncGenerator
import fnmatch
from typing import AsyncGenerator, Optional, List
from uuid import NAMESPACE_OID, uuid5
from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodeFile, Repository
async def get_source_code_files(repo_path):
async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[str]] = None):
"""
Retrieve Python source code files from the specified repository path.
This function scans the given repository path for files that have the .py extension
while excluding test files and files within a virtual environment. It returns a list of
absolute paths to the source code files that are not empty.
Retrieve Python source code files from the specified repository path,
excluding paths and file patterns commonly irrelevant to code analysis.
Parameters:
-----------
- repo_path: The file path to the repository to search for Python source files.
- repo_path: Root path of the repository to search
- excluded_paths: Optional list of path fragments or glob patterns to exclude
Returns:
--------
A list of absolute paths to .py files that contain source code, excluding empty
files, test files, and files from a virtual environment.
List of absolute file paths for .py files, excluding test files,
empty files, and files under ignored directories or matching ignore patterns.
"""
if not os.path.exists(repo_path):
return {}
py_files_paths = (
os.path.join(root, file)
for root, _, files in os.walk(repo_path)
for file in files
if (
file.endswith(".py")
and not file.startswith("test_")
and not file.endswith("_test")
and ".venv" not in file
)
)
if not os.path.exists(repo_path):
return []
# Default exclusions
default_excluded_patterns = [
".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/",
"node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/",
"*.pyc", "*.pyo", "*.log", "*.tmp"
]
excluded_patterns = default_excluded_patterns + (excluded_paths or [])
py_files_paths = []
for root, _, files in os.walk(repo_path):
for file in files:
full_path = os.path.join(root, file)
rel_path = os.path.relpath(full_path, repo_path)
# Check for exclusion
should_exclude = any(
pattern in rel_path or fnmatch.fnmatch(rel_path, pattern)
for pattern in excluded_patterns
)
if should_exclude:
continue
if (
file.endswith(".py")
and not file.startswith("test_")
and not file.endswith("_test")
):
py_files_paths.append(full_path)
source_code_files = set()
for file_path in py_files_paths:
file_path = os.path.abspath(file_path)
if os.path.getsize(file_path) == 0:
continue
source_code_files.add(file_path)
return list(source_code_files)
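The exclusion check is deliberately loose: a pattern excludes a file if it appears as a substring of the repository-relative path or if the whole relative path matches it as a glob. A small self-contained sketch of that behaviour (paths are illustrative):

import fnmatch

patterns = [".venv/", "*.pyc"]

def is_excluded(rel_path: str) -> bool:
    # Mirrors the check above: substring match or glob match against the relative path.
    return any(p in rel_path or fnmatch.fnmatch(rel_path, p) for p in patterns)

assert is_excluded(".venv/lib/site.py")   # ".venv/" appears in the relative path
assert is_excluded("pkg/cached.pyc")      # "*.pyc" matches; fnmatch's "*" also spans "/"
assert not is_excluded("src/app.py")      # no pattern applies, so the file is kept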
@@ -62,20 +74,7 @@ def run_coroutine(coroutine_func, *args, **kwargs):
This function creates a new asyncio event loop, sets it as the current loop, and
executes the given coroutine function with the provided arguments. Once the coroutine
completes, the loop is closed. Intended for use in environments where an existing event
loop is not available or desirable.
Parameters:
-----------
- coroutine_func: The coroutine function to be run.
- *args: Positional arguments to pass to the coroutine function.
- **kwargs: Keyword arguments to pass to the coroutine function.
Returns:
--------
The result returned by the coroutine after completion.
completes, the loop is closed.
"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
@@ -85,28 +84,24 @@ def run_coroutine(coroutine_func, *args, **kwargs):
async def get_repo_file_dependencies(
repo_path: str, detailed_extraction: bool = False
repo_path: str,
detailed_extraction: bool = False,
excluded_paths: Optional[List[str]] = None
) -> AsyncGenerator[DataPoint, None]:
"""
Generate a dependency graph for Python files in the given repository path.
Check the validity of the repository path and yield a repository object followed by the
dependencies of Python files within that repository. Raise a FileNotFoundError if the
provided path does not exist. The extraction of detailed dependencies can be controlled
via the `detailed_extraction` argument.
Parameters:
-----------
- repo_path (str): The file path to the repository where Python files are located.
- detailed_extraction (bool): A flag indicating whether to perform a detailed
extraction of dependencies (default is False). (default False)
- repo_path: Path to local repository
- detailed_extraction: Whether to extract fine-grained dependencies
- excluded_paths: Optional custom exclusion list
"""
if not os.path.exists(repo_path):
raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
source_code_files = await get_source_code_files(repo_path)
source_code_files = await get_source_code_files(repo_path, excluded_paths=excluded_paths)
repo = Repository(
id=uuid5(NAMESPACE_OID, repo_path),
@@ -125,11 +120,9 @@ async def get_repo_file_dependencies(
for chunk_number in range(number_of_chunks)
]
# Codegraph dependencies are not installed by default, so we import where we use them.
from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
for start_range, end_range in chunk_ranges:
# with ProcessPoolExecutor(max_workers=12) as executor:
tasks = [
get_local_script_dependencies(repo_path, file_path, detailed_extraction)
for file_path in source_code_files[start_range : end_range + 1]
@@ -139,5 +132,4 @@ async def get_repo_file_dependencies(
for source_code_file in results:
source_code_file.part_of = repo
yield source_code_file
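For callers that use this task directly rather than through the pipeline, a sketch of driving the generator with a custom exclusion list (the repository path and extra pattern are illustrative):

import asyncio

async def collect_code_files():
    nodes = []
    async for data_point in get_repo_file_dependencies(
        "/path/to/repo",
        detailed_extraction=True,
        excluded_paths=["examples/"],  # added on top of the built-in defaults
    ):
        nodes.append(data_point)
    return nodes

nodes = asyncio.run(collect_code_files())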

View file

@@ -0,0 +1,45 @@
import asyncio
import os
import shutil
import tempfile
from cognee.tasks.repo_processor.code_graph_repo import get_source_code_files
def test_get_source_code_files_excludes_common_dirs_and_files():
# Create a temporary test directory
test_repo = tempfile.mkdtemp()
# Create files and folders to include/exclude
included_file = os.path.join(test_repo, "main.py")
excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"]
excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"]
# Create included file
with open(included_file, "w") as f:
f.write("print('Hello world')")
# Create excluded directories and files inside them
for folder in excluded_dirs:
folder_path = os.path.join(test_repo, folder)
os.makedirs(folder_path)
file_path = os.path.join(folder_path, "ignored.js")
with open(file_path, "w") as f:
f.write("// ignore this")
# Create excluded files in root
for file_name in excluded_files:
file_path = os.path.join(test_repo, file_name)
with open(file_path, "w") as f:
f.write("dummy")
# Run the coroutine to completion (get_source_code_files is async)
results = asyncio.run(get_source_code_files(test_repo))
# Assert only included file is present
assert included_file in results
for root, dirs, files in os.walk(test_repo):
for name in files:
full_path = os.path.join(root, name)
if full_path != included_file:
assert full_path not in results, f"{full_path} should have been excluded"
# Cleanup
shutil.rmtree(test_repo)