feat/configurable-path-exclusion
parent 9907e6fe5b
commit 4ca1de266e
3 changed files with 109 additions and 62 deletions
@@ -28,7 +28,7 @@ logger = get_logger("code_graph_pipeline")


 @observe
-async def run_code_graph_pipeline(repo_path, include_docs=False):
+async def run_code_graph_pipeline(repo_path, include_docs=False, excluded_paths=None):
     import cognee
     from cognee.low_level import setup

@@ -40,14 +40,25 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
     user = await get_default_user()
     detailed_extraction = True

+    # Default exclusion patterns
+    if excluded_paths is None:
+        excluded_paths = [
+            ".venv/", "venv/", "__pycache__/", ".pytest_cache/",
+            "build/", "dist/", "node_modules/", ".npm/", ".git/",
+            ".svn/", ".idea/", ".vscode/", "tmp/", "temp/",
+            "*.pyc", "*.pyo", "*.log", "*.tmp"
+        ]
+
     tasks = [
-        Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction),
         # Task(summarize_code, task_config={"batch_size": 500}),  # This task takes a long time to complete
+        Task(
+            get_repo_file_dependencies,
+            detailed_extraction=detailed_extraction,
+            excluded_paths=excluded_paths
+        ),
         Task(add_data_points, task_config={"batch_size": 30}),
     ]

     if include_docs:
         # These tasks take a long time to complete
         non_code_tasks = [
             Task(get_non_py_files, task_config={"batch_size": 50}),
             Task(ingest_data, dataset_name="repo_docs", user=user),
@@ -67,7 +78,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):

     dataset_name = "codebase"

     # Save dataset to database
     db_engine = get_relational_engine()
     async with db_engine.get_async_session() as session:
         dataset = await create_dataset(dataset_name, user, session)
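For context, a caller-side sketch of the new parameter. The import path and the async-for consumption pattern follow cognee's existing code-graph example rather than anything shown in this diff, so treat both as assumptions. Note that per the hunk above, the pipeline-level defaults apply only when excluded_paths is None, while get_source_code_files (next file) always combines caller patterns with its own defaults.

import asyncio

# Assumed import location for the pipeline entry point (not shown in this diff).
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline


async def main():
    # Custom patterns are passed through the Task into get_source_code_files,
    # which combines them with its built-in default exclusions.
    async for status in run_code_graph_pipeline(
        "/path/to/repo",
        include_docs=False,
        excluded_paths=["docs/", "examples/", "*.ipynb"],
    ):
        print(status)


if __name__ == "__main__":
    asyncio.run(main())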
@@ -1,56 +1,68 @@
 import asyncio
 import math
 import os

 # from concurrent.futures import ProcessPoolExecutor
-from typing import AsyncGenerator
+import fnmatch
+from typing import AsyncGenerator, Optional, List
 from uuid import NAMESPACE_OID, uuid5

 from cognee.infrastructure.engine import DataPoint
 from cognee.shared.CodeGraphEntities import CodeFile, Repository


-async def get_source_code_files(repo_path):
+async def get_source_code_files(repo_path: str, excluded_paths: Optional[List[str]] = None):
     """
-    Retrieve Python source code files from the specified repository path.
-
-    This function scans the given repository path for files that have the .py extension
-    while excluding test files and files within a virtual environment. It returns a list of
-    absolute paths to the source code files that are not empty.
+    Retrieve Python source code files from the specified repository path,
+    excluding paths and file patterns commonly irrelevant to code analysis.

     Parameters:
     -----------

-    - repo_path: The file path to the repository to search for Python source files.
+    - repo_path: Root path of the repository to search
+    - excluded_paths: Optional list of path fragments or glob patterns to exclude

     Returns:
     --------

-    A list of absolute paths to .py files that contain source code, excluding empty
-    files, test files, and files from a virtual environment.
+    List of absolute file paths for .py files, excluding test files,
+    empty files, and files under ignored directories or matching ignore patterns.
     """
-    if not os.path.exists(repo_path):
-        return {}
-
-    py_files_paths = (
-        os.path.join(root, file)
-        for root, _, files in os.walk(repo_path)
-        for file in files
-        if (
-            file.endswith(".py")
-            and not file.startswith("test_")
-            and not file.endswith("_test")
-            and ".venv" not in file
-        )
-    )
+    if not os.path.exists(repo_path):
+        return []
+
+    # Default exclusions
+    default_excluded_patterns = [
+        ".venv/", "venv/", "__pycache__/", ".pytest_cache/", "build/", "dist/",
+        "node_modules/", ".npm/", ".git/", ".svn/", ".idea/", ".vscode/", "tmp/", "temp/",
+        "*.pyc", "*.pyo", "*.log", "*.tmp"
+    ]
+
+    excluded_patterns = default_excluded_patterns + (excluded_paths or [])
+
+    py_files_paths = []
+    for root, _, files in os.walk(repo_path):
+        for file in files:
+            full_path = os.path.join(root, file)
+            rel_path = os.path.relpath(full_path, repo_path)
+
+            # Check for exclusion
+            should_exclude = any(
+                pattern in rel_path or fnmatch.fnmatch(rel_path, pattern)
+                for pattern in excluded_patterns
+            )
+            if should_exclude:
+                continue
+
+            if (
+                file.endswith(".py")
+                and not file.startswith("test_")
+                and not file.endswith("_test")
+            ):
+                py_files_paths.append(full_path)

     source_code_files = set()
     for file_path in py_files_paths:
         file_path = os.path.abspath(file_path)

         if os.path.getsize(file_path) == 0:
             continue

         source_code_files.add(file_path)

     return list(source_code_files)
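As a standalone illustration of the matching rule added above (a plain substring test against the path relative to the repo root, or an fnmatch glob), the sketch below reproduces just the predicate; it is not imported from the project code and the pattern list is abbreviated.

import fnmatch

excluded_patterns = [".venv/", "node_modules/", "*.pyc"]


def is_excluded(rel_path: str) -> bool:
    # Same test as in get_source_code_files: substring match OR glob match.
    return any(
        pattern in rel_path or fnmatch.fnmatch(rel_path, pattern)
        for pattern in excluded_patterns
    )


print(is_excluded(".venv/lib/site.py"))    # True  -- substring ".venv/"
print(is_excluded("pkg/util/cached.pyc"))  # True  -- glob "*.pyc" (fnmatch's * also crosses "/")
print(is_excluded("src/app/main.py"))      # False -- kept for analysis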
@@ -62,20 +74,7 @@ def run_coroutine(coroutine_func, *args, **kwargs):

     This function creates a new asyncio event loop, sets it as the current loop, and
     executes the given coroutine function with the provided arguments. Once the coroutine
-    completes, the loop is closed. Intended for use in environments where an existing event
-    loop is not available or desirable.
-
-    Parameters:
-    -----------
-
-    - coroutine_func: The coroutine function to be run.
-    - *args: Positional arguments to pass to the coroutine function.
-    - **kwargs: Keyword arguments to pass to the coroutine function.
-
-    Returns:
-    --------
-
-    The result returned by the coroutine after completion.
+    completes, the loop is closed.
     """
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
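The trimmed docstring above still describes the behaviour: run a coroutine to completion on a fresh event loop, then close the loop. Below is a self-contained sketch of that pattern; the helper and the _ping coroutine are illustrative re-statements (with a try/finally added), not the project's run_coroutine.

import asyncio


async def _ping() -> str:
    # Hypothetical coroutine used only for this illustration.
    await asyncio.sleep(0)
    return "pong"


def run_on_fresh_loop(coroutine_func, *args, **kwargs):
    # Create a new event loop, run the coroutine to completion, then close the loop.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(coroutine_func(*args, **kwargs))
    finally:
        loop.close()


print(run_on_fresh_loop(_ping))  # prints "pong"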
@@ -85,28 +84,24 @@ def run_coroutine(coroutine_func, *args, **kwargs):


 async def get_repo_file_dependencies(
-    repo_path: str, detailed_extraction: bool = False
+    repo_path: str,
+    detailed_extraction: bool = False,
+    excluded_paths: Optional[List[str]] = None
 ) -> AsyncGenerator[DataPoint, None]:
     """
     Generate a dependency graph for Python files in the given repository path.

-    Check the validity of the repository path and yield a repository object followed by the
-    dependencies of Python files within that repository. Raise a FileNotFoundError if the
-    provided path does not exist. The extraction of detailed dependencies can be controlled
-    via the `detailed_extraction` argument.
-
     Parameters:
     -----------

-    - repo_path (str): The file path to the repository where Python files are located.
-    - detailed_extraction (bool): A flag indicating whether to perform a detailed
-    extraction of dependencies (default is False). (default False)
+    - repo_path: Path to local repository
+    - detailed_extraction: Whether to extract fine-grained dependencies
+    - excluded_paths: Optional custom exclusion list
     """

     if not os.path.exists(repo_path):
         raise FileNotFoundError(f"Repository path {repo_path} does not exist.")

-    source_code_files = await get_source_code_files(repo_path)
+    source_code_files = await get_source_code_files(repo_path, excluded_paths=excluded_paths)

     repo = Repository(
         id=uuid5(NAMESPACE_OID, repo_path),
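A consumption sketch for the updated generator signature shown above. The module path is not visible in this diff; the import below assumes the path used by the new test file further down, so treat it as an assumption, and replace the placeholder repository path with a real one.

import asyncio

# Module path assumed from the new test's import (see the test file below).
from cognee.tasks.repo_processor.code_graph_repo import get_repo_file_dependencies


async def collect_data_points(repo_path: str):
    # Drain the async generator into a list of DataPoint objects.
    data_points = []
    async for data_point in get_repo_file_dependencies(
        repo_path,
        detailed_extraction=True,
        excluded_paths=["docs/", "*.ipynb"],
    ):
        data_points.append(data_point)
    return data_points


if __name__ == "__main__":
    # "/path/to/repo" is a placeholder; a non-existent path raises FileNotFoundError.
    print(len(asyncio.run(collect_data_points("/path/to/repo"))))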
@@ -125,11 +120,9 @@ async def get_repo_file_dependencies(
         for chunk_number in range(number_of_chunks)
     ]

     # Codegraph dependencies are not installed by default, so we import where we use them.
     from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies

     for start_range, end_range in chunk_ranges:
         # with ProcessPoolExecutor(max_workers=12) as executor:
         tasks = [
             get_local_script_dependencies(repo_path, file_path, detailed_extraction)
             for file_path in source_code_files[start_range : end_range + 1]
@@ -139,5 +132,4 @@ async def get_repo_file_dependencies(

         for source_code_file in results:
             source_code_file.part_of = repo

             yield source_code_file
45 cognee/tests/test_repo_processor.py Normal file
@@ -0,0 +1,45 @@
+import os
+import shutil
+import tempfile
+from cognee.tasks.repo_processor.code_graph_repo import get_source_code_files
+
+
+def test_get_source_code_files_excludes_common_dirs_and_files():
+    # Create a temporary test directory
+    test_repo = tempfile.mkdtemp()
+
+    # Create files and folders to include/exclude
+    included_file = os.path.join(test_repo, "main.py")
+    excluded_dirs = [".venv", "node_modules", "__pycache__", ".git"]
+    excluded_files = ["ignore.pyc", "temp.log", "junk.tmp"]
+
+    # Create included file
+    with open(included_file, "w") as f:
+        f.write("print('Hello world')")
+
+    # Create excluded directories and files inside them
+    for folder in excluded_dirs:
+        folder_path = os.path.join(test_repo, folder)
+        os.makedirs(folder_path)
+        file_path = os.path.join(folder_path, "ignored.js")
+        with open(file_path, "w") as f:
+            f.write("// ignore this")
+
+    # Create excluded files in root
+    for file_name in excluded_files:
+        file_path = os.path.join(test_repo, file_name)
+        with open(file_path, "w") as f:
+            f.write("dummy")
+
+    # Run function
+    import asyncio  # local import: get_source_code_files is an async function and must be awaited
+    results = asyncio.run(get_source_code_files(test_repo))
+
+    # Assert only included file is present
+    assert included_file in results
+    for root, dirs, files in os.walk(test_repo):
+        for name in files:
+            full_path = os.path.join(root, name)
+            if full_path != included_file:
+                assert full_path not in results, f"{full_path} should have been excluded"
+
+    # Cleanup
+    shutil.rmtree(test_repo)