From 18e4bb48fdedd0b7efc1e22d78b618d3294c5c9a Mon Sep 17 00:00:00 2001 From: Andrej Milicevic Date: Wed, 5 Nov 2025 13:02:56 +0100 Subject: [PATCH] refactor: remove code and repository related tasks --- .../code/enrich_dependency_graph_checker.py | 35 -- .../code/get_local_dependencies_checker.py | 20 -- .../code/get_repo_dependency_graph_checker.py | 35 -- cognee/tasks/repo_processor/__init__.py | 2 - .../repo_processor/get_local_dependencies.py | 335 ------------------ .../repo_processor/get_non_code_files.py | 158 --------- .../get_repo_file_dependencies.py | 243 ------------- 7 files changed, 828 deletions(-) delete mode 100644 cognee/tasks/code/enrich_dependency_graph_checker.py delete mode 100644 cognee/tasks/code/get_local_dependencies_checker.py delete mode 100644 cognee/tasks/code/get_repo_dependency_graph_checker.py delete mode 100644 cognee/tasks/repo_processor/__init__.py delete mode 100644 cognee/tasks/repo_processor/get_local_dependencies.py delete mode 100644 cognee/tasks/repo_processor/get_non_code_files.py delete mode 100644 cognee/tasks/repo_processor/get_repo_file_dependencies.py diff --git a/cognee/tasks/code/enrich_dependency_graph_checker.py b/cognee/tasks/code/enrich_dependency_graph_checker.py deleted file mode 100644 index 1b3a80210..000000000 --- a/cognee/tasks/code/enrich_dependency_graph_checker.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -import asyncio -import argparse -from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies -from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph - - -def main(): - """ - Execute the main logic of the dependency graph processor. - - This function sets up argument parsing to retrieve the repository path, checks the - existence of the specified path, and processes the repository to produce a dependency - graph. If the repository path does not exist, it logs an error message and terminates - without further execution. - """ - parser = argparse.ArgumentParser() - parser.add_argument("repo_path", help="Path to the repository") - args = parser.parse_args() - - repo_path = args.repo_path - if not os.path.exists(repo_path): - print(f"Error: The provided repository path does not exist: {repo_path}") - return - - graph = asyncio.run(get_repo_file_dependencies(repo_path)) - graph = asyncio.run(enrich_dependency_graph(graph)) - for node in graph.nodes: - print(f"Node: {node}") - for _, target, data in graph.out_edges(node, data=True): - print(f" Edge to {target}, data: {data}") - - -if __name__ == "__main__": - main() diff --git a/cognee/tasks/code/get_local_dependencies_checker.py b/cognee/tasks/code/get_local_dependencies_checker.py deleted file mode 100644 index 5d465254a..000000000 --- a/cognee/tasks/code/get_local_dependencies_checker.py +++ /dev/null @@ -1,20 +0,0 @@ -import argparse -import asyncio -from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Get local script dependencies.") - - # Suggested path: .../cognee/examples/python/simple_example.py - parser.add_argument("script_path", type=str, help="Absolute path to the Python script file") - - # Suggested path: .../cognee - parser.add_argument("repo_path", type=str, help="Absolute path to the repository root") - - args = parser.parse_args() - - dependencies = asyncio.run(get_local_script_dependencies(args.script_path, args.repo_path)) - - print("Dependencies:") - for dependency in dependencies: - print(dependency) diff --git a/cognee/tasks/code/get_repo_dependency_graph_checker.py b/cognee/tasks/code/get_repo_dependency_graph_checker.py deleted file mode 100644 index 0e68cf7fe..000000000 --- a/cognee/tasks/code/get_repo_dependency_graph_checker.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -import asyncio -import argparse -from cognee.tasks.repo_processor.get_repo_file_dependencies import get_repo_file_dependencies - - -def main(): - """ - Parse the command line arguments and print the repository file dependencies. - - This function sets up an argument parser to retrieve the path of a repository. It checks - if the provided path exists and if it doesn’t, it prints an error message and exits. If - the path is valid, it calls an asynchronous function to get the dependencies and prints - the nodes and their relations in the dependency graph. - """ - parser = argparse.ArgumentParser() - parser.add_argument("repo_path", help="Path to the repository") - args = parser.parse_args() - - repo_path = args.repo_path - if not os.path.exists(repo_path): - print(f"Error: The provided repository path does not exist: {repo_path}") - return - - graph = asyncio.run(get_repo_file_dependencies(repo_path)) - - for node in graph.nodes: - print(f"Node: {node}") - edges = graph.edges(node, data=True) - for _, target, data in edges: - print(f" Edge to {target}, Relation: {data.get('relation')}") - - -if __name__ == "__main__": - main() diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py deleted file mode 100644 index 71ef3c287..000000000 --- a/cognee/tasks/repo_processor/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .get_non_code_files import get_non_py_files -from .get_repo_file_dependencies import get_repo_file_dependencies diff --git a/cognee/tasks/repo_processor/get_local_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py deleted file mode 100644 index f691d4a3e..000000000 --- a/cognee/tasks/repo_processor/get_local_dependencies.py +++ /dev/null @@ -1,335 +0,0 @@ -import os -import aiofiles -import importlib -from typing import AsyncGenerator, Optional -from uuid import NAMESPACE_OID, uuid5 -import tree_sitter_python as tspython -from tree_sitter import Language, Node, Parser, Tree -from cognee.shared.logging_utils import get_logger - -from cognee.low_level import DataPoint -from cognee.shared.CodeGraphEntities import ( - CodeFile, - ImportStatement, - FunctionDefinition, - ClassDefinition, -) - -logger = get_logger() - - -class FileParser: - """ - Handles the parsing of files into source code and an abstract syntax tree - representation. Public methods include: - - - parse_file: Parses a file and returns its source code and syntax tree representation. - """ - - def __init__(self): - self.parsed_files = {} - - async def parse_file(self, file_path: str) -> tuple[str, Tree]: - """ - Parse a file and return its source code along with its syntax tree representation. - - If the file has already been parsed, retrieve the result from memory instead of reading - the file again. - - Parameters: - ----------- - - - file_path (str): The path of the file to parse. - - Returns: - -------- - - - tuple[str, Tree]: A tuple containing the source code of the file and its - corresponding syntax tree representation. - """ - PY_LANGUAGE = Language(tspython.language()) - source_code_parser = Parser(PY_LANGUAGE) - - if file_path not in self.parsed_files: - source_code = await get_source_code(file_path) - source_code_tree = source_code_parser.parse(bytes(source_code, "utf-8")) - self.parsed_files[file_path] = (source_code, source_code_tree) - - return self.parsed_files[file_path] - - -async def get_source_code(file_path: str): - """ - Read source code from a file asynchronously. - - This function attempts to open a file specified by the given file path, read its - contents, and return the source code. In case of any errors during the file reading - process, it logs an error message and returns None. - - Parameters: - ----------- - - - file_path (str): The path to the file from which to read the source code. - - Returns: - -------- - - Returns the contents of the file as a string if successful, or None if an error - occurs. - """ - try: - async with aiofiles.open(file_path, "r", encoding="utf-8") as f: - source_code = await f.read() - return source_code - except Exception as error: - logger.error(f"Error reading file {file_path}: {str(error)}") - return None - - -def resolve_module_path(module_name): - """ - Find the file path of a module. - - Return the file path of the specified module if found, or return None if the module does - not exist or cannot be located. - - Parameters: - ----------- - - - module_name: The name of the module whose file path is to be resolved. - - Returns: - -------- - - The file path of the module as a string or None if the module is not found. - """ - try: - spec = importlib.util.find_spec(module_name) - if spec and spec.origin: - return spec.origin - except ModuleNotFoundError: - return None - return None - - -def find_function_location( - module_path: str, function_name: str, parser: FileParser -) -> Optional[tuple[str, str]]: - """ - Find the location of a function definition in a specified module. - - Parameters: - ----------- - - - module_path (str): The path to the module where the function is defined. - - function_name (str): The name of the function whose location is to be found. - - parser (FileParser): An instance of FileParser used to parse the module's source - code. - - Returns: - -------- - - - Optional[tuple[str, str]]: Returns a tuple containing the module path and the - start point of the function if found; otherwise, returns None. - """ - if not module_path or not os.path.exists(module_path): - return None - - source_code, tree = parser.parse_file(module_path) - root_node: Node = tree.root_node - - for node in root_node.children: - if node.type == "function_definition": - func_name_node = node.child_by_field_name("name") - - if func_name_node and func_name_node.text.decode() == function_name: - return (module_path, node.start_point) # (line, column) - - return None - - -async def get_local_script_dependencies( - repo_path: str, script_path: str, detailed_extraction: bool = False -) -> CodeFile: - """ - Retrieve local script dependencies and create a CodeFile object. - - Parameters: - ----------- - - - repo_path (str): The path to the repository that contains the script. - - script_path (str): The path of the script for which dependencies are being - extracted. - - detailed_extraction (bool): A flag indicating whether to perform a detailed - extraction of code components. - - Returns: - -------- - - - CodeFile: Returns a CodeFile object containing information about the script, - including its dependencies and definitions. - """ - code_file_parser = FileParser() - source_code, source_code_tree = await code_file_parser.parse_file(script_path) - - file_path_relative_to_repo = script_path[len(repo_path) + 1 :] - - if not detailed_extraction: - code_file_node = CodeFile( - id=uuid5(NAMESPACE_OID, script_path), - name=file_path_relative_to_repo, - source_code=source_code, - file_path=script_path, - language="python", - ) - return code_file_node - - code_file_node = CodeFile( - id=uuid5(NAMESPACE_OID, script_path), - name=file_path_relative_to_repo, - source_code=None, - file_path=script_path, - language="python", - ) - - async for part in extract_code_parts(source_code_tree.root_node, script_path=script_path): - part.file_path = script_path - - if isinstance(part, FunctionDefinition): - code_file_node.provides_function_definition.append(part) - if isinstance(part, ClassDefinition): - code_file_node.provides_class_definition.append(part) - if isinstance(part, ImportStatement): - code_file_node.depends_on.append(part) - - return code_file_node - - -def find_node(nodes: list[Node], condition: callable) -> Node: - """ - Find and return the first node that satisfies the given condition. - - Iterate through the provided list of nodes and return the first node for which the - condition callable returns True. If no such node is found, return None. - - Parameters: - ----------- - - - nodes (list[Node]): A list of Node objects to search through. - - condition (callable): A callable that takes a Node and returns a boolean - indicating if the node meets specified criteria. - - Returns: - -------- - - - Node: The first Node that matches the condition, or None if no such node exists. - """ - for node in nodes: - if condition(node): - return node - - return None - - -async def extract_code_parts( - tree_root: Node, script_path: str, existing_nodes: list[DataPoint] = {} -) -> AsyncGenerator[DataPoint, None]: - """ - Extract code parts from a given AST node tree asynchronously. - - Iteratively yields DataPoint nodes representing import statements, function definitions, - and class definitions found in the children of the specified tree root. The function - checks - if nodes are already present in the existing_nodes dictionary to prevent duplicates. - This function has to be used in an asynchronous context, and it requires a valid - tree_root - and proper initialization of existing_nodes. - - Parameters: - ----------- - - - tree_root (Node): The root node of the AST tree containing code parts to extract. - - script_path (str): The file path of the script from which the AST was generated. - - existing_nodes (list[DataPoint]): A dictionary that holds already extracted - DataPoint nodes to avoid duplicates. (default {}) - - Returns: - -------- - - Yields DataPoint nodes representing imported modules, functions, and classes. - """ - for child_node in tree_root.children: - if child_node.type == "import_statement" or child_node.type == "import_from_statement": - parts = child_node.text.decode("utf-8").split() - - if parts[0] == "import": - module_name = parts[1] - function_name = None - elif parts[0] == "from": - module_name = parts[1] - function_name = parts[3] - - if " as " in function_name: - function_name = function_name.split(" as ")[0] - - if " as " in module_name: - module_name = module_name.split(" as ")[0] - - if function_name and "import " + function_name not in existing_nodes: - import_statement_node = ImportStatement( - name=function_name, - module=module_name, - start_point=child_node.start_point, - end_point=child_node.end_point, - file_path=script_path, - source_code=child_node.text, - ) - existing_nodes["import " + function_name] = import_statement_node - - if function_name: - yield existing_nodes["import " + function_name] - - if module_name not in existing_nodes: - import_statement_node = ImportStatement( - name=module_name, - module=module_name, - start_point=child_node.start_point, - end_point=child_node.end_point, - file_path=script_path, - source_code=child_node.text, - ) - existing_nodes[module_name] = import_statement_node - - yield existing_nodes[module_name] - - if child_node.type == "function_definition": - function_node = find_node(child_node.children, lambda node: node.type == "identifier") - function_node_name = function_node.text - - if function_node_name not in existing_nodes: - function_definition_node = FunctionDefinition( - name=function_node_name, - start_point=child_node.start_point, - end_point=child_node.end_point, - file_path=script_path, - source_code=child_node.text, - ) - existing_nodes[function_node_name] = function_definition_node - - yield existing_nodes[function_node_name] - - if child_node.type == "class_definition": - class_name_node = find_node(child_node.children, lambda node: node.type == "identifier") - class_name_node_name = class_name_node.text - - if class_name_node_name not in existing_nodes: - class_definition_node = ClassDefinition( - name=class_name_node_name, - start_point=child_node.start_point, - end_point=child_node.end_point, - file_path=script_path, - source_code=child_node.text, - ) - existing_nodes[class_name_node_name] = class_definition_node - - yield existing_nodes[class_name_node_name] diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py deleted file mode 100644 index b9ab1d4c6..000000000 --- a/cognee/tasks/repo_processor/get_non_code_files.py +++ /dev/null @@ -1,158 +0,0 @@ -import os - - -async def get_non_py_files(repo_path): - """ - Get files that are not .py files and their contents. - - Check if the specified repository path exists and if so, traverse the directory, - collecting the paths of files that do not have a .py extension and meet the - criteria set in the allowed and ignored patterns. Return a list of paths to - those files. - - Parameters: - ----------- - - - repo_path: The file system path to the repository to scan for non-Python files. - - Returns: - -------- - - A list of file paths that are not Python files and meet the specified criteria. - """ - if not os.path.exists(repo_path): - return {} - - IGNORED_PATTERNS = { - ".git", - "__pycache__", - "*.pyc", - "*.pyo", - "*.pyd", - "node_modules", - "*.egg-info", - } - - ALLOWED_EXTENSIONS = { - ".txt", - ".md", - ".csv", - ".json", - ".xml", - ".yaml", - ".yml", - ".html", - ".css", - ".js", - ".ts", - ".jsx", - ".tsx", - ".sql", - ".log", - ".ini", - ".toml", - ".properties", - ".sh", - ".bash", - ".dockerfile", - ".gitignore", - ".gitattributes", - ".makefile", - ".pyproject", - ".requirements", - ".env", - ".pdf", - ".doc", - ".docx", - ".dot", - ".dotx", - ".rtf", - ".wps", - ".wpd", - ".odt", - ".ott", - ".ottx", - ".txt", - ".wp", - ".sdw", - ".sdx", - ".docm", - ".dotm", - # Additional extensions for other programming languages - ".java", - ".c", - ".cpp", - ".h", - ".cs", - ".go", - ".php", - ".rb", - ".swift", - ".pl", - ".lua", - ".rs", - ".scala", - ".kt", - ".sh", - ".sql", - ".v", - ".asm", - ".pas", - ".d", - ".ml", - ".clj", - ".cljs", - ".erl", - ".ex", - ".exs", - ".f", - ".fs", - ".r", - ".pyi", - ".pdb", - ".ipynb", - ".rmd", - ".cabal", - ".hs", - ".nim", - ".vhdl", - ".verilog", - ".svelte", - ".html", - ".css", - ".scss", - ".less", - ".json5", - ".yaml", - ".yml", - } - - def should_process(path): - """ - Determine if a file should be processed based on its extension and path patterns. - - This function checks if the file extension is in the allowed list and ensures that none - of the ignored patterns are present in the provided file path. - - Parameters: - ----------- - - - path: The file path to check for processing eligibility. - - Returns: - -------- - - Returns True if the file should be processed; otherwise, False. - """ - _, ext = os.path.splitext(path) - return ext in ALLOWED_EXTENSIONS and not any( - pattern in path for pattern in IGNORED_PATTERNS - ) - - non_py_files_paths = [ - os.path.join(root, file) - for root, _, files in os.walk(repo_path) - for file in files - if not file.endswith(".py") and should_process(os.path.join(root, file)) - ] - return non_py_files_paths diff --git a/cognee/tasks/repo_processor/get_repo_file_dependencies.py b/cognee/tasks/repo_processor/get_repo_file_dependencies.py deleted file mode 100644 index 06cc3bddb..000000000 --- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py +++ /dev/null @@ -1,243 +0,0 @@ -import asyncio -import math -import os -from pathlib import Path -from typing import Set -from typing import AsyncGenerator, Optional, List -from uuid import NAMESPACE_OID, uuid5 - -from cognee.infrastructure.engine import DataPoint -from cognee.shared.CodeGraphEntities import CodeFile, Repository - -# constant, declared only once -EXCLUDED_DIRS: Set[str] = { - ".venv", - "venv", - "env", - ".env", - "site-packages", - "node_modules", - "dist", - "build", - ".git", - "tests", - "test", -} - - -async def get_source_code_files( - repo_path, - language_config: dict[str, list[str]] | None = None, - excluded_paths: Optional[List[str]] = None, -): - """ - Retrieve Python source code files from the specified repository path. - - This function scans the given repository path for files that have the .py extension - while excluding test files and files within a virtual environment. It returns a list of - absolute paths to the source code files that are not empty. - - Parameters: - ----------- - - repo_path: Root path of the repository to search - - language_config: dict mapping language names to file extensions, e.g., - {'python': ['.py'], 'javascript': ['.js', '.jsx'], ...} - - excluded_paths: Optional list of path fragments or glob patterns to exclude - - Returns: - -------- - A list of (absolute_path, language) tuples for source code files. - """ - - def _get_language_from_extension(file, language_config): - for lang, exts in language_config.items(): - for ext in exts: - if file.endswith(ext): - return lang - return None - - # Default config if not provided - if language_config is None: - language_config = { - "python": [".py"], - "javascript": [".js", ".jsx"], - "typescript": [".ts", ".tsx"], - "java": [".java"], - "csharp": [".cs"], - "go": [".go"], - "rust": [".rs"], - "cpp": [".cpp", ".c", ".h", ".hpp"], - } - - if not os.path.exists(repo_path): - return [] - - source_code_files = set() - for root, _, files in os.walk(repo_path): - for file in files: - lang = _get_language_from_extension(file, language_config) - if lang is None: - continue - # Exclude tests, common build/venv directories and files provided in exclude_paths - excluded_dirs = EXCLUDED_DIRS - excluded_paths = {Path(p).resolve() for p in (excluded_paths or [])} # full paths - - root_path = Path(root).resolve() - root_parts = set(root_path.parts) # same as before - base_name, _ext = os.path.splitext(file) - if ( - base_name.startswith("test_") - or base_name.endswith("_test") - or ".test." in file - or ".spec." in file - or (excluded_dirs & root_parts) # name match - or any( - root_path.is_relative_to(p) # full-path match - for p in excluded_paths - ) - ): - continue - file_path = os.path.abspath(os.path.join(root, file)) - if os.path.getsize(file_path) == 0: - continue - source_code_files.add((file_path, lang)) - - return sorted(list(source_code_files)) - - -def run_coroutine(coroutine_func, *args, **kwargs): - """ - Run a coroutine function until it completes. - - This function creates a new asyncio event loop, sets it as the current loop, and - executes the given coroutine function with the provided arguments. Once the coroutine - completes, the loop is closed. Intended for use in environments where an existing event - loop is not available or desirable. - - Parameters: - ----------- - - - coroutine_func: The coroutine function to be run. - - *args: Positional arguments to pass to the coroutine function. - - **kwargs: Keyword arguments to pass to the coroutine function. - - Returns: - -------- - - The result returned by the coroutine after completion. - """ - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - result = loop.run_until_complete(coroutine_func(*args, **kwargs)) - loop.close() - return result - - -async def get_repo_file_dependencies( - repo_path: str, - detailed_extraction: bool = False, - supported_languages: list = None, - excluded_paths: Optional[List[str]] = None, -) -> AsyncGenerator[DataPoint, None]: - """ - Generate a dependency graph for source files (multi-language) in the given repository path. - - Check the validity of the repository path and yield a repository object followed by the - dependencies of source files within that repository. Raise a FileNotFoundError if the - provided path does not exist. The extraction of detailed dependencies can be controlled - via the `detailed_extraction` argument. Languages considered can be restricted via - the `supported_languages` argument. - - Parameters: - ----------- - - - repo_path (str): The file path to the repository to process. - - detailed_extraction (bool): Whether to perform a detailed extraction of code parts. - - supported_languages (list | None): Subset of languages to include; if None, use defaults. - """ - - if isinstance(repo_path, list) and len(repo_path) == 1: - repo_path = repo_path[0] - - if not os.path.exists(repo_path): - raise FileNotFoundError(f"Repository path {repo_path} does not exist.") - - # Build language config from supported_languages - default_language_config = { - "python": [".py"], - "javascript": [".js", ".jsx"], - "typescript": [".ts", ".tsx"], - "java": [".java"], - "csharp": [".cs"], - "go": [".go"], - "rust": [".rs"], - "cpp": [".cpp", ".c", ".h", ".hpp"], - "c": [".c", ".h"], - } - if supported_languages is not None: - language_config = { - k: v for k, v in default_language_config.items() if k in supported_languages - } - else: - language_config = default_language_config - - source_code_files = await get_source_code_files( - repo_path, language_config=language_config, excluded_paths=excluded_paths - ) - - repo = Repository( - id=uuid5(NAMESPACE_OID, repo_path), - path=repo_path, - ) - - yield repo - - chunk_size = 100 - number_of_chunks = math.ceil(len(source_code_files) / chunk_size) - chunk_ranges = [ - ( - chunk_number * chunk_size, - min((chunk_number + 1) * chunk_size, len(source_code_files)) - 1, - ) - for chunk_number in range(number_of_chunks) - ] - - # Import dependency extractors for each language (Python for now, extend later) - from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies - import aiofiles - # TODO: Add other language extractors here - - for start_range, end_range in chunk_ranges: - tasks = [] - for file_path, lang in source_code_files[start_range : end_range + 1]: - # For now, only Python is supported; extend with other languages - if lang == "python": - tasks.append( - get_local_script_dependencies(repo_path, file_path, detailed_extraction) - ) - else: - # Placeholder: create a minimal CodeFile for other languages - async def make_codefile_stub(file_path=file_path, lang=lang): - async with aiofiles.open( - file_path, "r", encoding="utf-8", errors="replace" - ) as f: - source = await f.read() - return CodeFile( - id=uuid5(NAMESPACE_OID, file_path), - name=os.path.relpath(file_path, repo_path), - file_path=file_path, - language=lang, - source_code=source, - ) - - tasks.append(make_codefile_stub()) - - results: list[CodeFile] = await asyncio.gather(*tasks) - - for source_code_file in results: - source_code_file.part_of = repo - if getattr( - source_code_file, "language", None - ) is None and source_code_file.file_path.endswith(".py"): - source_code_file.language = "python" - yield source_code_file