feat: Extend CodeGraph pipeline for multi-language support (closes #1160) (#1233)

## Description

This pull request extends the CodeGraph pipeline to support multi-language code analysis, addressing the requirements in #1160.

### What’s included:

- **Multi-language file discovery:** The pipeline now detects and processes source files for Python, JavaScript, TypeScript, Java, C#, Go, Rust, and C/C++ using a configurable extension mapping.
- **Configurable language support:** `get_repo_file_dependencies` accepts a `supported_languages` parameter and `get_source_code_files` accepts a `language_config` extension mapping, making it easy to add or remove language support. `run_code_graph_pipeline` currently passes `supported_languages=None`, which defers to the task defaults.
- **Language field in CodeFile:** The `CodeFile` entity now includes a `language` field, allowing downstream tasks to distinguish between different programming languages.
- **Stub support for non-Python languages:** All supported files are represented as `CodeFile` objects with the correct language. Python files retain full dependency extraction; other languages are ready for future parser integration.

### What’s not included (future work):

- Language-specific dependency extraction for non-Python languages (e.g., using tree-sitter for JS, Java, etc.).
- Cross-language dependency detection.

This PR lays the foundation for comprehensive, multi-language code graph analysis and makes it easy to extend support for additional languages and dependency systems in the future.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
2025-08-27 14:14:41 +02:00 · 2025-08-27 14:14:41 +02:00 · fddd34421e
commit fddd34421e
parent bf482ef91f 4f2fd4652c
4 changed files with 108 additions and 48 deletions
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -40,8 +40,11 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
    user = await get_default_user()
    detailed_extraction = True

+
+    # Multi-language support: allow passing supported_languages
+    supported_languages = None # defer to task defaults
    tasks = [
-        Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction),
+        Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction, supported_languages=supported_languages),
        # Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete
        Task(add_data_points, task_config={"batch_size": 30}),
    ]
--- a/cognee/shared/CodeGraphEntities.py
+++ b/cognee/shared/CodeGraphEntities.py
@@ -36,6 +36,7 @@ class ClassDefinition(DataPoint):
 class CodeFile(DataPoint):
    name: str
    file_path: str
+    language: Optional[str] = None  # e.g., 'python', 'javascript', 'java', etc.
    source_code: Optional[str] = None
    part_of: Optional[Repository] = None
    depends_on: Optional[List["ImportStatement"]] = []
--- a/cognee/tasks/repo_processor/get_local_dependencies.py
+++ b/cognee/tasks/repo_processor/get_local_dependencies.py
@@ -180,6 +180,7 @@ async def get_local_script_dependencies(
            name=file_path_relative_to_repo,
            source_code=source_code,
            file_path=script_path,
+            language="python",
        )
        return code_file_node

@@ -188,6 +189,7 @@ async def get_local_script_dependencies(
        name=file_path_relative_to_repo,
        source_code=None,
        file_path=script_path,
+        language="python",
    )

    async for part in extract_code_parts(source_code_tree.root_node, script_path=script_path):
--- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py
+++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
@@ -10,50 +10,71 @@ from cognee.infrastructure.engine import DataPoint
 from cognee.shared.CodeGraphEntities import CodeFile, Repository


-async def get_source_code_files(repo_path):
+async def get_source_code_files(repo_path, language_config: dict[str, list[str]] | None = None):
    """
-    Retrieve Python source code files from the specified repository path.
-
-    This function scans the given repository path for files that have the .py extension
-    while excluding test files and files within a virtual environment. It returns a list of
-    absolute paths to the source code files that are not empty.
+    Retrieve source code files from the specified repository path for multiple languages.

    Parameters:
    -----------
-
-        - repo_path: The file path to the repository to search for Python source files.
+        - repo_path: The file path to the repository to search for source files.
+        - language_config: dict mapping language names to file extensions, e.g.,
+            {'python': ['.py'], 'javascript': ['.js', '.jsx'], ...}

    Returns:
    --------
-
-        A list of absolute paths to .py files that contain source code, excluding empty
-        files, test files, and files from a virtual environment.
+        A list of (absolute_path, language) tuples for source code files.
    """
-    if not os.path.exists(repo_path):
-        return {}
+    def _get_language_from_extension(file, language_config):
+        for lang, exts in language_config.items():
+            for ext in exts:
+                if file.endswith(ext):
+                    return lang
+        return None

-    py_files_paths = (
-        os.path.join(root, file)
-        for root, _, files in os.walk(repo_path)
-        for file in files
-        if (
-            file.endswith(".py")
-            and not file.startswith("test_")
-            and not file.endswith("_test")
-            and ".venv" not in file
-        )
-    )
+    # Default config if not provided
+    if language_config is None:
+        language_config = {
+            'python': ['.py'],
+            'javascript': ['.js', '.jsx'],
+            'typescript': ['.ts', '.tsx'],
+            'java': ['.java'],
+            'csharp': ['.cs'],
+            'go': ['.go'],
+            'rust': ['.rs'],
+            'cpp': ['.cpp', '.c', '.h', '.hpp'],
+        }
+
+    if not os.path.exists(repo_path):
+        return []

    source_code_files = set()
-    for file_path in py_files_paths:
-        file_path = os.path.abspath(file_path)
+    for root, _, files in os.walk(repo_path):
+        for file in files:
+            lang = _get_language_from_extension(file, language_config)
+            if lang is None:
+                continue
+            # Exclude tests and common build/venv directories
+            excluded_dirs = {
+                ".venv", "venv", "env", ".env", "site-packages",
+                "node_modules", "dist", "build", ".git",
+                "tests", "test",
+            }
+            root_parts = set(os.path.normpath(root).split(os.sep))
+            base_name, _ext = os.path.splitext(file)
+            if (
+                base_name.startswith("test_")
+                or base_name.endswith("_test")  # catches Go's *_test.go and similar
+                or ".test." in file
+                or ".spec." in file
+                or (excluded_dirs & root_parts)
+            ):
+                continue
+            file_path = os.path.abspath(os.path.join(root, file))
+            if os.path.getsize(file_path) == 0:
+                continue
+            source_code_files.add((file_path, lang))

-        if os.path.getsize(file_path) == 0:
-            continue
-
-        source_code_files.add(file_path)
-
-    return list(source_code_files)
+    return sorted(list(source_code_files))


 def run_coroutine(coroutine_func, *args, **kwargs):
@@ -85,22 +106,23 @@ def run_coroutine(coroutine_func, *args, **kwargs):


 async def get_repo_file_dependencies(
-    repo_path: str, detailed_extraction: bool = False
+    repo_path: str, detailed_extraction: bool = False, supported_languages: list = None
 ) -> AsyncGenerator[DataPoint, None]:
    """
-    Generate a dependency graph for Python files in the given repository path.
+    Generate a dependency graph for source files (multi-language) in the given repository path.

    Check the validity of the repository path and yield a repository object followed by the
-    dependencies of Python files within that repository. Raise a FileNotFoundError if the
+    dependencies of source files within that repository. Raise a FileNotFoundError if the
    provided path does not exist. The extraction of detailed dependencies can be controlled
-    via the `detailed_extraction` argument.
+    via the `detailed_extraction` argument. Languages considered can be restricted via
+    the `supported_languages` argument.

    Parameters:
    -----------

-        - repo_path (str): The file path to the repository where Python files are located.
-        - detailed_extraction (bool): A flag indicating whether to perform a detailed
-          extraction of dependencies (default is False). (default False)
+        - repo_path (str): The file path to the repository to process.
+        - detailed_extraction (bool): Whether to perform a detailed extraction of code parts.
+        - supported_languages (list | None): Subset of languages to include; if None, use defaults.
    """

    if isinstance(repo_path, list) and len(repo_path) == 1:
@@ -109,7 +131,23 @@ async def get_repo_file_dependencies(
    if not os.path.exists(repo_path):
        raise FileNotFoundError(f"Repository path {repo_path} does not exist.")

-    source_code_files = await get_source_code_files(repo_path)
+    # Build language config from supported_languages
+    default_language_config = {
+        'python': ['.py'],
+        'javascript': ['.js', '.jsx'],
+        'typescript': ['.ts', '.tsx'],
+        'java': ['.java'],
+        'csharp': ['.cs'],
+        'go': ['.go'],
+        'rust': ['.rs'],
+        'cpp': ['.cpp', '.c', '.h', '.hpp'],
+    }
+    if supported_languages is not None:
+        language_config = {k: v for k, v in default_language_config.items() if k in supported_languages}
+    else:
+        language_config = default_language_config
+
+    source_code_files = await get_source_code_files(repo_path, language_config=language_config)

    repo = Repository(
        id=uuid5(NAMESPACE_OID, repo_path),
@@ -128,19 +166,35 @@ async def get_repo_file_dependencies(
        for chunk_number in range(number_of_chunks)
    ]

-    # Codegraph dependencies are not installed by default, so we import where we use them.
+    # Import dependency extractors for each language (Python for now, extend later)
    from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
+    import aiofiles
+    # TODO: Add other language extractors here

    for start_range, end_range in chunk_ranges:
-        # with ProcessPoolExecutor(max_workers=12) as executor:
-        tasks = [
-            get_local_script_dependencies(repo_path, file_path, detailed_extraction)
-            for file_path in source_code_files[start_range : end_range + 1]
-        ]
+        tasks = []
+        for file_path, lang in source_code_files[start_range : end_range + 1]:
+            # For now, only Python is supported; extend with other languages
+            if lang == 'python':
+                tasks.append(get_local_script_dependencies(repo_path, file_path, detailed_extraction))
+            else:
+                # Placeholder: create a minimal CodeFile for other languages
+                async def make_codefile_stub(file_path=file_path, lang=lang):
+                    async with aiofiles.open(file_path, "r", encoding="utf-8", errors="replace") as f:
+                        source = await f.read()
+                    return CodeFile(
+                        id=uuid5(NAMESPACE_OID, file_path),
+                        name=os.path.relpath(file_path, repo_path),
+                        file_path=file_path,
+                        language=lang,
+                        source_code=source,
+                    )
+                tasks.append(make_codefile_stub())

        results: list[CodeFile] = await asyncio.gather(*tasks)

        for source_code_file in results:
            source_code_file.part_of = repo
-
+            if (getattr(source_code_file, 'language', None) is None and source_code_file.file_path.endswith('.py')):
+                source_code_file.language = 'python'
            yield source_code_file