feat: Extend CodeGraph pipeline for multi-language support (closes #1160) (#1233)

<!-- .github/pull_request_template.md -->

## Description

This pull request extends the CodeGraph pipeline to support
multi-language code analysis, addressing the requirements in #1160.

### What’s included:
- **Multi-language file discovery:** The pipeline now detects and
processes source files for Python, JavaScript, TypeScript, Java, C#, Go,
Rust, and C/C++ using a configurable extension mapping.
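- **Configurable language support:** `get_repo_file_dependencies` accepts
a `supported_languages` list and `get_source_code_files` accepts a
`language_config` mapping of language names to file extensions, making it
easy to add or remove language support. The pipeline currently passes
`supported_languages=None`, which defers to the task defaults.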
- **Language field in CodeFile:** The `CodeFile` entity now includes a
`language` field, allowing downstream tasks to distinguish between
different programming languages.
- **Stub support for non-Python languages:** All supported files are
represented as `CodeFile` objects with the correct language. Python
files retain full dependency extraction; other languages are ready for
future parser integration.

### What’s not included (future work):
- Language-specific dependency extraction for non-Python languages
(e.g., using tree-sitter for JS, Java, etc.).
- Cross-language dependency detection.

This PR lays the foundation for comprehensive, multi-language code graph
analysis and makes it easy to extend support for additional languages
and dependency systems in the future.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
Vasilije 2025-08-27 14:14:41 +02:00 committed by GitHub
commit fddd34421e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 108 additions and 48 deletions

View file

@ -40,8 +40,11 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
user = await get_default_user()
detailed_extraction = True
# Multi-language support: allow passing supported_languages
supported_languages = None # defer to task defaults
tasks = [
Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction),
Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction, supported_languages=supported_languages),
# Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete
Task(add_data_points, task_config={"batch_size": 30}),
]

View file

@ -36,6 +36,7 @@ class ClassDefinition(DataPoint):
class CodeFile(DataPoint):
name: str
file_path: str
language: Optional[str] = None # e.g., 'python', 'javascript', 'java', etc.
source_code: Optional[str] = None
part_of: Optional[Repository] = None
depends_on: Optional[List["ImportStatement"]] = []

View file

@ -180,6 +180,7 @@ async def get_local_script_dependencies(
name=file_path_relative_to_repo,
source_code=source_code,
file_path=script_path,
language="python",
)
return code_file_node
@ -188,6 +189,7 @@ async def get_local_script_dependencies(
name=file_path_relative_to_repo,
source_code=None,
file_path=script_path,
language="python",
)
async for part in extract_code_parts(source_code_tree.root_node, script_path=script_path):

View file

@ -10,50 +10,71 @@ from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodeFile, Repository
async def get_source_code_files(repo_path):
async def get_source_code_files(repo_path, language_config: dict[str, list[str]] | None = None):
"""
Retrieve Python source code files from the specified repository path.
This function scans the given repository path for files that have the .py extension
while excluding test files and files within a virtual environment. It returns a list of
absolute paths to the source code files that are not empty.
Retrieve source code files from the specified repository path for multiple languages.
Parameters:
-----------
- repo_path: The file path to the repository to search for Python source files.
- repo_path: The file path to the repository to search for source files.
- language_config: dict mapping language names to file extensions, e.g.,
{'python': ['.py'], 'javascript': ['.js', '.jsx'], ...}
Returns:
--------
A list of absolute paths to .py files that contain source code, excluding empty
files, test files, and files from a virtual environment.
A list of (absolute_path, language) tuples for source code files.
"""
if not os.path.exists(repo_path):
return {}
def _get_language_from_extension(file, language_config):
    """
    Map a file name to a language name based on its extension.

    Parameters:
    -----------

        - file: The file name (or path) to classify.
        - language_config: dict mapping language names to lists of file extensions,
          e.g., {'python': ['.py'], 'javascript': ['.js', '.jsx']}.

    Returns:
    --------

        The first language (in dict iteration order) that has an extension `file`
        ends with, or None when no configured extension matches.
    """
    for lang, exts in language_config.items():
        # str.endswith accepts a tuple of suffixes, so every extension for this
        # language is checked in one call; an empty tuple never matches.
        if file.endswith(tuple(exts)):
            return lang
    return None
py_files_paths = (
os.path.join(root, file)
for root, _, files in os.walk(repo_path)
for file in files
if (
file.endswith(".py")
and not file.startswith("test_")
and not file.endswith("_test")
and ".venv" not in file
)
)
# Default config if not provided
if language_config is None:
language_config = {
'python': ['.py'],
'javascript': ['.js', '.jsx'],
'typescript': ['.ts', '.tsx'],
'java': ['.java'],
'csharp': ['.cs'],
'go': ['.go'],
'rust': ['.rs'],
'cpp': ['.cpp', '.c', '.h', '.hpp'],
}
if not os.path.exists(repo_path):
return []
source_code_files = set()
for file_path in py_files_paths:
file_path = os.path.abspath(file_path)
for root, _, files in os.walk(repo_path):
for file in files:
lang = _get_language_from_extension(file, language_config)
if lang is None:
continue
# Exclude tests and common build/venv directories
excluded_dirs = {
".venv", "venv", "env", ".env", "site-packages",
"node_modules", "dist", "build", ".git",
"tests", "test",
}
root_parts = set(os.path.normpath(root).split(os.sep))
base_name, _ext = os.path.splitext(file)
if (
base_name.startswith("test_")
or base_name.endswith("_test") # catches Go's *_test.go and similar
or ".test." in file
or ".spec." in file
or (excluded_dirs & root_parts)
):
continue
file_path = os.path.abspath(os.path.join(root, file))
if os.path.getsize(file_path) == 0:
continue
source_code_files.add((file_path, lang))
if os.path.getsize(file_path) == 0:
continue
source_code_files.add(file_path)
return list(source_code_files)
return sorted(list(source_code_files))
def run_coroutine(coroutine_func, *args, **kwargs):
@ -85,22 +106,23 @@ def run_coroutine(coroutine_func, *args, **kwargs):
async def get_repo_file_dependencies(
repo_path: str, detailed_extraction: bool = False
repo_path: str, detailed_extraction: bool = False, supported_languages: list = None
) -> AsyncGenerator[DataPoint, None]:
"""
Generate a dependency graph for Python files in the given repository path.
Generate a dependency graph for source files (multi-language) in the given repository path.
Check the validity of the repository path and yield a repository object followed by the
dependencies of Python files within that repository. Raise a FileNotFoundError if the
dependencies of source files within that repository. Raise a FileNotFoundError if the
provided path does not exist. The extraction of detailed dependencies can be controlled
via the `detailed_extraction` argument.
via the `detailed_extraction` argument. Languages considered can be restricted via
the `supported_languages` argument.
Parameters:
-----------
- repo_path (str): The file path to the repository where Python files are located.
- detailed_extraction (bool): A flag indicating whether to perform a detailed
extraction of dependencies (default is False). (default False)
- repo_path (str): The file path to the repository to process.
- detailed_extraction (bool): Whether to perform a detailed extraction of code parts.
- supported_languages (list | None): Subset of languages to include; if None, use defaults.
"""
if isinstance(repo_path, list) and len(repo_path) == 1:
@ -109,7 +131,23 @@ async def get_repo_file_dependencies(
if not os.path.exists(repo_path):
raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
source_code_files = await get_source_code_files(repo_path)
# Build language config from supported_languages
default_language_config = {
'python': ['.py'],
'javascript': ['.js', '.jsx'],
'typescript': ['.ts', '.tsx'],
'java': ['.java'],
'csharp': ['.cs'],
'go': ['.go'],
'rust': ['.rs'],
'cpp': ['.cpp', '.c', '.h', '.hpp'],
}
if supported_languages is not None:
language_config = {k: v for k, v in default_language_config.items() if k in supported_languages}
else:
language_config = default_language_config
source_code_files = await get_source_code_files(repo_path, language_config=language_config)
repo = Repository(
id=uuid5(NAMESPACE_OID, repo_path),
@ -128,19 +166,35 @@ async def get_repo_file_dependencies(
for chunk_number in range(number_of_chunks)
]
# Codegraph dependencies are not installed by default, so we import where we use them.
# Import dependency extractors for each language (Python for now, extend later)
from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
import aiofiles
# TODO: Add other language extractors here
for start_range, end_range in chunk_ranges:
# with ProcessPoolExecutor(max_workers=12) as executor:
tasks = [
get_local_script_dependencies(repo_path, file_path, detailed_extraction)
for file_path in source_code_files[start_range : end_range + 1]
]
tasks = []
for file_path, lang in source_code_files[start_range : end_range + 1]:
# For now, only Python is supported; extend with other languages
if lang == 'python':
tasks.append(get_local_script_dependencies(repo_path, file_path, detailed_extraction))
else:
# Placeholder: create a minimal CodeFile for other languages
async def make_codefile_stub(file_path=file_path, lang=lang):
    """
    Build a minimal CodeFile for a file that has no language-specific extractor.

    The default arguments capture the current `file_path` and `lang` when the
    function is defined, so each coroutine keeps its own values. Returns a
    CodeFile carrying the file's text and its language tag.
    """
    # errors="replace" is passed so undecodable bytes should not raise
    # (assumes aiofiles.open mirrors the built-in open() here).
    async with aiofiles.open(
        file_path, "r", encoding="utf-8", errors="replace"
    ) as handle:
        contents = await handle.read()

    stub_fields = {
        "id": uuid5(NAMESPACE_OID, file_path),
        "name": os.path.relpath(file_path, repo_path),
        "file_path": file_path,
        "language": lang,
        "source_code": contents,
    }
    return CodeFile(**stub_fields)
tasks.append(make_codefile_stub())
results: list[CodeFile] = await asyncio.gather(*tasks)
for source_code_file in results:
source_code_file.part_of = repo
if (getattr(source_code_file, 'language', None) is None and source_code_file.file_path.endswith('.py')):
source_code_file.language = 'python'
yield source_code_file