refactor: Remove codify and code_graph pipeline from main repo

This commit is contained in:
Andrej Milicevic 2025-11-05 12:56:17 +01:00
parent 8d7c4d5384
commit c481b87d58
9 changed files with 14 additions and 413 deletions

View file

@ -193,32 +193,3 @@ jobs:
- name: Run Simple Examples
run: uv run python ./examples/python/simple_example.py
# Job: runs the basic code-graph example end-to-end against this repo's
# cognee/tasks/graph directory. LLM/embedding credentials are injected
# from repository secrets.
graph-tests:
  name: Run Basic Graph Tests
  runs-on: ubuntu-22.04
  env:
    # LLM backend configuration (provider fixed to openai; the rest from secrets).
    LLM_PROVIDER: openai
    LLM_MODEL: ${{ secrets.LLM_MODEL }}
    LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
    LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
    LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
    # Embedding backend configuration, same pattern as the LLM settings.
    EMBEDDING_PROVIDER: openai
    EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
    EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
    EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
    EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
  steps:
    - name: Check out repository
      uses: actions/checkout@v4
      with:
        # Full history; presumably needed by the setup/pipeline — TODO confirm.
        fetch-depth: 0
    - name: Cognee Setup
      uses: ./.github/actions/cognee_setup
      with:
        python-version: ${{ inputs.python-version }}
    - name: Run Graph Tests
      run: uv run python ./examples/python/code_graph_example.py --repo_path ./cognee/tasks/graph

View file

@ -407,75 +407,6 @@ async def save_interaction(data: str) -> list:
]
@mcp.tool()
async def codify(repo_path: str) -> list:
    """
    Analyze and generate a code-specific knowledge graph from a software repository.

    Launches a background task that processes the provided repository and builds
    a code knowledge graph. Returns immediately while processing continues in
    the background, due to MCP timeout constraints.

    Parameters
    ----------
    repo_path : str
        Path to the code repository to analyze. This can be a local file path or a
        relative path to a repository; it should point to the root of the
        repository or a specific directory within it.

    Returns
    -------
    list
        A list containing a single TextContent object with information about the
        background task launch and how to check its status.

    Notes
    -----
    - The function launches a background task and returns immediately.
    - The code graph generation may take significant time for larger repositories.
    - Use the codify_status tool to check the progress of the operation.
    - Process results are logged to the standard Cognee log file.
    - All stdout is redirected to stderr to maintain MCP communication integrity.
    - Not available in API mode (direct mode only).
    """
    # Guard clause: the code-graph pipeline only runs in direct mode.
    if cognee_client.use_api:
        error_msg = "❌ Codify operation is not available in API mode. Please use direct mode for code graph pipeline."
        logger.error(error_msg)
        return [types.TextContent(type="text", text=error_msg)]

    async def codify_task(repo_path: str):
        # NOTE: MCP uses stdout to communicate; redirect everything that would
        # go to stdout (like print) to stderr so the protocol stream stays clean.
        with redirect_stdout(sys.stderr):
            logger.info("Codify process starting.")

            # Deferred import: the code-graph pipeline is an optional dependency.
            from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

            results = []
            async for result in run_code_graph_pipeline(repo_path, False):
                results.append(result)
                logger.info(result)

            # NOTE: all([]) is True, so an empty pipeline run reads as success —
            # presumably intended as best-effort reporting; verify if stricter
            # semantics are needed.
            if all(results):
                # Fixed typo in log message ("succesfully" -> "successfully").
                logger.info("Codify process finished successfully.")
            else:
                logger.info("Codify process failed.")

    # Fire-and-forget: the MCP call must return before the pipeline completes.
    asyncio.create_task(codify_task(repo_path))

    log_file = get_log_file_location()
    text = (
        f"Background process launched due to MCP timeout limitations.\n"
        f"To check current codify status use the codify_status tool\n"
        f"or you can check the log file at: {log_file}"
    )

    return [
        types.TextContent(
            type="text",
            text=text,
        )
    ]
@mcp.tool()
async def search(search_query: str, search_type: str) -> list:
"""
@ -954,48 +885,6 @@ async def cognify_status():
return [types.TextContent(type="text", text=error_msg)]
@mcp.tool()
async def codify_status():
    """
    Get the current status of the codify pipeline.

    Retrieves information about current and recently completed codify operations
    on the "codebase" dataset: progress, success/failure status, and statistics
    about the processed code repositories.

    Returns
    -------
    list
        A list containing a single TextContent object with the status information
        as a string, covering active and completed jobs of the
        "cognify_code_pipeline".

    Notes
    -----
    - Queries pipeline status specifically for "cognify_code_pipeline" on the
      "codebase" dataset.
    - The status is returned in string form for easy reading.
    - This operation is not available in API mode.
    """
    # MCP communicates over stdout, so stray prints must go to stderr.
    with redirect_stdout(sys.stderr):
        try:
            from cognee.modules.data.methods.get_unique_dataset_id import get_unique_dataset_id
            from cognee.modules.users.methods import get_default_user

            default_user = await get_default_user()
            dataset_id = await get_unique_dataset_id("codebase", default_user)
            pipeline_status = await cognee_client.get_pipeline_status(
                [dataset_id], "cognify_code_pipeline"
            )
        except NotImplementedError:
            error_msg = "❌ Pipeline status is not available in API mode"
            logger.error(error_msg)
            return [types.TextContent(type="text", text=error_msg)]
        except Exception as e:
            error_msg = f"❌ Failed to get codify status: {str(e)}"
            logger.error(error_msg)
            return [types.TextContent(type="text", text=error_msg)]

        return [types.TextContent(type="text", text=str(pipeline_status))]
def node_to_string(node):
node_data = ", ".join(
[f'{key}: "{value}"' for key, value in node.items() if key in ["id", "name"]]

View file

@ -21,7 +21,7 @@ from cognee.api.v1.notebooks.routers import get_notebooks_router
from cognee.api.v1.permissions.routers import get_permissions_router
from cognee.api.v1.settings.routers import get_settings_router
from cognee.api.v1.datasets.routers import get_datasets_router
from cognee.api.v1.cognify.routers import get_code_pipeline_router, get_cognify_router
from cognee.api.v1.cognify.routers import get_cognify_router
from cognee.api.v1.search.routers import get_search_router
from cognee.api.v1.memify.routers import get_memify_router
from cognee.api.v1.add.routers import get_add_router
@ -270,10 +270,6 @@ app.include_router(get_responses_router(), prefix="/api/v1/responses", tags=["re
app.include_router(get_sync_router(), prefix="/api/v1/sync", tags=["sync"])
codegraph_routes = get_code_pipeline_router()
if codegraph_routes:
app.include_router(codegraph_routes, prefix="/api/v1/code-pipeline", tags=["code-pipeline"])
app.include_router(
get_users_router(),
prefix="/api/v1/users",

View file

@ -1,119 +0,0 @@
import os
import pathlib
import asyncio
from typing import Optional
from cognee.shared.logging_utils import get_logger, setup_logging
from cognee.modules.observability.get_observe import get_observe
from cognee.api.v1.search import SearchType, search
from cognee.api.v1.visualize.visualize import visualize_graph
from cognee.modules.cognify.config import get_cognify_config
from cognee.modules.pipelines import run_tasks
from cognee.modules.pipelines.tasks.task import Task
from cognee.modules.users.methods import get_default_user
from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.data.methods import create_dataset
from cognee.tasks.documents import classify_documents, extract_chunks_from_documents
from cognee.tasks.graph import extract_graph_from_data
from cognee.tasks.ingestion import ingest_data
from cognee.tasks.repo_processor import get_non_py_files, get_repo_file_dependencies
from cognee.tasks.storage import add_data_points
from cognee.tasks.summarization import summarize_text
from cognee.infrastructure.llm import get_max_chunk_tokens
from cognee.infrastructure.databases.relational import get_relational_engine
observe = get_observe()
logger = get_logger("code_graph_pipeline")
@observe
async def run_code_graph_pipeline(
    repo_path,
    include_docs=False,
    excluded_paths: Optional[list[str]] = None,
    supported_languages: Optional[list[str]] = None,
):
    """
    Build a code knowledge graph for the repository at *repo_path*.

    Prunes all existing data and system metadata, then runs the code-dependency
    extraction tasks (and, when ``include_docs`` is True, an additional
    document-processing task chain) against a "codebase" dataset, yielding
    pipeline run-status objects as they are produced.

    Parameters
    ----------
    repo_path :
        Path to the repository to process.
    include_docs : bool
        When True, also ingest and graph non-code (documentation) files.
    excluded_paths : Optional[list[str]]
        Paths to skip during dependency extraction.
    supported_languages : Optional[list[str]]
        Languages to include during dependency extraction.

    Yields
    ------
    Pipeline run-status objects from ``run_tasks``.
    """
    import cognee
    from cognee.low_level import setup

    # WARNING: destructive — wipes all previously stored data and metadata
    # before rebuilding the graph from scratch.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    await setup()

    cognee_config = get_cognify_config()
    user = await get_default_user()

    detailed_extraction = True

    # Core code-graph task chain.
    tasks = [
        Task(
            get_repo_file_dependencies,
            detailed_extraction=detailed_extraction,
            supported_languages=supported_languages,
            excluded_paths=excluded_paths,
        ),
        # Task(summarize_code, task_config={"batch_size": 500}), # This task takes a long time to complete
        Task(add_data_points, task_config={"batch_size": 30}),
    ]

    if include_docs:
        # These tasks take a long time to complete
        non_code_tasks = [
            Task(get_non_py_files, task_config={"batch_size": 50}),
            Task(ingest_data, dataset_name="repo_docs", user=user),
            Task(classify_documents),
            Task(extract_chunks_from_documents, max_chunk_size=get_max_chunk_tokens()),
            Task(
                extract_graph_from_data,
                graph_model=KnowledgeGraph,
                task_config={"batch_size": 50},
            ),
            Task(
                summarize_text,
                summarization_model=cognee_config.summarization_model,
                task_config={"batch_size": 50},
            ),
        ]

    dataset_name = "codebase"

    # Save dataset to database
    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        dataset = await create_dataset(dataset_name, user, session)

    # Document pipeline runs first so the code pipeline can link against it.
    if include_docs:
        non_code_pipeline_run = run_tasks(
            non_code_tasks, dataset.id, repo_path, user, "cognify_pipeline"
        )
        async for run_status in non_code_pipeline_run:
            yield run_status

    async for run_status in run_tasks(
        tasks, dataset.id, repo_path, user, "cognify_code_pipeline", incremental_loading=False
    ):
        yield run_status
if __name__ == "__main__":

    async def main():
        # Demo driver: run the pipeline against a placeholder repository path,
        # printing each run status as it is yielded.
        async for run_status in run_code_graph_pipeline("REPO_PATH"):
            print(f"{run_status.pipeline_run_id}: {run_status.status}")

        # Render the resulting graph as an HTML artifact next to this file.
        file_path = os.path.join(
            pathlib.Path(__file__).parent, ".artifacts", "graph_visualization.html"
        )
        await visualize_graph(file_path)

        # Sanity-check retrieval over the freshly built code graph.
        search_results = await search(
            query_type=SearchType.CODE,
            query_text="How is Relationship weight calculated?",
        )
        for file in search_results:
            print(file["name"])

    logger = setup_logging(name="code_graph_pipeline")
    asyncio.run(main())

View file

@ -1,2 +1 @@
from .get_cognify_router import get_cognify_router
from .get_code_pipeline_router import get_code_pipeline_router

View file

@ -1,90 +0,0 @@
import json
from cognee.shared.logging_utils import get_logger
from fastapi import APIRouter
from fastapi.responses import JSONResponse
from cognee.api.DTO import InDTO
from cognee.modules.retrieval.code_retriever import CodeRetriever
from cognee.modules.storage.utils import JSONEncoder
logger = get_logger()
class CodePipelineIndexPayloadDTO(InDTO):
    """Request body for the code-pipeline /index endpoint."""

    repo_path: str  # path to the repository to index
    include_docs: bool = False  # also process non-code (documentation) files
class CodePipelineRetrievePayloadDTO(InDTO):
    """Request body for the code-pipeline /retrieve endpoint."""

    query: str  # search query for code context
    full_input: str  # full input text; a leading "cognee " prefix is stripped
def get_code_pipeline_router() -> APIRouter:
    """
    Build the API router for the code-graph pipeline.

    Returns
    -------
    APIRouter or None
        The router exposing /index and /retrieve, or None when the optional
        codegraph dependencies are not installed (routes are skipped).
    """
    try:
        # Probe for the optional code-graph dependency set.
        import cognee.api.v1.cognify.code_graph_pipeline
    except ModuleNotFoundError:
        logger.error("codegraph dependencies not found. Skipping codegraph API routes.")
        return None

    router = APIRouter()

    @router.post("/index", response_model=None)
    async def code_pipeline_index(payload: CodePipelineIndexPayloadDTO):
        """
        Run indexation on a code repository.

        This endpoint processes a code repository to create a knowledge graph
        of the codebase structure, dependencies, and relationships.

        ## Request Parameters
        - **repo_path** (str): Path to the code repository
        - **include_docs** (bool): Whether to include documentation files (default: false)

        ## Response
        No content returned. Processing results are logged.

        ## Error Codes
        - **409 Conflict**: Error during indexation process
        """
        from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

        try:
            # Drain the pipeline generator; results are only logged, not returned.
            async for result in run_code_graph_pipeline(payload.repo_path, payload.include_docs):
                logger.info(result)
        except Exception as error:
            return JSONResponse(status_code=409, content={"error": str(error)})

    @router.post("/retrieve", response_model=list[dict])
    async def code_pipeline_retrieve(payload: CodePipelineRetrievePayloadDTO):
        """
        Retrieve context from the code knowledge graph.

        This endpoint searches the indexed code repository to find relevant
        context based on the provided query.

        ## Request Parameters
        - **query** (str): Search query for code context
        - **full_input** (str): Full input text for processing

        ## Response
        Returns a list of relevant code files and context as JSON.

        ## Error Codes
        - **409 Conflict**: Error during retrieval process
        """
        try:
            # BUG FIX: the original used `full_input.replace("cognee ", "")`
            # behind a startswith() guard, which stripped EVERY occurrence of
            # the substring; removeprefix() strips only the leading one.
            query = payload.full_input.removeprefix("cognee ")

            retriever = CodeRetriever()
            retrieved_files = await retriever.get_context(query)

            # BUG FIX: the original returned json.dumps(...) — a str — while
            # declaring response_model=list[dict], so FastAPI would fail
            # validation / double-encode. Round-trip through JSONEncoder to
            # keep custom-type serialization while returning real JSON data.
            return json.loads(json.dumps(retrieved_files, cls=JSONEncoder))
        except Exception as error:
            return JSONResponse(status_code=409, content={"error": str(error)})

    return router

View file

@ -2,3 +2,4 @@ from .tasks.task import Task
from .operations.run_tasks import run_tasks
from .operations.run_parallel import run_tasks_parallel
from .operations.pipeline import run_pipeline
from .custom_pipeline_interface import CustomPipelineInterface

View file

@ -0,0 +1,12 @@
from typing import Protocol, Any
from abc import abstractmethod
class CustomPipelineInterface(Protocol):
    """
    Structural (duck-typed) interface for objects that can execute a custom
    pipeline. Implementers must provide an async ``run_pipeline`` method.
    """

    @abstractmethod
    async def run_pipeline(self) -> Any:
        """Execute the pipeline and return its result; must be overridden."""
        raise NotImplementedError

View file

@ -1,58 +0,0 @@
import argparse
import asyncio
import cognee
from cognee import SearchType
from cognee.shared.logging_utils import setup_logging, ERROR
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
async def main(repo_path, include_docs):
    """
    Run the code-graph pipeline over *repo_path*, sanity-check CODE search,
    and return the final pipeline run status (False if nothing was yielded).
    """
    run_status = False
    # Consume the pipeline generator, keeping only the most recent status.
    async for status in run_code_graph_pipeline(repo_path, include_docs=include_docs):
        run_status = status

    # Test CODE search
    search_results = await cognee.search(query_type=SearchType.CODE, query_text="test")
    assert len(search_results) != 0, "The search results list is empty."

    print("\n\nSearch results are:\n")
    for result in search_results:
        print(f"{result}\n")

    return run_status
def parse_args():
    """Parse command-line options for the code-graph example script."""

    def _flag(raw: str) -> bool:
        # Treat "true"/"1" (any case) as truthy; every other value is False.
        return raw.lower() in ("true", "1")

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository")
    arg_parser.add_argument(
        "--include_docs",
        type=_flag,
        default=False,
        help="Whether or not to process non-code files",
    )
    arg_parser.add_argument(
        "--time",
        type=_flag,
        default=True,
        help="Whether or not to time the pipeline run",
    )
    return arg_parser.parse_args()
if __name__ == "__main__":
    logger = setup_logging(log_level=ERROR)
    args = parse_args()

    if not args.time:
        # Untimed run: just execute the pipeline.
        asyncio.run(main(args.repo_path, args.include_docs))
    else:
        import time

        start_time = time.time()
        asyncio.run(main(args.repo_path, args.include_docs))
        end_time = time.time()

        # Report wall-clock duration of the whole run.
        print("\n" + "=" * 50)
        print(f"Pipeline Execution Time: {end_time - start_time:.2f} seconds")
        print("=" * 50 + "\n")