fix: Resolve code graph pipeline issue

2025-07-10 22:20:30 +02:00 · 2025-07-10 22:20:30 +02:00 · 67b61ff964
commit 67b61ff964
parent 80896fdcc5
3 changed files with 55 additions and 49 deletions
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@ -79,7 +79,9 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
        async for run_status in non_code_pipeline_run:
            yield run_status
-    async for run_status in run_tasks(tasks, dataset.id, repo_path, user, "cognify_code_pipeline"):
+    async for run_status in run_tasks(
        tasks, dataset.id, repo_path, user, "cognify_code_pipeline", incremental_loading=False
    ):
        yield run_status
--- a/cognee/modules/pipelines/operations/run_tasks.py
+++ b/cognee/modules/pipelines/operations/run_tasks.py
@ -95,29 +95,29 @@ async def run_tasks(
        # TODO: Convert to async gather task instead of for loop (just make sure it can work; there were some issues when async gathering datasets)
        for data_item in data:
-            # If data is being added to Cognee for the first time calculate the id of the data
+            # If incremental_loading of data is set to True don't process documents already processed by pipeline
            if not isinstance(data_item, Data):
                data = await resolve_data_directories(data)
                file_path = await save_data_item_to_storage(data_item, dataset.name)
                # Ingest data and add metadata
                with open_data_file(file_path, s3fs=fs) as file:
                    classified_data = ingestion.classify(file, s3fs=fs)
                    # data_id is the hash of file contents + owner id to avoid duplicate data
                    data_id = ingestion.identify(classified_data, user)
            else:
                # If data was already processed by Cognee get data id
                data_id = data_item.id
            # If incremental_loading is set to True don't process documents already processed by pipeline
            if incremental_loading:
-                # Check pipeline status, if Data already processed for pipeline before skip current processing
+                # If data is being added to Cognee for the first time calculate the id of the data
-                async with db_engine.get_async_session() as session:
+                if not isinstance(data_item, Data):
-                    data_point = (
+                    data = await resolve_data_directories(data)
-                        await session.execute(select(Data).filter(Data.id == data_id))
+                    file_path = await save_data_item_to_storage(data_item, dataset.name)
-                    ).scalar_one_or_none()
+                    # Ingest data and add metadata
-                    if data_point:
+                    with open_data_file(file_path, s3fs=fs) as file:
-                        if data_point.pipeline_status.get(pipeline_name) == "Completed":
+                        classified_data = ingestion.classify(file, s3fs=fs)
-                            break
+                        # data_id is the hash of file contents + owner id to avoid duplicate data
                        data_id = ingestion.identify(classified_data, user)
                else:
                    # If data was already processed by Cognee get data id
                    data_id = data_item.id
                    # Check pipeline status, if Data already processed for pipeline before skip current processing
                    async with db_engine.get_async_session() as session:
                        data_point = (
                            await session.execute(select(Data).filter(Data.id == data_id))
                        ).scalar_one_or_none()
                        if data_point:
                            if data_point.pipeline_status.get(pipeline_name) == "Completed":
                                break
            try:
                async for result in run_tasks_with_telemetry(
@ -134,23 +134,24 @@ async def run_tasks(
                        payload=result,
                    )
-                data_items_pipeline_run_info[data_id] = {
+                if incremental_loading:
-                    "run_info": PipelineRunCompleted(
+                    data_items_pipeline_run_info[data_id] = {
-                        pipeline_run_id=pipeline_run_id,
+                        "run_info": PipelineRunCompleted(
-                        dataset_id=dataset.id,
+                            pipeline_run_id=pipeline_run_id,
-                        dataset_name=dataset.name,
+                            dataset_id=dataset.id,
-                    ),
+                            dataset_name=dataset.name,
-                    "data_id": data_id,
+                        ),
-                }
+                        "data_id": data_id,
                    }
-                # Update pipeline status for Data element
+                    # Update pipeline status for Data element
-                async with db_engine.get_async_session() as session:
+                    async with db_engine.get_async_session() as session:
-                    data_point = (
+                        data_point = (
-                        await session.execute(select(Data).filter(Data.id == data_id))
+                            await session.execute(select(Data).filter(Data.id == data_id))
-                    ).scalar_one_or_none()
+                        ).scalar_one_or_none()
-                    data_point.pipeline_status[pipeline_name] = "Completed"
+                        data_point.pipeline_status[pipeline_name] = "Completed"
-                    await session.merge(data_point)
+                        await session.merge(data_point)
-                    await session.commit()
+                        await session.commit()
            except Exception as error:
                # Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
@ -158,16 +159,16 @@ async def run_tasks(
                logger.error(
                    f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
                )
-
+                if incremental_loading:
-                data_items_pipeline_run_info = {
+                    data_items_pipeline_run_info = {
-                    "run_info": PipelineRunErrored(
+                        "run_info": PipelineRunErrored(
-                        pipeline_run_id=pipeline_run_id,
+                            pipeline_run_id=pipeline_run_id,
-                        payload=error,
+                            payload=error,
-                        dataset_id=dataset.id,
+                            dataset_id=dataset.id,
-                        dataset_name=dataset.name,
+                            dataset_name=dataset.name,
-                    ),
+                        ),
-                    "data_id": data_id,
+                        "data_id": data_id,
-                }
+                    }
        # re-raise error found during data ingestion
        if ingestion_error:
--- a/cognee/tasks/repo_processor/get_repo_file_dependencies.py
+++ b/cognee/tasks/repo_processor/get_repo_file_dependencies.py
@ -103,6 +103,9 @@ async def get_repo_file_dependencies(
          extraction of dependencies (default is False).
    """
    if isinstance(repo_path, list) and len(repo_path) == 1:
        repo_path = repo_path[0]
    if not os.path.exists(repo_path):
        raise FileNotFoundError(f"Repository path {repo_path} does not exist.")