fix: Resolve code graph pipeline issue
This commit is contained in:
parent
80896fdcc5
commit
67b61ff964
3 changed files with 55 additions and 49 deletions
|
|
@ -79,7 +79,9 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):
|
||||||
async for run_status in non_code_pipeline_run:
|
async for run_status in non_code_pipeline_run:
|
||||||
yield run_status
|
yield run_status
|
||||||
|
|
||||||
async for run_status in run_tasks(tasks, dataset.id, repo_path, user, "cognify_code_pipeline"):
|
async for run_status in run_tasks(
|
||||||
|
tasks, dataset.id, repo_path, user, "cognify_code_pipeline", incremental_loading=False
|
||||||
|
):
|
||||||
yield run_status
|
yield run_status
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -95,29 +95,29 @@ async def run_tasks(
|
||||||
|
|
||||||
# TODO: Convert to async gather task instead of for loop (just make sure it can work there were some issues when async gathering datasets)
|
# TODO: Convert to async gather task instead of for loop (just make sure it can work there were some issues when async gathering datasets)
|
||||||
for data_item in data:
|
for data_item in data:
|
||||||
# If data is being added to Cognee for the first time calculate the id of the data
|
# If incremental_loading of data is set to True don't process documents already processed by pipeline
|
||||||
if not isinstance(data_item, Data):
|
|
||||||
data = await resolve_data_directories(data)
|
|
||||||
file_path = await save_data_item_to_storage(data_item, dataset.name)
|
|
||||||
# Ingest data and add metadata
|
|
||||||
with open_data_file(file_path, s3fs=fs) as file:
|
|
||||||
classified_data = ingestion.classify(file, s3fs=fs)
|
|
||||||
# data_id is the hash of file contents + owner id to avoid duplicate data
|
|
||||||
data_id = ingestion.identify(classified_data, user)
|
|
||||||
else:
|
|
||||||
# If data was already processed by Cognee get data id
|
|
||||||
data_id = data_item.id
|
|
||||||
|
|
||||||
# If incremental_loading is set to True don't process documents already processed by pipeline
|
|
||||||
if incremental_loading:
|
if incremental_loading:
|
||||||
# Check pipeline status, if Data already processed for pipeline before skip current processing
|
# If data is being added to Cognee for the first time calculate the id of the data
|
||||||
async with db_engine.get_async_session() as session:
|
if not isinstance(data_item, Data):
|
||||||
data_point = (
|
data = await resolve_data_directories(data)
|
||||||
await session.execute(select(Data).filter(Data.id == data_id))
|
file_path = await save_data_item_to_storage(data_item, dataset.name)
|
||||||
).scalar_one_or_none()
|
# Ingest data and add metadata
|
||||||
if data_point:
|
with open_data_file(file_path, s3fs=fs) as file:
|
||||||
if data_point.pipeline_status.get(pipeline_name) == "Completed":
|
classified_data = ingestion.classify(file, s3fs=fs)
|
||||||
break
|
# data_id is the hash of file contents + owner id to avoid duplicate data
|
||||||
|
data_id = ingestion.identify(classified_data, user)
|
||||||
|
else:
|
||||||
|
# If data was already processed by Cognee get data id
|
||||||
|
data_id = data_item.id
|
||||||
|
|
||||||
|
# Check pipeline status, if Data already processed for pipeline before skip current processing
|
||||||
|
async with db_engine.get_async_session() as session:
|
||||||
|
data_point = (
|
||||||
|
await session.execute(select(Data).filter(Data.id == data_id))
|
||||||
|
).scalar_one_or_none()
|
||||||
|
if data_point:
|
||||||
|
if data_point.pipeline_status.get(pipeline_name) == "Completed":
|
||||||
|
break
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async for result in run_tasks_with_telemetry(
|
async for result in run_tasks_with_telemetry(
|
||||||
|
|
@ -134,23 +134,24 @@ async def run_tasks(
|
||||||
payload=result,
|
payload=result,
|
||||||
)
|
)
|
||||||
|
|
||||||
data_items_pipeline_run_info[data_id] = {
|
if incremental_loading:
|
||||||
"run_info": PipelineRunCompleted(
|
data_items_pipeline_run_info[data_id] = {
|
||||||
pipeline_run_id=pipeline_run_id,
|
"run_info": PipelineRunCompleted(
|
||||||
dataset_id=dataset.id,
|
pipeline_run_id=pipeline_run_id,
|
||||||
dataset_name=dataset.name,
|
dataset_id=dataset.id,
|
||||||
),
|
dataset_name=dataset.name,
|
||||||
"data_id": data_id,
|
),
|
||||||
}
|
"data_id": data_id,
|
||||||
|
}
|
||||||
|
|
||||||
# Update pipeline status for Data element
|
# Update pipeline status for Data element
|
||||||
async with db_engine.get_async_session() as session:
|
async with db_engine.get_async_session() as session:
|
||||||
data_point = (
|
data_point = (
|
||||||
await session.execute(select(Data).filter(Data.id == data_id))
|
await session.execute(select(Data).filter(Data.id == data_id))
|
||||||
).scalar_one_or_none()
|
).scalar_one_or_none()
|
||||||
data_point.pipeline_status[pipeline_name] = "Completed"
|
data_point.pipeline_status[pipeline_name] = "Completed"
|
||||||
await session.merge(data_point)
|
await session.merge(data_point)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
except Exception as error:
|
except Exception as error:
|
||||||
# Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
|
# Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
|
||||||
|
|
@ -158,16 +159,16 @@ async def run_tasks(
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
|
f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
|
||||||
)
|
)
|
||||||
|
if incremental_loading:
|
||||||
data_items_pipeline_run_info = {
|
data_items_pipeline_run_info = {
|
||||||
"run_info": PipelineRunErrored(
|
"run_info": PipelineRunErrored(
|
||||||
pipeline_run_id=pipeline_run_id,
|
pipeline_run_id=pipeline_run_id,
|
||||||
payload=error,
|
payload=error,
|
||||||
dataset_id=dataset.id,
|
dataset_id=dataset.id,
|
||||||
dataset_name=dataset.name,
|
dataset_name=dataset.name,
|
||||||
),
|
),
|
||||||
"data_id": data_id,
|
"data_id": data_id,
|
||||||
}
|
}
|
||||||
|
|
||||||
# re-raise error found during data ingestion
|
# re-raise error found during data ingestion
|
||||||
if ingestion_error:
|
if ingestion_error:
|
||||||
|
|
|
||||||
|
|
@ -103,6 +103,9 @@ async def get_repo_file_dependencies(
|
||||||
extraction of dependencies (default is False). (default False)
|
extraction of dependencies (default is False). (default False)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if isinstance(repo_path, list) and len(repo_path) == 1:
|
||||||
|
repo_path = repo_path[0]
|
||||||
|
|
||||||
if not os.path.exists(repo_path):
|
if not os.path.exists(repo_path):
|
||||||
raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
|
raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue