From a8a83fffff17c882c554e1a6ea95481de773c6b1 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 20 Dec 2024 10:53:57 +0100 Subject: [PATCH] Ingest non-code files --- cognee/api/v1/cognify/code_graph_pipeline.py | 43 ++++++++++++++++--- cognee/tasks/repo_processor/__init__.py | 1 + .../repo_processor/get_non_code_files.py | 36 ++++++++++++++++ examples/python/code_graph_example.py | 11 ++--- 4 files changed, 80 insertions(+), 11 deletions(-) create mode 100644 cognee/tasks/repo_processor/get_non_code_files.py diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index c35f9719f..8e92d08e0 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -2,29 +2,37 @@ import asyncio import logging from pathlib import Path +from cognee.base_config import get_base_config +from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.users.methods import get_default_user +from cognee.shared.data_models import KnowledgeGraph, MonitoringTool +from cognee.tasks.documents import (classify_documents, + extract_chunks_from_documents) +from cognee.tasks.graph import extract_graph_from_data +from cognee.tasks.ingestion import ingest_data_with_metadata from cognee.tasks.repo_processor import (enrich_dependency_graph, expand_dependency_graph, + get_data_list_for_user, + get_non_code_files, get_repo_file_dependencies) from cognee.tasks.storage import add_data_points -from cognee.base_config import get_base_config -from cognee.shared.data_models import MonitoringTool - monitoring = get_base_config().monitoring_tool if monitoring == MonitoringTool.LANGFUSE: from langfuse.decorators import observe -from cognee.tasks.summarization import summarize_code +from cognee.tasks.summarization import summarize_code, summarize_text logger = 
logging.getLogger("code_graph_pipeline") update_status_lock = asyncio.Lock() @observe -async def run_code_graph_pipeline(repo_path): +async def run_code_graph_pipeline(repo_path, include_docs=True): import os import pathlib + import cognee from cognee.infrastructure.databases.relational import create_db_and_tables @@ -38,6 +46,9 @@ async def run_code_graph_pipeline(repo_path): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() + cognee_config = get_cognify_config() + user = await get_default_user() + tasks = [ Task(get_repo_file_dependencies), Task(enrich_dependency_graph, task_config={"batch_size": 50}), @@ -46,4 +57,24 @@ async def run_code_graph_pipeline(repo_path): Task(add_data_points, task_config={"batch_size": 50}), ] - return run_tasks(tasks, repo_path, "cognify_code_pipeline") + if include_docs: + non_code_tasks = [ + Task(get_non_code_files, task_config={"batch_size": 50}), + Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), + Task(get_data_list_for_user, dataset_name="repo_docs", user=user), + Task(classify_documents), + Task(extract_chunks_from_documents), + Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), + Task( + summarize_text, + summarization_model=cognee_config.summarization_model, + task_config={"batch_size": 50} + ), + ] + + if include_docs: + async for result in run_tasks(non_code_tasks, repo_path): + yield result + + async for result in run_tasks(tasks, repo_path, "cognify_code_pipeline"): + yield result \ No newline at end of file diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index 05e111b29..fa754028e 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -4,4 +4,5 @@ logger = logging.getLogger("task:repo_processor") from .enrich_dependency_graph import enrich_dependency_graph from .expand_dependency_graph import expand_dependency_graph +from .get_non_code_files 
import os


async def get_non_py_files(repo_path):
    """Return the paths of all files under ``repo_path`` that are not ``.py`` files.

    Only paths are collected; file contents are not read here.

    Args:
        repo_path: Root directory to walk recursively.

    Returns:
        A list of paths (each ``os.path.join(root, file_name)``) for every file
        whose name does not end with ``".py"``. An empty list is returned when
        ``repo_path`` does not exist.
    """
    if not os.path.exists(repo_path):
        # Return the same type as the success path so callers can always
        # iterate / batch the result (the previous `{}` was a dict).
        return []

    # NOTE(review): the only filter is the ".py" suffix, so VCS metadata,
    # caches and binary artifacts under repo_path are included as well —
    # confirm that this is intended for document ingestion.
    return [
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(repo_path)
        for file_name in file_names
        if not file_name.endswith(".py")
    ]


# code_graph_pipeline imports this task as `get_non_code_files`; expose the
# function under that name too. The package `__init__` must re-export this
# name as well, otherwise `from cognee.tasks.repo_processor import
# get_non_code_files` resolves to this submodule instead of the function.
get_non_code_files = get_non_py_files


async def get_data_list_for_user(_, dataset_name, user):
    """Collect the data records of every dataset found for ``dataset_name`` and ``user``.

    Args:
        _: Ignored positional argument (presumably the previous pipeline
            task's output — verify against the caller).
        dataset_name: Name passed to ``get_datasets_by_name``.
        user: Object whose ``id`` attribute is passed to ``get_datasets_by_name``.

    Returns:
        A flat list with the results of ``get_dataset_data`` for each matching
        dataset, in the order the datasets were returned.
    """
    # Imported locally so this module can be imported (and the file walker
    # used) without pulling in the data layer at import time.
    from cognee.modules.data.methods.get_dataset_data import get_dataset_data
    from cognee.modules.data.methods.get_datasets_by_name import \
        get_datasets_by_name

    datasets = await get_datasets_by_name(dataset_name, user.id)
    data_documents = []
    for dataset in datasets:
        data_documents.extend(await get_dataset_data(dataset_id=dataset.id))
    return data_documents
import argparse
import asyncio


def parse_bool(value):
    """Convert a command-line string such as ``"false"`` or ``"1"`` to a bool.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    ``"False"``) as ``True``; this converter parses the text instead.

    Args:
        value: A bool, or a string such as true/false, yes/no, on/off, 1/0
            (case-insensitive, surrounding whitespace ignored).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"1", "true", "t", "yes", "y", "on"}:
        return True
    if normalized in {"0", "false", "f", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def parse_args(argv=None):
    """Parse the example's command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use ``sys.argv[1:]``.

    Returns:
        Namespace with ``repo_path`` (str) and ``include_docs`` (bool).
    """
    parser = argparse.ArgumentParser()
    # Accept the previous "--repo-path" spelling as well as "--repo_path" so
    # existing invocations keep working.
    parser.add_argument(
        "--repo_path",
        "--repo-path",
        dest="repo_path",
        type=str,
        required=True,
        help="Path to the repository",
    )
    parser.add_argument(
        "--include_docs",
        type=parse_bool,
        default=True,
        help="Whether or not to process non-code files",
    )
    return parser.parse_args(argv)


async def main(repo_path, include_docs=True):
    """Run the code graph pipeline on ``repo_path`` and print each result.

    Args:
        repo_path: Path to the repository to process.
        include_docs: Forwarded to ``run_code_graph_pipeline``; defaults to
            ``True`` so callers that pass only ``repo_path`` keep working.
    """
    # Imported here so argument parsing does not require importing cognee.
    from cognee.api.v1.cognify.code_graph_pipeline import \
        run_code_graph_pipeline

    async for result in run_code_graph_pipeline(repo_path, include_docs):
        print(result)


if __name__ == "__main__":
    args = parse_args()
    asyncio.run(main(args.repo_path, args.include_docs))