Ingest non-code files

This commit is contained in:
Rita Aleksziev 2024-12-20 10:53:57 +01:00
parent 02f46a5fdf
commit a8a83fffff
4 changed files with 80 additions and 11 deletions

View file

@ -2,29 +2,37 @@ import asyncio
import logging
from pathlib import Path
from cognee.base_config import get_base_config
from cognee.modules.cognify.config import get_cognify_config
from cognee.modules.pipelines import run_tasks
from cognee.modules.pipelines.tasks.Task import Task
from cognee.modules.users.methods import get_default_user
from cognee.shared.data_models import KnowledgeGraph, MonitoringTool
from cognee.tasks.documents import (classify_documents,
extract_chunks_from_documents)
from cognee.tasks.graph import extract_graph_from_data
from cognee.tasks.ingestion import ingest_data_with_metadata
from cognee.tasks.repo_processor import (enrich_dependency_graph,
expand_dependency_graph,
get_data_list_for_user,
get_non_code_files,
get_repo_file_dependencies)
from cognee.tasks.storage import add_data_points
from cognee.base_config import get_base_config
from cognee.shared.data_models import MonitoringTool
monitoring = get_base_config().monitoring_tool
if monitoring == MonitoringTool.LANGFUSE:
from langfuse.decorators import observe
from cognee.tasks.summarization import summarize_code
from cognee.tasks.summarization import summarize_code, summarize_text
logger = logging.getLogger("code_graph_pipeline")
update_status_lock = asyncio.Lock()
@observe
async def run_code_graph_pipeline(repo_path):
async def run_code_graph_pipeline(repo_path, include_docs=True):
import os
import pathlib
import cognee
from cognee.infrastructure.databases.relational import create_db_and_tables
@ -38,6 +46,9 @@ async def run_code_graph_pipeline(repo_path):
await cognee.prune.prune_system(metadata=True)
await create_db_and_tables()
cognee_config = get_cognify_config()
user = await get_default_user()
tasks = [
Task(get_repo_file_dependencies),
Task(enrich_dependency_graph, task_config={"batch_size": 50}),
@ -46,4 +57,24 @@ async def run_code_graph_pipeline(repo_path):
Task(add_data_points, task_config={"batch_size": 50}),
]
return run_tasks(tasks, repo_path, "cognify_code_pipeline")
if include_docs:
non_code_tasks = [
Task(get_non_code_files, task_config={"batch_size": 50}),
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
Task(extract_chunks_from_documents),
Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
Task(
summarize_text,
summarization_model=cognee_config.summarization_model,
task_config={"batch_size": 50}
),
]
if include_docs:
async for result in run_tasks(non_code_tasks, repo_path):
yield result
async for result in run_tasks(tasks, repo_path, "cognify_code_pipeline"):
yield result

View file

@ -4,4 +4,5 @@ logger = logging.getLogger("task:repo_processor")
from .enrich_dependency_graph import enrich_dependency_graph
from .expand_dependency_graph import expand_dependency_graph
from .get_non_code_files import get_data_list_for_user, get_non_py_files
from .get_repo_file_dependencies import get_repo_file_dependencies

View file

@ -0,0 +1,36 @@
import os
import aiofiles
import cognee.modules.ingestion as ingestion
from cognee.infrastructure.engine import DataPoint
from cognee.modules.data.methods import get_datasets
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.methods.get_datasets_by_name import \
get_datasets_by_name
from cognee.modules.data.models import Data
from cognee.modules.data.operations.write_metadata import write_metadata
from cognee.modules.ingestion.data_types import BinaryData
from cognee.modules.users.methods import get_default_user
from cognee.shared.CodeGraphEntities import Repository
async def get_non_py_files(repo_path):
    """Collect the paths of all files under ``repo_path`` that are not ``.py`` files.

    The tree is walked recursively with ``os.walk``. Only file names are
    inspected; file contents are not read.

    Args:
        repo_path: Root directory to scan.

    Returns:
        A list with one entry per file whose name does not end in ``.py``,
        each built by joining the walked directory with the file name.
        An empty list is returned when ``repo_path`` does not exist.
    """
    if not os.path.exists(repo_path):
        # Keep the return type a list on this branch too, so callers can
        # handle the missing-path case exactly like an empty repository.
        return []

    return [
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(repo_path)
        for file_name in file_names
        if not file_name.endswith(".py")
    ]
async def get_data_list_for_user(_, dataset_name, user):
    """Return the data records of every dataset found for ``dataset_name`` and ``user``.

    Datasets are looked up with ``get_datasets_by_name(dataset_name, user.id)``
    and the records of each one are fetched with ``get_dataset_data``.

    Args:
        _: Ignored; the function does not use its first positional argument.
        dataset_name: Name passed to the dataset lookup.
        user: Object whose ``id`` attribute is passed to the dataset lookup.

    Returns:
        A single flat list holding the records of all matching datasets, in
        lookup order. The list is empty when no dataset matches.
    """
    matching_datasets = await get_datasets_by_name(dataset_name, user.id)

    collected: list[Data] = []
    for entry in matching_datasets:
        collected += await get_dataset_data(dataset_id=entry.id)

    return collected

View file

@ -1,15 +1,16 @@
import argparse
import asyncio
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
def str_to_bool(value):
    """Convert a command-line string such as ``"true"`` or ``"0"`` to a bool.

    argparse's ``type=bool`` cannot do this: it calls ``bool()`` on the raw
    string, and every non-empty string (including ``"False"``) is truthy.

    Args:
        value: The raw option value; an actual ``bool`` is passed through.

    Returns:
        ``True`` or ``False`` according to the spelling of ``value``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    if normalized in {"false", "f", "no", "n", "0"}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


async def main(repo_path, include_docs=True):
    """Run the code graph pipeline on ``repo_path`` and print every result.

    Args:
        repo_path: Path to the repository to process.
        include_docs: Forwarded to ``run_code_graph_pipeline``; defaults to
            ``True`` so existing ``main(repo_path)`` calls keep working.
    """
    # The pipeline is iterated directly with ``async for``; it is not awaited first.
    async for result in run_code_graph_pipeline(repo_path, include_docs):
        print(result)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Accept both spellings so invocations using the dashed flag keep working.
    parser.add_argument(
        "--repo_path",
        "--repo-path",
        dest="repo_path",
        type=str,
        required=True,
        help="Path to the repository",
    )
    parser.add_argument(
        "--include_docs",
        type=str_to_bool,
        default=True,
        help="Whether or not to process non-code files",
    )
    args = parser.parse_args()
    asyncio.run(main(args.repo_path, args.include_docs))