Ingest non-code files
parent 02f46a5fdf
commit a8a83fffff

4 changed files with 80 additions and 11 deletions
@@ -2,29 +2,37 @@ import asyncio
 import logging
+from pathlib import Path
 
+from cognee.base_config import get_base_config
+from cognee.modules.cognify.config import get_cognify_config
 from cognee.modules.pipelines import run_tasks
 from cognee.modules.pipelines.tasks.Task import Task
+from cognee.modules.users.methods import get_default_user
+from cognee.shared.data_models import KnowledgeGraph, MonitoringTool
+from cognee.tasks.documents import (classify_documents,
+                                    extract_chunks_from_documents)
+from cognee.tasks.graph import extract_graph_from_data
+from cognee.tasks.ingestion import ingest_data_with_metadata
 from cognee.tasks.repo_processor import (enrich_dependency_graph,
                                          expand_dependency_graph,
+                                         get_data_list_for_user,
+                                         get_non_code_files,
                                          get_repo_file_dependencies)
 from cognee.tasks.storage import add_data_points
 
-from cognee.base_config import get_base_config
-from cognee.shared.data_models import MonitoringTool
-
 monitoring = get_base_config().monitoring_tool
 if monitoring == MonitoringTool.LANGFUSE:
     from langfuse.decorators import observe
 
-from cognee.tasks.summarization import summarize_code
+from cognee.tasks.summarization import summarize_code, summarize_text
 
 logger = logging.getLogger("code_graph_pipeline")
 update_status_lock = asyncio.Lock()
 
 @observe
-async def run_code_graph_pipeline(repo_path):
+async def run_code_graph_pipeline(repo_path, include_docs=True):
     import os
     import pathlib
 
     import cognee
     from cognee.infrastructure.databases.relational import create_db_and_tables
 
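The `@observe` decorator used on `run_code_graph_pipeline` is only imported when the configured monitoring tool is Langfuse. A hypothetical sketch of the same conditional-import pattern with a no-op fallback (the `else` branch below is illustrative and not part of this commit):

from cognee.base_config import get_base_config
from cognee.shared.data_models import MonitoringTool

monitoring = get_base_config().monitoring_tool
if monitoring == MonitoringTool.LANGFUSE:
    # Real decorator, available when Langfuse monitoring is configured.
    from langfuse.decorators import observe
else:
    # Illustrative no-op stand-in so @observe stays defined either way.
    def observe(func):
        return func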
@@ -38,6 +46,9 @@ async def run_code_graph_pipeline(repo_path):
     await cognee.prune.prune_system(metadata=True)
     await create_db_and_tables()
 
+    cognee_config = get_cognify_config()
+    user = await get_default_user()
+
     tasks = [
         Task(get_repo_file_dependencies),
         Task(enrich_dependency_graph, task_config={"batch_size": 50}),
@@ -46,4 +57,24 @@ async def run_code_graph_pipeline(repo_path):
         Task(add_data_points, task_config={"batch_size": 50}),
     ]
 
-    return run_tasks(tasks, repo_path, "cognify_code_pipeline")
+    if include_docs:
+        non_code_tasks = [
+            Task(get_non_code_files, task_config={"batch_size": 50}),
+            Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
+            Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
+            Task(classify_documents),
+            Task(extract_chunks_from_documents),
+            Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
+            Task(
+                summarize_text,
+                summarization_model=cognee_config.summarization_model,
+                task_config={"batch_size": 50}
+            ),
+        ]
+
+    if include_docs:
+        async for result in run_tasks(non_code_tasks, repo_path):
+            yield result
+
+    async for result in run_tasks(tasks, repo_path, "cognify_code_pipeline"):
+        yield result
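Because `run_code_graph_pipeline` now yields results instead of returning a single value, callers iterate it with `async for`. A minimal consumption sketch, assuming a local repository checkout (the path and function name below are illustrative):

import asyncio

from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline


async def build_code_graph(repo_path: str) -> None:
    # include_docs=False runs only the code-graph tasks; the default True
    # also runs the non-code ingestion tasks added in this commit.
    async for result in run_code_graph_pipeline(repo_path, include_docs=False):
        print(result)


if __name__ == "__main__":
    asyncio.run(build_code_graph("/path/to/your/repo"))  # illustrative path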
@@ -4,4 +4,5 @@ logger = logging.getLogger("task:repo_processor")
 
 from .enrich_dependency_graph import enrich_dependency_graph
 from .expand_dependency_graph import expand_dependency_graph
+from .get_non_code_files import get_data_list_for_user, get_non_py_files
 from .get_repo_file_dependencies import get_repo_file_dependencies
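With this re-export in the package `__init__`, both helpers become importable from `cognee.tasks.repo_processor` directly; a one-line sketch of the resulting import path:

from cognee.tasks.repo_processor import get_data_list_for_user, get_non_py_files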
cognee/tasks/repo_processor/get_non_code_files.py  (new file, 36 additions)
@@ -0,0 +1,36 @@
+import os
+
+import aiofiles
+
+import cognee.modules.ingestion as ingestion
+from cognee.infrastructure.engine import DataPoint
+from cognee.modules.data.methods import get_datasets
+from cognee.modules.data.methods.get_dataset_data import get_dataset_data
+from cognee.modules.data.methods.get_datasets_by_name import \
+    get_datasets_by_name
+from cognee.modules.data.models import Data
+from cognee.modules.data.operations.write_metadata import write_metadata
+from cognee.modules.ingestion.data_types import BinaryData
+from cognee.modules.users.methods import get_default_user
+from cognee.shared.CodeGraphEntities import Repository
+
+
+async def get_non_py_files(repo_path):
+    """Get files that are not .py files and their contents"""
+    if not os.path.exists(repo_path):
+        return {}
+
+    non_py_files_paths = [
+        os.path.join(root, file)
+        for root, _, files in os.walk(repo_path) for file in files if not file.endswith(".py")
+    ]
+    return non_py_files_paths
+
+
+async def get_data_list_for_user(_, dataset_name, user):
+    datasets = await get_datasets_by_name(dataset_name, user.id)
+    data_documents: list[Data] = []
+    for dataset in datasets:
+        data_docs: list[Data] = await get_dataset_data(dataset_id=dataset.id)
+        data_documents.extend(data_docs)
+    return data_documents
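The new helpers are ordinary async callables, so they can also be exercised outside the pipeline. A minimal usage sketch for `get_non_py_files`, assuming a local repository path (the path and wrapper function are illustrative):

import asyncio

from cognee.tasks.repo_processor.get_non_code_files import get_non_py_files


async def list_non_python_files(repo_path: str) -> None:
    # Returns the paths of every non-.py file under repo_path,
    # or an empty dict when the path does not exist.
    paths = await get_non_py_files(repo_path)
    for path in paths:
        print(path)


if __name__ == "__main__":
    asyncio.run(list_non_python_files("/path/to/your/repo"))  # illustrative path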
@@ -1,15 +1,16 @@
 import argparse
 import asyncio
 
 from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
 
-
-async def main(repo_path):
-    async for result in await run_code_graph_pipeline(repo_path):
+async def main(repo_path, include_docs):
+    async for result in run_code_graph_pipeline(repo_path, include_docs):
         print(result)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository")
+    parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository")
+    parser.add_argument("--include_docs", type=bool, default=True, help="Whether or not to process non-code files")
     args = parser.parse_args()
-    asyncio.run(main(args.repo_path))
+
+    asyncio.run(main(args.repo_path, args.include_docs))
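One caveat in the example script: argparse's `type=bool` converts any non-empty string to `True`, so `--include_docs False` still enables document ingestion. An alternative sketch (not part of this commit) using an explicit string-to-bool converter:

import argparse


def str_to_bool(value: str) -> bool:
    # Map common spellings to booleans; reject anything else.
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")


parser = argparse.ArgumentParser()
parser.add_argument("--include_docs", type=str_to_bool, default=True,
                    help="Whether or not to process non-code files")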