Ingest non-code files
This commit is contained in:
parent
02f46a5fdf
commit
a8a83fffff
4 changed files with 80 additions and 11 deletions
|
|
@ -2,29 +2,37 @@ import asyncio
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from cognee.base_config import get_base_config
|
||||||
|
from cognee.modules.cognify.config import get_cognify_config
|
||||||
from cognee.modules.pipelines import run_tasks
|
from cognee.modules.pipelines import run_tasks
|
||||||
from cognee.modules.pipelines.tasks.Task import Task
|
from cognee.modules.pipelines.tasks.Task import Task
|
||||||
|
from cognee.modules.users.methods import get_default_user
|
||||||
|
from cognee.shared.data_models import KnowledgeGraph, MonitoringTool
|
||||||
|
from cognee.tasks.documents import (classify_documents,
|
||||||
|
extract_chunks_from_documents)
|
||||||
|
from cognee.tasks.graph import extract_graph_from_data
|
||||||
|
from cognee.tasks.ingestion import ingest_data_with_metadata
|
||||||
from cognee.tasks.repo_processor import (enrich_dependency_graph,
|
from cognee.tasks.repo_processor import (enrich_dependency_graph,
|
||||||
expand_dependency_graph,
|
expand_dependency_graph,
|
||||||
|
get_data_list_for_user,
|
||||||
|
get_non_code_files,
|
||||||
get_repo_file_dependencies)
|
get_repo_file_dependencies)
|
||||||
from cognee.tasks.storage import add_data_points
|
from cognee.tasks.storage import add_data_points
|
||||||
|
|
||||||
from cognee.base_config import get_base_config
|
|
||||||
from cognee.shared.data_models import MonitoringTool
|
|
||||||
|
|
||||||
monitoring = get_base_config().monitoring_tool
|
monitoring = get_base_config().monitoring_tool
|
||||||
if monitoring == MonitoringTool.LANGFUSE:
|
if monitoring == MonitoringTool.LANGFUSE:
|
||||||
from langfuse.decorators import observe
|
from langfuse.decorators import observe
|
||||||
|
|
||||||
from cognee.tasks.summarization import summarize_code
|
from cognee.tasks.summarization import summarize_code, summarize_text
|
||||||
|
|
||||||
logger = logging.getLogger("code_graph_pipeline")
|
logger = logging.getLogger("code_graph_pipeline")
|
||||||
update_status_lock = asyncio.Lock()
|
update_status_lock = asyncio.Lock()
|
||||||
|
|
||||||
@observe
|
@observe
|
||||||
async def run_code_graph_pipeline(repo_path):
|
async def run_code_graph_pipeline(repo_path, include_docs=True):
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
import cognee
|
import cognee
|
||||||
from cognee.infrastructure.databases.relational import create_db_and_tables
|
from cognee.infrastructure.databases.relational import create_db_and_tables
|
||||||
|
|
||||||
|
|
@ -38,6 +46,9 @@ async def run_code_graph_pipeline(repo_path):
|
||||||
await cognee.prune.prune_system(metadata=True)
|
await cognee.prune.prune_system(metadata=True)
|
||||||
await create_db_and_tables()
|
await create_db_and_tables()
|
||||||
|
|
||||||
|
cognee_config = get_cognify_config()
|
||||||
|
user = await get_default_user()
|
||||||
|
|
||||||
tasks = [
|
tasks = [
|
||||||
Task(get_repo_file_dependencies),
|
Task(get_repo_file_dependencies),
|
||||||
Task(enrich_dependency_graph, task_config={"batch_size": 50}),
|
Task(enrich_dependency_graph, task_config={"batch_size": 50}),
|
||||||
|
|
@ -46,4 +57,24 @@ async def run_code_graph_pipeline(repo_path):
|
||||||
Task(add_data_points, task_config={"batch_size": 50}),
|
Task(add_data_points, task_config={"batch_size": 50}),
|
||||||
]
|
]
|
||||||
|
|
||||||
return run_tasks(tasks, repo_path, "cognify_code_pipeline")
|
if include_docs:
|
||||||
|
non_code_tasks = [
|
||||||
|
Task(get_non_code_files, task_config={"batch_size": 50}),
|
||||||
|
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
|
||||||
|
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
|
||||||
|
Task(classify_documents),
|
||||||
|
Task(extract_chunks_from_documents),
|
||||||
|
Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
|
||||||
|
Task(
|
||||||
|
summarize_text,
|
||||||
|
summarization_model=cognee_config.summarization_model,
|
||||||
|
task_config={"batch_size": 50}
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
if include_docs:
|
||||||
|
async for result in run_tasks(non_code_tasks, repo_path):
|
||||||
|
yield result
|
||||||
|
|
||||||
|
async for result in run_tasks(tasks, repo_path, "cognify_code_pipeline"):
|
||||||
|
yield result
|
||||||
|
|
@ -4,4 +4,5 @@ logger = logging.getLogger("task:repo_processor")
|
||||||
|
|
||||||
from .enrich_dependency_graph import enrich_dependency_graph
|
from .enrich_dependency_graph import enrich_dependency_graph
|
||||||
from .expand_dependency_graph import expand_dependency_graph
|
from .expand_dependency_graph import expand_dependency_graph
|
||||||
|
from .get_non_code_files import get_data_list_for_user, get_non_py_files
|
||||||
from .get_repo_file_dependencies import get_repo_file_dependencies
|
from .get_repo_file_dependencies import get_repo_file_dependencies
|
||||||
|
|
|
||||||
36
cognee/tasks/repo_processor/get_non_code_files.py
Normal file
36
cognee/tasks/repo_processor/get_non_code_files.py
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
import aiofiles
|
||||||
|
|
||||||
|
import cognee.modules.ingestion as ingestion
|
||||||
|
from cognee.infrastructure.engine import DataPoint
|
||||||
|
from cognee.modules.data.methods import get_datasets
|
||||||
|
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
|
||||||
|
from cognee.modules.data.methods.get_datasets_by_name import \
|
||||||
|
get_datasets_by_name
|
||||||
|
from cognee.modules.data.models import Data
|
||||||
|
from cognee.modules.data.operations.write_metadata import write_metadata
|
||||||
|
from cognee.modules.ingestion.data_types import BinaryData
|
||||||
|
from cognee.modules.users.methods import get_default_user
|
||||||
|
from cognee.shared.CodeGraphEntities import Repository
|
||||||
|
|
||||||
|
|
||||||
|
async def get_non_py_files(repo_path):
    """Return the paths of all files under ``repo_path`` that are not ``.py`` files.

    The directory tree is walked recursively with ``os.walk``. Only paths are
    returned; file contents are not read here.

    Args:
        repo_path: Root directory of the repository to scan.

    Returns:
        A list of file paths (each joined onto the directory it was found in).
        The list is empty when ``repo_path`` does not exist, so callers always
        receive the same container type.
    """
    # NOTE(review): the code graph pipeline imports and schedules a task named
    # `get_non_code_files`, while this function is exported from the package
    # as `get_non_py_files` -- confirm the pipeline is wired to this function.
    if not os.path.exists(repo_path):
        return []

    return [
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(repo_path)
        for file_name in file_names
        if not file_name.endswith(".py")
    ]
|
||||||
|
|
||||||
|
|
||||||
|
async def get_data_list_for_user(_, dataset_name, user):
    """Collect the data records of every dataset named ``dataset_name`` for ``user``.

    Args:
        _: Output of the preceding pipeline task; it is not used here.
        dataset_name: Name passed to ``get_datasets_by_name``.
        user: Object whose ``id`` attribute identifies the dataset owner.

    Returns:
        A single flat list holding the results of ``get_dataset_data`` for each
        matching dataset, in the order the datasets were returned.
    """
    collected: list[Data] = []
    for matching_dataset in await get_datasets_by_name(dataset_name, user.id):
        records: list[Data] = await get_dataset_data(dataset_id=matching_dataset.id)
        collected += records
    return collected
|
||||||
|
|
@ -1,15 +1,16 @@
|
||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
|
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
|
||||||
|
|
||||||
|
|
||||||
async def main(repo_path, include_docs=True):
    """Run the code graph pipeline for a repository and print each result.

    Args:
        repo_path: Path to the repository, passed to ``run_code_graph_pipeline``.
        include_docs: Forwarded to ``run_code_graph_pipeline``. Defaults to
            ``True`` (the same default the pipeline declares) so callers that
            pass only ``repo_path`` keep working.
    """
    # The pipeline is consumed with `async for`; it is iterated directly
    # rather than awaited first.
    async for result in run_code_graph_pipeline(repo_path, include_docs):
        print(result)
|
||||||
|
|
||||||
def parse_bool_arg(value):
    """Convert a command-line string into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy, which would make it impossible to
    switch the option off from the command line.

    Args:
        value: The raw option value (or an actual ``bool``).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value

    normalized = value.strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy, as it was under `type=bool`.
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def build_arg_parser():
    """Build the command-line parser for the code graph pipeline script."""
    parser = argparse.ArgumentParser()
    # Both spellings are accepted so invocations written against the earlier
    # `--repo-path` option keep working; the destination is `repo_path`.
    parser.add_argument(
        "--repo_path", "--repo-path", type=str, required=True, help="Path to the repository"
    )
    parser.add_argument(
        "--include_docs",
        type=parse_bool_arg,
        default=True,
        help="Whether or not to process non-code files",
    )
    return parser


if __name__ == "__main__":
    args = build_arg_parser().parse_args()
    asyncio.run(main(args.repo_path, args.include_docs))
|
||||||
|
|
||||||
Loading…
Add table
Reference in a new issue