refactor: Cog-547 code graph enrichment task clean

This commit is contained in:
0xideas 2024-11-24 19:26:40 +01:00 committed by GitHub
parent 80b06c3acb
commit 8466764cbe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,11 +1,8 @@
import argparse
import asyncio import asyncio
import os import os
import cognee from uuid import UUID, uuid4
import json
import numpy as np import numpy as np
from networkx.classes.digraph import DiGraph
from cognee.modules.pipelines import Task, run_tasks from cognee.modules.pipelines import Task, run_tasks
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
@ -16,49 +13,70 @@ from cognee.tasks.code.get_local_dependencies_checker import (
from cognee.tasks.graph.convert_graph_from_code_graph import ( from cognee.tasks.graph.convert_graph_from_code_graph import (
convert_graph_from_code_graph, convert_graph_from_code_graph,
) )
from cognee.tasks.repo_processor.get_repo_dependency_graph import (
get_repo_dependency_graph,
)
from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph
from cognee.tasks.summarization import summarize_code from cognee.tasks.summarization import summarize_code
from cognee.tasks.storage import index_data_points from cognee.tests.tasks.graph.code_graph_test_data_generation import (
code_graph_test_data_generation,
)
async def print_results(pipeline):
    """Print every item produced by an asynchronous pipeline.

    Args:
        pipeline: Async iterable; each item it yields is passed to ``print``
            in the order it is produced.
    """
    async for item in pipeline:
        print(item)
async def write_results(repo, pipeline):
    """Write each code summary from the pipeline to a JSON file under the repo.

    For every summary yielded by ``pipeline`` a JSON document with the keys
    ``"summary"`` and ``"source_code"`` is written to
    ``<repo>/code_pipeline_output/<relative dir of the source>/<file name>``,
    where a trailing ``.py`` in the file name is replaced by ``.json``.

    Args:
        repo: Path of the repository root; output is created beneath it.
        pipeline: Async iterable yielding ``(code_files, summaries)`` pairs.
            Each summary must expose ``text`` and a ``made_from`` object with
            ``extracted_id`` (path of the source file) and ``source_code``.
    """
    output_root = os.path.join(repo, "code_pipeline_output", "")
    os.makedirs(output_root, exist_ok=True)

    # Only the summaries are persisted; the code files in each pair are unused.
    async for _code_files, summaries in pipeline:
        for summary in summaries:
            source_path = summary.made_from.extracted_id
            file_name = os.path.basename(source_path)
            # Swap only a trailing ".py": str.replace would also rewrite a
            # ".py" that appears in the middle of the name.
            if file_name.endswith(".py"):
                file_name = file_name[: -len(".py")] + ".json"

            relative_dir = os.path.dirname(os.path.relpath(source_path, repo))
            target_dir = os.path.join(repo, "code_pipeline_output", relative_dir)
            os.makedirs(target_dir, exist_ok=True)

            target_path = os.path.join(target_dir, file_name)
            with open(target_path, "w", encoding="utf-8") as f:
                json.dump(
                    {
                        "summary": summary.text,
                        "source_code": summary.made_from.source_code,
                    },
                    f,
                )
async def reset_system():
    """Prune cognee's stored data and system state (including metadata).

    Returns:
        ``True`` once both prune calls have completed.
    """
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    return True


def create_code_file(path, type):
    """Read a source file and wrap it in a ``CodeFile``.

    Args:
        path: Path to the file; it is resolved to an absolute path first.
        type: Value stored in the ``CodeFile``'s ``type`` field. The name
            shadows the builtin but is kept because it is part of the
            function's signature.

    Returns:
        A tuple ``(code_file, abspath)`` where ``abspath`` is the absolute
        path that was read and that is used as the file's ``extracted_id``.
    """
    abspath = os.path.abspath(path)
    # Python source defaults to UTF-8; relying on the locale encoding can
    # mis-decode or fail on platforms whose default is something else.
    with open(abspath, "r", encoding="utf-8") as f:
        source_code = f.read()
    code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code)
    return (code_file, abspath)
async def get_local_script_dependencies_wrapper(script_path, repo_path):
    """Resolve a script's local dependencies and pair them with its path.

    Args:
        script_path: Path of the script whose dependencies are looked up.
        repo_path: Repository path forwarded to
            ``get_local_script_dependencies``.

    Returns:
        A tuple ``(script_path, dependencies)`` so the result stays
        attributable to its script when many lookups are awaited together.
    """
    return script_path, await get_local_script_dependencies(script_path, repo_path)
async def scan_repo(path, condition):
    """Collect code files and their dependency relationships under ``path``.

    Walks ``path`` recursively, looks up the local dependencies of every file
    whose name satisfies ``condition`` (all lookups are awaited together), and
    turns scripts and dependencies into ``CodeFile`` objects linked by
    ``depends_on`` relationships.

    Args:
        path: Root directory to scan.
        condition: Callable that receives a bare file name and returns a
            truthy value for files that should be scanned.

    Returns:
        A tuple ``(Repository(path=path), code_files, code_relationships)``.
        ``code_files`` is a list with one ``CodeFile`` per distinct absolute
        path; ``code_relationships`` holds one ``CodeRelationship`` for each
        dependency reported for a script.
    """
    lookups = []
    for root, _dirs, files in os.walk(path):
        for file_name in files:
            if condition(file_name):
                script_path = os.path.abspath(os.path.join(root, file_name))
                lookups.append(
                    get_local_script_dependencies_wrapper(script_path, path)
                )

    results = await asyncio.gather(*lookups)

    code_files = {}
    code_relationships = []

    def load(file_path):
        """Register ``file_path`` as a code file, reading it at most once."""
        file_abspath = os.path.abspath(file_path)
        # Shared dependencies are referenced by many scripts; skip the disk
        # read when the file has already been loaded.
        if file_abspath not in code_files:
            code_file, file_abspath = create_code_file(file_abspath, "python_file")
            code_files[file_abspath] = code_file
        return file_abspath

    for script_path, dependencies in results:
        source_abspath = load(script_path)
        for dependency in dependencies:
            code_relationships.append(
                CodeRelationship(
                    source_id=source_abspath,
                    target_id=load(dependency),
                    type="files",
                    relation="depends_on",
                )
            )

    return (Repository(path=path), list(code_files.values()), code_relationships)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process a file path")
parser.add_argument("path", help="Path to the file")
args = parser.parse_args()
abspath = os.path.abspath(args.path)
data = abspath
tasks = [ tasks = [
Task(get_repo_dependency_graph), Task(scan_repo),
Task(enrich_dependency_graph), Task(convert_graph_from_code_graph),
Task(convert_graph_from_code_graph, repo_path = abspath),
Task(index_data_points),
Task(summarize_code, summarization_model=SummarizedContent), Task(summarize_code, summarization_model=SummarizedContent),
] ]
data = ("cognee", lambda file: file.endswith(".py"))
pipeline = run_tasks(tasks, data, "cognify_pipeline") pipeline = run_tasks(tasks, data, "cognify_pipeline")
asyncio.run(print_results(pipeline))
asyncio.run(write_results(abspath, pipeline))