cognee/cognee/api/v1/cognify/cognify.py
2024-05-21 19:34:18 +02:00

275 lines
11 KiB
Python

import asyncio
import os
from uuid import uuid4
from typing import List, Union
import logging
import instructor
import nltk
from openai import OpenAI
from nltk.corpus import stopwords
from cognee.api.v1.prune import prune
from cognee.config import Config
from cognee.infrastructure.data.chunking.LangchainChunkingEngine import LangchainChunkEngine
from cognee.infrastructure.databases.vector.embeddings.DefaultEmbeddingEngine import LiteLLMEmbeddingEngine
from cognee.modules.cognify.graph.add_data_chunks import add_data_chunks, add_data_chunks_basic_rag
from cognee.modules.cognify.graph.add_document_node import add_document_node
from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
from cognee.modules.cognify.graph.add_cognitive_layer_graphs import add_cognitive_layer_graphs
from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
from cognee.modules.cognify.graph.add_summary_nodes import add_summary_nodes
from cognee.modules.cognify.graph.add_node_connections import group_nodes_by_layer, \
graph_ready_output, connect_nodes_in_graph
from cognee.modules.cognify.llm.resolve_cross_graph_references import resolve_cross_graph_references
from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
from cognee.modules.cognify.graph.add_cognitive_layers import add_cognitive_layers
# from cognee.modules.cognify.graph.initialize_graph import initialize_graph
from cognee.infrastructure.files.utils.guess_file_type import guess_file_type, FileTypeException
from cognee.infrastructure.files.utils.extract_text_from_file import extract_text_from_file
from cognee.infrastructure import infrastructure_config
from cognee.modules.data.get_content_categories import get_content_categories
from cognee.modules.data.get_content_summary import get_content_summary
from cognee.modules.data.get_cognitive_layers import get_cognitive_layers
from cognee.modules.data.get_layer_graphs import get_layer_graphs
from cognee.modules.topology.topology import TopologyEngine
from cognee.shared.GithubClassification import CodeContentPrediction
from cognee.shared.data_models import ChunkStrategy, DefaultGraphModel, KnowledgeGraph
from cognee.utils import send_telemetry
from cognee.shared.SourceCodeGraph import SourceCodeGraph
config = Config()
config.load()
aclient = instructor.patch(OpenAI())
USER_ID = "default_user"
logger = logging.getLogger("cognify")
async def cognify(datasets: Union[str, List[str]] = None):
"""This function is responsible for the cognitive processing of the content."""
# Has to be loaded in advance, multithreading doesn't work without it.
nltk.download("stopwords", quiet=True)
stopwords.ensure_loaded()
graph_db_type = infrastructure_config.get_config()["graph_engine"]
graph_client = await get_graph_client(graph_db_type)
db_engine = infrastructure_config.get_config()["database_engine"]
if datasets is None or len(datasets) == 0:
datasets = db_engine.get_datasets()
awaitables = []
# datasets is a list of dataset names
if isinstance(datasets, list):
for dataset in datasets:
awaitables.append(cognify(dataset))
graphs = await asyncio.gather(*awaitables)
return graphs[0]
added_datasets = db_engine.get_datasets()
dataset_files = []
# datasets is a dataset name string
dataset_name = datasets.replace(".", "_").replace(" ", "_")
for added_dataset in added_datasets:
if dataset_name in added_dataset:
dataset_files.append((added_dataset, db_engine.get_files_metadata(added_dataset)))
data_chunks = {}
chunk_engine = infrastructure_config.get_config()["chunk_engine"]
chunk_strategy = infrastructure_config.get_config()["chunk_strategy"]
async def process_batch(files_batch):
for dataset_name, file_metadata, document_id in files_batch:
with open(file_metadata["file_path"], "rb") as file:
try:
file_type = guess_file_type(file)
text = extract_text_from_file(file, file_type)
if text is None:
text = "empty file"
if text == "":
text = "empty file"
subchunks = chunk_engine.chunk_data(chunk_strategy, text, config.chunk_size, config.chunk_overlap)
if dataset_name not in data_chunks:
data_chunks[dataset_name] = []
for subchunk in subchunks:
data_chunks[dataset_name].append(dict(document_id = document_id, chunk_id = str(uuid4()), text = subchunk, file_metadata = file_metadata))
except FileTypeException:
logger.warning("File (%s) has an unknown file type. We are skipping it.", file_metadata["id"])
added_chunks = await add_data_chunks(data_chunks)
added__basic_rag_chunks = await add_data_chunks_basic_rag(data_chunks)
await asyncio.gather(
*[process_text(chunk["collection"], chunk["chunk_id"], chunk["text"], chunk["file_metadata"],chunk['document_id']) for chunk in
added_chunks]
)
batch_size = 20
file_count = 0
files_batch = []
for (dataset_name, files) in dataset_files:
for file_metadata in files:
graph_topology = infrastructure_config.get_config()["graph_model"]
if graph_topology == SourceCodeGraph:
parent_node_id = f"{file_metadata['name']}.{file_metadata['extension']}"
elif graph_topology == KnowledgeGraph:
parent_node_id = f"DefaultGraphModel__{USER_ID}"
else:
parent_node_id = f"DefaultGraphModel__{USER_ID}"
document_id = await add_document_node(
graph_client,
parent_node_id=parent_node_id,
document_metadata=file_metadata,
)
files_batch.append((dataset_name, file_metadata, document_id))
file_count += 1
if file_count >= batch_size:
await process_batch(files_batch)
files_batch = []
file_count = 0
# Process any remaining files in the last batch
if files_batch:
await process_batch(files_batch)
return graph_client.graph
async def process_text(chunk_collection: str, chunk_id: str, input_text: str, file_metadata: dict, document_id: str):
print(f"Processing chunk ({chunk_id}) from document ({file_metadata['id']}).")
graph_client = await get_graph_client(infrastructure_config.get_config()["graph_engine"])
graph_topology = infrastructure_config.get_config()["graph_model"]
if graph_topology == SourceCodeGraph:
classified_categories = [{'data_type': 'text', 'category_name': 'Code and functions'}]
elif graph_topology == KnowledgeGraph:
classified_categories = await get_content_categories(input_text)
else:
classified_categories = [{'data_type': 'text', 'category_name': 'Unclassified text'}]
# await add_label_nodes(graph_client, document_id, chunk_id, file_metadata["keywords"].split("|"))
await add_classification_nodes(
graph_client,
parent_node_id = document_id,
categories = classified_categories,
)
print(f"Chunk ({chunk_id}) classified.")
content_summary = await get_content_summary(input_text)
await add_summary_nodes(graph_client, document_id, content_summary)
print(f"Chunk ({chunk_id}) summarized.")
cognitive_layers = await get_cognitive_layers(input_text, classified_categories)
cognitive_layers = cognitive_layers[:config.cognitive_layers_limit]
try:
cognitive_layers = (await add_cognitive_layers(graph_client, document_id, cognitive_layers))[:2]
print("cognitive_layers", cognitive_layers)
layer_graphs = await get_layer_graphs(input_text, cognitive_layers)
await add_cognitive_layer_graphs(graph_client, chunk_collection, chunk_id, layer_graphs)
except:
pass
# if infrastructure_config.get_config()["connect_documents"] is True:
# db_engine = infrastructure_config.get_config()["database_engine"]
# relevant_documents_to_connect = db_engine.fetch_cognify_data(excluded_document_id = raw_document_id)
#
# list_of_nodes = []
#
# relevant_documents_to_connect.append({
# "layer_id": document_id,
# })
#
# for document in relevant_documents_to_connect:
# node_descriptions_to_match = await graph_client.extract_node_description(document["layer_id"])
# list_of_nodes.extend(node_descriptions_to_match)
#
# nodes_by_layer = await group_nodes_by_layer(list_of_nodes)
#
# results = await resolve_cross_graph_references(nodes_by_layer)
#
# relationships = graph_ready_output(results)
#
# await connect_nodes_in_graph(
# graph_client,
# relationships,
# score_threshold = infrastructure_config.get_config()["intra_layer_score_treshold"]
# )
#
# send_telemetry("cognee.cognify")
#
# print(f"Chunk ({chunk_id}) cognified.")
if __name__ == "__main__":
async def test():
# await prune.prune_system()
# #
# from cognee.api.v1.add import add
# data_directory_path = os.path.abspath("../../../.data")
# # print(data_directory_path)
# # config.data_root_directory(data_directory_path)
# # cognee_directory_path = os.path.abspath("../.cognee_system")
# # config.system_root_directory(cognee_directory_path)
#
# await add("data://" +data_directory_path, "example")
text = """import subprocess
def show_all_processes():
process = subprocess.Popen(['ps', 'aux'], stdout=subprocess.PIPE)
output, error = process.communicate()
if error:
print(f"Error: {error}")
else:
print(output.decode())
show_all_processes()"""
from cognee.api.v1.add import add
await add([text], "example_dataset")
infrastructure_config.set_config( {"chunk_engine": LangchainChunkEngine() , "chunk_strategy": ChunkStrategy.CODE,'embedding_engine': LiteLLMEmbeddingEngine() })
from cognee.shared.SourceCodeGraph import SourceCodeGraph
from cognee.api.v1.config import config
# config.set_graph_model(SourceCodeGraph)
# config.set_classification_model(CodeContentPrediction)
# graph = await cognify()
vector_client = infrastructure_config.get_config("vector_engine")
out = await vector_client.search(collection_name ="basic_rag", query_text="show_all_processes", limit=10)
print("results", out)
#
# from cognee.utils import render_graph
#
# await render_graph(graph, include_color=True, include_nodes=False, include_size=False)
import asyncio
asyncio.run(test())