From 3c261ce6a192bd0bf80ce0babf02e5be1a75056d Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Sun, 12 May 2024 21:58:09 +0200
Subject: [PATCH] working cognify on large dataset

---
 cognee/api/v1/cognify/cognify.py              | 36 +++++++----
 cognee/config.py                              |  2 +
 .../data/chunking/LangchainChunkingEngine.py  |  4 +-
 .../embeddings/DefaultEmbeddingEngine.py      | 21 +++++--
 .../modules/cognify/graph/add_data_chunks.py  | 61 ++++++++++---------
 cognee/shared/GithubClassification.py         | 25 ++++++++
 6 files changed, 101 insertions(+), 48 deletions(-)
 create mode 100644 cognee/shared/GithubClassification.py

diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py
index b9bcf81cb..28203f7e2 100644
--- a/cognee/api/v1/cognify/cognify.py
+++ b/cognee/api/v1/cognify/cognify.py
@@ -11,7 +11,7 @@ from nltk.corpus import stopwords
 from cognee.api.v1.prune import prune
 from cognee.config import Config
 from cognee.infrastructure.data.chunking.LangchainChunkingEngine import LangchainChunkEngine
-from cognee.infrastructure.databases.vector.embeddings.DefaultEmbeddingEngine import OpenAIEmbeddingEngine
+from cognee.infrastructure.databases.vector.embeddings.DefaultEmbeddingEngine import LiteLLMEmbeddingEngine
 from cognee.modules.cognify.graph.add_data_chunks import add_data_chunks
 from cognee.modules.cognify.graph.add_document_node import add_document_node
 from cognee.modules.cognify.graph.add_classification_nodes import add_classification_nodes
@@ -32,6 +32,7 @@ from cognee.modules.data.get_content_summary import get_content_summary
 from cognee.modules.data.get_cognitive_layers import get_cognitive_layers
 from cognee.modules.data.get_layer_graphs import get_layer_graphs
 from cognee.modules.topology.topology import TopologyEngine
+from cognee.shared.GithubClassification import CodeContentPrediction
 from cognee.shared.data_models import ChunkStrategy
 from cognee.utils import send_telemetry

@@ -94,7 +95,7 @@ async def cognify(datasets: Union[str, List[str]] = None):
                 file_type = guess_file_type(file)
                 text = extract_text_from_file(file, file_type)
                 if text is None:
-                    text = ""
+                    text = "empty file"
                 subchunks = chunk_engine.chunk_data(chunk_strategy, text, config.chunk_size, config.chunk_overlap)

                 if dataset_name not in data_chunks:
@@ -105,6 +106,9 @@
             except FileTypeException:
                 logger.warning("File (%s) has an unknown file type. We are skipping it.", file_metadata["id"])
+
+
+
     added_chunks: list[tuple[str, str, dict]] = await add_data_chunks(data_chunks)

     await asyncio.gather(
@@ -129,12 +133,12 @@ async def process_text(chunk_collection: str, chunk_id: str, input_text: str, fi
     # await add_label_nodes(graph_client, document_id, chunk_id, file_metadata["keywords"].split("|"))

-    # classified_categories = await get_content_categories(input_text)
-    # await add_classification_nodes(
-    #     graph_client,
-    #     parent_node_id = document_id,
-    #     categories = classified_categories,
-    # )
+    classified_categories = await get_content_categories(input_text)
+    await add_classification_nodes(
+        graph_client,
+        parent_node_id = document_id,
+        categories = classified_categories,
+    )

     # print(f"Chunk ({chunk_id}) classified.")

@@ -145,11 +149,11 @@ async def process_text(chunk_collection: str, chunk_id: str, input_text: str, fi
     print(f"Chunk ({chunk_id}) summarized.")

-    # cognitive_layers = await get_cognitive_layers(input_text, classified_categories)
-    # cognitive_layers = (await add_cognitive_layers(graph_client, document_id, cognitive_layers))[:2]
+    cognitive_layers = await get_cognitive_layers(input_text, classified_categories)
+    cognitive_layers = (await add_cognitive_layers(graph_client, document_id, cognitive_layers))[:2]
     #
-    # layer_graphs = await get_layer_graphs(input_text, cognitive_layers)
-    # await add_cognitive_layer_graphs(graph_client, chunk_collection, chunk_id, layer_graphs)
+    layer_graphs = await get_layer_graphs(input_text, cognitive_layers)
+    await add_cognitive_layer_graphs(graph_client, chunk_collection, chunk_id, layer_graphs)
     #
     # if infrastructure_config.get_config()["connect_documents"] is True:
     #     db_engine = infrastructure_config.get_config()["database_engine"]
@@ -196,7 +200,13 @@ if __name__ == "__main__":
     #
     # await add("data://" +data_directory_path, "example")

-    infrastructure_config.set_config( {"chunk_engine": LangchainChunkEngine() , "chunk_strategy": ChunkStrategy.CODE,'embedding_engine': OpenAIEmbeddingEngine()})
+    infrastructure_config.set_config( {"chunk_engine": LangchainChunkEngine() , "chunk_strategy": ChunkStrategy.CODE,'embedding_engine': LiteLLMEmbeddingEngine()})
+    from cognee.shared.SourceCodeGraph import SourceCodeGraph
+    from cognee.api.v1.config import config
+
+    config.set_graph_model(SourceCodeGraph)
+
+    config.set_classification_model(CodeContentPrediction)

     graph = await cognify()
     #
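The cognify.py changes above switch the embedder from OpenAIEmbeddingEngine to LiteLLMEmbeddingEngine, re-enable the classification and cognitive-layer steps in process_text, and make the __main__ block configure code-aware chunking plus the source-code graph and classification models before running the pipeline. As a standalone driver, the new __main__ logic amounts to roughly the following (a sketch only; it assumes the dataset has already been registered, e.g. with the add API referenced in the commented-out lines):

    import asyncio

    from cognee.api.v1.cognify.cognify import cognify
    from cognee.api.v1.config import config
    from cognee.infrastructure import infrastructure_config
    from cognee.infrastructure.data.chunking.LangchainChunkingEngine import LangchainChunkEngine
    from cognee.infrastructure.databases.vector.embeddings.DefaultEmbeddingEngine import LiteLLMEmbeddingEngine
    from cognee.shared.data_models import ChunkStrategy
    from cognee.shared.GithubClassification import CodeContentPrediction
    from cognee.shared.SourceCodeGraph import SourceCodeGraph

    async def main():
        # Chunk with the code-aware Langchain splitter and embed through LiteLLM.
        infrastructure_config.set_config({
            "chunk_engine": LangchainChunkEngine(),
            "chunk_strategy": ChunkStrategy.CODE,
            "embedding_engine": LiteLLMEmbeddingEngine(),
        })
        # Use the source-code graph and classification models instead of the text defaults.
        config.set_graph_model(SourceCodeGraph)
        config.set_classification_model(CodeContentPrediction)
        return await cognify()

    if __name__ == "__main__":
        graph = asyncio.run(main())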
diff --git a/cognee/config.py b/cognee/config.py
index 91101334b..639a6b41e 100644
--- a/cognee/config.py
+++ b/cognee/config.py
@@ -63,6 +63,8 @@ class Config:
     openai_temperature: float = float(os.getenv("OPENAI_TEMPERATURE", 0.0))
     openai_embedding_model = "text-embedding-3-large"
     openai_embedding_dimensions = 3072
+    litellm_embedding_model = "text-embedding-3-large"
+    litellm_embedding_dimensions = 3072

     graphistry_username = os.getenv("GRAPHISTRY_USERNAME")
     graphistry_password = os.getenv("GRAPHISTRY_PASSWORD")

diff --git a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
index f15d66f7e..c936bbe66 100644
--- a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
+++ b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
@@ -46,5 +46,7 @@ class LangchainChunkEngine():
         )
         code_chunks = python_splitter.create_documents([data_chunks])

-        return code_chunks
+        only_content = [chunk.page_content for chunk in code_chunks]
+
+        return only_content
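The chunk_code change above unwraps Langchain's Document objects, so the engine now returns plain strings that downstream chunk handling can store directly. A minimal sketch of the same transformation, assuming the langchain-text-splitters package (older Langchain releases expose the splitter under langchain.text_splitter instead):

    from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

    python_splitter = RecursiveCharacterTextSplitter.from_language(
        language = Language.PYTHON, chunk_size = 256, chunk_overlap = 16
    )
    code_chunks = python_splitter.create_documents(["def add(a, b):\n    return a + b\n"])

    # Previously the Document objects were returned as-is; now only their text survives.
    only_content = [chunk.page_content for chunk in code_chunks]
    assert all(isinstance(chunk, str) for chunk in only_content)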
diff --git a/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py
index 8ced98728..ac7fcc0d7 100644
--- a/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py
+++ b/cognee/infrastructure/databases/vector/embeddings/DefaultEmbeddingEngine.py
@@ -11,6 +11,9 @@ from cognee.config import Config
 from cognee.root_dir import get_absolute_path
 from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
 from litellm import aembedding
+import litellm
+
+litellm.set_verbose = True

 config = Config()
 config.load()
@@ -25,25 +28,35 @@ class DefaultEmbeddingEngine(EmbeddingEngine):
         return config.embedding_dimensions


-class OpenAIEmbeddingEngine(EmbeddingEngine):
+class LiteLLMEmbeddingEngine(EmbeddingEngine):
+
     async def embed_text(self, text: List[str]) -> List[float]:
-        response = await aembedding(config.openai_embedding_model, input=text)
+        print("text", text)
+
+        # Embed one string at a time: take the first element when a list is
+        # passed, and fall back to the raw value otherwise.
+        try:
+            text = str(text[0])
+        except (IndexError, TypeError):
+            text = str(text)
+
+        response = await aembedding(config.litellm_embedding_model, input=text)
         # embedding = response.data[0].embedding
         # embeddings_list = list(map(lambda embedding: embedding.tolist(), embedding_model.embed(text)))
+        print("response", type(response.data[0]['embedding']))

         return response.data[0]['embedding']

     def get_vector_size(self) -> int:
-        return config.openai_embedding_dimensions
+        return config.litellm_embedding_dimensions


 if __name__ == "__main__":
     async def gg():
-        openai_embedding_engine = OpenAIEmbeddingEngine()
+        openai_embedding_engine = LiteLLMEmbeddingEngine()
         # print(openai_embedding_engine.embed_text(["Hello, how are you?"]))
         # print(openai_embedding_engine.get_vector_size())
         # default_embedding_engine = DefaultEmbeddingEngine()
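LiteLLMEmbeddingEngine relies on litellm's aembedding, which returns an OpenAI-style response object regardless of the provider behind the model name. A round trip looks roughly like this (a sketch; it assumes the provider credentials, e.g. OPENAI_API_KEY, are set in the environment):

    import asyncio
    from litellm import aembedding

    async def main():
        response = await aembedding("text-embedding-3-large", input="Hello, how are you?")
        vector = response.data[0]["embedding"]
        # text-embedding-3-large produces 3072-dimensional vectors,
        # matching litellm_embedding_dimensions in cognee/config.py.
        print(len(vector))

    asyncio.run(main())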
Creating collection again.") - await vector_client.create_collection(dataset_name) - except: - pass - - async def create_collection_retry(dataset_name, dataset_chunks): - await vector_client.create_data_points( - dataset_name, - [ - DataPoint( - id = chunk["chunk_id"], - payload = dict(text = chunk["text"]), - embed_field = "text" - ) for chunk in dataset_chunks - ], - ) - - try: - await create_collection_retry(dataset_name, dataset_chunks) - except Exception: - logging.error("Collection not found in create data points.") - await create_collection_retry(dataset_name, dataset_chunks) + # # if not await vector_client.collection_exists(dataset_name): + # try: + # logging.error("Collection still not found. Creating collection again.") + # await vector_client.create_collection(dataset_name) + # except: + # pass + # + # async def create_collection_retry(dataset_name, dataset_chunks): + # await vector_client.create_data_points( + # dataset_name, + # [ + # DataPoint( + # id = chunk["chunk_id"], + # payload = dict(text = chunk["text"]), + # embed_field = "text" + # ) for chunk in dataset_chunks + # ], + # ) + # + # try: + # await create_collection_retry(dataset_name, dataset_chunks) + # except Exception: + # logging.error("Collection not found in create data points.") + # await create_collection_retry(dataset_name, dataset_chunks) return identified_chunks diff --git a/cognee/shared/GithubClassification.py b/cognee/shared/GithubClassification.py new file mode 100644 index 000000000..66f14ec91 --- /dev/null +++ b/cognee/shared/GithubClassification.py @@ -0,0 +1,25 @@ +from enum import Enum +from typing import List + +from pydantic import BaseModel + + + +class TextSubclass(str, Enum): + SOURCE_CODE = "Source code in various programming languages" + SHELL_SCRIPTS = "Shell commands and scripts" + MARKUP_LANGUAGES = "Markup languages (HTML, XML)" + STYLESHEETS = "Stylesheets (CSS) and configuration files (YAML, JSON, INI)" + OTHER = "Other that does not fit into any of the above categories" + +class ContentType(BaseModel): + """Base class for content type, storing type of content as string.""" + type: str = "TEXT" + +class TextContent(ContentType): + """Textual content class for more specific text categories.""" + subclass: List[TextSubclass] + +class CodeContentPrediction(BaseModel): + """Model to predict the type of content.""" + label: TextContent \ No newline at end of file