From e1a9a236a500ab3ccca3cc4fd4ac1911fb0282c1 Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Thu, 9 May 2024 23:11:33 +0200
Subject: [PATCH] fix search, add improvements

---
Notes (text between the "---" line and the first "diff --git" is not part
of the commit):

What the patch does
- cognify() now calls initialize_graph(USER_ID, graph_client=graph_client).
  graph_topology is read from infrastructure_config but is not passed on,
  so initialize_graph() takes its "graphdatamodel is falsy" branch.
- Adds a "graph_topology" setting to Config, InfrastructureConfig
  (get_config / set_config) and the public config.set_graph_topology().
- Adds ChunkStrategy.CODE, LangchainChunkEngine (the langchain_text_splitters
  import is local to chunk_data_by_code) and the GithubTopology models.
- get_file_metadata() falls back to a placeholder keyword list when
  extract_keywords() raises.

Review fixes applied to added lines (hunk line counts are unchanged)
- GithubTopology.py: Directory.update_forward_refs() moved out of the
  Directory class body to module level; inside the body the name
  "Directory" is not bound yet and the import fails with NameError.
- get_file_metadata.py: "except:" narrowed to "except Exception:" so that
  KeyboardInterrupt / SystemExit are not swallowed.
- cognee/config.py: graph_topology is annotated as "type" (its default is
  the DefaultGraphModel class, not a str).

Caveats
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; verify context lines against the tree before applying.
- Blob ids on the "index" lines were not regenerated for the files touched
  by the review fixes.

 cognee/api/v1/cognify/cognify.py              | 20 +++++---
 cognee/api/v1/config/config.py                |  6 +++
 cognee/config.py                              |  3 +-
 cognee/infrastructure/InfrastructureConfig.py |  8 +++
 .../data/chunking/HaystackChunkEngine.py      |  0
 .../data/chunking/LangchainChunkingEngine.py  | 50 +++++++++++++++++++
 .../files/utils/get_file_metadata.py          |  9 +++-
 cognee/modules/cognify/graph/create.py        |  2 +
 .../modules/cognify/graph/initialize_graph.py |  4 +-
 cognee/shared/GithubTopology.py               | 36 +++++++++++++
 cognee/shared/data_models.py                  |  1 +
 cognee/utils.py                               |  4 +-
 12 files changed, 132 insertions(+), 11 deletions(-)
 create mode 100644 cognee/infrastructure/data/chunking/HaystackChunkEngine.py
 create mode 100644 cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
 create mode 100644 cognee/shared/GithubTopology.py

diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py
index 1f084a75e..bc2a7188d 100644
--- a/cognee/api/v1/cognify/cognify.py
+++ b/cognee/api/v1/cognify/cognify.py
@@ -14,6 +14,7 @@ from cognee.modules.cognify.graph.add_cognitive_layer_graphs import add_cognitiv
 from cognee.modules.cognify.graph.add_summary_nodes import add_summary_nodes
 from cognee.modules.cognify.graph.add_node_connections import group_nodes_by_layer, \
     graph_ready_output, connect_nodes_in_graph
+from cognee.modules.cognify.graph.initialize_graph import initialize_graph
 from cognee.modules.cognify.llm.resolve_cross_graph_references import resolve_cross_graph_references
 from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client
 from cognee.modules.cognify.graph.add_label_nodes import add_label_nodes
@@ -73,7 +74,14 @@ async def cognify(datasets: Union[str, List[str]] = None):
             if dataset_name in added_dataset:
                 dataset_files.append((added_dataset, db_engine.get_files_metadata(added_dataset)))
 
-    # await initialize_graph(USER_ID, graph_data_model, graph_client)
+
+
+
+    graph_topology = infrastructure_config.get_config()["graph_topology"]
+
+
+
+    await initialize_graph(USER_ID, graph_client=graph_client)
 
     data_chunks = {}
 
@@ -174,11 +182,11 @@ if __name__ == "__main__":
 
     async def test():
         #
-        # from cognee.api.v1.add import add
-        #
-        # await add(["A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and other natural language processing tasks such as classification"], "code")
-        #
-        # graph = await cognify()
+        from cognee.api.v1.add import add
+
+        await add(["A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and other natural language processing tasks such as classification"], "code")
+
+        graph = await cognify()
 
         from cognee.utils import render_graph
 
diff --git a/cognee/api/v1/config/config.py b/cognee/api/v1/config/config.py
index 1b8afd95d..832d75705 100644
--- a/cognee/api/v1/config/config.py
+++ b/cognee/api/v1/config/config.py
@@ -72,3 +72,9 @@ class config():
         infrastructure_config.set_config({
             "chunk_strategy": chunk_strategy
         })
+
+    @staticmethod
+    def set_graph_topology(graph_topology: object):
+        infrastructure_config.set_config({
+            "graph_topology": graph_topology
+        })
diff --git a/cognee/config.py b/cognee/config.py
index aa4a3e882..91101334b 100644
--- a/cognee/config.py
+++ b/cognee/config.py
@@ -8,7 +8,7 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from dotenv import load_dotenv
 from cognee.root_dir import get_absolute_path
-from cognee.shared.data_models import ChunkStrategy
+from cognee.shared.data_models import ChunkStrategy, DefaultGraphModel
 
 base_dir = Path(__file__).resolve().parent.parent
 # Load the .env file from the base directory
@@ -74,6 +74,7 @@ class Config:
 
     # Database parameters
     graph_database_provider: str = os.getenv("GRAPH_DB_PROVIDER", "NETWORKX")
+    graph_topology: type = DefaultGraphModel
 
     if (
         os.getenv("ENV") == "prod"
diff --git a/cognee/infrastructure/InfrastructureConfig.py b/cognee/infrastructure/InfrastructureConfig.py
index 12a21b188..042addd0f 100644
--- a/cognee/infrastructure/InfrastructureConfig.py
+++ b/cognee/infrastructure/InfrastructureConfig.py
@@ -33,6 +33,7 @@ class InfrastructureConfig():
     database_file_path: str = None
     chunk_strategy = config.chunk_strategy
     chunk_engine = None
+    graph_topology = config.graph_topology
 
     def get_config(self, config_entity: str = None) -> dict:
         if (config_entity is None or config_entity == "database_engine") and self.database_engine is None:
@@ -78,6 +79,9 @@ class InfrastructureConfig():
         if self.chunk_engine is None:
             self.chunk_engine = DefaultChunkEngine()
 
+        if self.graph_topology is None:
+            self.graph_topology = config.graph_topology
+
         if (config_entity is None or config_entity == "llm_engine") and self.llm_engine is None:
             self.llm_engine = OpenAIAdapter(config.openai_key, config.openai_model)
 
@@ -129,6 +133,7 @@ class InfrastructureConfig():
             "database_path": self.database_file_path,
             "chunk_strategy": self.chunk_strategy,
             "chunk_engine": self.chunk_engine,
+            "graph_topology": self.graph_topology
         }
 
     def set_config(self, new_config: dict):
@@ -183,4 +188,7 @@ class InfrastructureConfig():
         if "chunk_engine" in new_config:
             self.chunk_engine = new_config["chunk_engine"]
 
+        if "graph_topology" in new_config:
+            self.graph_topology = new_config["graph_topology"]
+
 infrastructure_config = InfrastructureConfig()
diff --git a/cognee/infrastructure/data/chunking/HaystackChunkEngine.py b/cognee/infrastructure/data/chunking/HaystackChunkEngine.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
new file mode 100644
index 000000000..f15d66f7e
--- /dev/null
+++ b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+import re
+
+from cognee.infrastructure.data.chunking.DefaultChunkEngine import DefaultChunkEngine
+from cognee.shared.data_models import ChunkStrategy
+
+
+
+class LangchainChunkEngine():
+    @staticmethod
+    def chunk_data(
+        chunk_strategy = None,
+        source_data = None,
+        chunk_size = None,
+        chunk_overlap = None,
+    ):
+        """
+        Chunk data based on the specified strategy.
+
+        Parameters:
+        - chunk_strategy: The strategy to use for chunking.
+        - source_data: The data to be chunked.
+        - chunk_size: The size of each chunk.
+        - chunk_overlap: The overlap between chunks.
+
+        Returns:
+        - The chunked data.
+        """
+
+        if chunk_strategy == ChunkStrategy.CODE:
+            chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap)
+        else:
+            chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap)
+        return chunked_data
+
+    @staticmethod
+    def chunk_data_by_code(data_chunks, chunk_size, chunk_overlap, language=None):
+        from langchain_text_splitters import (
+            Language,
+            RecursiveCharacterTextSplitter,
+        )
+        if language is None:
+            language = Language.PYTHON
+        python_splitter = RecursiveCharacterTextSplitter.from_language(
+            language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
+        code_chunks = python_splitter.create_documents([data_chunks])
+
+        return code_chunks
+
diff --git a/cognee/infrastructure/files/utils/get_file_metadata.py b/cognee/infrastructure/files/utils/get_file_metadata.py
index 2b9d6ed0a..44c53bb11 100644
--- a/cognee/infrastructure/files/utils/get_file_metadata.py
+++ b/cognee/infrastructure/files/utils/get_file_metadata.py
@@ -16,7 +16,14 @@ def get_file_metadata(file: BinaryIO) -> FileMetadata:
 
     file.seek(0)
     file_text = extract_text_from_file(file, file_type)
-    keywords = extract_keywords(file_text)
+
+    import uuid
+
+    try:
+        keywords = extract_keywords(file_text)
+    except Exception:
+        keywords = ["no keywords detected" + str(uuid.uuid4())]
+
     file_path = file.name
     file_name = file_path.split("/")[-1].split(".")[0]
 
diff --git a/cognee/modules/cognify/graph/create.py b/cognee/modules/cognify/graph/create.py
index 020af9f7f..e6bbd8868 100644
--- a/cognee/modules/cognify/graph/create.py
+++ b/cognee/modules/cognify/graph/create.py
@@ -44,6 +44,7 @@ async def add_node(client, parent_id: Optional[str], node_id: str, node_data: di
 
         # Add an edge if a parent ID is provided and the graph engine is NETWORKX
         if parent_id and "default_relationship" in node_data and infrastructure_config.get_config()["graph_engine"] == GraphDBType.NETWORKX:
+            print("Node id", node_id)
             await client.add_edge(parent_id, node_id, relationship_name = node_data["default_relationship"]["type"], edge_properties = node_data)
     except Exception as e:
         # Log the exception; consider a logging framework for production use
@@ -103,6 +104,7 @@ async def add_node(client, parent_id: Optional[str], node_id: str, node_data: di
 
 
 async def add_edge(client, parent_id: Optional[str], node_id: str, node_data: dict, created_node_ids):
+    print('NODE ID', node_data)
 
     if node_id == "Relationship_default" and parent_id:
         # Initialize source and target variables outside the loop
diff --git a/cognee/modules/cognify/graph/initialize_graph.py b/cognee/modules/cognify/graph/initialize_graph.py
index d6c13512c..dcb3517d8 100644
--- a/cognee/modules/cognify/graph/initialize_graph.py
+++ b/cognee/modules/cognify/graph/initialize_graph.py
@@ -2,9 +2,9 @@ from datetime import datetime
 from cognee.shared.data_models import DefaultGraphModel, Relationship, UserProperties, UserLocation
 from cognee.modules.cognify.graph.create import create_semantic_graph
 
-async def initialize_graph(root_id: str, graphdatamodel, graph_client):
+async def initialize_graph(root_id: str, graphdatamodel=None, graph_client=None):
     if graphdatamodel:
-        graph = graphdatamodel(id = root_id)
+        graph = graphdatamodel(node_id= root_id)
         graph_ = await create_semantic_graph(graph, graph_client)
         return graph_
     else:
diff --git a/cognee/shared/GithubTopology.py b/cognee/shared/GithubTopology.py
new file mode 100644
index 000000000..f0b9e3c2b
--- /dev/null
+++ b/cognee/shared/GithubTopology.py
@@ -0,0 +1,36 @@
+
+
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any, Union
+
+class Relationship(BaseModel):
+    type: str
+    attributes: Optional[Dict[str, Any]] = {}
+
+class Document(BaseModel):
+    name: str
+    content: str
+    filetype: str
+
+class Directory(BaseModel):
+    name: str
+    documents: List[Document] = []
+    directories: List['Directory'] = []
+
+# Allows recursive Directory Model
+Directory.update_forward_refs()
+
+class RepositoryProperties(BaseModel):
+    custom_properties: Optional[Dict[str, Any]] = None
+    location: Optional[str] = None  # Simplified location reference
+
+class RepositoryNode(BaseModel):
+    node_id: str
+    node_type: str  # 'document' or 'directory'
+    properties: RepositoryProperties = RepositoryProperties()
+    content: Union[Document, Directory, None] = None
+    relationships: List[Relationship] = []
+
+class RepositoryGraphModel(BaseModel):
+    root: RepositoryNode
+    default_relationships: List[Relationship] = []
diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py
index 7eea17cdc..cbaabfe44 100644
--- a/cognee/shared/data_models.py
+++ b/cognee/shared/data_models.py
@@ -34,6 +34,7 @@ class ChunkStrategy(Enum):
     EXACT = "exact"
     PARAGRAPH = "paragraph"
     SENTENCE = "sentence"
+    CODE = "code"
 
 class MemorySummary(BaseModel):
     """ Memory summary. """
diff --git a/cognee/utils.py b/cognee/utils.py
index 98d7e68e0..19d52740e 100644
--- a/cognee/utils.py
+++ b/cognee/utils.py
@@ -1,5 +1,5 @@
 """ This module contains utility functions for the cognee. """
-
+import logging
 import os
 import uuid
 import datetime
@@ -20,6 +20,8 @@ config.load()
 
 def send_telemetry(event_name: str):
     if os.getenv("TELEMETRY_DISABLED"):
+        print("Telemetry is disabled.")
+        logging.info("Telemetry is disabled.")
         return
 
     env = os.getenv("ENV")