diff --git a/.github/workflows/test_neo4j.yml b/.github/workflows/test_neo4j.yml new file mode 100644 index 000000000..d99f0633f --- /dev/null +++ b/.github/workflows/test_neo4j.yml @@ -0,0 +1,62 @@ +name: test neo4j + +on: + pull_request: + branches: + - main + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + RUNTIME__LOG_LEVEL: ERROR + +jobs: + get_docs_changes: + name: docs changes + uses: ./.github/workflows/get_docs_changes.yml + + run_common: + name: test + needs: get_docs_changes + if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' + runs-on: macos-latest + + defaults: + run: + shell: bash + + steps: + - name: Check out + uses: actions/checkout@v2 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: 3.11 + + - name: Install Poetry + uses: snok/install-poetry@v1.3.2 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Install dependencies + run: poetry install --no-interaction + + - name: Create .cognee_system directory and print path + run: | + mkdir .cognee_system + echo $(pwd)/.cognee_system + + - name: Run default Neo4j + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GRAPH_DATABASE_URL: ${{ secrets.NEO4J_API_URL }} + GRAPH_DATABASE_PASSWORD: ${{ secrets.NEO4J_API_KEY }} + ENV: 'dev' + run: poetry run python ./cognee/tests/test_neo4j.py diff --git a/cognee/api/v1/config/config.py b/cognee/api/v1/config/config.py index 6a1478d40..571aba1b2 100644 --- a/cognee/api/v1/config/config.py +++ b/cognee/api/v1/config/config.py @@ -56,9 +56,9 @@ class config(): cognify_config.cognitive_layer_model = cognitive_layer_model @staticmethod - def set_graph_engine(graph_engine: object): + def set_graph_database_provider(graph_database_provider: str): graph_config = get_graph_config() - graph_config.graph_engine = 
graph_engine + graph_config.graph_database_provider = graph_database_provider @staticmethod def llm_provider(llm_provider: str): diff --git a/cognee/api/v1/search/search.py b/cognee/api/v1/search/search.py index 4273767f3..08f311f62 100644 --- a/cognee/api/v1/search/search.py +++ b/cognee/api/v1/search/search.py @@ -50,7 +50,7 @@ async def search(search_type: str, params: Dict[str, Any]) -> List: async def specific_search(query_params: List[SearchParameters]) -> List: graph_config = get_graph_config() - graph_client = await get_graph_client(graph_config.graph_engine) + graph_client = await get_graph_client(graph_config.graph_database_provider) graph = graph_client.graph search_functions: Dict[SearchType, Callable] = { diff --git a/cognee/infrastructure/databases/graph/config.py b/cognee/infrastructure/databases/graph/config.py index 5d3f0b4c7..9baea6d24 100644 --- a/cognee/infrastructure/databases/graph/config.py +++ b/cognee/infrastructure/databases/graph/config.py @@ -17,7 +17,7 @@ class GraphConfig(BaseSettings): graph_file_path: str = os.path.join( get_relationaldb_config().db_path, graph_filename ) - graph_engine: object = GraphDBType.NETWORKX + # graph_engine: object = GraphDBType.NETWORKX graph_model: object = KnowledgeGraph graph_topology_task: bool = False graph_topology: object = KnowledgeGraph @@ -37,7 +37,7 @@ class GraphConfig(BaseSettings): "graph_database_username": self.graph_database_username, "graph_database_password": self.graph_database_password, "graph_database_port": self.graph_database_port, - "graph_engine": self.graph_engine, + # "graph_engine": self.graph_engine, "infer_graph_topology": self.infer_graph_topology, } diff --git a/cognee/infrastructure/databases/graph/get_graph_client.py b/cognee/infrastructure/databases/graph/get_graph_client.py index 9ea6bb9bd..05be21a70 100644 --- a/cognee/infrastructure/databases/graph/get_graph_client.py +++ b/cognee/infrastructure/databases/graph/get_graph_client.py @@ -10,7 +10,7 @@ async def 
get_graph_client(graph_type: GraphDBType=None, graph_file_name: str = """Factory function to get the appropriate graph client based on the graph type.""" config = get_graph_config() - if config.graph_engine == GraphDBType.NEO4J: + if config.graph_database_provider == "neo4j": try: from .neo4j_driver.adapter import Neo4jAdapter @@ -22,7 +22,7 @@ async def get_graph_client(graph_type: GraphDBType=None, graph_file_name: str = except: pass - elif config.graph_engine == GraphDBType.FALKORDB: + elif config.graph_database_provider == "falkordb": try: from .falkordb.adapter import FalcorDBAdapter diff --git a/cognee/modules/cognify/graph/add_node_connections.py b/cognee/modules/cognify/graph/add_node_connections.py index 96f2dd662..33cd498ad 100644 --- a/cognee/modules/cognify/graph/add_node_connections.py +++ b/cognee/modules/cognify/graph/add_node_connections.py @@ -41,7 +41,7 @@ async def connect_nodes_in_graph(graph, relationship_dict, score_threshold=0.9): graph_config = get_graph_config() # For NetworkX - if graph_config.graph_engine == GraphDBType.NETWORKX: + if graph_config.graph_database_provider == "NETWORKX": searched_node_id_found = await get_node_by_unique_id(graph.graph, relationship['searched_node_id']) original_id_for_search_found = await get_node_by_unique_id(graph.graph, relationship['original_id_for_search']) if searched_node_id_found and original_id_for_search_found: @@ -54,7 +54,7 @@ async def connect_nodes_in_graph(graph, relationship_dict, score_threshold=0.9): ) # For Neo4j - elif graph_config.graph_engine == GraphDBType.NEO4J: + elif graph_config.graph_database_provider == "neo4j": # Neo4j specific logic to add an edge # This is just a placeholder, replace it with actual Neo4j logic print("query is ", f"""MATCH (a), (b) WHERE a.unique_id = '{relationship['searched_node_id']}' AND b.unique_id = '{relationship['original_id_for_search']}' CREATE (a)-[:CONNECTED {{weight:{relationship['score']}}}]->(b)""") diff --git 
a/cognee/modules/cognify/graph/create.py b/cognee/modules/cognify/graph/create.py index 21eb0f010..540ef2f95 100644 --- a/cognee/modules/cognify/graph/create.py +++ b/cognee/modules/cognify/graph/create.py @@ -45,7 +45,7 @@ async def add_node(client, parent_id: Optional[str], node_id: str, node_data: di graph_config = get_graph_config() # Add an edge if a parent ID is provided and the graph engine is NETWORKX - if parent_id and "default_relationship" in node_data and graph_config.graph_engine == GraphDBType.NETWORKX: + if parent_id and "default_relationship" in node_data and graph_config.graph_database_provider == "NETWORKX": try: await client.add_edge(parent_id, node_id, relationship_name = node_data["default_relationship"]["type"], edge_properties = node_data) diff --git a/cognee/modules/search/graph/search_adjacent.py b/cognee/modules/search/graph/search_adjacent.py index fb163d05f..efc50afb2 100644 --- a/cognee/modules/search/graph/search_adjacent.py +++ b/cognee/modules/search/graph/search_adjacent.py @@ -26,7 +26,7 @@ async def search_adjacent(graph: Union[nx.Graph, any], query: str, other_param: graph_config = get_graph_config() - if graph_config.graph_engine == GraphDBType.NETWORKX: + if graph_config.graph_database_provider == "NETWORKX": if node_id not in graph: return {} @@ -34,7 +34,7 @@ async def search_adjacent(graph: Union[nx.Graph, any], query: str, other_param: neighbor_descriptions = {neighbor: graph.nodes[neighbor].get('description') for neighbor in neighbors} return neighbor_descriptions - elif graph_config.graph_engine == GraphDBType.NEO4J: + elif graph_config.graph_database_provider == "neo4j": cypher_query = """ MATCH (node {id: $node_id})-[:CONNECTED_TO]->(neighbor) RETURN neighbor.id AS neighbor_id, neighbor.description AS description diff --git a/cognee/modules/search/graph/search_categories.py b/cognee/modules/search/graph/search_categories.py index bac06cde4..9a07419bc 100644 --- a/cognee/modules/search/graph/search_categories.py +++ 
b/cognee/modules/search/graph/search_categories.py @@ -34,7 +34,7 @@ async def search_categories(query:str, graph: Union[nx.Graph, any], query_label: # Determine which client is in use based on the configuration graph_config = get_graph_config() - if graph_config.graph_engine == GraphDBType.NETWORKX: + if graph_config.graph_database_provider == "NETWORKX": categories_and_ids = [ {"document_id": strip_exact_regex(_, "DATA_SUMMARY__"), "Summary": data["summary"]} @@ -50,7 +50,7 @@ async def search_categories(query:str, graph: Union[nx.Graph, any], query_label: descriptions = {node: graph.nodes[node].get("description", "No desc available") for node in connected_nodes} return descriptions - elif graph_config.graph_engine == GraphDBType.NEO4J: + elif graph_config.graph_database_provider == "neo4j": # Logic for Neo4j cypher_query = """ MATCH (n) diff --git a/cognee/modules/search/graph/search_cypher.py b/cognee/modules/search/graph/search_cypher.py index ed6431a7b..10078db3e 100644 --- a/cognee/modules/search/graph/search_cypher.py +++ b/cognee/modules/search/graph/search_cypher.py @@ -10,7 +10,7 @@ async def search_cypher(query:str, graph: Union[nx.Graph, any]): """ graph_config = get_graph_config() - if graph_config.graph_engine == GraphDBType.NEO4J: + if graph_config.graph_database_provider == "neo4j": result = await graph.run(query) return result diff --git a/cognee/modules/search/graph/search_neighbour.py b/cognee/modules/search/graph/search_neighbour.py index 8b4bd7da4..6f198a6d8 100644 --- a/cognee/modules/search/graph/search_neighbour.py +++ b/cognee/modules/search/graph/search_neighbour.py @@ -25,7 +25,7 @@ async def search_neighbour(graph: Union[nx.Graph, any], query: str, graph_config = get_graph_config() - if graph_config.graph_engine == GraphDBType.NETWORKX: + if graph_config.graph_database_provider == "NETWORKX": relevant_context = [] target_layer_uuid = graph.nodes[node_id].get("layer_uuid") @@ -36,7 +36,7 @@ async def search_neighbour(graph: 
Union[nx.Graph, any], query: str, return relevant_context - elif graph_config.graph_engine == GraphDBType.NEO4J: + elif graph_config.graph_database_provider == "neo4j": from neo4j import AsyncSession if isinstance(graph, AsyncSession): diff --git a/cognee/modules/search/graph/search_summary.py b/cognee/modules/search/graph/search_summary.py index cd732018b..af0d6c24c 100644 --- a/cognee/modules/search/graph/search_summary.py +++ b/cognee/modules/search/graph/search_summary.py @@ -26,7 +26,7 @@ async def search_summary( query: str, graph: Union[nx.Graph, any]) -> Dict[str, """ graph_config = get_graph_config() - if graph_config.graph_engine == GraphDBType.NETWORKX: + if graph_config.graph_database_provider == "NETWORKX": summaries_and_ids = [ {"document_id": strip_exact_regex(_, "DATA_SUMMARY__"), "Summary": data["summary"]} for _, data in graph.nodes(data=True) @@ -43,7 +43,7 @@ async def search_summary( query: str, graph: Union[nx.Graph, any]) -> Dict[str, return descriptions - elif graph_config.graph_engine == GraphDBType.NEO4J: + elif graph_config.graph_database_provider == "neo4j": cypher_query = f""" MATCH (n) WHERE n.id CONTAINS $query AND EXISTS(n.summary) diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py new file mode 100644 index 000000000..985e4a7cf --- /dev/null +++ b/cognee/tests/test_neo4j.py @@ -0,0 +1,76 @@ + +import logging +import os + +logging.basicConfig(level=logging.DEBUG) + +async def main(): + from os import path + import pathlib + import cognee + logging.basicConfig(level=logging.DEBUG) + + # print("Working dir: ", str(pathlib.Path(__file__).parent)) + # data_directory_path = str(pathlib.Path(path.join(pathlib.Path(__file__).parent, "../../.data")).resolve()) + # print("Data dir: ", data_directory_path) + # cognee.config.data_root_directory(data_directory_path) + # + # cognee_directory_path = str(pathlib.Path(path.join(pathlib.Path(__file__).parent, "../../.cognee_system")).resolve()) + # print("System dir: ", 
cognee_directory_path) + # cognee.config.system_root_directory(cognee_directory_path) + + cognee.config.set_graph_database_provider("neo4j") + + logging.debug("TEST FILE DIR: %s", pathlib.Path(__file__).parent) + logging.debug("CURRENT CWD: %s", os.getcwd()) + + dataset_name = "cs_explanations" + + explanation_file_path = os.path.join(pathlib.Path(__file__).parent, "test_data/Natural_language_processing.txt") + await cognee.add([explanation_file_path], dataset_name) + + # dataset_name = "short_stories" + # # data_directory_path is defined above + # await cognee.add("data://" + "/Users/runner/work/cognee/cognee/./cognee/tests", dataset_name) + + text_1 = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena. + At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the preparation and manipulation of quantum states. + Classical physics cannot explain the operation of these quantum devices, and a scalable quantum computer could perform some calculations exponentially faster (with respect to input size scaling) than any modern "classical" computer. In particular, a large-scale quantum computer could break widely used encryption schemes and aid physicists in performing physical simulations; however, the current state of the technology is largely experimental and impractical, with several obstacles to useful applications. Moreover, scalable quantum computers do not hold promise for many practical tasks, and for many important tasks quantum speedups are proven impossible. + The basic unit of information in quantum computing is the qubit, similar to the bit in traditional digital electronics. Unlike a classical bit, a qubit can exist in a superposition of its two "basis" states. 
When measuring a qubit, the result is a probabilistic output of a classical bit, therefore making quantum computers nondeterministic in general. If a quantum computer manipulates the qubit in a particular way, wave interference effects can amplify the desired measurement results. The design of quantum algorithms involves creating procedures that allow a quantum computer to perform calculations efficiently and quickly. + Physically engineering high-quality qubits has proven challenging. If a physical qubit is not sufficiently isolated from its environment, it suffers from quantum decoherence, introducing noise into calculations. Paradoxically, perfectly isolating qubits is also undesirable because quantum computations typically need to initialize qubits, perform controlled qubit interactions, and measure the resulting quantum states. Each of those operations introduces errors and suffers from noise, and such inaccuracies accumulate. + In principle, a non-quantum (classical) computer can solve the same computational problems as a quantum computer, given enough time. Quantum advantage comes in the form of time complexity rather than computability, and quantum complexity theory shows that some quantum algorithms for carefully selected tasks require exponentially fewer computational steps than the best known non-quantum algorithms. Such tasks can in theory be solved on a large-scale quantum computer whereas classical computers would not finish computations in any reasonable amount of time. However, quantum speedup is not universal or even typical across computational tasks, since basic tasks such as sorting are proven to not allow any asymptotic quantum speedup. Claims of quantum supremacy have drawn significant attention to the discipline, but are demonstrated on contrived tasks, while near-term practical use cases remain limited. 
+ """ + + text_2 = """A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and other natural language processing tasks such as classification. LLMs acquire these abilities by learning statistical relationships from text documents during a computationally intensive self-supervised and semi-supervised training process. LLMs can be used for text generation, a form of generative AI, by taking an input text and repeatedly predicting the next token or word. + LLMs are artificial neural networks. The largest and most capable, as of March 2024, are built with a decoder-only transformer-based architecture while some recent implementations are based on other architectures, such as recurrent neural network variants and Mamba (a state space model). + Up to 2020, fine tuning was the only way a model could be adapted to be able to accomplish specific tasks. Larger sized models, such as GPT-3, however, can be prompt-engineered to achieve similar results.[6] They are thought to acquire knowledge about syntax, semantics and "ontology" inherent in human language corpora, but also inaccuracies and biases present in the corpora. + Some notable LLMs are OpenAI's GPT series of models (e.g., GPT-3.5 and GPT-4, used in ChatGPT and Microsoft Copilot), Google's PaLM and Gemini (the latter of which is currently used in the chatbot of the same name), xAI's Grok, Meta's LLaMA family of open-source models, Anthropic's Claude models, Mistral AI's open source models, and Databricks' open source DBRX. + """ + + await cognee.cognify(["cs_explanations"]) + + search_results = await cognee.search("SIMILARITY", {"query": "computer science"}) + assert len(search_results) != 0, "The search results list is empty." + print("The search results list is not empty.") + + search_results = await cognee.search("CATEGORIES", {"query": "DefaultGraphModel__default_user"}) + assert len(search_results) != 0, "The search results list is empty." 
+ print("The search results list is not empty.") + + search_results = await cognee.search("NEIGHBOR", {"query": "DefaultGraphModel__default_user"}) + assert len(search_results) != 0, "The search results list is empty." + print("The search results list is not empty.") + + search_results = await cognee.search("SUMMARY", {"query": "Work and computers"}) + assert len(search_results) != 0, "The search results list is empty." + print("The search results list is not empty.") + + search_results = await cognee.search("ADJACENT", {"query": "DefaultGraphModel__default_user"}) + assert len(search_results) != 0, "The search results list is empty." + print("The search results list is not empty.") + + + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/docs/api_reference.md b/docs/api_reference.md index c3bf9644d..c87028c52 100644 --- a/docs/api_reference.md +++ b/docs/api_reference.md @@ -73,10 +73,10 @@ import cognee cognee.config.set_llm_model("openai") ``` -### set_graph_engine(graph_engine: object) +### set_graph_database_provider(graph_database_provider: str) Sets the engine to manage graph processing tasks. Parameters: -graph_engine (object): The engine for graph tasks. +graph_database_provider (str): The name of the graph database provider to use (e.g. "neo4j"). Example: ```python from cognee.shared.data_models import GraphDBType