From c4850f64dc7c72c0d33ddd6fcb0b44e9a9d35ab0 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 19 Nov 2024 11:14:42 +0100 Subject: [PATCH 01/23] feat: Implements pipeline structure for retrievers --- cognee/pipelines/__init__.py | 0 cognee/pipelines/retriever/__init__.py | 0 .../retriever/diffusion_retriever.py | 25 ++++++++++++++++++ cognee/pipelines/retriever/g_retriever.py | 25 ++++++++++++++++++ .../retriever/two_steps_retriever.py | 26 +++++++++++++++++++ examples/python/dynamic_steps_example.py | 17 ++++++------ 6 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 cognee/pipelines/__init__.py create mode 100644 cognee/pipelines/retriever/__init__.py create mode 100644 cognee/pipelines/retriever/diffusion_retriever.py create mode 100644 cognee/pipelines/retriever/g_retriever.py create mode 100644 cognee/pipelines/retriever/two_steps_retriever.py diff --git a/cognee/pipelines/__init__.py b/cognee/pipelines/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cognee/pipelines/retriever/__init__.py b/cognee/pipelines/retriever/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cognee/pipelines/retriever/diffusion_retriever.py b/cognee/pipelines/retriever/diffusion_retriever.py new file mode 100644 index 000000000..a6b79310e --- /dev/null +++ b/cognee/pipelines/retriever/diffusion_retriever.py @@ -0,0 +1,25 @@ +from uuid import UUID +from enum import Enum +from typing import Callable, Dict +from cognee.shared.utils import send_telemetry +from cognee.modules.users.models import User +from cognee.modules.users.methods import get_default_user +from cognee.modules.users.permissions.methods import get_document_ids_for_user + +async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: + if user is None: + user = await get_default_user() + + if user is None: + raise PermissionError("No user found in the system. 
Please create a user.") + + own_document_ids = await get_document_ids_for_user(user.id) + retrieved_results = await diffusion_retriever(query, user) + + filtered_search_results = [] + + + return retrieved_results + +async def diffusion_retriever(query: str, user, community_filter = []) -> list: + raise(NotImplementedError) \ No newline at end of file diff --git a/cognee/pipelines/retriever/g_retriever.py b/cognee/pipelines/retriever/g_retriever.py new file mode 100644 index 000000000..4b319acd9 --- /dev/null +++ b/cognee/pipelines/retriever/g_retriever.py @@ -0,0 +1,25 @@ +from uuid import UUID +from enum import Enum +from typing import Callable, Dict +from cognee.shared.utils import send_telemetry +from cognee.modules.users.models import User +from cognee.modules.users.methods import get_default_user +from cognee.modules.users.permissions.methods import get_document_ids_for_user + +async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: + if user is None: + user = await get_default_user() + + if user is None: + raise PermissionError("No user found in the system. 
Please create a user.") + + own_document_ids = await get_document_ids_for_user(user.id) + retrieved_results = await g_retriever(query, user) + + filtered_search_results = [] + + + return retrieved_results + +async def g_retriever(query: str, user, community_filter = []) -> list: + raise(NotImplementedError) \ No newline at end of file diff --git a/cognee/pipelines/retriever/two_steps_retriever.py b/cognee/pipelines/retriever/two_steps_retriever.py new file mode 100644 index 000000000..cb0d80133 --- /dev/null +++ b/cognee/pipelines/retriever/two_steps_retriever.py @@ -0,0 +1,26 @@ +from uuid import UUID +from enum import Enum +from typing import Callable, Dict +from cognee.shared.utils import send_telemetry +from cognee.modules.users.models import User +from cognee.modules.users.methods import get_default_user +from cognee.modules.users.permissions.methods import get_document_ids_for_user + +async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: + if user is None: + user = await get_default_user() + + if user is None: + raise PermissionError("No user found in the system. 
Please create a user.") + + own_document_ids = await get_document_ids_for_user(user.id) + retrieved_results = await run_two_step_retriever(query, user) + + filtered_search_results = [] + + + return retrieved_results + + +async def run_two_step_retriever(query: str, user, community_filter = []) -> list: + raise(NotImplementedError) \ No newline at end of file diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 309aea82c..11c2f1110 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -1,6 +1,6 @@ import cognee import asyncio -from cognee.api.v1.search import SearchType +from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever job_position = """0:Senior Data Scientist (Machine Learning) @@ -206,9 +206,8 @@ async def main(enable_steps): print("Knowledge graph created.") # Step 4: Query insights - if enable_steps.get("search_insights"): - search_results = await cognee.search( - SearchType.INSIGHTS, + if enable_steps.get("retriever"): + search_results = await two_step_retriever( {'query': 'Which applicant has the most relevant experience in data science?'} ) print("Search results:") @@ -219,11 +218,11 @@ async def main(enable_steps): if __name__ == '__main__': # Flags to enable/disable steps steps_to_enable = { - "prune_data": True, - "prune_system": True, - "add_text": True, - "cognify": True, - "search_insights": True + "prune_data": False, + "prune_system": False, + "add_text": False, + "cognify": False, + "retriever": True } asyncio.run(main(steps_to_enable)) From f2c0fddeb2b39aa521724133317046365735ff96 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:29:52 +0100 Subject: [PATCH 02/23] feat: Adds graph-data-science to neo4j docker image --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 
426b178a7..1e13f1924 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -46,7 +46,7 @@ services: - 7687:7687 environment: - NEO4J_AUTH=neo4j/pleaseletmein - - NEO4J_PLUGINS=["apoc"] + - NEO4J_PLUGINS=["apoc", "graph-data-science"] networks: - cognee-network From 44ac9b68b41f9889cf904e5f7d3c2148c76ca901 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:39:45 +0100 Subject: [PATCH 03/23] feat: adds get_distances from collection method to LanceDB and PgVector --- .../vector/lancedb/LanceDBAdapter.py | 51 ++++++++++------ .../vector/pgvector/PGVectorAdapter.py | 61 +++++++++++++++++++ .../infrastructure/databases/vector/utils.py | 26 ++++++++ 3 files changed, 118 insertions(+), 20 deletions(-) create mode 100644 cognee/infrastructure/databases/vector/utils.py diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index 96f026b4f..6cbe45655 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -10,6 +10,7 @@ from cognee.infrastructure.files.storage import LocalStorage from cognee.modules.storage.utils import copy_model, get_own_properties from ..models.ScoredResult import ScoredResult from ..vector_db_interface import VectorDBInterface +from ..utils import normalize_distances from ..embeddings.EmbeddingEngine import EmbeddingEngine class IndexSchema(DataPoint): @@ -141,6 +142,34 @@ class LanceDBAdapter(VectorDBInterface): score = 0, ) for result in results.to_dict("index").values()] + async def get_distances_of_collection( + self, + collection_name: str, + query_text: str = None, + query_vector: List[float] = None, + with_vector: bool = False + ): + if query_text is None and query_vector is None: + raise ValueError("One of query_text or query_vector must be provided!") + + if query_text and not query_vector: + 
query_vector = (await self.embedding_engine.embed_text([query_text]))[0] + + connection = await self.get_connection() + collection = await connection.open_table(collection_name) + + results = await collection.vector_search(query_vector).to_pandas() + + result_values = list(results.to_dict("index").values()) + + normalized_values = normalize_distances(result_values) + + return [ScoredResult( + id=UUID(result["id"]), + payload=result["payload"], + score=normalized_values[value_index], + ) for value_index, result in enumerate(result_values)] + async def search( self, collection_name: str, @@ -148,6 +177,7 @@ class LanceDBAdapter(VectorDBInterface): query_vector: List[float] = None, limit: int = 5, with_vector: bool = False, + normalized: bool = True ): if query_text is None and query_vector is None: raise ValueError("One of query_text or query_vector must be provided!") @@ -162,26 +192,7 @@ class LanceDBAdapter(VectorDBInterface): result_values = list(results.to_dict("index").values()) - min_value = 100 - max_value = 0 - - for result in result_values: - value = float(result["_distance"]) - if value > max_value: - max_value = value - if value < min_value: - min_value = value - - normalized_values = [] - min_value = min(result["_distance"] for result in result_values) - max_value = max(result["_distance"] for result in result_values) - - if max_value == min_value: - # Avoid division by zero: Assign all normalized values to 0 (or any constant value like 1) - normalized_values = [0 for _ in result_values] - else: - normalized_values = [(result["_distance"] - min_value) / (max_value - min_value) for result in - result_values] + normalized_values = normalize_distances(result_values) return [ScoredResult( id = UUID(result["id"]), diff --git a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index 01691714b..97571a274 100644 --- 
a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -11,6 +11,7 @@ from cognee.infrastructure.engine import DataPoint from .serialize_data import serialize_data from ..models.ScoredResult import ScoredResult from ..vector_db_interface import VectorDBInterface +from ..utils import normalize_distances from ..embeddings.EmbeddingEngine import EmbeddingEngine from ...relational.sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter from ...relational.ModelBase import Base @@ -22,6 +23,19 @@ class IndexSchema(DataPoint): "index_fields": ["text"] } +def singleton(class_): + # Note: Using this singleton as a decorator to a class removes + # the option to use class methods for that class + instances = {} + + def getinstance(*args, **kwargs): + if class_ not in instances: + instances[class_] = class_(*args, **kwargs) + return instances[class_] + + return getinstance + +@singleton class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface): def __init__( @@ -162,6 +176,53 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface): ) for result in results ] + async def get_distances_of_collection( + self, + collection_name: str, + query_text: str = None, + query_vector: List[float] = None, + with_vector: bool = False + )-> List[ScoredResult]: + if query_text is None and query_vector is None: + raise ValueError("One of query_text or query_vector must be provided!") + + if query_text and not query_vector: + query_vector = (await self.embedding_engine.embed_text([query_text]))[0] + + # Get PGVectorDataPoint Table from database + PGVectorDataPoint = await self.get_table(collection_name) + + closest_items = [] + + # Use async session to connect to the database + async with self.get_async_session() as session: + # Find closest vectors to query_vector + closest_items = await session.execute( + select( + PGVectorDataPoint, + PGVectorDataPoint.c.vector.cosine_distance(query_vector).label( + 
"similarity" + ), + ) + .order_by("similarity") + ) + + vector_list = [] + + # Extract distances and find min/max for normalization + for vector in closest_items: + # TODO: Add normalization of similarity score + vector_list.append(vector) + + # Create and return ScoredResult objects + return [ + ScoredResult( + id = UUID(str(row.id)), + payload = row.payload, + score = row.similarity + ) for row in vector_list + ] + async def search( self, collection_name: str, diff --git a/cognee/infrastructure/databases/vector/utils.py b/cognee/infrastructure/databases/vector/utils.py new file mode 100644 index 000000000..ced161ea3 --- /dev/null +++ b/cognee/infrastructure/databases/vector/utils.py @@ -0,0 +1,26 @@ +from typing import List + + +def normalize_distances(result_values: List[dict]) -> List[float]: + min_value = 100 + max_value = 0 + + for result in result_values: + value = float(result["_distance"]) + if value > max_value: + max_value = value + if value < min_value: + min_value = value + + normalized_values = [] + min_value = min(result["_distance"] for result in result_values) + max_value = max(result["_distance"] for result in result_values) + + if max_value == min_value: + # Avoid division by zero: Assign all normalized values to 0 (or any constant value like 1) + normalized_values = [0 for _ in result_values] + else: + normalized_values = [(result["_distance"] - min_value) / (max_value - min_value) for result in + result_values] + + return normalized_values \ No newline at end of file From d9eec77f18932fd0a2b13795aa2bc2e48b7ea13e Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:40:27 +0100 Subject: [PATCH 04/23] feat: Implements first step of the two step retrieval --- .../retriever/two_steps_retriever.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cognee/pipelines/retriever/two_steps_retriever.py b/cognee/pipelines/retriever/two_steps_retriever.py index cb0d80133..7a630fab3 
100644 --- a/cognee/pipelines/retriever/two_steps_retriever.py +++ b/cognee/pipelines/retriever/two_steps_retriever.py @@ -1,3 +1,4 @@ +import asyncio from uuid import UUID from enum import Enum from typing import Callable, Dict @@ -5,6 +6,9 @@ from cognee.shared.utils import send_telemetry from cognee.modules.users.models import User from cognee.modules.users.methods import get_default_user from cognee.modules.users.permissions.methods import get_document_ids_for_user +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.infrastructure.databases.graph import get_graph_engine + async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: if user is None: @@ -23,4 +27,32 @@ async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: async def run_two_step_retriever(query: str, user, community_filter = []) -> list: + vector_engine = get_vector_engine() + graph_engine = await get_graph_engine() + + collections = ["Entity_name", "TextSummary_text", 'EntityType_name', 'DocumentChunk_text'] + results = await asyncio.gather( + *[vector_engine.get_distances_of_collection(collection, query_text=query) for collection in collections] + ) + + ############################################# This part is a quick fix til we don't fix the vector db inconsistency + results_dict = {} + for collection, results in zip(collections, results): + seen_ids = set() + unique_results = [] + for result in results: + if result.id not in seen_ids: + unique_results.append(result) + seen_ids.add(result.id) + else: + print(f"Duplicate found in collection '{collection}': {result.id}") + results_dict[collection] = unique_results + # :TODO: Due to duplicates and inconsistent vector db state now am collecting + # :TODO: the first appearance of the object but this code should be the solution once the db is fixed. 
+ # results_dict = {collection: result for collection, result in zip(collections, results)} + ############################################## + + print() + + raise(NotImplementedError) \ No newline at end of file From 0101d43b8de29179b5b92a6bfa319af8a144b645 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 20 Nov 2024 15:13:38 +0100 Subject: [PATCH 05/23] feat: Adds graph node filtering by feature --- .../databases/graph/neo4j_driver/adapter.py | 48 ++++++++++++++++++- .../databases/graph/networkx/adapter.py | 38 ++++++++++++++- .../modules/graph/cognee_graph/CogneeGraph.py | 8 +++- 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py index 1121a24d5..e6520e4e2 100644 --- a/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +++ b/cognee/infrastructure/databases/graph/neo4j_driver/adapter.py @@ -2,7 +2,7 @@ import logging import asyncio from textwrap import dedent -from typing import Optional, Any, List, Dict +from typing import Optional, Any, List, Dict, Union from contextlib import asynccontextmanager from uuid import UUID from neo4j import AsyncSession @@ -432,3 +432,49 @@ class Neo4jAdapter(GraphDBInterface): ) for record in result] return (nodes, edges) + + async def get_filtered_graph_data(self, attribute_filters): + """ + Fetches nodes and relationships filtered by specified attribute values. + + Args: + attribute_filters (list of dict): A list of dictionaries where keys are attributes and values are lists of values to filter on. + Example: [{"community": ["1", "2"]}] + + Returns: + tuple: A tuple containing two lists: nodes and edges. 
+ """ + where_clauses = [] + for attribute, values in attribute_filters[0].items(): + values_str = ", ".join(f"'{value}'" if isinstance(value, str) else str(value) for value in values) + where_clauses.append(f"n.{attribute} IN [{values_str}]") + + where_clause = " AND ".join(where_clauses) + + query_nodes = f""" + MATCH (n) + WHERE {where_clause} + RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties + """ + result_nodes = await self.query(query_nodes) + + nodes = [( + record["id"], + record["properties"], + ) for record in result_nodes] + + query_edges = f""" + MATCH (n)-[r]->(m) + WHERE {where_clause} AND {where_clause.replace('n.', 'm.')} + RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties + """ + result_edges = await self.query(query_edges) + + edges = [( + record["source"], + record["target"], + record["type"], + record["properties"], + ) for record in result_edges] + + return (nodes, edges) \ No newline at end of file diff --git a/cognee/infrastructure/databases/graph/networkx/adapter.py b/cognee/infrastructure/databases/graph/networkx/adapter.py index a72376082..d249b6336 100644 --- a/cognee/infrastructure/databases/graph/networkx/adapter.py +++ b/cognee/infrastructure/databases/graph/networkx/adapter.py @@ -6,7 +6,7 @@ import json import asyncio import logging from re import A -from typing import Dict, Any, List +from typing import Dict, Any, List, Union from uuid import UUID import aiofiles import aiofiles.os as aiofiles_os @@ -301,3 +301,39 @@ class NetworkXAdapter(GraphDBInterface): logger.info("Graph deleted successfully.") except Exception as error: logger.error("Failed to delete graph: %s", error) + + async def get_filtered_graph_data(self, attribute_filters: List[Dict[str, List[Union[str, int]]]]): + """ + Fetches nodes and relationships filtered by specified attribute values. 
+ + Args: + attribute_filters (list of dict): A list of dictionaries where keys are attributes and values are lists of values to filter on. + Example: [{"community": ["1", "2"]}] + + Returns: + tuple: A tuple containing two lists: + - Nodes: List of tuples (node_id, node_properties). + - Edges: List of tuples (source_id, target_id, relationship_type, edge_properties). + """ + # Create filters for nodes based on the attribute filters + where_clauses = [] + for attribute, values in attribute_filters[0].items(): + where_clauses.append((attribute, values)) + + # Filter nodes + filtered_nodes = [ + (node, data) for node, data in self.graph.nodes(data=True) + if all(data.get(attr) in values for attr, values in where_clauses) + ] + + # Filter edges where both source and target nodes satisfy the filters + filtered_edges = [ + (source, target, data.get('relationship_type', 'UNKNOWN'), data) + for source, target, data in self.graph.edges(data=True) + if ( + all(self.graph.nodes[source].get(attr) in values for attr, values in where_clauses) and + all(self.graph.nodes[target].get(attr) in values for attr, values in where_clauses) + ) + ] + + return filtered_nodes, filtered_edges \ No newline at end of file diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index d15d93b73..0b752c6cb 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -52,13 +52,17 @@ class CogneeGraph(CogneeAbstractGraph): edge_properties_to_project: List[str], directed = True, node_dimension = 1, - edge_dimension = 1) -> None: + edge_dimension = 1, + memory_fragment_filter = List[Dict[str, List[Union[str, int]]]]) -> None: if node_dimension < 1 or edge_dimension < 1: raise ValueError("Dimensions must be positive integers") try: - nodes_data, edges_data = await adapter.get_graph_data() + if len(memory_fragment_filter) == 0: + nodes_data, edges_data = await adapter.get_graph_data() + 
else: + nodes_data, edges_data = await adapter.get_filtered_graph_data(attribute_filters = memory_fragment_filter) if not nodes_data: raise ValueError("No node data retrieved from the database.") From 9f557b0c5bd4b1aa5c11f4af5af3c51b759df3d1 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 20 Nov 2024 15:14:36 +0100 Subject: [PATCH 06/23] feat: Extends two steps retriever with graph projection --- .../retriever/two_steps_retriever.py | 43 +++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/cognee/pipelines/retriever/two_steps_retriever.py b/cognee/pipelines/retriever/two_steps_retriever.py index 7a630fab3..c681f3e99 100644 --- a/cognee/pipelines/retriever/two_steps_retriever.py +++ b/cognee/pipelines/retriever/two_steps_retriever.py @@ -6,8 +6,11 @@ from cognee.shared.utils import send_telemetry from cognee.modules.users.models import User from cognee.modules.users.methods import get_default_user from cognee.modules.users.permissions.methods import get_document_ids_for_user +from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.graph import get_graph_engine +from openai import organization +from sympy.codegen.fnodes import dimension async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: @@ -26,16 +29,7 @@ async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: return retrieved_results -async def run_two_step_retriever(query: str, user, community_filter = []) -> list: - vector_engine = get_vector_engine() - graph_engine = await get_graph_engine() - - collections = ["Entity_name", "TextSummary_text", 'EntityType_name', 'DocumentChunk_text'] - results = await asyncio.gather( - *[vector_engine.get_distances_of_collection(collection, query_text=query) for collection in collections] - ) - - 
############################################# This part is a quick fix til we don't fix the vector db inconsistency +def delete_duplicated_vector_db_elements(collections, results): #:TODO: This is just for now to fix vector db duplicates results_dict = {} for collection, results in zip(collections, results): seen_ids = set() @@ -47,11 +41,36 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> lis else: print(f"Duplicate found in collection '{collection}': {result.id}") results_dict[collection] = unique_results - # :TODO: Due to duplicates and inconsistent vector db state now am collecting - # :TODO: the first appearance of the object but this code should be the solution once the db is fixed. + + return results_dict + + +async def run_two_step_retriever(query: str, user, community_filter = []) -> list: + vector_engine = get_vector_engine() + graph_engine = await get_graph_engine() + + collections = ["Entity_name", "TextSummary_text", 'EntityType_name', 'DocumentChunk_text'] + results = await asyncio.gather( + *[vector_engine.get_distances_of_collection(collection, query_text=query) for collection in collections] + ) + + ############################################# This part is a quick fix til we don't fix the vector db inconsistency + results_dict = delete_duplicated_vector_db_elements(collections, results)# :TODO: Change when vector db is fixed # results_dict = {collection: result for collection, result in zip(collections, results)} ############################################## + memory_fragment = CogneeGraph() + + await memory_fragment.project_graph_from_db(graph_engine, + node_properties_to_project=['id', + 'community'], + edge_properties_to_project=['id', + 'relationship_name'], + directed=True, + node_dimension=1, + edge_dimension=1, + memory_fragment_filter=[]) + print() From 980ae2b22c02c59875f9ff71c931677e4e7b2d78 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:32:03 
+0100 Subject: [PATCH 07/23] feat: Adds in time edge vector similarity calculation and triplet importances --- .../modules/graph/cognee_graph/CogneeGraph.py | 92 ++++++++++++++++++- .../graph/cognee_graph/CogneeGraphElements.py | 16 +++- .../unit/modules/graph/cognee_graph_test.py | 4 +- 3 files changed, 105 insertions(+), 7 deletions(-) diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index 0b752c6cb..158fb9d07 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -1,9 +1,12 @@ -from typing import List, Dict, Union +import numpy as np +from typing import List, Dict, Union from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface from cognee.modules.graph.cognee_graph.CogneeGraphElements import Node, Edge from cognee.modules.graph.cognee_graph.CogneeAbstractGraph import CogneeAbstractGraph -from cognee.infrastructure.databases.graph import get_graph_engine +import heapq +from graphistry import edges + class CogneeGraph(CogneeAbstractGraph): """ @@ -39,13 +42,16 @@ class CogneeGraph(CogneeAbstractGraph): def get_node(self, node_id: str) -> Node: return self.nodes.get(node_id, None) - def get_edges(self, node_id: str) -> List[Edge]: + def get_edges_of_node(self, node_id: str) -> List[Edge]: node = self.get_node(node_id) if node: return node.skeleton_edges else: raise ValueError(f"Node with id {node_id} does not exist.") + def get_edges(self)-> List[Edge]: + return edges + async def project_graph_from_db(self, adapter: Union[GraphDBInterface], node_properties_to_project: List[str], @@ -53,7 +59,7 @@ class CogneeGraph(CogneeAbstractGraph): directed = True, node_dimension = 1, edge_dimension = 1, - memory_fragment_filter = List[Dict[str, List[Union[str, int]]]]) -> None: + memory_fragment_filter = []) -> None: if node_dimension < 1 or edge_dimension < 1: raise ValueError("Dimensions must be positive integers") @@ 
-93,3 +99,81 @@ class CogneeGraph(CogneeAbstractGraph): print(f"Error projecting graph: {e}") except Exception as ex: print(f"Unexpected error: {ex}") + + async def map_vector_distances_to_graph_nodes(self, node_distances) -> None: + for category, scored_results in node_distances.items(): + for scored_result in scored_results: + node_id = str(scored_result.id) + score = scored_result.score + node =self.get_node(node_id) + if node: + node.add_attribute("vector_distance", score) + else: + print(f"Node with id {node_id} not found in the graph.") + + async def map_vector_distances_to_graph_edges(self, vector_engine, query) -> None: # :TODO: When we calculate edge embeddings in vector db change this similarly to node mapping + try: + # Step 1: Generate the query embedding + query_vector = await vector_engine.embed_data([query]) + query_vector = query_vector[0] + if query_vector is None or len(query_vector) == 0: + raise ValueError("Failed to generate query embedding.") + + # Step 2: Collect all unique relationship types + unique_relationship_types = set() + for edge in self.edges: + relationship_type = edge.attributes.get('relationship_type') + if relationship_type: + unique_relationship_types.add(relationship_type) + + # Step 3: Embed all unique relationship types + unique_relationship_types = list(unique_relationship_types) + relationship_type_embeddings = await vector_engine.embed_data(unique_relationship_types) + + # Step 4: Map relationship types to their embeddings and calculate distances + embedding_map = {} + for relationship_type, embedding in zip(unique_relationship_types, relationship_type_embeddings): + edge_vector = np.array(embedding) + + # Calculate cosine similarity + similarity = np.dot(query_vector, edge_vector) / ( + np.linalg.norm(query_vector) * np.linalg.norm(edge_vector) + ) + distance = 1 - similarity + + # Round the distance to 4 decimal places and store it + embedding_map[relationship_type] = round(distance, 4) + + # Step 4: Assign precomputed 
distances to edges + for edge in self.edges: + relationship_type = edge.attributes.get('relationship_type') + if not relationship_type or relationship_type not in embedding_map: + print(f"Edge {edge} has an unknown or missing relationship type.") + continue + + # Assign the precomputed distance + edge.attributes["vector_distance"] = embedding_map[relationship_type] + + except Exception as ex: + print(f"Error mapping vector distances to edges: {ex}") + + + async def calculate_top_triplet_importances(self, k = int) -> List: + min_heap = [] + for i, edge in enumerate(self.edges): + source_node = self.get_node(edge.node1.id) + target_node = self.get_node(edge.node2.id) + + source_distance = source_node.attributes.get("vector_distance", 0) if source_node else 0 + target_distance = target_node.attributes.get("vector_distance", 0) if target_node else 0 + edge_distance = edge.attributes.get("vector_distance", 0) + + total_distance = source_distance + target_distance + edge_distance + + heapq.heappush(min_heap, (-total_distance, i, edge)) + if len(min_heap) > k: + heapq.heappop(min_heap) + + + return [edge for _, _, edge in sorted(min_heap)] + diff --git a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py index 8235cb24d..cecb0a272 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraphElements.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraphElements.py @@ -1,5 +1,5 @@ import numpy as np -from typing import List, Dict, Optional, Any +from typing import List, Dict, Optional, Any, Union class Node: """ @@ -21,6 +21,7 @@ class Node: raise ValueError("Dimension must be a positive integer") self.id = node_id self.attributes = attributes if attributes is not None else {} + self.attributes["vector_distance"] = float('inf') self.skeleton_neighbours = [] self.skeleton_edges = [] self.status = np.ones(dimension, dtype=int) @@ -55,6 +56,12 @@ class Node: raise ValueError(f"Dimension {dimension} is out of range. 
Valid range is 0 to {len(self.status) - 1}.") return self.status[dimension] == 1 + def add_attribute(self, key: str, value: Any) -> None: + self.attributes[key] = value + + def get_attribute(self, key: str) -> Union[str, int, float]: + return self.attributes[key] + def __repr__(self) -> str: return f"Node({self.id}, attributes={self.attributes})" @@ -87,6 +94,7 @@ class Edge: self.node1 = node1 self.node2 = node2 self.attributes = attributes if attributes is not None else {} + self.attributes["vector_distance"] = float('inf') self.directed = directed self.status = np.ones(dimension, dtype=int) @@ -95,6 +103,12 @@ class Edge: raise ValueError(f"Dimension {dimension} is out of range. Valid range is 0 to {len(self.status) - 1}.") return self.status[dimension] == 1 + def add_attribute(self, key: str, value: Any) -> None: + self.attributes[key] = value + + def get_attribute(self, key: str, value: Any) -> Union[str, int, float]: + return self.attributes[key] + def __repr__(self) -> str: direction = "->" if self.directed else "--" return f"Edge({self.node1.id} {direction} {self.node2.id}, attributes={self.attributes})" diff --git a/cognee/tests/unit/modules/graph/cognee_graph_test.py b/cognee/tests/unit/modules/graph/cognee_graph_test.py index d05292d75..bad474023 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_test.py @@ -77,11 +77,11 @@ def test_get_edges_success(setup_graph): graph.add_node(node2) edge = Edge(node1, node2) graph.add_edge(edge) - assert edge in graph.get_edges("node1") + assert edge in graph.get_edges_of_node("node1") def test_get_edges_nonexistent_node(setup_graph): """Test retrieving edges for a nonexistent node raises an exception.""" graph = setup_graph with pytest.raises(ValueError, match="Node with id nonexistent does not exist."): - graph.get_edges("nonexistent") + graph.get_edges_of_node("nonexistent") From a114d68aeffd63160723c151b2c331664fd23b34 Mon Sep 17 00:00:00 2001 From: 
hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:33:34 +0100 Subject: [PATCH 08/23] feat: Implements basic global triplet optimizing retrieval --- .../retriever/two_steps_retriever.py | 63 ++++++++++++++++--- examples/python/dynamic_steps_example.py | 48 +++----------- 2 files changed, 63 insertions(+), 48 deletions(-) diff --git a/cognee/pipelines/retriever/two_steps_retriever.py b/cognee/pipelines/retriever/two_steps_retriever.py index c681f3e99..ff35a0864 100644 --- a/cognee/pipelines/retriever/two_steps_retriever.py +++ b/cognee/pipelines/retriever/two_steps_retriever.py @@ -13,6 +13,46 @@ from openai import organization from sympy.codegen.fnodes import dimension +def format_triplets(edges): + def filter_attributes(obj, attributes): + """Helper function to filter out non-None properties, including nested dicts.""" + print("\n\n\n") + result = {} + for attr in attributes: + value = getattr(obj, attr, None) + if value is not None: + # If the value is a dict, extract relevant keys from it + if isinstance(value, dict): + nested_values = {k: v for k, v in value.items() if k in attributes and v is not None} + result[attr] = nested_values + else: + result[attr] = value + return result + + triplets = [] + for edge in edges: + node1 = edge.node1 + node2 = edge.node2 + edge_attributes = edge.attributes + node1_attributes = node1.attributes + node2_attributes = node2.attributes + + # Filter only non-None properties + node1_info = {key: value for key, value in node1_attributes.items() if value is not None} + node2_info = {key: value for key, value in node2_attributes.items() if value is not None} + edge_info = {key: value for key, value in edge_attributes.items() if value is not None} + + # Create the formatted triplet + triplet = ( + f"Node1: {node1_info}\n" + f"Edge: {edge_info}\n" + f"Node2: {node2_info}\n\n\n" # Add three blank lines for separation + ) + triplets.append(triplet) + + return "".join(triplets) + + async def 
two_step_retriever(query: Dict[str, str], user: User = None) -> list: if user is None: user = await get_default_user() @@ -25,7 +65,6 @@ async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: filtered_search_results = [] - return retrieved_results @@ -55,7 +94,7 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> lis ) ############################################# This part is a quick fix til we don't fix the vector db inconsistency - results_dict = delete_duplicated_vector_db_elements(collections, results)# :TODO: Change when vector db is fixed + node_distances = delete_duplicated_vector_db_elements(collections, results)# :TODO: Change when vector db is fixed # results_dict = {collection: result for collection, result in zip(collections, results)} ############################################## @@ -63,15 +102,19 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> lis await memory_fragment.project_graph_from_db(graph_engine, node_properties_to_project=['id', - 'community'], + 'description', + 'name', + 'type', + 'text'], edge_properties_to_project=['id', - 'relationship_name'], - directed=True, - node_dimension=1, - edge_dimension=1, - memory_fragment_filter=[]) + 'relationship_name']) - print() + await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances) + + await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)# :TODO: This should be coming from vector db + + results = await memory_fragment.calculate_top_triplet_importances(k=5) - raise(NotImplementedError) \ No newline at end of file + print(format_triplets(results)) + print(f'Query was the following:{query}' ) diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 11c2f1110..f4aa0aaf7 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -2,32 +2,6 @@ import cognee import asyncio 
from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever -job_position = """0:Senior Data Scientist (Machine Learning) - -Company: TechNova Solutions -Location: San Francisco, CA - -Job Description: - -TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights. - -Responsibilities: - -Develop and implement advanced machine learning algorithms and models. -Analyze large, complex datasets to extract meaningful patterns and insights. -Collaborate with cross-functional teams to integrate predictive models into products. -Stay updated with the latest advancements in machine learning and data science. -Mentor junior data scientists and provide technical guidance. -Qualifications: - -Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field. -5+ years of experience in data science and machine learning. -Proficient in Python, R, and SQL. -Experience with deep learning frameworks (e.g., TensorFlow, PyTorch). -Strong problem-solving skills and attention to detail. -Candidate CVs -""" - job_1 = """ CV 1: Relevant Name: Dr. 
Emily Carter @@ -195,7 +169,7 @@ async def main(enable_steps): # Step 2: Add text if enable_steps.get("add_text"): - text_list = [job_position, job_1, job_2, job_3, job_4, job_5] + text_list = [job_1, job_2, job_3, job_4, job_5] for text in text_list: await cognee.add(text) print(f"Added text: {text[:35]}...") @@ -207,22 +181,20 @@ async def main(enable_steps): # Step 4: Query insights if enable_steps.get("retriever"): - search_results = await two_step_retriever( - {'query': 'Which applicant has the most relevant experience in data science?'} - ) - print("Search results:") - for result_text in search_results: - print(result_text) + await two_step_retriever('Who has Phd?') if __name__ == '__main__': # Flags to enable/disable steps + + rebuild_kg = False + retrieve = True steps_to_enable = { - "prune_data": False, - "prune_system": False, - "add_text": False, - "cognify": False, - "retriever": True + "prune_data": rebuild_kg, + "prune_system": rebuild_kg, + "add_text": rebuild_kg, + "cognify": rebuild_kg, + "retriever": retrieve } asyncio.run(main(steps_to_enable)) From 6efe566849d11d96c550438f277babc86f529f29 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:40:56 +0100 Subject: [PATCH 09/23] fix: Adds new obligatory attributes to cognee graph tests --- cognee/tests/unit/modules/graph/cognee_graph_elements_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py index d2a1b6c59..a3755a58f 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_elements_test.py @@ -8,7 +8,7 @@ def test_node_initialization(): """Test that a Node is initialized correctly.""" node = Node("node1", {"attr1": "value1"}, dimension=2) assert node.id == "node1" - assert node.attributes == {"attr1": "value1"} + assert 
node.attributes == {"attr1": "value1", 'vector_distance': np.inf} assert len(node.status) == 2 assert np.all(node.status == 1) @@ -95,7 +95,7 @@ def test_edge_initialization(): edge = Edge(node1, node2, {"weight": 10}, directed=False, dimension=2) assert edge.node1 == node1 assert edge.node2 == node2 - assert edge.attributes == {"weight": 10} + assert edge.attributes == {'vector_distance': np.inf,"weight": 10} assert edge.directed is False assert len(edge.status) == 2 assert np.all(edge.status == 1) From b5d9e7a6d2363596ec3da5845c6cfc09c99c94ad Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 20 Nov 2024 19:03:32 +0100 Subject: [PATCH 10/23] chore: adds return value and sets the entry point kg generation to true --- cognee/pipelines/retriever/two_steps_retriever.py | 7 +++---- examples/python/dynamic_steps_example.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cognee/pipelines/retriever/two_steps_retriever.py b/cognee/pipelines/retriever/two_steps_retriever.py index ff35a0864..92ef2be2e 100644 --- a/cognee/pipelines/retriever/two_steps_retriever.py +++ b/cognee/pipelines/retriever/two_steps_retriever.py @@ -9,14 +9,12 @@ from cognee.modules.users.permissions.methods import get_document_ids_for_user from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.databases.graph import get_graph_engine -from openai import organization -from sympy.codegen.fnodes import dimension def format_triplets(edges): + print("\n\n\n") def filter_attributes(obj, attributes): """Helper function to filter out non-None properties, including nested dicts.""" - print("\n\n\n") result = {} for attr in attributes: value = getattr(obj, attr, None) @@ -115,6 +113,7 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> lis results = await memory_fragment.calculate_top_triplet_importances(k=5)
- print(format_triplets(results)) print(f'Query was the following:{query}' ) + + return results diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index f4aa0aaf7..49b41db1c 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -187,7 +187,7 @@ async def main(enable_steps): if __name__ == '__main__': # Flags to enable/disable steps - rebuild_kg = False + rebuild_kg = True retrieve = True steps_to_enable = { "prune_data": rebuild_kg, From a59517409c8e4d29899ecd3d4fafe42b052b8dc5 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:45:48 +0100 Subject: [PATCH 11/23] chore: Fixes some of the issues based on PR review + restructures things --- .../vector/pgvector/PGVectorAdapter.py | 2 - .../modules/graph/cognee_graph/CogneeGraph.py | 6 +-- ...iever.py => brute_force_triplet_search.py} | 42 +++++++++---------- .../retriever/diffusion_retriever.py | 25 ----------- cognee/pipelines/retriever/g_retriever.py | 25 ----------- examples/python/dynamic_steps_example.py | 6 +-- 6 files changed, 26 insertions(+), 80 deletions(-) rename cognee/pipelines/retriever/{two_steps_retriever.py => brute_force_triplet_search.py} (75%) delete mode 100644 cognee/pipelines/retriever/diffusion_retriever.py delete mode 100644 cognee/pipelines/retriever/g_retriever.py diff --git a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index 97571a274..a4cfcf789 100644 --- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -192,8 +192,6 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface): # Get PGVectorDataPoint Table from database PGVectorDataPoint = await self.get_table(collection_name) - closest_items = [] - # Use async session to connect to the database 
async with self.get_async_session() as session: # Find closest vectors to query_vector diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index 158fb9d07..0b5aebce4 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -164,9 +164,9 @@ class CogneeGraph(CogneeAbstractGraph): source_node = self.get_node(edge.node1.id) target_node = self.get_node(edge.node2.id) - source_distance = source_node.attributes.get("vector_distance", 0) if source_node else 0 - target_distance = target_node.attributes.get("vector_distance", 0) if target_node else 0 - edge_distance = edge.attributes.get("vector_distance", 0) + source_distance = source_node.attributes.get("vector_distance", 1) if source_node else 1 + target_distance = target_node.attributes.get("vector_distance", 1) if target_node else 1 + edge_distance = edge.attributes.get("vector_distance", 1) total_distance = source_distance + target_distance + edge_distance diff --git a/cognee/pipelines/retriever/two_steps_retriever.py b/cognee/pipelines/retriever/brute_force_triplet_search.py similarity index 75% rename from cognee/pipelines/retriever/two_steps_retriever.py rename to cognee/pipelines/retriever/brute_force_triplet_search.py index 92ef2be2e..6fef6104f 100644 --- a/cognee/pipelines/retriever/two_steps_retriever.py +++ b/cognee/pipelines/retriever/brute_force_triplet_search.py @@ -1,15 +1,11 @@ import asyncio -from uuid import UUID -from enum import Enum -from typing import Callable, Dict -from cognee.shared.utils import send_telemetry +from typing import Dict, List from cognee.modules.users.models import User from cognee.modules.users.methods import get_default_user -from cognee.modules.users.permissions.methods import get_document_ids_for_user from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph from cognee.infrastructure.databases.vector import get_vector_engine from 
cognee.infrastructure.databases.graph import get_graph_engine - +from cognee.shared.utils import send_telemetry def format_triplets(edges): print("\n\n\n") @@ -44,24 +40,22 @@ def format_triplets(edges): triplet = ( f"Node1: {node1_info}\n" f"Edge: {edge_info}\n" - f"Node2: {node2_info}\n\n\n" # Add three blank lines for separation + f"Node2: {node2_info}\n\n\n" ) triplets.append(triplet) return "".join(triplets) -async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: +async def brute_force_triplet_search(query: str, user: User = None, top_k = 5) -> list: if user is None: user = await get_default_user() if user is None: raise PermissionError("No user found in the system. Please create a user.") - own_document_ids = await get_document_ids_for_user(user.id) - retrieved_results = await run_two_step_retriever(query, user) + retrieved_results = await brute_force_search(query, user, top_k) - filtered_search_results = [] return retrieved_results @@ -82,18 +76,22 @@ def delete_duplicated_vector_db_elements(collections, results): #:TODO: This is return results_dict -async def run_two_step_retriever(query: str, user, community_filter = []) -> list: +async def brute_force_search(query: str, user: User, top_k: int, collections: List[str] = None) -> list: + if collections is None: + collections = ["entity_name", "text_summary_text", "entity_type_name", "document_chunk_text"] + vector_engine = get_vector_engine() graph_engine = await get_graph_engine() - collections = ["Entity_name", "TextSummary_text", 'EntityType_name', 'DocumentChunk_text'] + send_telemetry("cognee.brute_force_triplet_search EXECUTION STARTED", user.id) + results = await asyncio.gather( *[vector_engine.get_distances_of_collection(collection, query_text=query) for collection in collections] ) - ############################################# This part is a quick fix til we don't fix the vector db inconsistency - node_distances = delete_duplicated_vector_db_elements(collections, results)# 
:TODO: Change when vector db is fixed - # results_dict = {collection: result for collection, result in zip(collections, results)} + ############################################# :TODO: Change when vector db does not contain duplicates + node_distances = delete_duplicated_vector_db_elements(collections, results) + # node_distances = {collection: result for collection, result in zip(collections, results)} ############################################## memory_fragment = CogneeGraph() @@ -104,16 +102,16 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> lis 'name', 'type', 'text'], - edge_properties_to_project=['id', - 'relationship_name']) + edge_properties_to_project=['relationship_name']) await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances) - await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)# :TODO: This should be coming from vector db + #:TODO: Change when vectordb contains edge embeddings + await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query) - results = await memory_fragment.calculate_top_triplet_importances(k=5) + results = await memory_fragment.calculate_top_triplet_importances(k=top_k) - print(format_triplets(results)) - print(f'Query was the following:{query}' ) + send_telemetry("cognee.brute_force_triplet_search EXECUTION STARTED", user.id) + #:TODO: Once we have Edge pydantic models we should retrieve the exact edge and node objects from graph db return results diff --git a/cognee/pipelines/retriever/diffusion_retriever.py b/cognee/pipelines/retriever/diffusion_retriever.py deleted file mode 100644 index a6b79310e..000000000 --- a/cognee/pipelines/retriever/diffusion_retriever.py +++ /dev/null @@ -1,25 +0,0 @@ -from uuid import UUID -from enum import Enum -from typing import Callable, Dict -from cognee.shared.utils import send_telemetry -from cognee.modules.users.models import User -from cognee.modules.users.methods import 
get_default_user -from cognee.modules.users.permissions.methods import get_document_ids_for_user - -async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: - if user is None: - user = await get_default_user() - - if user is None: - raise PermissionError("No user found in the system. Please create a user.") - - own_document_ids = await get_document_ids_for_user(user.id) - retrieved_results = await diffusion_retriever(query, user) - - filtered_search_results = [] - - - return retrieved_results - -async def diffusion_retriever(query: str, user, community_filter = []) -> list: - raise(NotImplementedError) \ No newline at end of file diff --git a/cognee/pipelines/retriever/g_retriever.py b/cognee/pipelines/retriever/g_retriever.py deleted file mode 100644 index 4b319acd9..000000000 --- a/cognee/pipelines/retriever/g_retriever.py +++ /dev/null @@ -1,25 +0,0 @@ -from uuid import UUID -from enum import Enum -from typing import Callable, Dict -from cognee.shared.utils import send_telemetry -from cognee.modules.users.models import User -from cognee.modules.users.methods import get_default_user -from cognee.modules.users.permissions.methods import get_document_ids_for_user - -async def two_step_retriever(query: Dict[str, str], user: User = None) -> list: - if user is None: - user = await get_default_user() - - if user is None: - raise PermissionError("No user found in the system. 
Please create a user.") - - own_document_ids = await get_document_ids_for_user(user.id) - retrieved_results = await g_retriever(query, user) - - filtered_search_results = [] - - - return retrieved_results - -async def g_retriever(query: str, user, community_filter = []) -> list: - raise(NotImplementedError) \ No newline at end of file diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 49b41db1c..6b75310cf 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -1,6 +1,6 @@ import cognee import asyncio -from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever +from cognee.pipelines.retriever.brute_force_triplet_search import brute_force_triplet_search job_1 = """ CV 1: Relevant @@ -181,13 +181,13 @@ async def main(enable_steps): # Step 4: Query insights if enable_steps.get("retriever"): - await two_step_retriever('Who has Phd?') + await brute_force_triplet_search('Who has Phd?') if __name__ == '__main__': # Flags to enable/disable steps - rebuild_kg = True + rebuild_kg = False retrieve = True steps_to_enable = { "prune_data": rebuild_kg, From 163bdc527cb096f39a9f7e11008f05b6902602bc Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:37:34 +0100 Subject: [PATCH 12/23] chore: fixes PR issues regarding vector normalization and cognee graph --- .../databases/vector/lancedb/LanceDBAdapter.py | 3 +-- cognee/infrastructure/databases/vector/utils.py | 10 ---------- cognee/modules/graph/cognee_graph/CogneeGraph.py | 2 +- 3 files changed, 2 insertions(+), 13 deletions(-) diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index 6cbe45655..66d960c20 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -146,8 
+146,7 @@ class LanceDBAdapter(VectorDBInterface): self, collection_name: str, query_text: str = None, - query_vector: List[float] = None, - with_vector: bool = False + query_vector: List[float] = None ): if query_text is None and query_vector is None: raise ValueError("One of query_text or query_vector must be provided!") diff --git a/cognee/infrastructure/databases/vector/utils.py b/cognee/infrastructure/databases/vector/utils.py index ced161ea3..30cff3f02 100644 --- a/cognee/infrastructure/databases/vector/utils.py +++ b/cognee/infrastructure/databases/vector/utils.py @@ -2,17 +2,7 @@ from typing import List def normalize_distances(result_values: List[dict]) -> List[float]: - min_value = 100 - max_value = 0 - for result in result_values: - value = float(result["_distance"]) - if value > max_value: - max_value = value - if value < min_value: - min_value = value - - normalized_values = [] min_value = min(result["_distance"] for result in result_values) max_value = max(result["_distance"] for result in result_values) diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index 0b5aebce4..3f0d48a23 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -158,7 +158,7 @@ class CogneeGraph(CogneeAbstractGraph): print(f"Error mapping vector distances to edges: {ex}") - async def calculate_top_triplet_importances(self, k = int) -> List: + async def calculate_top_triplet_importances(self, k: int) -> List: min_heap = [] for i, edge in enumerate(self.edges): source_node = self.get_node(edge.node1.id) From c66c43e71786aa37b4f93e3a8f362524d46c922d Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:44:11 +0100 Subject: [PATCH 13/23] chore: places retrievers under modules directory --- cognee/{pipelines => modules/retrieval}/__init__.py | 0 .../retrieval}/brute_force_triplet_search.py | 0 
cognee/pipelines/retriever/__init__.py | 0 examples/python/dynamic_steps_example.py | 4 ++-- 4 files changed, 2 insertions(+), 2 deletions(-) rename cognee/{pipelines => modules/retrieval}/__init__.py (100%) rename cognee/{pipelines/retriever => modules/retrieval}/brute_force_triplet_search.py (100%) delete mode 100644 cognee/pipelines/retriever/__init__.py diff --git a/cognee/pipelines/__init__.py b/cognee/modules/retrieval/__init__.py similarity index 100% rename from cognee/pipelines/__init__.py rename to cognee/modules/retrieval/__init__.py diff --git a/cognee/pipelines/retriever/brute_force_triplet_search.py b/cognee/modules/retrieval/brute_force_triplet_search.py similarity index 100% rename from cognee/pipelines/retriever/brute_force_triplet_search.py rename to cognee/modules/retrieval/brute_force_triplet_search.py diff --git a/cognee/pipelines/retriever/__init__.py b/cognee/pipelines/retriever/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 6b75310cf..68bbb7bce 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -1,6 +1,6 @@ import cognee import asyncio -from cognee.pipelines.retriever.brute_force_triplet_search import brute_force_triplet_search +from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search job_1 = """ CV 1: Relevant @@ -187,7 +187,7 @@ async def main(enable_steps): if __name__ == '__main__': # Flags to enable/disable steps - rebuild_kg = False + rebuild_kg = True retrieve = True steps_to_enable = { "prune_data": rebuild_kg, From db07179856cbdba1349d3bebf3a70e7f2e16e54b Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:17:57 +0100 Subject: [PATCH 14/23] chore: Adds error handling to brute force triplet search --- .../retrieval/brute_force_triplet_search.py | 85 +++++++++++++------ 
examples/python/dynamic_steps_example.py | 5 +- 2 files changed, 62 insertions(+), 28 deletions(-) diff --git a/cognee/modules/retrieval/brute_force_triplet_search.py b/cognee/modules/retrieval/brute_force_triplet_search.py index 6fef6104f..ea7c2cb4d 100644 --- a/cognee/modules/retrieval/brute_force_triplet_search.py +++ b/cognee/modules/retrieval/brute_force_triplet_search.py @@ -1,5 +1,6 @@ import asyncio -from typing import Dict, List +import logging +from typing import List from cognee.modules.users.models import User from cognee.modules.users.methods import get_default_user from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph @@ -76,42 +77,74 @@ def delete_duplicated_vector_db_elements(collections, results): #:TODO: This is return results_dict -async def brute_force_search(query: str, user: User, top_k: int, collections: List[str] = None) -> list: +async def brute_force_search( + query: str, + user: User, + top_k: int, + collections: List[str] = None +) -> list: + """ + Performs a brute force search to retrieve the top triplets from the graph. + + Args: + query (str): The search query. + user (User): The user performing the search. + top_k (int): The number of top results to retrieve. + collections (Optional[List[str]]): List of collections to query. Defaults to predefined collections. + + Returns: + list: The top triplet results. 
+ """ + if not query or not isinstance(query, str): + raise ValueError("The query must be a non-empty string.") + if top_k <= 0: + raise ValueError("top_k must be a positive integer.") + if collections is None: collections = ["entity_name", "text_summary_text", "entity_type_name", "document_chunk_text"] - vector_engine = get_vector_engine() - graph_engine = await get_graph_engine() + try: + vector_engine = get_vector_engine() + graph_engine = await get_graph_engine() + except Exception as e: + logging.error("Failed to initialize engines: %s", e) + raise RuntimeError("Initialization error") from e send_telemetry("cognee.brute_force_triplet_search EXECUTION STARTED", user.id) - results = await asyncio.gather( - *[vector_engine.get_distances_of_collection(collection, query_text=query) for collection in collections] - ) + try: + results = await asyncio.gather( + *[vector_engine.get_distances_of_collection(collection, query_text=query) for collection in collections] + ) - ############################################# :TODO: Change when vector db does not contain duplicates - node_distances = delete_duplicated_vector_db_elements(collections, results) - # node_distances = {collection: result for collection, result in zip(collections, results)} - ############################################## + ############################################# :TODO: Change when vector db does not contain duplicates + node_distances = delete_duplicated_vector_db_elements(collections, results) + # node_distances = {collection: result for collection, result in zip(collections, results)} + ############################################## - memory_fragment = CogneeGraph() + memory_fragment = CogneeGraph() - await memory_fragment.project_graph_from_db(graph_engine, - node_properties_to_project=['id', - 'description', - 'name', - 'type', - 'text'], - edge_properties_to_project=['relationship_name']) + await memory_fragment.project_graph_from_db(graph_engine, + node_properties_to_project=['id', + 
'description', + 'name', + 'type', + 'text'], + edge_properties_to_project=['relationship_name']) - await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances) + await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances) - #:TODO: Change when vectordb contains edge embeddings - await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query) + #:TODO: Change when vectordb contains edge embeddings + await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query) - results = await memory_fragment.calculate_top_triplet_importances(k=top_k) + results = await memory_fragment.calculate_top_triplet_importances(k=top_k) - send_telemetry("cognee.brute_force_triplet_search EXECUTION STARTED", user.id) + send_telemetry("cognee.brute_force_triplet_search EXECUTION STARTED", user.id) - #:TODO: Once we have Edge pydantic models we should retrieve the exact edge and node objects from graph db - return results + #:TODO: Once we have Edge pydantic models we should retrieve the exact edge and node objects from graph db + return results + + except Exception as e: + logging.error("Error during brute force search for user: %s, query: %s. 
Error: %s", user.id, query, e) + send_telemetry("cognee.brute_force_triplet_search EXECUTION FAILED", user.id) + raise RuntimeError("An error occurred during brute force search") from e diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 68bbb7bce..ed5c97561 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -1,6 +1,7 @@ import cognee import asyncio from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search +from cognee.modules.retrieval.brute_force_triplet_search import format_triplets job_1 = """ CV 1: Relevant @@ -181,8 +182,8 @@ async def main(enable_steps): # Step 4: Query insights if enable_steps.get("retriever"): - await brute_force_triplet_search('Who has Phd?') - + results = await brute_force_triplet_search('Who has the most experience with graphic design?') + print(format_triplets(results)) if __name__ == '__main__': # Flags to enable/disable steps From c9f66145f5b7b67be0c8606d54a4f14b8fe53cbc Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:33:33 +0100 Subject: [PATCH 15/23] feat: checks neo4j test for bruteforce retriever --- cognee/tests/test_neo4j.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index 756b29cc4..8eea5b7f7 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -4,6 +4,7 @@ import logging import pathlib import cognee from cognee.api.v1.search import SearchType +from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search logging.basicConfig(level=logging.DEBUG) @@ -61,6 +62,9 @@ async def main(): assert len(history) == 6, "Search history is not correct." 
+ results = await brute_force_triplet_search('Who has the most experience with graphic design?') + assert len(results)>0 + if __name__ == "__main__": import asyncio asyncio.run(main()) From ecdf3d4d54cc04f62be1726568a9f4d442131ae9 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:35:20 +0100 Subject: [PATCH 16/23] fix: Updates neo4j test --- cognee/tests/test_neo4j.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index 8eea5b7f7..4efbcc66a 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -62,9 +62,6 @@ async def main(): assert len(history) == 6, "Search history is not correct." - results = await brute_force_triplet_search('Who has the most experience with graphic design?') - assert len(results)>0 - if __name__ == "__main__": import asyncio asyncio.run(main()) From 0441e19bc9a294761cce95a6c2aaf2f6ec7c8918 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:42:35 +0100 Subject: [PATCH 17/23] feat: Adds bruteforce retriever test for neo4j --- cognee/tests/test_neo4j.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index 60f006c35..87dd789a6 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -62,6 +62,9 @@ async def main(): assert len(history) == 6, "Search history is not correct." 
+ results = await brute_force_triplet_search('Who has the most experience with graphic design?') + assert len(results) > 0 + await cognee.prune.prune_data() assert not os.path.isdir(data_directory_path), "Local data files are not deleted" From 4035302dd45e03f8e6f9e318e5ecd98501537e45 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:48:09 +0100 Subject: [PATCH 18/23] feat: Adds tests for pgvector, qdrant and weaviate --- cognee/tests/test_pgvector.py | 4 ++++ cognee/tests/test_qdrant.py | 4 ++++ cognee/tests/test_weaviate.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index bd6584cbc..1a62fa4d4 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -3,6 +3,7 @@ import logging import pathlib import cognee from cognee.api.v1.search import SearchType +from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search logging.basicConfig(level=logging.DEBUG) @@ -89,6 +90,9 @@ async def main(): history = await cognee.get_search_history() assert len(history) == 6, "Search history is not correct." + results = await brute_force_triplet_search('Who has the most experience with graphic design?') + assert len(results) > 0 + await cognee.prune.prune_data() assert not os.path.isdir(data_directory_path), "Local data files are not deleted" diff --git a/cognee/tests/test_qdrant.py b/cognee/tests/test_qdrant.py index 4c2462c3b..e0df0e980 100644 --- a/cognee/tests/test_qdrant.py +++ b/cognee/tests/test_qdrant.py @@ -5,6 +5,7 @@ import logging import pathlib import cognee from cognee.api.v1.search import SearchType +from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search logging.basicConfig(level=logging.DEBUG) @@ -61,6 +62,9 @@ async def main(): history = await cognee.get_search_history() assert len(history) == 6, "Search history is not correct." 
+ results = await brute_force_triplet_search('Who has the most experience with graphic design?') + assert len(results) > 0 + await cognee.prune.prune_data() assert not os.path.isdir(data_directory_path), "Local data files are not deleted" diff --git a/cognee/tests/test_weaviate.py b/cognee/tests/test_weaviate.py index c352df13e..726a8ebc2 100644 --- a/cognee/tests/test_weaviate.py +++ b/cognee/tests/test_weaviate.py @@ -3,6 +3,7 @@ import logging import pathlib import cognee from cognee.api.v1.search import SearchType +from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search logging.basicConfig(level=logging.DEBUG) @@ -59,6 +60,9 @@ async def main(): history = await cognee.get_search_history() assert len(history) == 6, "Search history is not correct." + results = await brute_force_triplet_search('Who has the most experience with graphic design?') + assert len(results) > 0 + await cognee.prune.prune_data() assert not os.path.isdir(data_directory_path), "Local data files are not deleted" From 4c9d816f874d5b4aa80335b806ec857a66aa7629 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:05:38 +0100 Subject: [PATCH 19/23] feat: extends bruteforce triplet search for Qdrant db --- .../databases/vector/qdrant/QDrantAdapter.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py index 1efcd47b3..08c47d005 100644 --- a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +++ b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py @@ -142,6 +142,41 @@ class QDrantAdapter(VectorDBInterface): await client.close() return results + async def get_distances_of_collection( + self, + collection_name: str, + query_text: str = None, + query_vector: List[float] = None, + with_vector: bool = False + ) -> List[ScoredResult]: + + if 
query_text is None and query_vector is None: + raise ValueError("One of query_text or query_vector must be provided!") + + client = self.get_qdrant_client() + + results = await client.search( + collection_name = collection_name, + query_vector = models.NamedVector( + name = "text", + vector = query_vector if query_vector is not None else (await self.embed_data([query_text]))[0], + ), + with_vectors = with_vector + ) + + await client.close() + + return [ + ScoredResult( + id = UUID(result.id), + payload = { + **result.payload, + "id": UUID(result.id), + }, + score = 1 - result.score, + ) for result in results + ] + async def search( self, collection_name: str, From 98a517dd9f97a9cf87e15e85e8af524dd5a0c7ef Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:20:53 +0100 Subject: [PATCH 20/23] feat: extends brute force triplet search for weaviate db --- .../vector/weaviate_db/WeaviateAdapter.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py index be356740f..a8aa568a0 100644 --- a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +++ b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py @@ -153,6 +153,36 @@ class WeaviateAdapter(VectorDBInterface): return await future + async def get_distances_of_collection( + self, + collection_name: str, + query_text: str = None, + query_vector: List[float] = None, + with_vector: bool = False + ) -> List[ScoredResult]: + import weaviate.classes as wvc + + if query_text is None and query_vector is None: + raise ValueError("One of query_text or query_vector must be provided!") + + if query_vector is None: + query_vector = (await self.embed_data([query_text]))[0] + + search_result = self.get_collection(collection_name).query.hybrid( + query=None, + vector=query_vector, + 
include_vector=with_vector, + return_metadata=wvc.query.MetadataQuery(score=True), + ) + + return [ + ScoredResult( + id=UUID(str(result.uuid)), + payload=result.properties, + score=1 - float(result.metadata.score) + ) for result in search_result.objects + ] + async def search( self, collection_name: str, From c30683e20ec2feeddfcb11a77ca76f9d12e44bee Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:29:44 +0100 Subject: [PATCH 21/23] chore: changes query text in tests --- cognee/tests/test_neo4j.py | 2 +- cognee/tests/test_pgvector.py | 2 +- cognee/tests/test_qdrant.py | 2 +- cognee/tests/test_weaviate.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cognee/tests/test_neo4j.py b/cognee/tests/test_neo4j.py index 87dd789a6..92e5b5f05 100644 --- a/cognee/tests/test_neo4j.py +++ b/cognee/tests/test_neo4j.py @@ -62,7 +62,7 @@ async def main(): assert len(history) == 6, "Search history is not correct." - results = await brute_force_triplet_search('Who has the most experience with graphic design?') + results = await brute_force_triplet_search('What is a quantum computer?') assert len(results) > 0 await cognee.prune.prune_data() diff --git a/cognee/tests/test_pgvector.py b/cognee/tests/test_pgvector.py index 1a62fa4d4..3b4fa19c5 100644 --- a/cognee/tests/test_pgvector.py +++ b/cognee/tests/test_pgvector.py @@ -90,7 +90,7 @@ async def main(): history = await cognee.get_search_history() assert len(history) == 6, "Search history is not correct." 
- results = await brute_force_triplet_search('Who has the most experience with graphic design?') + results = await brute_force_triplet_search('What is a quantum computer?') assert len(results) > 0 await cognee.prune.prune_data() diff --git a/cognee/tests/test_qdrant.py b/cognee/tests/test_qdrant.py index e0df0e980..f32e0b4a4 100644 --- a/cognee/tests/test_qdrant.py +++ b/cognee/tests/test_qdrant.py @@ -62,7 +62,7 @@ async def main(): history = await cognee.get_search_history() assert len(history) == 6, "Search history is not correct." - results = await brute_force_triplet_search('Who has the most experience with graphic design?') + results = await brute_force_triplet_search('What is a quantum computer?') assert len(results) > 0 await cognee.prune.prune_data() diff --git a/cognee/tests/test_weaviate.py b/cognee/tests/test_weaviate.py index 726a8ebc2..43ec30aaf 100644 --- a/cognee/tests/test_weaviate.py +++ b/cognee/tests/test_weaviate.py @@ -60,7 +60,7 @@ async def main(): history = await cognee.get_search_history() assert len(history) == 6, "Search history is not correct." 
- results = await brute_force_triplet_search('Who has the most experience with graphic design?') + results = await brute_force_triplet_search('What is a quantum computer?') assert len(results) > 0 await cognee.prune.prune_data() From 94dc545fcd904f44083af3bb564a5a8a81a97108 Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:42:35 +0100 Subject: [PATCH 22/23] chore: adds self to cogneegraph edges --- cognee/modules/graph/cognee_graph/CogneeGraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index 3f0d48a23..715b8260e 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -50,7 +50,7 @@ class CogneeGraph(CogneeAbstractGraph): raise ValueError(f"Node with id {node_id} does not exist.") def get_edges(self)-> List[Edge]: - return edges + return self.edges async def project_graph_from_db(self, adapter: Union[GraphDBInterface], From 3146ef75c9ceea873efb8de242d9617b6582601c Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:47:26 +0100 Subject: [PATCH 23/23] Fix: renames new vector db and cogneegraph methods --- .../infrastructure/databases/vector/lancedb/LanceDBAdapter.py | 2 +- .../databases/vector/pgvector/PGVectorAdapter.py | 2 +- .../infrastructure/databases/vector/qdrant/QDrantAdapter.py | 2 +- .../databases/vector/weaviate_db/WeaviateAdapter.py | 2 +- cognee/modules/graph/cognee_graph/CogneeGraph.py | 2 +- cognee/modules/retrieval/brute_force_triplet_search.py | 2 +- cognee/tests/unit/modules/graph/cognee_graph_test.py | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index 66d960c20..5204c1bad 100644 --- 
a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -142,7 +142,7 @@ class LanceDBAdapter(VectorDBInterface): score = 0, ) for result in results.to_dict("index").values()] - async def get_distances_of_collection( + async def get_distance_from_collection_elements( self, collection_name: str, query_text: str = None, diff --git a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index a4cfcf789..fd0fd493c 100644 --- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -176,7 +176,7 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface): ) for result in results ] - async def get_distances_of_collection( + async def get_distance_from_collection_elements( self, collection_name: str, query_text: str = None, diff --git a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py index 08c47d005..c340928f4 100644 --- a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +++ b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py @@ -142,7 +142,7 @@ class QDrantAdapter(VectorDBInterface): await client.close() return results - async def get_distances_of_collection( + async def get_distance_from_collection_elements( self, collection_name: str, query_text: str = None, diff --git a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py index a8aa568a0..c9848e02c 100644 --- a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +++ b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py @@ -153,7 +153,7 @@ class WeaviateAdapter(VectorDBInterface): return await future - async def 
get_distances_of_collection( + async def get_distance_from_collection_elements( self, collection_name: str, query_text: str = None, diff --git a/cognee/modules/graph/cognee_graph/CogneeGraph.py b/cognee/modules/graph/cognee_graph/CogneeGraph.py index 715b8260e..21d095f3d 100644 --- a/cognee/modules/graph/cognee_graph/CogneeGraph.py +++ b/cognee/modules/graph/cognee_graph/CogneeGraph.py @@ -42,7 +42,7 @@ class CogneeGraph(CogneeAbstractGraph): def get_node(self, node_id: str) -> Node: return self.nodes.get(node_id, None) - def get_edges_of_node(self, node_id: str) -> List[Edge]: + def get_edges_from_node(self, node_id: str) -> List[Edge]: node = self.get_node(node_id) if node: return node.skeleton_edges diff --git a/cognee/modules/retrieval/brute_force_triplet_search.py b/cognee/modules/retrieval/brute_force_triplet_search.py index ea7c2cb4d..0a4e9dea5 100644 --- a/cognee/modules/retrieval/brute_force_triplet_search.py +++ b/cognee/modules/retrieval/brute_force_triplet_search.py @@ -114,7 +114,7 @@ async def brute_force_search( try: results = await asyncio.gather( - *[vector_engine.get_distances_of_collection(collection, query_text=query) for collection in collections] + *[vector_engine.get_distance_from_collection_elements(collection, query_text=query) for collection in collections] ) ############################################# :TODO: Change when vector db does not contain duplicates diff --git a/cognee/tests/unit/modules/graph/cognee_graph_test.py b/cognee/tests/unit/modules/graph/cognee_graph_test.py index bad474023..e3b748dab 100644 --- a/cognee/tests/unit/modules/graph/cognee_graph_test.py +++ b/cognee/tests/unit/modules/graph/cognee_graph_test.py @@ -77,11 +77,11 @@ def test_get_edges_success(setup_graph): graph.add_node(node2) edge = Edge(node1, node2) graph.add_edge(edge) - assert edge in graph.get_edges_of_node("node1") + assert edge in graph.get_edges_from_node("node1") def test_get_edges_nonexistent_node(setup_graph): """Test retrieving edges for a 
nonexistent node raises an exception.""" graph = setup_graph with pytest.raises(ValueError, match="Node with id nonexistent does not exist."): - graph.get_edges_of_node("nonexistent") + graph.get_edges_from_node("nonexistent")