Add utils for graph visualization + classification nodes

Vasilije 2024-03-11 12:41:32 +01:00
parent faf7e6ae59
commit 7e964bcb23
12 changed files with 1487 additions and 1610 deletions

File diff suppressed because one or more lines are too long

View file

View file

@@ -14,8 +14,13 @@ from dotenv import load_dotenv
 import os
 from cognitive_architecture.infrastructure.databases.vector.qdrant.adapter import CollectionConfig
 from cognitive_architecture.infrastructure.llm.get_llm_client import get_llm_client
+from cognitive_architecture.modules.cognify.graph.add_classification_nodes import add_classification_nodes
+from cognitive_architecture.modules.cognify.graph.add_node_connections import add_node_connection, graph_ready_output, \
+    connect_nodes_in_graph
 from cognitive_architecture.modules.cognify.graph.add_propositions import append_to_graph
+from cognitive_architecture.modules.cognify.llm.add_node_connection_embeddings import process_items
+from cognitive_architecture.modules.cognify.vector.batch_search import adapted_qdrant_batch_search
 from cognitive_architecture.modules.cognify.vector.load_propositions import add_propositions

 # Load environment variables from .env file
@@ -146,7 +151,7 @@ async def cognify(input_text:str):
     await add_classification_nodes(graph_client, 'Document:doc1', transformed_dict_1)
-    await append_to_graph(layer_1_graph, required_layers_one, graph_client)
+    F, unique_layer_uuids = await append_to_graph(layer_1_graph, required_layers_one, graph_client)

     def extract_node_descriptions(data):
         descriptions = []
@@ -158,10 +163,10 @@ async def cognify(input_text:str):
         return descriptions

     # Extract the node descriptions
-    graph = await graph_client.graph()
+    graph = graph_client.graph
     node_descriptions = extract_node_descriptions(graph.nodes(data=True))
-    unique_layer_uuids = set(node['layer_decomposition_uuid'] for node in node_descriptions)
+    # unique_layer_uuids = set(node['layer_decomposition_uuid'] for node in node_descriptions)
     #
     db = get_vector_database()
@@ -178,10 +183,43 @@ async def cognify(input_text:str):
     for layer in unique_layer_uuids:
         await db.create_collection(layer,collection_config)
-    #to check if it works
+    # #to check if it works
+    #
     await add_propositions(node_descriptions, db)
-    from cognitive_architecture.infrastructure.databases.vector.qdrant.adapter import AsyncQdrantClient
+    grouped_data = await add_node_connection(graph_client, db, node_descriptions)
+    llm_client = get_llm_client()
+    relationship_dict = await process_items(grouped_data, unique_layer_uuids, llm_client)
+    results = await adapted_qdrant_batch_search(relationship_dict, db)
+    relationship_d = graph_ready_output(results)
+    CONNECTED_GRAPH = connect_nodes_in_graph(F, relationship_d)
+    return CONNECTED_GRAPH
+    #
+    # grouped_data = {}
+    #
+    # # Iterate through each dictionary in the list
+    # for item in node_descriptions:
+    #     # Get the layer_decomposition_uuid of the current dictionary
+    #     uuid = item['layer_decomposition_uuid']
+    #
+    #     # Check if this uuid is already a key in the grouped_data dictionary
+    #     if uuid not in grouped_data:
+    #         # If not, initialize a new list for this uuid
+    #         grouped_data[uuid] = []
+    #
+    #     # Append the current dictionary to the list corresponding to its uuid
+    #     grouped_data[uuid].append(item)

cognitive_architecture/infrastructure/databases/vector/qdrant/adapter.py
View file

@@ -52,7 +52,7 @@ class QDrantAdapter(VectorDBInterface):
             quantization_config = collection_config.quantization_config
         )

-    async def create_data_points(self, collection_name: str, data_points: List[any]):
+    async def create_data_points(self, collection_name: str, data_points):
         client = self.get_qdrant_client()
         return await client.upload_points(
@@ -96,11 +96,11 @@ class QDrantAdapter(VectorDBInterface):
                 # vector= embedding,
                 limit=3,
                 with_vector=False
-            ) for embedding in embeddings
+            ) for embedding in [embeddings]
         ]
         # Perform batch search with the dynamically generated requests
-        results = client.search_batch(
+        results = await client.search_batch(
             collection_name=collection_name,
             requests=requests
         )
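
Note: on qdrant-client's async client, search_batch is a coroutine, so the added await is required. A minimal sketch of the same call shape outside the adapter, assuming a local Qdrant instance, qdrant-client 1.6+, and an already-populated collection (the name and vector size here are illustrative):

    import asyncio
    from qdrant_client import AsyncQdrantClient, models

    async def demo():
        client = AsyncQdrantClient(url="http://localhost:6333")  # assumed local instance
        request = models.SearchRequest(vector=[0.0] * 3072, limit=3, with_vector=False)
        # search_batch returns one list of ScoredPoint per request
        results = await client.search_batch(collection_name="some_layer", requests=[request])
        print(results)

    asyncio.run(demo())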

View file

@@ -43,7 +43,7 @@ class VectorDBInterface(Protocol):
     async def create_data_points(
         self,
         collection_name: str,
-        data_points: List[any]
+        data_points
     ): raise NotImplementedError

     # @abstractmethod
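
With the annotation loosened, any adapter whose create_data_points takes (collection_name, data_points) now satisfies the Protocol structurally. A minimal conforming fake for tests (InMemoryAdapter is illustrative, not part of this commit):

    from typing import Protocol

    class VectorDBInterface(Protocol):
        async def create_data_points(self, collection_name: str, data_points): ...

    class InMemoryAdapter:
        # matches the Protocol by shape alone
        def __init__(self):
            self.collections = {}

        async def create_data_points(self, collection_name: str, data_points):
            self.collections.setdefault(collection_name, []).extend(data_points)
            return self.collections[collection_name]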

cognitive_architecture/modules/cognify/graph/add_node_connections.py
View file

@@ -0,0 +1,113 @@
+import asyncio
+
+from cognitive_architecture.infrastructure.databases.graph.get_graph_client import get_graph_client
+from cognitive_architecture.shared.data_models import GraphDBType
+
+
+def extract_node_descriptions(data):
+    descriptions = []
+    for node_id, attributes in data:
+        if 'description' in attributes and 'id' in attributes:
+            descriptions.append({'node_id': attributes['id'], 'description': attributes['description'],
+                                 'layer_uuid': attributes['layer_uuid'],
+                                 'layer_decomposition_uuid': attributes['layer_decomposition_uuid']})
+    return descriptions
+
+
+# async so the call site in cognify() can await it alongside the other graph helpers
+async def add_node_connection(graph_client, vector_database_client, data):
+    graph = graph_client.graph
+    node_descriptions = extract_node_descriptions(graph.nodes(data=True))
+    grouped_data = {}
+    # Iterate through each dictionary in the list
+    for item in node_descriptions:
+        # Get the layer_decomposition_uuid of the current dictionary
+        uuid = item['layer_decomposition_uuid']
+        # Check if this uuid is already a key in the grouped_data dictionary
+        if uuid not in grouped_data:
+            # If not, initialize a new list for this uuid
+            grouped_data[uuid] = []
+        # Append the current dictionary to the list corresponding to its uuid
+        grouped_data[uuid].append(item)
+    return grouped_data
+
+
+def connect_nodes_in_graph(graph, relationship_dict):
+    """
+    For each relationship in relationship_dict, check if both nodes exist in the graph based on node attributes.
+    If they do, create a connection (edge) between them.
+
+    :param graph: A NetworkX graph object
+    :param relationship_dict: A dictionary containing relationships between nodes
+    """
+    for id, relationships in relationship_dict.items():
+        for relationship in relationships:
+            searched_node_attr_id = relationship['searched_node_id']
+            print(searched_node_attr_id)
+            score_attr_id = relationship['original_id_for_search']
+            score = relationship['score']
+            # Initialize node keys for both searched_node and score_node
+            searched_node_key, score_node_key = None, None
+            # Find nodes in the graph that match the searched_node_id and score_id from their attributes
+            for node, attrs in graph.nodes(data=True):
+                if 'id' in attrs:  # Ensure there is an 'id' attribute
+                    if attrs['id'] == searched_node_attr_id:
+                        searched_node_key = node
+                    elif attrs['id'] == score_attr_id:
+                        score_node_key = node
+                # If both nodes are found, no need to continue checking other nodes
+                if searched_node_key and score_node_key:
+                    break
+            # Check if both nodes were found in the graph
+            if searched_node_key is not None and score_node_key is not None:
+                print(searched_node_key)
+                print(score_node_key)
+                # If both nodes exist, create an edge between them
+                # You can customize the edge attributes as needed, here we use 'score' as an attribute
+                graph.add_edge(searched_node_key, score_node_key, weight=score,
+                               score_metadata=relationship.get('score_metadata'))
+    return graph
+
+
+def graph_ready_output(results):
+    relationship_dict = {}
+    for result_tuple in results:
+        # Unpack the tuple
+        uuid, scored_points_list, desc, node_id = result_tuple
+        # Ensure there's a list to collect related items for this uuid
+        if uuid not in relationship_dict:
+            relationship_dict[uuid] = []
+        for scored_points in scored_points_list:  # Iterate over the list of ScoredPoint lists
+            for scored_point in scored_points:  # Iterate over each ScoredPoint object
+                if scored_point.score > 0.9:  # Check the score condition
+                    # Append a new dictionary to the list associated with the uuid
+                    relationship_dict[uuid].append({
+                        'collection_name_uuid': uuid,
+                        'searched_node_id': scored_point.id,
+                        'score': scored_point.score,
+                        'score_metadata': scored_point.payload,
+                        'original_id_for_search': node_id,
+                    })
+    return relationship_dict
+
+
+if __name__ == '__main__':
+    graph_client = get_graph_client(GraphDBType.NETWORKX)
+    asyncio.run(add_node_connection(graph_client, None, None))
+    # db = get_vector_database()
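
The new module can be exercised end to end on a toy NetworkX graph. A sketch under assumed inputs: the node attributes follow the shape extract_node_descriptions reads, and rels mimics what graph_ready_output produces:

    import networkx as nx

    g = nx.Graph()
    g.add_node("a", id="n1", description="first", layer_uuid="L0", layer_decomposition_uuid="u1")
    g.add_node("b", id="n2", description="second", layer_uuid="L0", layer_decomposition_uuid="u2")

    print(extract_node_descriptions(g.nodes(data=True)))  # two dicts, one per node

    # hand-built stand-in for graph_ready_output(...) results (score above the 0.9 cut-off)
    rels = {"u1": [{"searched_node_id": "n2", "original_id_for_search": "n1",
                    "score": 0.95, "score_metadata": None}]}
    connected = connect_nodes_in_graph(g, rels)
    print(connected.edges(data=True))  # one 'a'-'b' edge carrying weight=0.95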

cognitive_architecture/modules/cognify/graph/add_propositions.py
View file

@@ -68,7 +68,7 @@ async def add_propositions(G, category_name, subclass_content, layer_description
 async def append_to_graph(layer_graphs, required_layers, G):
     # Generate a UUID for the overall layer
     layer_uuid = uuid.uuid4()
+    decomposition_uuids = set()

     # Extract category name from required_layers data
     category_name = required_layers.dict()['label']['type']
@@ -84,7 +84,7 @@ async def append_to_graph(layer_graphs, required_layers, G):
         # Generate a UUID for this particular layer decomposition
         layer_decomposition_uuid = uuid.uuid4()
+        decomposition_uuids.add(layer_decomposition_uuid)

         # Assuming append_data_to_graph is defined elsewhere and appends data to G
         # You would pass relevant information from knowledge_graph along with other details to this function
         F = await add_propositions(G, category_name, subgroup_name, layer_description, knowledge_graph,
@@ -93,7 +93,7 @@ async def append_to_graph(layer_graphs, required_layers, G):
     # Print updated graph for verification (assuming F is the updated NetworkX Graph)
     print("Updated Nodes:", F.graph.nodes(data=True))
-    return F
+    return F, decomposition_uuids

 if __name__ == "__main__":

cognitive_architecture/modules/cognify/llm/add_node_connection_embeddings.py
View file

@@ -0,0 +1,38 @@
+import asyncio
+
+
+async def process_items(grouped_data, unique_layer_uuids, llm_client):
+    results_to_check = []  # This will hold results excluding self comparisons
+    tasks = []  # List to hold all tasks
+    task_to_info = {}  # Dictionary to map tasks to their corresponding group id and item info
+
+    # Iterate through each group in grouped_data
+    for group_id, items in grouped_data.items():
+        # Filter unique_layer_uuids to exclude the current group_id
+        target_uuids = [uuid for uuid in unique_layer_uuids if uuid != group_id]
+        # Process each item in the group
+        for item in items:
+            # For each target UUID, create an async task for the item's embedding retrieval
+            for target_id in target_uuids:
+                task = asyncio.create_task(
+                    llm_client.async_get_embedding_with_backoff(item['description'], "text-embedding-3-large"))
+                tasks.append(task)
+                # Map the task to the target id, item's node_id, and description for later retrieval
+                task_to_info[task] = (target_id, item['node_id'], group_id, item['description'])
+
+    # Await all tasks to complete and gather results
+    results = await asyncio.gather(*tasks)
+
+    # Process the results, associating them with their target id, node id, and description
+    for task, embedding in zip(tasks, results):
+        target_id, node_id, group_id, description = task_to_info[task]
+        results_to_check.append([target_id, embedding, description, node_id, group_id])
+
+    return results_to_check
+
+
+if __name__ == '__main__':
+    # Smoke test: with no grouped data there is nothing to embed, so this returns []
+    asyncio.run(process_items({}, set(), None))
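
process_items can also be smoke-tested without a real LLM backend by stubbing the one method it calls; StubLLMClient and the dummy vectors below are illustrative, not part of this commit:

    class StubLLMClient:
        # stand-in for get_llm_client(); returns a fixed-size dummy embedding
        async def async_get_embedding_with_backoff(self, text, model):
            return [0.0] * 8

    grouped = {"u1": [{"node_id": "n1", "description": "first node"}],
               "u2": [{"node_id": "n2", "description": "second node"}]}
    out = asyncio.run(process_items(grouped, {"u1", "u2"}, StubLLMClient()))
    # each row: [target_id, embedding, description, node_id, group_id]
    print(len(out))  # 2: each item is embedded once per foreign layer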

cognitive_architecture/modules/cognify/vector/batch_search.py
View file

@@ -1,7 +1,7 @@
 from cognitive_architecture.infrastructure.databases.vector.get_vector_database import get_vector_database

-async def adapted_qdrant_batch_search(results_to_check, client):
+async def adapted_qdrant_batch_search(results_to_check, vector_client):
     search_results_list = []

     for result in results_to_check:
@@ -15,9 +15,10 @@ async def adapted_qdrant_batch_search(results_to_check, client):
         limits = [3] * len(embedding)  # Set a limit of 3 results for this embedding
         try:
-            # Perform the batch search for this id with its embedding
-            # Assuming qdrant_batch_search function accepts a single embedding and a list of limits
-            id_search_results = await client.batch_search(id, [embedding], limits)
+            # Perform the batch search for this id with its embedding
+            # Assuming qdrant_batch_search function accepts a single embedding and a list of limits
+            # qdrant_batch_search
+            id_search_results = await vector_client.batch_search(collection_name=id, embeddings=embedding, with_vectors=limits)
             search_results_list.append((id, id_search_results, node_id, target))
         except Exception as e:
             print(f"Error during batch search for ID {id}: {e}")
@@ -26,4 +27,6 @@ async def adapted_qdrant_batch_search(results_to_check, client):
     return search_results_list

-client = get_vector_database()
+
+if __name__ == '__main__':
+    client = get_vector_database()

cognitive_architecture/modules/cognify/vector/load_propositions.py
View file

@@ -28,6 +28,7 @@ async def upload_embedding(id, metadata, some_embeddings, collection_name, client):
 async def add_propositions(node_descriptions, client):
     for item in node_descriptions:
         print(item['node_id'])
-        await upload_embedding(id=item['node_id'], metadata={"meta": item['description']}, some_embeddings=get_embeddings(item['description']), collection_name=item['layer_decomposition_uuid'], client=client)
+        embeddings = await get_embeddings(item['description'])
+        await upload_embedding(id=item['node_id'], metadata={"meta": item['description']}, some_embeddings=embeddings[0], collection_name=item['layer_decomposition_uuid'], client=client)
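
The reworked call assumes get_embeddings is a coroutine returning a list of vectors, hence the embeddings[0]. For reference, a sketch of the input shape add_propositions expects (values illustrative; db comes from get_vector_database()):

    node_descriptions = [{"node_id": "n1", "description": "first node",
                          "layer_uuid": "L0", "layer_decomposition_uuid": "u1"}]
    await add_propositions(node_descriptions, db)  # run inside an async context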