feat: Implement basic global triplet-optimizing retrieval

hajdul88 2024-11-20 18:33:34 +01:00
parent 980ae2b22c
commit a114d68aef
2 changed files with 63 additions and 48 deletions

@@ -13,6 +13,46 @@ from openai import organization
 from sympy.codegen.fnodes import dimension
 
+def format_triplets(edges):
+    def filter_attributes(obj, attributes):
+        """Helper function to filter out non-None properties, including nested dicts."""
+        result = {}
+        for attr in attributes:
+            value = getattr(obj, attr, None)
+            if value is not None:
+                # If the value is a dict, keep only the relevant non-None keys from it
+                if isinstance(value, dict):
+                    nested_values = {k: v for k, v in value.items() if k in attributes and v is not None}
+                    result[attr] = nested_values
+                else:
+                    result[attr] = value
+        return result
+
+    triplets = []
+    for edge in edges:
+        node1 = edge.node1
+        node2 = edge.node2
+        edge_attributes = edge.attributes
+        node1_attributes = node1.attributes
+        node2_attributes = node2.attributes
+
+        # Keep only non-None properties
+        node1_info = {key: value for key, value in node1_attributes.items() if value is not None}
+        node2_info = {key: value for key, value in node2_attributes.items() if value is not None}
+        edge_info = {key: value for key, value in edge_attributes.items() if value is not None}
+
+        # Create the formatted triplet; three trailing newlines separate triplets
+        triplet = (
+            f"Node1: {node1_info}\n"
+            f"Edge: {edge_info}\n"
+            f"Node2: {node2_info}\n\n\n"
+        )
+        triplets.append(triplet)
+
+    return "".join(triplets)
+
 async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
     if user is None:
         user = await get_default_user()
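
For a sense of what the new helper emits: format_triplets only needs edge objects that expose node1, node2, and attributes dicts, so it can be exercised in isolation. A minimal sketch with hypothetical stub classes (the real node and edge types come from cognee's graph layer; format_triplets is assumed to be in scope):

from dataclasses import dataclass, field

@dataclass
class StubNode:
    attributes: dict = field(default_factory=dict)

@dataclass
class StubEdge:
    node1: StubNode
    node2: StubNode
    attributes: dict = field(default_factory=dict)

edge = StubEdge(
    node1=StubNode({"name": "Dr. Emily Carter", "type": "Person", "text": None}),
    node2=StubNode({"name": "TechNova Solutions", "type": "Company"}),
    attributes={"relationship_name": "applied_to"},
)

# None-valued properties (here "text") are dropped before formatting:
print(format_triplets([edge]))
# Node1: {'name': 'Dr. Emily Carter', 'type': 'Person'}
# Edge: {'relationship_name': 'applied_to'}
# Node2: {'name': 'TechNova Solutions', 'type': 'Company'}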
@@ -25,7 +65,6 @@ async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
-    filtered_search_results = []
 
     return retrieved_results
@@ -55,7 +94,7 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> list:
     )
 
     ############################################# Quick fix until the vector db inconsistency is fixed
-    results_dict = delete_duplicated_vector_db_elements(collections, results)  # TODO: Change when vector db is fixed
+    node_distances = delete_duplicated_vector_db_elements(collections, results)  # TODO: Change when vector db is fixed
     # results_dict = {collection: result for collection, result in zip(collections, results)}
     ##############################################
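
delete_duplicated_vector_db_elements itself is not part of this diff; a plausible sketch of the quick fix, assuming each collection's search result is a list of scored points carrying an id (the body below is an assumption, not the actual cognee implementation):

def delete_duplicated_vector_db_elements(collections, results):
    # Assumed behavior: keep the first occurrence of each element id per
    # collection and return a {collection: unique_results} mapping.
    deduplicated = {}
    for collection, result in zip(collections, results):
        seen_ids = set()
        unique_results = []
        for scored_point in result:
            if scored_point.id not in seen_ids:
                seen_ids.add(scored_point.id)
                unique_results.append(scored_point)
        deduplicated[collection] = unique_results
    return deduplicated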
@@ -63,15 +102,19 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> list:
     await memory_fragment.project_graph_from_db(graph_engine,
                                                 node_properties_to_project=['id',
-                                                                            'community'],
+                                                                            'description',
+                                                                            'name',
+                                                                            'type',
+                                                                            'text'],
                                                 edge_properties_to_project=['id',
-                                                                            'relationship_name'],
-                                                directed=True,
-                                                node_dimension=1,
-                                                edge_dimension=1,
-                                                memory_fragment_filter=[])
 
+    await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances)
+    await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)  # TODO: This should be coming from vector db
+    results = await memory_fragment.calculate_top_triplet_importances(k=5)
 
-    raise(NotImplementedError)
+    print(format_triplets(results))
+    print(f'Query was the following: {query}')
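
Read end to end, this hunk replaces the old NotImplementedError stub with a vector-then-graph pipeline. Condensed, the new flow inside the retriever coroutine is (statements taken from this diff; the surrounding setup is assumed):

# 1. Deduplicated vector search results become per-node distances.
node_distances = delete_duplicated_vector_db_elements(collections, results)

# 2. Project a lightweight graph fragment from the graph db.
await memory_fragment.project_graph_from_db(
    graph_engine,
    node_properties_to_project=['id', 'description', 'name', 'type', 'text'],
    edge_properties_to_project=['id', 'relationship_name'])

# 3. Attach vector distances to the projected nodes and edges.
await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances)
await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)

# 4. Keep the five most important triplets and print them.
results = await memory_fragment.calculate_top_triplet_importances(k=5)
print(format_triplets(results))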

@@ -2,32 +2,6 @@ import cognee
 import asyncio
 
 from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever
 
-job_position = """0:Senior Data Scientist (Machine Learning)
-Company: TechNova Solutions
-Location: San Francisco, CA
-
-Job Description:
-TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.
-
-Responsibilities:
-Develop and implement advanced machine learning algorithms and models.
-Analyze large, complex datasets to extract meaningful patterns and insights.
-Collaborate with cross-functional teams to integrate predictive models into products.
-Stay updated with the latest advancements in machine learning and data science.
-Mentor junior data scientists and provide technical guidance.
-
-Qualifications:
-Masters or Ph.D. in Data Science, Computer Science, Statistics, or a related field.
-5+ years of experience in data science and machine learning.
-Proficient in Python, R, and SQL.
-Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).
-Strong problem-solving skills and attention to detail.
-
-Candidate CVs
-"""
-
 job_1 = """
 CV 1: Relevant
 Name: Dr. Emily Carter
@@ -195,7 +169,7 @@ async def main(enable_steps):
     # Step 2: Add text
     if enable_steps.get("add_text"):
-        text_list = [job_position, job_1, job_2, job_3, job_4, job_5]
+        text_list = [job_1, job_2, job_3, job_4, job_5]
 
         for text in text_list:
             await cognee.add(text)
             print(f"Added text: {text[:35]}...")
@@ -207,22 +181,20 @@
     # Step 4: Query insights
     if enable_steps.get("retriever"):
-        search_results = await two_step_retriever(
-            {'query': 'Which applicant has the most relevant experience in data science?'}
-        )
-
-        print("Search results:")
-        for result_text in search_results:
-            print(result_text)
+        await two_step_retriever('Who has a PhD?')
 
 
 if __name__ == '__main__':
     # Flags to enable/disable steps
+    rebuild_kg = False
+    retrieve = True
+
     steps_to_enable = {
-        "prune_data": False,
-        "prune_system": False,
-        "add_text": False,
-        "cognify": False,
-        "retriever": True
+        "prune_data": rebuild_kg,
+        "prune_system": rebuild_kg,
+        "add_text": rebuild_kg,
+        "cognify": rebuild_kg,
+        "retriever": retrieve
     }
 
     asyncio.run(main(steps_to_enable))
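
With the flag refactor, rebuilding the knowledge graph becomes a single toggle instead of four dictionary edits. For example, re-ingesting the CVs and then querying would look like this (same script, flags flipped):

rebuild_kg = True   # prune_data, prune_system, add_text and cognify all follow this switch
retrieve = True

steps_to_enable = {
    "prune_data": rebuild_kg,
    "prune_system": rebuild_kg,
    "add_text": rebuild_kg,
    "cognify": rebuild_kg,
    "retriever": retrieve
}

asyncio.run(main(steps_to_enable))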