feat: Implement basic global triplet-optimizing retrieval
This commit is contained in:
parent
980ae2b22c
commit
a114d68aef
2 changed files with 63 additions and 48 deletions
|
|
@ -13,6 +13,46 @@ from openai import organization
|
||||||
from sympy.codegen.fnodes import dimension
|
from sympy.codegen.fnodes import dimension
|
||||||
|
|
||||||
|
|
||||||
|
def format_triplets(edges):
    """Render graph edges as human-readable triplet strings.

    Each edge becomes a "Node1 / Edge / Node2" text block built from only
    the attributes whose values are not None; consecutive blocks are
    separated by three newlines for readability.

    Args:
        edges: Iterable of edge objects exposing ``node1``, ``node2`` and an
            ``attributes`` dict; each node exposes an ``attributes`` dict.
            (Assumed from usage here — confirm against the graph engine's
            edge type.)

    Returns:
        A single string containing every formatted triplet, or ``""`` when
        ``edges`` is empty.
    """
    # NOTE: the original version carried an unused nested helper
    # (`filter_attributes`) with a leftover debug print; it was never
    # called, so removing it does not change behavior.
    triplets = []
    for edge in edges:
        # Keep only attributes that carry a value; None entries are noise
        # in the rendered output.
        node1_info = {k: v for k, v in edge.node1.attributes.items() if v is not None}
        node2_info = {k: v for k, v in edge.node2.attributes.items() if v is not None}
        edge_info = {k: v for k, v in edge.attributes.items() if v is not None}

        # Trailing "\n\n\n" yields three blank lines between triplets.
        triplets.append(
            f"Node1: {node1_info}\n"
            f"Edge: {edge_info}\n"
            f"Node2: {node2_info}\n\n\n"
        )
    return "".join(triplets)
|
||||||
|
|
||||||
|
|
||||||
async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
|
async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
|
||||||
if user is None:
|
if user is None:
|
||||||
user = await get_default_user()
|
user = await get_default_user()
|
||||||
|
|
@ -25,7 +65,6 @@ async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
|
||||||
|
|
||||||
filtered_search_results = []
|
filtered_search_results = []
|
||||||
|
|
||||||
|
|
||||||
return retrieved_results
|
return retrieved_results
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -55,7 +94,7 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> lis
|
||||||
)
|
)
|
||||||
|
|
||||||
############################################# This part is a quick fix until we fix the vector db inconsistency
|
############################################# This part is a quick fix until we fix the vector db inconsistency
|
||||||
results_dict = delete_duplicated_vector_db_elements(collections, results)# :TODO: Change when vector db is fixed
|
node_distances = delete_duplicated_vector_db_elements(collections, results)# :TODO: Change when vector db is fixed
|
||||||
# results_dict = {collection: result for collection, result in zip(collections, results)}
|
# results_dict = {collection: result for collection, result in zip(collections, results)}
|
||||||
##############################################
|
##############################################
|
||||||
|
|
||||||
|
|
@ -63,15 +102,19 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> lis
|
||||||
|
|
||||||
await memory_fragment.project_graph_from_db(graph_engine,
|
await memory_fragment.project_graph_from_db(graph_engine,
|
||||||
node_properties_to_project=['id',
|
node_properties_to_project=['id',
|
||||||
'community'],
|
'description',
|
||||||
|
'name',
|
||||||
|
'type',
|
||||||
|
'text'],
|
||||||
edge_properties_to_project=['id',
|
edge_properties_to_project=['id',
|
||||||
'relationship_name'],
|
'relationship_name'])
|
||||||
directed=True,
|
|
||||||
node_dimension=1,
|
|
||||||
edge_dimension=1,
|
|
||||||
memory_fragment_filter=[])
|
|
||||||
|
|
||||||
print()
|
await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances)
|
||||||
|
|
||||||
|
await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)# :TODO: This should be coming from vector db
|
||||||
|
|
||||||
|
results = await memory_fragment.calculate_top_triplet_importances(k=5)
|
||||||
|
|
||||||
|
|
||||||
raise(NotImplementedError)
|
print(format_triplets(results))
|
||||||
|
print(f'Query was the following:{query}' )
|
||||||
|
|
|
||||||
|
|
@ -2,32 +2,6 @@ import cognee
|
||||||
import asyncio
|
import asyncio
|
||||||
from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever
|
from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever
|
||||||
|
|
||||||
job_position = """0:Senior Data Scientist (Machine Learning)
|
|
||||||
|
|
||||||
Company: TechNova Solutions
|
|
||||||
Location: San Francisco, CA
|
|
||||||
|
|
||||||
Job Description:
|
|
||||||
|
|
||||||
TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.
|
|
||||||
|
|
||||||
Responsibilities:
|
|
||||||
|
|
||||||
Develop and implement advanced machine learning algorithms and models.
|
|
||||||
Analyze large, complex datasets to extract meaningful patterns and insights.
|
|
||||||
Collaborate with cross-functional teams to integrate predictive models into products.
|
|
||||||
Stay updated with the latest advancements in machine learning and data science.
|
|
||||||
Mentor junior data scientists and provide technical guidance.
|
|
||||||
Qualifications:
|
|
||||||
|
|
||||||
Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.
|
|
||||||
5+ years of experience in data science and machine learning.
|
|
||||||
Proficient in Python, R, and SQL.
|
|
||||||
Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).
|
|
||||||
Strong problem-solving skills and attention to detail.
|
|
||||||
Candidate CVs
|
|
||||||
"""
|
|
||||||
|
|
||||||
job_1 = """
|
job_1 = """
|
||||||
CV 1: Relevant
|
CV 1: Relevant
|
||||||
Name: Dr. Emily Carter
|
Name: Dr. Emily Carter
|
||||||
|
|
@ -195,7 +169,7 @@ async def main(enable_steps):
|
||||||
|
|
||||||
# Step 2: Add text
|
# Step 2: Add text
|
||||||
if enable_steps.get("add_text"):
|
if enable_steps.get("add_text"):
|
||||||
text_list = [job_position, job_1, job_2, job_3, job_4, job_5]
|
text_list = [job_1, job_2, job_3, job_4, job_5]
|
||||||
for text in text_list:
|
for text in text_list:
|
||||||
await cognee.add(text)
|
await cognee.add(text)
|
||||||
print(f"Added text: {text[:35]}...")
|
print(f"Added text: {text[:35]}...")
|
||||||
|
|
@ -207,22 +181,20 @@ async def main(enable_steps):
|
||||||
|
|
||||||
# Step 4: Query insights
|
# Step 4: Query insights
|
||||||
if enable_steps.get("retriever"):
|
if enable_steps.get("retriever"):
|
||||||
search_results = await two_step_retriever(
|
await two_step_retriever('Who has Phd?')
|
||||||
{'query': 'Which applicant has the most relevant experience in data science?'}
|
|
||||||
)
|
|
||||||
print("Search results:")
|
|
||||||
for result_text in search_results:
|
|
||||||
print(result_text)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# Flags to enable/disable steps
|
# Flags to enable/disable steps
|
||||||
|
|
||||||
|
rebuild_kg = False
|
||||||
|
retrieve = True
|
||||||
steps_to_enable = {
|
steps_to_enable = {
|
||||||
"prune_data": False,
|
"prune_data": rebuild_kg,
|
||||||
"prune_system": False,
|
"prune_system": rebuild_kg,
|
||||||
"add_text": False,
|
"add_text": rebuild_kg,
|
||||||
"cognify": False,
|
"cognify": rebuild_kg,
|
||||||
"retriever": True
|
"retriever": retrieve
|
||||||
}
|
}
|
||||||
|
|
||||||
asyncio.run(main(steps_to_enable))
|
asyncio.run(main(steps_to_enable))
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue