feat: Implement basic global triplet-optimizing retrieval

hajdul88 2024-11-20 18:33:34 +01:00
parent 980ae2b22c
commit a114d68aef
2 changed files with 63 additions and 48 deletions

@@ -13,6 +13,46 @@ from openai import organization
 from sympy.codegen.fnodes import dimension
 
+def format_triplets(edges):
+    def filter_attributes(obj, attributes):
+        """Helper function to filter out non-None properties, including nested dicts."""
+        result = {}
+        for attr in attributes:
+            value = getattr(obj, attr, None)
+            if value is not None:
+                # If the value is a dict, keep only the relevant non-None keys from it
+                if isinstance(value, dict):
+                    nested_values = {k: v for k, v in value.items() if k in attributes and v is not None}
+                    result[attr] = nested_values
+                else:
+                    result[attr] = value
+        return result
+
+    triplets = []
+    for edge in edges:
+        node1 = edge.node1
+        node2 = edge.node2
+        edge_attributes = edge.attributes
+        node1_attributes = node1.attributes
+        node2_attributes = node2.attributes
+
+        # Keep only non-None properties
+        node1_info = {key: value for key, value in node1_attributes.items() if value is not None}
+        node2_info = {key: value for key, value in node2_attributes.items() if value is not None}
+        edge_info = {key: value for key, value in edge_attributes.items() if value is not None}
+
+        # Create the formatted triplet; three trailing newlines separate triplets
+        triplet = (
+            f"Node1: {node1_info}\n"
+            f"Edge: {edge_info}\n"
+            f"Node2: {node2_info}\n\n\n"
+        )
+        triplets.append(triplet)
+
+    return "".join(triplets)
+
 async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
     if user is None:
         user = await get_default_user()
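
For a sense of what the new helper emits: format_triplets only needs edge objects that expose node1, node2, and attributes dicts, so it can be exercised in isolation. A minimal sketch with hypothetical stub classes (the real node and edge types come from cognee's graph layer; format_triplets is assumed to be in scope):

from dataclasses import dataclass, field

@dataclass
class StubNode:
    attributes: dict = field(default_factory=dict)

@dataclass
class StubEdge:
    node1: StubNode
    node2: StubNode
    attributes: dict = field(default_factory=dict)

edge = StubEdge(
    node1=StubNode({"name": "Dr. Emily Carter", "type": "Person", "text": None}),
    node2=StubNode({"name": "TechNova Solutions", "type": "Company"}),
    attributes={"relationship_name": "applied_to"},
)

# None-valued properties (here "text") are dropped before formatting:
print(format_triplets([edge]))
# Node1: {'name': 'Dr. Emily Carter', 'type': 'Person'}
# Edge: {'relationship_name': 'applied_to'}
# Node2: {'name': 'TechNova Solutions', 'type': 'Company'}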
@@ -25,7 +65,6 @@ async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
-    filtered_search_results = []
 
     return retrieved_results
@@ -55,7 +94,7 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> list:
     )
 
     ############################################# Quick fix until the vector db inconsistency is fixed
-    results_dict = delete_duplicated_vector_db_elements(collections, results)  # TODO: Change when vector db is fixed
+    node_distances = delete_duplicated_vector_db_elements(collections, results)  # TODO: Change when vector db is fixed
     # results_dict = {collection: result for collection, result in zip(collections, results)}
     ##############################################
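
delete_duplicated_vector_db_elements itself is not part of this diff; a plausible sketch of the quick fix, assuming each collection's search result is a list of scored points carrying an id (the body below is an assumption, not the actual cognee implementation):

def delete_duplicated_vector_db_elements(collections, results):
    # Assumed behavior: keep the first occurrence of each element id per
    # collection and return a {collection: unique_results} mapping.
    deduplicated = {}
    for collection, result in zip(collections, results):
        seen_ids = set()
        unique_results = []
        for scored_point in result:
            if scored_point.id not in seen_ids:
                seen_ids.add(scored_point.id)
                unique_results.append(scored_point)
        deduplicated[collection] = unique_results
    return deduplicated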
@@ -63,15 +102,19 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> list:
     await memory_fragment.project_graph_from_db(graph_engine,
                                                 node_properties_to_project=['id',
-                                                                            'community'],
+                                                                            'description',
+                                                                            'name',
+                                                                            'type',
+                                                                            'text'],
                                                 edge_properties_to_project=['id',
-                                                                            'relationship_name'],
-                                                directed=True,
-                                                node_dimension=1,
-                                                edge_dimension=1,
-                                                memory_fragment_filter=[])
 
+    await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances)
+    await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)  # TODO: This should be coming from vector db
+    results = await memory_fragment.calculate_top_triplet_importances(k=5)
 
-    raise(NotImplementedError)
+    print(format_triplets(results))
+    print(f'Query was the following: {query}')
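
Read end to end, this hunk replaces the old NotImplementedError stub with a vector-then-graph pipeline. Condensed, the new flow inside the retriever coroutine is (statements taken from this diff; the surrounding setup is assumed):

# 1. Deduplicated vector search results become per-node distances.
node_distances = delete_duplicated_vector_db_elements(collections, results)

# 2. Project a lightweight graph fragment from the graph db.
await memory_fragment.project_graph_from_db(
    graph_engine,
    node_properties_to_project=['id', 'description', 'name', 'type', 'text'],
    edge_properties_to_project=['id', 'relationship_name'])

# 3. Attach vector distances to the projected nodes and edges.
await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances)
await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)

# 4. Keep the five most important triplets and print them.
results = await memory_fragment.calculate_top_triplet_importances(k=5)
print(format_triplets(results))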

@@ -2,32 +2,6 @@ import cognee
 import asyncio
 
 from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever
 
-job_position = """0:Senior Data Scientist (Machine Learning)
-Company: TechNova Solutions
-Location: San Francisco, CA
-
-Job Description:
-TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.
-
-Responsibilities:
-Develop and implement advanced machine learning algorithms and models.
-Analyze large, complex datasets to extract meaningful patterns and insights.
-Collaborate with cross-functional teams to integrate predictive models into products.
-Stay updated with the latest advancements in machine learning and data science.
-Mentor junior data scientists and provide technical guidance.
-
-Qualifications:
-Masters or Ph.D. in Data Science, Computer Science, Statistics, or a related field.
-5+ years of experience in data science and machine learning.
-Proficient in Python, R, and SQL.
-Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).
-Strong problem-solving skills and attention to detail.
-
-Candidate CVs
-"""
-
 job_1 = """
 CV 1: Relevant
 Name: Dr. Emily Carter
@@ -195,7 +169,7 @@ async def main(enable_steps):
     # Step 2: Add text
     if enable_steps.get("add_text"):
-        text_list = [job_position, job_1, job_2, job_3, job_4, job_5]
+        text_list = [job_1, job_2, job_3, job_4, job_5]
 
         for text in text_list:
             await cognee.add(text)
             print(f"Added text: {text[:35]}...")
@@ -207,22 +181,20 @@
     # Step 4: Query insights
     if enable_steps.get("retriever"):
-        search_results = await two_step_retriever(
-            {'query': 'Which applicant has the most relevant experience in data science?'}
-        )
-
-        print("Search results:")
-        for result_text in search_results:
-            print(result_text)
+        await two_step_retriever('Who has a PhD?')
 
 
 if __name__ == '__main__':
     # Flags to enable/disable steps
+    rebuild_kg = False
+    retrieve = True
+
     steps_to_enable = {
-        "prune_data": False,
-        "prune_system": False,
-        "add_text": False,
-        "cognify": False,
-        "retriever": True
+        "prune_data": rebuild_kg,
+        "prune_system": rebuild_kg,
+        "add_text": rebuild_kg,
+        "cognify": rebuild_kg,
+        "retriever": retrieve
     }
 
     asyncio.run(main(steps_to_enable))
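
With the flag refactor, rebuilding the knowledge graph becomes a single toggle instead of four dictionary edits. For example, re-ingesting the CVs and then querying would look like this (same script, flags flipped):

rebuild_kg = True   # prune_data, prune_system, add_text and cognify all follow this switch
retrieve = True

steps_to_enable = {
    "prune_data": rebuild_kg,
    "prune_system": rebuild_kg,
    "add_text": rebuild_kg,
    "cognify": rebuild_kg,
    "retriever": retrieve
}

asyncio.run(main(steps_to_enable))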