feat: Implements basic global triplet-optimizing retrieval
commit a114d68aef (parent 980ae2b22c)
2 changed files with 63 additions and 48 deletions
@@ -13,6 +13,46 @@ from openai import organization
 from sympy.codegen.fnodes import dimension
 
 
+def format_triplets(edges):
+    def filter_attributes(obj, attributes):
+        """Helper function to filter out non-None properties, including nested dicts."""
+        print("\n\n\n")
+        result = {}
+        for attr in attributes:
+            value = getattr(obj, attr, None)
+            if value is not None:
+                # If the value is a dict, extract relevant keys from it
+                if isinstance(value, dict):
+                    nested_values = {k: v for k, v in value.items() if k in attributes and v is not None}
+                    result[attr] = nested_values
+                else:
+                    result[attr] = value
+        return result
+
+    triplets = []
+    for edge in edges:
+        node1 = edge.node1
+        node2 = edge.node2
+        edge_attributes = edge.attributes
+        node1_attributes = node1.attributes
+        node2_attributes = node2.attributes
+
+        # Filter only non-None properties
+        node1_info = {key: value for key, value in node1_attributes.items() if value is not None}
+        node2_info = {key: value for key, value in node2_attributes.items() if value is not None}
+        edge_info = {key: value for key, value in edge_attributes.items() if value is not None}
+
+        # Create the formatted triplet
+        triplet = (
+            f"Node1: {node1_info}\n"
+            f"Edge: {edge_info}\n"
+            f"Node2: {node2_info}\n\n\n"  # Add three blank lines for separation
+        )
+        triplets.append(triplet)
+
+    return "".join(triplets)
+
+
 async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
     if user is None:
         user = await get_default_user()
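
For context on the new helper: format_triplets only assumes that each edge exposes node1, node2, and attribute dicts, and the nested filter_attributes helper is defined but not yet called by the loop. A minimal sketch of how it behaves, using hypothetical stand-in classes (SimpleNode and SimpleEdge are illustrations, not part of the codebase):

    from dataclasses import dataclass, field

    @dataclass
    class SimpleNode:
        attributes: dict = field(default_factory=dict)

    @dataclass
    class SimpleEdge:
        node1: SimpleNode
        node2: SimpleNode
        attributes: dict = field(default_factory=dict)

    edge = SimpleEdge(
        node1=SimpleNode({"name": "Dr. Emily Carter", "type": "person", "community": None}),
        node2=SimpleNode({"name": "TechNova Solutions", "type": "company"}),
        attributes={"relationship_name": "applied_to"},
    )

    # None-valued attributes (here "community") are dropped by the comprehensions,
    # so each rendered triplet contains only populated properties.
    print(format_triplets([edge]))
    # Node1: {'name': 'Dr. Emily Carter', 'type': 'person'}
    # Edge: {'relationship_name': 'applied_to'}
    # Node2: {'name': 'TechNova Solutions', 'type': 'company'}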
@@ -25,7 +65,6 @@ async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
 
-    filtered_search_results = []
 
 
     return retrieved_results
 
 
@@ -55,7 +94,7 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> list:
     )
 
     ############################################# Quick fix until the vector db inconsistency is resolved
-    results_dict = delete_duplicated_vector_db_elements(collections, results)  # :TODO: Change when vector db is fixed
+    node_distances = delete_duplicated_vector_db_elements(collections, results)  # :TODO: Change when vector db is fixed
     # results_dict = {collection: result for collection, result in zip(collections, results)}
     ##############################################
 
 
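
The quick fix deduplicates the raw vector search output before it is consumed downstream. delete_duplicated_vector_db_elements itself is not shown in this diff; a plausible sketch, assuming each result row carries an id and a distance-like score (both field names are assumptions):

    def dedupe_vector_results(collections, results):
        """Keep one row per element id in each collection, preferring the
        smallest distance. Sketch only; the real helper may differ."""
        deduped = {}
        for collection, rows in zip(collections, results):
            best = {}
            for row in rows:
                kept = best.get(row.id)
                if kept is None or row.score < kept.score:
                    best[row.id] = row
            deduped[collection] = list(best.values())
        return deduped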
@@ -63,15 +102,19 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> list:
 
     await memory_fragment.project_graph_from_db(graph_engine,
                                                 node_properties_to_project=['id',
-                                                                            'community'],
+                                                                            'description',
+                                                                            'name',
+                                                                            'type',
+                                                                            'text'],
                                                 edge_properties_to_project=['id',
-                                                                            'relationship_name'],
-                                                directed=True,
-                                                node_dimension=1,
-                                                edge_dimension=1,
-                                                memory_fragment_filter=[])
+                                                                            'relationship_name'])
 
+    print()
+    await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances)
+
+    await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)  # :TODO: This should be coming from vector db
+
+    results = await memory_fragment.calculate_top_triplet_importances(k=5)
 
-    raise(NotImplementedError)
     print(format_triplets(results))
     print(f'Query was the following: {query}')
 
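
calculate_top_triplet_importances(k=5) is where the "global triplet optimizing" of the commit title happens; its body is not part of this diff. A self-contained toy of the likely idea, ranking (node1, edge, node2) triplets by the vector distances just mapped onto the projected graph (the additive scoring and the "vector_distance" attribute name are assumptions):

    import heapq

    def top_k_triplets(edges, k=5):
        """Toy ranking: sum the vector distances of the two endpoint nodes and
        the edge itself; a smaller total means more relevant to the query.
        The real calculate_top_triplet_importances may weight terms differently."""
        def score(edge):
            return (edge.node1.attributes.get("vector_distance", 1.0)
                    + edge.node2.attributes.get("vector_distance", 1.0)
                    + edge.attributes.get("vector_distance", 1.0))
        return heapq.nsmallest(k, edges, key=score)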
@@ -2,32 +2,6 @@ import cognee
 import asyncio
+from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever
 
-job_position = """0:Senior Data Scientist (Machine Learning)
-
-Company: TechNova Solutions
-Location: San Francisco, CA
-
-Job Description:
-
-TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.
-
-Responsibilities:
-
-Develop and implement advanced machine learning algorithms and models.
-Analyze large, complex datasets to extract meaningful patterns and insights.
-Collaborate with cross-functional teams to integrate predictive models into products.
-Stay updated with the latest advancements in machine learning and data science.
-Mentor junior data scientists and provide technical guidance.
-Qualifications:
-
-Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.
-5+ years of experience in data science and machine learning.
-Proficient in Python, R, and SQL.
-Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).
-Strong problem-solving skills and attention to detail.
-Candidate CVs
-"""
-
 job_1 = """
 CV 1: Relevant
 Name: Dr. Emily Carter
@@ -195,7 +169,7 @@ async def main(enable_steps):
 
     # Step 2: Add text
     if enable_steps.get("add_text"):
-        text_list = [job_position, job_1, job_2, job_3, job_4, job_5]
+        text_list = [job_1, job_2, job_3, job_4, job_5]
         for text in text_list:
             await cognee.add(text)
             print(f"Added text: {text[:35]}...")
@@ -207,22 +181,20 @@ async def main(enable_steps):
 
     # Step 4: Query insights
     if enable_steps.get("retriever"):
-        search_results = await two_step_retriever(
-            {'query': 'Which applicant has the most relevant experience in data science?'}
-        )
-        print("Search results:")
-        for result_text in search_results:
-            print(result_text)
+        await two_step_retriever('Who has Phd?')
 
 
 if __name__ == '__main__':
     # Flags to enable/disable steps
 
+    rebuild_kg = False
+    retrieve = True
     steps_to_enable = {
-        "prune_data": False,
-        "prune_system": False,
-        "add_text": False,
-        "cognify": False,
-        "retriever": True
+        "prune_data": rebuild_kg,
+        "prune_system": rebuild_kg,
+        "add_text": rebuild_kg,
+        "cognify": rebuild_kg,
+        "retriever": retrieve
     }
 
     asyncio.run(main(steps_to_enable))
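
The rebuild_kg / retrieve pair now drives the pipeline steps together instead of five hand-edited booleans. If the flags should be settable without editing the script, a minimal command-line variant (the flag names here are an assumption, not part of the commit):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--rebuild-kg", action="store_true",
                        help="re-run prune_data/prune_system/add_text/cognify")
    parser.add_argument("--no-retrieve", action="store_true",
                        help="skip the two-step retriever step")
    args = parser.parse_args()

    steps_to_enable = {
        "prune_data": args.rebuild_kg,
        "prune_system": args.rebuild_kg,
        "add_text": args.rebuild_kg,
        "cognify": args.rebuild_kg,
        "retriever": not args.no_retrieve,
    }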