feat: Implement basic global triplet-optimizing retrieval
This commit is contained in:
parent
980ae2b22c
commit
a114d68aef
2 changed files with 63 additions and 48 deletions
|
|
@ -13,6 +13,46 @@ from openai import organization
|
||||||
from sympy.codegen.fnodes import dimension
|
from sympy.codegen.fnodes import dimension
|
||||||
|
|
||||||
|
|
||||||
|
def format_triplets(edges):
    """Render graph edges as human-readable triplet strings.

    Each edge becomes a "Node1 / Edge / Node2" text block built from only
    the attributes whose values are not None; consecutive blocks are
    separated by three newlines for readability.

    Args:
        edges: Iterable of edge objects exposing ``node1``, ``node2`` and an
            ``attributes`` dict; each node exposes an ``attributes`` dict.
            (Assumed from usage here — confirm against the graph engine's
            edge type.)

    Returns:
        A single string containing every formatted triplet, or ``""`` when
        ``edges`` is empty.
    """
    # NOTE: the original version carried an unused nested helper
    # (`filter_attributes`) with a leftover debug print; it was never
    # called, so removing it does not change behavior.
    triplets = []
    for edge in edges:
        # Keep only attributes that carry a value; None entries are noise
        # in the rendered output.
        node1_info = {k: v for k, v in edge.node1.attributes.items() if v is not None}
        node2_info = {k: v for k, v in edge.node2.attributes.items() if v is not None}
        edge_info = {k: v for k, v in edge.attributes.items() if v is not None}

        # Trailing "\n\n\n" yields three blank lines between triplets.
        triplets.append(
            f"Node1: {node1_info}\n"
            f"Edge: {edge_info}\n"
            f"Node2: {node2_info}\n\n\n"
        )
    return "".join(triplets)
|
||||||
|
|
||||||
|
|
||||||
async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
|
async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
|
||||||
if user is None:
|
if user is None:
|
||||||
user = await get_default_user()
|
user = await get_default_user()
|
||||||
|
|
@ -25,7 +65,6 @@ async def two_step_retriever(query: Dict[str, str], user: User = None) -> list:
|
||||||
|
|
||||||
filtered_search_results = []
|
filtered_search_results = []
|
||||||
|
|
||||||
|
|
||||||
return retrieved_results
|
return retrieved_results
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -55,7 +94,7 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> lis
|
||||||
)
|
)
|
||||||
|
|
||||||
############################################# This part is a quick fix until we fix the vector db inconsistency
|
############################################# This part is a quick fix until we fix the vector db inconsistency
|
||||||
results_dict = delete_duplicated_vector_db_elements(collections, results)# :TODO: Change when vector db is fixed
|
node_distances = delete_duplicated_vector_db_elements(collections, results)# :TODO: Change when vector db is fixed
|
||||||
# results_dict = {collection: result for collection, result in zip(collections, results)}
|
# results_dict = {collection: result for collection, result in zip(collections, results)}
|
||||||
##############################################
|
##############################################
|
||||||
|
|
||||||
|
|
@ -63,15 +102,19 @@ async def run_two_step_retriever(query: str, user, community_filter = []) -> lis
|
||||||
|
|
||||||
await memory_fragment.project_graph_from_db(graph_engine,
|
await memory_fragment.project_graph_from_db(graph_engine,
|
||||||
node_properties_to_project=['id',
|
node_properties_to_project=['id',
|
||||||
'community'],
|
'description',
|
||||||
|
'name',
|
||||||
|
'type',
|
||||||
|
'text'],
|
||||||
edge_properties_to_project=['id',
|
edge_properties_to_project=['id',
|
||||||
'relationship_name'],
|
'relationship_name'])
|
||||||
directed=True,
|
|
||||||
node_dimension=1,
|
|
||||||
edge_dimension=1,
|
|
||||||
memory_fragment_filter=[])
|
|
||||||
|
|
||||||
print()
|
await memory_fragment.map_vector_distances_to_graph_nodes(node_distances=node_distances)
|
||||||
|
|
||||||
|
await memory_fragment.map_vector_distances_to_graph_edges(vector_engine, query)# :TODO: This should be coming from vector db
|
||||||
|
|
||||||
|
results = await memory_fragment.calculate_top_triplet_importances(k=5)
|
||||||
|
|
||||||
|
|
||||||
raise(NotImplementedError)
|
print(format_triplets(results))
|
||||||
|
print(f'Query was the following:{query}' )
|
||||||
|
|
|
||||||
|
|
@ -2,32 +2,6 @@ import cognee
|
||||||
import asyncio
|
import asyncio
|
||||||
from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever
|
from cognee.pipelines.retriever.two_steps_retriever import two_step_retriever
|
||||||
|
|
||||||
job_position = """0:Senior Data Scientist (Machine Learning)
|
|
||||||
|
|
||||||
Company: TechNova Solutions
|
|
||||||
Location: San Francisco, CA
|
|
||||||
|
|
||||||
Job Description:
|
|
||||||
|
|
||||||
TechNova Solutions is seeking a Senior Data Scientist specializing in Machine Learning to join our dynamic analytics team. The ideal candidate will have a strong background in developing and deploying machine learning models, working with large datasets, and translating complex data into actionable insights.
|
|
||||||
|
|
||||||
Responsibilities:
|
|
||||||
|
|
||||||
Develop and implement advanced machine learning algorithms and models.
|
|
||||||
Analyze large, complex datasets to extract meaningful patterns and insights.
|
|
||||||
Collaborate with cross-functional teams to integrate predictive models into products.
|
|
||||||
Stay updated with the latest advancements in machine learning and data science.
|
|
||||||
Mentor junior data scientists and provide technical guidance.
|
|
||||||
Qualifications:
|
|
||||||
|
|
||||||
Master’s or Ph.D. in Data Science, Computer Science, Statistics, or a related field.
|
|
||||||
5+ years of experience in data science and machine learning.
|
|
||||||
Proficient in Python, R, and SQL.
|
|
||||||
Experience with deep learning frameworks (e.g., TensorFlow, PyTorch).
|
|
||||||
Strong problem-solving skills and attention to detail.
|
|
||||||
Candidate CVs
|
|
||||||
"""
|
|
||||||
|
|
||||||
job_1 = """
|
job_1 = """
|
||||||
CV 1: Relevant
|
CV 1: Relevant
|
||||||
Name: Dr. Emily Carter
|
Name: Dr. Emily Carter
|
||||||
|
|
@ -195,7 +169,7 @@ async def main(enable_steps):
|
||||||
|
|
||||||
# Step 2: Add text
|
# Step 2: Add text
|
||||||
if enable_steps.get("add_text"):
|
if enable_steps.get("add_text"):
|
||||||
text_list = [job_position, job_1, job_2, job_3, job_4, job_5]
|
text_list = [job_1, job_2, job_3, job_4, job_5]
|
||||||
for text in text_list:
|
for text in text_list:
|
||||||
await cognee.add(text)
|
await cognee.add(text)
|
||||||
print(f"Added text: {text[:35]}...")
|
print(f"Added text: {text[:35]}...")
|
||||||
|
|
@ -207,22 +181,20 @@ async def main(enable_steps):
|
||||||
|
|
||||||
# Step 4: Query insights
|
# Step 4: Query insights
|
||||||
if enable_steps.get("retriever"):
|
if enable_steps.get("retriever"):
|
||||||
search_results = await two_step_retriever(
|
await two_step_retriever('Who has Phd?')
|
||||||
{'query': 'Which applicant has the most relevant experience in data science?'}
|
|
||||||
)
|
|
||||||
print("Search results:")
|
|
||||||
for result_text in search_results:
|
|
||||||
print(result_text)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# Flags to enable/disable steps
|
# Flags to enable/disable steps
|
||||||
|
|
||||||
|
rebuild_kg = False
|
||||||
|
retrieve = True
|
||||||
steps_to_enable = {
|
steps_to_enable = {
|
||||||
"prune_data": False,
|
"prune_data": rebuild_kg,
|
||||||
"prune_system": False,
|
"prune_system": rebuild_kg,
|
||||||
"add_text": False,
|
"add_text": rebuild_kg,
|
||||||
"cognify": False,
|
"cognify": rebuild_kg,
|
||||||
"retriever": True
|
"retriever": retrieve
|
||||||
}
|
}
|
||||||
|
|
||||||
asyncio.run(main(steps_to_enable))
|
asyncio.run(main(steps_to_enable))
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue