feature: solve edge embedding duplicates in edge collection + retriever optimization (#1151)
<!-- .github/pull_request_template.md --> ## Description feature: solve edge embedding duplicates in edge collection + retriever optimization ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
This commit is contained in:
parent
4ea4b100ab
commit
f78af0cec3
3 changed files with 18 additions and 19 deletions
5
cognee/modules/engine/utils/generate_edge_id.py
Normal file
5
cognee/modules/engine/utils/generate_edge_id.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
from uuid import NAMESPACE_OID, UUID, uuid5
|
||||
|
||||
|
||||
def generate_edge_id(edge_id: str) -> UUID:
    """Derive a deterministic UUID for an edge from its relationship name.

    The input is normalized (lowercased, spaces replaced with underscores,
    apostrophes stripped) so that variants of the same relationship name
    ("Works At", "works at") map to the same id, preventing duplicate edge
    embeddings in the edge collection.

    Args:
        edge_id: The edge's relationship name / identifier string.

    Returns:
        A stable UUIDv5 in the OID namespace. Note: ``uuid5`` returns a
        ``uuid.UUID`` object, not a ``str`` — the annotation reflects that.
    """
    normalized = edge_id.lower().replace(" ", "_").replace("'", "")
    return uuid5(NAMESPACE_OID, normalized)
|
||||
|
|
@ -170,28 +170,19 @@ class CogneeGraph(CogneeAbstractGraph):
|
|||
|
||||
for edge in self.edges:
|
||||
relationship_type = edge.attributes.get("relationship_type")
|
||||
if relationship_type and relationship_type in embedding_map:
|
||||
edge.attributes["vector_distance"] = embedding_map[relationship_type]
|
||||
distance = embedding_map.get(relationship_type, None)
|
||||
if distance is not None:
|
||||
edge.attributes["vector_distance"] = distance
|
||||
|
||||
except Exception as ex:
|
||||
logger.error(f"Error mapping vector distances to edges: {str(ex)}")
|
||||
raise ex
|
||||
|
||||
async def calculate_top_triplet_importances(self, k: int) -> List:
|
||||
min_heap = []
|
||||
def score(edge):
|
||||
n1 = edge.node1.attributes.get("vector_distance", 1)
|
||||
n2 = edge.node2.attributes.get("vector_distance", 1)
|
||||
e = edge.attributes.get("vector_distance", 1)
|
||||
return n1 + n2 + e
|
||||
|
||||
for i, edge in enumerate(self.edges):
|
||||
source_node = self.get_node(edge.node1.id)
|
||||
target_node = self.get_node(edge.node2.id)
|
||||
|
||||
source_distance = source_node.attributes.get("vector_distance", 1) if source_node else 1
|
||||
target_distance = target_node.attributes.get("vector_distance", 1) if target_node else 1
|
||||
edge_distance = edge.attributes.get("vector_distance", 1)
|
||||
|
||||
total_distance = source_distance + target_distance + edge_distance
|
||||
|
||||
heapq.heappush(min_heap, (-total_distance, i, edge))
|
||||
if len(min_heap) > k:
|
||||
heapq.heappop(min_heap)
|
||||
|
||||
return [edge for _, _, edge in sorted(min_heap)]
|
||||
return heapq.nsmallest(k, self.edges, key=score)
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
|
||||
from cognee.shared.logging_utils import get_logger, ERROR
|
||||
from collections import Counter
|
||||
|
||||
|
|
@ -49,7 +50,9 @@ async def index_graph_edges(batch_size: int = 1024):
|
|||
)
|
||||
|
||||
for text, count in edge_types.items():
|
||||
edge = EdgeType(relationship_name=text, number_of_edges=count)
|
||||
edge = EdgeType(
|
||||
id=generate_edge_id(edge_id=text), relationship_name=text, number_of_edges=count
|
||||
)
|
||||
data_point_type = type(edge)
|
||||
|
||||
for field_name in edge.metadata["index_fields"]:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue