feat: implements graph edge indexing

2024-12-04 15:37:48 +01:00 · 2024-12-04 15:37:48 +01:00 · c20ee11e80
commit c20ee11e80
parent 46ee513f6c
3 changed files with 84 additions and 0 deletions
--- a/cognee/api/v1/cognify/cognify_v2.py
+++ b/cognee/api/v1/cognify/cognify_v2.py
@ -18,6 +18,7 @@ from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline
 from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
 from cognee.tasks.graph import extract_graph_from_data
 from cognee.tasks.storage import add_data_points
+from cognee.tasks.storage.index_graph_edges import index_graph_edges
 from cognee.tasks.summarization import summarize_text

 logger = logging.getLogger("cognify.v2")
@ -94,6 +95,8 @@ async def run_cognify_pipeline(dataset: Dataset, user: User):
        async for result in pipeline:
            print(result)

+        await index_graph_edges()
+
        send_telemetry("cognee.cognify EXECUTION COMPLETED", user.id)

        await log_pipeline_status(dataset_id, PipelineRunStatus.DATASET_PROCESSING_COMPLETED, {
--- a/cognee/modules/graph/models/EdgeType.py
+++ b/cognee/modules/graph/models/EdgeType.py
@ -0,0 +1,11 @@
+from typing import Optional
+from cognee.infrastructure.engine import DataPoint
+
+class EdgeType(DataPoint):  # One relationship type found among the graph's edges.
+    __tablename__ = "edge_type"  # Passed to the vector engine as the index's table name.
+    relationship_name: str  # The edge label counted by index_graph_edges.
+    number_of_edges: int  # How many edges carry that label.
+
+    _metadata: Optional[dict] = {
+        "index_fields": ["relationship_name"],  # Fields index_graph_edges indexes.
+    }
--- a/cognee/tasks/storage/index_graph_edges.py
+++ b/cognee/tasks/storage/index_graph_edges.py
@ -0,0 +1,70 @@
+import logging
+from collections import Counter
+
+from cognee.infrastructure.databases.vector import get_vector_engine
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.modules.graph.models.EdgeType import EdgeType
+
+
+async def index_graph_edges():
+    """Index the relationship types of the graph's edges in the vector engine.
+
+    Reads every edge from the graph engine, counts how often each
+    `relationship_name` occurs, wraps each distinct name and its count in an
+    `EdgeType` data point, and indexes those points on the fields listed in
+    `EdgeType._metadata["index_fields"]`.
+
+    Steps:
+    1. Obtain the vector engine and the graph engine.
+    2. Fetch the graph data and count edges per `relationship_name`.
+    3. Create one vector index per (table, field) pair, once per call.
+    4. Build an independent `EdgeType` copy per indexed field.
+    5. Send each group of copies to the vector engine for indexing.
+
+    Raises:
+        RuntimeError: If either engine cannot be obtained.
+
+    Returns:
+        None
+    """
+    try:
+        vector_engine = get_vector_engine()
+        graph_engine = await get_graph_engine()
+    except Exception as e:
+        logging.error("Failed to initialize engines: %s", e)
+        raise RuntimeError("Initialization error") from e
+
+    _, edges_data = await graph_engine.get_graph_data()
+
+    # Each edge is iterated and only its dict members that carry a
+    # `relationship_name` key contribute to the count.
+    edge_types = Counter(
+        item["relationship_name"]
+        for edge in edges_data
+        for item in edge
+        if isinstance(item, dict) and "relationship_name" in item
+    )
+
+    # Keyed by (table name, field name); a tuple key avoids joining the two
+    # with "." and splitting later, which breaks if either contains a dot.
+    index_points = {}
+
+    for text, count in edge_types.items():
+        edge = EdgeType(relationship_name=text, number_of_edges=count)
+        table_name = type(edge).__tablename__
+
+        for field_name in edge._metadata["index_fields"]:
+            index_key = (table_name, field_name)
+
+            if index_key not in index_points:
+                await vector_engine.create_vector_index(table_name, field_name)
+                index_points[index_key] = []
+
+            # Deep copy: a shallow model_copy() reuses the same `_metadata` dict,
+            # so the assignment below would leak into `edge` and earlier copies.
+            indexed_data_point = edge.model_copy(deep=True)
+            indexed_data_point._metadata["index_fields"] = [field_name]
+            index_points[index_key].append(indexed_data_point)
+
+    for (table_name, field_name), indexable_points in index_points.items():
+        await vector_engine.index_data_points(table_name, field_name, indexable_points)