From 40142b4789197e7f2a96263e551cd70e6aa70670 Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Tue, 15 Apr 2025 16:22:21 +0200
Subject: [PATCH 1/2] Delete cognee/modules/graph/README.md

---
 cognee/modules/graph/README.md | 106 ---------------------------------
 1 file changed, 106 deletions(-)
 delete mode 100644 cognee/modules/graph/README.md

diff --git a/cognee/modules/graph/README.md b/cognee/modules/graph/README.md
deleted file mode 100644
index bdf4df94f..000000000
--- a/cognee/modules/graph/README.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# Layered Knowledge Graph
-
-This module provides a simplified implementation of a layered knowledge graph, which allows organizing nodes and edges into hierarchical layers.
-
-## Features
-
-- **Hierarchical Layer Structure**: Organize your graph into layers with parent-child relationships
-- **Cumulative Views**: Access nodes and edges from a layer and all its parent layers
-- **Adapter-based Design**: Connect to different database backends using adapter pattern
-- **NetworkX Integration**: Built-in support for NetworkX graph database
-- **Type Safety**: Pydantic models ensure type safety and data validation
-- **Async API**: All methods are async for better performance
-
-## Components
-
-- **GraphNode**: A node in the graph with a name, type, properties, and metadata
-- **GraphEdge**: An edge connecting two nodes with an edge type, properties, and metadata
-- **GraphLayer**: A layer in the graph that can contain nodes and edges, and can have parent layers
-- **LayeredKnowledgeGraph**: The main graph class that manages layers, nodes, and edges
-
-## Usage Example
-
-```python
-import asyncio
-from uuid import UUID
-from cognee.modules.graph.simplified_layered_graph import LayeredKnowledgeGraph
-from cognee.modules.graph.enhanced_layered_graph_adapter import LayeredGraphDBAdapter
-from cognee.infrastructure.databases.graph.networkx.adapter import NetworkXAdapter
-
-async def main():
-    # Initialize adapter
-    adapter = NetworkXAdapter(filename="graph.pkl")
-    await adapter.create_empty_graph("graph.pkl")
-    
-    # Create graph
-    graph = LayeredKnowledgeGraph.create_empty("My Knowledge Graph")
-    graph.set_adapter(LayeredGraphDBAdapter(adapter))
-    
-    # Add layers with parent-child relationships
-    base_layer = await graph.add_layer(
-        name="Base Layer", 
-        description="Foundation concepts",
-        layer_type="base"
-    )
-    
-    derived_layer = await graph.add_layer(
-        name="Derived Layer",
-        description="Concepts built upon the base layer",
-        layer_type="derived",
-        parent_layers=[base_layer.id]  # Parent-child relationship
-    )
-    
-    # Add nodes to layers
-    node1 = await graph.add_node(
-        name="Concept A",
-        node_type="concept",
-        properties={"importance": "high"},
-        layer_id=base_layer.id
-    )
-    
-    node2 = await graph.add_node(
-        name="Concept B",
-        node_type="concept",
-        properties={"importance": "medium"},
-        layer_id=derived_layer.id
-    )
-    
-    # Connect nodes with an edge
-    edge = await graph.add_edge(
-        source_id=node1.id,
-        target_id=node2.id,
-        edge_type="RELATES_TO",
-        properties={"strength": "high"},
-        layer_id=derived_layer.id
-    )
-    
-    # Get cumulative view (including parent layers)
-    nodes, edges = await graph.get_cumulative_layer_graph(derived_layer.id)
-    
-    print(f"Nodes in cumulative view: {[n.name for n in nodes]}")
-    print(f"Edges in cumulative view: {[e.edge_type for e in edges]}")
-
-if __name__ == "__main__":
-    asyncio.run(main())
-```
-
-## Design Improvements
-
-The simplified layered graph implementation offers several improvements over the previous approach:
-
-1. **Clear Separation of Concerns**: In-memory operations vs. database operations
-2. **More Intuitive API**: Methods have clear, consistent signatures
-3. **Better Error Handling**: Comprehensive validation and error reporting
-4. **Enhanced Debugging**: Detailed logging throughout
-5. **Improved Caching**: Local caches reduce database load
-6. **Method Naming Consistency**: All methods follow consistent naming conventions
-7. **Reduced Complexity**: Simpler implementation with equivalent functionality
-
-## Best Practices
-
-- Always use the adapter pattern for database operations
-- Use the provided factory methods for creating nodes and edges
-- Leverage parent-child relationships for organizing related concepts
-- Utilize cumulative views to access inherited nodes and edges
-- Consider layer types for additional semantic meaning
-- Use properties and metadata for storing additional information 
\ No newline at end of file

From b2a53b41243e6530c5012e05f889dc717f11274b Mon Sep 17 00:00:00 2001
From: lxobr <122801072+lxobr@users.noreply.github.com>
Date: Thu, 17 Apr 2025 17:10:42 +0200
Subject: [PATCH 2/2] Add nodesets datapoints (#755)

<!-- .github/pull_request_template.md -->

## Description
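<!-- Provide a clear description of the changes in this PR -->
Replaces the `NodeSet: Optional[List[str]]` field on `DataPoint` with
`belongs_to_set`, a list of data points. Adds a `NodeSet` data point model,
builds `NodeSet` instances in `classify_documents` from the `node_set` list in
each document's `external_metadata`, and copies them onto every chunk in
`extract_chunks_from_documents`. The `apply_node_set` task is commented out of
the default cognify tasks, and `examples/python/simple_node_set_example.py` is
added.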

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
---
 cognee/api/v1/cognify/cognify.py              |  2 +-
 .../infrastructure/engine/models/DataPoint.py |  4 +-
 .../processing/document_types/Document.py     |  2 +-
 cognee/modules/engine/models/node_set.py      |  8 ++++
 cognee/tasks/documents/classify_documents.py  | 29 ++++++++++++++
 .../extract_chunks_from_documents.py          |  1 +
 examples/python/simple_node_set_example.py    | 38 +++++++++++++++++++
 7 files changed, 81 insertions(+), 3 deletions(-)
 create mode 100644 cognee/modules/engine/models/node_set.py
 create mode 100644 examples/python/simple_node_set_example.py

diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py
index 0c4487648..971bd0f51 100644
--- a/cognee/api/v1/cognify/cognify.py
+++ b/cognee/api/v1/cognify/cognify.py
@@ -140,7 +140,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
             task_config={"batch_size": 10},
         ),
         Task(add_data_points, task_config={"batch_size": 10}),
-        Task(apply_node_set, task_config={"batch_size": 10}),  # Apply NodeSet values and create set nodes
+        # Task(apply_node_set, task_config={"batch_size": 10}),  # Apply NodeSet values and create set nodes
     ]
 
     return default_tasks
diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py
index 3abadd51c..76e4fbb6e 100644
--- a/cognee/infrastructure/engine/models/DataPoint.py
+++ b/cognee/infrastructure/engine/models/DataPoint.py
@@ -27,7 +27,9 @@ class DataPoint(BaseModel):
     topological_rank: Optional[int] = 0
     metadata: Optional[MetaData] = {"index_fields": []}
     type: str = Field(default_factory=lambda: DataPoint.__name__)
-    NodeSet: Optional[List[str]] = None  # List of nodes this data point is associated with
+    belongs_to_set: Optional[List["DataPoint"]] = (
+        None  # List of nodesets this data point belongs to
+    )
 
     def __init__(self, **data):
         super().__init__(**data)
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index c75203231..af1be84c2 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, List
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.chunking.Chunker import Chunker
 
diff --git a/cognee/modules/engine/models/node_set.py b/cognee/modules/engine/models/node_set.py
new file mode 100644
index 000000000..33fe3f557
--- /dev/null
+++ b/cognee/modules/engine/models/node_set.py
@@ -0,0 +1,8 @@
+from cognee.infrastructure.engine import DataPoint
+
+
+class NodeSet(DataPoint):
+    """Named set data point; instances are stored in a data point's ``belongs_to_set``."""
+
+    name: str
+    metadata: dict = {"index_fields": ["name"]}  # lists "name" under "index_fields"
diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py
index 7211ebced..f745c2cf3 100644
--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@@ -8,6 +8,11 @@ from cognee.modules.data.processing.document_types import (
     TextDocument,
     UnstructuredDocument,
 )
+from cognee.infrastructure.engine import DataPoint
+from cognee.modules.engine.models.node_set import NodeSet
+from cognee.modules.engine.utils.generate_node_id import generate_node_id
+from typing import List, Optional
+import uuid
 
 EXTENSION_TO_DOCUMENT_CLASS = {
     "pdf": PdfDocument,  # Text documents
@@ -49,6 +54,29 @@ EXTENSION_TO_DOCUMENT_CLASS = {
 }
 
 
+def update_node_set(document):
+    """Set ``document.belongs_to_set`` from the "node_set" list in its external_metadata.
+
+    ``document.external_metadata`` is parsed as JSON. If it decodes to a dict whose
+    "node_set" value is a list, one NodeSet is created per entry, with an id generated
+    from "NodeSet:<name>". Otherwise the document is left unchanged. Returns None.
+    """
+    try:
+        external_metadata = json.loads(document.external_metadata)
+    except (json.JSONDecodeError, TypeError):  # TypeError: metadata is None / not a str
+        return
+
+    if not isinstance(external_metadata, dict):
+        return
+    node_set = external_metadata.get("node_set")  # a missing key yields None -> early return
+    if not isinstance(node_set, list):
+        return
+
+    document.belongs_to_set = [
+        NodeSet(id=generate_node_id(f"NodeSet:{name}"), name=name) for name in node_set
+    ]
+
+
 async def classify_documents(data_documents: list[Data]) -> list[Document]:
     """
     Classifies a list of data items into specific document types based on file extensions.
@@ -67,6 +95,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
             mime_type=data_item.mime_type,
             external_metadata=json.dumps(data_item.external_metadata, indent=4),
         )
+        update_node_set(document)
         documents.append(document)
 
     return documents
diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py
index a585d519c..08df7fa57 100644
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@@ -40,6 +40,7 @@ async def extract_chunks_from_documents(
         document_token_count = 0
         for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
             document_token_count += document_chunk.chunk_size
+            document_chunk.belongs_to_set = document.belongs_to_set
             yield document_chunk
 
         await update_document_token_count(document.id, document_token_count)
diff --git a/examples/python/simple_node_set_example.py b/examples/python/simple_node_set_example.py
new file mode 100644
index 000000000..2f891b7d8
--- /dev/null
+++ b/examples/python/simple_node_set_example.py
@@ -0,0 +1,38 @@
+import asyncio
+import cognee
+from cognee.shared.logging_utils import get_logger, ERROR
+from cognee.api.v1.search import SearchType
+
+text_a = """
+    AI is revolutionizing financial services through intelligent fraud detection
+    and automated customer service platforms.
+    """
+
+text_b = """
+    Advances in AI are enabling smarter systems that learn and adapt over time.
+    """
+
+text_c = """
+    MedTech startups have seen significant growth in recent years, driven by innovation
+    in digital health and medical devices.
+    """
+
+node_set_a = ["AI", "FinTech"]
+node_set_b = ["AI"]
+node_set_c = ["MedTech"]
+
+
+async def main():  # Adds three sample texts with their node_set lists, then calls cognify().
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)  # presumably a clean slate — confirm
+
+    await cognee.add(text_a, node_set=node_set_a)  # node_set_a is ["AI", "FinTech"]
+    await cognee.add(text_b, node_set=node_set_b)  # node_set_b is ["AI"]
+    await cognee.add(text_c, node_set=node_set_c)  # node_set_c is ["MedTech"]
+    await cognee.cognify()
+
+
+if __name__ == "__main__":
+    # asyncio.run() creates and closes its own event loop, so none is created by hand here.
+    logger = get_logger(level=ERROR)
+    asyncio.run(main())