Merge branch 'add_nodesets' of github.com:topoteretes/cognee into add_nodesets

# Conflicts: # cognee/api/v1/cognify/cognify.py
2025-04-17 17:22:55 +02:00 · 2025-04-17 17:22:55 +02:00 · 83b20b1e92
commit 83b20b1e92
parent 69c090c91d b2a53b4124
8 changed files with 81 additions and 109 deletions
--- a/cognee/api/v1/cognify/cognify.py
+++ b/cognee/api/v1/cognify/cognify.py
@ -13,7 +13,7 @@ from cognee.modules.data.models import Data, Dataset
 from cognee.modules.pipelines import run_tasks
 from cognee.modules.pipelines.models import PipelineRunStatus
 from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
-from cognee.modules.pipelines.tasks.task import Task
+from cognee.modules.pipelines.tasks.Task import Task
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.models import User
 from cognee.shared.data_models import KnowledgeGraph
--- a/cognee/infrastructure/engine/models/DataPoint.py
+++ b/cognee/infrastructure/engine/models/DataPoint.py
@ -27,7 +27,9 @@ class DataPoint(BaseModel):
    topological_rank: Optional[int] = 0
    metadata: Optional[MetaData] = {"index_fields": []}
    type: str = Field(default_factory=lambda: DataPoint.__name__)
-    NodeSet: Optional[List[str]] = None  # List of nodes this data point is associated with
+    belongs_to_set: Optional[List["DataPoint"]] = (
        None  # List of nodesets this data point belongs to
    )
    def __init__(self, **data):
        super().__init__(**data)
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, List
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.chunking.Chunker import Chunker
--- a/cognee/modules/engine/models/node_set.py
+++ b/cognee/modules/engine/models/node_set.py
@ -0,0 +1,8 @@
 from cognee.infrastructure.engine import DataPoint
 class NodeSet(DataPoint):
    """Named node set, modelled as a DataPoint subclass.

    Declares a required ``name`` and a default ``metadata`` dict that lists
    ``name`` under ``index_fields``.
    """
    # Name of the node set; required, since no default is declared.
    name: str
    # Default metadata. NOTE(review): "index_fields" presumably names the
    # fields to be indexed -- confirm against DataPoint / the indexing code.
    metadata: dict = {"index_fields": ["name"]}
--- a/cognee/modules/graph/README.md
+++ b/cognee/modules/graph/README.md
@ -1,106 +0,0 @@
 # Layered Knowledge Graph
 This module provides a simplified implementation of a layered knowledge graph, which allows organizing nodes and edges into hierarchical layers.
 ## Features
 - **Hierarchical Layer Structure**: Organize your graph into layers with parent-child relationships
 - **Cumulative Views**: Access nodes and edges from a layer and all its parent layers
 - **Adapter-based Design**: Connect to different database backends using the adapter pattern
 - **NetworkX Integration**: Built-in support for the NetworkX graph database
 - **Type Safety**: Pydantic models ensure type safety and data validation
 - **Async API**: All methods are async for better performance
 ## Components
 - **GraphNode**: A node in the graph with a name, type, properties, and metadata
 - **GraphEdge**: An edge connecting two nodes with an edge type, properties, and metadata
 - **GraphLayer**: A layer in the graph that can contain nodes and edges, and can have parent layers
 - **LayeredKnowledgeGraph**: The main graph class that manages layers, nodes, and edges
 ## Usage Example
 ```python
 import asyncio
 from uuid import UUID
 from cognee.modules.graph.simplified_layered_graph import LayeredKnowledgeGraph
 from cognee.modules.graph.enhanced_layered_graph_adapter import LayeredGraphDBAdapter
 from cognee.infrastructure.databases.graph.networkx.adapter import NetworkXAdapter
 async def main():
    """Build a two-layer graph, link two nodes, and print the derived layer's cumulative view."""
    # Initialize adapter
    adapter = NetworkXAdapter(filename="graph.pkl")
    await adapter.create_empty_graph("graph.pkl")
    # Create graph
    graph = LayeredKnowledgeGraph.create_empty("My Knowledge Graph")
    graph.set_adapter(LayeredGraphDBAdapter(adapter))
    # Add layers with parent-child relationships
    base_layer = await graph.add_layer(
        name="Base Layer", 
        description="Foundation concepts",
        layer_type="base"
    )
    derived_layer = await graph.add_layer(
        name="Derived Layer",
        description="Concepts built upon the base layer",
        layer_type="derived",
        parent_layers=[base_layer.id]  # Parent-child relationship
    )
    # Add nodes to layers
    node1 = await graph.add_node(
        name="Concept A",
        node_type="concept",
        properties={"importance": "high"},
        layer_id=base_layer.id
    )
    node2 = await graph.add_node(
        name="Concept B",
        node_type="concept",
        properties={"importance": "medium"},
        layer_id=derived_layer.id
    )
    # Connect nodes with an edge (the edge itself is stored in the derived layer)
    edge = await graph.add_edge(
        source_id=node1.id,
        target_id=node2.id,
        edge_type="RELATES_TO",
        properties={"strength": "high"},
        layer_id=derived_layer.id
    )
    # Get cumulative view (including parent layers)
    nodes, edges = await graph.get_cumulative_layer_graph(derived_layer.id)
    print(f"Nodes in cumulative view: {[n.name for n in nodes]}")
    print(f"Edges in cumulative view: {[e.edge_type for e in edges]}")
 if __name__ == "__main__":
    asyncio.run(main())
 ```
 ## Design Improvements
 The simplified layered graph implementation offers several improvements over the previous approach:
 1. **Clear Separation of Concerns**: In-memory operations vs. database operations
 2. **More Intuitive API**: Methods have clear, consistent signatures
 3. **Better Error Handling**: Comprehensive validation and error reporting
 4. **Enhanced Debugging**: Detailed logging throughout
 5. **Improved Caching**: Local caches reduce database load
 6. **Method Naming Consistency**: All methods follow consistent naming conventions
 7. **Reduced Complexity**: Simpler implementation with equivalent functionality
 ## Best Practices
 - Always use the adapter pattern for database operations
 - Use the provided factory methods for creating nodes and edges
 - Leverage parent-child relationships for organizing related concepts
 - Utilize cumulative views to access inherited nodes and edges
 - Consider layer types for additional semantic meaning
 - Use properties and metadata for storing additional information 
--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@ -8,6 +8,11 @@ from cognee.modules.data.processing.document_types import (
    TextDocument,
    UnstructuredDocument,
 )
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.engine.utils.generate_node_id import generate_node_id
 from typing import List, Optional
 import uuid
 EXTENSION_TO_DOCUMENT_CLASS = {
    "pdf": PdfDocument,  # Text documents
@ -49,6 +54,29 @@ EXTENSION_TO_DOCUMENT_CLASS = {
 }
 def update_node_set(document):
    """Populate ``document.belongs_to_set`` from its external metadata.

    ``document.external_metadata`` is parsed as JSON. When it decodes to a
    dict whose ``"node_set"`` entry is a list, ``document.belongs_to_set`` is
    replaced with one ``NodeSet`` per list entry; each NodeSet id is derived
    from the string ``"NodeSet:<name>"`` via ``generate_node_id``. In every
    other case the document is left untouched.

    Args:
        document: Object exposing ``external_metadata`` (JSON text) and a
            writable ``belongs_to_set`` attribute.

    Returns:
        None. The document is modified in place.
    """
    try:
        external_metadata = json.loads(document.external_metadata)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: the text is not valid JSON.
        # TypeError: external_metadata is None or otherwise not str/bytes;
        # treat that the same as "no usable metadata" instead of crashing.
        return
    if not isinstance(external_metadata, dict):
        return
    # A missing key yields None, which fails the list check just like any
    # other non-list value.
    node_set = external_metadata.get("node_set")
    if not isinstance(node_set, list):
        return
    document.belongs_to_set = [
        NodeSet(id=generate_node_id(f"NodeSet:{node_set_name}"), name=node_set_name)
        for node_set_name in node_set
    ]
 async def classify_documents(data_documents: list[Data]) -> list[Document]:
    """
    Classifies a list of data items into specific document types based on file extensions.
@ -67,6 +95,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
            mime_type=data_item.mime_type,
            external_metadata=json.dumps(data_item.external_metadata, indent=4),
        )
        update_node_set(document)
        documents.append(document)
    return documents
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@ -40,6 +40,7 @@ async def extract_chunks_from_documents(
        document_token_count = 0
        for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
            document_token_count += document_chunk.chunk_size
            document_chunk.belongs_to_set = document.belongs_to_set
            yield document_chunk
        await update_document_token_count(document.id, document_token_count)
--- a/examples/python/simple_node_set_example.py
+++ b/examples/python/simple_node_set_example.py
@ -0,0 +1,38 @@
 import asyncio
 import cognee
 from cognee.shared.logging_utils import get_logger, ERROR
 from cognee.api.v1.search import SearchType
 # Sample input texts. main() passes each one to cognee.add() together with
 # the node_set_* list that shares its suffix.
 text_a = """
    AI is revolutionizing financial services through intelligent fraud detection
    and automated customer service platforms.
    """
 text_b = """
    Advances in AI are enabling smarter systems that learn and adapt over time.
    """
 text_c = """
    MedTech startups have seen significant growth in recent years, driven by innovation
    in digital health and medical devices.
    """
 # Node-set names attached to the texts above; "AI" appears in both
 # node_set_a and node_set_b.
 node_set_a = ["AI", "FinTech"]
 node_set_b = ["AI"]
 node_set_c = ["MedTech"]
 async def main():
    """Prune cognee, add the three sample texts with their node sets, then cognify."""
    # Prune existing data and system state before adding anything.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # Each text is added with its own node-set list, in a/b/c order.
    labelled_texts = (
        (text_a, node_set_a),
        (text_b, node_set_b),
        (text_c, node_set_c),
    )
    for sample_text, sample_node_set in labelled_texts:
        await cognee.add(sample_text, node_set=sample_node_set)

    await cognee.cognify()
 if __name__ == "__main__":
    # NOTE(review): the returned logger is not used below; the call is
    # presumably kept for its logging-setup effect at ERROR level -- confirm.
    logger = get_logger(level=ERROR)
    # asyncio.run() creates and closes its own event loop, so no loop is
    # created by hand here (the previous one was never used or closed).
    asyncio.run(main())