From 40142b4789197e7f2a96263e551cd70e6aa70670 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Tue, 15 Apr 2025 16:22:21 +0200 Subject: [PATCH 1/2] Delete cognee/modules/graph/README.md --- cognee/modules/graph/README.md | 106 --------------------------------- 1 file changed, 106 deletions(-) delete mode 100644 cognee/modules/graph/README.md diff --git a/cognee/modules/graph/README.md b/cognee/modules/graph/README.md deleted file mode 100644 index bdf4df94f..000000000 --- a/cognee/modules/graph/README.md +++ /dev/null @@ -1,106 +0,0 @@ -# Layered Knowledge Graph - -This module provides a simplified implementation of a layered knowledge graph, which allows organizing nodes and edges into hierarchical layers. - -## Features - -- **Hierarchical Layer Structure**: Organize your graph into layers with parent-child relationships -- **Cumulative Views**: Access nodes and edges from a layer and all its parent layers -- **Adapter-based Design**: Connect to different database backends using adapter pattern -- **NetworkX Integration**: Built-in support for NetworkX graph database -- **Type Safety**: Pydantic models ensure type safety and data validation -- **Async API**: All methods are async for better performance - -## Components - -- **GraphNode**: A node in the graph with a name, type, properties, and metadata -- **GraphEdge**: An edge connecting two nodes with an edge type, properties, and metadata -- **GraphLayer**: A layer in the graph that can contain nodes and edges, and can have parent layers -- **LayeredKnowledgeGraph**: The main graph class that manages layers, nodes, and edges - -## Usage Example - -```python -import asyncio -from uuid import UUID -from cognee.modules.graph.simplified_layered_graph import LayeredKnowledgeGraph -from cognee.modules.graph.enhanced_layered_graph_adapter import LayeredGraphDBAdapter -from cognee.infrastructure.databases.graph.networkx.adapter import NetworkXAdapter - -async def main(): - # Initialize adapter - adapter = NetworkXAdapter(filename="graph.pkl") - await adapter.create_empty_graph("graph.pkl") - - # Create graph - graph = LayeredKnowledgeGraph.create_empty("My Knowledge Graph") - graph.set_adapter(LayeredGraphDBAdapter(adapter)) - - # Add layers with parent-child relationships - base_layer = await graph.add_layer( - name="Base Layer", - description="Foundation concepts", - layer_type="base" - ) - - derived_layer = await graph.add_layer( - name="Derived Layer", - description="Concepts built upon the base layer", - layer_type="derived", - parent_layers=[base_layer.id] # Parent-child relationship - ) - - # Add nodes to layers - node1 = await graph.add_node( - name="Concept A", - node_type="concept", - properties={"importance": "high"}, - layer_id=base_layer.id - ) - - node2 = await graph.add_node( - name="Concept B", - node_type="concept", - properties={"importance": "medium"}, - layer_id=derived_layer.id - ) - - # Connect nodes with an edge - edge = await graph.add_edge( - source_id=node1.id, - target_id=node2.id, - edge_type="RELATES_TO", - properties={"strength": "high"}, - layer_id=derived_layer.id - ) - - # Get cumulative view (including parent layers) - nodes, edges = await graph.get_cumulative_layer_graph(derived_layer.id) - - print(f"Nodes in cumulative view: {[n.name for n in nodes]}") - print(f"Edges in cumulative view: {[e.edge_type for e in edges]}") - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## Design Improvements - -The simplified layered graph implementation offers several improvements over the previous approach: - -1. **Clear Separation of Concerns**: In-memory operations vs. database operations -2. **More Intuitive API**: Methods have clear, consistent signatures -3. **Better Error Handling**: Comprehensive validation and error reporting -4. **Enhanced Debugging**: Detailed logging throughout -5. **Improved Caching**: Local caches reduce database load -6. **Method Naming Consistency**: All methods follow consistent naming conventions -7. **Reduced Complexity**: Simpler implementation with equivalent functionality - -## Best Practices - -- Always use the adapter pattern for database operations -- Use the provided factory methods for creating nodes and edges -- Leverage parent-child relationships for organizing related concepts -- Utilize cumulative views to access inherited nodes and edges -- Consider layer types for additional semantic meaning -- Use properties and metadata for storing additional information \ No newline at end of file From b2a53b41243e6530c5012e05f889dc717f11274b Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Thu, 17 Apr 2025 17:10:42 +0200 Subject: [PATCH 2/2] Add nodesets datapoints (#755) ## Description ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --- cognee/api/v1/cognify/cognify.py | 2 +- .../infrastructure/engine/models/DataPoint.py | 4 +- .../processing/document_types/Document.py | 2 +- cognee/modules/engine/models/node_set.py | 8 ++++ cognee/tasks/documents/classify_documents.py | 29 ++++++++++++++ .../extract_chunks_from_documents.py | 1 + examples/python/simple_node_set_example.py | 38 +++++++++++++++++++ 7 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 cognee/modules/engine/models/node_set.py create mode 100644 examples/python/simple_node_set_example.py diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 0c4487648..971bd0f51 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -140,7 +140,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's task_config={"batch_size": 10}, ), Task(add_data_points, task_config={"batch_size": 10}), - Task(apply_node_set, task_config={"batch_size": 10}), # Apply NodeSet values and create set nodes + # Task(apply_node_set, task_config={"batch_size": 10}), # Apply NodeSet values and create set nodes ] return default_tasks diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index 3abadd51c..76e4fbb6e 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -27,7 +27,9 @@ class DataPoint(BaseModel): topological_rank: Optional[int] = 0 metadata: Optional[MetaData] = {"index_fields": []} type: str = Field(default_factory=lambda: DataPoint.__name__) - NodeSet: Optional[List[str]] = None # List of nodes this data point is associated with + belongs_to_set: Optional[List["DataPoint"]] = ( + None # List of nodesets this data point belongs to + ) def __init__(self, **data): super().__init__(**data) diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index c75203231..af1be84c2 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, List from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.Chunker import Chunker diff --git a/cognee/modules/engine/models/node_set.py b/cognee/modules/engine/models/node_set.py new file mode 100644 index 000000000..33fe3f557 --- /dev/null +++ b/cognee/modules/engine/models/node_set.py @@ -0,0 +1,8 @@ +from cognee.infrastructure.engine import DataPoint + + +class NodeSet(DataPoint): + """NodeSet data point.""" + + name: str + metadata: dict = {"index_fields": ["name"]} diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index 7211ebced..f745c2cf3 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -8,6 +8,11 @@ from cognee.modules.data.processing.document_types import ( TextDocument, UnstructuredDocument, ) +from cognee.infrastructure.engine import DataPoint +from cognee.modules.engine.models.node_set import NodeSet +from cognee.modules.engine.utils.generate_node_id import generate_node_id +from typing import List, Optional +import uuid EXTENSION_TO_DOCUMENT_CLASS = { "pdf": PdfDocument, # Text documents @@ -49,6 +54,29 @@ EXTENSION_TO_DOCUMENT_CLASS = { } +def update_node_set(document): + """Extracts node_set from document's external_metadata.""" + try: + external_metadata = json.loads(document.external_metadata) + except json.JSONDecodeError: + return + + if not isinstance(external_metadata, dict): + return + + if "node_set" not in external_metadata: + return + + node_set = external_metadata["node_set"] + if not isinstance(node_set, list): + return + + document.belongs_to_set = [ + NodeSet(id=generate_node_id(f"NodeSet:{node_set_name}"), name=node_set_name) + for node_set_name in node_set + ] + + async def classify_documents(data_documents: list[Data]) -> list[Document]: """ Classifies a list of data items into specific document types based on file extensions. @@ -67,6 +95,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]: mime_type=data_item.mime_type, external_metadata=json.dumps(data_item.external_metadata, indent=4), ) + update_node_set(document) documents.append(document) return documents diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index a585d519c..08df7fa57 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -40,6 +40,7 @@ async def extract_chunks_from_documents( document_token_count = 0 for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker): document_token_count += document_chunk.chunk_size + document_chunk.belongs_to_set = document.belongs_to_set yield document_chunk await update_document_token_count(document.id, document_token_count) diff --git a/examples/python/simple_node_set_example.py b/examples/python/simple_node_set_example.py new file mode 100644 index 000000000..2f891b7d8 --- /dev/null +++ b/examples/python/simple_node_set_example.py @@ -0,0 +1,38 @@ +import asyncio +import cognee +from cognee.shared.logging_utils import get_logger, ERROR +from cognee.api.v1.search import SearchType + +text_a = """ + AI is revolutionizing financial services through intelligent fraud detection + and automated customer service platforms. + """ + +text_b = """ + Advances in AI are enabling smarter systems that learn and adapt over time. + """ + +text_c = """ + MedTech startups have seen significant growth in recent years, driven by innovation + in digital health and medical devices. + """ + +node_set_a = ["AI", "FinTech"] +node_set_b = ["AI"] +node_set_c = ["MedTech"] + + +async def main(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + + await cognee.add(text_a, node_set=node_set_a) + await cognee.add(text_b, node_set=node_set_b) + await cognee.add(text_c, node_set=node_set_c) + await cognee.cognify() + + +if __name__ == "__main__": + logger = get_logger(level=ERROR) + loop = asyncio.new_event_loop() + asyncio.run(main())