Merge branch 'add_nodesets' of github.com:topoteretes/cognee into add_nodesets
# Conflicts: # cognee/api/v1/cognify/cognify.py
This commit is contained in:
commit
83b20b1e92
8 changed files with 81 additions and 109 deletions
|
|
@ -13,7 +13,7 @@ from cognee.modules.data.models import Data, Dataset
|
||||||
from cognee.modules.pipelines import run_tasks
|
from cognee.modules.pipelines import run_tasks
|
||||||
from cognee.modules.pipelines.models import PipelineRunStatus
|
from cognee.modules.pipelines.models import PipelineRunStatus
|
||||||
from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
|
from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
|
||||||
from cognee.modules.pipelines.tasks.task import Task
|
from cognee.modules.pipelines.tasks.Task import Task
|
||||||
from cognee.modules.users.methods import get_default_user
|
from cognee.modules.users.methods import get_default_user
|
||||||
from cognee.modules.users.models import User
|
from cognee.modules.users.models import User
|
||||||
from cognee.shared.data_models import KnowledgeGraph
|
from cognee.shared.data_models import KnowledgeGraph
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,9 @@ class DataPoint(BaseModel):
|
||||||
topological_rank: Optional[int] = 0
|
topological_rank: Optional[int] = 0
|
||||||
metadata: Optional[MetaData] = {"index_fields": []}
|
metadata: Optional[MetaData] = {"index_fields": []}
|
||||||
type: str = Field(default_factory=lambda: DataPoint.__name__)
|
type: str = Field(default_factory=lambda: DataPoint.__name__)
|
||||||
NodeSet: Optional[List[str]] = None # List of nodes this data point is associated with
|
belongs_to_set: Optional[List["DataPoint"]] = (
|
||||||
|
None # List of nodesets this data point belongs to
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(self, **data):
|
def __init__(self, **data):
|
||||||
super().__init__(**data)
|
super().__init__(**data)
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional
|
from typing import Optional, List
|
||||||
from cognee.infrastructure.engine import DataPoint
|
from cognee.infrastructure.engine import DataPoint
|
||||||
from cognee.modules.chunking.Chunker import Chunker
|
from cognee.modules.chunking.Chunker import Chunker
|
||||||
|
|
||||||
|
|
|
||||||
8
cognee/modules/engine/models/node_set.py
Normal file
8
cognee/modules/engine/models/node_set.py
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
from cognee.infrastructure.engine import DataPoint
|
||||||
|
|
||||||
|
|
||||||
|
class NodeSet(DataPoint):
    """Data point that represents a named node set.

    Instances are identified by ``name`` and are the values stored in a data
    point's ``belongs_to_set`` list.
    """

    # Label of the set, e.g. "AI" or "FinTech".
    name: str

    # Registers "name" under "index_fields".
    # NOTE(review): presumably this makes the name searchable in the index;
    # confirm against how DataPoint metadata is consumed.
    metadata: dict = {"index_fields": ["name"]}
|
||||||
|
|
@ -1,106 +0,0 @@
|
||||||
# Layered Knowledge Graph
|
|
||||||
|
|
||||||
This module provides a simplified implementation of a layered knowledge graph, which allows organizing nodes and edges into hierarchical layers.
|
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
- **Hierarchical Layer Structure**: Organize your graph into layers with parent-child relationships
|
|
||||||
- **Cumulative Views**: Access nodes and edges from a layer and all its parent layers
|
|
||||||
- **Adapter-based Design**: Connect to different database backends using adapter pattern
|
|
||||||
- **NetworkX Integration**: Built-in support for NetworkX graph database
|
|
||||||
- **Type Safety**: Pydantic models ensure type safety and data validation
|
|
||||||
- **Async API**: All methods are async for better performance
|
|
||||||
|
|
||||||
## Components
|
|
||||||
|
|
||||||
- **GraphNode**: A node in the graph with a name, type, properties, and metadata
|
|
||||||
- **GraphEdge**: An edge connecting two nodes with an edge type, properties, and metadata
|
|
||||||
- **GraphLayer**: A layer in the graph that can contain nodes and edges, and can have parent layers
|
|
||||||
- **LayeredKnowledgeGraph**: The main graph class that manages layers, nodes, and edges
|
|
||||||
|
|
||||||
## Usage Example
|
|
||||||
|
|
||||||
```python
|
|
||||||
import asyncio
|
|
||||||
from uuid import UUID
|
|
||||||
from cognee.modules.graph.simplified_layered_graph import LayeredKnowledgeGraph
|
|
||||||
from cognee.modules.graph.enhanced_layered_graph_adapter import LayeredGraphDBAdapter
|
|
||||||
from cognee.infrastructure.databases.graph.networkx.adapter import NetworkXAdapter
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
# Initialize adapter
|
|
||||||
adapter = NetworkXAdapter(filename="graph.pkl")
|
|
||||||
await adapter.create_empty_graph("graph.pkl")
|
|
||||||
|
|
||||||
# Create graph
|
|
||||||
graph = LayeredKnowledgeGraph.create_empty("My Knowledge Graph")
|
|
||||||
graph.set_adapter(LayeredGraphDBAdapter(adapter))
|
|
||||||
|
|
||||||
# Add layers with parent-child relationships
|
|
||||||
base_layer = await graph.add_layer(
|
|
||||||
name="Base Layer",
|
|
||||||
description="Foundation concepts",
|
|
||||||
layer_type="base"
|
|
||||||
)
|
|
||||||
|
|
||||||
derived_layer = await graph.add_layer(
|
|
||||||
name="Derived Layer",
|
|
||||||
description="Concepts built upon the base layer",
|
|
||||||
layer_type="derived",
|
|
||||||
parent_layers=[base_layer.id] # Parent-child relationship
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add nodes to layers
|
|
||||||
node1 = await graph.add_node(
|
|
||||||
name="Concept A",
|
|
||||||
node_type="concept",
|
|
||||||
properties={"importance": "high"},
|
|
||||||
layer_id=base_layer.id
|
|
||||||
)
|
|
||||||
|
|
||||||
node2 = await graph.add_node(
|
|
||||||
name="Concept B",
|
|
||||||
node_type="concept",
|
|
||||||
properties={"importance": "medium"},
|
|
||||||
layer_id=derived_layer.id
|
|
||||||
)
|
|
||||||
|
|
||||||
# Connect nodes with an edge
|
|
||||||
edge = await graph.add_edge(
|
|
||||||
source_id=node1.id,
|
|
||||||
target_id=node2.id,
|
|
||||||
edge_type="RELATES_TO",
|
|
||||||
properties={"strength": "high"},
|
|
||||||
layer_id=derived_layer.id
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get cumulative view (including parent layers)
|
|
||||||
nodes, edges = await graph.get_cumulative_layer_graph(derived_layer.id)
|
|
||||||
|
|
||||||
print(f"Nodes in cumulative view: {[n.name for n in nodes]}")
|
|
||||||
print(f"Edges in cumulative view: {[e.edge_type for e in edges]}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
asyncio.run(main())
|
|
||||||
```
|
|
||||||
|
|
||||||
## Design Improvements
|
|
||||||
|
|
||||||
The simplified layered graph implementation offers several improvements over the previous approach:
|
|
||||||
|
|
||||||
1. **Clear Separation of Concerns**: In-memory operations vs. database operations
|
|
||||||
2. **More Intuitive API**: Methods have clear, consistent signatures
|
|
||||||
3. **Better Error Handling**: Comprehensive validation and error reporting
|
|
||||||
4. **Enhanced Debugging**: Detailed logging throughout
|
|
||||||
5. **Improved Caching**: Local caches reduce database load
|
|
||||||
6. **Method Naming Consistency**: All methods follow consistent naming conventions
|
|
||||||
7. **Reduced Complexity**: Simpler implementation with equivalent functionality
|
|
||||||
|
|
||||||
## Best Practices
|
|
||||||
|
|
||||||
- Always use the adapter pattern for database operations
|
|
||||||
- Use the provided factory methods for creating nodes and edges
|
|
||||||
- Leverage parent-child relationships for organizing related concepts
|
|
||||||
- Utilize cumulative views to access inherited nodes and edges
|
|
||||||
- Consider layer types for additional semantic meaning
|
|
||||||
- Use properties and metadata for storing additional information
|
|
||||||
|
|
@ -8,6 +8,11 @@ from cognee.modules.data.processing.document_types import (
|
||||||
TextDocument,
|
TextDocument,
|
||||||
UnstructuredDocument,
|
UnstructuredDocument,
|
||||||
)
|
)
|
||||||
|
from cognee.infrastructure.engine import DataPoint
|
||||||
|
from cognee.modules.engine.models.node_set import NodeSet
|
||||||
|
from cognee.modules.engine.utils.generate_node_id import generate_node_id
|
||||||
|
from typing import List, Optional
|
||||||
|
import uuid
|
||||||
|
|
||||||
EXTENSION_TO_DOCUMENT_CLASS = {
|
EXTENSION_TO_DOCUMENT_CLASS = {
|
||||||
"pdf": PdfDocument, # Text documents
|
"pdf": PdfDocument, # Text documents
|
||||||
|
|
@ -49,6 +54,29 @@ EXTENSION_TO_DOCUMENT_CLASS = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def update_node_set(document):
    """Populate ``document.belongs_to_set`` from its ``external_metadata``.

    ``document.external_metadata`` is parsed as JSON. When the parsed value is
    an object whose ``"node_set"`` entry is a list, ``belongs_to_set`` is
    replaced with one ``NodeSet`` per list entry. Each NodeSet id comes from
    ``generate_node_id(f"NodeSet:{name}")``.

    The document is left untouched when the metadata is missing, is not a
    string, is not valid JSON, is not a JSON object, or carries no list under
    ``"node_set"``.

    Args:
        document: Object exposing ``external_metadata`` and a writable
            ``belongs_to_set`` attribute.

    Returns:
        None. The document is modified in place.
    """
    try:
        external_metadata = json.loads(document.external_metadata)
    except (TypeError, json.JSONDecodeError):
        # TypeError covers None / non-string metadata, which json.loads
        # rejects before it even attempts to decode.
        return

    if not isinstance(external_metadata, dict):
        return

    # A missing key yields None, which fails the list check just like any
    # other non-list value.
    node_set = external_metadata.get("node_set")
    if not isinstance(node_set, list):
        return

    document.belongs_to_set = [
        NodeSet(id=generate_node_id(f"NodeSet:{node_set_name}"), name=node_set_name)
        for node_set_name in node_set
    ]
|
||||||
|
|
||||||
|
|
||||||
async def classify_documents(data_documents: list[Data]) -> list[Document]:
|
async def classify_documents(data_documents: list[Data]) -> list[Document]:
|
||||||
"""
|
"""
|
||||||
Classifies a list of data items into specific document types based on file extensions.
|
Classifies a list of data items into specific document types based on file extensions.
|
||||||
|
|
@ -67,6 +95,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
|
||||||
mime_type=data_item.mime_type,
|
mime_type=data_item.mime_type,
|
||||||
external_metadata=json.dumps(data_item.external_metadata, indent=4),
|
external_metadata=json.dumps(data_item.external_metadata, indent=4),
|
||||||
)
|
)
|
||||||
|
update_node_set(document)
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
|
|
||||||
return documents
|
return documents
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,7 @@ async def extract_chunks_from_documents(
|
||||||
document_token_count = 0
|
document_token_count = 0
|
||||||
for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
|
for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
|
||||||
document_token_count += document_chunk.chunk_size
|
document_token_count += document_chunk.chunk_size
|
||||||
|
document_chunk.belongs_to_set = document.belongs_to_set
|
||||||
yield document_chunk
|
yield document_chunk
|
||||||
|
|
||||||
await update_document_token_count(document.id, document_token_count)
|
await update_document_token_count(document.id, document_token_count)
|
||||||
|
|
|
||||||
38
examples/python/simple_node_set_example.py
Normal file
38
examples/python/simple_node_set_example.py
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
import asyncio
|
||||||
|
import cognee
|
||||||
|
from cognee.shared.logging_utils import get_logger, ERROR
|
||||||
|
from cognee.api.v1.search import SearchType
|
||||||
|
|
||||||
|
text_a = """
|
||||||
|
AI is revolutionizing financial services through intelligent fraud detection
|
||||||
|
and automated customer service platforms.
|
||||||
|
"""
|
||||||
|
|
||||||
|
text_b = """
|
||||||
|
Advances in AI are enabling smarter systems that learn and adapt over time.
|
||||||
|
"""
|
||||||
|
|
||||||
|
text_c = """
|
||||||
|
MedTech startups have seen significant growth in recent years, driven by innovation
|
||||||
|
in digital health and medical devices.
|
||||||
|
"""
|
||||||
|
|
||||||
|
node_set_a = ["AI", "FinTech"]
|
||||||
|
node_set_b = ["AI"]
|
||||||
|
node_set_c = ["MedTech"]
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Ingest the three sample texts with their node sets, then cognify."""
    # Prune existing data and system state (metadata included) before adding.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # Each text is added together with the node set names it should carry.
    labelled_texts = (
        (text_a, node_set_a),
        (text_b, node_set_b),
        (text_c, node_set_c),
    )
    for sample_text, sample_node_set in labelled_texts:
        await cognee.add(sample_text, node_set=sample_node_set)

    await cognee.cognify()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Request a logger at ERROR level.
    # NOTE(review): presumably this also configures logging so the example
    # output stays quiet; confirm in cognee.shared.logging_utils.
    logger = get_logger(level=ERROR)

    # asyncio.run() creates, runs and closes its own event loop, so no loop
    # is created by hand here (a manually created one would never be closed).
    asyncio.run(main())
|
||||||
Loading…
Add table
Reference in a new issue