Merge branch 'main' into merge-main-vol7

This commit is contained in:
Igor Ilic 2025-12-11 19:11:24 +01:00
commit 59f8d12fa3
7 changed files with 19 additions and 58 deletions

View file

@ -20,7 +20,6 @@ from cognee.modules.ontology.get_default_ontology_resolver import (
from cognee.modules.users.models import User from cognee.modules.users.models import User
from cognee.tasks.documents import ( from cognee.tasks.documents import (
check_permissions_on_dataset,
classify_documents, classify_documents,
extract_chunks_from_documents, extract_chunks_from_documents,
) )
@ -79,12 +78,11 @@ async def cognify(
Processing Pipeline: Processing Pipeline:
1. **Document Classification**: Identifies document types and structures 1. **Document Classification**: Identifies document types and structures
2. **Permission Validation**: Ensures user has processing rights 2. **Text Chunking**: Breaks content into semantically meaningful segments
3. **Text Chunking**: Breaks content into semantically meaningful segments 3. **Entity Extraction**: Identifies key concepts, people, places, organizations
4. **Entity Extraction**: Identifies key concepts, people, places, organizations 4. **Relationship Detection**: Discovers connections between entities
5. **Relationship Detection**: Discovers connections between entities 5. **Graph Construction**: Builds semantic knowledge graph with embeddings
6. **Graph Construction**: Builds semantic knowledge graph with embeddings 6. **Content Summarization**: Creates hierarchical summaries for navigation
7. **Content Summarization**: Creates hierarchical summaries for navigation
Graph Model Customization: Graph Model Customization:
The `graph_model` parameter allows custom knowledge structures: The `graph_model` parameter allows custom knowledge structures:
@ -278,7 +276,6 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
default_tasks = [ default_tasks = [
Task(classify_documents), Task(classify_documents),
Task(check_permissions_on_dataset, user=user, permissions=["write"]),
Task( Task(
extract_chunks_from_documents, extract_chunks_from_documents,
max_chunk_size=chunk_size or get_max_chunk_tokens(), max_chunk_size=chunk_size or get_max_chunk_tokens(),
@ -313,14 +310,13 @@ async def get_temporal_tasks(
The pipeline includes: The pipeline includes:
1. Document classification. 1. Document classification.
2. Dataset permission checks (requires "write" access). 2. Document chunking with a specified or default chunk size.
3. Document chunking with a specified or default chunk size. 3. Event and timestamp extraction from chunks.
4. Event and timestamp extraction from chunks. 4. Knowledge graph extraction from events.
5. Knowledge graph extraction from events. 5. Batched insertion of data points.
6. Batched insertion of data points.
Args: Args:
user (User, optional): The user requesting task execution, used for permission checks. user (User, optional): The user requesting task execution.
chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker. chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker.
chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default. chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default.
chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify
@ -333,7 +329,6 @@ async def get_temporal_tasks(
temporal_tasks = [ temporal_tasks = [
Task(classify_documents), Task(classify_documents),
Task(check_permissions_on_dataset, user=user, permissions=["write"]),
Task( Task(
extract_chunks_from_documents, extract_chunks_from_documents,
max_chunk_size=chunk_size or get_max_chunk_tokens(), max_chunk_size=chunk_size or get_max_chunk_tokens(),

View file

@ -8,7 +8,6 @@ from cognee.modules.users.models import User
from cognee.shared.data_models import KnowledgeGraph from cognee.shared.data_models import KnowledgeGraph
from cognee.shared.utils import send_telemetry from cognee.shared.utils import send_telemetry
from cognee.tasks.documents import ( from cognee.tasks.documents import (
check_permissions_on_dataset,
classify_documents, classify_documents,
extract_chunks_from_documents, extract_chunks_from_documents,
) )
@ -31,7 +30,6 @@ async def get_cascade_graph_tasks(
cognee_config = get_cognify_config() cognee_config = get_cognify_config()
default_tasks = [ default_tasks = [
Task(classify_documents), Task(classify_documents),
Task(check_permissions_on_dataset, user=user, permissions=["write"]),
Task( Task(
extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens() extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()
), # Extract text chunks based on the document type. ), # Extract text chunks based on the document type.

View file

@ -30,8 +30,8 @@ async def get_no_summary_tasks(
ontology_file_path=None, ontology_file_path=None,
) -> List[Task]: ) -> List[Task]:
"""Returns default tasks without summarization tasks.""" """Returns default tasks without summarization tasks."""
# Get base tasks (0=classify, 1=check_permissions, 2=extract_chunks) # Get base tasks (0=classify, 1=extract_chunks)
base_tasks = await get_default_tasks_by_indices([0, 1, 2], chunk_size, chunker) base_tasks = await get_default_tasks_by_indices([0, 1], chunk_size, chunker)
ontology_adapter = RDFLibOntologyResolver(ontology_file=ontology_file_path) ontology_adapter = RDFLibOntologyResolver(ontology_file=ontology_file_path)
@ -51,8 +51,8 @@ async def get_just_chunks_tasks(
chunk_size: int = None, chunker=TextChunker, user=None chunk_size: int = None, chunker=TextChunker, user=None
) -> List[Task]: ) -> List[Task]:
"""Returns default tasks with only chunk extraction and data points addition.""" """Returns default tasks with only chunk extraction and data points addition."""
# Get base tasks (0=classify, 1=check_permissions, 2=extract_chunks) # Get base tasks (0=classify, 1=extract_chunks)
base_tasks = await get_default_tasks_by_indices([0, 1, 2], chunk_size, chunker) base_tasks = await get_default_tasks_by_indices([0, 1], chunk_size, chunker)
add_data_points_task = Task(add_data_points, task_config={"batch_size": 10}) add_data_points_task = Task(add_data_points, task_config={"batch_size": 10})

View file

@ -1,3 +1,2 @@
from .classify_documents import classify_documents from .classify_documents import classify_documents
from .extract_chunks_from_documents import extract_chunks_from_documents from .extract_chunks_from_documents import extract_chunks_from_documents
from .check_permissions_on_dataset import check_permissions_on_dataset

View file

@ -1,26 +0,0 @@
from cognee.modules.data.processing.document_types import Document
from cognee.modules.users.permissions.methods import check_permission_on_dataset
from typing import List
async def check_permissions_on_dataset(
    documents: List[Document], context: dict, user, permissions
) -> List[Document]:
    """
    Verify that `user` holds every requested permission on the current dataset.

    The dataset is read from `context["dataset"]`, and each entry of `permissions`
    is checked in order by awaiting
    `check_permission_on_dataset(user, permission, dataset_id)`.

    Args:
        documents: Documents flowing through the pipeline; returned unchanged.
        context: Pipeline context. Must hold the dataset under the "dataset" key
            whenever `permissions` is non-empty.
        user: The user whose permissions are checked.
        permissions: Iterable of permission names, e.g. ["write"].

    Returns:
        The same `documents` object that was passed in.

    Notes:
        - This function assumes that `check_permission_on_dataset` raises an
          exception if the permission check fails (TODO confirm); its return
          value is not inspected here.
        - Permissions are validated sequentially for the same dataset.
        - `context` is not accessed at all when `permissions` is empty.
    """
    for permission in permissions:
        await check_permission_on_dataset(
            user,
            permission,
            # TODO: pass dataset through argument instead of context
            context["dataset"].id,
        )

    return documents

View file

@ -32,16 +32,13 @@ async def main():
print("Cognify process steps:") print("Cognify process steps:")
print("1. Classifying the document: Determining the type and category of the input text.") print("1. Classifying the document: Determining the type and category of the input text.")
print( print(
"2. Checking permissions: Ensuring the user has the necessary rights to process the text." "2. Extracting text chunks: Breaking down the text into sentences or phrases for analysis."
) )
print( print(
"3. Extracting text chunks: Breaking down the text into sentences or phrases for analysis." "3. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph."
) )
print("4. Adding data points: Storing the extracted chunks for processing.") print("4. Summarizing text: Creating concise summaries of the content for quick insights.")
print( print("5. Adding data points: Storing the extracted chunks for processing.\n")
"5. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph."
)
print("6. Summarizing text: Creating concise summaries of the content for quick insights.\n")
# Use LLMs and cognee to create knowledge graph # Use LLMs and cognee to create knowledge graph
await cognee.cognify() await cognee.cognify()

View file

@ -591,7 +591,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": null,
"id": "7c431fdef4921ae0", "id": "7c431fdef4921ae0",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@ -609,7 +609,6 @@
"from cognee.modules.pipelines import run_tasks\n", "from cognee.modules.pipelines import run_tasks\n",
"from cognee.modules.users.models import User\n", "from cognee.modules.users.models import User\n",
"from cognee.tasks.documents import (\n", "from cognee.tasks.documents import (\n",
" check_permissions_on_dataset,\n",
" classify_documents,\n", " classify_documents,\n",
" extract_chunks_from_documents,\n", " extract_chunks_from_documents,\n",
")\n", ")\n",
@ -627,7 +626,6 @@
"\n", "\n",
" tasks = [\n", " tasks = [\n",
" Task(classify_documents),\n", " Task(classify_documents),\n",
" Task(check_permissions_on_dataset, user=user, permissions=[\"write\"]),\n",
" Task(\n", " Task(\n",
" extract_chunks_from_documents, max_chunk_size=get_max_chunk_tokens()\n", " extract_chunks_from_documents, max_chunk_size=get_max_chunk_tokens()\n",
" ), # Extract text chunks based on the document type.\n", " ), # Extract text chunks based on the document type.\n",