Removed check_permissions_on_dataset.py and related references (#1786)

<!-- .github/pull_request_template.md -->

## Description
This PR removes the obsolete `check_permissions_on_dataset` task and all
its related imports and usages across the codebase.
The authorization logic is now handled earlier in the pipeline, so this
task is no longer needed.
These changes simplify the default Cognify pipeline and make the code
cleaner and easier to maintain.

### Changes Made
- Removed `cognee/tasks/documents/check_permissions_on_dataset.py` 
- Removed import from `cognee/tasks/documents/__init__.py` 
- Removed import and usage in `cognee/api/v1/cognify/cognify.py` 
- Removed import and usage in
`cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py`
- Updated task indices and their comments in
`cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py`
(index positions shifted after the removal)
- Removed import and usage in `notebooks/cognee_demo.ipynb`
- Updated documentation in `examples/python/simple_example.py` (process
description)

---

## Type of Change
- [ ] Bug fix (non-breaking change that fixes an issue)
- [ ] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [x] Code refactoring
- [x] Other (please specify): Task removal / cleanup of deprecated
function

---

## Pre-submission Checklist
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the
issue**
- [x] My code follows the project's coding standards and style
guidelines
- [ ] All new and existing tests pass
- [x] I have searched existing PRs to ensure this change hasn't been
submitted already
- [x] I have linked any relevant issues in the description (Closes
#1771)
- [x] My commits have clear and descriptive messages

---

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
Vasilije 2025-12-08 05:43:42 +01:00 committed by GitHub
commit 75fea8dcc8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 19 additions and 58 deletions

View file

@ -19,7 +19,6 @@ from cognee.modules.ontology.get_default_ontology_resolver import (
from cognee.modules.users.models import User
from cognee.tasks.documents import (
check_permissions_on_dataset,
classify_documents,
extract_chunks_from_documents,
)
@ -78,12 +77,11 @@ async def cognify(
Processing Pipeline:
1. **Document Classification**: Identifies document types and structures
2. **Permission Validation**: Ensures user has processing rights
3. **Text Chunking**: Breaks content into semantically meaningful segments
4. **Entity Extraction**: Identifies key concepts, people, places, organizations
5. **Relationship Detection**: Discovers connections between entities
6. **Graph Construction**: Builds semantic knowledge graph with embeddings
7. **Content Summarization**: Creates hierarchical summaries for navigation
2. **Text Chunking**: Breaks content into semantically meaningful segments
3. **Entity Extraction**: Identifies key concepts, people, places, organizations
4. **Relationship Detection**: Discovers connections between entities
5. **Graph Construction**: Builds semantic knowledge graph with embeddings
6. **Content Summarization**: Creates hierarchical summaries for navigation
Graph Model Customization:
The `graph_model` parameter allows custom knowledge structures:
@ -274,7 +272,6 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
default_tasks = [
Task(classify_documents),
Task(check_permissions_on_dataset, user=user, permissions=["write"]),
Task(
extract_chunks_from_documents,
max_chunk_size=chunk_size or get_max_chunk_tokens(),
@ -305,14 +302,13 @@ async def get_temporal_tasks(
The pipeline includes:
1. Document classification.
2. Dataset permission checks (requires "write" access).
3. Document chunking with a specified or default chunk size.
4. Event and timestamp extraction from chunks.
5. Knowledge graph extraction from events.
6. Batched insertion of data points.
2. Document chunking with a specified or default chunk size.
3. Event and timestamp extraction from chunks.
4. Knowledge graph extraction from events.
5. Batched insertion of data points.
Args:
user (User, optional): The user requesting task execution, used for permission checks.
user (User, optional): The user requesting task execution.
chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker.
chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default.
chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify
@ -325,7 +321,6 @@ async def get_temporal_tasks(
temporal_tasks = [
Task(classify_documents),
Task(check_permissions_on_dataset, user=user, permissions=["write"]),
Task(
extract_chunks_from_documents,
max_chunk_size=chunk_size or get_max_chunk_tokens(),

View file

@ -8,7 +8,6 @@ from cognee.modules.users.models import User
from cognee.shared.data_models import KnowledgeGraph
from cognee.shared.utils import send_telemetry
from cognee.tasks.documents import (
check_permissions_on_dataset,
classify_documents,
extract_chunks_from_documents,
)
@ -31,7 +30,6 @@ async def get_cascade_graph_tasks(
cognee_config = get_cognify_config()
default_tasks = [
Task(classify_documents),
Task(check_permissions_on_dataset, user=user, permissions=["write"]),
Task(
extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()
), # Extract text chunks based on the document type.

View file

@ -30,8 +30,8 @@ async def get_no_summary_tasks(
ontology_file_path=None,
) -> List[Task]:
"""Returns default tasks without summarization tasks."""
# Get base tasks (0=classify, 1=check_permissions, 2=extract_chunks)
base_tasks = await get_default_tasks_by_indices([0, 1, 2], chunk_size, chunker)
# Get base tasks (0=classify, 1=extract_chunks)
base_tasks = await get_default_tasks_by_indices([0, 1], chunk_size, chunker)
ontology_adapter = RDFLibOntologyResolver(ontology_file=ontology_file_path)
@ -51,8 +51,8 @@ async def get_just_chunks_tasks(
chunk_size: int = None, chunker=TextChunker, user=None
) -> List[Task]:
"""Returns default tasks with only chunk extraction and data points addition."""
# Get base tasks (0=classify, 1=check_permissions, 2=extract_chunks)
base_tasks = await get_default_tasks_by_indices([0, 1, 2], chunk_size, chunker)
# Get base tasks (0=classify, 1=extract_chunks)
base_tasks = await get_default_tasks_by_indices([0, 1], chunk_size, chunker)
add_data_points_task = Task(add_data_points, task_config={"batch_size": 10})

View file

@ -1,3 +1,2 @@
from .classify_documents import classify_documents
from .extract_chunks_from_documents import extract_chunks_from_documents
from .check_permissions_on_dataset import check_permissions_on_dataset

View file

@ -1,26 +0,0 @@
from cognee.modules.data.processing.document_types import Document
from cognee.modules.users.permissions.methods import check_permission_on_dataset
from typing import List
async def check_permissions_on_dataset(
    documents: List[Document], context: dict, user, permissions
) -> List[Document]:
    """
    Check each requested permission for `user` on the dataset held in `context`.

    The permissions are checked one at a time, in the order given, by awaiting
    `check_permission_on_dataset` with the user, the permission, and the id of
    the object stored under `context["dataset"]`.

    Args:
        documents: Documents flowing through the pipeline; not inspected here.
        context: Mapping that must provide a `"dataset"` entry exposing `.id`.
        user: The user whose permissions are checked.
        permissions: Iterable of permissions to check.

    Returns:
        The same `documents` object that was passed in, unmodified.

    Notes:
        - The result of `check_permission_on_dataset` is discarded, so this
          function relies on that helper raising when a check fails
          (assumption; confirm against the helper's implementation).
    """
    for required_permission in permissions:
        # TODO: pass dataset through argument instead of context
        dataset_id = context["dataset"].id
        await check_permission_on_dataset(user, required_permission, dataset_id)

    return documents

View file

@ -32,16 +32,13 @@ async def main():
print("Cognify process steps:")
print("1. Classifying the document: Determining the type and category of the input text.")
print(
"2. Checking permissions: Ensuring the user has the necessary rights to process the text."
"2. Extracting text chunks: Breaking down the text into sentences or phrases for analysis."
)
print(
"3. Extracting text chunks: Breaking down the text into sentences or phrases for analysis."
"3. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph."
)
print("4. Adding data points: Storing the extracted chunks for processing.")
print(
"5. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph."
)
print("6. Summarizing text: Creating concise summaries of the content for quick insights.\n")
print("4. Summarizing text: Creating concise summaries of the content for quick insights.")
print("5. Adding data points: Storing the extracted chunks for processing.\n")
# Use LLMs and cognee to create knowledge graph
await cognee.cognify()

View file

@ -591,7 +591,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "7c431fdef4921ae0",
"metadata": {
"ExecuteTime": {
@ -609,7 +609,6 @@
"from cognee.modules.pipelines import run_tasks\n",
"from cognee.modules.users.models import User\n",
"from cognee.tasks.documents import (\n",
" check_permissions_on_dataset,\n",
" classify_documents,\n",
" extract_chunks_from_documents,\n",
")\n",
@ -627,7 +626,6 @@
"\n",
" tasks = [\n",
" Task(classify_documents),\n",
" Task(check_permissions_on_dataset, user=user, permissions=[\"write\"]),\n",
" Task(\n",
" extract_chunks_from_documents, max_chunk_size=get_max_chunk_tokens()\n",
" ), # Extract text chunks based on the document type.\n",