Merge branch 'main' into merge-main-vol7
Commit 59f8d12fa3
7 changed files with 19 additions and 58 deletions
@@ -20,7 +20,6 @@ from cognee.modules.ontology.get_default_ontology_resolver import (
 from cognee.modules.users.models import User
 
 from cognee.tasks.documents import (
-    check_permissions_on_dataset,
     classify_documents,
     extract_chunks_from_documents,
 )
@@ -79,12 +78,11 @@ async def cognify(
 
     Processing Pipeline:
     1. **Document Classification**: Identifies document types and structures
-    2. **Permission Validation**: Ensures user has processing rights
-    3. **Text Chunking**: Breaks content into semantically meaningful segments
-    4. **Entity Extraction**: Identifies key concepts, people, places, organizations
-    5. **Relationship Detection**: Discovers connections between entities
-    6. **Graph Construction**: Builds semantic knowledge graph with embeddings
-    7. **Content Summarization**: Creates hierarchical summaries for navigation
+    2. **Text Chunking**: Breaks content into semantically meaningful segments
+    3. **Entity Extraction**: Identifies key concepts, people, places, organizations
+    4. **Relationship Detection**: Discovers connections between entities
+    5. **Graph Construction**: Builds semantic knowledge graph with embeddings
+    6. **Content Summarization**: Creates hierarchical summaries for navigation
 
     Graph Model Customization:
     The `graph_model` parameter allows custom knowledge structures:
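For illustration, a minimal sketch of the `graph_model` customization this docstring describes; the `DataPoint` import path and the exact `cognify` call are assumptions rather than part of this commit:

from typing import List

import cognee
from cognee.infrastructure.engine import DataPoint  # assumed import path


class Person(DataPoint):
    name: str


class Organization(DataPoint):
    name: str
    employs: List[Person] = []


async def run_with_custom_graph():
    # Hypothetical sample text; any string to ingest works here.
    await cognee.add("Ada Lovelace collaborated with Charles Babbage.")
    # Pass the custom structure instead of the default KnowledgeGraph model.
    await cognee.cognify(graph_model=Organization)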
@@ -278,7 +276,6 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
 
     default_tasks = [
         Task(classify_documents),
-        Task(check_permissions_on_dataset, user=user, permissions=["write"]),
         Task(
             extract_chunks_from_documents,
             max_chunk_size=chunk_size or get_max_chunk_tokens(),
@@ -313,14 +310,13 @@ async def get_temporal_tasks(
 
     The pipeline includes:
     1. Document classification.
-    2. Dataset permission checks (requires "write" access).
-    3. Document chunking with a specified or default chunk size.
-    4. Event and timestamp extraction from chunks.
-    5. Knowledge graph extraction from events.
-    6. Batched insertion of data points.
+    2. Document chunking with a specified or default chunk size.
+    3. Event and timestamp extraction from chunks.
+    4. Knowledge graph extraction from events.
+    5. Batched insertion of data points.
 
     Args:
-        user (User, optional): The user requesting task execution, used for permission checks.
+        user (User, optional): The user requesting task execution.
         chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker.
         chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default.
         chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify
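For illustration, a caller-side sketch of building this temporal pipeline after the change; the parameter names come from the docstring above, while the import paths and literal values are assumptions:

from cognee.api.v1.cognify.cognify import get_temporal_tasks  # assumed module path
from cognee.modules.chunking.TextChunker import TextChunker  # assumed import path


async def build_temporal_pipeline(user=None):
    # After this commit the pipeline starts with classification and goes
    # straight to chunking; no permission-check task is inserted.
    return await get_temporal_tasks(
        user=user,
        chunker=TextChunker,
        chunk_size=1024,      # hypothetical value; falls back to get_max_chunk_tokens()
        chunks_per_batch=50,  # hypothetical value
    )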
@@ -333,7 +329,6 @@ async def get_temporal_tasks(
 
     temporal_tasks = [
         Task(classify_documents),
-        Task(check_permissions_on_dataset, user=user, permissions=["write"]),
         Task(
             extract_chunks_from_documents,
             max_chunk_size=chunk_size or get_max_chunk_tokens(),
@@ -8,7 +8,6 @@ from cognee.modules.users.models import User
 from cognee.shared.data_models import KnowledgeGraph
 from cognee.shared.utils import send_telemetry
 from cognee.tasks.documents import (
-    check_permissions_on_dataset,
     classify_documents,
     extract_chunks_from_documents,
 )
@@ -31,7 +30,6 @@ async def get_cascade_graph_tasks(
     cognee_config = get_cognify_config()
     default_tasks = [
         Task(classify_documents),
-        Task(check_permissions_on_dataset, user=user, permissions=["write"]),
         Task(
             extract_chunks_from_documents, max_chunk_tokens=get_max_chunk_tokens()
         ), # Extract text chunks based on the document type.
@@ -30,8 +30,8 @@ async def get_no_summary_tasks(
     ontology_file_path=None,
 ) -> List[Task]:
     """Returns default tasks without summarization tasks."""
-    # Get base tasks (0=classify, 1=check_permissions, 2=extract_chunks)
-    base_tasks = await get_default_tasks_by_indices([0, 1, 2], chunk_size, chunker)
+    # Get base tasks (0=classify, 1=extract_chunks)
+    base_tasks = await get_default_tasks_by_indices([0, 1], chunk_size, chunker)
 
     ontology_adapter = RDFLibOntologyResolver(ontology_file=ontology_file_path)
 
@@ -51,8 +51,8 @@ async def get_just_chunks_tasks(
     chunk_size: int = None, chunker=TextChunker, user=None
 ) -> List[Task]:
     """Returns default tasks with only chunk extraction and data points addition."""
-    # Get base tasks (0=classify, 1=check_permissions, 2=extract_chunks)
-    base_tasks = await get_default_tasks_by_indices([0, 1, 2], chunk_size, chunker)
+    # Get base tasks (0=classify, 1=extract_chunks)
+    base_tasks = await get_default_tasks_by_indices([0, 1], chunk_size, chunker)
 
     add_data_points_task = Task(add_data_points, task_config={"batch_size": 10})
 
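With the permission task gone, the base-task indices these helpers select from shift down by one, so index 1 now refers to chunk extraction. A sketch of the updated lookup, reusing only the calls shown above (the wrapper function itself is hypothetical):

async def build_chunks_only_pipeline(chunk_size=None, chunker=TextChunker):
    # Before this commit the indices were 0=classify, 1=check_permissions,
    # 2=extract_chunks; chunk extraction now sits at index 1.
    base_tasks = await get_default_tasks_by_indices([0, 1], chunk_size, chunker)
    add_data_points_task = Task(add_data_points, task_config={"batch_size": 10})
    return [*base_tasks, add_data_points_task]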
@@ -1,3 +1,2 @@
 from .classify_documents import classify_documents
 from .extract_chunks_from_documents import extract_chunks_from_documents
-from .check_permissions_on_dataset import check_permissions_on_dataset
@@ -1,26 +0,0 @@
-from cognee.modules.data.processing.document_types import Document
-from cognee.modules.users.permissions.methods import check_permission_on_dataset
-from typing import List
-
-
-async def check_permissions_on_dataset(
-    documents: List[Document], context: dict, user, permissions
-) -> List[Document]:
-    """
-    Validates a user's permissions on a list of documents.
-
-    Notes:
-    - This function assumes that `check_permission_on_documents` raises an exception if the permission check fails.
-    - It is designed to validate multiple permissions in a sequential manner for the same set of documents.
-    - Ensure that the `Document` and `user` objects conform to the expected structure and interfaces.
-    """
-
-    for permission in permissions:
-        await check_permission_on_dataset(
-            user,
-            permission,
-            # TODO: pass dataset through argument instead of context
-            context["dataset"].id,
-        )
-
-    return documents
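The deleted module was only a thin task wrapper; its imports show that the underlying `check_permission_on_dataset` method comes from `cognee.modules.users.permissions.methods`. If an explicit dataset permission gate is still wanted after this removal, a minimal sketch that mirrors the deleted body (whether that method keeps the (user, permission, dataset_id) call shown above is an assumption):

from typing import List

from cognee.modules.users.permissions.methods import check_permission_on_dataset


async def ensure_dataset_permissions(user, dataset_id, permissions: List[str]) -> None:
    # Mirrors the deleted task body: each check is expected to raise if the
    # user lacks the given permission on the dataset.
    for permission in permissions:
        await check_permission_on_dataset(user, permission, dataset_id)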
@@ -32,16 +32,13 @@ async def main():
     print("Cognify process steps:")
     print("1. Classifying the document: Determining the type and category of the input text.")
     print(
-        "2. Checking permissions: Ensuring the user has the necessary rights to process the text."
+        "2. Extracting text chunks: Breaking down the text into sentences or phrases for analysis."
     )
     print(
-        "3. Extracting text chunks: Breaking down the text into sentences or phrases for analysis."
+        "3. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph."
     )
-    print("4. Adding data points: Storing the extracted chunks for processing.")
-    print(
-        "5. Generating knowledge graph: Extracting entities and relationships to form a knowledge graph."
-    )
-    print("6. Summarizing text: Creating concise summaries of the content for quick insights.\n")
+    print("4. Summarizing text: Creating concise summaries of the content for quick insights.")
+    print("5. Adding data points: Storing the extracted chunks for processing.\n")
 
     # Use LLMs and cognee to create knowledge graph
     await cognee.cognify()
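For illustration, a minimal end-to-end sketch matching the five steps this example now prints; the sample text is hypothetical and `cognee.add` is assumed to be the ingestion call that pairs with the `cognee.cognify()` call shown above:

import asyncio

import cognee


async def main():
    # Hypothetical sample text; any string to ingest works here.
    await cognee.add("Cognee turns documents into a queryable knowledge graph.")

    # Classification, chunking, knowledge-graph extraction, summarization,
    # and data-point insertion -- the five steps listed above.
    await cognee.cognify()


if __name__ == "__main__":
    asyncio.run(main())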
notebooks/cognee_demo.ipynb (vendored, 4 changed lines)
@@ -591,7 +591,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "7c431fdef4921ae0",
    "metadata": {
     "ExecuteTime": {
@@ -609,7 +609,6 @@
    "from cognee.modules.pipelines import run_tasks\n",
    "from cognee.modules.users.models import User\n",
    "from cognee.tasks.documents import (\n",
-   "    check_permissions_on_dataset,\n",
    "    classify_documents,\n",
    "    extract_chunks_from_documents,\n",
    ")\n",
@@ -627,7 +626,6 @@
    "\n",
    "    tasks = [\n",
    "        Task(classify_documents),\n",
-   "        Task(check_permissions_on_dataset, user=user, permissions=[\"write\"]),\n",
    "        Task(\n",
    "            extract_chunks_from_documents, max_chunk_size=get_max_chunk_tokens()\n",
    "        ), # Extract text chunks based on the document type.\n",