From 7324564655c70dc4a3a038959283f3d697893f7e Mon Sep 17 00:00:00 2001 From: Leon Luithlen Date: Tue, 26 Nov 2024 16:30:25 +0100 Subject: [PATCH] Add metadata_id attribute to Document and DocumentChunk, make ingest_with_metadata default --- cognee/api/v1/add/add_v2.py | 5 ++--- cognee/modules/chunking/TextChunker.py | 3 +++ cognee/modules/data/processing/document_types/Document.py | 2 ++ cognee/tasks/documents/classify_documents.py | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cognee/api/v1/add/add_v2.py b/cognee/api/v1/add/add_v2.py index 9d6e33012..631d963e5 100644 --- a/cognee/api/v1/add/add_v2.py +++ b/cognee/api/v1/add/add_v2.py @@ -2,7 +2,7 @@ from typing import Union, BinaryIO from cognee.modules.users.models import User from cognee.modules.users.methods import get_default_user from cognee.modules.pipelines import run_tasks, Task -from cognee.tasks.ingestion import save_data_to_storage, ingest_data +from cognee.tasks.ingestion import ingest_data_with_metadata from cognee.infrastructure.databases.relational import create_db_and_tables as create_relational_db_and_tables from cognee.infrastructure.databases.vector.pgvector import create_db_and_tables as create_pgvector_db_and_tables @@ -14,8 +14,7 @@ async def add(data: Union[BinaryIO, list[BinaryIO], str, list[str]], dataset_nam user = await get_default_user() tasks = [ - Task(save_data_to_storage, dataset_name), - Task(ingest_data, dataset_name, user) + Task(ingest_data_with_metadata, dataset_name, user) ] pipeline = run_tasks(tasks, data, "add_pipeline") diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index f0a72b58a..24ed0b236 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -35,6 +35,7 @@ class TextChunker(): is_part_of = self.document, chunk_index = self.chunk_index, cut_type = chunk_data["cut_type"], + metadata_id = self.document.metadata_id ) paragraph_chunks = [] self.chunk_size = 0 @@ -48,6 +49,7 @@ class TextChunker(): is_part_of = self.document, chunk_index = self.chunk_index, cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"], + metadata_id = self.document.metadata_id ) except Exception as e: print(e) @@ -65,6 +67,7 @@ class TextChunker(): is_part_of = self.document, chunk_index = self.chunk_index, cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"], + metadata_id = self.document.metadata_id ) except Exception as e: print(e) diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 7d5545cfc..773fc30c8 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -1,9 +1,11 @@ from cognee.infrastructure.engine import DataPoint +from uuid import UUID class Document(DataPoint): type: str name: str raw_data_location: str + metadata_id: UUID def read(self, chunk_size: int) -> str: pass diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index 8ee87bcad..599b74e17 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -45,6 +45,7 @@ def classify_documents(data_documents: list[Data]) -> list[Document]: title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location, name=data_item.name, + metadata_id=data_item.metadata_id ) for data_item in data_documents ]