From 7324564655c70dc4a3a038959283f3d697893f7e Mon Sep 17 00:00:00 2001
From: Leon Luithlen <leontimnaluithlen@gmail.com>
Date: Tue, 26 Nov 2024 16:30:25 +0100
Subject: [PATCH] Add metadata_id attribute to Document and DocumentChunk, make
 ingest_data_with_metadata default

---
 cognee/api/v1/add/add_v2.py                               | 5 ++---
 cognee/modules/chunking/TextChunker.py                    | 3 +++
 cognee/modules/data/processing/document_types/Document.py | 2 ++
 cognee/tasks/documents/classify_documents.py              | 1 +
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/cognee/api/v1/add/add_v2.py b/cognee/api/v1/add/add_v2.py
index 9d6e33012..631d963e5 100644
--- a/cognee/api/v1/add/add_v2.py
+++ b/cognee/api/v1/add/add_v2.py
@@ -2,7 +2,7 @@ from typing import Union, BinaryIO
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines import run_tasks, Task
-from cognee.tasks.ingestion import save_data_to_storage, ingest_data
+from cognee.tasks.ingestion import ingest_data_with_metadata
 from cognee.infrastructure.databases.relational import create_db_and_tables as create_relational_db_and_tables
 from cognee.infrastructure.databases.vector.pgvector import create_db_and_tables as create_pgvector_db_and_tables
 
@@ -14,8 +14,7 @@ async def add(data: Union[BinaryIO, list[BinaryIO], str, list[str]], dataset_nam
         user = await get_default_user()
 
     tasks = [
-        Task(save_data_to_storage, dataset_name),
-        Task(ingest_data, dataset_name, user)
+        Task(ingest_data_with_metadata, dataset_name, user)
     ]
 
     pipeline = run_tasks(tasks, data, "add_pipeline")
diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py
index f0a72b58a..24ed0b236 100644
--- a/cognee/modules/chunking/TextChunker.py
+++ b/cognee/modules/chunking/TextChunker.py
@@ -35,6 +35,7 @@ class TextChunker():
                             is_part_of = self.document,
                             chunk_index = self.chunk_index,
                             cut_type = chunk_data["cut_type"],
+                            metadata_id = self.document.metadata_id
                         )
                         paragraph_chunks = []
                         self.chunk_size = 0
@@ -48,6 +49,7 @@ class TextChunker():
                                 is_part_of = self.document,
                                 chunk_index = self.chunk_index,
                                 cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
+                                metadata_id = self.document.metadata_id
                             )
                         except Exception as e:
                             print(e)
@@ -65,6 +67,7 @@ class TextChunker():
                     is_part_of = self.document,
                     chunk_index = self.chunk_index,
                     cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
+                    metadata_id = self.document.metadata_id
                 )
             except Exception as e:
                 print(e)
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index 7d5545cfc..773fc30c8 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -1,9 +1,11 @@
 from cognee.infrastructure.engine import DataPoint
+from uuid import UUID
 
 class Document(DataPoint):
     type: str
     name: str
     raw_data_location: str
+    metadata_id: UUID
 
     def read(self, chunk_size: int) -> str:
         pass
diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py
index 8ee87bcad..599b74e17 100644
--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@@ -45,6 +45,7 @@ def classify_documents(data_documents: list[Data]) -> list[Document]:
             title=f"{data_item.name}.{data_item.extension}",
             raw_data_location=data_item.raw_data_location,
             name=data_item.name,
+            metadata_id=data_item.metadata_id
         )
         for data_item in data_documents
     ]