diff --git a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py index 27061ce45..f0a40ca36 100644 --- a/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +++ b/cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py @@ -95,7 +95,7 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine): return await self.embed_text(text) - except (litellm.exceptions.BadRequestError, litellm.llms.OpenAI.openai.OpenAIError): + except litellm.exceptions.BadRequestError: raise EmbeddingException("Failed to index data points.") except Exception as error: diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 78c02b9c9..cd71dd128 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -54,7 +54,6 @@ class TextChunker: contains=[], _metadata={ "index_fields": ["text"], - "metadata_id": self.document.metadata_id, }, ) paragraph_chunks = [] @@ -74,7 +73,6 @@ class TextChunker: contains=[], _metadata={ "index_fields": ["text"], - "metadata_id": self.document.metadata_id, }, ) except Exception as e: @@ -95,7 +93,7 @@ class TextChunker: chunk_index=self.chunk_index, cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"], contains=[], - _metadata={"index_fields": ["text"], "metadata_id": self.document.metadata_id}, + _metadata={"index_fields": ["text"]}, ) except Exception as e: print(e) diff --git a/cognee/modules/data/models/Data.py b/cognee/modules/data/models/Data.py index cf8918db7..285a4054c 100644 --- a/cognee/modules/data/models/Data.py +++ b/cognee/modules/data/models/Data.py @@ -1,13 +1,11 @@ from datetime import datetime, timezone -from typing import List from uuid import uuid4 -from sqlalchemy import UUID, Column, DateTime, String -from sqlalchemy.orm import Mapped, relationship +from sqlalchemy import UUID, Column, DateTime, String, JSON +from sqlalchemy.orm import relationship from cognee.infrastructure.databases.relational import Base from .DatasetData import DatasetData -from .Metadata import Metadata class Data(Base): @@ -21,6 +19,7 @@ class Data(Base): raw_data_location = Column(String) owner_id = Column(UUID, index=True) content_hash = Column(String) + foreign_metadata = Column(JSON) created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) @@ -32,13 +31,6 @@ class Data(Base): cascade="all, delete", ) - metadata_relationship = relationship( - "Metadata", - back_populates="data", - lazy="noload", - cascade="all, delete", - ) - def to_json(self) -> dict: return { "id": str(self.id), diff --git a/cognee/modules/data/models/Metadata.py b/cognee/modules/data/models/Metadata.py deleted file mode 100644 index ab41d94be..000000000 --- a/cognee/modules/data/models/Metadata.py +++ /dev/null @@ -1,21 +0,0 @@ -from datetime import datetime, timezone -from uuid import uuid4 - -from sqlalchemy import UUID, Column, DateTime, String, ForeignKey -from sqlalchemy.orm import relationship - -from cognee.infrastructure.databases.relational import Base - - -class Metadata(Base): - __tablename__ = "metadata_table" - - id = Column(UUID, primary_key=True, default=uuid4) - metadata_repr = Column(String) - metadata_source = Column(String) - - created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)) - updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc)) - - data_id = Column(UUID, ForeignKey("data.id", ondelete="CASCADE"), primary_key=False) - data = relationship("Data", back_populates="metadata_relationship") diff --git a/cognee/modules/data/operations/delete_metadata.py b/cognee/modules/data/operations/delete_metadata.py deleted file mode 100644 index df94f52ed..000000000 --- a/cognee/modules/data/operations/delete_metadata.py +++ /dev/null @@ -1,19 +0,0 @@ -import warnings -from uuid import UUID - -from sqlalchemy import select - -from cognee.infrastructure.databases.relational import get_relational_engine - -from ..models.Metadata import Metadata - - -async def delete_metadata(metadata_id: UUID): - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - metadata = await session.get(Metadata, metadata_id) - if metadata is None: - warnings.warn(f"metadata for metadata_id: {metadata_id} not found") - - session.delete(metadata) - session.commit() diff --git a/cognee/modules/data/operations/get_metadata.py b/cognee/modules/data/operations/get_metadata.py deleted file mode 100644 index f827c47c3..000000000 --- a/cognee/modules/data/operations/get_metadata.py +++ /dev/null @@ -1,17 +0,0 @@ -import json -from uuid import UUID - -from sqlalchemy import select - -from cognee.infrastructure.databases.relational import get_relational_engine - -from ..models.Metadata import Metadata - - -async def get_metadata(metadata_id: UUID) -> Metadata: - db_engine = get_relational_engine() - - async with db_engine.get_async_session() as session: - metadata = await session.get(Metadata, metadata_id) - - return metadata diff --git a/cognee/modules/data/operations/write_metadata.py b/cognee/modules/data/operations/write_metadata.py deleted file mode 100644 index 3c2c839c2..000000000 --- a/cognee/modules/data/operations/write_metadata.py +++ /dev/null @@ -1,65 +0,0 @@ -import inspect -import json -import re -import warnings -from uuid import UUID -from sqlalchemy import select -from typing import Any, BinaryIO, Union - -from cognee.infrastructure.databases.relational import get_relational_engine -from cognee.infrastructure.files.utils.get_file_metadata import FileMetadata -from ..models.Metadata import Metadata - - -async def write_metadata( - data_item: Union[BinaryIO, str, Any], data_id: UUID, file_metadata: FileMetadata -) -> UUID: - metadata_dict = get_metadata_dict(data_item, file_metadata) - db_engine = get_relational_engine() - async with db_engine.get_async_session() as session: - metadata = ( - await session.execute(select(Metadata).filter(Metadata.data_id == data_id)) - ).scalar_one_or_none() - - if metadata is not None: - metadata.metadata_repr = json.dumps(metadata_dict) - metadata.metadata_source = parse_type(type(data_item)) - await session.merge(metadata) - else: - metadata = Metadata( - id=data_id, - metadata_repr=json.dumps(metadata_dict), - metadata_source=parse_type(type(data_item)), - data_id=data_id, - ) - session.add(metadata) - - await session.commit() - - -def parse_type(type_: Any) -> str: - pattern = r".+'([\w_\.]+)'" - match = re.search(pattern, str(type_)) - if match: - return match.group(1) - else: - raise Exception(f"type: {type_} could not be parsed") - - -def get_metadata_dict( - data_item: Union[BinaryIO, str, Any], file_metadata: FileMetadata -) -> dict[str, Any]: - if isinstance(data_item, str): - return file_metadata - elif isinstance(data_item, BinaryIO): - return file_metadata - elif hasattr(data_item, "dict") and inspect.ismethod(getattr(data_item, "dict")): - return {**file_metadata, **data_item.dict()} - else: - warnings.warn( - f"metadata of type {type(data_item)}: {str(data_item)[:20]}... does not have dict method. Defaulting to string method" - ) - try: - return {**dict(file_metadata), "content": str(data_item)} - except Exception as e: - raise Exception(f"Could not cast metadata to string: {e}") diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 7ecdf289e..7b464acef 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -7,7 +7,6 @@ from cognee.infrastructure.engine import DataPoint class Document(DataPoint): name: str raw_data_location: str - metadata_id: UUID mime_type: str _metadata: dict = {"index_fields": ["name"], "type": "Document"} diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index 118da5738..e86bc47e5 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -7,7 +7,6 @@ from cognee.modules.data.processing.document_types import ( TextDocument, UnstructuredDocument, ) -from cognee.modules.data.operations.get_metadata import get_metadata EXTENSION_TO_DOCUMENT_CLASS = { "pdf": PdfDocument, # Text documents @@ -52,14 +51,12 @@ EXTENSION_TO_DOCUMENT_CLASS = { async def classify_documents(data_documents: list[Data]) -> list[Document]: documents = [] for data_item in data_documents: - metadata = await get_metadata(data_item.id) document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension]( id=data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location, name=data_item.name, mime_type=data_item.mime_type, - metadata_id=metadata.id, ) documents.append(document) diff --git a/cognee/tasks/ingestion/ingest_data.py b/cognee/tasks/ingestion/ingest_data.py index 2306aa2de..7b63f7b3f 100644 --- a/cognee/tasks/ingestion/ingest_data.py +++ b/cognee/tasks/ingestion/ingest_data.py @@ -8,12 +8,15 @@ from cognee.modules.data.models.DatasetData import DatasetData from cognee.modules.users.models import User from cognee.modules.users.permissions.methods import give_permission_on_document from cognee.shared.utils import send_telemetry -from cognee.modules.data.operations.write_metadata import write_metadata from .get_dlt_destination import get_dlt_destination from .save_data_item_to_storage import ( save_data_item_to_storage, ) +from typing import Union, BinaryIO +import inspect +import warnings + async def ingest_data(data: Any, dataset_name: str, user: User): destination = get_dlt_destination() @@ -23,6 +26,15 @@ async def ingest_data(data: Any, dataset_name: str, user: User): destination=destination, ) + def get_foreign_metadata_dict(data_item: Union[BinaryIO, str, Any]) -> dict[str, Any]: + if hasattr(data_item, "dict") and inspect.ismethod(getattr(data_item, "dict")): + return {"metadata": data_item.dict(), "origin": str(type(data_item))} + else: + warnings.warn( + f"Data of type {type(data_item)}... does not have dict method. Returning empty metadata." + ) + return {} + @dlt.resource(standalone=True, primary_key="id", merge_key="id") async def data_resources(file_paths: List[str], user: User): for file_path in file_paths: @@ -83,6 +95,7 @@ async def ingest_data(data: Any, dataset_name: str, user: User): data_point.mime_type = file_metadata["mime_type"] data_point.owner_id = user.id data_point.content_hash = file_metadata["content_hash"] + data_point.foreign_metadata = (get_foreign_metadata_dict(data_item),) await session.merge(data_point) else: data_point = Data( @@ -93,6 +106,7 @@ async def ingest_data(data: Any, dataset_name: str, user: User): mime_type=file_metadata["mime_type"], owner_id=user.id, content_hash=file_metadata["content_hash"], + foreign_metadata=get_foreign_metadata_dict(data_item), ) # Check if data is already in dataset @@ -108,7 +122,6 @@ async def ingest_data(data: Any, dataset_name: str, user: User): dataset.data.append(data_point) await session.commit() - await write_metadata(data_item, data_point.id, file_metadata) await give_permission_on_document(user, data_id, "read") await give_permission_on_document(user, data_id, "write") diff --git a/cognee/tests/integration/documents/AudioDocument_test.py b/cognee/tests/integration/documents/AudioDocument_test.py index e07a2431b..2903844fe 100644 --- a/cognee/tests/integration/documents/AudioDocument_test.py +++ b/cognee/tests/integration/documents/AudioDocument_test.py @@ -29,7 +29,6 @@ def test_AudioDocument(): id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", - metadata_id=uuid.uuid4(), mime_type="", ) with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT): diff --git a/cognee/tests/integration/documents/ImageDocument_test.py b/cognee/tests/integration/documents/ImageDocument_test.py index b8d585419..8bac287f2 100644 --- a/cognee/tests/integration/documents/ImageDocument_test.py +++ b/cognee/tests/integration/documents/ImageDocument_test.py @@ -18,7 +18,6 @@ def test_ImageDocument(): id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", - metadata_id=uuid.uuid4(), mime_type="", ) with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT): diff --git a/cognee/tests/integration/documents/PdfDocument_test.py b/cognee/tests/integration/documents/PdfDocument_test.py index fc4307846..4f2eeb58f 100644 --- a/cognee/tests/integration/documents/PdfDocument_test.py +++ b/cognee/tests/integration/documents/PdfDocument_test.py @@ -20,7 +20,6 @@ def test_PdfDocument(): id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, - metadata_id=uuid.uuid4(), mime_type="", ) diff --git a/cognee/tests/integration/documents/TextDocument_test.py b/cognee/tests/integration/documents/TextDocument_test.py index 6daec62b7..51e2aabcf 100644 --- a/cognee/tests/integration/documents/TextDocument_test.py +++ b/cognee/tests/integration/documents/TextDocument_test.py @@ -32,7 +32,6 @@ def test_TextDocument(input_file, chunk_size): id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, - metadata_id=uuid.uuid4(), mime_type="", ) diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index 773dc2293..3dea34a6a 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -39,7 +39,6 @@ def test_UnstructuredDocument(): id=uuid.uuid4(), name="example.pptx", raw_data_location=pptx_file_path, - metadata_id=uuid.uuid4(), mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation", ) @@ -47,7 +46,6 @@ def test_UnstructuredDocument(): id=uuid.uuid4(), name="example.docx", raw_data_location=docx_file_path, - metadata_id=uuid.uuid4(), mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", ) @@ -55,7 +53,6 @@ def test_UnstructuredDocument(): id=uuid.uuid4(), name="example.csv", raw_data_location=csv_file_path, - metadata_id=uuid.uuid4(), mime_type="text/csv", ) @@ -63,7 +60,6 @@ def test_UnstructuredDocument(): id=uuid.uuid4(), name="example.xlsx", raw_data_location=xlsx_file_path, - metadata_id=uuid.uuid4(), mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", )