cognee/cognee/tasks/documents/classify_documents.py
Vasilije bb7eaa017b
feat: Group DataPoints into NodeSets (#680)
<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
Co-authored-by: Boris <boris@topoteretes.com>
Co-authored-by: Boris Arzentar <borisarzentar@gmail.com>
2025-04-19 20:21:04 +02:00

98 lines
2.9 KiB
Python

from cognee.modules.data.models import Data
import json
from cognee.modules.data.processing.document_types import (
Document,
PdfDocument,
AudioDocument,
ImageDocument,
TextDocument,
UnstructuredDocument,
)
from cognee.modules.engine.models.node_set import NodeSet
from cognee.modules.engine.utils.generate_node_id import generate_node_id
EXTENSION_TO_DOCUMENT_CLASS = {
"pdf": PdfDocument, # Text documents
"txt": TextDocument,
"docx": UnstructuredDocument,
"doc": UnstructuredDocument,
"odt": UnstructuredDocument,
"xls": UnstructuredDocument,
"xlsx": UnstructuredDocument,
"ppt": UnstructuredDocument,
"pptx": UnstructuredDocument,
"odp": UnstructuredDocument,
"ods": UnstructuredDocument,
"png": ImageDocument, # Image documents
"dwg": ImageDocument,
"xcf": ImageDocument,
"jpg": ImageDocument,
"jpx": ImageDocument,
"apng": ImageDocument,
"gif": ImageDocument,
"webp": ImageDocument,
"cr2": ImageDocument,
"tif": ImageDocument,
"bmp": ImageDocument,
"jxr": ImageDocument,
"psd": ImageDocument,
"ico": ImageDocument,
"heic": ImageDocument,
"avif": ImageDocument,
"aac": AudioDocument, # Audio documents
"mid": AudioDocument,
"mp3": AudioDocument,
"m4a": AudioDocument,
"ogg": AudioDocument,
"flac": AudioDocument,
"wav": AudioDocument,
"amr": AudioDocument,
"aiff": AudioDocument,
}
def update_node_set(document):
"""Extracts node_set from document's external_metadata."""
try:
external_metadata = json.loads(document.external_metadata)
except json.JSONDecodeError:
return
if not isinstance(external_metadata, dict):
return
if "node_set" not in external_metadata:
return
node_set = external_metadata["node_set"]
if not isinstance(node_set, list):
return
document.belongs_to_set = [
NodeSet(id=generate_node_id(f"NodeSet:{node_set_name}"), name=node_set_name)
for node_set_name in node_set
]
async def classify_documents(data_documents: list[Data]) -> list[Document]:
"""
Classifies a list of data items into specific document types based on file extensions.
Notes:
- The function relies on `get_metadata` to retrieve metadata information for each data item.
- Ensure the `Data` objects and their attributes (e.g., `extension`, `id`) are valid before calling this function.
"""
documents = []
for data_item in data_documents:
document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
id=data_item.id,
title=f"{data_item.name}.{data_item.extension}",
raw_data_location=data_item.raw_data_location,
name=data_item.name,
mime_type=data_item.mime_type,
external_metadata=json.dumps(data_item.external_metadata, indent=4),
)
update_node_set(document)
documents.append(document)
return documents