<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
127 lines
3.9 KiB
Python
127 lines
3.9 KiB
Python
from cognee.modules.data.models import Data
|
|
import json
|
|
from cognee.modules.data.processing.document_types import (
|
|
Document,
|
|
PdfDocument,
|
|
AudioDocument,
|
|
ImageDocument,
|
|
TextDocument,
|
|
UnstructuredDocument,
|
|
)
|
|
from cognee.modules.engine.models.node_set import NodeSet
|
|
from cognee.modules.engine.utils.generate_node_id import generate_node_id
|
|
|
|
# Lookup table from a file extension (written without a leading dot) to the
# document class used to represent files of that kind.
EXTENSION_TO_DOCUMENT_CLASS = {
    # Text documents
    "pdf": PdfDocument,
    "txt": TextDocument,
    **dict.fromkeys(
        ("docx", "doc", "odt", "xls", "xlsx", "ppt", "pptx", "odp", "ods"),
        UnstructuredDocument,
    ),
    # Image documents
    **dict.fromkeys(
        (
            "png",
            "dwg",
            "xcf",
            "jpg",
            "jpx",
            "apng",
            "gif",
            "webp",
            "cr2",
            "tif",
            "bmp",
            "jxr",
            "psd",
            "ico",
            "heic",
            "avif",
        ),
        ImageDocument,
    ),
    # Audio documents
    **dict.fromkeys(
        ("aac", "mid", "mp3", "m4a", "ogg", "flac", "wav", "amr", "aiff"),
        AudioDocument,
    ),
}
|
|
|
|
|
|
def update_node_set(document) -> None:
    """
    Populate a document's belongs_to_set from its external_metadata.

    The document's external_metadata is decoded as JSON. When the decoded value is a
    dictionary whose 'node_set' entry is a list, every element of that list is turned into
    a NodeSet (with an id derived from "NodeSet:<name>") and the resulting list is assigned
    to document.belongs_to_set. In every other case (the metadata is not a JSON string,
    is not valid JSON, is not a dictionary, has no 'node_set' key, or 'node_set' is not a
    list) the document is left untouched and the function returns early.

    Parameters:
    -----------

        - document: The document object which contains external_metadata from which the
          node_set will be extracted. It is modified in place.
    """
    try:
        external_metadata = json.loads(document.external_metadata)
    except (json.JSONDecodeError, TypeError):
        # json.loads raises TypeError (not JSONDecodeError) when handed something that is
        # not str/bytes/bytearray, e.g. None. Treat that the same as unparsable metadata.
        return

    if not isinstance(external_metadata, dict):
        return

    # A missing key yields None, which fails the list check just like any other non-list.
    node_set = external_metadata.get("node_set")
    if not isinstance(node_set, list):
        return

    document.belongs_to_set = [
        NodeSet(id=generate_node_id(f"NodeSet:{node_set_name}"), name=node_set_name)
        for node_set_name in node_set
    ]
|
|
|
|
|
|
async def classify_documents(data_documents: list[Data]) -> list[Document]:
    """
    Turn each data item into a document object chosen by its file extension.

    For every item, the document class is looked up in EXTENSION_TO_DOCUMENT_CLASS using
    the item's extension and instantiated with the item's id, name, mime type, raw data
    location, a "<name>.<extension>" title, and the item's external_metadata serialized
    to an indented JSON string. `update_node_set` is then applied to the new document so
    that any node set named in that metadata is attached to it.

    Parameters:
    -----------

        - data_documents (list[Data]): A list of Data objects representing the documents to
          be classified.

    Returns:
    --------

        - list[Document]: The created documents, in the same order as the input items.

    Raises:
    -------

        - KeyError: If an item's extension has no entry in EXTENSION_TO_DOCUMENT_CLASS.
    """
    classified = []

    for item in data_documents:
        document_class = EXTENSION_TO_DOCUMENT_CLASS[item.extension]

        constructor_arguments = {
            "id": item.id,
            "title": f"{item.name}.{item.extension}",
            "raw_data_location": item.raw_data_location,
            "name": item.name,
            "mime_type": item.mime_type,
            "external_metadata": json.dumps(item.external_metadata, indent=4),
        }
        classified_document = document_class(**constructor_arguments)

        # Attach node sets described in the serialized metadata, if any.
        update_node_set(classified_document)
        classified.append(classified_document)

    return classified
|