feat: Add Docling as an ingestion option to cognee add.
This commit is contained in:
parent
74f7a65110
commit
8ef3bf6393
4 changed files with 1199 additions and 140 deletions
|
|
@ -27,6 +27,12 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
|
|||
|
||||
return await get_data_from_llama_index(data_item)
|
||||
|
||||
from docling_core.types import DoclingDocument
|
||||
|
||||
if isinstance(data_item, DoclingDocument):
|
||||
# Convert DoclingDocument to plain text and continue processing file
|
||||
data_item = data_item.export_to_text()
|
||||
|
||||
# data is a file object coming from upload.
|
||||
if hasattr(data_item, "file"):
|
||||
return await save_data_to_file(data_item.file, filename=data_item.filename)
|
||||
|
|
|
|||
59
cognee/tests/test_add_docling_document.py
Normal file
59
cognee/tests/test_add_docling_document.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
import asyncio
|
||||
import cognee
|
||||
|
||||
import os
|
||||
|
||||
# Provide your OpenAI LLM API key, either by exporting LLM_API_KEY before
# running this script or by replacing the empty default below.
# setdefault is used so that a key already present in the environment
# is not overwritten with an empty string.
os.environ.setdefault("LLM_API_KEY", "")
|
||||
|
||||
|
||||
async def main():
    """Add Docling-converted documents to cognee and check search answers.

    Converts a PDF, a PNG and a PPTX located in the ``test_data`` directory
    next to this file into Docling documents, adds each one to cognee, runs
    ``cognify`` and then asserts on the answers to three searches.
    """
    from pathlib import Path

    test_dir = Path(__file__).resolve().parent
    # Paths are listed in the order in which the documents are ingested.
    source_files = [
        os.path.join(test_dir, "test_data", file_name)
        for file_name in (
            "artificial-intelligence.pdf",
            "example_copy.png",
            "example.pptx",
        )
    ]

    # Prune cognee data and system state (including metadata) before adding.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # Imported here, after pruning, exactly where the conversion step begins.
    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    for source_file in source_files:
        # Each file is converted to a DoclingDocument and added right away.
        converted = converter.convert(source_file)
        await cognee.add(converted.document)

    await cognee.cognify()

    ai_answer = await cognee.search("Tell me about Artificial Intelligence.")
    assert len(ai_answer) > 0

    bulb_answer = await cognee.search("Do programmers change light bulbs?")
    assert len(bulb_answer) > 0
    bulb_text = bulb_answer[0].lower()
    assert any(word in bulb_text for word in ("no", "none"))

    colour_answer = await cognee.search("What colours are there in the presentation table?")
    assert len(colour_answer) > 0
    colour_text = colour_answer[0].lower()
    assert all(colour in colour_text for colour in ("red", "blue", "green"))
|
||||
|
||||
|
||||
# Script entry point: drive the async main() coroutine to completion when
# this file is executed directly rather than imported.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
|
|
@ -137,6 +137,8 @@ debug = ["debugpy>=1.8.9,<2.0.0"]
|
|||
|
||||
monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
|
||||
|
||||
docling = ["docling>=2.54", "transformers>=4.55"]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://www.cognee.ai"
|
||||
Repository = "https://github.com/topoteretes/cognee"
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue