Add DoclingDocument ingestion, a CI job that exercises it, and a `docling`
optional-dependency extra.

Changes in this patch:
  * .github/workflows/examples_tests.yml: new `test_docling_add` job on
    macos-15 that passes `extra-dependencies: 'docling'` to the cognee_setup
    action and runs cognee/tests/test_add_docling_document.py.
  * .github/workflows/test_s3_file_storage.yml: job id renamed from
    `test-gemini` to `test-s3-storage`.
  * cognee/tasks/ingestion/save_data_item_to_storage.py: when the type name of
    `data_item` contains "docling", docling_core is imported lazily and a
    DoclingDocument is replaced by its `export_to_text()` output ahead of the
    file-object check that follows.
  * cognee/tests/test_add_docling_document.py: new script that converts a PDF,
    a PNG and a PPTX with docling's DocumentConverter, adds each document,
    runs cognify and asserts on three search answers.
  * pyproject.toml: new `docling` extra.

NOTE(review): in the new test, `("no" in s) or ("none" in s)` is decided by the
first operand alone (any string containing "none" contains "no"), and "no" also
matches inside words such as "not" or "know"; confirm this is strict enough.
NOTE(review): the `docling` extra sets lower bounds only, while the `debug` and
`monitoring` extras next to it also cap the major version; confirm intended.

diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml
index 4eb9e184f..4eaaa0386 100644
--- a/.github/workflows/examples_tests.yml
+++ b/.github/workflows/examples_tests.yml
@@ -159,3 +159,28 @@ jobs:
           EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
           EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
         run: uv run python ./examples/python/permissions_example.py
+  test_docling_add:
+    name: Run Add with Docling Test
+    runs-on: macos-15
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Cognee Setup
+        uses: ./.github/actions/cognee_setup
+        with:
+          python-version: '3.11.x'
+          extra-dependencies: 'docling'
+
+      - name: Run Docling Test
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          LLM_MODEL: ${{ secrets.LLM_MODEL }}
+          LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
+          EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
+          EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
+          EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
+          EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
+        run: uv run python ./cognee/tests/test_add_docling_document.py
diff --git a/.github/workflows/test_s3_file_storage.yml b/.github/workflows/test_s3_file_storage.yml
index 04d140513..11c808a2d 100644
--- a/.github/workflows/test_s3_file_storage.yml
+++ b/.github/workflows/test_s3_file_storage.yml
@@ -11,7 +11,7 @@ env:
   ENV: 'dev'
 
 jobs:
-  test-gemini:
+  test-s3-storage:
     name: Run S3 File Storage Test
     runs-on: ubuntu-22.04
     steps:
diff --git a/cognee/tasks/ingestion/save_data_item_to_storage.py b/cognee/tasks/ingestion/save_data_item_to_storage.py
index 9df5e6e57..b6e1f7d00 100644
--- a/cognee/tasks/ingestion/save_data_item_to_storage.py
+++ b/cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -35,6 +35,12 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
 
         return await get_data_from_llama_index(data_item)
 
+    if "docling" in str(type(data_item)):
+        from docling_core.types import DoclingDocument
+
+        if isinstance(data_item, DoclingDocument):
+            data_item = data_item.export_to_text()
+
     # data is a file object coming from upload.
     if hasattr(data_item, "file"):
         return await save_data_to_file(data_item.file, filename=data_item.filename)
diff --git a/cognee/tests/test_add_docling_document.py b/cognee/tests/test_add_docling_document.py
new file mode 100644
index 000000000..2c82af66f
--- /dev/null
+++ b/cognee/tests/test_add_docling_document.py
@@ -0,0 +1,56 @@
+import asyncio
+import cognee
+
+import os
+
+
+async def main():
+    # Get file path to document to process
+    from pathlib import Path
+
+    current_directory = Path(__file__).resolve().parent
+    file_path_artificial = os.path.join(
+        current_directory, "test_data", "artificial-intelligence.pdf"
+    )
+    file_path_png = os.path.join(current_directory, "test_data", "example_copy.png")
+    file_path_pptx = os.path.join(current_directory, "test_data", "example.pptx")
+
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    # Import necessary converter, and convert file to DoclingDocument format
+    from docling.document_converter import DocumentConverter
+
+    converter = DocumentConverter()
+
+    result = converter.convert(file_path_artificial)
+    await cognee.add(result.document)
+
+    result = converter.convert(file_path_png)
+    await cognee.add(result.document)
+
+    result = converter.convert(file_path_pptx)
+    await cognee.add(result.document)
+
+    await cognee.cognify()
+
+    answer = await cognee.search("Tell me about Artificial Intelligence.")
+    assert len(answer) != 0
+
+    answer = await cognee.search("Do programmers change light bulbs?")
+    assert len(answer) != 0
+    lowercase_answer = answer[0].lower()
+    assert ("no" in lowercase_answer) or ("none" in lowercase_answer)
+
+    answer = await cognee.search("What colours are there in the presentation table?")
+    assert len(answer) != 0
+    lowercase_answer = answer[0].lower()
+    assert (
+        ("red" in lowercase_answer)
+        and ("blue" in lowercase_answer)
+        and ("green" in lowercase_answer)
+    )
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/pyproject.toml b/pyproject.toml
index 23c001d96..def87d2a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -143,6 +143,8 @@ debug = ["debugpy>=1.8.9,<2.0.0"]
 
 monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
 
+docling = ["docling>=2.54", "transformers>=4.55"]
+
 [project.urls]
 Homepage = "https://www.cognee.ai"
 Repository = "https://github.com/topoteretes/cognee"