feat: Add Docling as an ingestion option to cognee add. (#1484)

<!-- .github/pull_request_template.md -->

## Description
<!--
Please provide a clear, human-generated description of the changes in
this PR.
DO NOT use AI-generated descriptions. We want to understand your thought
process and reasoning.
-->
We are adding support for Docling documents to cognee add. I did this
the simplest way possible: the `DoclingDocument` is converted to text
early, and that text is then sent on for further processing in cognee.
This avoids adding a lot of Docling-specific code such as loaders, file
types, and classes, while giving us (and our users) the opportunity to
handle more file types in our pipeline.

## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [x] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the
issue/feature**
- [x] My code follows the project's coding standards and style
guidelines
- [x] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been
submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
Vasilije 2025-10-12 13:21:41 +02:00 committed by GitHub
commit 1512375e52
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 90 additions and 1 deletions

View file

@ -159,3 +159,28 @@ jobs:
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
run: uv run python ./examples/python/permissions_example.py
# CI job that runs cognee/tests/test_add_docling_document.py on a macOS 15 runner.
test_docling_add:
name: Run Add with Docling Test
runs-on: macos-15
steps:
- name: Check out repository
uses: actions/checkout@v4
# Local composite action; its behaviour is defined in .github/actions/cognee_setup.
- name: Cognee Setup
uses: ./.github/actions/cognee_setup
with:
python-version: '3.11.x'
# Presumably installs the `docling` optional-dependency group -- confirm in the setup action.
extra-dependencies: 'docling'
- name: Run Docling Test
# LLM and embedding settings are supplied from repository secrets.
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
# The test is a plain script (it has its own __main__ guard), so it is run with python directly.
run: uv run python ./cognee/tests/test_add_docling_document.py

View file

@ -11,7 +11,7 @@ env:
ENV: 'dev'
jobs:
test-gemini:
test-s3-storage:
name: Run S3 File Storage Test
runs-on: ubuntu-22.04
steps:

View file

@ -35,6 +35,12 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
return await get_data_from_llama_index(data_item)
if "docling" in str(type(data_item)):
from docling_core.types import DoclingDocument
if isinstance(data_item, DoclingDocument):
data_item = data_item.export_to_text()
# data is a file object coming from upload.
if hasattr(data_item, "file"):
return await save_data_to_file(data_item.file, filename=data_item.filename)

View file

@ -0,0 +1,56 @@
import asyncio
import cognee
import os
async def main():
    """Add three Docling-converted documents to cognee and check search results.

    A PDF, a PNG and a PPTX from the ``test_data`` directory next to this file
    are each converted with Docling's ``DocumentConverter``; the resulting
    document objects are passed to ``cognee.add``. After ``cognee.cognify``,
    three searches are run and their first results are checked for expected
    keywords.
    """
    from pathlib import Path

    # Resolve the input files relative to this test module.
    here = Path(__file__).resolve().parent
    test_data_dir = os.path.join(here, "test_data")
    source_paths = [
        os.path.join(test_data_dir, file_name)
        for file_name in (
            "artificial-intelligence.pdf",
            "example_copy.png",
            "example.pptx",
        )
    ]

    # Prune stored data and system metadata before ingesting anything.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # Imported here rather than at module level, as in the original script.
    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    for source_path in source_paths:
        conversion = converter.convert(source_path)
        await cognee.add(conversion.document)

    await cognee.cognify()

    ai_answer = await cognee.search("Tell me about Artificial Intelligence.")
    assert len(ai_answer) > 0

    bulb_answer = await cognee.search("Do programmers change light bulbs?")
    assert len(bulb_answer) > 0
    bulb_text = bulb_answer[0].lower()
    assert any(word in bulb_text for word in ("no", "none"))

    colour_answer = await cognee.search("What colours are there in the presentation table?")
    assert len(colour_answer) > 0
    colour_text = colour_answer[0].lower()
    assert all(colour in colour_text for colour in ("red", "blue", "green"))


if __name__ == "__main__":
    asyncio.run(main())

View file

@ -143,6 +143,8 @@ debug = ["debugpy>=1.8.9,<2.0.0"]
monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
docling = ["docling>=2.54", "transformers>=4.55"]
[project.urls]
Homepage = "https://www.cognee.ai"
Repository = "https://github.com/topoteretes/cognee"