feat: Add Docling as an ingestion option to cognee add. (#1484)
<!-- .github/pull_request_template.md -->

## Description
<!-- Please provide a clear, human-generated description of the changes in this PR. DO NOT use AI-generated descriptions. We want to understand your thought process and reasoning. -->

We are adding support for Docling documents to cognee add. I did this the simplest way possible: converting the `DoclingDocument` early into text, and then sending it for further processing in cognee. This avoids adding a lot of docling-specific stuff like loaders, file types, classes, etc. while giving us (and the users) the opportunity to handle a wider variety of file types in our pipeline.

## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [x] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):

## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->

## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [x] **I have tested my changes thoroughly before submitting this PR**
- [x] **This PR contains minimal changes necessary to address the issue/feature**
- [x] My code follows the project's coding standards and style guidelines
- [x] I have added tests that prove my fix is effective or that my feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
commit
1512375e52
5 changed files with 90 additions and 1 deletions
25
.github/workflows/examples_tests.yml
vendored
25
.github/workflows/examples_tests.yml
vendored
|
|
@ -159,3 +159,28 @@ jobs:
|
||||||
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
|
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
|
||||||
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
|
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
|
||||||
run: uv run python ./examples/python/permissions_example.py
|
run: uv run python ./examples/python/permissions_example.py
|
||||||
|
# CI job that runs the Docling ingestion test script.
test_docling_add:
  name: Run Add with Docling Test
  runs-on: macos-15
  steps:
    - name: Check out repository
      uses: actions/checkout@v4

    # Local composite action; `docling` is passed as an extra dependency
    # (presumably the optional `docling` extra of the package -- confirm
    # against the action's definition).
    - name: Cognee Setup
      uses: ./.github/actions/cognee_setup
      with:
        python-version: '3.11.x'
        extra-dependencies: 'docling'

    # LLM and embedding settings are read from repository secrets.
    - name: Run Docling Test
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        LLM_MODEL: ${{ secrets.LLM_MODEL }}
        LLM_ENDPOINT: ${{ secrets.LLM_ENDPOINT }}
        LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
        LLM_API_VERSION: ${{ secrets.LLM_API_VERSION }}
        EMBEDDING_MODEL: ${{ secrets.EMBEDDING_MODEL }}
        EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
        EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
        EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
      run: uv run python ./cognee/tests/test_add_docling_document.py
|
||||||
|
|
|
||||||
2
.github/workflows/test_s3_file_storage.yml
vendored
2
.github/workflows/test_s3_file_storage.yml
vendored
|
|
@ -11,7 +11,7 @@ env:
|
||||||
ENV: 'dev'
|
ENV: 'dev'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
test-gemini:
|
test-s3-storage:
|
||||||
name: Run S3 File Storage Test
|
name: Run S3 File Storage Test
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
steps:
|
steps:
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,12 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
|
||||||
|
|
||||||
return await get_data_from_llama_index(data_item)
|
return await get_data_from_llama_index(data_item)
|
||||||
|
|
||||||
|
if "docling" in str(type(data_item)):
|
||||||
|
from docling_core.types import DoclingDocument
|
||||||
|
|
||||||
|
if isinstance(data_item, DoclingDocument):
|
||||||
|
data_item = data_item.export_to_text()
|
||||||
|
|
||||||
# data is a file object coming from upload.
|
# data is a file object coming from upload.
|
||||||
if hasattr(data_item, "file"):
|
if hasattr(data_item, "file"):
|
||||||
return await save_data_to_file(data_item.file, filename=data_item.filename)
|
return await save_data_to_file(data_item.file, filename=data_item.filename)
|
||||||
|
|
|
||||||
56
cognee/tests/test_add_docling_document.py
Normal file
56
cognee/tests/test_add_docling_document.py
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
import asyncio
|
||||||
|
import cognee
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Add Docling-converted fixtures to cognee and check search results.

    Three files from the ``test_data`` directory next to this script are
    converted with Docling's ``DocumentConverter``; each resulting
    ``document`` is passed to ``cognee.add``. After ``cognee.cognify()``,
    three searches are issued and their first results are checked for
    expected keywords (the fixture contents themselves are not visible
    here).
    """
    from pathlib import Path

    # Resolve fixture locations relative to this test file.
    test_data_root = Path(__file__).resolve().parent
    fixture_names = (
        "artificial-intelligence.pdf",
        "example_copy.png",
        "example.pptx",
    )
    fixture_paths = [
        os.path.join(test_data_root, "test_data", fixture_name)
        for fixture_name in fixture_names
    ]

    # Start from a clean slate: drop stored data and system metadata.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # Imported lazily so docling is only required when the test body runs.
    from docling.document_converter import DocumentConverter

    docling_converter = DocumentConverter()

    # Convert each file to a DoclingDocument and add it to cognee, in order.
    for fixture_path in fixture_paths:
        conversion = docling_converter.convert(fixture_path)
        await cognee.add(conversion.document)

    await cognee.cognify()

    ai_results = await cognee.search("Tell me about Artificial Intelligence.")
    assert len(ai_results) > 0

    bulb_results = await cognee.search("Do programmers change light bulbs?")
    assert len(bulb_results) > 0
    bulb_text = bulb_results[0].lower()
    assert any(keyword in bulb_text for keyword in ("no", "none"))

    colour_results = await cognee.search(
        "What colours are there in the presentation table?"
    )
    assert len(colour_results) > 0
    colour_text = colour_results[0].lower()
    assert all(colour in colour_text for colour in ("red", "blue", "green"))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the async test to completion when executed as a script.
    test_coroutine = main()
    asyncio.run(test_coroutine)
|
||||||
|
|
@ -143,6 +143,8 @@ debug = ["debugpy>=1.8.9,<2.0.0"]
|
||||||
|
|
||||||
monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
|
monitoring = ["sentry-sdk[fastapi]>=2.9.0,<3", "langfuse>=2.32.0,<3"]
|
||||||
|
|
||||||
|
docling = ["docling>=2.54", "transformers>=4.55"]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
Homepage = "https://www.cognee.ai"
|
Homepage = "https://www.cognee.ai"
|
||||||
Repository = "https://github.com/topoteretes/cognee"
|
Repository = "https://github.com/topoteretes/cognee"
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue