From f6bb375860fc821af56ac278939e5d34e5c89300 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Tue, 30 Sep 2025 09:51:42 -0400 Subject: [PATCH 1/8] init --- docs/docs/core-components/ingestion.mdx | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 docs/docs/core-components/ingestion.mdx diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx new file mode 100644 index 00000000..d240d53e --- /dev/null +++ b/docs/docs/core-components/ingestion.mdx @@ -0,0 +1,23 @@ +--- +title: Docling Ingestion +slug: /ingestion +--- + +import Icon from "@site/src/components/icon/icon"; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import PartialModifyFlows from '@site/docs/_partial-modify-flows.mdx'; + +OpenRAG uses [Docling](https://docling-project.github.io/docling/) for its document ingestion pipeline. +More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling-serve` process on your local machine and runs Docling ingestion through an API service. + +OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images. + +## Docling ingestion settings + +These settings control the Docling ingestion parameters. + +OpenRAG will warn you if `docling-serve` is not running. +To start or stop `docling-serve` or any other native services, in the TUI main menu, click **Start Native Services** or **Stop Native Services**. 
+ +## Use OpenRAG default ingestion instead of Docling \ No newline at end of file From 13e30c1b7408102cc28b6b43a6eb776e86d30b87 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:54:22 -0400 Subject: [PATCH 2/8] ingestion-settings --- docs/docs/core-components/ingestion.mdx | 37 +++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx index d240d53e..f4820bf7 100644 --- a/docs/docs/core-components/ingestion.mdx +++ b/docs/docs/core-components/ingestion.mdx @@ -11,13 +11,46 @@ import PartialModifyFlows from '@site/docs/_partial-modify-flows.mdx'; OpenRAG uses [Docling](https://docling-project.github.io/docling/) for its document ingestion pipeline. More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling-serve` process on your local machine and runs Docling ingestion through an API service. +Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch `documents` index. + OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images. ## Docling ingestion settings -These settings control the Docling ingestion parameters. +These settings configure the Docling ingestion parameters, from using no OCR to using advanced vision language models. OpenRAG will warn you if `docling-serve` is not running. To start or stop `docling-serve` or any other native services, in the TUI main menu, click **Start Native Services** or **Stop Native Services**. -## Use OpenRAG default ingestion instead of Docling \ No newline at end of file +**Embedding model** determines which AI model is used to create vector embeddings. 
The default is + +**Chunk size** determines how large each text chunk is in number of characters. +Larger chunks yield more context per chunk, but may include irrelevant information. Smaller chunks yield more precise semantic search, but may lack context. +The default value of `1000` characters provides a good starting point that balances these considerations. + +**Chunk overlap** controls the number of characters that overlap across chunk boundaries. +Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important. +The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents. + +**OCR** enables or disables OCR processing when extracting text from images and scanned documents. +OCR is disabled by default. This setting is best suited for processing text-based documents as quickly as possible with Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/). Images are ignored and not processed. + +Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. + +If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the [ocrmac](https://www.piwheels.org/project/ocrmac/) OCR engine. Other platforms use [easyocr](https://www.jaided.ai/easyocr/). + +**Picture descriptions** adds image descriptions generated by the [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model to OCR processing. + +**VLM (Vision Language Model)** enables or disables VLM processing. +VLM processing is used _instead of_ OCR processing. +It uses an LLM to understand a document's structure and return text in a structured `doctags` format. 
+For more information, see [Vision models](https://docling-project.github.io/docling/usage/vision_models/). + +Enable a VLM when you are processing complex documents containing a mixture of text, images, tables, and charts. + +If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the [SmolDocling-256M-preview-mlx-bf16](https://huggingface.co/ds4sd/SmolDocling-256M-preview-mlx-bf16) VLM, which includes the [MLX framework](https://ml-explore.github.io/mlx/build/html/index.html) for Apple silicon. +Other platforms use [SmolDocling-256M-preview](https://huggingface.co/ds4sd/SmolDocling-256M-preview). + +## Use OpenRAG default ingestion instead of Docling + +If you want to use OpenRAG's built-in pipeline instead of Docling, set `DISABLE_INGEST_WITH_LANGFLOW=true`. \ No newline at end of file From 3fdbac561a30bf002444130f2b159444d7bc6f34 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:59:26 -0400 Subject: [PATCH 3/8] sidebars --- docs/sidebars.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/sidebars.js b/docs/sidebars.js index 3048cb70..affab754 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -60,6 +60,11 @@ const sidebars = { type: "doc", id: "core-components/knowledge", label: "OpenSearch Knowledge" + }, + { + type: "doc", + id: "core-components/ingestion", + label: "Docling Ingestion" } ], }, From e902ad59019410bc9bec160d1345dd4812ac3799 Mon Sep 17 00:00:00 2001 From: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Date: Tue, 30 Sep 2025 15:03:43 -0400 Subject: [PATCH 4/8] docs-link-to-page --- docs/docs/core-components/knowledge.mdx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx index 2ea5ef9f..852991a7 100644 --- a/docs/docs/core-components/knowledge.mdx +++ b/docs/docs/core-components/knowledge.mdx @@ -97,6 +97,10 @@ You can monitor the sync progress in 
the