openrag/ingestion/index.html
2025-10-01 22:22:33 +00:00

39 lines
No EOL
18 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" dir="ltr" class="docs-wrapper plugin-docs plugin-id-default docs-version-current docs-doc-page docs-doc-id-core-components/ingestion" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v3.8.1">
<title data-rh="true">Docling Ingestion | OpenRAG</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://langflow-ai.github.io/openrag/ingestion"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Docling Ingestion | OpenRAG"><meta data-rh="true" name="description" content="OpenRAG uses Docling for its document ingestion pipeline."><meta data-rh="true" property="og:description" content="OpenRAG uses Docling for its document ingestion pipeline."><link data-rh="true" rel="icon" href="/openrag/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://langflow-ai.github.io/openrag/ingestion"><link data-rh="true" rel="alternate" href="https://langflow-ai.github.io/openrag/ingestion" hreflang="en"><link data-rh="true" rel="alternate" href="https://langflow-ai.github.io/openrag/ingestion" hreflang="x-default"><script data-rh="true" type="application/ld+json">{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Docling Ingestion","item":"https://langflow-ai.github.io/openrag/ingestion"}]}</script><link rel="stylesheet" href="/openrag/assets/css/styles.4376ff55.css">
<script src="/openrag/assets/js/runtime~main.5bd84645.js" defer="defer"></script>
<script src="/openrag/assets/js/main.7dea55cc.js" defer="defer"></script>
</head>
<body class="navigation-with-keyboard">
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;"><defs>
<symbol id="theme-svg-external-link" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"/></symbol>
</defs></svg>
<script>!function(){var t="light";var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return window.localStorage.getItem("theme")}catch(t){}}();document.documentElement.setAttribute("data-theme",e||t),document.documentElement.setAttribute("data-theme-choice",e||t)}(),function(){try{const c=new URLSearchParams(window.location.search).entries();for(var[t,e]of c)if(t.startsWith("docusaurus-data-")){var a=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(a,e)}}catch(t){}}()</script><div id="__docusaurus"><link rel="preload" as="image" href="/openrag/img/logo-openrag-light.svg"><link rel="preload" as="image" href="/openrag/img/logo-openrag-dark.svg"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="theme-layout-navbar navbar navbar--fixed-top"><div class="navbar__inner"><div class="theme-layout-navbar-left navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/openrag/"><div class="navbar__logo"><img src="/openrag/img/logo-openrag-light.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/openrag/img/logo-openrag-dark.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div></a></div><div class="theme-layout-navbar-right navbar__items navbar__items--right"><a href="https://github.com/openrag/openrag" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub<svg width="13.5" height="13.5" aria-hidden="true" class="iconExternalLink_nPIU"><use href="#theme-svg-external-link"></use></svg></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="system mode" aria-label="Switch between dark and light mode (currently system mode)"><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP systemToggleIcon_QzmC"><path fill="currentColor" d="m12 21c4.971 0 9-4.029 9-9s-4.029-9-9-9-9 4.029-9 9 4.029 9 9 9zm4.95-13.95c1.313 1.313 2.05 3.093 2.05 4.95s-0.738 3.637-2.05 4.95c-1.313 1.313-3.093 2.05-4.95 2.05v-14c1.857 0 3.637 0.737 4.95 2.05z"></path></svg></button></div><div class="navbarSearchContainer_Bca1"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="theme-layout-main main-wrapper mainWrapper_z2l0"><div class="docsWrapper_hBAB"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docRoot_UBD9"><aside class="theme-doc-sidebar-container docSidebarContainer_YfHR"><div class="sidebarViewport_aRkj"><div class="sidebar_njMd"><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/openrag/">Get Started</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret menu__link--active" role="button" aria-expanded="true" href="/openrag/agents">Core components</a></div><ul class="menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/openrag/agents">Langflow Agents</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/openrag/knowledge">OpenSearch Knowledge</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" tabindex="0" href="/openrag/ingestion">Docling Ingestion</a></li></ul></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/openrag/configure/configuration">Configuration</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/openrag/support/troubleshoot">Support</a></div></li></ul></nav></div></div></aside><main class="docMainContainer_TBSr"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_VOVn"><div class="docItemContainer_Djhp"><article><nav class="theme-doc-breadcrumbs breadcrumbsContainer_Z_bl" aria-label="Breadcrumbs"><ul class="breadcrumbs"><li class="breadcrumbs__item"><a aria-label="Home page" class="breadcrumbs__link" href="/openrag/"><svg viewBox="0 0 24 24" class="breadcrumbHomeIcon_YNFT"><path d="M10 19v-5h4v5c0 .55.45 1 1 1h3c.55 0 1-.45 1-1v-7h1.7c.46 0 .68-.57.33-.87L12.67 3.6c-.38-.34-.96-.34-1.34 0l-8.36 7.53c-.34.3-.13.87.33.87H5v7c0 .55.45 1 1 1h3c.55 0 1-.45 1-1z" fill="currentColor"></path></svg></a></li><li class="breadcrumbs__item"><span class="breadcrumbs__link">Core components</span></li><li class="breadcrumbs__item breadcrumbs__item--active"><span class="breadcrumbs__link">Docling Ingestion</span></li></ul></nav><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>Docling Ingestion</h1></header><p>OpenRAG uses <a href="https://docling-project.github.io/docling/" target="_blank" rel="noopener noreferrer">Docling</a> for its document ingestion pipeline.
More specifically, OpenRAG uses <a href="https://github.com/docling-project/docling-serve" target="_blank" rel="noopener noreferrer">Docling Serve</a>, which starts a <code>docling-serve</code> process on your local machine and runs Docling ingestion through an API service.</p>
<p>Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch <code>documents</code> index.</p>
<p>OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="docling-ingestion-settings">Docling ingestion settings<a href="#docling-ingestion-settings" class="hash-link" aria-label="Direct link to Docling ingestion settings" title="Direct link to Docling ingestion settings"></a></h2>
<p>These settings configure the Docling ingestion parameters.</p>
<p>OpenRAG will warn you if <code>docling-serve</code> is not running.
To start or stop <code>docling-serve</code> or any other native services, in the TUI main menu, click <strong>Start Native Services</strong> or <strong>Stop Native Services</strong>.</p>
<p><strong>Embedding model</strong> determines which AI model is used to create vector embeddings. The default is <code>text-embedding-3-small</code>.</p>
<p><strong>Chunk size</strong> determines how large each text chunk is in number of characters.
Larger chunks yield more context per chunk, but may include irrelevant information. Smaller chunks yield more precise semantic search, but may lack context.
The default value of <code>1000</code> characters provides a good starting point that balances these considerations.</p>
<p><strong>Chunk overlap</strong> controls the number of characters that overlap over chunk boundaries.
Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.
The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents.</p>
<p><strong>OCR</strong> enables or disabled OCR processing when extracting text from images and scanned documents.
OCR is disabled by default. This setting is best suited for processing text-based documents as quickly as possible with Docling&#x27;s <a href="https://docling-project.github.io/docling/reference/document_converter/" target="_blank" rel="noopener noreferrer"><code>DocumentConverter</code></a>. Images are ignored and not processed.</p>
<p>Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance.</p>
<p>If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the <a href="https://www.piwheels.org/project/ocrmac/" target="_blank" rel="noopener noreferrer">ocrmac</a> OCR engine. Other platforms use <a href="https://www.jaided.ai/easyocr/" target="_blank" rel="noopener noreferrer">easyocr</a>.</p>
<p><strong>Picture descriptions</strong> adds image descriptions generated by the <a href="https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct" target="_blank" rel="noopener noreferrer">SmolVLM-256M-Instruct</a> model to OCR processing. Enabling picture descriptions can slow ingestion performance.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="use-openrag-default-ingestion-instead-of-docling-serve">Use OpenRAG default ingestion instead of Docling serve<a href="#use-openrag-default-ingestion-instead-of-docling-serve" class="hash-link" aria-label="Direct link to Use OpenRAG default ingestion instead of Docling serve" title="Direct link to Use OpenRAG default ingestion instead of Docling serve"></a></h2>
<p>If you want to use OpenRAG&#x27;s built-in pipeline instead of Docling serve, set <code>DISABLE_INGEST_WITH_LANGFLOW=true</code> in <a href="/openrag/configure/configuration#ingestion-configuration">Environment variables</a>.</p>
<p>The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API.</p>
<p>For more information, see <a href="https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58" target="_blank" rel="noopener noreferrer"><code>processors.py</code> in the OpenRAG repository</a>.</p></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="row margin-top--sm theme-doc-footer-edit-meta-row"><div class="col"><a href="https://github.com/openrag/openrag/tree/main/docs/docs/core-components/ingestion.mdx" target="_blank" rel="noopener noreferrer" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_JAkA"></div></div></footer></article><nav class="docusaurus-mt-lg pagination-nav" aria-label="Docs pages"><a class="pagination-nav__link pagination-nav__link--prev" href="/openrag/knowledge"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">OpenSearch Knowledge</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/openrag/configure/configuration"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Environment Variables</div></a></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#docling-ingestion-settings" class="table-of-contents__link toc-highlight">Docling ingestion settings</a></li><li><a href="#use-openrag-default-ingestion-instead-of-docling-serve" class="table-of-contents__link toc-highlight">Use OpenRAG default ingestion instead of Docling serve</a></li></ul></div></div></div></div></main></div></div></div><footer class="theme-layout-footer footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="theme-layout-footer-column col footer__col"><div class="footer__title">Documentation</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/openrag/">Getting Started</a></li></ul></div><div class="theme-layout-footer-column col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://github.com/openrag/openrag" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub<svg width="13.5" height="13.5" aria-hidden="true" class="iconExternalLink_nPIU"><use href="#theme-svg-external-link"></use></svg></a></li><li class="footer__item"><a href="https://discord.gg/openrag" target="_blank" rel="noopener noreferrer" class="footer__link-item">Discord<svg width="13.5" height="13.5" aria-hidden="true" class="iconExternalLink_nPIU"><use href="#theme-svg-external-link"></use></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2025 OpenRAG. Built with Docusaurus.</div></div></div></footer></div>
</body>
</html>