39 lines
No EOL
18 KiB
HTML
39 lines
No EOL
18 KiB
HTML
<!doctype html>
|
||
<html lang="en" dir="ltr" class="docs-wrapper plugin-docs plugin-id-default docs-version-current docs-doc-page docs-doc-id-core-components/ingestion" data-has-hydrated="false">
|
||
<head>
|
||
<meta charset="UTF-8">
|
||
<meta name="generator" content="Docusaurus v3.8.1">
|
||
<title data-rh="true">Docling Ingestion | OpenRAG</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://langflow-ai.github.io/openrag/ingestion"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Docling Ingestion | OpenRAG"><meta data-rh="true" name="description" content="OpenRAG uses Docling for its document ingestion pipeline."><meta data-rh="true" property="og:description" content="OpenRAG uses Docling for its document ingestion pipeline."><link data-rh="true" rel="icon" href="/openrag/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://langflow-ai.github.io/openrag/ingestion"><link data-rh="true" rel="alternate" href="https://langflow-ai.github.io/openrag/ingestion" hreflang="en"><link data-rh="true" rel="alternate" href="https://langflow-ai.github.io/openrag/ingestion" hreflang="x-default"><script data-rh="true" type="application/ld+json">{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Docling Ingestion","item":"https://langflow-ai.github.io/openrag/ingestion"}]}</script><link rel="stylesheet" href="/openrag/assets/css/styles.4376ff55.css">
|
||
<script src="/openrag/assets/js/runtime~main.5bd84645.js" defer="defer"></script>
|
||
<script src="/openrag/assets/js/main.7dea55cc.js" defer="defer"></script>
|
||
</head>
|
||
<body class="navigation-with-keyboard">
|
||
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;"><defs>
|
||
<symbol id="theme-svg-external-link" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"/></symbol>
|
||
</defs></svg>
|
||
<script>!function(){var t="light";var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return window.localStorage.getItem("theme")}catch(t){}}();document.documentElement.setAttribute("data-theme",e||t),document.documentElement.setAttribute("data-theme-choice",e||t)}(),function(){try{const c=new URLSearchParams(window.location.search).entries();for(var[t,e]of c)if(t.startsWith("docusaurus-data-")){var a=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(a,e)}}catch(t){}}()</script><div id="__docusaurus"><link rel="preload" as="image" href="/openrag/img/logo-openrag-light.svg"><link rel="preload" as="image" href="/openrag/img/logo-openrag-dark.svg"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="theme-layout-navbar navbar navbar--fixed-top"><div class="navbar__inner"><div class="theme-layout-navbar-left navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/openrag/"><div class="navbar__logo"><img src="/openrag/img/logo-openrag-light.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/openrag/img/logo-openrag-dark.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div></a></div><div class="theme-layout-navbar-right navbar__items navbar__items--right"><a href="https://github.com/openrag/openrag" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub<svg width="13.5" height="13.5" aria-hidden="true" class="iconExternalLink_nPIU"><use href="#theme-svg-external-link"></use></svg></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="system mode" aria-label="Switch between dark and light mode (currently system mode)"><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP systemToggleIcon_QzmC"><path fill="currentColor" d="m12 21c4.971 0 9-4.029 9-9s-4.029-9-9-9-9 4.029-9 9 4.029 9 9 9zm4.95-13.95c1.313 1.313 2.05 3.093 2.05 4.95s-0.738 3.637-2.05 4.95c-1.313 1.313-3.093 2.05-4.95 2.05v-14c1.857 0 3.637 0.737 4.95 2.05z"></path></svg></button></div><div class="navbarSearchContainer_Bca1"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="theme-layout-main main-wrapper mainWrapper_z2l0"><div class="docsWrapper_hBAB"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docRoot_UBD9"><aside class="theme-doc-sidebar-container docSidebarContainer_YfHR"><div class="sidebarViewport_aRkj"><div class="sidebar_njMd"><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/openrag/">Get Started</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret menu__link--active" role="button" aria-expanded="true" href="/openrag/agents">Core components</a></div><ul class="menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/openrag/agents">Langflow Agents</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/openrag/knowledge">OpenSearch Knowledge</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" tabindex="0" href="/openrag/ingestion">Docling Ingestion</a></li></ul></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/openrag/configure/configuration">Configuration</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/openrag/support/troubleshoot">Support</a></div></li></ul></nav></div></div></aside><main class="docMainContainer_TBSr"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_VOVn"><div class="docItemContainer_Djhp"><article><nav class="theme-doc-breadcrumbs breadcrumbsContainer_Z_bl" aria-label="Breadcrumbs"><ul class="breadcrumbs"><li class="breadcrumbs__item"><a aria-label="Home page" class="breadcrumbs__link" href="/openrag/"><svg viewBox="0 0 24 24" class="breadcrumbHomeIcon_YNFT"><path d="M10 19v-5h4v5c0 .55.45 1 1 1h3c.55 0 1-.45 1-1v-7h1.7c.46 0 .68-.57.33-.87L12.67 3.6c-.38-.34-.96-.34-1.34 0l-8.36 7.53c-.34.3-.13.87.33.87H5v7c0 .55.45 1 1 1h3c.55 0 1-.45 1-1z" fill="currentColor"></path></svg></a></li><li class="breadcrumbs__item"><span class="breadcrumbs__link">Core components</span></li><li class="breadcrumbs__item breadcrumbs__item--active"><span class="breadcrumbs__link">Docling Ingestion</span></li></ul></nav><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>Docling Ingestion</h1></header><p>OpenRAG uses <a href="https://docling-project.github.io/docling/" target="_blank" rel="noopener noreferrer">Docling</a> for its document ingestion pipeline.
|
||
More specifically, OpenRAG uses <a href="https://github.com/docling-project/docling-serve" target="_blank" rel="noopener noreferrer">Docling Serve</a>, which starts a <code>docling-serve</code> process on your local machine and runs Docling ingestion through an API service.</p>
|
||
<p>Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch <code>documents</code> index.</p>
|
||
<p>OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.</p>
|
||
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="docling-ingestion-settings">Docling ingestion settings<a href="#docling-ingestion-settings" class="hash-link" aria-label="Direct link to Docling ingestion settings" title="Direct link to Docling ingestion settings"></a></h2>
|
||
<p>These settings configure the Docling ingestion parameters.</p>
|
||
<p>OpenRAG will warn you if <code>docling-serve</code> is not running.
|
||
To start or stop <code>docling-serve</code> or any other native services, in the TUI main menu, click <strong>Start Native Services</strong> or <strong>Stop Native Services</strong>.</p>
|
||
<p><strong>Embedding model</strong> determines which AI model is used to create vector embeddings. The default is <code>text-embedding-3-small</code>.</p>
|
||
<p><strong>Chunk size</strong> determines how large each text chunk is in number of characters.
|
||
Larger chunks yield more context per chunk, but may include irrelevant information. Smaller chunks yield more precise semantic search, but may lack context.
|
||
The default value of <code>1000</code> characters provides a good starting point that balances these considerations.</p>
|
||
<p><strong>Chunk overlap</strong> controls the number of characters that overlap over chunk boundaries.
|
||
Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.
|
||
The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents.</p>
|
||
<p><strong>OCR</strong> enables or disabled OCR processing when extracting text from images and scanned documents.
|
||
OCR is disabled by default. This setting is best suited for processing text-based documents as quickly as possible with Docling's <a href="https://docling-project.github.io/docling/reference/document_converter/" target="_blank" rel="noopener noreferrer"><code>DocumentConverter</code></a>. Images are ignored and not processed.</p>
|
||
<p>Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance.</p>
|
||
<p>If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the <a href="https://www.piwheels.org/project/ocrmac/" target="_blank" rel="noopener noreferrer">ocrmac</a> OCR engine. Other platforms use <a href="https://www.jaided.ai/easyocr/" target="_blank" rel="noopener noreferrer">easyocr</a>.</p>
|
||
<p><strong>Picture descriptions</strong> adds image descriptions generated by the <a href="https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct" target="_blank" rel="noopener noreferrer">SmolVLM-256M-Instruct</a> model to OCR processing. Enabling picture descriptions can slow ingestion performance.</p>
|
||
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="use-openrag-default-ingestion-instead-of-docling-serve">Use OpenRAG default ingestion instead of Docling serve<a href="#use-openrag-default-ingestion-instead-of-docling-serve" class="hash-link" aria-label="Direct link to Use OpenRAG default ingestion instead of Docling serve" title="Direct link to Use OpenRAG default ingestion instead of Docling serve"></a></h2>
|
||
<p>If you want to use OpenRAG's built-in pipeline instead of Docling serve, set <code>DISABLE_INGEST_WITH_LANGFLOW=true</code> in <a href="/openrag/configure/configuration#ingestion-configuration">Environment variables</a>.</p>
|
||
<p>The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API.</p>
|
||
<p>For more information, see <a href="https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58" target="_blank" rel="noopener noreferrer"><code>processors.py</code> in the OpenRAG repository</a>.</p></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="row margin-top--sm theme-doc-footer-edit-meta-row"><div class="col"><a href="https://github.com/openrag/openrag/tree/main/docs/docs/core-components/ingestion.mdx" target="_blank" rel="noopener noreferrer" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_JAkA"></div></div></footer></article><nav class="docusaurus-mt-lg pagination-nav" aria-label="Docs pages"><a class="pagination-nav__link pagination-nav__link--prev" href="/openrag/knowledge"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">OpenSearch Knowledge</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/openrag/configure/configuration"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Environment Variables</div></a></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#docling-ingestion-settings" class="table-of-contents__link toc-highlight">Docling ingestion settings</a></li><li><a href="#use-openrag-default-ingestion-instead-of-docling-serve" class="table-of-contents__link toc-highlight">Use OpenRAG default ingestion instead of Docling serve</a></li></ul></div></div></div></div></main></div></div></div><footer class="theme-layout-footer footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="theme-layout-footer-column col footer__col"><div class="footer__title">Documentation</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/openrag/">Getting Started</a></li></ul></div><div class="theme-layout-footer-column col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://github.com/openrag/openrag" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub<svg width="13.5" height="13.5" aria-hidden="true" class="iconExternalLink_nPIU"><use href="#theme-svg-external-link"></use></svg></a></li><li class="footer__item"><a href="https://discord.gg/openrag" target="_blank" rel="noopener noreferrer" class="footer__link-item">Discord<svg width="13.5" height="13.5" aria-hidden="true" class="iconExternalLink_nPIU"><use href="#theme-svg-external-link"></use></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2025 OpenRAG. Built with Docusaurus.</div></div></div></footer></div>
|
||
</body>
|
||
</html> |