openrag/ingestion/index.html
2025-10-22 18:19:53 +00:00

63 lines
No EOL
22 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" dir="ltr" class="docs-wrapper plugin-docs plugin-id-default docs-version-current docs-doc-page docs-doc-id-core-components/ingestion" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v3.8.1">
<title data-rh="true">Docling Ingestion | OpenRAG</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="robots" content="noindex, nofollow"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://docs.openr.ag/ingestion"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Docling Ingestion | OpenRAG"><meta data-rh="true" name="description" content="OpenRAG uses Docling for its document ingestion pipeline."><meta data-rh="true" property="og:description" content="OpenRAG uses Docling for its document ingestion pipeline."><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://docs.openr.ag/ingestion"><link data-rh="true" rel="alternate" href="https://docs.openr.ag/ingestion" hreflang="en"><link data-rh="true" rel="alternate" href="https://docs.openr.ag/ingestion" hreflang="x-default"><script data-rh="true" type="application/ld+json">{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Docling Ingestion","item":"https://docs.openr.ag/ingestion"}]}</script><link rel="stylesheet" href="/assets/css/styles.1847d619.css">
<script src="/assets/js/runtime~main.c8893f6b.js" defer="defer"></script>
<script src="/assets/js/main.62f384a6.js" defer="defer"></script>
</head>
<body class="navigation-with-keyboard">
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;"><defs>
<symbol id="theme-svg-external-link" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"/></symbol>
</defs></svg>
<script>!function(){var t="light";var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return window.localStorage.getItem("theme")}catch(t){}}();document.documentElement.setAttribute("data-theme",e||t),document.documentElement.setAttribute("data-theme-choice",e||t)}(),function(){try{const c=new URLSearchParams(window.location.search).entries();for(var[t,e]of c)if(t.startsWith("docusaurus-data-")){var a=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(a,e)}}catch(t){}}()</script><div id="__docusaurus"><link rel="preload" as="image" href="/img/logo-openrag-light.svg"><link rel="preload" as="image" href="/img/logo-openrag-dark.svg"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="theme-layout-navbar navbar navbar--fixed-top"><div class="navbar__inner"><div class="theme-layout-navbar-left navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/img/logo-openrag-light.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/img/logo-openrag-dark.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div></a></div><div class="theme-layout-navbar-right navbar__items navbar__items--right"><a href="https://github.com/langflow-ai/openrag" target="_blank" class="navbar__item navbar__link header-github-link" aria-label="GitHub repository"></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP 
toggleButtonDisabled_aARS" type="button" disabled="" title="system mode" aria-label="Switch between dark and light mode (currently system mode)"><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP systemToggleIcon_QzmC"><path fill="currentColor" d="m12 21c4.971 0 9-4.029 9-9s-4.029-9-9-9-9 4.029-9 9 4.029 
9 9 9zm4.95-13.95c1.313 1.313 2.05 3.093 2.05 4.95s-0.738 3.637-2.05 4.95c-1.313 1.313-3.093 2.05-4.95 2.05v-14c1.857 0 3.637 0.737 4.95 2.05z"></path></svg></button></div><div class="navbarSearchContainer_Bca1"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="theme-layout-main main-wrapper mainWrapper_z2l0"><div class="docsWrapper_hBAB"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docRoot_UBD9"><aside class="theme-doc-sidebar-container docSidebarContainer_YfHR"><div class="sidebarViewport_aRkj"><div class="sidebar_njMd"><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/">Get Started</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret menu__link--active" role="button" aria-expanded="true" href="/agents">Core components</a></div><ul class="menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/agents">Langflow Agents</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/knowledge">OpenSearch Knowledge</a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" tabindex="0" href="/ingestion">Docling 
Ingestion</a></li></ul></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/reference/configuration">Reference</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/support/troubleshoot">Support</a></div></li></ul></nav></div></div></aside><main class="docMainContainer_TBSr"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_VOVn"><div class="docItemContainer_Djhp"><article><nav class="theme-doc-breadcrumbs breadcrumbsContainer_Z_bl" aria-label="Breadcrumbs"><ul class="breadcrumbs"><li class="breadcrumbs__item"><a aria-label="Home page" class="breadcrumbs__link" href="/"><svg viewBox="0 0 24 24" class="breadcrumbHomeIcon_YNFT"><path d="M10 19v-5h4v5c0 .55.45 1 1 1h3c.55 0 1-.45 1-1v-7h1.7c.46 0 .68-.57.33-.87L12.67 3.6c-.38-.34-.96-.34-1.34 0l-8.36 7.53c-.34.3-.13.87.33.87H5v7c0 .55.45 1 1 1h3c.55 0 1-.45 1-1z" fill="currentColor"></path></svg></a></li><li class="breadcrumbs__item"><span class="breadcrumbs__link">Core components</span></li><li class="breadcrumbs__item breadcrumbs__item--active"><span class="breadcrumbs__link">Docling Ingestion</span></li></ul></nav><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>Docling Ingestion</h1></header><p>OpenRAG uses <a href="https://docling-project.github.io/docling/" target="_blank" rel="noopener noreferrer">Docling</a> for its document ingestion 
pipeline.
More specifically, OpenRAG uses <a href="https://github.com/docling-project/docling-serve" target="_blank" rel="noopener noreferrer">Docling Serve</a>, which starts a <code>docling serve</code> process on your local machine and runs Docling ingestion through an API service.</p>
<p>Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch <code>documents</code> index.</p>
<p>OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="docling-ingestion-settings">Docling ingestion settings<a href="#docling-ingestion-settings" class="hash-link" aria-label="Direct link to Docling ingestion settings" title="Direct link to Docling ingestion settings"></a></h2>
<p>These settings configure the Docling ingestion parameters.</p>
<p>OpenRAG will warn you if <code>docling serve</code> is not running.
To start or stop <code>docling serve</code> or any other native services, in the TUI main menu, click <strong>Start Native Services</strong> or <strong>Stop Native Services</strong>.</p>
<p><strong>Embedding model</strong> determines which AI model is used to create vector embeddings. The default is <code>text-embedding-3-small</code>.</p>
<p><strong>Chunk size</strong> determines how large each text chunk is in number of characters.
Larger chunks yield more context per chunk, but may include irrelevant information. Smaller chunks yield more precise semantic search, but may lack context.
The default value of <code>1000</code> characters provides a good starting point that balances these considerations.</p>
<p><strong>Chunk overlap</strong> controls the number of characters that overlap across chunk boundaries.
Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.
The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents.</p>
<p><strong>OCR</strong> enables or disables OCR processing when extracting text from images and scanned documents.
OCR is disabled by default. This setting is best suited for processing text-based documents as quickly as possible with Docling&#x27;s <a href="https://docling-project.github.io/docling/reference/document_converter/" target="_blank" rel="noopener noreferrer"><code>DocumentConverter</code></a>. Images are ignored and not processed.</p>
<p>Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance.</p>
<p>If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the <a href="https://www.piwheels.org/project/ocrmac/" target="_blank" rel="noopener noreferrer">ocrmac</a> OCR engine. Other platforms use <a href="https://www.jaided.ai/easyocr/" target="_blank" rel="noopener noreferrer">easyocr</a>.</p>
<p><strong>Picture descriptions</strong> adds image descriptions generated by the <a href="https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct" target="_blank" rel="noopener noreferrer">SmolVLM-256M-Instruct</a> model to OCR processing. Enabling picture descriptions can slow ingestion performance.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="use-openrag-default-ingestion-instead-of-docling-serve">Use OpenRAG default ingestion instead of Docling serve<a href="#use-openrag-default-ingestion-instead-of-docling-serve" class="hash-link" aria-label="Direct link to Use OpenRAG default ingestion instead of Docling serve" title="Direct link to Use OpenRAG default ingestion instead of Docling serve"></a></h2>
<p>If you want to use OpenRAG&#x27;s built-in pipeline instead of Docling Serve, set <code>DISABLE_INGEST_WITH_LANGFLOW=true</code> in <a href="/reference/configuration#document-processing">Environment variables</a>.</p>
<p>The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API.</p>
<p>For more information, see <a href="https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58" target="_blank" rel="noopener noreferrer"><code>processors.py</code> in the OpenRAG repository</a>.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="knowledge-ingestion-flows">Knowledge ingestion flows<a href="#knowledge-ingestion-flows" class="hash-link" aria-label="Direct link to Knowledge ingestion flows" title="Direct link to Knowledge ingestion flows"></a></h2>
<p><a href="https://docs.langflow.org/concepts-overview" target="_blank" rel="noopener noreferrer">Flows</a> in Langflow are functional representations of application workflows, with multiple <a href="https://docs.langflow.org/concepts-components" target="_blank" rel="noopener noreferrer">component</a> nodes connected as single steps in a workflow.</p>
<p>The <strong>OpenSearch Ingestion</strong> flow is the default knowledge ingestion flow in OpenRAG: when you <strong>Add Knowledge</strong> in OpenRAG, you run the OpenSearch Ingestion flow in the background. The flow ingests documents using <strong>Docling Serve</strong> to import and process documents.</p>
<p>This flow contains the following components, connected together to process and store documents in your knowledge base:</p>
<ul>
<li>The <a href="https://docs.langflow.org/bundles-docling" target="_blank" rel="noopener noreferrer"><strong>Docling Serve</strong> component</a> processes input documents by connecting to your instance of Docling Serve.</li>
<li>The <a href="https://docs.langflow.org/components-docling" target="_blank" rel="noopener noreferrer"><strong>Export DoclingDocument</strong> component</a> exports the processed DoclingDocument to Markdown format with image export mode set to placeholder. This conversion converts the structured document data into a standardized format for further processing.</li>
<li>Three <a href="https://docs.langflow.org/components-processing#dataframe-operations" target="_blank" rel="noopener noreferrer"><strong>DataFrame Operations</strong> components</a> sequentially add metadata columns to the document data of <code>filename</code>, <code>file_size</code>, and <code>mimetype</code>.</li>
<li>The <a href="https://docs.langflow.org/components-processing#split-text" target="_blank" rel="noopener noreferrer"><strong>Split Text</strong> component</a> splits the processed text into chunks with a chunk size of 1000 characters and an overlap of 200 characters.</li>
<li>Four <strong>Secret Input</strong> components provide secure access to configuration variables: <code>CONNECTOR_TYPE</code>, <code>OWNER</code>, <code>OWNER_EMAIL</code>, and <code>OWNER_NAME</code>. These are runtime variables populated from OAuth login.</li>
<li>The <strong>Create Data</strong> component combines the secret inputs into a structured data object that will be associated with the document embeddings.</li>
<li>The <a href="https://docs.langflow.org/components-embedding-models" target="_blank" rel="noopener noreferrer"><strong>Embedding Model</strong> component</a> generates vector embeddings using OpenAI&#x27;s <code>text-embedding-3-small</code> model. The embedding model is selected during application onboarding and cannot be changed.</li>
<li>The <a href="https://docs.langflow.org/bundles-elastic#opensearch" target="_blank" rel="noopener noreferrer"><strong>OpenSearch</strong> component</a> stores the processed documents and their embeddings in the <code>documents</code> index at <code>https://opensearch:9200</code>. By default, the component is authenticated with a JWT token, but you can also select <code>basic</code> auth mode, and enter your OpenSearch admin username and password.</li>
</ul>
<p>All flows included with OpenRAG are designed to be modular, performant, and provider-agnostic.
To modify a flow, click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-settings2 lucide-settings-2" aria-hidden="true"><path d="M14 17H5"></path><path d="M19 7h-9"></path><circle cx="17" cy="17" r="3"></circle><circle cx="7" cy="7" r="3"></circle></svg> <strong>Settings</strong>, and click <strong>Edit in Langflow</strong>.
OpenRAG&#x27;s visual editor is based on the <a href="https://docs.langflow.org/concepts-overview" target="_blank" rel="noopener noreferrer">Langflow visual editor</a>, so you can edit your flows to match your specific use case.</p>
<h3 class="anchor anchorWithStickyNavbar_LWe7" id="url-flow">OpenSearch URL Ingestion flow<a href="#url-flow" class="hash-link" aria-label="Direct link to OpenSearch URL Ingestion flow" title="Direct link to OpenSearch URL Ingestion flow"></a></h3>
<p>An additional knowledge ingestion flow is included in OpenRAG, where it is used as an MCP tool by the <a href="/agents#flow"><strong>OpenSearch Agent flow</strong></a>.
The agent calls this component to fetch web content, and the results are ingested into OpenSearch.</p>
<p>For more on using MCP clients in Langflow, see <a href="https://docs.langflow.org/mcp-client" target="_blank" rel="noopener noreferrer">MCP clients</a>.<br>
<!-- -->To connect additional MCP servers to the MCP client, see <a href="https://docs.langflow.org/mcp-tutorial" target="_blank" rel="noopener noreferrer">Connect to MCP servers from your application</a>.</p></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="row margin-top--sm theme-doc-footer-edit-meta-row"><div class="col"><a href="https://github.com/openrag/openrag/tree/main/docs/docs/core-components/ingestion.mdx" target="_blank" rel="noopener noreferrer" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_JAkA"></div></div></footer></article><nav class="docusaurus-mt-lg pagination-nav" aria-label="Docs pages"><a class="pagination-nav__link pagination-nav__link--prev" href="/knowledge"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">OpenSearch Knowledge</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/reference/configuration"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Environment variables</div></a></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#docling-ingestion-settings" class="table-of-contents__link toc-highlight">Docling ingestion settings</a></li><li><a href="#use-openrag-default-ingestion-instead-of-docling-serve" class="table-of-contents__link toc-highlight">Use OpenRAG default ingestion instead of Docling serve</a></li><li><a href="#knowledge-ingestion-flows" class="table-of-contents__link toc-highlight">Knowledge ingestion flows</a><ul><li><a href="#url-flow" class="table-of-contents__link 
toc-highlight">OpenSearch URL Ingestion flow</a></li></ul></li></ul></div></div></div></div></main></div></div></div><footer class="theme-layout-footer footer"><div class="container container-fluid"><div class="row footer__links"><div class="theme-layout-footer-column col footer__col"><div class="footer__title"></div><ul class="footer__items clean-list"><li class="footer__item"><div class="footer-links">
<span>© 2025 OpenRAG</span>
</div></li></ul></div></div></div></footer></div>
</body>
</html>