openrag/ingestion/index.html
2025-11-25 16:43:20 +00:00

94 lines
No EOL
26 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" dir="ltr" class="docs-wrapper plugin-docs plugin-id-default docs-version-current docs-doc-page docs-doc-id-core-components/ingestion" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v3.9.2">
<title data-rh="true">Docling in OpenRAG | OpenRAG</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="robots" content="noindex, nofollow"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://docs.openr.ag/ingestion"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Docling in OpenRAG | OpenRAG"><meta data-rh="true" name="description" content="OpenRAG uses Docling for document ingestion."><meta data-rh="true" property="og:description" content="OpenRAG uses Docling for document ingestion."><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://docs.openr.ag/ingestion"><link data-rh="true" rel="alternate" href="https://docs.openr.ag/ingestion" hreflang="en"><link data-rh="true" rel="alternate" href="https://docs.openr.ag/ingestion" hreflang="x-default"><script data-rh="true" type="application/ld+json">{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Docling in OpenRAG","item":"https://docs.openr.ag/ingestion"}]}</script><link rel="stylesheet" href="/assets/css/styles.66669ecc.css">
<script src="/assets/js/runtime~main.503e7250.js" defer="defer"></script>
<script src="/assets/js/main.179a52a2.js" defer="defer"></script>
</head>
<body class="navigation-with-keyboard">
<svg style="display: none;"><defs>
<symbol id="theme-svg-external-link" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"/></symbol>
</defs></svg>
<script>!function(){var t=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return window.localStorage.getItem("theme")}catch(t){}}();document.documentElement.setAttribute("data-theme",t||"light"),document.documentElement.setAttribute("data-theme-choice",t||"light")}(),function(){try{const c=new URLSearchParams(window.location.search).entries();for(var[t,e]of c)if(t.startsWith("docusaurus-data-")){var a=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(a,e)}}catch(t){}}()</script><div id="__docusaurus"><link rel="preload" as="image" href="/img/logo-openrag-light.svg"><link rel="preload" as="image" href="/img/logo-openrag-dark.svg"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="theme-layout-navbar navbar navbar--fixed-top"><div class="navbar__inner"><div class="theme-layout-navbar-left navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/img/logo-openrag-light.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/img/logo-openrag-dark.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div></a></div><div class="theme-layout-navbar-right navbar__items navbar__items--right"><a href="https://github.com/langflow-ai/openrag" target="_blank" class="navbar__item navbar__link header-github-link" aria-label="GitHub repository"></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP 
toggleButtonDisabled_aARS" type="button" disabled="" title="system mode" aria-label="Switch between dark and light mode (currently system mode)"><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP systemToggleIcon_QzmC"><path fill="currentColor" d="m12 21c4.971 0 9-4.029 9-9s-4.029-9-9-9-9 4.029-9 9 4.029 
9 9 9zm4.95-13.95c1.313 1.313 2.05 3.093 2.05 4.95s-0.738 3.637-2.05 4.95c-1.313 1.313-3.093 2.05-4.95 2.05v-14c1.857 0 3.637 0.737 4.95 2.05z"></path></svg></button></div><div class="navbarSearchContainer_Bca1"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="theme-layout-main main-wrapper mainWrapper_z2l0"><div class="docsWrapper_hBAB"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docRoot_UBD9"><aside class="theme-doc-sidebar-container docSidebarContainer_YfHR"><div class="sidebarViewport_aRkj"><div class="sidebar_njMd"><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/"><span title="About OpenRAG" class="linkLabel_WmDU">About OpenRAG</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/quickstart"><span title="Quickstart" class="linkLabel_WmDU">Quickstart</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/install"><span title="Install OpenRAG with TUI" class="linkLabel_WmDU">Install OpenRAG with TUI</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docker"><span title="Install OpenRAG containers" class="linkLabel_WmDU">Install OpenRAG containers</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/agents"><span title="Langflow in OpenRAG" class="linkLabel_WmDU">Langflow in OpenRAG</span></a></li><li class="theme-doc-sidebar-item-link 
theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/knowledge"><span title="OpenSearch in OpenRAG" class="linkLabel_WmDU">OpenSearch in OpenRAG</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" href="/ingestion"><span title="Docling in OpenRAG" class="linkLabel_WmDU">Docling in OpenRAG</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/reference/configuration"><span title="Environment variables" class="linkLabel_WmDU">Environment variables</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/support/troubleshoot"><span title="Troubleshoot OpenRAG" class="linkLabel_WmDU">Troubleshoot OpenRAG</span></a></li></ul></nav></div></div></aside><main class="docMainContainer_TBSr"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_VOVn"><div class="docItemContainer_Djhp"><article><nav class="theme-doc-breadcrumbs breadcrumbsContainer_Z_bl" aria-label="Breadcrumbs"><ul class="breadcrumbs"><li class="breadcrumbs__item"><a aria-label="Home page" class="breadcrumbs__link" href="/"><svg viewBox="0 0 24 24" class="breadcrumbHomeIcon_YNFT"><path d="M10 19v-5h4v5c0 .55.45 1 1 1h3c.55 0 1-.45 1-1v-7h1.7c.46 0 .68-.57.33-.87L12.67 3.6c-.38-.34-.96-.34-1.34 0l-8.36 7.53c-.34.3-.13.87.33.87H5v7c0 .55.45 1 1 1h3c.55 0 1-.45 1-1z" fill="currentColor"></path></svg></a></li><li class="breadcrumbs__item breadcrumbs__item--active"><span class="breadcrumbs__link">Docling in OpenRAG</span></li></ul></nav><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>Docling in 
OpenRAG</h1></header><p>OpenRAG uses <a href="https://docling-project.github.io/docling/" target="_blank" rel="noopener noreferrer" class="">Docling</a> for document ingestion.
More specifically, OpenRAG uses <a href="https://github.com/docling-project/docling-serve" target="_blank" rel="noopener noreferrer" class="">Docling Serve</a>, which starts a <code>docling serve</code> process on your local machine and runs Docling ingestion through an API service.</p>
<p>Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch <code>documents</code> index.</p>
<p>OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.</p>
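<p>To modify OpenRAG&#x27;s ingestion settings, including the Docling settings and ingestion flows, click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-settings2 lucide-settings-2" aria-hidden="true"><path d="M14 17H5"></path><path d="M19 7h-9"></path><circle cx="17" cy="17" r="3"></circle><circle cx="7" cy="7" r="3"></circle></svg> <strong>Settings</strong>.</p>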
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="knowledge-ingestion-settings">Knowledge ingestion settings<a href="#knowledge-ingestion-settings" class="hash-link" aria-label="Direct link to Knowledge ingestion settings" title="Direct link to Knowledge ingestion settings" translate="no"></a></h2>
<p>These settings configure the Docling ingestion parameters.</p>
<p>OpenRAG will warn you if <code>docling serve</code> is not running.
To start or stop <code>docling serve</code> or any other native services, in the TUI main menu, click <strong>Start Native Services</strong> or <strong>Stop Native Services</strong>.</p>
<p><strong>Embedding model</strong> determines which AI model is used to create vector embeddings. The default is the OpenAI <code>text-embedding-3-small</code> model.</p>
<p><strong>Chunk size</strong> determines how large each text chunk is in number of characters.
Larger chunks yield more context per chunk, but can include irrelevant information. Smaller chunks yield more precise semantic search, but can lack context.
The default value of <code>1000</code> characters provides a good starting point that balances these considerations.</p>
<p><strong>Chunk overlap</strong> controls the number of characters that are shared between adjacent chunks at chunk boundaries.
Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.
The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents.</p>
<p><strong>Table Structure</strong> enables Docling&#x27;s <a href="https://docling-project.github.io/docling/reference/document_converter/" target="_blank" rel="noopener noreferrer" class=""><code>DocumentConverter</code></a> tool for parsing tables. Instead of treating tables as plain text, tables are output as structured table data with preserved relationships and metadata. <strong>Table Structure</strong> is enabled by default.</p>
<p><strong>OCR</strong> enables or disables OCR processing when extracting text from images and scanned documents.
OCR is disabled by default. The default setting is best suited for processing text-based documents as quickly as possible with Docling&#x27;s <a href="https://docling-project.github.io/docling/reference/document_converter/" target="_blank" rel="noopener noreferrer" class=""><code>DocumentConverter</code></a>. When OCR is disabled, images are ignored and not processed.</p>
<p>Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance.</p>
<p>If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the <a href="https://www.piwheels.org/project/ocrmac/" target="_blank" rel="noopener noreferrer" class="">ocrmac</a> OCR engine. Other platforms use <a href="https://www.jaided.ai/easyocr/" target="_blank" rel="noopener noreferrer" class="">easyocr</a>.</p>
<p><strong>Picture descriptions</strong> adds image descriptions generated by the <a href="https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct" target="_blank" rel="noopener noreferrer" class="">SmolVLM-256M-Instruct</a> model to OCR processing. Enabling picture descriptions can slow ingestion performance.</p>
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="knowledge-ingestion-flows">Knowledge ingestion flows<a href="#knowledge-ingestion-flows" class="hash-link" aria-label="Direct link to Knowledge ingestion flows" title="Direct link to Knowledge ingestion flows" translate="no"></a></h2>
<p><a href="https://docs.langflow.org/concepts-overview" target="_blank" rel="noopener noreferrer" class="">Flows</a> in Langflow are functional representations of application workflows, with multiple <a href="https://docs.langflow.org/concepts-components" target="_blank" rel="noopener noreferrer" class="">component</a> nodes connected as single steps in a workflow.</p>
<p>The <strong>OpenSearch Ingestion</strong> flow is the default knowledge ingestion flow in OpenRAG: when you <strong>Add Knowledge</strong> in OpenRAG, you run the OpenSearch Ingestion flow in the background. The flow ingests documents using <strong>Docling Serve</strong> to import and process documents.</p>
<p>This flow contains thirteen components connected together to process and store documents in your knowledge base.</p>
<ul>
<li class="">The <a href="https://docs.langflow.org/bundles-docling" target="_blank" rel="noopener noreferrer" class=""><strong>Docling Serve</strong> component</a> processes input documents by connecting to your instance of Docling Serve.</li>
<li class="">The <a href="https://docs.langflow.org/components-docling" target="_blank" rel="noopener noreferrer" class=""><strong>Export DoclingDocument</strong> component</a> exports the processed DoclingDocument to markdown format with image export mode set to placeholder. This conversion transforms the structured document data into a standardized format for further processing.</li>
<li class="">Three <a href="https://docs.langflow.org/components-processing#dataframe-operations" target="_blank" rel="noopener noreferrer" class=""><strong>DataFrame Operations</strong> components</a> sequentially add metadata columns to the document data of <code>filename</code>, <code>file_size</code>, and <code>mimetype</code>.</li>
<li class="">The <a href="https://docs.langflow.org/components-processing#split-text" target="_blank" rel="noopener noreferrer" class=""><strong>Split Text</strong> component</a> splits the processed text into chunks with a chunk size of 1000 characters and an overlap of 200 characters.</li>
<li class="">Four <strong>Secret Input</strong> components provide secure access to configuration variables: <code>CONNECTOR_TYPE</code>, <code>OWNER</code>, <code>OWNER_EMAIL</code>, and <code>OWNER_NAME</code>. These are runtime variables populated from OAuth login.</li>
<li class="">The <strong>Create Data</strong> component combines the secret inputs into a structured data object that will be associated with the document embeddings.</li>
<li class="">The <a href="https://docs.langflow.org/components-embedding-models" target="_blank" rel="noopener noreferrer" class=""><strong>Embedding Model</strong> component</a> generates vector embeddings using OpenAI&#x27;s <code>text-embedding-3-small</code> model. The embedding model is selected during application onboarding and cannot be changed.</li>
<li class="">The <a href="https://docs.langflow.org/bundles-elastic#opensearch" target="_blank" rel="noopener noreferrer" class=""><strong>OpenSearch</strong> component</a> stores the processed documents and their embeddings in the <code>documents</code> index at <code>https://opensearch:9200</code>. By default, the component is authenticated with a JWT token, but you can also select <code>basic</code> auth mode, and enter your OpenSearch admin username and password.</li>
</ul>
<p>All flows included with OpenRAG are designed to be modular, performant, and provider-agnostic.
To modify a flow, click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-settings2 lucide-settings-2" aria-hidden="true"><path d="M14 17H5"></path><path d="M19 7h-9"></path><circle cx="17" cy="17" r="3"></circle><circle cx="7" cy="7" r="3"></circle></svg> <strong>Settings</strong>, and click <strong>Edit in Langflow</strong>.
OpenRAG&#x27;s visual editor is based on the <a href="https://docs.langflow.org/concepts-overview" target="_blank" rel="noopener noreferrer" class="">Langflow visual editor</a>, so you can edit your flows to match your specific use case.</p>
<h3 class="anchor anchorTargetStickyNavbar_Vzrq" id="url-flow">OpenSearch URL Ingestion flow<a href="#url-flow" class="hash-link" aria-label="Direct link to OpenSearch URL Ingestion flow" title="Direct link to OpenSearch URL Ingestion flow" translate="no"></a></h3>
<p>An additional knowledge ingestion flow is included in OpenRAG, where it is used as an MCP tool by the <a class="" href="/agents#flow"><strong>OpenSearch Agent flow</strong></a>.
The agent calls this component to fetch web content, and the results are ingested into OpenSearch.</p>
<p>For more on using MCP clients in Langflow, see <a href="https://docs.langflow.org/mcp-client" target="_blank" rel="noopener noreferrer" class="">MCP clients</a>.<br>
<!-- -->To connect additional MCP servers to the MCP client, see <a href="https://docs.langflow.org/mcp-tutorial" target="_blank" rel="noopener noreferrer" class="">Connect to MCP servers from your application</a>.</p>
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="use-openrag-default-ingestion-instead-of-docling-serve">Use OpenRAG default ingestion instead of Docling serve<a href="#use-openrag-default-ingestion-instead-of-docling-serve" class="hash-link" aria-label="Direct link to Use OpenRAG default ingestion instead of Docling serve" title="Direct link to Use OpenRAG default ingestion instead of Docling serve" translate="no"></a></h2>
<p>If you want to use OpenRAG&#x27;s built-in pipeline instead of Docling Serve, set <code>DISABLE_INGEST_WITH_LANGFLOW=true</code> in <a class="" href="/reference/configuration#document-processing">Environment variables</a>.</p>
<p>The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API.</p>
<p>For more information, see <a href="https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58" target="_blank" rel="noopener noreferrer" class=""><code>processors.py</code> in the OpenRAG repository</a>.</p>
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="performance-expectations">Performance expectations<a href="#performance-expectations" class="hash-link" aria-label="Direct link to Performance expectations" title="Direct link to Performance expectations" translate="no"></a></h2>
<p>On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.
This equates to approximately 2.3 seconds per document (about 0.43 documents per second).</p>
<p>You can generally expect equal or better performance on developer laptops and significantly faster performance on servers.
Throughput scales with CPU cores, memory, storage speed, and configuration choices such as embedding model, chunk size and overlap, and concurrency.</p>
<p>This test returned 12 errors (approximately 1.1%).
All errors were file-specific, and they didn&#x27;t stop the pipeline.</p>
<p>Ingestion dataset:</p>
<ul>
<li class="">Total files: 1,083 items mounted</li>
<li class="">Total size on disk: 5,026,474,862 bytes (approximately 5.03 GB)</li>
</ul>
<p>Hardware specifications:</p>
<ul>
<li class="">Machine: Apple M4 Pro</li>
<li class="">Podman VM:<!-- -->
<ul>
<li class="">Name: <code>podman-machine-default</code></li>
<li class="">Type: <code>applehv</code></li>
<li class="">vCPUs: 7</li>
<li class="">Memory: 8 GiB</li>
<li class="">Disk size: 100 GiB</li>
</ul>
</li>
</ul>
<p>Test results:</p>
<div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#393A34;--prism-background-color:#f6f8fa"><div class="codeBlockContent_QJqH"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar" style="color:#393A34;background-color:#f6f8fa"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#393A34"><span class="token plain">2025-09-24T22:40:45.542190Z /app/src/main.py:231 Ingesting default documents when ready disable_langflow_ingest=False</span><br></span><span class="token-line" style="color:#393A34"><span class="token plain">2025-09-24T22:40:45.546385Z /app/src/main.py:270 Using Langflow ingestion pipeline for default documents file_count=1082</span><br></span><span class="token-line" style="color:#393A34"><span class="token plain">...</span><br></span><span class="token-line" style="color:#393A34"><span class="token plain">2025-09-24T23:19:44.866365Z /app/src/main.py:351 Langflow ingestion completed success_count=1070 error_count=12 total_files=1082</span><br></span></code></pre></div></div>
<p>Elapsed time: ~42 minutes 15 seconds (2,535 seconds)</p>
<p>Throughput: ~0.43 documents/second (~2.3 seconds per document)</p></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="row margin-top--sm theme-doc-footer-edit-meta-row"><div class="col noPrint_WFHX"><a href="https://github.com/openrag/openrag/tree/main/docs/docs/core-components/ingestion.mdx" target="_blank" rel="noopener noreferrer" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_JAkA"></div></div></footer></article><nav class="docusaurus-mt-lg pagination-nav" aria-label="Docs pages"><a class="pagination-nav__link pagination-nav__link--prev" href="/knowledge"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">OpenSearch in OpenRAG</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/reference/configuration"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Environment variables</div></a></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#knowledge-ingestion-settings" class="table-of-contents__link toc-highlight">Knowledge ingestion settings</a></li><li><a href="#knowledge-ingestion-flows" class="table-of-contents__link toc-highlight">Knowledge ingestion flows</a><ul><li><a href="#url-flow" class="table-of-contents__link toc-highlight">OpenSearch URL Ingestion flow</a></li></ul></li><li><a href="#use-openrag-default-ingestion-instead-of-docling-serve" class="table-of-contents__link toc-highlight">Use OpenRAG default ingestion instead of Docling serve</a></li><li><a href="#performance-expectations" class="table-of-contents__link 
toc-highlight">Performance expectations</a></li></ul></div></div></div></div></main></div></div></div><footer class="theme-layout-footer footer"><div class="container container-fluid"><div class="row footer__links"><div class="theme-layout-footer-column col footer__col"><div class="footer__title"></div><ul class="footer__items clean-list"><li class="footer__item"><div class="footer-links">
<span>© 2025 OpenRAG</span>
</div></li></ul></div></div></div></footer></div>
</body>
</html>