<!doctype html>
<html lang="en" dir="ltr" class="docs-wrapper plugin-docs plugin-id-default docs-version-current docs-doc-page docs-doc-id-core-components/ingestion" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v3.9.2">
<title data-rh="true">Ingest knowledge | OpenRAG</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://docs.openr.ag/ingestion"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="current"><meta data-rh="true" name="docusaurus_tag" content="docs-default-current"><meta data-rh="true" name="docsearch:version" content="current"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-current"><meta data-rh="true" property="og:title" content="Ingest knowledge | OpenRAG"><meta data-rh="true" name="description" content="Upload documents to your OpenRAG OpenSearch instance to populate your knowledge base with unique content, such as your own company documents, research papers, or websites."><meta data-rh="true" property="og:description" content="Upload documents to your OpenRAG OpenSearch instance to populate your knowledge base with unique content, such as your own company documents, research papers, or websites."><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://docs.openr.ag/ingestion"><link data-rh="true" rel="alternate" href="https://docs.openr.ag/ingestion" hreflang="en"><link data-rh="true" rel="alternate" href="https://docs.openr.ag/ingestion" hreflang="x-default"><link data-rh="true" rel="preconnect" href="https://SMEA51Q5OL-dsn.algolia.net" crossorigin="anonymous"><script data-rh="true" type="application/ld+json">{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Ingest knowledge","item":"https://docs.openr.ag/ingestion"}]}</script><link rel="search" 
type="application/opensearchdescription+xml" title="OpenRAG" href="/opensearch.xml">
<!-- Consent defaults: create the dataLayer queue and the gtag() helper, then mark all four consent categories as denied. -->
<script>
  window.dataLayer = window.dataLayer || [];

  function gtag() {
    // Push the raw arguments object onto the queue.
    dataLayer.push(arguments);
  }

  gtag("consent", "default", {
    ad_storage: "denied",
    ad_user_data: "denied",
    ad_personalization: "denied",
    analytics_storage: "denied"
  });
</script>
<!-- Consent sync: when window.truste.cma is available, read the visitor's consent and forward it to gtag(). -->
<script>
  (function () {
    function syncConsent() {
      // Nothing to do until the consent manager API has been loaded.
      if (window.truste === undefined || !window.truste.cma) {
        return;
      }

      var consent = window.truste.cma.callApi("getConsent", window.location.href) || {};
      // Index 2 drives the three ad-related categories; index 3 drives analytics storage.
      var adsState = consent[2] === 1 ? "granted" : "denied";
      var analyticsState = consent[3] === 1 ? "granted" : "denied";

      gtag("consent", "update", {
        ad_storage: adsState,
        ad_user_data: adsState,
        ad_personalization: adsState,
        analytics_storage: analyticsState
      });
    }

    // Re-sync whenever the consent manager reports a change.
    if (window.addEventListener) {
      window.addEventListener("cm_data_subject_consent_changed", syncConsent);
      window.addEventListener("cm_consent_preferences_set", syncConsent);
    }

    // Sync once now if the page has already loaded, otherwise when it finishes loading.
    if (document.readyState === "complete") {
      syncConsent();
    } else {
      window.addEventListener("load", syncConsent);
    }
  })();
</script>
<!-- Analytics configuration globals; presumably consumed by the ibm-common.js loader below (TODO confirm). -->
<script>
  window._ibmAnalytics = {
    settings: { name: "DataStax", tealiumProfileName: "ibm-subsidiary" },
    trustarc: { privacyPolicyLink: "https://ibm.com/privacy" }
  };
  window.digitalData = {
    page: {
      pageInfo: { ibm: { siteId: "IBM_DataStax" } },
      category: { primaryCategory: "PC230" }
    }
  };
</script>
<!-- Third-party loader: explicit https scheme instead of a protocol-relative URL. -->
<script src="https://1.www.s81c.com/common/stats/ibm-common.js" async></script>
<link rel="stylesheet" href="/assets/css/styles.29e42e49.css">
<!-- Site bundles: deferred so they run in document order after parsing. -->
<script src="/assets/js/runtime~main.a09b72d8.js" defer></script>
<script src="/assets/js/main.e1ba3126.js" defer></script>
</head>
<body class="navigation-with-keyboard">
<svg style="display: none;"><defs>
|
||
<symbol id="theme-svg-external-link" viewBox="0 0 24 24"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"/></symbol>
|
||
</defs></svg>
|
||
<script>!function(){var t=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return window.localStorage.getItem("theme")}catch(t){}}();document.documentElement.setAttribute("data-theme",t||"light"),document.documentElement.setAttribute("data-theme-choice",t||"light")}(),function(){try{const c=new URLSearchParams(window.location.search).entries();for(var[t,e]of c)if(t.startsWith("docusaurus-data-")){var a=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(a,e)}}catch(t){}}()</script><div id="__docusaurus"><link rel="preload" as="image" href="/img/logo-openrag-light.svg"><link rel="preload" as="image" href="/img/logo-openrag-dark.svg"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="theme-layout-navbar navbar navbar--fixed-top"><div class="navbar__inner"><div class="theme-layout-navbar-left navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/img/logo-openrag-light.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/img/logo-openrag-dark.svg" alt="OpenRAG Logo" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div></a></div><div class="theme-layout-navbar-right navbar__items navbar__items--right"><a href="https://github.com/langflow-ai/openrag" target="_blank" class="navbar__item navbar__link header-github-link" aria-label="GitHub repository"></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP 
toggleButtonDisabled_aARS" type="button" disabled="" title="system mode" aria-label="Switch between dark and light mode (currently system mode)"><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" aria-hidden="true" class="toggleIcon_g3eP systemToggleIcon_QzmC"><path fill="currentColor" d="m12 21c4.971 0 9-4.029 9-9s-4.029-9-9-9-9 4.029-9 9 4.029 
9 9 9zm4.95-13.95c1.313 1.313 2.05 3.093 2.05 4.95s-0.738 3.637-2.05 4.95c-1.313 1.313-3.093 2.05-4.95 2.05v-14c1.857 0 3.637 0.737 4.95 2.05z"></path></svg></button></div><div class="navbarSearchContainer_Bca1"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search (Meta+k)" aria-keyshortcuts="Meta+k"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 24 24" aria-hidden="true"><circle cx="11" cy="11" r="8" stroke="currentColor" fill="none" stroke-width="1.4"></circle><path d="m21 21-4.3-4.3" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="theme-layout-main main-wrapper mainWrapper_z2l0"><div class="docsWrapper_hBAB"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docRoot_UBD9"><aside class="theme-doc-sidebar-container docSidebarContainer_YfHR"><div class="sidebarViewport_aRkj"><div class="sidebar_njMd"><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/"><span title="About OpenRAG" class="linkLabel_WmDU">About OpenRAG</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/quickstart"><span title="Quickstart" class="linkLabel_WmDU">Quickstart</span></a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a 
class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role="button" aria-expanded="false" href="/install-options"><span title="Installation" class="categoryLinkLabel_W154">Installation</span></a></div></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/tui"><span title="Use the TUI" class="linkLabel_WmDU">Use the TUI</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/manage-services"><span title="Manage services" class="linkLabel_WmDU">Manage services</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/agents"><span title="Flows" class="linkLabel_WmDU">Flows</span></a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret menu__link--active" role="button" aria-expanded="true" href="/knowledge"><span title="Knowledge" class="categoryLinkLabel_W154">Knowledge</span></a></div><ul class="menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/knowledge"><span title="Configure knowledge" class="linkLabel_WmDU">Configure knowledge</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" tabindex="0" href="/ingestion"><span title="Ingest knowledge" class="linkLabel_WmDU">Ingest knowledge</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link" tabindex="0" href="/knowledge-filters"><span title="Filter knowledge" class="linkLabel_WmDU">Filter 
knowledge</span></a></li></ul></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/chat"><span title="Chat" class="linkLabel_WmDU">Chat</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/reference/configuration"><span title="Environment variables" class="linkLabel_WmDU">Environment variables</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/reference/api-sdk-overview"><span title="APIs and SDKs" class="linkLabel_WmDU">APIs and SDKs</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/support/contribute"><span title="Contribute to OpenRAG" class="linkLabel_WmDU">Contribute to OpenRAG</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/support/troubleshoot"><span title="Troubleshoot OpenRAG" class="linkLabel_WmDU">Troubleshoot OpenRAG</span></a></li><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a href="https://github.com/langflow-ai/openrag/releases" target="_blank" rel="noopener noreferrer" class="menu__link menuExternalLink_NmtK"><span title="Changelog" class="linkLabel_WmDU">Changelog</span><svg width="13.5" height="13.5" aria-label="(opens in new tab)" class="iconExternalLink_nPIU"><use href="#theme-svg-external-link"></use></svg></a></li></ul></nav></div></div></aside><main class="docMainContainer_TBSr"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_VOVn"><div class="docItemContainer_Djhp"><article><nav class="theme-doc-breadcrumbs breadcrumbsContainer_Z_bl" aria-label="Breadcrumbs"><ul class="breadcrumbs"><li class="breadcrumbs__item"><a aria-label="Home 
page" class="breadcrumbs__link" href="/"><svg viewBox="0 0 24 24" class="breadcrumbHomeIcon_YNFT"><path d="M10 19v-5h4v5c0 .55.45 1 1 1h3c.55 0 1-.45 1-1v-7h1.7c.46 0 .68-.57.33-.87L12.67 3.6c-.38-.34-.96-.34-1.34 0l-8.36 7.53c-.34.3-.13.87.33.87H5v7c0 .55.45 1 1 1h3c.55 0 1-.45 1-1z" fill="currentColor"></path></svg></a></li><li class="breadcrumbs__item"><span class="breadcrumbs__link">Knowledge</span></li><li class="breadcrumbs__item breadcrumbs__item--active"><span class="breadcrumbs__link">Ingest knowledge</span></li></ul></nav><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>Ingest knowledge</h1></header><p>Upload documents to your <a class="" href="/knowledge">OpenRAG OpenSearch instance</a> to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.
Documents are processed through OpenRAG's knowledge ingestion flows with Docling.</p>
<p>OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth-authenticated connectors.</p>
<p>Knowledge ingestion is powered by OpenRAG's built-in knowledge ingestion flows that use Docling to process documents before storing the documents in your OpenSearch database.
During ingestion, documents are broken into smaller chunks of content that are then embedded using your selected <a class="" href="/knowledge#set-the-embedding-model-and-dimensions">embedding model</a>.
Then, the chunks, embeddings, and associated metadata (which connects chunks of the same document) are stored in your OpenSearch database.</p>
<p>To modify chunking behavior and other ingestion settings, see <a class="" href="/knowledge#knowledge-ingestion-settings">Knowledge ingestion settings</a> and <a class="" href="/agents#inspect-and-modify-flows">Inspect and modify flows</a>.</p>
|
||
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="ingest-local-files-and-folders">Ingest local files and folders<a href="#ingest-local-files-and-folders" class="hash-link" aria-label="Direct link to Ingest local files and folders" title="Direct link to Ingest local files and folders" translate="no"></a></h2>
|
||
<p>You can upload files and folders from your local machine to your knowledge base:</p>
|
||
<ol>
|
||
<li class="">
|
||
<p>Click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-library" aria-hidden="true"><path d="m16 6 4 14"></path><path d="M12 6v14"></path><path d="M8 8v12"></path><path d="M4 4v16"></path></svg> <strong>Knowledge</strong> to view your OpenSearch knowledge base.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Click <strong>Add Knowledge</strong> to add your own documents to your OpenRAG knowledge base.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>To upload one file, click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-file" aria-hidden="true"><path d="M6 22a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h8a2.4 2.4 0 0 1 1.704.706l3.588 3.588A2.4 2.4 0 0 1 20 8v12a2 2 0 0 1-2 2z"></path><path d="M14 2v5a1 1 0 0 0 1 1h5"></path></svg> <strong>File</strong>. To upload all documents in a folder, click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-folder" aria-hidden="true"><path d="M20 20a2 2 0 0 0 2-2V8a2 2 0 0 0-2-2h-7.9a2 2 0 0 1-1.69-.9L9.6 3.9A2 2 0 0 0 7.93 3H4a2 2 0 0 0-2 2v13a2 2 0 0 0 2 2Z"></path></svg> <strong>Folder</strong>.</p>
|
||
<p>The default path is <code>~/.openrag/documents</code>.
To change this path, see <a class="" href="/knowledge#set-the-local-documents-path">Set the local documents path</a>.</p>
|
||
</li>
|
||
</ol>
|
||
<p>The selected files are processed in the background through the <strong>OpenSearch Ingestion</strong> flow.</p>
|
||
<details class="details_lb9f alert alert--info details_b_Ee" data-collapsed="true"><summary>About the OpenSearch Ingestion flow</summary><div><div class="collapsibleContent_i85q"><p>When you upload documents locally or with OAuth connectors, the <strong>OpenSearch Ingestion</strong> flow runs in the background.
By default, this flow uses Docling Serve to import and process documents.</p><p>Like all <a class="" href="/agents">OpenRAG flows</a>, you can <a class="" href="/agents#inspect-and-modify-flows">inspect the flow in Langflow</a>, and you can customize it if you want to change the knowledge ingestion settings.</p><p>The <strong>OpenSearch Ingestion</strong> flow consists of several components that work together to process and store documents in your knowledge base:</p><ul>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/bundles-docling#docling-serve" target="_blank" rel="noopener noreferrer" class=""><strong>Docling Serve</strong> component</a>: Ingests files and processes them by connecting to OpenRAG's local Docling Serve service. The output is <code>DoclingDocument</code> data that contains the extracted text and metadata from the documents.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/bundles-docling#export-doclingdocument" target="_blank" rel="noopener noreferrer" class=""><strong>Export DoclingDocument</strong> component</a>: Exports processed <code>DoclingDocument</code> data to Markdown format with image placeholders. This conversion standardizes the document data in preparation for further processing.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/dataframe-operations" target="_blank" rel="noopener noreferrer" class=""><strong>DataFrame Operations</strong> component</a>: Three of these components run sequentially to add metadata to the document data: <code>filename</code>, <code>file_size</code>, and <code>mimetype</code>.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/split-text" target="_blank" rel="noopener noreferrer" class=""><strong>Split Text</strong> component</a>: Splits the processed text into chunks, based on the configured <a class="" href="/knowledge#knowledge-ingestion-settings">chunk size and overlap settings</a>.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><strong>Secret Input</strong> component: If needed, four of these components securely fetch the <a class="" href="/knowledge#auth">OAuth authentication</a> configuration variables: <code>CONNECTOR_TYPE</code>, <code>OWNER</code>, <code>OWNER_EMAIL</code>, and <code>OWNER_NAME</code>.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><strong>Create Data</strong> component: Combines the authentication credentials from the <strong>Secret Input</strong> components into a structured data object that is associated with the document embeddings.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/components-embedding-models" target="_blank" rel="noopener noreferrer" class=""><strong>Embedding Model</strong> component</a>: Generates vector embeddings using your selected <a class="" href="/knowledge#set-the-embedding-model-and-dimensions">embedding model</a>.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/bundles-elastic#opensearch" target="_blank" rel="noopener noreferrer" class=""><strong>OpenSearch</strong> component</a>: Stores the processed documents and their embeddings in a <code>documents</code> index of your OpenRAG <a class="" href="/knowledge">OpenSearch knowledge base</a>.</p>
|
||
<p>The default address for the OpenSearch instance is <code>https://opensearch:9200</code>. To change the port in this address, edit the <code>OPENSEARCH_PORT</code> <a class="" href="/reference/configuration#opensearch-settings">environment variable</a>.</p>
|
||
<p>The default authentication method is JSON Web Token (JWT) authentication. If you <a class="" href="/agents#inspect-and-modify-flows">edit the flow</a>, you can select <code>basic</code> auth mode, which uses the <code>OPENSEARCH_USERNAME</code> and <code>OPENSEARCH_PASSWORD</code> <a class="" href="/reference/configuration#opensearch-settings">environment variables</a> for authentication instead of JWT.</p>
|
||
</li>
|
||
</ul></div></div></details>
|
||
<p>You can <a href="#monitor-ingestion" class="">monitor ingestion</a> to see the progress of the uploads and check for failed uploads.</p>
|
||
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="ingest-local-files-temporarily">Ingest local files temporarily<a href="#ingest-local-files-temporarily" class="hash-link" aria-label="Direct link to Ingest local files temporarily" title="Direct link to Ingest local files temporarily" translate="no"></a></h2>
|
||
<p>When using the OpenRAG <strong>Chat</strong>, click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-plus" aria-hidden="true"><path d="M5 12h14"></path><path d="M12 5v14"></path></svg> <strong>Add</strong> in the chat input field to upload a file to the current chat session.
Files added this way are processed and made available to the agent for the current conversation only.
These files aren't stored in the knowledge base permanently.</p>
|
||
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="oauth-ingestion">Ingest files with OAuth connectors<a href="#oauth-ingestion" class="hash-link" aria-label="Direct link to Ingest files with OAuth connectors" title="Direct link to Ingest files with OAuth connectors" translate="no"></a></h2>
|
||
<p>OpenRAG can use OAuth-authenticated connectors to ingest documents from the following external services:</p>
|
||
<ul>
|
||
<li class="">AWS S3</li>
|
||
<li class="">Google Drive</li>
|
||
<li class="">Microsoft OneDrive</li>
|
||
<li class="">Microsoft SharePoint</li>
|
||
</ul>
|
||
<p>These connectors enable seamless ingestion of files from cloud storage to your OpenRAG knowledge base.</p>
|
||
<p>Individual users can connect their personal cloud storage accounts to OpenRAG. Each user must separately authorize OpenRAG to access their own cloud storage. When a user connects a cloud storage service, they are redirected to authenticate with that service provider and grant OpenRAG permission to sync documents from their personal cloud storage.</p>
|
||
<h3 class="anchor anchorTargetStickyNavbar_Vzrq" id="enable-oauth-connectors">Enable OAuth connectors<a href="#enable-oauth-connectors" class="hash-link" aria-label="Direct link to Enable OAuth connectors" title="Direct link to Enable OAuth connectors" translate="no"></a></h3>
|
||
<p>Before users can connect their own cloud storage accounts, you must configure the provider's OAuth credentials in OpenRAG. Typically, this requires that you register OpenRAG as an OAuth application in your cloud provider, and then obtain the app's OAuth credentials, such as a client ID and secret key.
To enable multiple connectors, you must register an app and generate credentials for each provider.</p>
|
||
<div class="theme-tabs-container tabs-container tabList__CuJ"><ul role="tablist" aria-orientation="horizontal" class="tabs"><li role="tab" tabindex="0" aria-selected="true" class="tabs__item tabItem_LNqP tabs__item--active">TUI-managed services</li><li role="tab" tabindex="-1" aria-selected="false" class="tabs__item tabItem_LNqP">Self-managed services</li></ul><div class="margin-top--md"><div role="tabpanel" class="tabItem_Ymn6"><p>If you use the <a class="" href="/tui">Terminal User Interface (TUI)</a> to manage your OpenRAG services, enter OAuth credentials on the <strong>Advanced Setup</strong> page.
You can do this during <a class="" href="/install#setup">installation</a>, or you can add the credentials afterwards:</p><ol>
|
||
<li class="">
|
||
<p>If OpenRAG is running, click <strong>Stop All Services</strong> in the TUI.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Open the <strong>Advanced Setup</strong> page, and then add the OAuth credentials for the cloud storage providers that you want to use under <strong>API Keys</strong>:</p>
|
||
<ul>
|
||
<li class=""><strong>Google</strong>: Provide your Google OAuth Client ID and Google OAuth Client Secret. You can generate these in the <a href="https://console.cloud.google.com/apis/credentials" target="_blank" rel="noopener noreferrer" class="">Google Cloud Console</a>. For more information, see the <a href="https://developers.google.com/identity/protocols/oauth2" target="_blank" rel="noopener noreferrer" class="">Google OAuth client documentation</a>.</li>
|
||
<li class=""><strong>Microsoft</strong>: For the Microsoft OAuth Client ID and Microsoft OAuth Client Secret, provide <a href="https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/app-registration?view=odsp-graph-online" target="_blank" rel="noopener noreferrer" class="">Azure application registration credentials for SharePoint and OneDrive</a>. For more information, see the <a href="https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth" target="_blank" rel="noopener noreferrer" class="">Microsoft Graph OAuth client documentation</a>.</li>
|
||
<li class=""><strong>Amazon</strong>: Provide your AWS Access Key ID and AWS Secret Access Key with access to your S3 instance. For more information, see the AWS documentation on <a href="https://docs.aws.amazon.com/singlesignon/latest/userguide/manage-your-applications.html" target="_blank" rel="noopener noreferrer" class="">Configuring access to AWS applications</a>.</li>
|
||
</ul>
|
||
</li>
|
||
<li class="">
|
||
<p>Register the redirect URIs shown in the TUI in your OAuth provider.
These are the URLs your OAuth provider will use to redirect users back to OpenRAG after they sign in.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Click <strong>Save Configuration</strong> to add the OAuth credentials to your <a class="" href="/reference/configuration">OpenRAG <code>.env</code> file</a>.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Click <strong>Start Services</strong> to restart the OpenRAG containers with OAuth enabled.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Launch the OpenRAG app.
You should be prompted to sign in to your OAuth provider before being redirected to your OpenRAG instance.</p>
|
||
</li>
|
||
</ol></div><div role="tabpanel" class="tabItem_Ymn6" hidden=""><p>If you <a class="" href="/docker">installed OpenRAG with self-managed services</a>, set OAuth credentials in your <a class="" href="/reference/configuration">OpenRAG <code>.env</code> file</a>.</p><p>You can do this during <a class="" href="/docker#setup">initial setup</a>, or you can add the credentials afterwards:</p><ol>
|
||
<li class="">
|
||
<p>Stop all OpenRAG containers:</p>
|
||
<div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#393A34;--prism-background-color:#f6f8fa"><div class="codeBlockTitle_OeMC">Docker</div><div class="codeBlockContent_QJqH"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style="color:#393A34;background-color:#f6f8fa"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#393A34"><span class="token function" style="color:#d73a49">docker</span><span class="token plain"> stop </span><span class="token variable" style="color:#36acaa">$(</span><span class="token variable function" style="color:#d73a49">docker</span><span class="token variable" style="color:#36acaa"> </span><span class="token variable function" style="color:#d73a49">ps</span><span class="token variable" style="color:#36acaa"> </span><span class="token variable parameter variable" style="color:#36acaa">-q</span><span class="token variable" style="color:#36acaa">)</span><br></span></code></pre></div></div>
|
||
<div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#393A34;--prism-background-color:#f6f8fa"><div class="codeBlockTitle_OeMC">Podman</div><div class="codeBlockContent_QJqH"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style="color:#393A34;background-color:#f6f8fa"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#393A34"><span class="token function" style="color:#d73a49">podman</span><span class="token plain"> stop </span><span class="token parameter variable" style="color:#36acaa">--all</span><br></span></code></pre></div></div>
|
||
</li>
|
||
<li class="">
|
||
<p>Edit your OpenRAG <code>.env</code> file to add the OAuth credentials for the cloud storage providers that you want to use:</p>
|
||
<ul>
|
||
<li class="">
|
||
<p><strong>Google</strong>: Provide your Google OAuth Client ID and Google OAuth Client Secret. You can generate these in the <a href="https://console.cloud.google.com/apis/credentials" target="_blank" rel="noopener noreferrer" class="">Google Cloud Console</a>. For more information, see the <a href="https://developers.google.com/identity/protocols/oauth2" target="_blank" rel="noopener noreferrer" class="">Google OAuth client documentation</a>.</p>
|
||
<div class="language-env codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#393A34;--prism-background-color:#f6f8fa"><div class="codeBlockContent_QJqH"><pre tabindex="0" class="prism-code language-env codeBlock_bY9V thin-scrollbar" style="color:#393A34;background-color:#f6f8fa"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#393A34"><span class="token plain">GOOGLE_OAUTH_CLIENT_ID=</span><br></span><span class="token-line" style="color:#393A34"><span class="token plain">GOOGLE_OAUTH_CLIENT_SECRET=</span><br></span></code></pre></div></div>
|
||
</li>
|
||
<li class="">
|
||
<p><strong>Microsoft</strong>: For the Microsoft OAuth Client ID and Microsoft OAuth Client Secret, provide <a href="https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/app-registration?view=odsp-graph-online" target="_blank" rel="noopener noreferrer" class="">Azure application registration credentials for SharePoint and OneDrive</a>. For more information, see the <a href="https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth" target="_blank" rel="noopener noreferrer" class="">Microsoft Graph OAuth client documentation</a>.</p>
|
||
<div class="language-env codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#393A34;--prism-background-color:#f6f8fa"><div class="codeBlockContent_QJqH"><pre tabindex="0" class="prism-code language-env codeBlock_bY9V thin-scrollbar" style="color:#393A34;background-color:#f6f8fa"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#393A34"><span class="token plain">MICROSOFT_GRAPH_OAUTH_CLIENT_ID=</span><br></span><span class="token-line" style="color:#393A34"><span class="token plain">MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET=</span><br></span></code></pre></div></div>
|
||
</li>
|
||
<li class="">
|
||
<p><strong>Amazon</strong>: Provide your AWS Access Key ID and AWS Secret Access Key with access to your S3 instance. For more information, see the AWS documentation on <a href="https://docs.aws.amazon.com/singlesignon/latest/userguide/manage-your-applications.html" target="_blank" rel="noopener noreferrer" class="">Configuring access to AWS applications</a>.</p>
|
||
<div class="language-env codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#393A34;--prism-background-color:#f6f8fa"><div class="codeBlockContent_QJqH"><pre tabindex="0" class="prism-code language-env codeBlock_bY9V thin-scrollbar" style="color:#393A34;background-color:#f6f8fa"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#393A34"><span class="token plain">AWS_ACCESS_KEY_ID=</span><br></span><span class="token-line" style="color:#393A34"><span class="token plain">AWS_SECRET_ACCESS_KEY=</span><br></span></code></pre></div></div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="">
|
||
<p>Save the <code>.env</code> file.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Restart your OpenRAG containers:</p>
|
||
<div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#393A34;--prism-background-color:#f6f8fa"><div class="codeBlockTitle_OeMC">Docker</div><div class="codeBlockContent_QJqH"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style="color:#393A34;background-color:#f6f8fa"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#393A34"><span class="token function" style="color:#d73a49">docker</span><span class="token plain"> compose up </span><span class="token parameter variable" style="color:#36acaa">-d</span><br></span></code></pre></div></div>
|
||
<div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#393A34;--prism-background-color:#f6f8fa"><div class="codeBlockTitle_OeMC">Podman</div><div class="codeBlockContent_QJqH"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style="color:#393A34;background-color:#f6f8fa"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#393A34"><span class="token function" style="color:#d73a49">podman</span><span class="token plain"> compose up </span><span class="token parameter variable" style="color:#36acaa">-d</span><br></span></code></pre></div></div>
|
||
</li>
|
||
<li class="">
|
||
<p>Access the OpenRAG frontend at <code>http://localhost:3000</code>.
|
||
You should be prompted to sign in to your OAuth provider before being redirected to your OpenRAG instance.</p>
|
||
</li>
|
||
</ol></div></div></div>
|
||
<h3 class="anchor anchorTargetStickyNavbar_Vzrq" id="authenticate-and-ingest-files-from-cloud-storage">Authenticate and ingest files from cloud storage<a href="#authenticate-and-ingest-files-from-cloud-storage" class="hash-link" aria-label="Direct link to Authenticate and ingest files from cloud storage" title="Direct link to Authenticate and ingest files from cloud storage" translate="no"></a></h3>
|
||
<p>After you start OpenRAG with OAuth connectors enabled, each user is prompted to authenticate with the OAuth provider upon accessing your OpenRAG instance.
|
||
Individual authentication is required to access a user's cloud storage from your OpenRAG instance.
|
||
For example, if a user navigates to the default OpenRAG URL at <code>http://localhost:3000</code>, they are redirected to the OAuth provider's sign-in page.
|
||
After authenticating and granting the required permissions for OpenRAG, the user is redirected back to OpenRAG.</p>
|
||
<p>To ingest knowledge with an OAuth connector, do the following:</p>
|
||
<ol>
|
||
<li class="">
|
||
<p>Click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-library" aria-hidden="true"><path d="m16 6 4 14"></path><path d="M12 6v14"></path><path d="M8 8v12"></path><path d="M4 4v16"></path></svg> <strong>Knowledge</strong> to view your OpenSearch knowledge base.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Click <strong>Add Knowledge</strong>, and then select a storage provider.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>On the <strong>Add Cloud Knowledge</strong> page, click <strong>Add Files</strong>, and then select the files and folders to ingest from the connected storage.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Click <strong>Ingest Files</strong>.</p>
|
||
</li>
|
||
</ol>
|
||
<p>The selected files are processed in the background through the <strong>OpenSearch Ingestion</strong> flow.</p>
|
||
<details class="details_lb9f alert alert--info details_b_Ee" data-collapsed="true"><summary>About the OpenSearch Ingestion flow</summary><div><div class="collapsibleContent_i85q"><p>When you upload documents locally or with OAuth connectors, the <strong>OpenSearch Ingestion</strong> flow runs in the background.
|
||
By default, this flow uses Docling Serve to import and process documents.</p><p>Like all <a class="" href="/agents">OpenRAG flows</a>, you can <a class="" href="/agents#inspect-and-modify-flows">inspect the flow in Langflow</a>, and you can customize it if you want to change the knowledge ingestion settings.</p><p>The <strong>OpenSearch Ingestion</strong> flow consists of several components that work together to process and store documents in your knowledge base:</p><ul>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/bundles-docling#docling-serve" target="_blank" rel="noopener noreferrer" class=""><strong>Docling Serve</strong> component</a>: Ingests files and processes them by connecting to OpenRAG's local Docling Serve service. The output is <code>DoclingDocument</code> data that contains the extracted text and metadata from the documents.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/bundles-docling#export-doclingdocument" target="_blank" rel="noopener noreferrer" class=""><strong>Export DoclingDocument</strong> component</a>: Exports processed <code>DoclingDocument</code> data to Markdown format with image placeholders. This conversion standardizes the document data in preparation for further processing.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/dataframe-operations" target="_blank" rel="noopener noreferrer" class=""><strong>DataFrame Operations</strong> component</a>: Three of these components run sequentially to add metadata to the document data: <code>filename</code>, <code>file_size</code>, and <code>mimetype</code>.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/split-text" target="_blank" rel="noopener noreferrer" class=""><strong>Split Text</strong> component</a>: Splits the processed text into chunks, based on the configured <a class="" href="/knowledge#knowledge-ingestion-settings">chunk size and overlap settings</a>.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><strong>Secret Input</strong> component: If needed, four of these components securely fetch the <a class="" href="/knowledge#auth">OAuth authentication</a> configuration variables: <code>CONNECTOR_TYPE</code>, <code>OWNER</code>, <code>OWNER_EMAIL</code>, and <code>OWNER_NAME</code>.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><strong>Create Data</strong> component: Combines the authentication credentials from the <strong>Secret Input</strong> components into a structured data object that is associated with the document embeddings.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/components-embedding-models" target="_blank" rel="noopener noreferrer" class=""><strong>Embedding Model</strong> component</a>: Generates vector embeddings using your selected <a class="" href="/knowledge#set-the-embedding-model-and-dimensions">embedding model</a>.</p>
|
||
</li>
|
||
<li class="">
|
||
<p><a href="https://docs.langflow.org/bundles-elastic#opensearch" target="_blank" rel="noopener noreferrer" class=""><strong>OpenSearch</strong> component</a>: Stores the processed documents and their embeddings in a <code>documents</code> index of your OpenRAG <a class="" href="/knowledge">OpenSearch knowledge base</a>.</p>
|
||
<p>The default address for the OpenSearch instance is <code>https://opensearch:9200</code>. To change this address, edit the <code>OPENSEARCH_PORT</code> <a class="" href="/reference/configuration#opensearch-settings">environment variable</a>.</p>
|
||
<p>The default authentication method is JSON Web Token (JWT) authentication. If you <a class="" href="/agents#inspect-and-modify-flows">edit the flow</a>, you can select <code>basic</code> auth mode, which uses the <code>OPENSEARCH_USERNAME</code> and <code>OPENSEARCH_PASSWORD</code> <a class="" href="/reference/configuration#opensearch-settings">environment variables</a> for authentication instead of JWT.</p>
|
||
</li>
|
||
</ul></div></div></details>
|
||
<p>You can <a href="#monitor-ingestion" class="">monitor ingestion</a> to see the progress of the uploads and check for failed uploads.</p>
|
||
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="url-flow">Ingest knowledge from URLs<a href="#url-flow" class="hash-link" aria-label="Direct link to Ingest knowledge from URLs" title="Direct link to Ingest knowledge from URLs" translate="no"></a></h2>
|
||
<p>When using the OpenRAG chat, you can enter URLs into the chat to be ingested in real time during your conversation.</p>
|
||
<div class="theme-admonition theme-admonition-info admonition_xJq3 alert alert--info"><div class="admonitionHeading_Gvgb"><span class="admonitionIcon_Rf37"><svg viewBox="0 0 14 16"><path fill-rule="evenodd" d="M7 2.3c3.14 0 5.7 2.56 5.7 5.7s-2.56 5.7-5.7 5.7A5.71 5.71 0 0 1 1.3 8c0-3.14 2.56-5.7 5.7-5.7zM7 1C3.14 1 0 4.14 0 8s3.14 7 7 7 7-3.14 7-7-3.14-7-7-7zm1 3H6v5h2V4zm0 6H6v2h2v-2z"></path></svg></span>info</div><div class="admonitionContent_BuS1"><p>The chat cannot ingest URLs that end in static document file extensions like <code>.pdf</code>.
|
||
To upload these types of files, see <a href="#ingest-local-files-and-folders" class="">Ingest local files and folders</a> and <a href="#oauth-ingestion" class="">Ingest files with OAuth connectors</a>.</p></div></div>
|
||
<p>OpenRAG runs the <strong>OpenSearch URL Ingestion</strong> flow to ingest web content from URLs.
|
||
This flow isn't directly accessible from the OpenRAG user interface.
|
||
Instead, this flow is called by the <a class="" href="/chat#flow"><strong>OpenRAG OpenSearch Agent</strong> flow</a> as a Model Context Protocol (MCP) tool.
|
||
The agent can call this tool to fetch web content from a given URL, and then ingest that content into your OpenSearch knowledge base.
|
||
Like all OpenRAG flows, you can <a class="" href="/agents#inspect-and-modify-flows">inspect the flow in Langflow</a>, and you can customize it.
|
||
For more information about MCP in Langflow, see the Langflow documentation on <a href="https://docs.langflow.org/mcp-client" target="_blank" rel="noopener noreferrer" class="">MCP clients</a> and <a href="https://docs.langflow.org/mcp-tutorial" target="_blank" rel="noopener noreferrer" class="">MCP servers</a>.</p>
|
||
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="monitor-ingestion">Monitor ingestion<a href="#monitor-ingestion" class="hash-link" aria-label="Direct link to Monitor ingestion" title="Direct link to Monitor ingestion" translate="no"></a></h2>
|
||
<p>Depending on the amount of data to ingest, document ingestion can take a few seconds, minutes, or longer.
|
||
For this reason, document ingestion tasks run in the background.</p>
|
||
<p>In the OpenRAG user interface, a badge is shown on <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-bell" aria-hidden="true"><path d="M10.268 21a2 2 0 0 0 3.464 0"></path><path d="M3.262 15.326A1 1 0 0 0 4 17h16a1 1 0 0 0 .74-1.673C19.41 13.956 18 12.499 18 8A6 6 0 0 0 6 8c0 4.499-1.411 5.956-2.738 7.326"></path></svg> <strong>Tasks</strong> when OpenRAG tasks are active.
|
||
Click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-bell" aria-hidden="true"><path d="M10.268 21a2 2 0 0 0 3.464 0"></path><path d="M3.262 15.326A1 1 0 0 0 4 17h16a1 1 0 0 0 .74-1.673C19.41 13.956 18 12.499 18 8A6 6 0 0 0 6 8c0 4.499-1.411 5.956-2.738 7.326"></path></svg> <strong>Tasks</strong> to inspect and cancel tasks.
|
||
Tasks are separated into multiple sections:</p>
|
||
<ul>
|
||
<li class="">
|
||
<p>The <strong>Active Tasks</strong> section includes all tasks that are <strong>Pending</strong>, <strong>Running</strong>, or <strong>Processing</strong>:</p>
|
||
<ul>
|
||
<li class=""><strong>Pending</strong>: The task is queued and waiting to start.</li>
|
||
<li class=""><strong>Running</strong>: The task is actively processing files.</li>
|
||
<li class=""><strong>Processing</strong>: The task is performing ingestion operations.</li>
|
||
</ul>
|
||
<p>To stop an active task, click <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-x" aria-hidden="true"><path d="M18 6 6 18"></path><path d="m6 6 12 12"></path></svg> <strong>Cancel</strong>. Canceling a task stops processing immediately and marks the ingestion as failed.</p>
|
||
</li>
|
||
<li class="">
|
||
<p>The <strong>Recent Tasks</strong> section lists recently finished tasks.</p>
|
||
<div class="theme-admonition theme-admonition-warning admonition_xJq3 alert alert--warning"><div class="admonitionHeading_Gvgb"><span class="admonitionIcon_Rf37"><svg viewBox="0 0 16 16"><path fill-rule="evenodd" d="M8.893 1.5c-.183-.31-.52-.5-.887-.5s-.703.19-.886.5L.138 13.499a.98.98 0 0 0 0 1.001c.193.31.53.501.886.501h13.964c.367 0 .704-.19.877-.5a1.03 1.03 0 0 0 .01-1.002L8.893 1.5zm.133 11.497H6.987v-2.003h2.039v2.003zm0-3.004H6.987V5.987h2.039v4.006z"></path></svg></span>warning</div><div class="admonitionContent_BuS1"><p><strong>Completed</strong> doesn't mean success.</p><p>A completed task can report successful ingestions, failed ingestions, or both, depending on the number of files processed.</p></div></div>
|
||
<p>Check the <strong>Success</strong> and <strong>Failed</strong> counts for each completed task to determine the overall success rate.</p>
|
||
<p><strong>Failed</strong> means something went wrong during ingestion, or the task was manually canceled.
|
||
For more information, see <a href="#troubleshoot-ingestion" class="">Troubleshoot ingestion</a>.</p>
|
||
</li>
|
||
</ul>
|
||
<p>For each task, depending on its state, you can find the task ID, start time, duration, number of files processed successfully, number of files that failed, and the number of files enqueued for processing.</p>
|
||
<h3 class="anchor anchorTargetStickyNavbar_Vzrq" id="ingestion-performance-expectations">Ingestion performance expectations<a href="#ingestion-performance-expectations" class="hash-link" aria-label="Direct link to Ingestion performance expectations" title="Direct link to Ingestion performance expectations" translate="no"></a></h3>
|
||
<p>The following performance test was conducted with Docling Serve.</p>
|
||
<p>On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.
|
||
This equates to approximately 0.43 documents per second, or about 2.3 seconds per document.</p>
|
||
<p>You can generally expect equal or better performance on developer laptops, and significantly faster performance on servers.
|
||
Throughput scales with CPU cores, memory, storage speed, and configuration choices, such as the embedding model, chunk size, overlap, and concurrency.</p>
|
||
<p>This test returned 12 errors, approximately 1.1 percent of the total files ingested.
|
||
All errors were file-specific, and they didn't stop the pipeline.</p>
|
||
<details class="details_lb9f alert alert--info details_b_Ee" data-collapsed="true"><summary>Ingestion performance test details</summary><div><div class="collapsibleContent_i85q"><ul>
|
||
<li class="">
|
||
<p>Ingestion dataset:</p>
|
||
<ul>
|
||
<li class="">Total files: 1,083 items mounted</li>
|
||
<li class="">Total size on disk: 5,026,474,862 bytes (approximately 5.03 GB)</li>
|
||
</ul>
|
||
</li>
|
||
<li class="">
|
||
<p>Hardware specifications:</p>
|
||
<ul>
|
||
<li class="">
|
||
<p>Machine: Apple M4 Pro</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Podman VM:</p>
|
||
<ul>
|
||
<li class="">Name: podman-machine-default</li>
|
||
<li class="">Type: applehv</li>
|
||
<li class="">vCPUs: 7</li>
|
||
<li class="">Memory: 8 GiB</li>
|
||
<li class="">Disk size: 100 GiB</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="">
|
||
<p>Test results:</p>
|
||
<div class="language-text codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#393A34;--prism-background-color:#f6f8fa"><div class="codeBlockContent_QJqH"><pre tabindex="0" class="prism-code language-text codeBlock_bY9V thin-scrollbar" style="color:#393A34;background-color:#f6f8fa"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#393A34"><span class="token plain">2025-09-24T22:40:45.542190Z /app/src/main.py:231 Ingesting default documents when ready disable_langflow_ingest=False</span><br></span><span class="token-line" style="color:#393A34"><span class="token plain">2025-09-24T22:40:45.546385Z /app/src/main.py:270 Using Langflow ingestion pipeline for default documents file_count=1082</span><br></span><span class="token-line" style="color:#393A34"><span class="token plain">...</span><br></span><span class="token-line" style="color:#393A34"><span class="token plain">2025-09-24T23:19:44.866365Z /app/src/main.py:351 Langflow ingestion completed success_count=1070 error_count=12 total_files=1082</span><br></span></code></pre></div></div>
|
||
</li>
|
||
<li class="">
|
||
<p>Elapsed time: Approximately 42 minutes 15 seconds (2,535 seconds)</p>
|
||
</li>
|
||
<li class="">
|
||
<p>Throughput: Approximately 0.43 documents per second (about 2.3 seconds per document)</p>
|
||
</li>
|
||
</ul></div></div></details>
|
||
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="troubleshoot-ingestion">Troubleshoot ingestion<a href="#troubleshoot-ingestion" class="hash-link" aria-label="Direct link to Troubleshoot ingestion" title="Direct link to Troubleshoot ingestion" translate="no"></a></h2>
|
||
<p>The following issues can occur during document ingestion.</p>
|
||
<h3 class="anchor anchorTargetStickyNavbar_Vzrq" id="failed-or-slow-ingestion">Failed or slow ingestion<a href="#failed-or-slow-ingestion" class="hash-link" aria-label="Direct link to Failed or slow ingestion" title="Direct link to Failed or slow ingestion" translate="no"></a></h3>
|
||
<p>If an ingestion task fails, do the following:</p>
|
||
<ul>
|
||
<li class="">Make sure you uploaded only supported file types.</li>
|
||
<li class="">Split very large files into smaller files.</li>
|
||
<li class="">Remove unusual or complex embedded content, such as videos or animations. Although Docling can replace some non-text content with placeholders during ingestion, some embedded content might cause errors.</li>
|
||
<li class="">Make sure your Podman/Docker VM has sufficient memory for the ingestion tasks.
|
||
The minimum recommendation is 8 GB of RAM.
|
||
If you regularly upload large files, more RAM is recommended.
|
||
For more information, see <a class="" href="/support/troubleshoot#memory-issue-with-podman-on-macos">Memory issue with Podman on macOS</a> and <a class="" href="/support/troubleshoot#container-out-of-memory-errors">Container out of memory errors</a>.</li>
|
||
<li class="">If OCR ingestion fails due to OCR missing, see <a class="" href="/support/troubleshoot#ocr-ingestion-fails-easyocr-not-installed">OCR ingestion fails (easyocr not installed)</a>.</li>
|
||
</ul>
|
||
<h3 class="anchor anchorTargetStickyNavbar_Vzrq" id="problems-when-referencing-documents-in-chat">Problems when referencing documents in chat<a href="#problems-when-referencing-documents-in-chat" class="hash-link" aria-label="Direct link to Problems when referencing documents in chat" title="Direct link to Problems when referencing documents in chat" translate="no"></a></h3>
|
||
<p>If the OpenRAG <strong>Chat</strong> doesn't seem to use your documents correctly, <a class="" href="/knowledge#browse-knowledge">browse your knowledge base</a> to confirm that the documents are uploaded in full, and the chunks are correct.</p>
|
||
<p>If the documents are present and well-formed, check your <a class="" href="/knowledge-filters">knowledge filters</a>.
|
||
If you applied a filter to the chat, make sure the expected documents aren't excluded by the filter settings.
|
||
You can test this by applying the filter when you <a class="" href="/knowledge#browse-knowledge">browse the knowledge base</a>.
|
||
If the filter excludes any documents, the agent cannot access those documents.
|
||
Be aware that some settings create dynamic filters that don't always produce the same results, such as a <strong>Search query</strong> combined with a low <strong>Response limit</strong>.</p>
|
||
<p>If the document chunks have missing, incorrect, or unexpected text, you must <a class="" href="/knowledge#delete-knowledge">delete the documents</a> from your knowledge base, modify the <a class="" href="/knowledge#knowledge-ingestion-settings">ingestion parameters</a> or the documents themselves, and then reingest the documents.
|
||
For example:</p>
|
||
<ul>
|
||
<li class="">Break combined documents into separate files for better metadata context.</li>
|
||
<li class="">Make sure scanned documents are legible enough for extraction, and enable the <strong>OCR</strong> option. Poorly scanned documents might require additional preparation or rescanning before ingestion.</li>
|
||
<li class="">Adjust the <strong>Chunk size</strong> and <strong>Chunk overlap</strong> settings to better suit your documents. Larger chunks provide more context but can include irrelevant information, while smaller chunks yield more precise semantic search but can lack context.</li>
|
||
</ul>
|
||
<h2 class="anchor anchorTargetStickyNavbar_Vzrq" id="see-also">See also<a href="#see-also" class="hash-link" aria-label="Direct link to See also" title="Direct link to See also" translate="no"></a></h2>
|
||
<ul>
|
||
<li class=""><a class="" href="/knowledge">Configure knowledge</a></li>
|
||
<li class=""><a class="" href="/knowledge-filters">Filter knowledge</a></li>
|
||
<li class=""><a class="" href="/chat">Chat with knowledge</a></li>
|
||
<li class=""><a class="" href="/agents#inspect-and-modify-flows">Inspect and modify flows</a></li>
|
||
</ul></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="row margin-top--sm theme-doc-footer-edit-meta-row"><div class="col noPrint_WFHX"><a href="https://github.com/openrag/openrag/tree/main/docs/docs/core-components/ingestion.mdx" target="_blank" rel="noopener noreferrer" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div><div class="col lastUpdated_JAkA"></div></div></footer></article><nav class="docusaurus-mt-lg pagination-nav" aria-label="Docs pages"><a class="pagination-nav__link pagination-nav__link--prev" href="/knowledge"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">Configure knowledge</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/knowledge-filters"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">Filter knowledge</div></a></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#ingest-local-files-and-folders" class="table-of-contents__link toc-highlight">Ingest local files and folders</a></li><li><a href="#ingest-local-files-temporarily" class="table-of-contents__link toc-highlight">Ingest local files temporarily</a></li><li><a href="#oauth-ingestion" class="table-of-contents__link toc-highlight">Ingest files with OAuth connectors</a><ul><li><a href="#enable-oauth-connectors" class="table-of-contents__link toc-highlight">Enable OAuth connectors</a></li><li><a href="#authenticate-and-ingest-files-from-cloud-storage" class="table-of-contents__link toc-highlight">Authenticate and ingest files from cloud 
storage</a></li></ul></li><li><a href="#url-flow" class="table-of-contents__link toc-highlight">Ingest knowledge from URLs</a></li><li><a href="#monitor-ingestion" class="table-of-contents__link toc-highlight">Monitor ingestion</a><ul><li><a href="#ingestion-performance-expectations" class="table-of-contents__link toc-highlight">Ingestion performance expectations</a></li></ul></li><li><a href="#troubleshoot-ingestion" class="table-of-contents__link toc-highlight">Troubleshoot ingestion</a><ul><li><a href="#failed-or-slow-ingestion" class="table-of-contents__link toc-highlight">Failed or slow ingestion</a></li><li><a href="#problems-when-referencing-documents-in-chat" class="table-of-contents__link toc-highlight">Problems when referencing documents in chat</a></li></ul></li><li><a href="#see-also" class="table-of-contents__link toc-highlight">See also</a></li></ul></div></div></div></div></main></div></div></div><footer class="theme-layout-footer footer"><div class="container container-fluid"><div class="row footer__links"><div class="theme-layout-footer-column col footer__col"><div class="footer__title"></div><ul class="footer__items clean-list"><li class="footer__item"><div class="footer-links">
|
||
<span>© 2026 OpenRAG</span>
|
||
<span id="preferenceCenterContainer"> · <a href="#" onclick='return"undefined"!=typeof window&&window.truste&&window.truste.eu&&window.truste.eu.clickListener&&window.truste.eu.clickListener(),!1' style="cursor: pointer;">Manage Privacy Choices</a></span>
|
||
</div></li></ul></div></div></div></footer></div>
|
||
</body>
|
||
</html> |