1 line
No EOL
39 KiB
JavaScript
1 line
No EOL
39 KiB
JavaScript
"use strict";(globalThis.webpackChunkopenrag_docs=globalThis.webpackChunkopenrag_docs||[]).push([[6919],{1381:(e,n,s)=>{s.d(n,{Ay:()=>l,RM:()=>r});var t=s(4848),o=s(8453);const r=[];function i(e){const n={code:"code",pre:"pre",...(0,o.R)(),...e.components};return(0,t.jsxs)(t.Fragment,{children:[(0,t.jsx)(n.pre,{children:(0,t.jsx)(n.code,{className:"language-bash",metastring:'title="Docker"',children:"docker compose up -d\n"})}),"\n",(0,t.jsx)(n.pre,{children:(0,t.jsx)(n.code,{className:"language-bash",metastring:'title="Podman"',children:"podman compose up -d\n"})})]})}function l(e={}){const{wrapper:n}={...(0,o.R)(),...e.components};return n?(0,t.jsx)(n,{...e,children:(0,t.jsx)(i,{...e})}):i(e)}},1470:(e,n,s)=>{s.d(n,{A:()=>A});var t=s(6540),o=s(4164),r=s(7559),i=s(3104),l=s(6347),a=s(205),c=s(7485),d=s(1682),h=s(679);function u(e){return t.Children.toArray(e).filter(e=>"\n"!==e).map(e=>{if(!e||(0,t.isValidElement)(e)&&function(e){const{props:n}=e;return!!n&&"object"==typeof n&&"value"in n}(e))return e;throw new Error(`Docusaurus error: Bad <Tabs> child <${"string"==typeof e.type?e.type:e.type.name}>: all children of the <Tabs> component should be <TabItem>, and every <TabItem> should have a unique "value" prop.`)})?.filter(Boolean)??[]}function p(e){const{values:n,children:s}=e;return(0,t.useMemo)(()=>{const e=n??function(e){return u(e).map(({props:{value:e,label:n,attributes:s,default:t}})=>({value:e,label:n,attributes:s,default:t}))}(s);return function(e){const n=(0,d.XI)(e,(e,n)=>e.value===n.value);if(n.length>0)throw new Error(`Docusaurus error: Duplicate values "${n.map(e=>e.value).join(", ")}" found in <Tabs>. Every value needs to be unique.`)}(e),e},[n,s])}function g({value:e,tabValues:n}){return n.some(n=>n.value===e)}function f({queryString:e=!1,groupId:n}){const s=(0,l.W6)(),o=function({queryString:e=!1,groupId:n}){if("string"==typeof e)return e;if(!1===e)return null;if(!0===e&&!n)throw new Error('Docusaurus error: The <Tabs> component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return n??null}({queryString:e,groupId:n});return[(0,c.aZ)(o),(0,t.useCallback)(e=>{if(!o)return;const n=new URLSearchParams(s.location.search);n.set(o,e),s.replace({...s.location,search:n.toString()})},[o,s])]}function m(e){const{defaultValue:n,queryString:s=!1,groupId:o}=e,r=p(e),[i,l]=(0,t.useState)(()=>function({defaultValue:e,tabValues:n}){if(0===n.length)throw new Error("Docusaurus error: the <Tabs> component requires at least one <TabItem> children component");if(e){if(!g({value:e,tabValues:n}))throw new Error(`Docusaurus error: The <Tabs> has a defaultValue "${e}" but none of its children has the corresponding value. Available values are: ${n.map(e=>e.value).join(", ")}. If you intend to show no default tab, use defaultValue={null} instead.`);return e}const s=n.find(e=>e.default)??n[0];if(!s)throw new Error("Unexpected error: 0 tabValues");return s.value}({defaultValue:n,tabValues:r})),[c,d]=f({queryString:s,groupId:o}),[u,m]=function({groupId:e}){const n=function(e){return e?`docusaurus.tab.${e}`:null}(e),[s,o]=(0,h.Dv)(n);return[s,(0,t.useCallback)(e=>{n&&o.set(e)},[n,o])]}({groupId:o}),x=(()=>{const e=c??u;return g({value:e,tabValues:r})?e:null})();(0,a.A)(()=>{x&&l(x)},[x]);return{selectedValue:i,selectValue:(0,t.useCallback)(e=>{if(!g({value:e,tabValues:r}))throw new Error(`Can't select invalid tab value=${e}`);l(e),d(e),m(e)},[d,m,r]),tabValues:r}}var x=s(2303);const j={tabList:"tabList__CuJ",tabItem:"tabItem_LNqP"};var w=s(4848);function y({className:e,block:n,selectedValue:s,selectValue:t,tabValues:r}){const l=[],{blockElementScrollPositionUntilNextRender:a}=(0,i.a_)(),c=e=>{const n=e.currentTarget,o=l.indexOf(n),i=r[o].value;i!==s&&(a(n),t(i))},d=e=>{let n=null;switch(e.key){case"Enter":c(e);break;case"ArrowRight":{const s=l.indexOf(e.currentTarget)+1;n=l[s]??l[0];break}case"ArrowLeft":{const s=l.indexOf(e.currentTarget)-1;n=l[s]??l[l.length-1];break}}n?.focus()};return(0,w.jsx)("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.A)("tabs",{"tabs--block":n},e),children:r.map(({value:e,label:n,attributes:t})=>(0,w.jsx)("li",{role:"tab",tabIndex:s===e?0:-1,"aria-selected":s===e,ref:e=>{l.push(e)},onKeyDown:d,onClick:c,...t,className:(0,o.A)("tabs__item",j.tabItem,t?.className,{"tabs__item--active":s===e}),children:n??e},e))})}function b({lazy:e,children:n,selectedValue:s}){const r=(Array.isArray(n)?n:[n]).filter(Boolean);if(e){const e=r.find(e=>e.props.value===s);return e?(0,t.cloneElement)(e,{className:(0,o.A)("margin-top--md",e.props.className)}):null}return(0,w.jsx)("div",{className:"margin-top--md",children:r.map((e,n)=>(0,t.cloneElement)(e,{key:n,hidden:e.props.value!==s}))})}function v(e){const n=m(e);return(0,w.jsxs)("div",{className:(0,o.A)(r.G.tabs.container,"tabs-container",j.tabList),children:[(0,w.jsx)(y,{...n,...e}),(0,w.jsx)(b,{...n,...e})]})}function A(e){const n=(0,x.A)();return(0,w.jsx)(v,{...e,children:u(e.children)},String(n))}},4577:(e,n,s)=>{s.d(n,{Ay:()=>l,RM:()=>r});var t=s(4848),o=s(8453);const r=[];function i(e){const n={code:"code",pre:"pre",...(0,o.R)(),...e.components};return(0,t.jsxs)(t.Fragment,{children:[(0,t.jsx)(n.pre,{children:(0,t.jsx)(n.code,{className:"language-bash",metastring:'title="Docker"',children:"docker stop $(docker ps -q)\n"})}),"\n",(0,t.jsx)(n.pre,{children:(0,t.jsx)(n.code,{className:"language-bash",metastring:'title="Podman"',children:"podman stop --all\n"})})]})}function l(e={}){const{wrapper:n}={...(0,o.R)(),...e.components};return n?(0,t.jsx)(n,{...e,children:(0,t.jsx)(i,{...e})}):i(e)}},5421:(e,n,s)=>{s.r(n),s.d(n,{assets:()=>x,contentTitle:()=>m,default:()=>y,frontMatter:()=>f,metadata:()=>t,toc:()=>j});const t=JSON.parse('{"id":"core-components/ingestion","title":"Ingest knowledge","description":"Upload documents to your OpenRAG OpenSearch instance to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.","source":"@site/docs/core-components/ingestion.mdx","sourceDirName":"core-components","slug":"/ingestion","permalink":"/ingestion","draft":false,"unlisted":false,"editUrl":"https://github.com/openrag/openrag/tree/main/docs/docs/core-components/ingestion.mdx","tags":[],"version":"current","frontMatter":{"title":"Ingest knowledge","slug":"/ingestion"},"sidebar":"tutorialSidebar","previous":{"title":"Configure knowledge","permalink":"/knowledge"},"next":{"title":"Filter knowledge","permalink":"/knowledge-filters"}}');var o=s(4848),r=s(8453),i=s(9179),l=s(1470),a=s(9365),c=s(8401);const d=[];function h(e){const n={a:"a",code:"code",li:"li",p:"p",strong:"strong",ul:"ul",...(0,r.R)(),...e.components},{Details:s}=n;return s||function(e,n){throw new Error("Expected "+(n?"component":"object")+" `"+e+"` to be defined: you likely forgot to import, pass, or provide it.")}("Details",!0),(0,o.jsxs)(s,{children:[(0,o.jsx)("summary",{children:"About the OpenSearch Ingestion flow"}),(0,o.jsxs)(n.p,{children:["When you upload documents locally or with OAuth connectors, the ",(0,o.jsx)(n.strong,{children:"OpenSearch Ingestion"})," flow runs in the background.\nBy default, this flow uses Docling Serve to import and process documents."]}),(0,o.jsxs)(n.p,{children:["Like all ",(0,o.jsx)(n.a,{href:"/agents",children:"OpenRAG flows"}),", you can ",(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"inspect the flow in Langflow"}),", and you can customize it if you want to change the knowledge ingestion settings."]}),(0,o.jsxs)(n.p,{children:["The ",(0,o.jsx)(n.strong,{children:"OpenSearch Ingestion"})," flow is comprised of several components that work together to process and store documents in your knowledge base:"]}),(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/bundles-docling#docling-serve",children:[(0,o.jsx)(n.strong,{children:"Docling Serve"})," component"]}),": Ingests files and processes them by connecting to OpenRAG's local Docling Serve service. The output is ",(0,o.jsx)(n.code,{children:"DoclingDocument"})," data that contains the extracted text and metadata from the documents."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/bundles-docling#export-doclingdocument",children:[(0,o.jsx)(n.strong,{children:"Export DoclingDocument"})," component"]}),": Exports processed ",(0,o.jsx)(n.code,{children:"DoclingDocument"})," data to Markdown format with image placeholders. This conversion standardizes the document data in preparation for further processing."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/dataframe-operations",children:[(0,o.jsx)(n.strong,{children:"DataFrame Operations"})," component"]}),": Three of these components run sequentially to add metadata to the document data: ",(0,o.jsx)(n.code,{children:"filename"}),", ",(0,o.jsx)(n.code,{children:"file_size"}),", and ",(0,o.jsx)(n.code,{children:"mimetype"}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/split-text",children:[(0,o.jsx)(n.strong,{children:"Split Text"})," component"]}),": Splits the processed text into chunks, based on the configured ",(0,o.jsx)(n.a,{href:"/knowledge#knowledge-ingestion-settings",children:"chunk size and overlap settings"}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Secret Input"})," component: If needed, four of these components securely fetch the ",(0,o.jsx)(n.a,{href:"/knowledge#auth",children:"OAuth authentication"})," configuration variables: ",(0,o.jsx)(n.code,{children:"CONNECTOR_TYPE"}),", ",(0,o.jsx)(n.code,{children:"OWNER"}),", ",(0,o.jsx)(n.code,{children:"OWNER_EMAIL"}),", and ",(0,o.jsx)(n.code,{children:"OWNER_NAME"}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Create Data"})," component: Combines the authentication credentials from the ",(0,o.jsx)(n.strong,{children:"Secret Input"})," components into a structured data object that is associated with the document embeddings."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/components-embedding-models",children:[(0,o.jsx)(n.strong,{children:"Embedding Model"})," component"]}),": Generates vector embeddings using your selected ",(0,o.jsx)(n.a,{href:"/knowledge#set-the-embedding-model-and-dimensions",children:"embedding model"}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/bundles-elastic#opensearch",children:[(0,o.jsx)(n.strong,{children:"OpenSearch"})," component"]}),": Stores the processed documents and their embeddings in a ",(0,o.jsx)(n.code,{children:"documents"})," index of your OpenRAG ",(0,o.jsx)(n.a,{href:"/knowledge",children:"OpenSearch knowledge base"}),"."]}),"\n",(0,o.jsxs)(n.p,{children:["The default address for the OpenSearch instance is ",(0,o.jsx)(n.code,{children:"https://opensearch:9200"}),". To change this address, edit the ",(0,o.jsx)(n.code,{children:"OPENSEARCH_PORT"})," ",(0,o.jsx)(n.a,{href:"/reference/configuration#opensearch-settings",children:"environment variable"}),"."]}),"\n",(0,o.jsxs)(n.p,{children:["The default authentication method is JSON Web Token (JWT) authentication. If you ",(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"edit the flow"}),", you can select ",(0,o.jsx)(n.code,{children:"basic"})," auth mode, which uses the ",(0,o.jsx)(n.code,{children:"OPENSEARCH_USERNAME"})," and ",(0,o.jsx)(n.code,{children:"OPENSEARCH_PASSWORD"})," ",(0,o.jsx)(n.a,{href:"/reference/configuration#opensearch-settings",children:"environment variables"})," for authentication instead of JWT."]}),"\n"]}),"\n"]})]})}function u(e={}){const{wrapper:n}={...(0,r.R)(),...e.components};return n?(0,o.jsx)(n,{...e,children:(0,o.jsx)(h,{...e})}):h(e)}var p=s(1381),g=s(4577);const f={title:"Ingest knowledge",slug:"/ingestion"},m=void 0,x={},j=[{value:"Ingest local files and folders",id:"ingest-local-files-and-folders",level:2},...d,{value:"Ingest local files temporarily",id:"ingest-local-files-temporarily",level:2},...c.RM,{value:"Ingest files with OAuth connectors",id:"oauth-ingestion",level:2},{value:"Enable OAuth connectors",id:"enable-oauth-connectors",level:3},...g.RM,...p.RM,{value:"Authenticate and ingest files from cloud storage",id:"authenticate-and-ingest-files-from-cloud-storage",level:3},...d,{value:"Ingest knowledge from URLs",id:"url-flow",level:2},{value:"Monitor ingestion",id:"monitor-ingestion",level:2},{value:"Ingestion performance expectations",id:"ingestion-performance-expectations",level:3},{value:"Troubleshoot ingestion",id:"troubleshoot-ingestion",level:2},{value:"Failed or slow ingestion",id:"failed-or-slow-ingestion",level:3},{value:"Problems when referencing documents in chat",id:"problems-when-referencing-documents-in-chat",level:3},{value:"See also",id:"see-also",level:2}];function w(e){const n={a:"a",admonition:"admonition",code:"code",h2:"h2",h3:"h3",li:"li",ol:"ol",p:"p",pre:"pre",strong:"strong",ul:"ul",...(0,r.R)(),...e.components},{Details:s}=n;return s||function(e,n){throw new Error("Expected "+(n?"component":"object")+" `"+e+"` to be defined: you likely forgot to import, pass, or provide it.")}("Details",!0),(0,o.jsxs)(o.Fragment,{children:[(0,o.jsxs)(n.p,{children:["Upload documents to your ",(0,o.jsx)(n.a,{href:"/knowledge",children:"OpenRAG OpenSearch instance"})," to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.\nDocuments are processed through OpenRAG's knowledge ingestion flows with Docling."]}),"\n",(0,o.jsx)(n.p,{children:"OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth authenticated connectors."}),"\n",(0,o.jsxs)(n.p,{children:["Knowledge ingestion is powered by OpenRAG's built-in knowledge ingestion flows that use Docling to process documents before storing the documents in your OpenSearch database.\nDuring ingestion, documents are broken into smaller chunks of content that are then embedded using your selected ",(0,o.jsx)(n.a,{href:"/knowledge#set-the-embedding-model-and-dimensions",children:"embedding model"}),".\nThen, the chunks, embeddings, and associated metadata (which connects chunks of the same document) are stored in your OpenSearch database."]}),"\n",(0,o.jsxs)(n.p,{children:["To modify chunking behavior and other ingestion settings, see ",(0,o.jsx)(n.a,{href:"/knowledge#knowledge-ingestion-settings",children:"Knowledge ingestion settings"})," and ",(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"Inspect and modify flows"}),"."]}),"\n",(0,o.jsx)(n.h2,{id:"ingest-local-files-and-folders",children:"Ingest local files and folders"}),"\n",(0,o.jsx)(n.p,{children:"You can upload files and folders from your local machine to your knowledge base:"}),"\n",(0,o.jsxs)(n.ol,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(i.A,{name:"Library","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Knowledge"})," to view your OpenSearch knowledge base."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Add Knowledge"})," to add your own documents to your OpenRAG knowledge base."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["To upload one file, click ",(0,o.jsx)(i.A,{name:"File","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"File"}),". To upload all documents in a folder, click ",(0,o.jsx)(i.A,{name:"Folder","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Folder"}),"."]}),"\n",(0,o.jsxs)(n.p,{children:["The default path is ",(0,o.jsx)(n.code,{children:"~/.openrag/documents"}),".\nTo change this path, see ",(0,o.jsx)(n.a,{href:"/knowledge#set-the-local-documents-path",children:"Set the local documents path"}),"."]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.p,{children:["The selected files are processed in the background through the ",(0,o.jsx)(n.strong,{children:"OpenSearch Ingestion"})," flow."]}),"\n",(0,o.jsx)(u,{}),"\n",(0,o.jsxs)(n.p,{children:["You can ",(0,o.jsx)(n.a,{href:"#monitor-ingestion",children:"monitor ingestion"})," to see the progress of the uploads and check for failed uploads."]}),"\n",(0,o.jsx)(n.h2,{id:"ingest-local-files-temporarily",children:"Ingest local files temporarily"}),"\n",(0,o.jsx)(c.Ay,{}),"\n",(0,o.jsx)(n.h2,{id:"oauth-ingestion",children:"Ingest files with OAuth connectors"}),"\n",(0,o.jsx)(n.p,{children:"OpenRAG can use OAuth authenticated connectors to ingest documents from the following external services:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"AWS S3"}),"\n",(0,o.jsx)(n.li,{children:"Google Drive"}),"\n",(0,o.jsx)(n.li,{children:"Microsoft OneDrive"}),"\n",(0,o.jsx)(n.li,{children:"Microsoft Sharepoint"}),"\n"]}),"\n",(0,o.jsx)(n.p,{children:"These connectors enable seamless ingestion of files from cloud storage to your OpenRAG knowledge base."}),"\n",(0,o.jsx)(n.p,{children:"Individual users can connect their personal cloud storage accounts to OpenRAG. Each user must separately authorize OpenRAG to access their own cloud storage. When a user connects a cloud storage service, they are redirected to authenticate with that service provider and grant OpenRAG permission to sync documents from their personal cloud storage."}),"\n",(0,o.jsx)(n.h3,{id:"enable-oauth-connectors",children:"Enable OAuth connectors"}),"\n",(0,o.jsx)(n.p,{children:"Before users can connect their own cloud storage accounts, you must configure the provider's OAuth credentials in OpenRAG. Typically, this requires that you register OpenRAG as an OAuth application in your cloud provider, and then obtain the app's OAuth credentials, such as a client ID and secret key.\nTo enable multiple connectors, you must register an app and generate credentials for each provider."}),"\n",(0,o.jsxs)(l.A,{children:[(0,o.jsxs)(a.A,{value:"TUI",label:"TUI-managed services",default:!0,children:[(0,o.jsxs)(n.p,{children:["If you use the ",(0,o.jsx)(n.a,{href:"/tui",children:"Terminal User Interface (TUI)"})," to manage your OpenRAG services, enter OAuth credentials on the ",(0,o.jsx)(n.strong,{children:"Advanced Setup"})," page.\nYou can do this during ",(0,o.jsx)(n.a,{href:"/install#setup",children:"installation"}),", or you can add the credentials afterwards:"]}),(0,o.jsxs)(n.ol,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["If OpenRAG is running, click ",(0,o.jsx)(n.strong,{children:"Stop All Services"})," in the TUI."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Open the ",(0,o.jsx)(n.strong,{children:"Advanced Setup"})," page, and then add the OAuth credentials for the cloud storage providers that you want to use under ",(0,o.jsx)(n.strong,{children:"API Keys"}),":"]}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:[(0,o.jsx)(n.strong,{children:"Google"}),": Provide your Google OAuth Client ID and Google OAuth Client Secret. You can generate these in the ",(0,o.jsx)(n.a,{href:"https://console.cloud.google.com/apis/credentials",children:"Google Cloud Console"}),". For more information, see the ",(0,o.jsx)(n.a,{href:"https://developers.google.com/identity/protocols/oauth2",children:"Google OAuth client documentation"}),"."]}),"\n",(0,o.jsxs)(n.li,{children:[(0,o.jsx)(n.strong,{children:"Microsoft"}),": For the Microsoft OAuth Client ID and Microsoft OAuth Client Secret, provide ",(0,o.jsx)(n.a,{href:"https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/app-registration?view=odsp-graph-online",children:"Azure application registration credentials for SharePoint and OneDrive"}),". For more information, see the ",(0,o.jsx)(n.a,{href:"https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth",children:"Microsoft Graph OAuth client documentation"}),"."]}),"\n",(0,o.jsxs)(n.li,{children:[(0,o.jsx)(n.strong,{children:"Amazon"}),": Provide your AWS Access Key ID and AWS Secret Access Key with access to your S3 instance. For more information, see the AWS documentation on ",(0,o.jsx)(n.a,{href:"https://docs.aws.amazon.com/singlesignon/latest/userguide/manage-your-applications.html",children:"Configuring access to AWS applications"}),"."]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Register the redirect URIs shown in the TUI in your OAuth provider.\nThese are the URLs your OAuth provider will use to redirect users back to OpenRAG after they sign in."}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Save Configuration"})," to add the OAuth credentials to your ",(0,o.jsxs)(n.a,{href:"/reference/configuration",children:["OpenRAG ",(0,o.jsx)(n.code,{children:".env"})," file"]}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Start Services"})," to restart the OpenRAG containers with OAuth enabled."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Launch the OpenRAG app.\nYou should be prompted to sign in to your OAuth provider before being redirected to your OpenRAG instance."}),"\n"]}),"\n"]})]}),(0,o.jsxs)(a.A,{value:"env",label:"Self-managed services",children:[(0,o.jsxs)(n.p,{children:["If you ",(0,o.jsx)(n.a,{href:"/docker",children:"installed OpenRAG with self-managed services"}),", set OAuth credentials in your ",(0,o.jsxs)(n.a,{href:"/reference/configuration",children:["OpenRAG ",(0,o.jsx)(n.code,{children:".env"})," file"]}),"."]}),(0,o.jsxs)(n.p,{children:["You can do this during ",(0,o.jsx)(n.a,{href:"/docker#setup",children:"initial set up"}),", or you can add the credentials afterwards:"]}),(0,o.jsxs)(n.ol,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Stop all OpenRAG containers:"}),"\n",(0,o.jsx)(g.Ay,{}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Edit your OpenRAG ",(0,o.jsx)(n.code,{children:".env"})," file to add the OAuth credentials for the cloud storage providers that you want to use:"]}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Google"}),": Provide your Google OAuth Client ID and Google OAuth Client Secret. You can generate these in the ",(0,o.jsx)(n.a,{href:"https://console.cloud.google.com/apis/credentials",children:"Google Cloud Console"}),". For more information, see the ",(0,o.jsx)(n.a,{href:"https://developers.google.com/identity/protocols/oauth2",children:"Google OAuth client documentation"}),"."]}),"\n",(0,o.jsx)(n.pre,{children:(0,o.jsx)(n.code,{className:"language-env",children:"GOOGLE_OAUTH_CLIENT_ID=\nGOOGLE_OAUTH_CLIENT_SECRET=\n"})}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Microsoft"}),": For the Microsoft OAuth Client ID and Microsoft OAuth Client Secret, provide ",(0,o.jsx)(n.a,{href:"https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/app-registration?view=odsp-graph-online",children:"Azure application registration credentials for SharePoint and OneDrive"}),". For more information, see the ",(0,o.jsx)(n.a,{href:"https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth",children:"Microsoft Graph OAuth client documentation"}),"."]}),"\n",(0,o.jsx)(n.pre,{children:(0,o.jsx)(n.code,{className:"language-env",children:"MICROSOFT_GRAPH_OAUTH_CLIENT_ID=\nMICROSOFT_GRAPH_OAUTH_CLIENT_SECRET=\n"})}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Amazon"}),": Provide your AWS Access Key ID and AWS Secret Access Key with access to your S3 instance. For more information, see the AWS documentation on ",(0,o.jsx)(n.a,{href:"https://docs.aws.amazon.com/singlesignon/latest/userguide/manage-your-applications.html",children:"Configuring access to AWS applications"}),"."]}),"\n",(0,o.jsx)(n.pre,{children:(0,o.jsx)(n.code,{className:"language-env",children:"AWS_ACCESS_KEY_ID=\nAWS_SECRET_ACCESS_KEY=\n"})}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Save the ",(0,o.jsx)(n.code,{children:".env"})," file."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Restart your OpenRAG containers:"}),"\n",(0,o.jsx)(p.Ay,{}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Access the OpenRAG frontend at ",(0,o.jsx)(n.code,{children:"http://localhost:3000"}),".\nYou should be prompted to sign in to your OAuth provider before being redirected to your OpenRAG instance."]}),"\n"]}),"\n"]})]})]}),"\n",(0,o.jsx)(n.h3,{id:"authenticate-and-ingest-files-from-cloud-storage",children:"Authenticate and ingest files from cloud storage"}),"\n",(0,o.jsxs)(n.p,{children:["After you start OpenRAG with OAuth connectors enabled, each user is prompted to authenticate with the OAuth provider upon accessing your OpenRAG instance.\nIndividual authentication is required to access a user's cloud storage from your OpenRAG instance.\nFor example, if a user navigates to the default OpenRAG URL at ",(0,o.jsx)(n.code,{children:"http://localhost:3000"}),", they are redirected to the OAuth provider's sign-in page.\nAfter authenticating and granting the required permissions for OpenRAG, the user is redirected back to OpenRAG."]}),"\n",(0,o.jsx)(n.p,{children:"To ingest knowledge with an OAuth connector, do the following:"}),"\n",(0,o.jsxs)(n.ol,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(i.A,{name:"Library","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Knowledge"})," to view your OpenSearch knowledge base."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Add Knowledge"}),", and then select a storage provider."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["On the ",(0,o.jsx)(n.strong,{children:"Add Cloud Knowledge"})," page, click ",(0,o.jsx)(n.strong,{children:"Add Files"}),", and then select the files and folders to ingest from the connected storage."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Ingest Files"}),"."]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.p,{children:["The selected files are processed in the background through the ",(0,o.jsx)(n.strong,{children:"OpenSearch Ingestion"})," flow."]}),"\n",(0,o.jsx)(u,{}),"\n",(0,o.jsxs)(n.p,{children:["You can ",(0,o.jsx)(n.a,{href:"#monitor-ingestion",children:"monitor ingestion"})," to see the progress of the uploads and check for failed uploads."]}),"\n",(0,o.jsx)(n.h2,{id:"url-flow",children:"Ingest knowledge from URLs"}),"\n",(0,o.jsx)(n.p,{children:"When using the OpenRAG chat, you can enter URLs into the chat to be ingested in real-time during your conversation."}),"\n",(0,o.jsx)(n.admonition,{type:"info",children:(0,o.jsxs)(n.p,{children:["The chat cannot ingest URLs that end in static document file extensions like ",(0,o.jsx)(n.code,{children:".pdf"}),".\nTo upload these types of files, see ",(0,o.jsx)(n.a,{href:"#ingest-local-files-and-folders",children:"Ingest local files and folders"})," and ",(0,o.jsx)(n.a,{href:"#oauth-ingestion",children:"Ingest files with OAuth connectors"}),"."]})}),"\n",(0,o.jsxs)(n.p,{children:["OpenRAG runs the ",(0,o.jsx)(n.strong,{children:"OpenSearch URL Ingestion"})," flow to ingest web content from URLs.\nThis flow isn't directly accessible from the OpenRAG user interface.\nInstead, this flow is called by the ",(0,o.jsxs)(n.a,{href:"/chat#flow",children:[(0,o.jsx)(n.strong,{children:"OpenRAG OpenSearch Agent"})," flow"]})," as a Model Context Protocol (MCP) tool.\nThe agent can call this component to fetch web content from a given URL, and then ingest that content into your OpenSearch knowledge base.\nLike all OpenRAG flows, you can ",(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"inspect the flow in Langflow"}),", and you can customize it.\nFor more information about MCP in Langflow, see the Langflow documentation on ",(0,o.jsx)(n.a,{href:"https://docs.langflow.org/mcp-client",children:"MCP clients"})," and ",(0,o.jsx)(n.a,{href:"https://docs.langflow.org/mcp-tutorial",children:"MCP servers"}),"."]}),"\n",(0,o.jsx)(n.h2,{id:"monitor-ingestion",children:"Monitor ingestion"}),"\n",(0,o.jsx)(n.p,{children:"Depending on the amount of data to ingest, document ingestion can take a few seconds, minutes, or longer.\nFor this reason, document ingestion tasks run in the background."}),"\n",(0,o.jsxs)(n.p,{children:["In the OpenRAG user interface, a badge is shown on ",(0,o.jsx)(i.A,{name:"Bell","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Tasks"})," when OpenRAG tasks are active.\nClick ",(0,o.jsx)(i.A,{name:"Bell","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Tasks"})," to inspect and cancel tasks.\nTasks are separated into multiple sections:"]}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["The ",(0,o.jsx)(n.strong,{children:"Active Tasks"})," section includes all tasks that are ",(0,o.jsx)(n.strong,{children:"Pending"}),", ",(0,o.jsx)(n.strong,{children:"Running"}),", or ",(0,o.jsx)(n.strong,{children:"Processing"}),":"]}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:[(0,o.jsx)(n.strong,{children:"Pending"}),": The task is queued and waiting to start."]}),"\n",(0,o.jsxs)(n.li,{children:[(0,o.jsx)(n.strong,{children:"Running"}),": The task is actively processing files."]}),"\n",(0,o.jsxs)(n.li,{children:[(0,o.jsx)(n.strong,{children:"Processing"}),": The task is performing ingestion operations."]}),"\n"]}),"\n",(0,o.jsxs)(n.p,{children:["To stop an active task, click ",(0,o.jsx)(i.A,{name:"X","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Cancel"}),". Canceling a task stops processing immediately and marks the ingestion as failed."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["The ",(0,o.jsx)(n.strong,{children:"Recent Tasks"})," section lists recently finished tasks."]}),"\n",(0,o.jsxs)(n.admonition,{type:"warning",children:[(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Completed"})," doesn't mean success."]}),(0,o.jsx)(n.p,{children:"A completed task can report successful ingestions, failed ingestions, or both, depending on the number of files processed."})]}),"\n",(0,o.jsxs)(n.p,{children:["Check the ",(0,o.jsx)(n.strong,{children:"Success"})," and ",(0,o.jsx)(n.strong,{children:"Failed"})," counts for each completed task to determine the overall success rate."]}),"\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Failed"})," means something went wrong during ingestion, or the task was manually canceled.\nFor more information, see ",(0,o.jsx)(n.a,{href:"#troubleshoot-ingestion",children:"Troubleshoot ingestion"}),"."]}),"\n"]}),"\n"]}),"\n",(0,o.jsx)(n.p,{children:"For each task, depending on its state, you can find the task ID, start time, duration, number of files processed successfully, number of files that failed, and the number of files enqueued for processing."}),"\n",(0,o.jsx)(n.h3,{id:"ingestion-performance-expectations",children:"Ingestion performance expectations"}),"\n",(0,o.jsx)(n.p,{children:"The following performance test was conducted with Docling Serve."}),"\n",(0,o.jsx)(n.p,{children:"On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.\nThis equates to approximately 2.4 documents per second."}),"\n",(0,o.jsx)(n.p,{children:"You can generally expect equal or better performance on developer laptops, and significantly faster performance on servers.\nThroughput scales with CPU cores, memory, storage speed, and configuration choices, such as the embedding model, chunk size, overlap, and concurrency."}),"\n",(0,o.jsx)(n.p,{children:"This test returned 12 error, approximately 1.1 percent of the total files ingested.\nAll errors were file-specific, and they didn't stop the pipeline."}),"\n",(0,o.jsxs)(s,{children:[(0,o.jsx)("summary",{children:"Ingestion performance test details"}),(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Ingestion dataset:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"Total files: 1,083 items mounted"}),"\n",(0,o.jsx)(n.li,{children:"Total size on disk: 5,026,474,862 bytes (approximately 5.03 GB)"}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Hardware specifications:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Machine: Apple M4 Pro"}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Podman VM:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"Name: podman-machine-default"}),"\n",(0,o.jsx)(n.li,{children:"Type: applehv"}),"\n",(0,o.jsx)(n.li,{children:"vCPUs: 7"}),"\n",(0,o.jsx)(n.li,{children:"Memory: 8 GiB"}),"\n",(0,o.jsx)(n.li,{children:"Disk size: 100 GiB"}),"\n"]}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Test results:"}),"\n",(0,o.jsx)(n.pre,{children:(0,o.jsx)(n.code,{className:"language-text",children:"2025-09-24T22:40:45.542190Z /app/src/main.py:231 Ingesting default documents when ready disable_langflow_ingest=False\n2025-09-24T22:40:45.546385Z /app/src/main.py:270 Using Langflow ingestion pipeline for default documents file_count=1082\n...\n2025-09-24T23:19:44.866365Z /app/src/main.py:351 Langflow ingestion completed success_count=1070 error_count=12 total_files=1082\n"})}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Elapsed time: Approximately 42 minutes 15 seconds (2,535 seconds)"}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Throughput: Approximately 2.4 documents per second"}),"\n"]}),"\n"]})]}),"\n",(0,o.jsx)(n.h2,{id:"troubleshoot-ingestion",children:"Troubleshoot ingestion"}),"\n",(0,o.jsx)(n.p,{children:"The following issues can occur during document ingestion."}),"\n",(0,o.jsx)(n.h3,{id:"failed-or-slow-ingestion",children:"Failed or slow ingestion"}),"\n",(0,o.jsx)(n.p,{children:"If an ingestion task fails, do the following:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"Make sure you uploaded only supported file types."}),"\n",(0,o.jsx)(n.li,{children:"Split very large files into smaller files."}),"\n",(0,o.jsx)(n.li,{children:"Remove unusual or complex embedded content, such as videos or animations. Although Docling can replace some non-text content with placeholders during ingestion, some embedded content might cause errors."}),"\n",(0,o.jsxs)(n.li,{children:["Make sure your Podman/Docker VM has sufficient memory for the ingestion tasks.\nThe minimum recommendation is 8 GB of RAM.\nIf you regularly upload large files, more RAM is recommended.\nFor more information, see ",(0,o.jsx)(n.a,{href:"/support/troubleshoot#memory-issue-with-podman-on-macos",children:"Memory issue with Podman on macOS"})," and ",(0,o.jsx)(n.a,{href:"/support/troubleshoot#container-out-of-memory-errors",children:"Container out of memory errors"}),"."]}),"\n",(0,o.jsxs)(n.li,{children:["If OCR ingestion fails due to OCR missing, see ",(0,o.jsx)(n.a,{href:"/support/troubleshoot#ocr-ingestion-fails-easyocr-not-installed",children:"OCR ingestion fails (easyocr not installed)"}),"."]}),"\n"]}),"\n",(0,o.jsx)(n.h3,{id:"problems-when-referencing-documents-in-chat",children:"Problems when referencing documents in chat"}),"\n",(0,o.jsxs)(n.p,{children:["If the OpenRAG ",(0,o.jsx)(n.strong,{children:"Chat"})," doesn't seem to use your documents correctly, ",(0,o.jsx)(n.a,{href:"/knowledge#browse-knowledge",children:"browse your knowledge base"})," to confirm that the documents are uploaded in full, and the chunks are correct."]}),"\n",(0,o.jsxs)(n.p,{children:["If the documents are present and well-formed, check your ",(0,o.jsx)(n.a,{href:"/knowledge-filters",children:"knowledge filters"}),".\nIf you applied a filter to the chat, make sure the expected documents aren't excluded by the filter settings.\nYou can test this by applying the filter when you ",(0,o.jsx)(n.a,{href:"/knowledge#browse-knowledge",children:"browse the knowledge base"}),".\nIf the filter excludes any documents, the agent cannot access those documents.\nBe aware that some settings create dynamic filters that don't always produce the same results, such as a ",(0,o.jsx)(n.strong,{children:"Search query"})," combined with a low ",(0,o.jsx)(n.strong,{children:"Response limit"}),"."]}),"\n",(0,o.jsxs)(n.p,{children:["If the document chunks have missing, incorrect, or unexpected text, you must ",(0,o.jsx)(n.a,{href:"/knowledge#delete-knowledge",children:"delete the documents"})," from your knowledge base, modify the ",(0,o.jsx)(n.a,{href:"/knowledge#knowledge-ingestion-settings",children:"ingestion parameters"})," or the documents themselves, and then reingest the documents.\nFor example:"]}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"Break combined documents into separate files for better metadata context."}),"\n",(0,o.jsxs)(n.li,{children:["Make sure scanned documents are legible enough for extraction, and enable the ",(0,o.jsx)(n.strong,{children:"OCR"})," option. Poorly scanned documents might require additional preparation or rescanning before ingestion."]}),"\n",(0,o.jsxs)(n.li,{children:["Adjust the ",(0,o.jsx)(n.strong,{children:"Chunk size"})," and ",(0,o.jsx)(n.strong,{children:"Chunk overlap"})," settings to better suit your documents. Larger chunks provide more context but can include irrelevant information, while smaller chunks yield more precise semantic search but can lack context."]}),"\n"]}),"\n",(0,o.jsx)(n.h2,{id:"see-also",children:"See also"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:(0,o.jsx)(n.a,{href:"/knowledge",children:"Configure knowledge"})}),"\n",(0,o.jsx)(n.li,{children:(0,o.jsx)(n.a,{href:"/knowledge-filters",children:"Filter knowledge"})}),"\n",(0,o.jsx)(n.li,{children:(0,o.jsx)(n.a,{href:"/chat",children:"Chat with knowledge"})}),"\n",(0,o.jsx)(n.li,{children:(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"Inspect and modify flows"})}),"\n"]})]})}function y(e={}){const{wrapper:n}={...(0,r.R)(),...e.components};return n?(0,o.jsx)(n,{...e,children:(0,o.jsx)(w,{...e})}):w(e)}},8401:(e,n,s)=>{s.d(n,{Ay:()=>a,RM:()=>i});var t=s(4848),o=s(8453),r=s(9179);const i=[];function l(e){const n={p:"p",strong:"strong",...(0,o.R)(),...e.components};return(0,t.jsxs)(n.p,{children:["When using the OpenRAG ",(0,t.jsx)(n.strong,{children:"Chat"}),", click ",(0,t.jsx)(r.A,{name:"Plus","aria-hidden":"true"})," ",(0,t.jsx)(n.strong,{children:"Add"})," in the chat input field to upload a file to the current chat session.\nFiles added this way are processed and made available to the agent for the current conversation only.\nThese files aren't stored in the knowledge base permanently."]})}function a(e={}){const{wrapper:n}={...(0,o.R)(),...e.components};return n?(0,t.jsx)(n,{...e,children:(0,t.jsx)(l,{...e})}):l(e)}},9179:(e,n,s)=>{s.d(n,{A:()=>r});s(6540);var t=s(7856),o=s(4848);function r({name:e,...n}){const s=t[e];return s?(0,o.jsx)(s,{...n}):null}},9365:(e,n,s)=>{s.d(n,{A:()=>i});s(6540);var t=s(4164);const o={tabItem:"tabItem_Ymn6"};var r=s(4848);function i({children:e,hidden:n,className:s}){return(0,r.jsx)("div",{role:"tabpanel",className:(0,t.A)(o.tabItem,s),hidden:n,children:e})}}}]); |