1 line
No EOL
36 KiB
JavaScript
1 line
No EOL
36 KiB
JavaScript
"use strict";(globalThis.webpackChunkopenrag_docs=globalThis.webpackChunkopenrag_docs||[]).push([[6919],{1381:(e,n,s)=>{s.d(n,{Ay:()=>l,RM:()=>r});var t=s(4848),o=s(8453);const r=[];function i(e){const n={code:"code",pre:"pre",...(0,o.R)(),...e.components};return(0,t.jsxs)(t.Fragment,{children:[(0,t.jsx)(n.pre,{children:(0,t.jsx)(n.code,{className:"language-bash",metastring:'title="Docker"',children:"docker compose up -d\n"})}),"\n",(0,t.jsx)(n.pre,{children:(0,t.jsx)(n.code,{className:"language-bash",metastring:'title="Podman"',children:"podman compose up -d\n"})})]})}function l(e={}){const{wrapper:n}={...(0,o.R)(),...e.components};return n?(0,t.jsx)(n,{...e,children:(0,t.jsx)(i,{...e})}):i(e)}},1470:(e,n,s)=>{s.d(n,{A:()=>A});var t=s(6540),o=s(4164),r=s(7559),i=s(3104),l=s(6347),a=s(205),c=s(7485),d=s(1682),h=s(679);function u(e){return t.Children.toArray(e).filter(e=>"\n"!==e).map(e=>{if(!e||(0,t.isValidElement)(e)&&function(e){const{props:n}=e;return!!n&&"object"==typeof n&&"value"in n}(e))return e;throw new Error(`Docusaurus error: Bad <Tabs> child <${"string"==typeof e.type?e.type:e.type.name}>: all children of the <Tabs> component should be <TabItem>, and every <TabItem> should have a unique "value" prop.`)})?.filter(Boolean)??[]}function p(e){const{values:n,children:s}=e;return(0,t.useMemo)(()=>{const e=n??function(e){return u(e).map(({props:{value:e,label:n,attributes:s,default:t}})=>({value:e,label:n,attributes:s,default:t}))}(s);return function(e){const n=(0,d.XI)(e,(e,n)=>e.value===n.value);if(n.length>0)throw new Error(`Docusaurus error: Duplicate values "${n.map(e=>e.value).join(", ")}" found in <Tabs>. 
Every value needs to be unique.`)}(e),e},[n,s])}function g({value:e,tabValues:n}){return n.some(n=>n.value===e)}function x({queryString:e=!1,groupId:n}){const s=(0,l.W6)(),o=function({queryString:e=!1,groupId:n}){if("string"==typeof e)return e;if(!1===e)return null;if(!0===e&&!n)throw new Error('Docusaurus error: The <Tabs> component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return n??null}({queryString:e,groupId:n});return[(0,c.aZ)(o),(0,t.useCallback)(e=>{if(!o)return;const n=new URLSearchParams(s.location.search);n.set(o,e),s.replace({...s.location,search:n.toString()})},[o,s])]}function f(e){const{defaultValue:n,queryString:s=!1,groupId:o}=e,r=p(e),[i,l]=(0,t.useState)(()=>function({defaultValue:e,tabValues:n}){if(0===n.length)throw new Error("Docusaurus error: the <Tabs> component requires at least one <TabItem> children component");if(e){if(!g({value:e,tabValues:n}))throw new Error(`Docusaurus error: The <Tabs> has a defaultValue "${e}" but none of its children has the corresponding value. Available values are: ${n.map(e=>e.value).join(", ")}. 
If you intend to show no default tab, use defaultValue={null} instead.`);return e}const s=n.find(e=>e.default)??n[0];if(!s)throw new Error("Unexpected error: 0 tabValues");return s.value}({defaultValue:n,tabValues:r})),[c,d]=x({queryString:s,groupId:o}),[u,f]=function({groupId:e}){const n=function(e){return e?`docusaurus.tab.${e}`:null}(e),[s,o]=(0,h.Dv)(n);return[s,(0,t.useCallback)(e=>{n&&o.set(e)},[n,o])]}({groupId:o}),m=(()=>{const e=c??u;return g({value:e,tabValues:r})?e:null})();(0,a.A)(()=>{m&&l(m)},[m]);return{selectedValue:i,selectValue:(0,t.useCallback)(e=>{if(!g({value:e,tabValues:r}))throw new Error(`Can't select invalid tab value=${e}`);l(e),d(e),f(e)},[d,f,r]),tabValues:r}}var m=s(2303);const j={tabList:"tabList__CuJ",tabItem:"tabItem_LNqP"};var w=s(4848);function b({className:e,block:n,selectedValue:s,selectValue:t,tabValues:r}){const l=[],{blockElementScrollPositionUntilNextRender:a}=(0,i.a_)(),c=e=>{const n=e.currentTarget,o=l.indexOf(n),i=r[o].value;i!==s&&(a(n),t(i))},d=e=>{let n=null;switch(e.key){case"Enter":c(e);break;case"ArrowRight":{const s=l.indexOf(e.currentTarget)+1;n=l[s]??l[0];break}case"ArrowLeft":{const s=l.indexOf(e.currentTarget)-1;n=l[s]??l[l.length-1];break}}n?.focus()};return(0,w.jsx)("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.A)("tabs",{"tabs--block":n},e),children:r.map(({value:e,label:n,attributes:t})=>(0,w.jsx)("li",{role:"tab",tabIndex:s===e?0:-1,"aria-selected":s===e,ref:e=>{l.push(e)},onKeyDown:d,onClick:c,...t,className:(0,o.A)("tabs__item",j.tabItem,t?.className,{"tabs__item--active":s===e}),children:n??e},e))})}function y({lazy:e,children:n,selectedValue:s}){const r=(Array.isArray(n)?n:[n]).filter(Boolean);if(e){const e=r.find(e=>e.props.value===s);return e?(0,t.cloneElement)(e,{className:(0,o.A)("margin-top--md",e.props.className)}):null}return(0,w.jsx)("div",{className:"margin-top--md",children:r.map((e,n)=>(0,t.cloneElement)(e,{key:n,hidden:e.props.value!==s}))})}function v(e){const 
n=f(e);return(0,w.jsxs)("div",{className:(0,o.A)(r.G.tabs.container,"tabs-container",j.tabList),children:[(0,w.jsx)(b,{...n,...e}),(0,w.jsx)(y,{...n,...e})]})}function A(e){const n=(0,m.A)();return(0,w.jsx)(v,{...e,children:u(e.children)},String(n))}},4577:(e,n,s)=>{s.d(n,{Ay:()=>l,RM:()=>r});var t=s(4848),o=s(8453);const r=[];function i(e){const n={code:"code",pre:"pre",...(0,o.R)(),...e.components};return(0,t.jsxs)(t.Fragment,{children:[(0,t.jsx)(n.pre,{children:(0,t.jsx)(n.code,{className:"language-bash",metastring:'title="Docker"',children:"docker stop $(docker ps -q)\n"})}),"\n",(0,t.jsx)(n.pre,{children:(0,t.jsx)(n.code,{className:"language-bash",metastring:'title="Podman"',children:"podman stop --all\n"})})]})}function l(e={}){const{wrapper:n}={...(0,o.R)(),...e.components};return n?(0,t.jsx)(n,{...e,children:(0,t.jsx)(i,{...e})}):i(e)}},5421:(e,n,s)=>{s.r(n),s.d(n,{assets:()=>m,contentTitle:()=>f,default:()=>b,frontMatter:()=>x,metadata:()=>t,toc:()=>j});const t=JSON.parse('{"id":"core-components/ingestion","title":"Ingest knowledge","description":"Upload documents to your OpenRAG OpenSearch instance to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.","source":"@site/docs/core-components/ingestion.mdx","sourceDirName":"core-components","slug":"/ingestion","permalink":"/ingestion","draft":false,"unlisted":false,"editUrl":"https://github.com/openrag/openrag/tree/main/docs/docs/core-components/ingestion.mdx","tags":[],"version":"current","frontMatter":{"title":"Ingest knowledge","slug":"/ingestion"},"sidebar":"tutorialSidebar","previous":{"title":"Configure knowledge","permalink":"/knowledge"},"next":{"title":"Filter knowledge","permalink":"/knowledge-filters"}}');var o=s(4848),r=s(8453),i=s(9179),l=s(1470),a=s(9365),c=s(8401);const d=[];function h(e){const n={a:"a",code:"code",li:"li",p:"p",strong:"strong",ul:"ul",...(0,r.R)(),...e.components},{Details:s}=n;return s||function(e,n){throw new 
Error("Expected "+(n?"component":"object")+" `"+e+"` to be defined: you likely forgot to import, pass, or provide it.")}("Details",!0),(0,o.jsxs)(s,{children:[(0,o.jsx)("summary",{children:"About the OpenSearch Ingestion flow"}),(0,o.jsxs)(n.p,{children:["When you upload documents locally or with OAuth connectors, the ",(0,o.jsx)(n.strong,{children:"OpenSearch Ingestion"})," flow runs in the background.\nBy default, this flow uses Docling Serve to import and process documents."]}),(0,o.jsxs)(n.p,{children:["Like all ",(0,o.jsx)(n.a,{href:"/agents",children:"OpenRAG flows"}),", you can ",(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"inspect the flow in Langflow"}),", and you can customize it if you want to change the knowledge ingestion settings."]}),(0,o.jsxs)(n.p,{children:["The ",(0,o.jsx)(n.strong,{children:"OpenSearch Ingestion"})," flow is comprised of several components that work together to process and store documents in your knowledge base:"]}),(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/bundles-docling#docling-serve",children:[(0,o.jsx)(n.strong,{children:"Docling Serve"})," component"]}),": Ingests files and processes them by connecting to OpenRAG's local Docling Serve service. The output is ",(0,o.jsx)(n.code,{children:"DoclingDocument"})," data that contains the extracted text and metadata from the documents."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/bundles-docling#export-doclingdocument",children:[(0,o.jsx)(n.strong,{children:"Export DoclingDocument"})," component"]}),": Exports processed ",(0,o.jsx)(n.code,{children:"DoclingDocument"})," data to Markdown format with image placeholders. 
This conversion standardizes the document data in preparation for further processing."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/components-processing#dataframe-operations",children:[(0,o.jsx)(n.strong,{children:"DataFrame Operations"})," component"]}),": Three of these components run sequentially to add metadata to the document data: ",(0,o.jsx)(n.code,{children:"filename"}),", ",(0,o.jsx)(n.code,{children:"file_size"}),", and ",(0,o.jsx)(n.code,{children:"mimetype"}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/components-processing#split-text",children:[(0,o.jsx)(n.strong,{children:"Split Text"})," component"]}),": Splits the processed text into chunks, based on the configured ",(0,o.jsx)(n.a,{href:"/knowledge#knowledge-ingestion-settings",children:"chunk size and overlap settings"}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Secret Input"})," component: If needed, four of these components securely fetch the ",(0,o.jsx)(n.a,{href:"/knowledge#auth",children:"OAuth authentication"})," configuration variables: ",(0,o.jsx)(n.code,{children:"CONNECTOR_TYPE"}),", ",(0,o.jsx)(n.code,{children:"OWNER"}),", ",(0,o.jsx)(n.code,{children:"OWNER_EMAIL"}),", and ",(0,o.jsx)(n.code,{children:"OWNER_NAME"}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Create Data"})," component: Combines the authentication credentials from the ",(0,o.jsx)(n.strong,{children:"Secret Input"})," components into a structured data object that is associated with the document embeddings."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/components-embedding-models",children:[(0,o.jsx)(n.strong,{children:"Embedding Model"})," component"]}),": 
Generates vector embeddings using your selected ",(0,o.jsx)(n.a,{href:"/knowledge#set-the-embedding-model-and-dimensions",children:"embedding model"}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsxs)(n.a,{href:"https://docs.langflow.org/bundles-elastic#opensearch",children:[(0,o.jsx)(n.strong,{children:"OpenSearch"})," component"]}),": Stores the processed documents and their embeddings in a ",(0,o.jsx)(n.code,{children:"documents"})," index of your OpenRAG ",(0,o.jsx)(n.a,{href:"/knowledge",children:"OpenSearch knowledge base"}),"."]}),"\n",(0,o.jsxs)(n.p,{children:["The default address for the OpenSearch instance is ",(0,o.jsx)(n.code,{children:"https://opensearch:9200"}),". To change this address, edit the ",(0,o.jsx)(n.code,{children:"OPENSEARCH_PORT"})," ",(0,o.jsx)(n.a,{href:"/reference/configuration#opensearch-settings",children:"environment variable"}),"."]}),"\n",(0,o.jsxs)(n.p,{children:["The default authentication method is JSON Web Token (JWT) authentication. 
If you ",(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"edit the flow"}),", you can select ",(0,o.jsx)(n.code,{children:"basic"})," auth mode, which uses the ",(0,o.jsx)(n.code,{children:"OPENSEARCH_USERNAME"})," and ",(0,o.jsx)(n.code,{children:"OPENSEARCH_PASSWORD"})," ",(0,o.jsx)(n.a,{href:"/reference/configuration#opensearch-settings",children:"environment variables"})," for authentication instead of JWT."]}),"\n"]}),"\n"]})]})}function u(e={}){const{wrapper:n}={...(0,r.R)(),...e.components};return n?(0,o.jsx)(n,{...e,children:(0,o.jsx)(h,{...e})}):h(e)}var p=s(1381),g=s(4577);const x={title:"Ingest knowledge",slug:"/ingestion"},f=void 0,m={},j=[{value:"Ingest local files and folders",id:"ingest-local-files-and-folders",level:2},...d,{value:"Ingest local files temporarily",id:"ingest-local-files-temporarily",level:2},...c.RM,{value:"Ingest files with OAuth connectors",id:"oauth-ingestion",level:2},{value:"Enable OAuth connectors",id:"enable-oauth-connectors",level:3},...g.RM,...p.RM,{value:"Authenticate and ingest files from cloud storage",id:"authenticate-and-ingest-files-from-cloud-storage",level:3},...d,{value:"Ingest knowledge from URLs",id:"url-flow",level:2},{value:"Monitor ingestion",id:"monitor-ingestion",level:2},{value:"Ingestion performance expectations",id:"ingestion-performance-expectations",level:3},{value:"Troubleshoot ingestion",id:"troubleshoot-ingestion",level:2},{value:"See also",id:"see-also",level:2}];function w(e){const n={a:"a",code:"code",h2:"h2",h3:"h3",li:"li",ol:"ol",p:"p",pre:"pre",strong:"strong",ul:"ul",...(0,r.R)(),...e.components},{Details:s}=n;return s||function(e,n){throw new Error("Expected "+(n?"component":"object")+" `"+e+"` to be defined: you likely forgot to import, pass, or provide it.")}("Details",!0),(0,o.jsxs)(o.Fragment,{children:[(0,o.jsxs)(n.p,{children:["Upload documents to your ",(0,o.jsx)(n.a,{href:"/knowledge",children:"OpenRAG OpenSearch instance"})," to populate your knowledge base with 
unique content, such as your own company documents, research papers, or websites.\nDocuments are processed through OpenRAG's knowledge ingestion flows with Docling."]}),"\n",(0,o.jsx)(n.p,{children:"OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth authenticated connectors."}),"\n",(0,o.jsxs)(n.p,{children:["Knowledge ingestion is powered by OpenRAG's built-in knowledge ingestion flows that use Docling to process documents before storing the documents in your OpenSearch database.\nDuring ingestion, documents are broken into smaller chunks of content that are then embedded using your selected ",(0,o.jsx)(n.a,{href:"/knowledge#set-the-embedding-model-and-dimensions",children:"embedding model"}),".\nThen, the chunks, embeddings, and associated metadata (which connects chunks of the same document) are stored in your OpenSearch database."]}),"\n",(0,o.jsxs)(n.p,{children:["To modify chunking behavior and other ingestion settings, see ",(0,o.jsx)(n.a,{href:"/knowledge#knowledge-ingestion-settings",children:"Knowledge ingestion settings"})," and ",(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"Inspect and modify flows"}),"."]}),"\n",(0,o.jsx)(n.h2,{id:"ingest-local-files-and-folders",children:"Ingest local files and folders"}),"\n",(0,o.jsx)(n.p,{children:"You can upload files and folders from your local machine to your knowledge base:"}),"\n",(0,o.jsxs)(n.ol,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(i.A,{name:"Library","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Knowledge"})," to view your OpenSearch knowledge base."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Add Knowledge"})," to add your own documents to your OpenRAG knowledge base."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["To upload one file, click ",(0,o.jsx)(i.A,{name:"File","aria-hidden":"true"})," 
",(0,o.jsx)(n.strong,{children:"File"}),". To upload all documents in a folder, click ",(0,o.jsx)(i.A,{name:"Folder","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Folder"}),"."]}),"\n",(0,o.jsxs)(n.p,{children:["The default path is the ",(0,o.jsx)(n.code,{children:"./documents"})," subdirectory in your OpenRAG installation directory.\nTo change this path, see ",(0,o.jsx)(n.a,{href:"/knowledge#set-the-local-documents-path",children:"Set the local documents path"}),"."]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.p,{children:["The selected files are processed in the background through the ",(0,o.jsx)(n.strong,{children:"OpenSearch Ingestion"})," flow."]}),"\n",(0,o.jsx)(u,{}),"\n",(0,o.jsxs)(n.p,{children:["You can ",(0,o.jsx)(n.a,{href:"#monitor-ingestion",children:"monitor ingestion"})," to see the progress of the uploads and check for failed uploads."]}),"\n",(0,o.jsx)(n.h2,{id:"ingest-local-files-temporarily",children:"Ingest local files temporarily"}),"\n",(0,o.jsx)(c.Ay,{}),"\n",(0,o.jsx)(n.h2,{id:"oauth-ingestion",children:"Ingest files with OAuth connectors"}),"\n",(0,o.jsx)(n.p,{children:"OpenRAG can use OAuth authenticated connectors to ingest documents from the following external services:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"AWS S3"}),"\n",(0,o.jsx)(n.li,{children:"Google Drive"}),"\n",(0,o.jsx)(n.li,{children:"Microsoft OneDrive"}),"\n",(0,o.jsx)(n.li,{children:"Microsoft Sharepoint"}),"\n"]}),"\n",(0,o.jsx)(n.p,{children:"These connectors enable seamless ingestion of files from cloud storage to your OpenRAG knowledge base."}),"\n",(0,o.jsx)(n.p,{children:"Individual users can connect their personal cloud storage accounts to OpenRAG. Each user must separately authorize OpenRAG to access their own cloud storage. 
When a user connects a cloud storage service, they are redirected to authenticate with that service provider and grant OpenRAG permission to sync documents from their personal cloud storage."}),"\n",(0,o.jsx)(n.h3,{id:"enable-oauth-connectors",children:"Enable OAuth connectors"}),"\n",(0,o.jsx)(n.p,{children:"Before users can connect their own cloud storage accounts, you must configure the provider's OAuth credentials in OpenRAG. Typically, this requires that you register OpenRAG as an OAuth application in your cloud provider, and then obtain the app's OAuth credentials, such as a client ID and secret key.\nTo enable multiple connectors, you must register an app and generate credentials for each provider."}),"\n",(0,o.jsxs)(l.A,{children:[(0,o.jsxs)(a.A,{value:"TUI",label:"TUI-managed services",default:!0,children:[(0,o.jsxs)(n.p,{children:["If you use the ",(0,o.jsx)(n.a,{href:"/tui",children:"Terminal User Interface (TUI)"})," to manage your OpenRAG services, enter OAuth credentials in the ",(0,o.jsx)(n.strong,{children:"Advanced Setup"})," menu.\nYou can do this during ",(0,o.jsx)(n.a,{href:"/install#setup",children:"installation"}),", or you can add the credentials afterwards:"]}),(0,o.jsxs)(n.ol,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["If OpenRAG is running, open the TUI's ",(0,o.jsx)(n.strong,{children:"Status"})," menu (",(0,o.jsx)("kbd",{children:"3"}),"), and then click ",(0,o.jsx)(n.strong,{children:"Stop Services"}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Open the ",(0,o.jsx)(n.strong,{children:"Advanced Setup"})," menu (",(0,o.jsx)("kbd",{children:"2"}),"), and then add the OAuth credentials for the cloud storage providers that you want to use:"]}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:[(0,o.jsx)(n.strong,{children:"Amazon"}),": Provide your AWS Access Key ID and AWS Secret Access Key with access to your S3 instance. 
For more information, see the AWS documentation on ",(0,o.jsx)(n.a,{href:"https://docs.aws.amazon.com/singlesignon/latest/userguide/manage-your-applications.html",children:"Configuring access to AWS applications"}),"."]}),"\n",(0,o.jsxs)(n.li,{children:[(0,o.jsx)(n.strong,{children:"Google"}),": Provide your Google OAuth Client ID and Google OAuth Client Secret. You can generate these in the ",(0,o.jsx)(n.a,{href:"https://console.cloud.google.com/apis/credentials",children:"Google Cloud Console"}),". For more information, see the ",(0,o.jsx)(n.a,{href:"https://developers.google.com/identity/protocols/oauth2",children:"Google OAuth client documentation"}),"."]}),"\n",(0,o.jsxs)(n.li,{children:[(0,o.jsx)(n.strong,{children:"Microsoft"}),": For the Microsoft OAuth Client ID and Microsoft OAuth Client Secret, provide ",(0,o.jsx)(n.a,{href:"https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/app-registration?view=odsp-graph-online",children:"Azure application registration credentials for SharePoint and OneDrive"}),". 
For more information, see the ",(0,o.jsx)(n.a,{href:"https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth",children:"Microsoft Graph OAuth client documentation"}),"."]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"The TUI presents redirect URIs for your OAuth app that you must register with your OAuth provider.\nThese are the URLs your OAuth provider will redirect back to after users authenticate and grant access to their cloud storage."}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Save Configuration"})," to add the OAuth credentials to your ",(0,o.jsxs)(n.a,{href:"/reference/configuration",children:["OpenRAG ",(0,o.jsx)(n.code,{children:".env"})," file"]}),"."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Start All Services"})," to restart the OpenRAG containers with OAuth enabled."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Launch the OpenRAG app.\nYou should be prompted to sign in to your OAuth provider before being redirected to your OpenRAG instance."}),"\n"]}),"\n"]})]}),(0,o.jsxs)(a.A,{value:"env",label:"Self-managed services",children:[(0,o.jsxs)(n.p,{children:["If you ",(0,o.jsx)(n.a,{href:"/docker",children:"installed OpenRAG with self-managed services"}),", set OAuth credentials in your ",(0,o.jsxs)(n.a,{href:"/reference/configuration",children:["OpenRAG ",(0,o.jsx)(n.code,{children:".env"})," file"]}),"."]}),(0,o.jsxs)(n.p,{children:["You can do this during ",(0,o.jsx)(n.a,{href:"/docker#setup",children:"initial set up"}),", or you can add the credentials afterwards:"]}),(0,o.jsxs)(n.ol,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Stop all OpenRAG containers:"}),"\n",(0,o.jsx)(g.Ay,{}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Edit your OpenRAG 
",(0,o.jsx)(n.code,{children:".env"})," file to add the OAuth credentials for the cloud storage providers that you want to use:"]}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Amazon"}),": Provide your AWS Access Key ID and AWS Secret Access Key with access to your S3 instance. For more information, see the AWS documentation on ",(0,o.jsx)(n.a,{href:"https://docs.aws.amazon.com/singlesignon/latest/userguide/manage-your-applications.html",children:"Configuring access to AWS applications"}),"."]}),"\n",(0,o.jsx)(n.pre,{children:(0,o.jsx)(n.code,{className:"language-env",children:"AWS_ACCESS_KEY_ID=\nAWS_SECRET_ACCESS_KEY=\n"})}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Google"}),": Provide your Google OAuth Client ID and Google OAuth Client Secret. You can generate these in the ",(0,o.jsx)(n.a,{href:"https://console.cloud.google.com/apis/credentials",children:"Google Cloud Console"}),". For more information, see the ",(0,o.jsx)(n.a,{href:"https://developers.google.com/identity/protocols/oauth2",children:"Google OAuth client documentation"}),"."]}),"\n",(0,o.jsx)(n.pre,{children:(0,o.jsx)(n.code,{className:"language-env",children:"GOOGLE_OAUTH_CLIENT_ID=\nGOOGLE_OAUTH_CLIENT_SECRET=\n"})}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Microsoft"}),": For the Microsoft OAuth Client ID and Microsoft OAuth Client Secret, provide ",(0,o.jsx)(n.a,{href:"https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/app-registration?view=odsp-graph-online",children:"Azure application registration credentials for SharePoint and OneDrive"}),". 
For more information, see the ",(0,o.jsx)(n.a,{href:"https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth",children:"Microsoft Graph OAuth client documentation"}),"."]}),"\n",(0,o.jsx)(n.pre,{children:(0,o.jsx)(n.code,{className:"language-env",children:"MICROSOFT_GRAPH_OAUTH_CLIENT_ID=\nMICROSOFT_GRAPH_OAUTH_CLIENT_SECRET=\n"})}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Save the ",(0,o.jsx)(n.code,{children:".env"})," file."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Restart your OpenRAG containers:"}),"\n",(0,o.jsx)(p.Ay,{}),"\n"]}),"\n"]})]})]}),"\n",(0,o.jsx)(n.h3,{id:"authenticate-and-ingest-files-from-cloud-storage",children:"Authenticate and ingest files from cloud storage"}),"\n",(0,o.jsxs)(n.p,{children:["After you start OpenRAG with OAuth connectors enabled, each user is prompted to authenticate with the OAuth provider upon accessing your OpenRAG instance.\nIndividual authentication is required to access a user's cloud storage from your OpenRAG instance.\nFor example, if a user navigates to the default OpenRAG URL at ",(0,o.jsx)(n.code,{children:"http://localhost:3000"}),", they are redirected to the OAuth provider's sign-in page.\nAfter authenticating and granting the required permissions for OpenRAG, the user is redirected back to OpenRAG."]}),"\n",(0,o.jsx)(n.p,{children:"To ingest knowledge with an OAuth connector, do the following:"}),"\n",(0,o.jsxs)(n.ol,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(i.A,{name:"Library","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Knowledge"})," to view your OpenSearch knowledge base."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Add Knowledge"}),", and then select a storage provider."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["On the 
",(0,o.jsx)(n.strong,{children:"Add Cloud Knowledge"})," page, click ",(0,o.jsx)(n.strong,{children:"Add Files"}),", and then select the files and folders to ingest from the connected storage."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:["Click ",(0,o.jsx)(n.strong,{children:"Ingest Files"}),"."]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.p,{children:["The selected files are processed in the background through the ",(0,o.jsx)(n.strong,{children:"OpenSearch Ingestion"})," flow."]}),"\n",(0,o.jsx)(u,{}),"\n",(0,o.jsxs)(n.p,{children:["You can ",(0,o.jsx)(n.a,{href:"#monitor-ingestion",children:"monitor ingestion"})," to see the progress of the uploads and check for failed uploads."]}),"\n",(0,o.jsx)(n.h2,{id:"url-flow",children:"Ingest knowledge from URLs"}),"\n",(0,o.jsxs)(n.p,{children:["The ",(0,o.jsx)(n.strong,{children:"OpenSearch URL Ingestion"})," flow is used to ingest web content from URLs.\nThis flow isn't directly accessible from the OpenRAG user interface.\nInstead, this flow is called by the ",(0,o.jsxs)(n.a,{href:"/chat#flow",children:[(0,o.jsx)(n.strong,{children:"OpenRAG OpenSearch Agent"})," flow"]})," as a Model Context Protocol (MCP) tool.\nThe agent can call this component to fetch web content from a given URL, and then ingest that content into your OpenSearch knowledge base."]}),"\n",(0,o.jsxs)(n.p,{children:["Like all OpenRAG flows, you can ",(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"inspect the flow in Langflow"}),", and you can customize it."]}),"\n",(0,o.jsxs)(n.p,{children:["For more information about MCP in Langflow, see the Langflow documentation on ",(0,o.jsx)(n.a,{href:"https://docs.langflow.org/mcp-client",children:"MCP clients"})," and ",(0,o.jsx)(n.a,{href:"https://docs.langflow.org/mcp-tutorial",children:"MCP servers"}),"."]}),"\n",(0,o.jsx)(n.h2,{id:"monitor-ingestion",children:"Monitor ingestion"}),"\n",(0,o.jsx)(n.p,{children:"Document ingestion tasks run in the 
background."}),"\n",(0,o.jsxs)(n.p,{children:["In the OpenRAG user interface, a badge is shown on ",(0,o.jsx)(i.A,{name:"Bell","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Tasks"})," when OpenRAG tasks are active.\nClick ",(0,o.jsx)(i.A,{name:"Bell","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Tasks"})," to inspect and cancel tasks:"]}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Active Tasks"}),": All tasks that are ",(0,o.jsx)(n.strong,{children:"Pending"}),", ",(0,o.jsx)(n.strong,{children:"Running"}),", or ",(0,o.jsx)(n.strong,{children:"Processing"}),".\nFor each active task, depending on its state, you can find the task ID, start time, duration, number of files processed, and the total files enqueued for processing."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Pending"}),": The task is queued and waiting to start."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Running"}),": The task is actively processing files."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Processing"}),": The task is performing ingestion operations."]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsxs)(n.p,{children:[(0,o.jsx)(n.strong,{children:"Failed"}),": Something went wrong during ingestion, or the task was manually canceled.\nFor troubleshooting advice, see ",(0,o.jsx)(n.a,{href:"#troubleshoot-ingestion",children:"Troubleshoot ingestion"}),"."]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.p,{children:["To stop an active task, click ",(0,o.jsx)(i.A,{name:"X","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Cancel"}),". 
Canceling a task stops processing immediately and marks the task as ",(0,o.jsx)(n.strong,{children:"Failed"}),"."]}),"\n",(0,o.jsx)(n.h3,{id:"ingestion-performance-expectations",children:"Ingestion performance expectations"}),"\n",(0,o.jsx)(n.p,{children:"The following performance test was conducted with Docling Serve."}),"\n",(0,o.jsx)(n.p,{children:"On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.\nThis equates to approximately 2.4 documents per second."}),"\n",(0,o.jsx)(n.p,{children:"You can generally expect equal or better performance on developer laptops, and significantly faster performance on servers.\nThroughput scales with CPU cores, memory, storage speed, and configuration choices, such as the embedding model, chunk size, overlap, and concurrency."}),"\n",(0,o.jsx)(n.p,{children:"This test returned 12 error, approximately 1.1 percent of the total files ingested.\nAll errors were file-specific, and they didn't stop the pipeline."}),"\n",(0,o.jsxs)(s,{children:[(0,o.jsx)("summary",{children:"Ingestion performance test details"}),(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Ingestion dataset:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"Total files: 1,083 items mounted"}),"\n",(0,o.jsx)(n.li,{children:"Total size on disk: 5,026,474,862 bytes (approximately 5.03 GB)"}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Hardware specifications:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Machine: Apple M4 Pro"}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Podman VM:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"Name: podman-machine-default"}),"\n",(0,o.jsx)(n.li,{children:"Type: applehv"}),"\n",(0,o.jsx)(n.li,{children:"vCPUs: 7"}),"\n",(0,o.jsx)(n.li,{children:"Memory: 8 
GiB"}),"\n",(0,o.jsx)(n.li,{children:"Disk size: 100 GiB"}),"\n"]}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Test results:"}),"\n",(0,o.jsx)(n.pre,{children:(0,o.jsx)(n.code,{className:"language-text",children:"2025-09-24T22:40:45.542190Z /app/src/main.py:231 Ingesting default documents when ready disable_langflow_ingest=False\n2025-09-24T22:40:45.546385Z /app/src/main.py:270 Using Langflow ingestion pipeline for default documents file_count=1082\n...\n2025-09-24T23:19:44.866365Z /app/src/main.py:351 Langflow ingestion completed success_count=1070 error_count=12 total_files=1082\n"})}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Elapsed time: Approximately 42 minutes 15 seconds (2,535 seconds)"}),"\n"]}),"\n",(0,o.jsxs)(n.li,{children:["\n",(0,o.jsx)(n.p,{children:"Throughput: Approximately 0.43 documents per second (about 2.3 seconds per document)"}),"\n"]}),"\n"]})]}),"\n",(0,o.jsx)(n.h2,{id:"troubleshoot-ingestion",children:"Troubleshoot ingestion"}),"\n",(0,o.jsx)(n.p,{children:"If an ingestion task fails, do the following:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"Make sure you are uploading supported file types."}),"\n",(0,o.jsx)(n.li,{children:"Split excessively large files into smaller files before uploading."}),"\n",(0,o.jsx)(n.li,{children:"Remove unusual embedded content, such as videos or animations, before uploading. 
Although Docling can replace some non-text content with placeholders during ingestion, some embedded content might cause errors."}),"\n"]}),"\n",(0,o.jsxs)(n.p,{children:["If the OpenRAG ",(0,o.jsx)(n.strong,{children:"Chat"})," doesn't seem to use your documents correctly, ",(0,o.jsx)(n.a,{href:"/knowledge#browse-knowledge",children:"browse your knowledge base"})," to confirm that the documents are uploaded in full, and the chunks are correct."]}),"\n",(0,o.jsxs)(n.p,{children:["If the documents are present and well-formed, check your ",(0,o.jsx)(n.a,{href:"/knowledge-filters",children:"knowledge filters"}),".\nIf a global filter is applied, make sure the expected documents are included in the global filter.\nIf the global filter excludes any documents, the agent cannot access those documents unless you apply a chat-level filter or change the global filter."]}),"\n",(0,o.jsx)(n.p,{children:"If text is missing or incorrectly processed, you need to reupload the documents after modifying the ingestion parameters or the documents themselves.\nFor example:"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:"Break combined documents into separate files for better metadata context."}),"\n",(0,o.jsxs)(n.li,{children:["Make sure scanned documents are legible enough for extraction, and enable the ",(0,o.jsx)(n.strong,{children:"OCR"})," option. Poorly scanned documents might require additional preparation or rescanning before ingestion."]}),"\n",(0,o.jsxs)(n.li,{children:["Adjust the ",(0,o.jsx)(n.strong,{children:"Chunk Size"})," and ",(0,o.jsx)(n.strong,{children:"Chunk Overlap"})," settings to better suit your documents. 
Larger chunks provide more context but can include irrelevant information, while smaller chunks yield more precise semantic search but can lack context."]}),"\n"]}),"\n",(0,o.jsxs)(n.p,{children:["For more information about modifying ingestion parameters and flows, see ",(0,o.jsx)(n.a,{href:"/knowledge#knowledge-ingestion-settings",children:"Knowledge ingestion settings"}),"."]}),"\n",(0,o.jsx)(n.h2,{id:"see-also",children:"See also"}),"\n",(0,o.jsxs)(n.ul,{children:["\n",(0,o.jsx)(n.li,{children:(0,o.jsx)(n.a,{href:"/knowledge",children:"Configure knowledge"})}),"\n",(0,o.jsx)(n.li,{children:(0,o.jsx)(n.a,{href:"/knowledge-filters",children:"Filter knowledge"})}),"\n",(0,o.jsx)(n.li,{children:(0,o.jsx)(n.a,{href:"/chat",children:"Chat with knowledge"})}),"\n",(0,o.jsx)(n.li,{children:(0,o.jsx)(n.a,{href:"/agents#inspect-and-modify-flows",children:"Inspect and modify flows"})}),"\n"]})]})}function b(e={}){const{wrapper:n}={...(0,r.R)(),...e.components};return n?(0,o.jsx)(n,{...e,children:(0,o.jsx)(w,{...e})}):w(e)}},8401:(e,n,s)=>{s.d(n,{Ay:()=>a,RM:()=>i});var t=s(4848),o=s(8453),r=s(9179);const i=[];function l(e){const n={p:"p",strong:"strong",...(0,o.R)(),...e.components};return(0,t.jsxs)(n.p,{children:["When using the OpenRAG ",(0,t.jsx)(n.strong,{children:"Chat"}),", click ",(0,t.jsx)(r.A,{name:"Plus","aria-hidden":"true"})," in the chat input field to upload a file to the current chat session.\nFiles added this way are processed and made available to the agent for the current conversation only.\nThese files aren't stored in the knowledge base permanently."]})}function a(e={}){const{wrapper:n}={...(0,o.R)(),...e.components};return n?(0,t.jsx)(n,{...e,children:(0,t.jsx)(l,{...e})}):l(e)}},9179:(e,n,s)=>{s.d(n,{A:()=>r});s(6540);var t=s(7856),o=s(4848);function r({name:e,...n}){const s=t[e];return s?(0,o.jsx)(s,{...n}):null}},9365:(e,n,s)=>{s.d(n,{A:()=>i});s(6540);var t=s(4164);const o={tabItem:"tabItem_Ymn6"};var r=s(4848);function 
i({children:e,hidden:n,className:s}){return(0,r.jsx)("div",{role:"tabpanel",className:(0,t.A)(o.tabItem,s),hidden:n,children:e})}}}]); |