openrag/assets/js/ca2c3c0c.733fd21f.js
2025-11-24 23:35:11 +00:00

1 line
No EOL
15 KiB
JavaScript

"use strict";(self.webpackChunkopenrag_docs=self.webpackChunkopenrag_docs||[]).push([[6919],{3782:(e,n,s)=>{s.d(n,{Ay:()=>l,RM:()=>r});var o=s(4848),i=s(8453),t=s(1610);const r=[];function c(e){const n={a:"a",p:"p",strong:"strong",...(0,i.R)(),...e.components};return(0,o.jsxs)(n.p,{children:["All flows included with OpenRAG are designed to be modular, performant, and provider-agnostic.\nTo modify a flow, click ",(0,o.jsx)(t.A,{name:"Settings2","aria-hidden":"true"})," ",(0,o.jsx)(n.strong,{children:"Settings"}),", and click ",(0,o.jsx)(n.strong,{children:"Edit in Langflow"}),".\nOpenRAG's visual editor is based on the ",(0,o.jsx)(n.a,{href:"https://docs.langflow.org/concepts-overview",children:"Langflow visual editor"}),", so you can edit your flows to match your specific use case."]})}function l(e={}){const{wrapper:n}={...(0,i.R)(),...e.components};return n?(0,o.jsx)(n,{...e,children:(0,o.jsx)(c,{...e})}):c(e)}},7125:(e,n,s)=>{s.r(n),s.d(n,{assets:()=>a,contentTitle:()=>l,default:()=>p,frontMatter:()=>c,metadata:()=>o,toc:()=>d});const o=JSON.parse('{"id":"core-components/ingestion","title":"Docling in OpenRAG","description":"OpenRAG uses Docling for document ingestion.","source":"@site/docs/core-components/ingestion.mdx","sourceDirName":"core-components","slug":"/ingestion","permalink":"/ingestion","draft":false,"unlisted":false,"editUrl":"https://github.com/openrag/openrag/tree/main/docs/docs/core-components/ingestion.mdx","tags":[],"version":"current","frontMatter":{"title":"Docling in OpenRAG","slug":"/ingestion"},"sidebar":"tutorialSidebar","previous":{"title":"OpenSearch in OpenRAG","permalink":"/knowledge"},"next":{"title":"Environment variables","permalink":"/reference/configuration"}}');var i=s(4848),t=s(8453),r=(s(1610),s(1470),s(9365),s(3782));const c={title:"Docling in OpenRAG",slug:"/ingestion"},l=void 0,a={},d=[{value:"Knowledge ingestion settings",id:"knowledge-ingestion-settings",level:2},{value:"Knowledge ingestion flows",id:"knowledge-ingestion-flows",level:2},...r.RM,{value:"OpenSearch URL Ingestion flow",id:"url-flow",level:3},{value:"Use OpenRAG default ingestion instead of Docling serve",id:"use-openrag-default-ingestion-instead-of-docling-serve",level:2},{value:"Performance expectations",id:"performance-expectations",level:2}];function h(e){const n={a:"a",br:"br",code:"code",h2:"h2",h3:"h3",li:"li",p:"p",pre:"pre",strong:"strong",ul:"ul",...(0,t.R)(),...e.components};return(0,i.jsxs)(i.Fragment,{children:[(0,i.jsxs)(n.p,{children:["OpenRAG uses ",(0,i.jsx)(n.a,{href:"https://docling-project.github.io/docling/",children:"Docling"})," for document ingestion.\nMore specifically, OpenRAG uses ",(0,i.jsx)(n.a,{href:"https://github.com/docling-project/docling-serve",children:"Docling Serve"}),", which starts a ",(0,i.jsx)(n.code,{children:"docling serve"})," process on your local machine and runs Docling ingestion through an API service."]}),"\n",(0,i.jsxs)(n.p,{children:["Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch ",(0,i.jsx)(n.code,{children:"documents"})," index."]}),"\n",(0,i.jsx)(n.p,{children:"OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images."}),"\n",(0,i.jsxs)(n.p,{children:['To modify OpenRAG\'s ingestion settings, including the Docling settings and ingestion flows, click 2" aria-hidden="true"/> ',(0,i.jsx)(n.strong,{children:"Settings"}),"."]}),"\n",(0,i.jsx)(n.h2,{id:"knowledge-ingestion-settings",children:"Knowledge ingestion settings"}),"\n",(0,i.jsx)(n.p,{children:"These settings configure the Docling ingestion parameters."}),"\n",(0,i.jsxs)(n.p,{children:["OpenRAG will warn you if ",(0,i.jsx)(n.code,{children:"docling serve"})," is not running.\nTo start or stop ",(0,i.jsx)(n.code,{children:"docling serve"})," or any other native services, in the TUI main menu, click ",(0,i.jsx)(n.strong,{children:"Start Native Services"})," or ",(0,i.jsx)(n.strong,{children:"Stop Native Services"}),"."]}),"\n",(0,i.jsxs)(n.p,{children:[(0,i.jsx)(n.strong,{children:"Embedding model"})," determines which AI model is used to create vector embeddings. The default is the OpenAI ",(0,i.jsx)(n.code,{children:"text-embedding-3-small"})," model."]}),"\n",(0,i.jsxs)(n.p,{children:[(0,i.jsx)(n.strong,{children:"Chunk size"})," determines how large each text chunk is in number of characters.\nLarger chunks yield more context per chunk, but can include irrelevant information. Smaller chunks yield more precise semantic search, but can lack context.\nThe default value of ",(0,i.jsx)(n.code,{children:"1000"})," characters provides a good starting point that balances these considerations."]}),"\n",(0,i.jsxs)(n.p,{children:[(0,i.jsx)(n.strong,{children:"Chunk overlap"})," controls the number of characters that overlap over chunk boundaries.\nUse larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.\nThe default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents."]}),"\n",(0,i.jsxs)(n.p,{children:[(0,i.jsx)(n.strong,{children:"Table Structure"})," enables Docling's ",(0,i.jsx)(n.a,{href:"https://docling-project.github.io/docling/reference/document_converter/",children:(0,i.jsx)(n.code,{children:"DocumentConverter"})})," tool for parsing tables. Instead of treating tables as plain text, tables are output as structured table data with preserved relationships and metadata. ",(0,i.jsx)(n.strong,{children:"Table Structure"})," is enabled by default."]}),"\n",(0,i.jsxs)(n.p,{children:[(0,i.jsx)(n.strong,{children:"OCR"})," enables or disabled OCR processing when extracting text from images and scanned documents.\nOCR is disabled by default. This setting is best suited for processing text-based documents as quickly as possible with Docling's ",(0,i.jsx)(n.a,{href:"https://docling-project.github.io/docling/reference/document_converter/",children:(0,i.jsx)(n.code,{children:"DocumentConverter"})}),". Images are ignored and not processed."]}),"\n",(0,i.jsx)(n.p,{children:"Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance."}),"\n",(0,i.jsxs)(n.p,{children:["If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the ",(0,i.jsx)(n.a,{href:"https://www.piwheels.org/project/ocrmac/",children:"ocrmac"})," OCR engine. Other platforms use ",(0,i.jsx)(n.a,{href:"https://www.jaided.ai/easyocr/",children:"easyocr"}),"."]}),"\n",(0,i.jsxs)(n.p,{children:[(0,i.jsx)(n.strong,{children:"Picture descriptions"})," adds image descriptions generated by the ",(0,i.jsx)(n.a,{href:"https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct",children:"SmolVLM-256M-Instruct"})," model to OCR processing. Enabling picture descriptions can slow ingestion performance."]}),"\n",(0,i.jsx)(n.h2,{id:"knowledge-ingestion-flows",children:"Knowledge ingestion flows"}),"\n",(0,i.jsxs)(n.p,{children:[(0,i.jsx)(n.a,{href:"https://docs.langflow.org/concepts-overview",children:"Flows"})," in Langflow are functional representations of application workflows, with multiple ",(0,i.jsx)(n.a,{href:"https://docs.langflow.org/concepts-components",children:"component"})," nodes connected as single steps in a workflow."]}),"\n",(0,i.jsxs)(n.p,{children:["The ",(0,i.jsx)(n.strong,{children:"OpenSearch Ingestion"})," flow is the default knowledge ingestion flow in OpenRAG: when you ",(0,i.jsx)(n.strong,{children:"Add Knowledge"})," in OpenRAG, you run the OpenSearch Ingestion flow in the background. The flow ingests documents using ",(0,i.jsx)(n.strong,{children:"Docling Serve"})," to import and process documents."]}),"\n",(0,i.jsx)(n.p,{children:"This flow contains ten components connected together to process and store documents in your knowledge base."}),"\n",(0,i.jsxs)(n.ul,{children:["\n",(0,i.jsxs)(n.li,{children:["The ",(0,i.jsxs)(n.a,{href:"https://docs.langflow.org/bundles-docling",children:[(0,i.jsx)(n.strong,{children:"Docling Serve"})," component"]})," processes input documents by connecting to your instance of Docling Serve."]}),"\n",(0,i.jsxs)(n.li,{children:["The ",(0,i.jsxs)(n.a,{href:"https://docs.langflow.org/components-docling",children:[(0,i.jsx)(n.strong,{children:"Export DoclingDocument"})," component"]})," exports the processed DoclingDocument to markdown format with image export mode set to placeholder. This conversion makes the structured document data into a standardized format for further processing."]}),"\n",(0,i.jsxs)(n.li,{children:["Three ",(0,i.jsxs)(n.a,{href:"https://docs.langflow.org/components-processing#dataframe-operations",children:[(0,i.jsx)(n.strong,{children:"DataFrame Operations"})," components"]})," sequentially add metadata columns to the document data of ",(0,i.jsx)(n.code,{children:"filename"}),", ",(0,i.jsx)(n.code,{children:"file_size"}),", and ",(0,i.jsx)(n.code,{children:"mimetype"}),"."]}),"\n",(0,i.jsxs)(n.li,{children:["The ",(0,i.jsxs)(n.a,{href:"https://docs.langflow.org/components-processing#split-text",children:[(0,i.jsx)(n.strong,{children:"Split Text"})," component"]})," splits the processed text into chunks with a chunk size of 1000 characters and an overlap of 200 characters."]}),"\n",(0,i.jsxs)(n.li,{children:["Four ",(0,i.jsx)(n.strong,{children:"Secret Input"})," components provide secure access to configuration variables: ",(0,i.jsx)(n.code,{children:"CONNECTOR_TYPE"}),", ",(0,i.jsx)(n.code,{children:"OWNER"}),", ",(0,i.jsx)(n.code,{children:"OWNER_EMAIL"}),", and ",(0,i.jsx)(n.code,{children:"OWNER_NAME"}),". These are runtime variables populated from OAuth login."]}),"\n",(0,i.jsxs)(n.li,{children:["The ",(0,i.jsx)(n.strong,{children:"Create Data"})," component combines the secret inputs into a structured data object that will be associated with the document embeddings."]}),"\n",(0,i.jsxs)(n.li,{children:["The ",(0,i.jsxs)(n.a,{href:"https://docs.langflow.org/components-embedding-models",children:[(0,i.jsx)(n.strong,{children:"Embedding Model"})," component"]})," generates vector embeddings using OpenAI's ",(0,i.jsx)(n.code,{children:"text-embedding-3-small"})," model. The embedding model is selected at [Application onboarding] and cannot be changed."]}),"\n",(0,i.jsxs)(n.li,{children:["The ",(0,i.jsxs)(n.a,{href:"https://docs.langflow.org/bundles-elastic#opensearch",children:[(0,i.jsx)(n.strong,{children:"OpenSearch"})," component"]})," stores the processed documents and their embeddings in the ",(0,i.jsx)(n.code,{children:"documents"})," index at ",(0,i.jsx)(n.code,{children:"https://opensearch:9200"}),". By default, the component is authenticated with a JWT token, but you can also select ",(0,i.jsx)(n.code,{children:"basic"})," auth mode, and enter your OpenSearch admin username and password."]}),"\n"]}),"\n",(0,i.jsx)(r.Ay,{}),"\n",(0,i.jsx)(n.h3,{id:"url-flow",children:"OpenSearch URL Ingestion flow"}),"\n",(0,i.jsxs)(n.p,{children:["An additional knowledge ingestion flow is included in OpenRAG, where it is used as an MCP tool by the ",(0,i.jsx)(n.a,{href:"/agents#flow",children:(0,i.jsx)(n.strong,{children:"Open Search Agent flow"})}),".\nThe agent calls this component to fetch web content, and the results are ingested into OpenSearch."]}),"\n",(0,i.jsxs)(n.p,{children:["For more on using MCP clients in Langflow, see ",(0,i.jsx)(n.a,{href:"https://docs.langflow.org/mcp-client",children:"MCP clients"}),".",(0,i.jsx)(n.br,{}),"\n","To connect additional MCP servers to the MCP client, see ",(0,i.jsx)(n.a,{href:"https://docs.langflow.org/mcp-tutorial",children:"Connect to MCP servers from your application"}),"."]}),"\n",(0,i.jsx)(n.h2,{id:"use-openrag-default-ingestion-instead-of-docling-serve",children:"Use OpenRAG default ingestion instead of Docling serve"}),"\n",(0,i.jsxs)(n.p,{children:["If you want to use OpenRAG's built-in pipeline instead of Docling serve, set ",(0,i.jsx)(n.code,{children:"DISABLE_INGEST_WITH_LANGFLOW=true"})," in ",(0,i.jsx)(n.a,{href:"/reference/configuration#document-processing",children:"Environment variables"}),"."]}),"\n",(0,i.jsx)(n.p,{children:"The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API."}),"\n",(0,i.jsxs)(n.p,{children:["For more information, see ",(0,i.jsxs)(n.a,{href:"https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58",children:[(0,i.jsx)(n.code,{children:"processors.py"})," in the OpenRAG repository"]}),"."]}),"\n",(0,i.jsx)(n.h2,{id:"performance-expectations",children:"Performance expectations"}),"\n",(0,i.jsx)(n.p,{children:"On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.\nThis equates to approximately 2.4 documents per second."}),"\n",(0,i.jsx)(n.p,{children:"You can generally expect equal or better performance on developer laptops and significantly faster on servers.\nThroughput scales with CPU cores, memory, storage speed, and configuration choices such as embedding model, chunk size and overlap, and concurrency."}),"\n",(0,i.jsx)(n.p,{children:"This test returned 12 errors (approximately 1.1%).\nAll errors were file-specific, and they didn't stop the pipeline."}),"\n",(0,i.jsx)(n.p,{children:"Ingestion dataset:"}),"\n",(0,i.jsxs)(n.ul,{children:["\n",(0,i.jsx)(n.li,{children:"Total files: 1,083 items mounted"}),"\n",(0,i.jsx)(n.li,{children:"Total size on disk: 5,026,474,862 bytes (approximately 5.03 GB)"}),"\n"]}),"\n",(0,i.jsx)(n.p,{children:"Hardware specifications:"}),"\n",(0,i.jsxs)(n.ul,{children:["\n",(0,i.jsx)(n.li,{children:"Machine: Apple M4 Pro"}),"\n",(0,i.jsxs)(n.li,{children:["Podman VM:","\n",(0,i.jsxs)(n.ul,{children:["\n",(0,i.jsxs)(n.li,{children:["Name: ",(0,i.jsx)(n.code,{children:"podman-machine-default"})]}),"\n",(0,i.jsxs)(n.li,{children:["Type: ",(0,i.jsx)(n.code,{children:"applehv"})]}),"\n",(0,i.jsx)(n.li,{children:"vCPUs: 7"}),"\n",(0,i.jsx)(n.li,{children:"Memory: 8 GiB"}),"\n",(0,i.jsx)(n.li,{children:"Disk size: 100 GiB"}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,i.jsx)(n.p,{children:"Test results:"}),"\n",(0,i.jsx)(n.pre,{children:(0,i.jsx)(n.code,{className:"language-text",children:"2025-09-24T22:40:45.542190Z /app/src/main.py:231 Ingesting default documents when ready disable_langflow_ingest=False\n2025-09-24T22:40:45.546385Z /app/src/main.py:270 Using Langflow ingestion pipeline for default documents file_count=1082\n...\n2025-09-24T23:19:44.866365Z /app/src/main.py:351 Langflow ingestion completed success_count=1070 error_count=12 total_files=1082\n"})}),"\n",(0,i.jsx)(n.p,{children:"Elapsed time: ~42 minutes 15 seconds (2,535 seconds)"}),"\n",(0,i.jsx)(n.p,{children:"Throughput: ~2.4 documents/second"})]})}function p(e={}){const{wrapper:n}={...(0,t.R)(),...e.components};return n?(0,i.jsx)(n,{...e,children:(0,i.jsx)(h,{...e})}):h(e)}}}]);