From bf3224e7093621b11a1ba756744ba0249bfd6638 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Tue, 25 Nov 2025 09:43:51 -0800
Subject: [PATCH 01/13] flow customization info
---
docs/docs/_partial-modify-flows.mdx | 5 ---
docs/docs/core-components/agents.mdx | 41 +++++++++++++++++++++----
docs/docs/core-components/ingestion.mdx | 2 +-
docs/docs/core-components/knowledge.mdx | 4 +--
docs/docs/get-started/quickstart.mdx | 3 +-
5 files changed, 39 insertions(+), 16 deletions(-)
delete mode 100644 docs/docs/_partial-modify-flows.mdx
diff --git a/docs/docs/_partial-modify-flows.mdx b/docs/docs/_partial-modify-flows.mdx
deleted file mode 100644
index 02ec1502..00000000
--- a/docs/docs/_partial-modify-flows.mdx
+++ /dev/null
@@ -1,5 +0,0 @@
-import Icon from "@site/src/components/icon/icon";
-
-All flows included with OpenRAG are designed to be modular, performant, and provider-agnostic.
-To modify a flow, click **Settings**, and click **Edit in Langflow**.
-OpenRAG's visual editor is based on the [Langflow visual editor](https://docs.langflow.org/concepts-overview), so you can edit your flows to match your specific use case.
\ No newline at end of file
diff --git a/docs/docs/core-components/agents.mdx b/docs/docs/core-components/agents.mdx
index 9d440ddf..c27de881 100644
--- a/docs/docs/core-components/agents.mdx
+++ b/docs/docs/core-components/agents.mdx
@@ -6,7 +6,6 @@ slug: /agents
import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-import PartialModifyFlows from '@site/docs/_partial-modify-flows.mdx';
OpenRAG leverages Langflow's Agent component to power the OpenRAG OpenSearch Agent flow.
@@ -31,7 +30,7 @@ In an agentic context, tools are functions that the agent can run to perform tas
-## Use the OpenRAG OpenSearch Agent flow {#flow}
+## Use the OpenRAG Chat (OpenRAG OpenSearch Agent flow) {#flow}
If you've chatted with your knowledge in OpenRAG, you've already experienced the OpenRAG OpenSearch Agent chat flow.
To switch OpenRAG over to the [Langflow visual editor](https://docs.langflow.org/concepts-overview) and view the OpenRAG OpenSearch Agent flow, click **Settings**, and then click **Edit in Langflow**.
@@ -48,12 +47,42 @@ This filter is the [Knowledge filter](/knowledge#create-knowledge-filters), and
* The **Agent** component's Output port is connected to the [**Chat Output** component](https://docs.langflow.org/components-io), which returns the final response to the user or application.
* An [**MCP Tools** component](https://docs.langflow.org/mcp-client) is connected to the Agent's **Tools** port. This component calls the [OpenSearch URL Ingestion flow](/ingestion#url-flow), which Langflow uses as an MCP server to fetch content from URLs and store it in OpenSearch.
-
+## Inspect and modify flows {#inspect-and-modify-flows}
-For an example of changing out the agent's language model in OpenRAG, see the [Quickstart](/quickstart#change-components).
+All OpenRAG flows are designed to be modular, performant, and provider-agnostic.
-To restore the flow to its initial state, in OpenRAG, click **Settings**, and then click **Restore Flow**.
-OpenRAG warns you that this discards all custom settings. Click **Restore** to restore the flow.
+OpenRAG provides quick access to common settings and an embedded [Langflow visual editor](https://docs.langflow.org/concepts-overview) where you can fully customize the flows to suit your use case.
+
+To modify a flow, click **Settings**. Then, to launch the embedded Langflow visual editor, click **Edit in Langflow**.
+
+For example, to view and edit the built-in **Chat** flow (the **OpenRAG OpenSearch Agent** flow), do the following:
+
+1. In OpenRAG, click **Chat**.
+
+2. Click **Settings**.
+
+ On the **Settings** page, you can edit commonly used parameters like the **Language model** and **Agent Instructions**.
+
+3. For more detail and customization options, click **Edit in Langflow** to launch the Langflow visual editor in a new browser window.
+
+ If prompted to acknowledge that you are entering Langflow, click **Proceed**.
+
+   If Langflow requests login information, enter the `LANGFLOW_SUPERUSER` and `LANGFLOW_SUPERUSER_PASSWORD` values from the `.env` file in your OpenRAG installation directory.
+
+ 
+
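+   For reference, these credentials are set in your `.env` file with entries like the following sketch. The values shown here are placeholders, not OpenRAG defaults:
+
+   ```bash
+   LANGFLOW_SUPERUSER=admin
+   LANGFLOW_SUPERUSER_PASSWORD=your-secure-password
+   ```
+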
+4. Modify the flow as desired, and then press Command+S (Ctrl+S) to save your changes.
+
+ You can close the Langflow browser window, or leave it open if you want to continue experimenting with the flow editor.
+
+ :::tip
+  If you modify the built-in **Chat** flow, make sure you open the **Conversations** tab and start a new conversation. This ensures that the chat doesn't persist any context from the previous conversation with the original model.
+ :::
+
+### Revert a built-in flow to the default state
+
+After you edit a built-in flow, you can click **Restore flow** on the **Settings** page to revert the flow to the state it was in when you first installed OpenRAG.
+This is a destructive action that discards all customizations to the flow.
## Additional Langflow functionality
diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx
index 67f746b0..f09634b9 100644
--- a/docs/docs/core-components/ingestion.mdx
+++ b/docs/docs/core-components/ingestion.mdx
@@ -15,7 +15,7 @@ Docling ingests documents from your local machine or OAuth connectors, splits th
OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.
-To modify OpenRAG's ingestion settings, including the Docling settings and ingestion flows, click 2" aria-hidden="true"/> **Settings**.
+To modify OpenRAG's ingestion settings, including the Docling settings and ingestion flows, click **Settings**.
## Knowledge ingestion settings
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index 80a997c2..4c8bc6b0 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -106,9 +106,7 @@ The **Knowledge** page lists the documents OpenRAG has ingested into the OpenSea
To explore your current knowledge, click **Knowledge**.
Click on a document to display the chunks derived from splitting the default documents into the vector database.
-Documents are processed with the default **Knowledge Ingest** flow, so if you want to split your documents differently, edit the **Knowledge Ingest** flow.
-
-
+Documents are processed with the default **Knowledge Ingest** flow. If you want to split your documents differently, edit the **Knowledge Ingest** flow, as explained in [Inspect and modify flows](/agents#inspect-and-modify-flows).
## Create knowledge filters
diff --git a/docs/docs/get-started/quickstart.mdx b/docs/docs/get-started/quickstart.mdx
index 80eb0902..7dd66c7f 100644
--- a/docs/docs/get-started/quickstart.mdx
+++ b/docs/docs/get-started/quickstart.mdx
@@ -135,7 +135,8 @@ You can click a document to view the chunks of the document as they are stored i
3. For this quickstart, try changing the model.
Click the **Language Model** component, and then change the **Model Name** to a different OpenAI model.
- When editing built-in flows, you can click **Restore flow** to revert the flow to its initial state.
+   After you edit a built-in flow, you can click **Restore flow** on the **Settings** page to revert the flow to the state it was in when you first installed OpenRAG.
+ This is a destructive action that discards all customizations to the flow.
4. Press Command+S (Ctrl+S) to save your changes.
From b00e6e9c98779ff331d81f4381793defd13c3cc9 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Tue, 25 Nov 2025 10:49:03 -0800
Subject: [PATCH 02/13] langflow functionality
---
docs/docs/core-components/agents.mdx | 85 ++++++++++++++++---------
docs/docs/core-components/ingestion.mdx | 20 +++---
docs/docs/core-components/knowledge.mdx | 1 -
3 files changed, 65 insertions(+), 41 deletions(-)
diff --git a/docs/docs/core-components/agents.mdx b/docs/docs/core-components/agents.mdx
index c27de881..6702fff4 100644
--- a/docs/docs/core-components/agents.mdx
+++ b/docs/docs/core-components/agents.mdx
@@ -7,45 +7,56 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-OpenRAG leverages Langflow's Agent component to power the OpenRAG OpenSearch Agent flow.
+OpenRAG includes a built-in [Langflow](https://docs.langflow.org/) instance for creating and managing application workflows called [_flows_](https://docs.langflow.org/concepts-overview).
+In a flow, the individual workflow steps are represented by [_components_](https://docs.langflow.org/concepts-components) that are connected together to form a complete process.
-[Flows](https://docs.langflow.org/concepts-overview) in Langflow are functional representations of application workflows, with multiple [component](https://docs.langflow.org/concepts-components) nodes connected as single steps in a workflow.
+OpenRAG includes several built-in flows:
-In the OpenRAG OpenSearch Agent flow, components like the Langflow [**Agent** component](https://docs.langflow.org/agents) and [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) are connected to intelligently chat with your knowledge by embedding your query, comparing it the vector database embeddings, and generating a response with the LLM.
+* The [**OpenRAG OpenSearch Agent** flow](/agents#flow) powers the **Chat** feature in OpenRAG.
+* The [**OpenSearch Ingestion** and **OpenSearch URL Ingestion** flows](/ingestion#knowledge-ingestion-flows) process documents and web content for storage in your OpenSearch knowledge bases.
+
+You can customize the built-in flows or create your own flows using OpenRAG's embedded Langflow visual editor.
+
+## About the OpenRAG Chat flow (OpenRAG OpenSearch Agent flow) {#flow}
+
+When you **Chat** with your knowledge in OpenRAG, the **OpenRAG OpenSearch Agent** flow runs in the background.
+
+If you [inspect the flow in Langflow](#inspect-and-modify-flows), you'll see that it is composed of eight components that work together to ingest chat messages, retrieve relevant information from your knowledge base, and then generate responses.

-The Agent component shines here in its ability to make decisions on not only what query should be sent, but when a query is necessary to solve the problem at hand.
-
-
-How do agents work?
-
-Agents extend Large Language Models (LLMs) by integrating tools, which are functions that provide additional context and enable autonomous task execution. These integrations make agents more specialized and powerful than standalone LLMs.
-
-Whereas an LLM might generate acceptable, inert responses to general queries and tasks, an agent can leverage the integrated context and tools to provide more relevant responses and even take action. For example, you might create an agent that can access your company's documentation, repositories, and other resources to help your team with tasks that require knowledge of your specific products, customers, and code.
-
-Agents use LLMs as a reasoning engine to process input, determine which actions to take to address the query, and then generate a response. The response could be a typical text-based LLM response, or it could involve an action, like editing a file, running a script, or calling an external API.
-
-In an agentic context, tools are functions that the agent can run to perform tasks or access external resources. A function is wrapped as a Tool object with a common interface that the agent understands. Agents become aware of tools through tool registration, which is when the agent is provided a list of available tools typically at agent initialization. The Tool object's description tells the agent what the tool can do so that it can decide whether the tool is appropriate for a given request.
-
-
-
-## Use the OpenRAG Chat (OpenRAG OpenSearch Agent flow) {#flow}
-
-If you've chatted with your knowledge in OpenRAG, you've already experienced the OpenRAG OpenSearch Agent chat flow.
-To switch OpenRAG over to the [Langflow visual editor](https://docs.langflow.org/concepts-overview) and view the OpenRAG OpenSearch Agent flow, click **Settings**, and then click **Edit in Langflow**.
-This flow contains eight components connected together to chat with your data:
-
* The [**Agent** component](https://docs.langflow.org/agents) orchestrates the entire flow by deciding when to search the knowledge base, how to formulate search queries, and how to combine retrieved information with the user's question to generate a comprehensive response.
The **Agent** behaves according to the prompt in the **Agent Instructions** field.
+
+ The Agent component is the star of this flow because it powers decision making, tool calling, and an LLM-driven conversational experience.
+
+
+ How do agents work?
+
+ Agents extend Large Language Models (LLMs) by integrating tools, which are functions that provide additional context and enable autonomous task execution. These integrations make agents more specialized and powerful than standalone LLMs.
+
+ Whereas an LLM might generate acceptable, inert responses to general queries and tasks, an agent can leverage the integrated context and tools to provide more relevant responses and even take action. For example, you might create an agent that can access your company's documentation, repositories, and other resources to help your team with tasks that require knowledge of your specific products, customers, and code.
+
+ Agents use LLMs as a reasoning engine to process input, determine which actions to take to address the query, and then generate a response. The response could be a typical text-based LLM response, or it could involve an action, like editing a file, running a script, or calling an external API.
+
+ In an agentic context, tools are functions that the agent can run to perform tasks or access external resources. A function is wrapped as a Tool object with a common interface that the agent understands. Agents become aware of tools through tool registration, which is when the agent is provided a list of available tools typically at agent initialization. The Tool object's description tells the agent what the tool can do so that it can decide whether the tool is appropriate for a given request.
+
+
+
+* The [**Chat Input** component](https://docs.langflow.org/components-io) is connected to the Agent component's Input port. This allows the flow to be triggered by an incoming prompt from a user or application.
+
* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) is connected to the Agent component's Tools port. The agent might not use this database for every request; the agent only uses this connection if it decides the knowledge can help respond to the prompt.
+
* The [**Language Model** component](https://docs.langflow.org/components-models) is connected to the Agent component's Language Model port. The agent uses the connected LLM to reason through the request sent through Chat Input.
+
* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) is connected to the OpenSearch component's Embedding port. This component converts text queries into vector representations that are compared with document embeddings stored in OpenSearch for semantic similarity matching. This gives your Agent's queries context.
-* The [**Text Input** component](https://docs.langflow.org/components-io) is populated with the global variable `OPENRAG-QUERY-FILTER`.
+
+* The [**Text Input** component](https://docs.langflow.org/components-io) is populated with the global variable `OPENRAG-QUERY-FILTER`.
This filter is the [Knowledge filter](/knowledge#create-knowledge-filters), and filters which knowledge sources to search through.
+
* The **Agent** component's Output port is connected to the [**Chat Output** component](https://docs.langflow.org/components-io), which returns the final response to the user or application.
-* An [**MCP Tools** component](https://docs.langflow.org/mcp-client) is connected to the Agent's **Tools** port. This component calls the [OpenSearch URL Ingestion flow](/ingestion#url-flow), which Langflow uses as an MCP server to fetch content from URLs and store it in OpenSearch.
+
+* An [**MCP Tools** component](https://docs.langflow.org/mcp-client) is connected to the Agent's **Tools** port. This component calls the [**OpenSearch URL Ingestion** flow](/ingestion#url-flow), which Langflow uses as a [Model Context Protocol (MCP) server](https://docs.langflow.org/mcp-server) to fetch content from URLs and store it in OpenSearch.
## Inspect and modify flows {#inspect-and-modify-flows}
@@ -79,17 +90,29 @@ For example, to view and edit the built-in **Chat** flow (the **OpenRAG OpenSear
   If you modify the built-in **Chat** flow, make sure you open the **Conversations** tab and start a new conversation. This ensures that the chat doesn't persist any context from the previous conversation with the original model.
:::
-### Revert a built-in flow to the default state
+## Revert a built-in flow to the default state
After you edit a built-in flow, you can click **Restore flow** on the **Settings** page to revert the flow to the state it was in when you first installed OpenRAG.
This is a destructive action that discards all customizations to the flow.
## Additional Langflow functionality
-Langflow includes features beyond Agents to help you integrate OpenRAG into your application, and all Langflow features are included in OpenRAG.
+In addition to OpenRAG's built-in flows, all Langflow features are available through OpenRAG, including popular extensibility features such as the following:
-* Langflow can serve your flows as an [MCP server](https://docs.langflow.org/mcp-server), or consume other MCP servers as an [MCP client](https://docs.langflow.org/mcp-client). Get started with the [MCP tutorial](https://docs.langflow.org/mcp-tutorial).
+* [Create custom components](https://docs.langflow.org/components-custom-components).
+* Integrate with many third-party services through [bundles](https://docs.langflow.org/components-bundle-components).
+* Use [MCP clients](https://docs.langflow.org/mcp-client) and [MCP servers](https://docs.langflow.org/mcp-server), and serve flows as MCP tools for your agentic flows.
-* If you don't see the component you need, extend Langflow's functionality by creating [custom Python components](https://docs.langflow.org/components-custom-components).
+Explore the [Langflow documentation](https://docs.langflow.org/) to learn more about the Langflow platform, features, and visual editor.
-* Langflow offers component [bundles](https://docs.langflow.org/components-bundle-components) to integrate with many popular vector stores, AI/ML providers, and search APIs.
\ No newline at end of file
+## Set the Langflow version
+
+By default, OpenRAG is pinned to the latest Langflow Docker image for stability.
+
+You can set a specific Langflow version with the [`LANGFLOW_VERSION` environment variable](/reference/configuration), as shown in the example that follows this list. However, there are risks to changing this setting:
+
+* The [Langflow documentation](https://docs.langflow.org/) describes the functionality present in the latest release of the Langflow OSS Python package. If your `LANGFLOW_VERSION` is different, the Langflow documentation might not align with the features and default settings in your OpenRAG installation.
+
+* Components might break, including components in OpenRAG's built-in flows.
+
+* Default settings and behaviors might change, causing unexpected results when OpenRAG expects a newer default.
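+
+For example, a minimal `.env` sketch that pins Langflow to a specific release looks like the following, where the version number is illustrative only:
+
+```bash
+LANGFLOW_VERSION=1.5.0
+```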
\ No newline at end of file
diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx
index f09634b9..7095f343 100644
--- a/docs/docs/core-components/ingestion.mdx
+++ b/docs/docs/core-components/ingestion.mdx
@@ -6,7 +6,6 @@ slug: /ingestion
import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-import PartialModifyFlows from '@site/docs/_partial-modify-flows.mdx';
OpenRAG uses [Docling](https://docling-project.github.io/docling/) for document ingestion.
More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling serve` process on your local machine and runs Docling ingestion through an API service.
@@ -45,13 +44,15 @@ If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the
**Picture descriptions** adds image descriptions generated by the [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model to OCR processing. Enabling picture descriptions can slow ingestion performance.
-## Knowledge ingestion flows
+## Knowledge ingestion flows {#knowledge-ingestion-flows}
[Flows](https://docs.langflow.org/concepts-overview) in Langflow are functional representations of application workflows, with multiple [component](https://docs.langflow.org/concepts-components) nodes connected as single steps in a workflow.
-The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in OpenRAG: when you **Add Knowledge** in OpenRAG, you run the OpenSearch Ingestion flow in the background. The flow ingests documents using **Docling Serve** to import and process documents.
+### OpenSearch Ingestion flow
-This flow contains ten components connected together to process and store documents in your knowledge base.
+The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in OpenRAG. When you **Add Knowledge** in OpenRAG, the OpenSearch Ingestion flow runs in the background. The flow uses Docling Serve to import and process documents.
+
+If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll see that it is composed of ten components that work together to process and store documents in your knowledge base:
* The [**Docling Serve** component](https://docs.langflow.org/bundles-docling) processes input documents by connecting to your instance of Docling Serve.
* The [**Export DoclingDocument** component](https://docs.langflow.org/components-docling) exports the processed DoclingDocument to markdown format with image export mode set to placeholder. This conversion makes the structured document data into a standardized format for further processing.
@@ -62,15 +63,16 @@ This flow contains ten components connected together to process and store docume
* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) generates vector embeddings using OpenAI's `text-embedding-3-small` model. The embedding model is selected during application onboarding and cannot be changed.
* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) stores the processed documents and their embeddings in the `documents` index at `https://opensearch:9200`. By default, the component is authenticated with a JWT token, but you can also select `basic` auth mode, and enter your OpenSearch admin username and password.
-
+To customize this flow, see [Inspect and modify flows](/agents#inspect-and-modify-flows).
### OpenSearch URL Ingestion flow {#url-flow}
-An additional knowledge ingestion flow is included in OpenRAG, where it is used as an MCP tool by the [**Open Search Agent flow**](/agents#flow).
-The agent calls this component to fetch web content, and the results are ingested into OpenSearch.
+The **OpenSearch URL Ingestion** flow ingests web content from URLs.
+This flow isn't directly accessible from the OpenRAG user interface.
+Instead, this flow is called by the [**OpenRAG OpenSearch Agent** flow](/agents#flow) as a Model Context Protocol (MCP) tool.
+The agent can call this flow to fetch web content from a given URL, and then ingest that content into your OpenSearch knowledge base.
-For more on using MCP clients in Langflow, see [MCP clients](https://docs.langflow.org/mcp-client).\
-To connect additional MCP servers to the MCP client, see [Connect to MCP servers from your application](https://docs.langflow.org/mcp-tutorial).
+For more information about MCP in Langflow, see the Langflow documentation on [MCP clients](https://docs.langflow.org/mcp-client) and [MCP servers](https://docs.langflow.org/mcp-server).
## Use OpenRAG default ingestion instead of Docling serve
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index 4c8bc6b0..7ad7f74a 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -6,7 +6,6 @@ slug: /knowledge
import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-import PartialModifyFlows from '@site/docs/_partial-modify-flows.mdx';
OpenRAG uses [OpenSearch](https://docs.opensearch.org/latest/) for its vector-backed knowledge store.
This is a specialized database for storing and retrieving embeddings, which helps your Agent efficiently find relevant information.
From c93cf42f885bb3d873aafc3541b3e0dc2d265112 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Tue, 25 Nov 2025 13:02:39 -0800
Subject: [PATCH 03/13] notes and prep
---
docs/docs/core-components/knowledge.mdx | 18 ++++++++++++++----
docs/sidebars.js | 24 +++++++++++++++++++++---
2 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index 7ad7f74a..40401781 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -26,6 +26,8 @@ To configure the knowledge ingestion pipeline parameters, see [Docling Ingestion
### Direct file ingestion
+
+
The **Knowledge Ingest** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
The default path to your local folder is mounted from the `./documents` folder in your OpenRAG project directory to the `/app/documents/` directory inside the Docker container. Files added to the host or the container will be visible in both locations. To configure this location, modify the **Documents Paths** variable in either the TUI's [Advanced Setup](/install#setup) menu or in the `.env` used by Docker Compose.
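+
+For example, a hypothetical `.env` entry for this setting might look like the following sketch. The variable name and path are illustrative; check the `.env` generated by your installation for the exact key:
+
+```bash
+DOCUMENTS_PATHS=/home/user/my-documents
+```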
@@ -38,6 +40,8 @@ The files are loaded into your OpenSearch database, and appear in the Knowledge
To add files directly to a chat session, click in the chat input and select the files you want to include. Files added this way are processed and made available to the agent for the current conversation, and are not permanently added to the knowledge base.
+
+
### Ingest files through OAuth connectors {#oauth-ingestion}
OpenRAG supports Google Drive, OneDrive, and Sharepoint as OAuth connectors for seamless document synchronization.
@@ -87,7 +91,7 @@ The ingestion process can take some time depending on the size of your documents
If ingestion fails, click **Status** to view the logged error.
-## Monitor ingestion tasks
+### Monitor ingestion tasks
When you upload files, process folders, or sync documents, OpenRAG processes them as background tasks.
A badge appears on the **Tasks** icon when there are active tasks running.
@@ -98,15 +102,21 @@ A **Pending** task is queued and waiting to start, a **Running** task is activel
You can cancel active tasks by clicking **Cancel**. Canceling a task stops processing immediately and marks the task as failed.
-## Explore knowledge
+## Browse knowledge
The **Knowledge** page lists the documents OpenRAG has ingested into the OpenSearch vector database's `documents` index.
-To explore your current knowledge, click **Knowledge**.
-Click on a document to display the chunks derived from splitting the default documents into the vector database.
+To explore the raw contents of your knowledge base, click **Knowledge** to get a list of all ingested documents.
+Click a document to view the chunks produced when the document was split during ingestion into the vector database.
Documents are processed with the default **Knowledge Ingest** flow. If you want to split your documents differently, edit the **Knowledge Ingest** flow, as explained in [Inspect and modify flows](/agents#inspect-and-modify-flows).
+## Chat with knowledge
+
+
+
+
+
## Create knowledge filters
OpenRAG includes a knowledge filter system for organizing and managing document collections.
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 373c6f82..2381a125 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -24,9 +24,27 @@ const sidebars = {
"get-started/quickstart",
"get-started/install",
"get-started/docker",
- "core-components/agents",
- "core-components/knowledge",
- "core-components/ingestion",
+ {
+ type: "category",
+ label: "Langflow in OpenRAG",
+ items: [
+ "core-components/agents",
+ ],
+ },
+ {
+ type: "category",
+ label: "OpenSearch in OpenRAG",
+ items: [
+ "core-components/knowledge",
+ ],
+ },
+ {
+ type: "category",
+ label: "Docling in OpenRAG",
+ items: [
+ "core-components/ingestion",
+ ],
+ },
"reference/configuration",
"support/troubleshoot",
],
From 0f9ec832c24ac918781cd726f24fe75e38dbed68 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Tue, 25 Nov 2025 14:30:02 -0800
Subject: [PATCH 04/13] nudges and flow names
---
docs/docs/core-components/agents.mdx | 9 +++++++++
docs/docs/core-components/ingestion.mdx | 2 +-
docs/docs/core-components/knowledge.mdx | 6 ++----
docs/docs/get-started/quickstart.mdx | 4 ++--
4 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/docs/docs/core-components/agents.mdx b/docs/docs/core-components/agents.mdx
index 6702fff4..23d50016 100644
--- a/docs/docs/core-components/agents.mdx
+++ b/docs/docs/core-components/agents.mdx
@@ -58,6 +58,15 @@ This filter is the [Knowledge filter](/knowledge#create-knowledge-filters), and
* An [**MCP Tools** component](https://docs.langflow.org/mcp-client) is connected to the Agent's **Tools** port. This component calls the [**OpenSearch URL Ingestion** flow](/ingestion#url-flow), which Langflow uses as a [Model Context Protocol (MCP) server](https://docs.langflow.org/mcp-server) to fetch content from URLs and store it in OpenSearch.
+### Nudges
+
+When you use the OpenRAG **Chat**, the **OpenRAG OpenSearch Nudges** flow runs in the background to pull additional context from your knowledge base and chat history.
+
+Nudges appear as prompts in the chat.
+Click a nudge to accept it and provide the nudge's context to the OpenRAG **Chat** agent (the **OpenRAG OpenSearch Agent** flow).
+
+Like OpenRAG's other built-in flows, the **OpenRAG OpenSearch Nudges** flow can be [inspected and modified in Langflow](#inspect-and-modify-flows) if you want to change the nudge behavior.
+
## Inspect and modify flows {#inspect-and-modify-flows}
All OpenRAG flows are designed to be modular, performant, and provider-agnostic.
diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx
index 7095f343..ef122138 100644
--- a/docs/docs/core-components/ingestion.mdx
+++ b/docs/docs/core-components/ingestion.mdx
@@ -50,7 +50,7 @@ If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the
### OpenSearch Ingestion flow
-The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in OpenRAG. When you **Add Knowledge** in OpenRAG, the OpenSearch Ingestion flow runs in the background. The flow uses Docling Serve to import and process documents.
+The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in OpenRAG. When you **Add Knowledge** in OpenRAG, the **OpenSearch Ingestion** flow runs in the background. The flow uses Docling Serve to import and process documents.
If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll see that it is composed of ten components that work together to process and store documents in your knowledge base:
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index 40401781..05f1caec 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -26,9 +26,7 @@ To configure the knowledge ingestion pipeline parameters, see [Docling Ingestion
### Direct file ingestion
-
-
-The **Knowledge Ingest** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
+The **OpenSearch Ingestion** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
The default path to your local folder is mounted from the `./documents` folder in your OpenRAG project directory to the `/app/documents/` directory inside the Docker container. Files added to the host or the container will be visible in both locations. To configure this location, modify the **Documents Paths** variable in either the TUI's [Advanced Setup](/install#setup) menu or in the `.env` used by Docker Compose.
@@ -109,7 +107,7 @@ The **Knowledge** page lists the documents OpenRAG has ingested into the OpenSea
To explore the raw contents of your knowledge base, click **Knowledge** to get a list of all ingested documents.
Click a document to view the chunks produced when the document was split during ingestion into the vector database.
-Documents are processed with the default **Knowledge Ingest** flow. If you want to split your documents differently, edit the **Knowledge Ingest** flow, as explained in [Inspect and modify flows](/agents#inspect-and-modify-flows).
+Documents are processed with the default **OpenSearch Ingestion** flow. If you want to split your documents differently, edit the **OpenSearch Ingestion** flow, as explained in [Inspect and modify flows](/agents#inspect-and-modify-flows).
## Chat with knowledge
diff --git a/docs/docs/get-started/quickstart.mdx b/docs/docs/get-started/quickstart.mdx
index 7dd66c7f..4c4b7b1f 100644
--- a/docs/docs/get-started/quickstart.mdx
+++ b/docs/docs/get-started/quickstart.mdx
@@ -116,7 +116,7 @@ You can click a document to view the chunks of the document as they are stored i
* Click **Settings** to modify the knowledge ingestion settings.
- For more information about knowledge bases and knowledge ingestion, see [OpenSearch in OpenRAG](/knowledge).
+ For more information about knowledge bases, knowledge ingestion, and the OpenRAG **Chat**, see [OpenSearch in OpenRAG](/knowledge).
## Change the language model and chat settings {#change-components}
@@ -128,7 +128,7 @@ You can click a document to view the chunks of the document as they are stored i
   If Langflow requests login information, enter the `LANGFLOW_SUPERUSER` and `LANGFLOW_SUPERUSER_PASSWORD` values from the `.env` file in your OpenRAG installation directory.
- The OpenRAG OpenSearch Agent flow opens in a new browser window.
+ The **OpenRAG OpenSearch Agent** flow opens in a new browser window.

From 75c7f237d05a9b8e5e6f9bc78de335b63d4a73a1 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Tue, 25 Nov 2025 18:21:05 -0800
Subject: [PATCH 05/13] working on knowledge topics
---
docs/docs/core-components/agents.mdx | 66 +-----
docs/docs/core-components/chat.mdx | 62 +++++
docs/docs/core-components/ingestion.mdx | 114 +---------
.../core-components/knowledge-configure.mdx | 58 +++++
.../core-components/knowledge-filters.mdx | 53 +++++
docs/docs/core-components/knowledge.mdx | 212 +++++++++++++-----
docs/docs/get-started/install.mdx | 8 +-
docs/docs/get-started/quickstart.mdx | 4 +-
docs/docs/reference/configuration.mdx | 3 +-
docs/sidebars.js | 20 +-
10 files changed, 352 insertions(+), 248 deletions(-)
create mode 100644 docs/docs/core-components/chat.mdx
create mode 100644 docs/docs/core-components/knowledge-configure.mdx
create mode 100644 docs/docs/core-components/knowledge-filters.mdx
diff --git a/docs/docs/core-components/agents.mdx b/docs/docs/core-components/agents.mdx
index 23d50016..dc6ce5d2 100644
--- a/docs/docs/core-components/agents.mdx
+++ b/docs/docs/core-components/agents.mdx
@@ -1,5 +1,5 @@
---
-title: Langflow in OpenRAG
+title: Use Langflow in OpenRAG
slug: /agents
---
@@ -10,62 +10,14 @@ import TabItem from '@theme/TabItem';
OpenRAG includes a built-in [Langflow](https://docs.langflow.org/) instance for creating and managing application workflows called [_flows_](https://docs.langflow.org/concepts-overview).
In a flow, the individual workflow steps are represented by [_components_](https://docs.langflow.org/concepts-components) that are connected together to form a complete process.
-OpenRAG includes several built-in flows:
+OpenRAG includes several built-in flows that you can customize.
+You can also create your own flows using OpenRAG's embedded Langflow visual editor.
-* The [**OpenRAG OpenSearch Agent** flow](/agents#flow) powers the **Chat** feature in OpenRAG.
-* The [**OpenSearch Ingestion** and **OpenSearch URL Ingestion** flows](/ingestion#knowledge-ingestion-flows) process documents and web content for storage in your OpenSearch knowledge bases.
+## Built-in flows
-You can customize the built-in flows or create your own flows using OpenRAG's embedded Langflow visual editor.
-
-## About the OpenRAG Chat flow (OpenRAG OpenSearch Agent flow) {#flow}
-
-When you **Chat** with your knowledge in OpenRAG, the **OpenRAG OpenSearch Agent** flow runs in the background.
-
-If you [inspect the flow in Langflow](#inspect-and-modify-flows), you'll see that it is composed of eight components that work together to ingest chat messages, retrieve relevant information from your knowledge base, and then generate responses.
-
-
-
-* The [**Agent** component](https://docs.langflow.org/agents) orchestrates the entire flow by deciding when to search the knowledge base, how to formulate search queries, and how to combine retrieved information with the user's question to generate a comprehensive response.
-The **Agent** behaves according to the prompt in the **Agent Instructions** field.
-
- The Agent component is the star of this flow because it powers decision making, tool calling, and an LLM-driven conversational experience.
-
-
- How do agents work?
-
- Agents extend Large Language Models (LLMs) by integrating tools, which are functions that provide additional context and enable autonomous task execution. These integrations make agents more specialized and powerful than standalone LLMs.
-
- Whereas an LLM might generate acceptable, inert responses to general queries and tasks, an agent can leverage the integrated context and tools to provide more relevant responses and even take action. For example, you might create an agent that can access your company's documentation, repositories, and other resources to help your team with tasks that require knowledge of your specific products, customers, and code.
-
- Agents use LLMs as a reasoning engine to process input, determine which actions to take to address the query, and then generate a response. The response could be a typical text-based LLM response, or it could involve an action, like editing a file, running a script, or calling an external API.
-
- In an agentic context, tools are functions that the agent can run to perform tasks or access external resources. A function is wrapped as a Tool object with a common interface that the agent understands. Agents become aware of tools through tool registration, which is when the agent is provided a list of available tools typically at agent initialization. The Tool object's description tells the agent what the tool can do so that it can decide whether the tool is appropriate for a given request.
-
-
-
-* The [**Chat Input** component](https://docs.langflow.org/components-io) is connected to the Agent component's Input port. This allows the flow to be triggered by an incoming prompt from a user or application.
-
-* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) is connected to the Agent component's Tools port. The agent might not use this database for every request; the agent only uses this connection if it decides the knowledge can help respond to the prompt.
-
-* The [**Language Model** component](https://docs.langflow.org/components-models) is connected to the Agent component's Language Model port. The agent uses the connected LLM to reason through the request sent through Chat Input.
-
-* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) is connected to the OpenSearch component's Embedding port. This component converts text queries into vector representations that are compared with document embeddings stored in OpenSearch for semantic similarity matching. This gives your Agent's queries context.
-
-* The [**Text Input** component](https://docs.langflow.org/components-io) is populated with the global variable `OPENRAG-QUERY-FILTER`.
-This filter is the [Knowledge filter](/knowledge#create-knowledge-filters), and filters which knowledge sources to search through.
-
-* The **Agent** component's Output port is connected to the [**Chat Output** component](https://docs.langflow.org/components-io), which returns the final response to the user or application.
-
-* An [**MCP Tools** component](https://docs.langflow.org/mcp-client) is connected to the Agent's **Tools** port. This component calls the [**OpenSearch URL Ingestion** flow](/ingestion#url-flow), which Langflow uses as a [Model Context Protocol (MCP) server](https://docs.langflow.org/mcp-server) to fetch content from URLs and store it in OpenSearch.
-
-### Nudges
-
-When you use the OpenRAG **Chat**, the **OpenRAG OpenSearch Nudges** flow runs in the background to pull additional context from your knowledge base and chat history.
-
-Nudges appear as prompts in the chat.
-Click a nudge to accept it and provide the nudge's context to the OpenRAG **Chat** agent (the **OpenRAG OpenSearch Agent** flow).
-
-Like OpenRAG's other built-in flows, the **OpenRAG OpenSearch Nudges** flow can be [inspected and modified in Langflow](#inspect-and-modify-flows) if you want to change the nudge behavior.
+* The [**OpenRAG OpenSearch Agent** flow](/chat#flow) powers the **Chat** feature in OpenRAG.
+* The [**OpenSearch Ingestion** and **OpenSearch URL Ingestion** flows](/knowledge#knowledge-ingestion-flows) process documents and web content for storage in your OpenSearch knowledge base.
+* The [**OpenRAG OpenSearch Nudges** flow](/chat#nudges) provides optional contextual suggestions in the OpenRAG **Chat**.
## Inspect and modify flows {#inspect-and-modify-flows}
@@ -99,12 +51,12 @@ For example, to view and edit the built-in **Chat** flow (the **OpenRAG OpenSear
   If you modify the built-in **Chat** flow, make sure you open the **Conversations** tab and start a new conversation. This ensures that the chat doesn't persist any context from the previous conversation with the original model.
:::
-## Revert a built-in flow to the default state
+## Revert a built-in flow to its default state
After you edit a built-in flow, you can click **Restore flow** on the **Settings** page to revert the flow to the state it was in when you first installed OpenRAG.
This is a destructive action that discards all customizations to the flow.
-## Additional Langflow functionality
+## Custom flows and additional Langflow functionality
In addition to OpenRAG's built-in flows, all Langflow features are available through OpenRAG, including popular extensibility features such as the following:
diff --git a/docs/docs/core-components/chat.mdx b/docs/docs/core-components/chat.mdx
new file mode 100644
index 00000000..31827aa1
--- /dev/null
+++ b/docs/docs/core-components/chat.mdx
@@ -0,0 +1,62 @@
+---
+title: Chat in OpenRAG
+slug: /chat
+---
+
+import Icon from "@site/src/components/icon/icon";
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+After you [upload documents to your knowledge base](/knowledge), you can use the OpenRAG **Chat** feature to interact with your knowledge through natural language queries.
+
+
+
+## OpenRAG OpenSearch Agent flow {#flow}
+
+When you use the OpenRAG **Chat**, the **OpenRAG OpenSearch Agent** [flow](/agents) runs in the background to retrieve relevant information from your knowledge base and generate a response.
+
+If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll see that it is composed of eight components that work together to ingest chat messages, retrieve relevant information from your knowledge base, and then generate responses.
+
+
+
+* The [**Agent** component](https://docs.langflow.org/agents) orchestrates the entire flow by deciding when to search the knowledge base, how to formulate search queries, and how to combine retrieved information with the user's question to generate a comprehensive response.
+The **Agent** behaves according to the prompt in the **Agent Instructions** field.
+
+ The Agent component is the star of this flow because it powers decision making, tool calling, and an LLM-driven conversational experience.
+
+
+ How do agents work?
+
+ Agents extend Large Language Models (LLMs) by integrating tools, which are functions that provide additional context and enable autonomous task execution. These integrations make agents more specialized and powerful than standalone LLMs.
+
+ Whereas an LLM might generate acceptable, inert responses to general queries and tasks, an agent can leverage the integrated context and tools to provide more relevant responses and even take action. For example, you might create an agent that can access your company's documentation, repositories, and other resources to help your team with tasks that require knowledge of your specific products, customers, and code.
+
+ Agents use LLMs as a reasoning engine to process input, determine which actions to take to address the query, and then generate a response. The response could be a typical text-based LLM response, or it could involve an action, like editing a file, running a script, or calling an external API.
+
+ In an agentic context, tools are functions that the agent can run to perform tasks or access external resources. A function is wrapped as a Tool object with a common interface that the agent understands. Agents become aware of tools through tool registration, which is when the agent is provided a list of available tools typically at agent initialization. The Tool object's description tells the agent what the tool can do so that it can decide whether the tool is appropriate for a given request.
+
+
+
+* The [**Chat Input** component](https://docs.langflow.org/components-io) is connected to the Agent component's Input port. This allows the flow to be triggered by an incoming prompt from a user or application.
+
+* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) is connected to the Agent component's Tools port. The agent might not use this database for every request; the agent only uses this connection if it decides the knowledge can help respond to the prompt.
+
+* The [**Language Model** component](https://docs.langflow.org/components-models) is connected to the Agent component's Language Model port. The agent uses the connected LLM to reason through the request sent through Chat Input.
+
+* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) is connected to the OpenSearch component's Embedding port. This component converts text queries into vector representations that are compared with document embeddings stored in OpenSearch for semantic similarity matching. This gives your Agent's queries context.
+
+* The [**Text Input** component](https://docs.langflow.org/components-io) is populated with the global variable `OPENRAG-QUERY-FILTER`.
+This filter is a [knowledge filter](/knowledge-filters) that limits the documents the Agent can access in the knowledge base if a global or chat filter is set.
+
+* The **Agent** component's Output port is connected to the [**Chat Output** component](https://docs.langflow.org/components-io), which returns the final response to the user or application.
+
+* An [**MCP Tools** component](https://docs.langflow.org/mcp-client) is connected to the Agent's **Tools** port. This component calls the [**OpenSearch URL Ingestion** flow](/knowledge#url-flow), which Langflow uses as a [Model Context Protocol (MCP) server](https://docs.langflow.org/mcp-server) to fetch content from URLs and store it in OpenSearch.
+
+## Nudges {#nudges}
+
+When you use the OpenRAG **Chat**, the **OpenRAG OpenSearch Nudges** flow runs in the background to pull additional context from your knowledge base and chat history.
+
+Nudges appear as prompts in the chat.
+Click a nudge to accept it and provide the nudge's context to the OpenRAG **Chat** agent (the **OpenRAG OpenSearch Agent** flow).
+
+Like OpenRAG's other built-in flows, the **OpenRAG OpenSearch Nudges** flow can be [inspected and modified in Langflow](/agents#inspect-and-modify-flows) if you want to change the nudge behavior.
\ No newline at end of file
diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx
index ef122138..b4378bed 100644
--- a/docs/docs/core-components/ingestion.mdx
+++ b/docs/docs/core-components/ingestion.mdx
@@ -7,116 +7,4 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-OpenRAG uses [Docling](https://docling-project.github.io/docling/) for document ingestion.
-More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling serve` process on your local machine and runs Docling ingestion through an API service.
-
-Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch `documents` index.
-
-OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.
-
-To modify OpenRAG's ingestion settings, including the Docling settings and ingestion flows, click **Settings**.
-
-## Knowledge ingestion settings
-
-These settings configure the Docling ingestion parameters.
-
-OpenRAG will warn you if `docling serve` is not running.
-To start or stop `docling serve` or any other native services, in the TUI main menu, click **Start Native Services** or **Stop Native Services**.
-
-**Embedding model** determines which AI model is used to create vector embeddings. The default is the OpenAI `text-embedding-3-small` model.
-
-**Chunk size** determines how large each text chunk is in number of characters.
-Larger chunks yield more context per chunk, but can include irrelevant information. Smaller chunks yield more precise semantic search, but can lack context.
-The default value of `1000` characters provides a good starting point that balances these considerations.
-
-**Chunk overlap** controls the number of characters that overlap over chunk boundaries.
-Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.
-The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents.
-
-**Table Structure** enables Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/) tool for parsing tables. Instead of treating tables as plain text, tables are output as structured table data with preserved relationships and metadata. **Table Structure** is enabled by default.
-
-**OCR** enables or disabled OCR processing when extracting text from images and scanned documents.
-OCR is disabled by default. This setting is best suited for processing text-based documents as quickly as possible with Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/). Images are ignored and not processed.
-
-Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance.
-
-If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the [ocrmac](https://www.piwheels.org/project/ocrmac/) OCR engine. Other platforms use [easyocr](https://www.jaided.ai/easyocr/).
-
-**Picture descriptions** adds image descriptions generated by the [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model to OCR processing. Enabling picture descriptions can slow ingestion performance.
-
-## Knowledge ingestion flows {#knowledge-ingestion-flows}
-
-[Flows](https://docs.langflow.org/concepts-overview) in Langflow are functional representations of application workflows, with multiple [component](https://docs.langflow.org/concepts-components) nodes connected as single steps in a workflow.
-
-### OpenSearch Ingestion flow
-
-The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in OpenRAG. When you **Add Knowledge** in OpenRAG, the **OpenSearch Ingestion** flow runs in the background. The flow uses Docling Serve to import and process documents.
-
-If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll see that it is composed of ten components that work together to process and store documents in your knowledge base:
-
-* The [**Docling Serve** component](https://docs.langflow.org/bundles-docling) processes input documents by connecting to your instance of Docling Serve.
-* The [**Export DoclingDocument** component](https://docs.langflow.org/components-docling) exports the processed DoclingDocument to markdown format with image export mode set to placeholder. This conversion makes the structured document data into a standardized format for further processing.
-* Three [**DataFrame Operations** components](https://docs.langflow.org/components-processing#dataframe-operations) sequentially add metadata columns to the document data of `filename`, `file_size`, and `mimetype`.
-* The [**Split Text** component](https://docs.langflow.org/components-processing#split-text) splits the processed text into chunks with a chunk size of 1000 characters and an overlap of 200 characters.
-* Four **Secret Input** components provide secure access to configuration variables: `CONNECTOR_TYPE`, `OWNER`, `OWNER_EMAIL`, and `OWNER_NAME`. These are runtime variables populated from OAuth login.
-* The **Create Data** component combines the secret inputs into a structured data object that will be associated with the document embeddings.
-* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) generates vector embeddings using OpenAI's `text-embedding-3-small` model. The embedding model is selected during application onboarding and cannot be changed.
-* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) stores the processed documents and their embeddings in the `documents` index at `https://opensearch:9200`. By default, the component is authenticated with a JWT token, but you can also select `basic` auth mode, and enter your OpenSearch admin username and password.
-
-To customize this flow, see [Inspect and modify flows](/agents#inspect-and-modify-flows).
-
-### OpenSearch URL Ingestion flow {#url-flow}
-
-The **OpenSearch URL Ingestion** flow ingests web content from URLs.
-This flow isn't directly accessible from the OpenRAG user interface.
-Instead, this flow is called by the [**OpenRAG OpenSearch Agent** flow](/agents#flow) as a Model Context Protocol (MCP) tool.
-The agent can call this flow to fetch web content from a given URL, and then ingest that content into your OpenSearch knowledge base.
-
-For more information about MCP in Langflow, see the Langflow documentation on [MCP clients](https://docs.langflow.org/mcp-client) and [MCP servers](https://docs.langflow.org/mcp-server).
-
-## Use OpenRAG default ingestion instead of Docling serve
-
-If you want to use OpenRAG's built-in pipeline instead of Docling serve, set `DISABLE_INGEST_WITH_LANGFLOW=true` in [Environment variables](/reference/configuration#document-processing).
-
-The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API.
-
-For more information, see [`processors.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58).
-
-## Performance expectations
-
-On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.
-This equates to approximately 2.4 documents per second.
-
-You can generally expect equal or better performance on developer laptops and significantly faster on servers.
-Throughput scales with CPU cores, memory, storage speed, and configuration choices such as embedding model, chunk size and overlap, and concurrency.
-
-This test returned 12 errors (approximately 1.1%).
-All errors were file-specific, and they didn't stop the pipeline.
-
-Ingestion dataset:
-
-* Total files: 1,083 items mounted
-* Total size on disk: 5,026,474,862 bytes (approximately 5.03 GB)
-
-Hardware specifications:
-
-* Machine: Apple M4 Pro
-* Podman VM:
- * Name: `podman-machine-default`
- * Type: `applehv`
- * vCPUs: 7
- * Memory: 8 GiB
- * Disk size: 100 GiB
-
-Test results:
-
-```text
-2025-09-24T22:40:45.542190Z /app/src/main.py:231 Ingesting default documents when ready disable_langflow_ingest=False
-2025-09-24T22:40:45.546385Z /app/src/main.py:270 Using Langflow ingestion pipeline for default documents file_count=1082
-...
-2025-09-24T23:19:44.866365Z /app/src/main.py:351 Langflow ingestion completed success_count=1070 error_count=12 total_files=1082
-```
-
-Elapsed time: ~42 minutes 15 seconds (2,535 seconds)
-
-Throughput: ~2.4 documents/second
\ No newline at end of file
+
\ No newline at end of file
diff --git a/docs/docs/core-components/knowledge-configure.mdx b/docs/docs/core-components/knowledge-configure.mdx
new file mode 100644
index 00000000..f3829d42
--- /dev/null
+++ b/docs/docs/core-components/knowledge-configure.mdx
@@ -0,0 +1,58 @@
+---
+title: Configure OpenSearch in OpenRAG
+slug: /knowledge-configure
+---
+
+import Icon from "@site/src/components/icon/icon";
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+OpenRAG includes a built-in [OpenSearch](https://docs.opensearch.org/latest/) instance that serves as the underlying datastore for your knowledge.
+This specialized database is used to store and retrieve your documents and the associated vector data (embeddings).
+
+The [OpenRAG **Chat**](/chat) runs [similarity searches](https://www.ibm.com/think/topics/vector-search) against your OpenSearch database to retrieve relevant information and generate context-aware responses.
+
+Additionally, OpenSearch provides powerful hybrid search capabilities with enterprise-grade security and multi-tenancy support.
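+
+As an illustration of how vector search ranks content (a simplified sketch, not OpenRAG's internal implementation), the following Python example scores stored chunks by cosine similarity between their embeddings and a query embedding:
+
+```python
+import numpy as np
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    # 1.0 means the vectors point in the same direction; values near 0 mean unrelated content
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+# Toy three-dimensional embeddings; in OpenRAG, embeddings come from your selected embedding model
+query = np.array([0.1, 0.9, 0.2])
+chunks = {
+    "chunk-1": np.array([0.2, 0.8, 0.1]),
+    "chunk-2": np.array([0.9, 0.1, 0.4]),
+}
+
+# Rank chunks by similarity to the query, most relevant first
+ranked = sorted(chunks, key=lambda name: cosine_similarity(query, chunks[name]), reverse=True)
+print(ranked)  # ['chunk-1', 'chunk-2']
+```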
+
+## OpenSearch authentication and document access {#auth}
+
+When you [install OpenRAG](/install), you can choose between two setup modes: **Basic Setup** and **Advanced Setup**.
+The mode you choose determines how OpenRAG authenticates with OpenSearch and controls access to documents:
+
+* **Basic Setup (no-auth mode)**: If you choose **Basic Setup**, then OpenRAG is installed in no-auth mode.
+This mode uses a single anonymous JWT token for OpenSearch authentication, so there is no differentiation between users.
+All users who access your OpenRAG instance can access all documents uploaded to your OpenSearch `documents` index.
+
+* **Advanced Setup (OAuth mode)**: If you choose **Advanced Setup**, then OpenRAG is installed in OAuth mode.
+This mode uses a unique JWT token for each OpenRAG user, and each document is tagged with user ownership.
+Documents are filtered by owner, so users see only the documents that they uploaded or have access to.
+
+You can enable OAuth mode after installation.
+For more information, see [Ingest files through OAuth connectors](/knowledge#oauth-ingestion).
+
+## Set the embedding model and dimensions {#set-the-embedding-model-and-dimensions}
+
+When you [install OpenRAG](/install), you select an embedding model during **Application Onboarding**.
+OpenRAG automatically detects and configures the appropriate vector dimensions for your selected embedding model, ensuring optimal search performance and compatibility.
+
+In the OpenRAG repository, you can find the complete list of supported models in [`models_service.py`](https://github.com/langflow-ai/openrag/blob/main/src/services/models_service.py) and the corresponding vector dimensions in [`settings.py`](https://github.com/langflow-ai/openrag/blob/main/src/config/settings.py).
+
+The default embedding dimension is `1536` and the default model is `text-embedding-3-small`.
+
+You can also use an embedding model that isn't in the supported list by specifying the model in your OpenRAG configuration during installation.
+
+If you use an unsupported embedding model that doesn't have defined dimensions in `settings.py`, then OpenRAG falls back to the default dimensions (1536) and logs a warning. OpenRAG's OpenSearch instance and flows continue to work, but [similarity search](https://www.ibm.com/think/topics/vector-search) quality can be affected if the actual model dimensions aren't 1536.
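+
+Conceptually, the fallback works like the following sketch. The dictionary here is an abbreviated, illustrative stand-in for the real mapping in `settings.py`:
+
+```python
+# Illustrative subset only; see settings.py in the OpenRAG repository for the actual mapping
+KNOWN_DIMENSIONS = {
+    "text-embedding-3-small": 1536,
+}
+DEFAULT_DIMENSION = 1536
+
+def resolve_dimension(model: str) -> int:
+    if model not in KNOWN_DIMENSIONS:
+        # Unknown models fall back to the default dimensions, and a warning is logged
+        print(f"warning: unknown embedding model {model!r}, assuming {DEFAULT_DIMENSION} dimensions")
+    return KNOWN_DIMENSIONS.get(model, DEFAULT_DIMENSION)
+
+print(resolve_dimension("my-custom-embedder"))  # warns, then prints 1536
+```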
+
+The embedding model setting is immutable.
+To change the embedding model, you must [reinstall OpenRAG](/install#reinstall).
+
+## Set ingestion parameters
+
+For information about modifying ingestion parameters and flows, see [Knowledge ingestion settings](/knowledge#knowledge-ingestion-settings) and [Knowledge ingestion flows](/knowledge#knowledge-ingestion-flows).
+
+## See also
+
+* [Ingest knowledge](/knowledge)
+* [Filter knowledge](/knowledge-filters)
+* [Chat with knowledge](/chat)
\ No newline at end of file
diff --git a/docs/docs/core-components/knowledge-filters.mdx b/docs/docs/core-components/knowledge-filters.mdx
new file mode 100644
index 00000000..1b3a974b
--- /dev/null
+++ b/docs/docs/core-components/knowledge-filters.mdx
@@ -0,0 +1,53 @@
+---
+title: Filter knowledge
+slug: /knowledge-filters
+---
+
+import Icon from "@site/src/components/icon/icon";
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+OpenRAG's knowledge filters help you organize and manage your [knowledge base](/knowledge-configure) by creating pre-defined views of your documents.
+
+Each knowledge filter captures a specific subset of documents based on a given search query and filter criteria.
+
+Knowledge filters can be used across different OpenRAG features.
+For example, knowledge filters help agents work efficiently with large knowledge bases by narrowing the scope of documents that the agent uses.
+
+## Create a filter
+
+To create a knowledge filter, do the following:
+
+1. Click **Knowledge**, and then click **Knowledge Filters**.
+
+2. Enter a **Name** and **Description**, and then click **Create Filter**.
+
+   By default, new filters match all documents in your knowledge base.
+   To narrow the filter's scope, edit its settings as described in the next step.
+
+3. To modify the filter, click **Knowledge**, and then click your new filter. You can edit the following settings:
+
+ * **Search Query**: Enter text for semantic search, such as `financial reports from Q4`.
+ * **Data Sources**: Select specific data sources or folders to include.
+ * **Document Types**: Filter by file type.
+   * **Owners**: Filter by the user who uploaded the documents.
+ * **Connectors**: Filter by [upload source](/knowledge), such as the local file system or a Google Drive OAuth connector.
+ * **Response Limit**: Set the maximum number of results to return from the knowledge base. The default is `10`.
+ * **Score Threshold**: Set the minimum relevance score for similarity search. The default score is `0`.
+
+4. To save your changes, click **Update Filter**.
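+
+Conceptually, a saved filter constrains the search that runs against your OpenSearch `documents` index. As a rough, hypothetical illustration (not the exact query that OpenRAG builds), the settings above map to search parameters like these:
+
+```python
+# Hypothetical translation of a knowledge filter into OpenSearch-style query parameters
+knowledge_filter = {
+    "search_query": "financial reports from Q4",  # text used for semantic search
+    "document_types": ["application/pdf"],        # file-type filter (stored as `mimetype` metadata)
+    "response_limit": 10,                         # default maximum number of results
+    "score_threshold": 0,                         # default minimum relevance score
+}
+
+query_body = {
+    "size": knowledge_filter["response_limit"],      # cap the number of hits returned
+    "min_score": knowledge_filter["score_threshold"],
+    "query": {
+        "bool": {
+            "filter": [{"terms": {"mimetype": knowledge_filter["document_types"]}}],
+        }
+    },
+}
+```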
+
+## Apply a filter {#apply-a-filter}
+
+* **Apply a global filter**: Click **Knowledge**, and then enable the toggle next to your preferred filter. Only one filter can be the global filter. The global filter applies to all chat sessions.
+
+* **Apply a chat filter**: In the **Chat** window, click **Filter**, and then select the filter to apply.
+Chat filters apply to one chat session only.
+
+## Delete a filter
+
+1. Click **Knowledge**.
+
+2. Click the filter that you want to delete.
+
+3. Click **Delete Filter**.
\ No newline at end of file
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index 05f1caec..cb8afaad 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -1,5 +1,5 @@
---
-title: OpenSearch in OpenRAG
+title: Ingest knowledge
slug: /knowledge
---
@@ -7,24 +7,22 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-OpenRAG uses [OpenSearch](https://docs.opensearch.org/latest/) for its vector-backed knowledge store.
-This is a specialized database for storing and retrieving embeddings, which helps your Agent efficiently find relevant information.
-OpenSearch provides powerful hybrid search capabilities with enterprise-grade security and multi-tenancy support.
+The documents in your OpenRAG [OpenSearch knowledge base](/knowledge-configure) provide specialized context in addition to the general knowledge available to the language model that you select when you [install OpenRAG](/install).
+Upload documents to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.
+Then, the [OpenRAG **Chat**](/chat) can retrieve relevant content from your knowledge base to provide context-aware responses.
-## Authentication and document access {#auth}
+OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth connectors.
-OpenRAG supports two authentication modes based on how you [install OpenRAG](/install), and which mode you choose affects document access.
+Knowledge ingestion is powered by OpenRAG's built-in [knowledge ingestion flows](/knowledge#knowledge-ingestion-flows) that use Docling Serve to process documents before storing the documents in your OpenSearch database.
-**No-auth mode (Basic Setup)**: This mode uses a single anonymous JWT token for OpenSearch authentication, so documents uploaded to the `documents` index by one user are visible to all other users on the OpenRAG server.
+During ingestion, documents are broken into smaller chunks of content that are then embedded using your selected [embedding model](/knowledge-configure#set-the-embedding-model-and-dimensions).
+The chunks, embeddings, and associated metadata (which connects chunks of the same document) are stored in your OpenSearch database.
-**OAuth mode (Advanced Setup)**: Each OpenRAG user is granted a JWT token, and each document is tagged with user ownership. Documents are filtered by user ownership, ensuring users only see documents they uploaded or have access to.
+Like all [OpenRAG flows](/agents), the ingestion flows can be [inspected in Langflow](/agents#inspect-and-modify-flows) and customized if you want to change the [ingestion settings](/knowledge#knowledge-ingestion-settings).
-## Ingest knowledge
+## Ingest local files and folders {#knowledge-ingestion-flows}
-OpenRAG supports knowledge ingestion through direct file uploads and OAuth connectors.
-To configure the knowledge ingestion pipeline parameters, see [Docling Ingestion](/ingestion).
-
-### Direct file ingestion
+
The **OpenSearch Ingestion** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
@@ -38,9 +36,35 @@ The files are loaded into your OpenSearch database, and appear in the Knowledge
To add files directly to a chat session, click in the chat input and select the files you want to include. Files added this way are processed and made available to the agent for the current conversation, and are not permanently added to the knowledge base.
-
+### OpenSearch Ingestion flow
-### Ingest files through OAuth connectors {#oauth-ingestion}
+
+
+The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in OpenRAG. When you **Add Knowledge** in OpenRAG, the **OpenSearch Ingestion** flow runs in the background, using Docling Serve to import and process your documents.
+
+If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll see that it consists of ten components that work together to process and store documents in your knowledge base:
+
+* The [**Docling Serve** component](https://docs.langflow.org/bundles-docling) processes input documents by connecting to your instance of Docling Serve.
+* The [**Export DoclingDocument** component](https://docs.langflow.org/components-docling) exports the processed DoclingDocument to markdown format with the image export mode set to placeholder. This conversion turns the structured document data into a standardized format for further processing.
+* Three [**DataFrame Operations** components](https://docs.langflow.org/components-processing#dataframe-operations) sequentially add `filename`, `file_size`, and `mimetype` metadata columns to the document data.
+* The [**Split Text** component](https://docs.langflow.org/components-processing#split-text) splits the processed text into chunks with a chunk size of 1000 characters and an overlap of 200 characters.
+* Four **Secret Input** components provide secure access to configuration variables: `CONNECTOR_TYPE`, `OWNER`, `OWNER_EMAIL`, and `OWNER_NAME`. These are runtime variables populated from OAuth login.
+* The **Create Data** component combines the secret inputs into a structured data object that is associated with the document embeddings.
+* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) generates vector embeddings using OpenAI's `text-embedding-3-small` model. The embedding model is selected during [application onboarding](/install#application-onboarding) and cannot be changed.
+* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) stores the processed documents and their embeddings in the `documents` index at `https://opensearch:9200`. By default, the component is authenticated with a JWT token, but you can also select `basic` auth mode, and enter your OpenSearch admin username and password.
+
+To customize this flow, see [Inspect and modify flows](/agents#inspect-and-modify-flows).
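+
+To confirm that ingested documents reached the index, you can also query OpenSearch directly. The following minimal sketch assumes `basic` auth mode and the default index and address described above; adjust the address if you're calling from outside the container network:
+
+```python
+import requests
+
+# Count the documents stored in the `documents` index (assumes basic auth mode)
+response = requests.get(
+    "https://opensearch:9200/documents/_count",
+    auth=("admin", "YOUR_OPENSEARCH_PASSWORD"),  # replace with your OpenSearch admin credentials
+    verify=False,  # assumption: the bundled instance uses a self-signed certificate
+)
+print(response.json())  # for example: {"count": 1234, ...}
+```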
+
+## Ingest knowledge from URLs {#url-flow}
+
+The **OpenSearch URL Ingestion** flow is used to ingest web content from URLs.
+This flow isn't directly accessible from the OpenRAG user interface.
+Instead, this flow is called by the [**OpenRAG OpenSearch Agent** flow](/chat#flow) as a Model Context Protocol (MCP) tool.
+The agent can call this tool to fetch web content from a given URL, and then ingest that content into your OpenSearch knowledge base.
+
+For more information about MCP in Langflow, see the Langflow documentation on [MCP clients](https://docs.langflow.org/mcp-client) and [MCP servers](https://docs.langflow.org/mcp-tutorial).
+
+## Ingest files through OAuth connectors {#oauth-ingestion}
OpenRAG supports Google Drive, OneDrive, and Sharepoint as OAuth connectors for seamless document synchronization.
@@ -89,75 +113,145 @@ The ingestion process can take some time depending on the size of your documents
If ingestion fails, click **Status** to view the logged error.
-### Monitor ingestion tasks
+## Monitor ingestion
-When you upload files, process folders, or sync documents, OpenRAG processes them as background tasks.
-A badge appears on the **Tasks** icon when there are active tasks running.
-To open the Tasks menu, click **Tasks**.
+Document ingestion tasks run in the background.
-**Active Tasks** shows tasks that are currently processing.
-A **Pending** task is queued and waiting to start, a **Running** task is actively processing files, and a **Processing** task is performing ingestion operations. For each active task, you can find the task ID, start time, duration, the number of files processed so far, and the total files.
+In the OpenRAG UI, a badge is shown on **Tasks** when OpenRAG tasks are active.
+Click **Tasks** to inspect and cancel tasks:
-You can cancel active tasks by clicking **Cancel**. Canceling a task stops processing immediately and marks the task as failed.
+* **Active Tasks**: All tasks that are **Pending**, **Running**, or **Processing**.
+For each active task, depending on its state, you can find the task ID, start time, duration, number of files processed, and the total files enqueued for processing.
-## Browse knowledge
+* **Pending**: The task is queued and waiting to start.
-The **Knowledge** page lists the documents OpenRAG has ingested into the OpenSearch vector database's `documents` index.
+* **Running**: The task is actively processing files.
+
+* **Processing**: The task is performing ingestion operations.
+
+* **Failed**: Something went wrong during ingestion, or the task was manually canceled.
+
+To stop an active task, click **Cancel**. Canceling a task stops processing immediately and marks the task as **Failed**.
+
+## Browse knowledge {#browse-knowledge}
+
+The **Knowledge** page lists the documents OpenRAG has ingested into your OpenSearch database, specifically in the `documents` index.
To explore the raw contents of your knowledge base, click **Knowledge** to get a list of all ingested documents.
-Click a document to view the chunks produced from splitting the documents during ingestion into the vector database.
+Click a document to view the chunks produced from splitting the document during ingestion.
-Documents are processed with the default **OpenSearch Ingestion** flow. If you want to split your documents differently, edit the **OpenSearch Ingestion** flow, as explained in [Inspect and modify flows](/agents#inspect-and-modify-flows).
+## Troubleshoot ingestion
-## Chat with knowledge
+If an ingestion task fails, try the following:
-
+* Make sure you are uploading supported file types.
+* Split excessively large files into smaller files before uploading.
+* Remove unusual embedded content, such as videos or animations, before uploading. Although Docling can replace some non-text content with placeholders during ingestion, some embedded content might cause errors.
-
+If the OpenRAG **Chat** doesn't seem to use your documents correctly, [browse your knowledge base](#browse-knowledge) to confirm that the documents are uploaded in full and that the chunks are correct.
-## Create knowledge filters
+If the documents are present and well-formed, check your [knowledge filters](/knowledge-filters).
+If a global filter is applied, make sure the expected documents are included in the global filter.
+If the global filter excludes any documents, the agent cannot access those documents unless you apply a chat-level filter or change the global filter.
-OpenRAG includes a knowledge filter system for organizing and managing document collections.
-Knowledge filters are saved search configurations that allow you to create custom views of your document collection. They store search queries, filter criteria, and display settings that can be reused across different parts of OpenRAG.
+If text is missing or incorrectly processed, modify the ingestion parameters or the documents themselves, and then reupload the documents.
+For example:
-Knowledge filters help agents work more efficiently with large document collections by focusing their context within relevant documents sets.
+* Break combined documents into separate files for better metadata context.
+* Make sure scanned documents are legible enough for extraction, and enable the **OCR** option. Poorly scanned documents might require additional preparation or rescanning before ingestion.
+* Adjust the **Chunk Size** and **Chunk Overlap** settings to better suit your documents. Larger chunks provide more context but can include irrelevant information, while smaller chunks yield more precise semantic search but can lack context.
-To create a knowledge filter, do the following:
+For more information about modifying ingestion parameters and flows, see [Docling Serve for knowledge ingestion](/knowledge#docling-serve-for-knowledge-ingestion).
-1. Click **Knowledge**, and then click **Knowledge Filters**.
- The **Knowledge Filter** pane appears.
-2. Enter a **Name** and **Description**, and then click **Create Filter**.
-A new filter is created with default settings that match all documents.
-3. To modify the filter, click **Knowledge**, and then click your new filter to edit it in the **Knowledge Filter** pane.
+## Docling Serve for knowledge ingestion {#docling-serve-for-knowledge-ingestion}
- The following filter options are configurable.
-
- * **Search Query**: Enter text for semantic search, such as "financial reports from Q4".
- * **Data Sources**: Select specific data sources or folders to include.
- * **Document Types**: Filter by file type.
- * **Owners**: Filter by who uploaded the documents.
- * **Connectors**: Filter by connector types, such as local upload or Google Drive.
- * **Response Limit**: Set maximum number of results. The default is `10`.
- * **Score Threshold**: Set minimum relevance score. The default score is `0`.
+
-4. When you're done editing the filter, click **Update Filter**.
+OpenRAG uses [Docling](https://docling-project.github.io/docling/) for document ingestion.
+More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling serve` process on your local machine and runs Docling ingestion through an API service.
-5. To apply the filter to OpenRAG globally, click **Knowledge**, and then select the filter to apply. One filter can be enabled at a time.
+Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch `documents` index.
- To apply the filter to a single chat session, in the **Chat** window, click , and then select the filter to apply.
+OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.
- To delete the filter, in the **Knowledge Filter** pane, click **Delete Filter**.
+### Knowledge ingestion settings {#knowledge-ingestion-settings}
-## OpenRAG default configuration
+To modify OpenRAG's ingestion settings, including the Docling settings and ingestion flows, click **Settings**.
-OpenRAG automatically detects and configures the correct vector dimensions for embedding models, ensuring optimal search performance and compatibility.
+These settings configure the Docling ingestion parameters.
-The complete list of supported models is available at [`models_service.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/services/models_service.py).
+OpenRAG warns you if `docling serve` isn't running.
+To start or stop `docling serve` or any other native services, in the TUI main menu, click **Start Native Services** or **Stop Native Services**.
-You can use custom embedding models by specifying them in your configuration.
+**Embedding model** determines which AI model is used to create vector embeddings. The default is the OpenAI `text-embedding-3-small` model.
-If you use an unknown embedding model, OpenRAG automatically falls back to `1536` dimensions and logs a warning. The system continues to work, but search quality can be affected if the actual model dimensions differ from `1536`.
+**Chunk size** determines the length of each text chunk in characters.
+Larger chunks yield more context per chunk, but can include irrelevant information. Smaller chunks yield more precise semantic search, but can lack context.
+The default value of `1000` characters provides a good starting point that balances these considerations.
-The default embedding dimension is `1536` and the default model is `text-embedding-3-small`.
+**Chunk overlap** controls the number of characters that overlap across chunk boundaries.
+Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.
+The default value of `200` characters of overlap with a chunk size of `1000` (20 percent overlap) is suitable for general use cases. Decrease the overlap to 10 percent for a more efficient pipeline, or increase it to 40 percent for more complex documents.
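+
+To make the size and overlap settings concrete, the following sketch shows plain character-based chunking with the default values. It's a simplified stand-in for the **Split Text** component, which also considers separators:
+
+```python
+def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
+    # Each chunk starts (chunk_size - overlap) characters after the previous one,
+    # so consecutive chunks share `overlap` characters of context.
+    step = chunk_size - overlap
+    return [text[i:i + chunk_size] for i in range(0, max(len(text) - overlap, 1), step)]
+
+chunks = chunk_text("x" * 2500)
+print([len(c) for c in chunks])  # [1000, 1000, 900]; neighboring chunks overlap by 200 characters
+```
+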
-For models with known vector dimensions, see [`settings.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/config/settings.py).
\ No newline at end of file
+**Table Structure** enables Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/) tool for parsing tables. Instead of treating tables as plain text, tables are output as structured table data with preserved relationships and metadata. **Table Structure** is enabled by default.
+
+**OCR** enables or disables OCR processing when extracting text from images and scanned documents.
+OCR is disabled by default. The disabled default is best suited for processing text-based documents as quickly as possible with Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/) because images are ignored and not processed.
+
+Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance.
+
+If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the [ocrmac](https://www.piwheels.org/project/ocrmac/) OCR engine. Other platforms use [easyocr](https://www.jaided.ai/easyocr/).
+
+**Picture descriptions** adds image descriptions generated by the [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model to OCR processing. Enabling picture descriptions can slow ingestion performance.
+
+### Use OpenRAG's default ingestion instead of Docling Serve
+
+If you want to use OpenRAG's built-in pipeline instead of Docling Serve, set `DISABLE_INGEST_WITH_LANGFLOW=true` in [Environment variables](/reference/configuration#document-processing).
+
+The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API.
+
+For more information, see [`processors.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58).
+
+## Ingestion performance expectations
+
+On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.
+This equates to approximately 2.4 documents per second.
+
+You can generally expect equal or better performance on developer laptops and significantly faster on servers.
+Throughput scales with CPU cores, memory, storage speed, and configuration choices such as embedding model, chunk size and overlap, and concurrency.
+
+This test returned 12 errors (approximately 1.1 percent).
+All errors were file-specific, and they didn't stop the pipeline.
+
+* Ingestion dataset:
+
+ * Total files: 1,083 items mounted
+ * Total size on disk: 5,026,474,862 bytes (approximately 5.03 GB)
+
+* Hardware specifications:
+
+ * Machine: Apple M4 Pro
+ * Podman VM:
+ * Name: `podman-machine-default`
+ * Type: `applehv`
+ * vCPUs: 7
+ * Memory: 8 GiB
+ * Disk size: 100 GiB
+
+* Test results:
+
+ ```text
+ 2025-09-24T22:40:45.542190Z /app/src/main.py:231 Ingesting default documents when ready disable_langflow_ingest=False
+ 2025-09-24T22:40:45.546385Z /app/src/main.py:270 Using Langflow ingestion pipeline for default documents file_count=1082
+ ...
+ 2025-09-24T23:19:44.866365Z /app/src/main.py:351 Langflow ingestion completed success_count=1070 error_count=12 total_files=1082
+ ```
+
+* Elapsed time: Approximately 42 minutes 15 seconds (2,535 seconds)
+
+* Throughput: Approximately 2.4 documents/second
+
+## See also
+
+* [Configure OpenSearch in OpenRAG](/knowledge-configure)
+* [Filter knowledge](/knowledge-filters)
\ No newline at end of file
diff --git a/docs/docs/get-started/install.mdx b/docs/docs/get-started/install.mdx
index 3c1c4bcc..3d6e4b73 100644
--- a/docs/docs/get-started/install.mdx
+++ b/docs/docs/get-started/install.mdx
@@ -190,14 +190,14 @@ If the TUI detects OAuth credentials, it enforces the **Advanced Setup** path.
**Basic Setup** can generate all of the required values for OpenRAG. The OpenAI API key is optional and can be provided during onboarding.
**Basic Setup** does not set up OAuth connections for ingestion from cloud providers.
For OAuth setup, use **Advanced Setup**.
- For information about the difference between basic (no auth) and OAuth in OpenRAG, see [Authentication and document access](/knowledge#auth).
+ For information about the difference between basic (no auth) and OAuth in OpenRAG, see [OpenSearch authentication and document access](/knowledge-configure#auth).
- 1. To install OpenRAG with **Basic Setup**, click **Basic Setup** or press 1.
+ 1. To install OpenRAG with **Basic Setup**, click **Basic Setup** or press 1.
2. Click **Generate Passwords** to generate passwords for OpenSearch and Langflow.
-
+
The OpenSearch password is required. The Langflow admin password is optional.
If no Langflow admin password is generated, Langflow runs in [autologin mode](https://docs.langflow.org/api-keys-and-authentication#langflow-auto-login) with no password required.
-
+
3. Optional: Paste your OpenAI API key in the OpenAI API key field. You can also provide this during onboarding or choose a different model provider.
4. Click **Save Configuration**.
Your passwords are saved in the `.env` file used to start OpenRAG.
diff --git a/docs/docs/get-started/quickstart.mdx b/docs/docs/get-started/quickstart.mdx
index 4c4b7b1f..4256158c 100644
--- a/docs/docs/get-started/quickstart.mdx
+++ b/docs/docs/get-started/quickstart.mdx
@@ -84,7 +84,7 @@ You can complete this quickstart without going through the overview.
## Load and chat with documents {#chat-with-documents}
-OpenRAG's knowledge base chat is powered by the [OpenRAG OpenSearch Agent](/agents).
+Use the [OpenRAG **Chat**](/chat) to explore the documents in your OpenRAG database using natural language queries.
Some documents are included by default to get you started, and you can load your own documents.
1. In OpenRAG, click **Chat**.
@@ -116,7 +116,7 @@ You can click a document to view the chunks of the document as they are stored i
* Click **Settings** to modify the knowledge ingestion settings.
- For more information about knowledge bases, knowledge ingestion, and the OpenRAG **Chat**, see [OpenSearch in OpenRAG](/knowledge).
+ For more information, see [Configure OpenSearch in OpenRAG](/knowledge-configure) and [Ingest knowledge](/knowledge).
## Change the language model and chat settings {#change-components}
diff --git a/docs/docs/reference/configuration.mdx b/docs/docs/reference/configuration.mdx
index 1dbc4198..a5d6e8d9 100644
--- a/docs/docs/reference/configuration.mdx
+++ b/docs/docs/reference/configuration.mdx
@@ -71,8 +71,7 @@ For more information, see [Application onboarding](/install#application-onboardi
### Document processing
-Control how OpenRAG processes and ingests documents into your knowledge base.
-For more information, see [Ingestion](/ingestion).
+Control how OpenRAG [processes and ingests documents](/knowledge) into your knowledge base.
| Variable | Default | Description |
|----------|---------|-------------|
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 2381a125..c8a57c97 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -25,25 +25,23 @@ const sidebars = {
"get-started/install",
"get-started/docker",
{
- type: "category",
- label: "Langflow in OpenRAG",
- items: [
- "core-components/agents",
- ],
+ type: "doc",
+ id: "core-components/agents",
+ label: "Flows",
},
{
type: "category",
- label: "OpenSearch in OpenRAG",
+ label: "Knowledge",
items: [
+ "core-components/knowledge-configure",
"core-components/knowledge",
+ "core-components/knowledge-filters",
],
},
{
- type: "category",
- label: "Docling in OpenRAG",
- items: [
- "core-components/ingestion",
- ],
+ type: "doc",
+ id: "core-components/chat",
+ label: "Chat",
},
"reference/configuration",
"support/troubleshoot",
From 33df187f59888a08632efbeb98376562242825ce Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Tue, 25 Nov 2025 18:22:40 -0800
Subject: [PATCH 06/13] add topic backto nav
---
docs/sidebars.js | 1 +
1 file changed, 1 insertion(+)
diff --git a/docs/sidebars.js b/docs/sidebars.js
index c8a57c97..df240798 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -35,6 +35,7 @@ const sidebars = {
items: [
"core-components/knowledge-configure",
"core-components/knowledge",
+ "core-components/ingestion",
"core-components/knowledge-filters",
],
},
From 3101dd0ed5377b2b6569d84220e9c979b663289f Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Wed, 26 Nov 2025 06:14:20 -0800
Subject: [PATCH 07/13] polish agents page
---
docs/docs/core-components/agents.mdx | 33 ++++++++++++----------------
docs/docs/core-components/chat.mdx | 2 +-
2 files changed, 15 insertions(+), 20 deletions(-)
diff --git a/docs/docs/core-components/agents.mdx b/docs/docs/core-components/agents.mdx
index dc6ce5d2..1ffe19f1 100644
--- a/docs/docs/core-components/agents.mdx
+++ b/docs/docs/core-components/agents.mdx
@@ -7,35 +7,30 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-OpenRAG includes a built-in [Langflow](https://docs.langflow.org/) instance for creating and managing application workflows called [_flows_](https://docs.langflow.org/concepts-overview).
+OpenRAG includes a built-in [Langflow](https://docs.langflow.org/) instance for creating and managing functional application workflows called _flows_.
In a flow, the individual workflow steps are represented by [_components_](https://docs.langflow.org/concepts-components) that are connected together to form a complete process.
-OpenRAG includes several built-in flows that you can customize.
-You can also create your own flows using OpenRAG's embedded Langflow visual editor.
-
-## Built-in flows
+OpenRAG includes several built-in flows:
* The [**OpenRAG OpenSearch Agent** flow](/chat#flow) powers the **Chat** feature in OpenRAG.
* The [**OpenSearch Ingestion** and **OpenSearch URL Ingestion** flows](/knowledge#knowledge-ingestion-flows) process documents and web content for storage in your OpenSearch knowledge base.
* The [**OpenRAG OpenSearch Nudges** flow](/chat#nudges) provides optional contextual suggestions in the OpenRAG **Chat**.
+You can customize these flows and create your own flows using OpenRAG's embedded Langflow visual editor.
+
## Inspect and modify flows {#inspect-and-modify-flows}
All OpenRAG flows are designed to be modular, performant, and provider-agnostic.
-OpenRAG provides quick access to common settings and an embedded [Langflow visual editor](https://docs.langflow.org/concepts-overview) where you can fully customize the flows to suit your use case.
-
-To modify a flow, click **Settings**. Then, to launch the embedded Langflow visual editor, click **Edit in Langflow**.
+To modify a flow in OpenRAG, click **Settings**.
+From here, you can quickly edit commonly used parameters, such as the **Language model** and **Agent Instructions**.
+To further explore and edit the flow, click **Edit in Langflow** to launch the embedded [Langflow visual editor](https://docs.langflow.org/concepts-overview) where you can fully [customize the flow](https://docs.langflow.org/concepts-flows) to suit your use case.
For example, to view and edit the built-in **Chat** flow (the **OpenRAG OpenSearch Agent** flow), do the following:
1. In OpenRAG, click **Chat**.
-2. Click **Settings**.
-
- On the **Settings** page, you can edit commonly used parameters like the **Language model** and **Agent Instructions**.
-
-3. For more detail and customization options, click **Edit in Langflow** to launch the Langflow visual editor in a new browser window.
+2. Click **Settings**, and then click **Edit in Langflow** to launch the Langflow visual editor in a new browser window.
If prompted to acknowledge that you are entering Langflow, click **Proceed**.
@@ -43,22 +38,22 @@ For example, to view and edit the built-in **Chat** flow (the **OpenRAG OpenSear

-4. Modify the flow as desired, and then press Command+S (Ctrl+S) to save your changes.
+3. Modify the flow as desired, and then press Command+S (Ctrl+S) to save your changes.
You can close the Langflow browser window, or leave it open if you want to continue experimenting with the flow editor.
:::tip
- If you modify the built-in **Chat** flow, make sure you click in the **Conversations** tab to start a new conversation. This ensures that the chat doesn't persist any context from the previous conversation with the original model.
+ If you modify the built-in **Chat** flow, make sure you click in the **Conversations** tab to start a new conversation. This ensures that the chat doesn't persist any context from the previous conversation with the original flow settings.
:::
-## Revert a built-in flow to its default state
+### Revert a built-in flow to its default state
After you edit a built-in flow, you can click **Restore flow** on the **Settings** page to revert the flow to its original state when you first installed OpenRAG.
This is a destructive action that discards all customizations to the flow.
-## Custom flows and additional Langflow functionality
+## Build custom flows and use other Langflow functionality
-In addition to OpenRAG's built-in flows, all Langflow features are available through OpenRAG, including popular extensibility features such as the following:
+In addition to OpenRAG's built-in flows, all Langflow features are available through OpenRAG, including the ability to [create your own flows](https://docs.langflow.org/concepts-flows) and popular extensibility features such as the following:
* [Create custom components](https://docs.langflow.org/components-custom-components).
* Integrate with many third-party services through [bundles](https://docs.langflow.org/components-bundle-components).
@@ -70,7 +65,7 @@ Explore the [Langflow documentation](https://docs.langflow.org/) to learn more a
By default, OpenRAG is pinned to the latest Langflow Docker image for stability.
-You can set a specific Langflow version with the [`LANGFLOW_VERSION`](/reference/configuration). However, there are risks to changing this setting:
+If necessary, you can set a specific Langflow version with the [`LANGFLOW_VERSION`](/reference/configuration) environment variable. However, there are risks to changing this setting:
* The [Langflow documentation](https://docs.langflow.org/) describes the functionality present in the latest release of the Langflow OSS Python package. If your `LANGFLOW_VERSION` is different, the Langflow documentation might not align with the features and default settings in your OpenRAG installation.
diff --git a/docs/docs/core-components/chat.mdx b/docs/docs/core-components/chat.mdx
index 31827aa1..92958285 100644
--- a/docs/docs/core-components/chat.mdx
+++ b/docs/docs/core-components/chat.mdx
@@ -59,4 +59,4 @@ When you use the OpenRAG **Chat**, the **OpenRAG OpenSearch Nudges** flow runs i
Nudges appear as prompts in the chat.
Click a nudge to accept it and provide the nudge's context to the OpenRAG **Chat** agent (the **OpenRAG OpenSearch Agent** flow).
-Like OpenRAG's other built-in flows, you can [inspect the flow in Langflow](#inspect-and-modify-flows), and you can customize it if you want to change the nudge behavior.
\ No newline at end of file
+Like OpenRAG's other built-in flows, you can [inspect the flow in Langflow](/agents#inspect-and-modify-flows), and you can customize it if you want to change the nudge behavior.
\ No newline at end of file
From 069135d563efcb49fd28ced9a371c9c6cf8929f2 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Wed, 26 Nov 2025 07:33:55 -0800
Subject: [PATCH 08/13] build out chat page
---
docs/docs/_partial-integrate-chat.mdx | 114 ++++++++++++++++++++++++
docs/docs/core-components/chat.mdx | 53 ++++++++---
docs/docs/core-components/ingestion.mdx | 14 ++-
docs/docs/core-components/knowledge.mdx | 2 +-
docs/docs/get-started/quickstart.mdx | 112 +----------------------
5 files changed, 169 insertions(+), 126 deletions(-)
create mode 100644 docs/docs/_partial-integrate-chat.mdx
diff --git a/docs/docs/_partial-integrate-chat.mdx b/docs/docs/_partial-integrate-chat.mdx
new file mode 100644
index 00000000..de3d9a62
--- /dev/null
+++ b/docs/docs/_partial-integrate-chat.mdx
@@ -0,0 +1,114 @@
+import Icon from "@site/src/components/icon/icon";
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+1. Open the **OpenRAG OpenSearch Agent** flow in the Langflow visual editor: From the **Chat** window, click **Settings**, click **Edit in Langflow**, and then click **Proceed**.
+
+2. Create a [Langflow API key](https://docs.langflow.org/api-keys-and-authentication), which is a user-specific token required to send requests to the Langflow server.
+This key doesn't grant access to OpenRAG.
+
+ 1. In the Langflow visual editor, click your user icon in the header, and then select **Settings**.
+ 2. Click **Langflow API Keys**, and then click **Add New**.
+ 3. Name your key, and then click **Create API Key**.
+ 4. Copy the API key and store it securely.
+ 5. Exit the Langflow **Settings** page to return to the visual editor.
+
+3. Click **Share**, and then select **API access** to get pregenerated code snippets that call the Langflow API and run the flow.
+
+ These code snippets construct API requests with your Langflow server URL (`LANGFLOW_SERVER_ADDRESS`), the flow to run (`FLOW_ID`), required headers (`LANGFLOW_API_KEY`, `Content-Type`), and a payload containing the required inputs to run the flow, including a default chat input message.
+
+ In production, you would modify the inputs to suit your application logic. For example, you could replace the default chat input message with dynamic user input.
+
+
+
+
+ ```python
+ import requests
+ import os
+ import uuid
+
+ api_key = 'LANGFLOW_API_KEY'
+ url = "http://LANGFLOW_SERVER_ADDRESS/api/v1/run/FLOW_ID" # The complete API endpoint URL for this flow
+
+ # Request payload configuration
+ payload = {
+ "output_type": "chat",
+ "input_type": "chat",
+ "input_value": "hello world!"
+ }
+ payload["session_id"] = str(uuid.uuid4())
+
+ headers = {"x-api-key": api_key}
+
+ try:
+ # Send API request
+ response = requests.request("POST", url, json=payload, headers=headers)
+ response.raise_for_status() # Raise exception for bad status codes
+
+ # Print response
+ print(response.text)
+
+ except requests.exceptions.RequestException as e:
+ print(f"Error making API request: {e}")
+ except ValueError as e:
+ print(f"Error parsing response: {e}")
+ ```
+
+
+
+
+ ```typescript
+ const crypto = require('crypto');
+ const apiKey = 'LANGFLOW_API_KEY';
+ const payload = {
+ "output_type": "chat",
+ "input_type": "chat",
+ "input_value": "hello world!"
+ };
+ payload.session_id = crypto.randomUUID();
+
+ const options = {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ "x-api-key": apiKey
+ },
+ body: JSON.stringify(payload)
+ };
+
+ fetch('http://LANGFLOW_SERVER_ADDRESS/api/v1/run/FLOW_ID', options)
+ .then(response => response.json())
+ .then(response => console.warn(response))
+ .catch(err => console.error(err));
+ ```
+
+
+
+
+ ```bash
+ curl --request POST \
+ --url 'http://LANGFLOW_SERVER_ADDRESS/api/v1/run/FLOW_ID?stream=false' \
+ --header 'Content-Type: application/json' \
+ --header "x-api-key: LANGFLOW_API_KEY" \
+ --data '{
+ "output_type": "chat",
+ "input_type": "chat",
+ "input_value": "hello world!"
+ }'
+ ```
+
+
+
+
+4. Copy your preferred snippet, and then run it:
+
+ * **Python**: Paste the snippet into a `.py` file, save it, and then run it with `python filename.py`.
+ * **TypeScript**: Paste the snippet into a `.ts` file, save it, and then run it with `ts-node filename.ts`.
+ * **curl**: Paste and run the snippet directly in your terminal.
+
+If the request is successful, the response includes many details about the flow run, such as the session ID, inputs, outputs, components, and durations.
+
+In production, you won't pass the raw response to the user in its entirety.
+Instead, you extract and reformat relevant fields for different use cases, as demonstrated in the [Langflow quickstart](https://docs.langflow.org/quickstart#extract-data-from-the-response).
+For example, you could pass the chat output text to a front-end user-facing application, and store specific fields in logs and backend data stores for monitoring, chat history, or analytics.
+You could also pass the output from one flow as input to another flow.
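+
+As a minimal sketch (assuming the response shape shown in the Langflow quickstart), the following lines extract just the chat text from the Python snippet's `response` object; verify the path against your own output:
+
+```python
+# Continues from the Python snippet above: `response` holds the flow run result
+data = response.json()
+
+# Assumed location of the chat text in the run response; adjust if your flow's outputs differ
+chat_text = data["outputs"][0]["outputs"][0]["results"]["message"]["text"]
+print(chat_text)
+```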
\ No newline at end of file
diff --git a/docs/docs/core-components/chat.mdx b/docs/docs/core-components/chat.mdx
index 92958285..4137e4e1 100644
--- a/docs/docs/core-components/chat.mdx
+++ b/docs/docs/core-components/chat.mdx
@@ -6,10 +6,13 @@ slug: /chat
import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
+import PartialIntegrateChat from '@site/docs/_partial-integrate-chat.mdx';
-After you [upload documents to your knowledge base](/knowledge), you can use the OpenRAG **Chat** feature to interact with your knowledge through natural language queries.
+After you [upload documents to your knowledge base](/knowledge), you can use the OpenRAG **Chat** feature to interact with your knowledge through natural language queries.
-
+:::tip
+Try chatting, uploading documents, and modifying chat settings in the [quickstart](/quickstart).
+:::
## OpenRAG OpenSearch Agent flow {#flow}
@@ -19,10 +22,14 @@ If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll

-* The [**Agent** component](https://docs.langflow.org/agents) orchestrates the entire flow by deciding when to search the knowledge base, how to formulate search queries, and how to combine retrieved information with the user's question to generate a comprehensive response.
-The **Agent** behaves according to the prompt in the **Agent Instructions** field.
+* [**Chat Input** component](https://docs.langflow.org/components-io): This component starts the flow when it receives a chat message. It is connected to the **Agent** component's **Input** port.
+When you use the OpenRAG **Chat**, your chat messages are passed to the **Chat Input** component, which then sends them to the **Agent** component for processing.
- The Agent component is the star of this flow because it powers decision making, tool calling, and an LLM-driven conversational experience.
+* [**Agent** component](https://docs.langflow.org/agents): This component orchestrates the entire flow by processing chat messages, searching the knowledge base, and organizing the retrieved information into a cohesive response.
+The agent's general behavior is defined by the prompt in the **Agent Instructions** field and the model connected to the **Language Model** port.
+One or more specialized tools can be attached to the **Tools** port to extend the agent's capabilities. In this case, there are two tools: **MCP Tools** and **OpenSearch**.
+
+ The **Agent** component is the star of this flow because it powers decision making, tool calling, and an LLM-driven conversational experience.
How do agents work?
@@ -37,20 +44,21 @@ The **Agent** behaves according to the prompt in the **Agent Instructions** fiel
-* The [**Chat Input** component](https://docs.langflow.org/components-io) is connected to the Agent component's Input port. This allows to flow to be triggered by an incoming prompt from a user or application.
+* [**Language Model** component](https://docs.langflow.org/components-models): Connected to the **Agent** component's **Language Model** port, this component provides the base language model driver for the agent. The agent cannot function without a model because the model is used for general knowledge, reasoning, and generating responses.
-* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) is connected to the Agent component's Tools port. The agent might not use this database for every request; the agent only uses this connection if it decides the knowledge can help respond to the prompt.
+ Different models can change the style and content of the agent's responses, and some models might be better suited for certain tasks than others. If the agent doesn't seem to be handling requests well, try changing the model to see how the responses change. For example, fast models might be good for simple queries, but they might not have the depth of reasoning for complex, multi-faceted queries.
-* The [**Language Model** component](https://docs.langflow.org/components-models) is connected to the Agent component's Language Model port. The agent uses the connected LLM to reason through the request sent through Chat Input.
+* [**MCP Tools** component](https://docs.langflow.org/mcp-client): Connected to the **Agent** component's **Tools** port, this component can be used to [access any Model Context Protocol (MCP) server](https://docs.langflow.org/mcp-server) and the MCP tools provided by that server. In this case, your OpenRAG Langflow instance's [**Starter Project**](https://docs.langflow.org/concepts-flows#projects) is the MCP server, and the [**OpenSearch URL Ingestion** flow](/knowledge#url-flow) is the MCP tool.
+This flow fetches content from URLs, and then stores the content in your OpenRAG OpenSearch knowledge base. By serving this flow as an MCP tool, the agent can selectively call this tool if a URL is detected in the chat input.
-* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) is connected to the OpenSearch component's Embedding port. This component converts text queries into vector representations that are compared with document embeddings stored in OpenSearch for semantic similarity matching. This gives your Agent's queries context.
+* [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch): Connected to the **Agent** component's **Tools** port, this component lets the agent search your [OpenRAG OpenSearch knowledge base](/knowledge). The agent might not use this database for every request; the agent uses this connection only if it decides that documents in your knowledge base are relevant to your query.
-* The [**Text Input** component](https://docs.langflow.org/components-io) is populated with the global variable `OPENRAG-QUERY-FILTER`.
-This filter is a [knowledge filter](/knowledge-filters) that limits the documents the Agent can access in the knowledge base if a global or chat filter is set.
+* [**Embedding Model** component](https://docs.langflow.org/components-embedding-models): Connected to the **OpenSearch** component's **Embedding** port, this component generates embeddings from chat input that are used in [similarity search](https://www.ibm.com/think/topics/vector-search) to find content in your knowledge base that is relevant to the chat input. The agent uses this information to generate context-aware responses that are specialized for your data.
-* The **Agent** component's Output port is connected to the [**Chat Output** component](https://docs.langflow.org/components-io), which returns the final response to the user or application.
+* [**Text Input** component](https://docs.langflow.org/components-io): Connected to the **OpenSearch** component's **Search Filters** port, this component is populated with a Langflow global variable named `OPENRAG-QUERY-FILTER`. If a global or chat-level [knowledge filter](/knowledge-filters) is set, then the variable contains the filter expression, which limits the documents that the agent can access in the knowledge base.
+If no knowledge filter is set, then the `OPENRAG-QUERY-FILTER` variable is empty, and the agent can access all documents in the knowledge base.
-* An [**MCP Tools** component](https://docs.langflow.org/mcp-client) is connected to the Agent's **Tools** port. This component calls the [**OpenSearch URL Ingestion** flow](/knowledge#url-flow), which Langflow uses as a [Model Context Protocol (MCP) server](https://docs.langflow.org/mcp-server) to fetch content from URLs and store in OpenSearch.
+* [**Chat Output** component](https://docs.langflow.org/components-io): Connected to the **Agent** component's **Output** port, this component returns the agent's generated response as a chat message.
## Nudges {#nudges}
@@ -59,4 +67,21 @@ When you use the OpenRAG **Chat**, the **OpenRAG OpenSearch Nudges** flow runs i
Nudges appear as prompts in the chat.
Click a nudge to accept it and provide the nudge's context to the OpenRAG **Chat** agent (the **OpenRAG OpenSearch Agent** flow).
-Like OpenRAG's other built-in flows, you can [inspect the flow in Langflow](/agents#inspect-and-modify-flows), and you can customize it if you want to change the nudge behavior.
\ No newline at end of file
+Like OpenRAG's other built-in flows, you can [inspect the flow in Langflow](/agents#inspect-and-modify-flows), and you can customize it if you want to change the nudge behavior.
+
+## Inspect tool calls and knowledge
+
+During the chat, you'll see information about the agent's process. For more detail, you can inspect individual tool calls. This is helpful for troubleshooting because it shows you how the agent used particular tools. For example, click **Function Call: search_documents (tool_call)** to view the log of tool calls made by the agent to the **OpenSearch** component.
+
+If documents in your knowledge base seem to be missing or interpreted incorrectly, see [Troubleshoot ingestion](/knowledge#troubleshoot-ingestion).
+
+If tool calls and knowledge appear normal, but the agent's responses seem off-topic or incorrect, consider changing the agent's language model or prompt, as explained in [Inspect and modify flows](/agents#inspect-and-modify-flows).
+
+## Integrate OpenRAG chat into an application
+
+You can integrate OpenRAG flows into your applications using the [Langflow API](https://docs.langflow.org/api-reference-api-examples).
+To simplify this integration, you can get pre-configured code snippets directly from the embedded Langflow visual editor.
+
+The following example demonstrates how to generate and use code snippets for the **OpenRAG OpenSearch Agent** flow:
+
+<PartialIntegrateChat />
\ No newline at end of file
diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx
index b4378bed..43f3a8aa 100644
--- a/docs/docs/core-components/ingestion.mdx
+++ b/docs/docs/core-components/ingestion.mdx
@@ -7,4 +7,16 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-
\ No newline at end of file
+
+
+
+ To verify the agent's response, click **Knowledge** to view the documents stored in the OpenRAG OpenSearch vector database.
+You can click a document to view the chunks of the document as they are stored in the database.
+
+4. Click **Add Knowledge** to add your own documents to your OpenRAG knowledge base.
+
+ For this quickstart, use either the **File** or **Folder** upload options to load documents from your local machine.
+ **Folder** uploads an entire directory.
+ The default directory is the `/documents` subdirectory in your OpenRAG installation directory.
+
+ For information about the cloud storage provider options, see [Ingest files through OAuth connectors](/knowledge#oauth-ingestion).
\ No newline at end of file
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index cb8afaad..9633e4e0 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -140,7 +140,7 @@ The **Knowledge** page lists the documents OpenRAG has ingested into your OpenSe
To explore the raw contents of your knowledge base, click **Knowledge** to get a list of all ingested documents.
Click a document to view the chunks produced from splitting the document during ingestion.
-## Troubleshoot ingestion
+## Troubleshoot ingestion {#troubleshoot-ingestion}
If an ingestion task fails, do the following:
diff --git a/docs/docs/get-started/quickstart.mdx b/docs/docs/get-started/quickstart.mdx
index 4256158c..4884de07 100644
--- a/docs/docs/get-started/quickstart.mdx
+++ b/docs/docs/get-started/quickstart.mdx
@@ -7,6 +7,7 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import PartialWsl from '@site/docs/_partial-wsl-install.mdx';
+import PartialIntegrateChat from '@site/docs/_partial-integrate-chat.mdx';
Use this quickstart to install OpenRAG, and then try some of OpenRAG's core features.
@@ -154,116 +155,7 @@ You can use these flows as-is or modify them to better suit your needs, as demon
You can send and receive requests with the Langflow API using Python, TypeScript, or curl.
-1. Open the OpenRAG OpenSearch Agent flow in the Langflow visual editor: From the **Chat** window, click **Settings**, click **Edit in Langflow**, and then click **Proceed**.
-
-2. Create a [Langflow API key](https://docs.langflow.org/api-keys-and-authentication), which is a user-specific token required to send requests to the Langflow server.
-This key doesn't grant access to OpenRAG.
-
- 1. In the Langflow visual editor, click your user icon in the header, and then select **Settings**.
- 2. Click **Langflow API Keys**, and then click **Add New**.
- 3. Name your key, and then click **Create API Key**.
- 4. Copy the API key and store it securely.
- 5. Exit the Langflow **Settings** page to return to the visual editor.
-
-3. Click **Share**, and then select **API access** to get pregenerated code snippets that call the Langflow API and run the flow.
-
- These code snippets construct API requests with your Langflow server URL (`LANGFLOW_SERVER_ADDRESS`), the flow to run (`FLOW_ID`), required headers (`LANGFLOW_API_KEY`, `Content-Type`), and a payload containing the required inputs to run the flow, including a default chat input message.
-
- In production, you would modify the inputs to suit your application logic. For example, you could replace the default chat input message with dynamic user input.
-
-
-
-
- ```python
- import requests
- import os
- import uuid
-
- api_key = 'LANGFLOW_API_KEY'
- url = "http://LANGFLOW_SERVER_ADDRESS/api/v1/run/FLOW_ID" # The complete API endpoint URL for this flow
-
- # Request payload configuration
- payload = {
- "output_type": "chat",
- "input_type": "chat",
- "input_value": "hello world!"
- }
- payload["session_id"] = str(uuid.uuid4())
-
- headers = {"x-api-key": api_key}
-
- try:
- # Send API request
- response = requests.request("POST", url, json=payload, headers=headers)
- response.raise_for_status() # Raise exception for bad status codes
-
- # Print response
- print(response.text)
-
- except requests.exceptions.RequestException as e:
- print(f"Error making API request: {e}")
- except ValueError as e:
- print(f"Error parsing response: {e}")
- ```
-
-
-
-
- ```typescript
- const crypto = require('crypto');
- const apiKey = 'LANGFLOW_API_KEY';
- const payload = {
- "output_type": "chat",
- "input_type": "chat",
- "input_value": "hello world!"
- };
- payload.session_id = crypto.randomUUID();
-
- const options = {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- "x-api-key": apiKey
- },
- body: JSON.stringify(payload)
- };
-
- fetch('http://LANGFLOW_SERVER_ADDRESS/api/v1/run/FLOW_ID', options)
- .then(response => response.json())
- .then(response => console.warn(response))
- .catch(err => console.error(err));
- ```
-
-
-
-
- ```bash
- curl --request POST \
- --url 'http://LANGFLOW_SERVER_ADDRESS/api/v1/run/FLOW_ID?stream=false' \
- --header 'Content-Type: application/json' \
- --header "x-api-key: LANGFLOW_API_KEY" \
- --data '{
- "output_type": "chat",
- "input_type": "chat",
- "input_value": "hello world!"
- }'
- ```
-
-
-
-
-4. Copy your preferred snippet, and then run it:
-
- * **Python**: Paste the snippet into a `.py` file, save it, and then run it with `python filename.py`.
- * **TypeScript**: Paste the snippet into a `.ts` file, save it, and then run it with `ts-node filename.ts`.
- * **curl**: Paste and run snippet directly in your terminal.
-
-If the request is successful, the response includes many details about the flow run, including the session ID, inputs, outputs, components, durations, and more.
-
-In production, you won't pass the raw response to the user in its entirety.
-Instead, you extract and reformat relevant fields for different use cases, as demonstrated in the [Langflow quickstart](https://docs.langflow.org/quickstart#extract-data-from-the-response).
-For example, you could pass the chat output text to a front-end user-facing application, and store specific fields in logs and backend data stores for monitoring, chat history, or analytics.
-You could also pass the output from one flow as input to another flow.
+<PartialIntegrateChat />
## Next steps
From fa825525f8c62e4ade60df41eba5048c2f7daa80 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Wed, 26 Nov 2025 08:22:27 -0800
Subject: [PATCH 09/13] organize knowledge pages
---
docs/docs/core-components/agents.mdx | 2 +-
docs/docs/core-components/chat.mdx | 8 +-
docs/docs/core-components/ingestion.mdx | 249 ++++++++++++++++-
.../core-components/knowledge-configure.mdx | 58 ----
.../core-components/knowledge-filters.mdx | 4 +-
docs/docs/core-components/knowledge.mdx | 256 +++---------------
docs/docs/get-started/install.mdx | 2 +-
docs/docs/get-started/quickstart.mdx | 4 +-
docs/docs/get-started/what-is-openrag.mdx | 21 +-
docs/docs/reference/configuration.mdx | 2 +-
docs/sidebars.js | 1 -
11 files changed, 310 insertions(+), 297 deletions(-)
delete mode 100644 docs/docs/core-components/knowledge-configure.mdx
diff --git a/docs/docs/core-components/agents.mdx b/docs/docs/core-components/agents.mdx
index 1ffe19f1..97f48158 100644
--- a/docs/docs/core-components/agents.mdx
+++ b/docs/docs/core-components/agents.mdx
@@ -13,7 +13,7 @@ In a flow, the individual workflow steps are represented by [_components_](https
OpenRAG includes several built-in flows:
* The [**OpenRAG OpenSearch Agent** flow](/chat#flow) powers the **Chat** feature in OpenRAG.
-* The [**OpenSearch Ingestion** and **OpenSearch URL Ingestion** flows](/knowledge#knowledge-ingestion-flows) process documents and web content for storage in your OpenSearch knowledge base.
+* The [**OpenSearch Ingestion** and **OpenSearch URL Ingestion** flows](/ingestion#knowledge-ingestion-flows) process documents and web content for storage in your OpenSearch knowledge base.
* The [**OpenRAG OpenSearch Nudges** flow](/chat#nudges) provides optional contextual suggestions in the OpenRAG **Chat**.
You can customize these flows and create your own flows using OpenRAG's embedded Langflow visual editor.
diff --git a/docs/docs/core-components/chat.mdx b/docs/docs/core-components/chat.mdx
index 4137e4e1..def7d8ba 100644
--- a/docs/docs/core-components/chat.mdx
+++ b/docs/docs/core-components/chat.mdx
@@ -8,7 +8,7 @@ import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import PartialIntegrateChat from '@site/docs/_partial-integrate-chat.mdx';
-After you [upload documents to your knowledge base](/knowledge), you can use the OpenRAG **Chat** feature to interact with your knowledge through natural language queries.
+After you [upload documents to your knowledge base](/ingestion), you can use the OpenRAG **Chat** feature to interact with your knowledge through natural language queries.
:::tip
Try chatting, uploading documents, and modifying chat settings in the [quickstart](/quickstart).
@@ -48,13 +48,15 @@ One or more specialized tools can be attached to the **Tools** port to extend th
Different models can change the style and content of the agent's responses, and some models might be better suited for certain tasks than others. If the agent doesn't seem to be handling requests well, try changing the model to see how the responses change. For example, fast models might be good for simple queries, but they might not have the depth of reasoning for complex, multi-faceted queries.
-* [**MCP Tools** component](https://docs.langflow.org/mcp-client): Connected to the **Agent** component's **Tools** port, this component can be used to [access any Model Context Protocol (MCP) server](https://docs.langflow.org/mcp-server) and the MCP tools provided by that server. In this case, your OpenRAG Langflow instance's [**Starter Project**](https://docs.langflow.org/concepts-flows#projects) is the MCP server, and the [**OpenSearch URL Ingestion** flow](/knowledge#url-flow) is the MCP tool.
+* [**MCP Tools** component](https://docs.langflow.org/mcp-client): Connected to the **Agent** component's **Tools** port, this component can be used to [access any Model Context Protocol (MCP) server](https://docs.langflow.org/mcp-server) and the MCP tools provided by that server. In this case, your OpenRAG Langflow instance's [**Starter Project**](https://docs.langflow.org/concepts-flows#projects) is the MCP server, and the [**OpenSearch URL Ingestion** flow](/ingestion#url-flow) is the MCP tool.
This flow fetches content from URLs, and then stores the content in your OpenRAG OpenSearch knowledge base. By serving this flow as an MCP tool, the agent can selectively call this tool if a URL is detected in the chat input.
* [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch): Connected to the **Agent** component's **Tools** port, this component lets the agent search your [OpenRAG OpenSearch knowledge base](/knowledge). The agent might not use this database for every request; the agent uses this connection only if it decides that documents in your knowledge base are relevant to your query.
* [**Embedding Model** component](https://docs.langflow.org/components-embedding-models): Connected to the **OpenSearch** component's **Embedding** port, this component generates embeddings from chat input that are used in [similarity search](https://www.ibm.com/think/topics/vector-search) to find content in your knowledge base that is relevant to the chat input. The agent uses this information to generate context-aware responses that are specialized for your data.
+ It is critical that the embedding model used here matches the embedding model used when you [upload documents to your knowledge base](/ingestion). Mismatched models and dimensions can degrade the quality of similarity search results, causing the agent to retrieve irrelevant documents from your knowledge base.
+
* [**Text Input** component](https://docs.langflow.org/components-io): Connected to the **OpenSearch** component's **Search Filters** port, this component is populated with a Langflow global variable named `OPENRAG-QUERY-FILTER`. If a global or chat-level [knowledge filter](/knowledge-filters) is set, then the variable contains the filter expression, which limits the documents that the agent can access in the knowledge base.
If no knowledge filter is set, then the `OPENRAG-QUERY-FILTER` variable is empty, and the agent can access all documents in the knowledge base.
@@ -73,7 +75,7 @@ Like OpenRAG's other built-in flows, you can [inspect the flow in Langflow](/age
During the chat, you'll see information about the agent's process. For more detail, you can inspect individual tool calls. This is helpful for troubleshooting because it shows you how the agent used particular tools. For example, click **Function Call: search_documents (tool_call)** to view the log of tool calls made by the agent to the **OpenSearch** component.
-If documents in your knowledge base seem to be missing or interpreted incorrectly, see [Troubleshoot ingestion](/knowledge#troubleshoot-ingestion).
+If documents in your knowledge base seem to be missing or interpreted incorrectly, see [Troubleshoot ingestion](/ingestion#troubleshoot-ingestion).
If tool calls and knowledge appear normal, but the agent's responses seem off-topic or incorrect, consider changing the agent's language model or prompt, as explained in [Inspect and modify flows](/agents#inspect-and-modify-flows).
diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx
index 43f3a8aa..4f00be00 100644
--- a/docs/docs/core-components/ingestion.mdx
+++ b/docs/docs/core-components/ingestion.mdx
@@ -1,5 +1,5 @@
---
-title: Docling in OpenRAG
+title: Ingest knowledge
slug: /ingestion
---
@@ -7,6 +7,10 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
+The documents in your OpenRAG [OpenSearch knowledge base](/knowledge) provide specialized context in addition to the general knowledge available to the language model that you select when you [install OpenRAG](/install).
+Upload documents to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.
+Then, the [OpenRAG **Chat**](/chat) can retrieve relevant content from your knowledge base to provide context-aware responses.
+
@@ -19,4 +23,245 @@ You can click a document to view the chunks of the document as they are stored i
**Folder** uploads an entire directory.
The default directory is the `/documents` subdirectory in your OpenRAG installation directory.
- For information about the cloud storage provider options, see [Ingest files through OAuth connectors](/knowledge#oauth-ingestion).
\ No newline at end of file
+ For information about the cloud storage provider options, see [Ingest files through OAuth connectors](/ingestion#oauth-ingestion).
+
+## How ingestion works
+
+OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth connectors.
+
+Knowledge ingestion is powered by OpenRAG's built-in [knowledge ingestion flows](/ingestion#knowledge-ingestion-flows) that use Docling Serve to process documents before storing the documents in your OpenSearch database.
+
+During ingestion, documents are broken into smaller chunks of content that are then embedded using your selected [embedding model](/knowledge#set-the-embedding-model-and-dimensions).
+The chunks, embeddings, and associated metadata (which connects chunks of the same document) are stored in your OpenSearch database.
+
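+As a rough sketch of what this chunking step does (not OpenRAG's actual implementation), an overlapping character window keeps shared context across chunk boundaries:
+
+```python
+def split_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
+    """Naive character-based splitter: adjacent chunks share `overlap` characters."""
+    step = chunk_size - overlap
+    return [text[i : i + chunk_size] for i in range(0, len(text), step)]
+
+chunks = split_text("x" * 2500)
+print([len(c) for c in chunks])  # [1000, 1000, 900, 100]
+```
+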
+Like all [OpenRAG flows](/agents), you can [inspect the flows in Langflow](/agents#inspect-and-modify-flows), and you can customize them if you want to change the [ingestion settings](/ingestion#knowledge-ingestion-settings).
+
+## Ingest local files and folders {#knowledge-ingestion-flows}
+
+
+
+The **OpenSearch Ingestion** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
+
+The default path to your local folder is mounted from the `./documents` folder in your OpenRAG project directory to the `/app/documents/` directory inside the Docker container. Files added to the host or the container will be visible in both locations. To configure this location, modify the **Documents Paths** variable in either the TUI's [Advanced Setup](/install#setup) menu or in the `.env` used by Docker Compose.
+
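+For example, a Docker Compose `.env` entry might look like the following sketch. `OPENRAG_DOCUMENTS_PATHS` is a hypothetical name for illustration; use the **Documents Paths** entry as it appears in your generated `.env` file:
+
+```bash
+# Hypothetical variable name: check your generated .env for the actual key
+OPENRAG_DOCUMENTS_PATHS=/home/me/openrag/documents
+```
+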
+To load and process a single file from the mapped location, click **Add Knowledge**, and then click **File**.
+The file is loaded into your OpenSearch database and appears on the **Knowledge** page.
+
+To load and process a directory from the mapped location, click **Add Knowledge**, and then click **Folder**.
+The files are loaded into your OpenSearch database and appear on the **Knowledge** page.
+
+To add files directly to a chat session, click in the chat input and select the files you want to include. Files added this way are processed and made available to the agent for the current conversation, and are not permanently added to the knowledge base.
+
+### OpenSearch Ingestion flow
+
+
+
+The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in OpenRAG. When you click **Add Knowledge** in OpenRAG, the **OpenSearch Ingestion** flow runs in the background, using Docling Serve to import and process your documents.
+
+If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll see that it is composed of the following components, which work together to process and store documents in your knowledge base:
+
+* The [**Docling Serve** component](https://docs.langflow.org/bundles-docling) processes input documents by connecting to your instance of Docling Serve.
+* The [**Export DoclingDocument** component](https://docs.langflow.org/components-docling) exports the processed DoclingDocument to Markdown format with image export mode set to placeholder. This conversion turns the structured document data into a standardized format for further processing.
+* Three [**DataFrame Operations** components](https://docs.langflow.org/components-processing#dataframe-operations) sequentially add `filename`, `file_size`, and `mimetype` metadata columns to the document data.
+* The [**Split Text** component](https://docs.langflow.org/components-processing#split-text) splits the processed text into chunks with a chunk size of 1000 characters and an overlap of 200 characters.
+* Four **Secret Input** components provide secure access to configuration variables: `CONNECTOR_TYPE`, `OWNER`, `OWNER_EMAIL`, and `OWNER_NAME`. These are runtime variables populated from OAuth login.
+* The **Create Data** component combines the secret inputs into a structured data object that will be associated with the document embeddings.
+* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) generates vector embeddings using OpenAI's `text-embedding-3-small` model. The embedding model is selected during [application onboarding](/install) and cannot be changed.
+* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) stores the processed documents and their embeddings in the `documents` index at `https://opensearch:9200`. By default, the component is authenticated with a JWT token, but you can also select `basic` auth mode, and enter your OpenSearch admin username and password.
+
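+Conceptually, each chunk is stored in the `documents` index as a record that combines the chunk text, its embedding vector, and the metadata described above. The following sketch is illustrative only; the exact field names come from the flow's configuration, not from this example:
+
+```python
+chunk_record = {
+    "text": "...one chunk of up to 1000 characters from the source document...",
+    "embedding": [0.012, -0.044, 0.031],  # truncated; real vectors have e.g. 1536 dimensions
+    "filename": "report.pdf",             # added by the DataFrame Operations components
+    "file_size": 482133,
+    "mimetype": "application/pdf",
+    "connector_type": "local",            # from the CONNECTOR_TYPE runtime variable
+    "owner_email": "user@example.com",    # from OWNER_EMAIL; populated by OAuth login
+}
+```
+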
+To customize this flow, see [Inspect and modify flows](/agents#inspect-and-modify-flows).
+
+## Ingest knowledge from URLs {#url-flow}
+
+The **OpenSearch URL Ingestion** flow is used to ingest web content from URLs.
+This flow isn't directly accessible from the OpenRAG user interface.
+Instead, this flow is called by the [**OpenRAG OpenSearch Agent** flow](/chat#flow) as a Model Context Protocol (MCP) tool.
+The agent can call this tool to fetch web content from a given URL, and then ingest that content into your OpenSearch knowledge base.
+
+For more information about MCP in Langflow, see the Langflow documentation on [MCP clients](https://docs.langflow.org/mcp-client) and [MCP servers](https://docs.langflow.org/mcp-tutorial).
+
+## Ingest files through OAuth connectors {#oauth-ingestion}
+
+OpenRAG supports Google Drive, OneDrive, and SharePoint as OAuth connectors for seamless document synchronization.
+
+OAuth integration allows individual users to connect their personal cloud storage accounts to OpenRAG. Each user must separately authorize OpenRAG to access their own cloud storage files. When a user connects a cloud service, they are redirected to authenticate with that service provider and grant OpenRAG permission to sync documents from their personal cloud storage.
+
+Before users can connect their cloud storage accounts, you must configure OAuth credentials in OpenRAG. This requires registering OpenRAG as an OAuth application with a cloud provider and obtaining client ID and secret keys for each service you want to support.
+
+To add an OAuth connector to OpenRAG, do the following.
+This example uses Google OAuth.
+To use another provider, substitute that provider's client ID and secret key.
+
+
+
+ 1. If OpenRAG is running, stop it with **Status** > **Stop Services**.
+ 2. Click **Advanced Setup**.
+ 3. Add the OAuth provider's client and secret key in the [Advanced Setup](/install#setup) menu.
+ 4. Click **Save Configuration**.
+ The TUI generates a new `.env` file with your OAuth values.
+ 5. Click **Start Container Services**.
+
+
+ 1. Stop the Docker deployment.
+ 2. Add the OAuth provider's client and secret key in the `.env` file for Docker Compose.
+ ```bash
+ GOOGLE_OAUTH_CLIENT_ID='YOUR_OAUTH_CLIENT_ID'
+ GOOGLE_OAUTH_CLIENT_SECRET='YOUR_OAUTH_CLIENT_SECRET'
+ ```
+ 3. Save your `.env` file.
+ 4. Start the Docker deployment.
+
+
+
+The OpenRAG frontend at `http://localhost:3000` now redirects to an OAuth callback login page for your OAuth provider.
+A successful authentication opens OpenRAG with the required scopes for your connected storage.
+
+To add knowledge from an OAuth-connected storage provider, do the following:
+
+1. Click **Add Knowledge**, and then select the storage provider, for example, **Google Drive**.
+The **Add Cloud Knowledge** page opens.
+2. To add files or folders from the connected storage, click **Add Files**.
+Select the files or folders you want and click **Select**.
+You can select multiple files.
+3. When your files are selected, click **Ingest Files**.
+The ingestion process can take some time depending on the size of your documents.
+4. When ingestion is complete, your documents are available on the **Knowledge** page.
+
+If ingestion fails, click **Status** to view the logged error.
+
+## Monitor ingestion
+
+Document ingestion tasks run in the background.
+
+In the OpenRAG UI, a badge is shown on **Tasks** when OpenRAG tasks are active.
+Click **Tasks** to inspect and cancel tasks:
+
+* **Active Tasks**: All tasks that are **Pending**, **Running**, or **Processing**.
+For each active task, depending on its state, you can find the task ID, start time, duration, number of files processed, and the total files enqueued for processing.
+
+* **Pending**: The task is queued and waiting to start.
+
+* **Running**: The task is actively processing files.
+
+* **Processing**: The task is performing ingestion operations.
+
+* **Failed**: Something went wrong during ingestion, or the task was manually canceled.
+
+To stop an active task, click **Cancel**. Canceling a task stops processing immediately and marks the task as **Failed**.
+
+## Troubleshoot ingestion {#troubleshoot-ingestion}
+
+If an ingestion task fails, do the following:
+
+* Make sure you are uploading supported file types.
+* Split excessively large files into smaller files before uploading.
+* Remove unusual embedded content, such as videos or animations, before uploading. Although Docling can replace some non-text content with placeholders during ingestion, some embedded content might cause errors.
+
+If the OpenRAG **Chat** doesn't seem to use your documents correctly, [browse your knowledge base](/knowledge#browse-knowledge) to confirm that the documents are uploaded in full, and the chunks are correct.
+
+If the documents are present and well-formed, check your [knowledge filters](/knowledge-filters).
+If a global filter is applied, make sure the expected documents are included in the global filter.
+If the global filter excludes any documents, the agent cannot access those documents unless you apply a chat-level filter or change the global filter.
+
+If text is missing or incorrectly processed, you need to reupload the documents after modifying the ingestion parameters or the documents themselves.
+For example:
+
+* Break combined documents into separate files for better metadata context.
+* Make sure scanned documents are legible enough for extraction, and enable the **OCR** option. Poorly scanned documents might require additional preparation or rescanning before ingestion.
+* Adjust the **Chunk Size** and **Chunk Overlap** settings to better suit your documents. Larger chunks provide more context but can include irrelevant information, while smaller chunks yield more precise semantic search but can lack context.
+
+For more information about modifying ingestion parameters and flows, see [Docling Serve for knowledge ingestion](/ingestion#docling-serve-for-knowledge-ingestion).
+
+## Docling Serve for knowledge ingestion {#docling-serve-for-knowledge-ingestion}
+
+
+
+OpenRAG uses [Docling](https://docling-project.github.io/docling/) for document ingestion.
+More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling serve` process on your local machine and runs Docling ingestion through an API service.
+
+Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch `documents` index.
+
+OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.
+
+### Knowledge ingestion settings {#knowledge-ingestion-settings}
+
+To modify OpenRAG's ingestion settings, including the Docling settings and ingestion flows, click **Settings**.
+
+These settings configure the Docling ingestion parameters.
+
+OpenRAG will warn you if `docling serve` is not running.
+To start or stop `docling serve` or any other native services, in the TUI main menu, click **Start Native Services** or **Stop Native Services**.
+
+**Embedding model** determines which AI model is used to create vector embeddings. The default is the OpenAI `text-embedding-3-small` model.
+
+**Chunk size** determines how large each text chunk is in number of characters.
+Larger chunks yield more context per chunk, but can include irrelevant information. Smaller chunks yield more precise semantic search, but can lack context.
+The default value of `1000` characters provides a good starting point that balances these considerations.
+
+**Chunk overlap** controls the number of characters that overlap across chunk boundaries.
+Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.
+The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents.
+
+**Table Structure** enables Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/) tool for parsing tables. Instead of treating tables as plain text, tables are output as structured table data with preserved relationships and metadata. **Table Structure** is enabled by default.
+
+**OCR** enables or disables OCR processing when extracting text from images and scanned documents.
+OCR is disabled by default, which is best suited for processing text-based documents as quickly as possible with Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/). When OCR is disabled, images are ignored and not processed.
+
+Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance.
+
+If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the [ocrmac](https://www.piwheels.org/project/ocrmac/) OCR engine. Other platforms use [easyocr](https://www.jaided.ai/easyocr/).
+
+**Picture descriptions** adds image descriptions generated by the [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model to OCR processing. Enabling picture descriptions can slow ingestion performance.
+
+### Use OpenRAG default ingestion instead of Docling Serve
+
+If you want to use OpenRAG's built-in pipeline instead of Docling Serve, set `DISABLE_INGEST_WITH_LANGFLOW=true` in [Environment variables](/reference/configuration#document-processing).
+
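+For example, in the `.env` file used by Docker Compose:
+
+```bash
+# Bypass Docling Serve and run the Docling processor directly
+DISABLE_INGEST_WITH_LANGFLOW=true
+```
+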
+The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API.
+
+For more information, see [`processors.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58).
+
+## Ingestion performance expectations
+
+On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.
+This equates to approximately 0.43 documents per second, or about 2.3 seconds per document.
+
+You can generally expect equal or better performance on developer laptops, and significantly faster performance on servers.
+Throughput scales with CPU cores, memory, storage speed, and configuration choices such as embedding model, chunk size and overlap, and concurrency.
+
+This test returned 12 errors (approximately 1.1 percent).
+All errors were file-specific, and they didn't stop the pipeline.
+
+* Ingestion dataset:
+
+ * Total files: 1,083 items mounted
+ * Total size on disk: 5,026,474,862 bytes (approximately 5.03 GB)
+
+* Hardware specifications:
+
+ * Machine: Apple M4 Pro
+ * Podman VM:
+ * Name: `podman-machine-default`
+ * Type: `applehv`
+ * vCPUs: 7
+ * Memory: 8 GiB
+ * Disk size: 100 GiB
+
+* Test results:
+
+ ```text
+ 2025-09-24T22:40:45.542190Z /app/src/main.py:231 Ingesting default documents when ready disable_langflow_ingest=False
+ 2025-09-24T22:40:45.546385Z /app/src/main.py:270 Using Langflow ingestion pipeline for default documents file_count=1082
+ ...
+ 2025-09-24T23:19:44.866365Z /app/src/main.py:351 Langflow ingestion completed success_count=1070 error_count=12 total_files=1082
+ ```
+
+* Elapsed time: Approximately 42 minutes 15 seconds (2,535 seconds)
+
+* Throughput: Approximately 0.43 documents/second (about 2.3 seconds per document)
\ No newline at end of file
diff --git a/docs/docs/core-components/knowledge-configure.mdx b/docs/docs/core-components/knowledge-configure.mdx
deleted file mode 100644
index f3829d42..00000000
--- a/docs/docs/core-components/knowledge-configure.mdx
+++ /dev/null
@@ -1,58 +0,0 @@
----
-title: Configure OpenSearch in OpenRAG
-slug: /knowledge-configure
----
-
-import Icon from "@site/src/components/icon/icon";
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-
-OpenRAG includes a built-in [OpenSearch](https://docs.opensearch.org/latest/) instance that serves as the underlying datastore for your knowledge.
-This specialized database is used to store and retrieve your documents and the associated vector data (embeddings).
-
-The [OpenRAG **Chat**](/chat) runs [similarity searches](https://www.ibm.com/think/topics/vector-search) against your OpenSearch database to retrieve relevant information and generate context-aware responses.
-
-Additionally, OpenSearch provides powerful hybrid search capabilities with enterprise-grade security and multi-tenancy support.
-
-## OpenSearch authentication and document access {#auth}
-
-When you [install OpenRAG](/install), you can choose between two setup modes: **Basic Setup** and **Advanced Setup**.
-The mode you choose determines how OpenRAG authenticates with OpenSearch and controls access to documents:
-
-* **Basic Setup (no-auth mode)**: If you choose **Basic Setup**, then OpenRAG is installed in no-auth mode.
-This mode uses one, anonymous JWT token for OpenSearch authentication.
-There is no differentiation between users.
-All users that access your OpenRAG instance can access all documents uploaded to your OpenSearch `documents` index.
-
-* **Advanced Setup (OAuth mode)**: If you choose **Advanced Setup**, then OpenRAG is installed in OAuth mode.
-This mode uses a unique JWT token for each OpenRAG user, and each document is tagged with user ownership. Documents are filtered by user owner.
-This means users see only the documents that they uploaded or have access to.
-
-You can enable OAuth mode after installation.
-For more information, see [Ingest files through OAuth connectors](/knowledge#oauth-ingestion).
-
-## Set the embedding model and dimensions {#set-the-embedding-model-and-dimensions}
-
-When you [install OpenRAG](/install), you select an embedding model during **Application Onboarding**.
-OpenRAG automatically detects and configures the appropriate vector dimensions for your selected embedding model, ensuring optimal search performance and compatibility.
-
-In the OpenRAG repository, you can find the complete list of supported models in [`models_service.py`](https://github.com/langflow-ai/openrag/blob/main/src/services/models_service.py) and the corresponding vector dimensions in [`settings.py`](https://github.com/langflow-ai/openrag/blob/main/src/config/settings.py).
-
-The default embedding dimension is `1536` and the default model is `text-embedding-3-small`.
-
-You can use any supported or unsupported embedding model by specifying the model in your OpenRAG configuration during installation.
-
-If you use an unsupported embedding model that doesn't have defined dimensions in `settings.py`, then OpenRAG falls back to the default dimensions (1536) and logs a warning. OpenRAG's OpenSearch instance and flows continue to work, but [similarity search](https://www.ibm.com/think/topics/vector-search) quality can be affected if the actual model dimensions aren't 1536.
-
-The embedding model setting is immutable.
-To change the embedding model, you must [reinstall OpenRAG](/install#reinstall).
-
-## Set ingestion parameters
-
-For information about modifying ingestion parameters and flows, see [Knowledge ingestion settings](/knowledge#knowledge-ingestion-settings) and [Knowledge ingestion flows](/knowledge#knowledge-ingestion-flows).
-
-## See also
-
-* [Ingest knowledge](/knowledge)
-* [Filter knowledge](/knowledge-filters)
-* [Chat with knowledge](/chat)
\ No newline at end of file
diff --git a/docs/docs/core-components/knowledge-filters.mdx b/docs/docs/core-components/knowledge-filters.mdx
index 1b3a974b..8376a55b 100644
--- a/docs/docs/core-components/knowledge-filters.mdx
+++ b/docs/docs/core-components/knowledge-filters.mdx
@@ -7,7 +7,7 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-OpenRAG's knowledge filters help you organize and manage your [knowledge base](/knowledge-configure) by creating pre-defined views of your documents.
+OpenRAG's knowledge filters help you organize and manage your [knowledge base](/knowledge) by creating pre-defined views of your documents.
Each knowledge filter captures a specific subset of documents based on a given search query and filters.
@@ -31,7 +31,7 @@ To create a knowledge filter, do the following:
* **Data Sources**: Select specific data sources or folders to include.
* **Document Types**: Filter by file type.
* **Owners**: Filter by the user that uploaded the documents.
- * **Connectors**: Filter by [upload source](/knowledge), such as the local file system or a Google Drive OAuth connector.
+ * **Connectors**: Filter by [upload source](/ingestion), such as the local file system or a Google Drive OAuth connector.
* **Response Limit**: Set the maximum number of results to return from the knowledge base. The default is `10`.
* **Score Threshold**: Set the minimum relevance score for similarity search. The default score is `0`.
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index 9633e4e0..ddca2a8e 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -1,5 +1,5 @@
---
-title: Ingest knowledge
+title: Configure knowledge
slug: /knowledge
---
@@ -7,131 +7,15 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-The documents in your OpenRAG [OpenSearch knowledge base](/knowledge-configure) provide specialized context in addition to the general knowledge available to the language model that you select when you [install OpenRAG](/install).
-Upload documents to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.
-Then, the [OpenRAG **Chat**](/chat) can retrieve relevant content from your knowledge base to provide context-aware responses.
+OpenRAG includes a built-in [OpenSearch](https://docs.opensearch.org/latest/) instance that serves as the underlying datastore for your _knowledge_ (documents).
+This specialized database is used to store and retrieve your documents and the associated vector data (embeddings).
-OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth connectors.
+You can [upload documents](/ingestion) from a variety of sources.
+Documents are processed through OpenRAG's knowledge ingestion flows with Docling.
-Knowledge ingestion is powered by OpenRAG's built-in [knowledge ingestion flows](/knowledge#knowledge-ingestion-flows) that use Docling Serve to process documents before storing the documents in your OpenSearch database.
+The [OpenRAG **Chat**](/chat) runs [similarity searches](https://www.ibm.com/think/topics/vector-search) against your OpenSearch database to retrieve relevant information and generate context-aware responses.
-During ingestion, documents are broken into smaller chunks of content that are then embedded using your selected [embedding model](/knowledge-configure#set-the-embedding-model-and-dimensions).
-The chunks, embeddings, and associated metadata (which connects chunks of the same document) are stored in your OpenSearch database.
-
-Like all [OpenRAG flows](/agents), you can [inspect the flows in Langflow](/agents#inspect-and-modify-flows), and you can customize them if you want to change the [ingestion settings](/knowledge#knowledge-ingestion-settings).
-
-## Ingest local files and folders {#knowledge-ingestion-flows}
-
-
-
-The **OpenSearch Ingestion** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
-
-The default path to your local folder is mounted from the `./documents` folder in your OpenRAG project directory to the `/app/documents/` directory inside the Docker container. Files added to the host or the container will be visible in both locations. To configure this location, modify the **Documents Paths** variable in either the TUI's [Advanced Setup](/install#setup) menu or in the `.env` used by Docker Compose.
-
-To load and process a single file from the mapped location, click **Add Knowledge**, and then click **File**.
-The file is loaded into your OpenSearch database, and appears in the Knowledge page.
-
-To load and process a directory from the mapped location, click **Add Knowledge**, and then click **Folder**.
-The files are loaded into your OpenSearch database, and appear in the Knowledge page.
-
-To add files directly to a chat session, click in the chat input and select the files you want to include. Files added this way are processed and made available to the agent for the current conversation, and are not permanently added to the knowledge base.
-
-### OpenSearch Ingestion flow
-
-
-
-The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in OpenRAG. When you **Add Knowledge** in OpenRAG, the **OpenSearch Ingestion** flow runs in the background. The flow ingests documents using Docling Serve to import and process documents.
-
-If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll see that it is comprised of ten components that work together to process and store documents in your knowledge base:
-
-* The [**Docling Serve** component](https://docs.langflow.org/bundles-docling) processes input documents by connecting to your instance of Docling Serve.
-* The [**Export DoclingDocument** component](https://docs.langflow.org/components-docling) exports the processed DoclingDocument to markdown format with image export mode set to placeholder. This conversion makes the structured document data into a standardized format for further processing.
-* Three [**DataFrame Operations** components](https://docs.langflow.org/components-processing#dataframe-operations) sequentially add metadata columns to the document data of `filename`, `file_size`, and `mimetype`.
-* The [**Split Text** component](https://docs.langflow.org/components-processing#split-text) splits the processed text into chunks with a chunk size of 1000 characters and an overlap of 200 characters.
-* Four **Secret Input** components provide secure access to configuration variables: `CONNECTOR_TYPE`, `OWNER`, `OWNER_EMAIL`, and `OWNER_NAME`. These are runtime variables populated from OAuth login.
-* The **Create Data** component combines the secret inputs into a structured data object that will be associated with the document embeddings.
-* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) generates vector embeddings using OpenAI's `text-embedding-3-small` model. The embedding model is selected at [Application onboarding] and cannot be changed.
-* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) stores the processed documents and their embeddings in the `documents` index at `https://opensearch:9200`. By default, the component is authenticated with a JWT token, but you can also select `basic` auth mode, and enter your OpenSearch admin username and password.
-
-To customize this flow, see [Inspect and modify flows](/agents#inspect-and-modify-flows).
-
-## Ingest knowledge from URLs {#url-flow}
-
-The **OpenSearch URL Ingestion** flow is used to ingest web content from URLs.
-This flow isn't directly accessible from the OpenRAG user interface.
-Instead, this flow is called by the [**OpenRAG OpenSearch Agent** flow](/chat#flow) as a Model Context Protocol (MCP) tool.
-The agent can call this component to fetch web content from a given URL, and then ingest that content into your OpenSearch knowledge base.
-
-For more information about MCP in Langflow, see the Langflow documentation on [MCP clients](https://docs.langflow.org/mcp-client) and [MCP servers](https://docs.langflow.org/mcp-tutorial).
-
-## Ingest files through OAuth connectors {#oauth-ingestion}
-
-OpenRAG supports Google Drive, OneDrive, and Sharepoint as OAuth connectors for seamless document synchronization.
-
-OAuth integration allows individual users to connect their personal cloud storage accounts to OpenRAG. Each user must separately authorize OpenRAG to access their own cloud storage files. When a user connects a cloud service, they are redirected to authenticate with that service provider and grant OpenRAG permission to sync documents from their personal cloud storage.
-
-Before users can connect their cloud storage accounts, you must configure OAuth credentials in OpenRAG. This requires registering OpenRAG as an OAuth application with a cloud provider and obtaining client ID and secret keys for each service you want to support.
-
-To add an OAuth connector to OpenRAG, do the following.
-This example uses Google OAuth.
-If you wish to use another provider, add the secrets to another provider.
-
-
-
- 1. If OpenRAG is running, stop it with **Status** > **Stop Services**.
- 2. Click **Advanced Setup**.
- 3. Add the OAuth provider's client and secret key in the [Advanced Setup](/install#setup) menu.
- 4. Click **Save Configuration**.
- The TUI generates a new `.env` file with your OAuth values.
- 5. Click **Start Container Services**.
-
-
- 1. Stop the Docker deployment.
- 2. Add the OAuth provider's client and secret key in the `.env` file for Docker Compose.
- ```bash
- GOOGLE_OAUTH_CLIENT_ID='YOUR_OAUTH_CLIENT_ID'
- GOOGLE_OAUTH_CLIENT_SECRET='YOUR_OAUTH_CLIENT_SECRET'
- ```
- 3. Save your `.env` file.
- 4. Start the Docker deployment.
-
-
-
-The OpenRAG frontend at `http://localhost:3000` now redirects to an OAuth callback login page for your OAuth provider.
-A successful authentication opens OpenRAG with the required scopes for your connected storage.
-
-To add knowledge from an OAuth-connected storage provider, do the following:
-
-1. Click **Add Knowledge**, and then select the storage provider, for example, **Google Drive**.
-The **Add Cloud Knowledge** page opens.
-2. To add files or folders from the connected storage, click **Add Files**.
-Select the files or folders you want and click **Select**.
-You can select multiple files.
-3. When your files are selected, click **Ingest Files**.
-The ingestion process can take some time depending on the size of your documents.
-4. When ingestion is complete, your documents are available in the Knowledge screen.
-
-If ingestion fails, click **Status** to view the logged error.
-
-## Monitor ingestion
-
-Document ingestion tasks run in the background.
-
-In the OpenRAG UI, a badge is shown on **Tasks** when OpenRAG tasks are active.
-Click **Tasks** to inspect and cancel tasks:
-
-* **Active Tasks**: All tasks that are **Pending**, **Running**, or **Processing**.
-For each active task, depending on its state, you can find the task ID, start time, duration, number of files processed, and the total files enqueued for processing.
-
-* **Pending**: The task is queued and waiting to start.
-
-* **Running**: The task is actively processing files.
-
-* **Processing**: The task is performing ingestion operations.
-
-* **Failed**: Something went wrong during ingestion, or the task was manually canceled.
-
-To stop an active task, click **Cancel**. Canceling a task stops processing immediately and marks the task as **Failed**.
+You can configure how documents are ingested and how the **Chat** interacts with your knowledge base.
## Browse knowledge {#browse-knowledge}
@@ -140,118 +24,54 @@ The **Knowledge** page lists the documents OpenRAG has ingested into your OpenSe
To explore the raw contents of your knowledge base, click **Knowledge** to get a list of all ingested documents.
Click a document to view the chunks produced from splitting the document during ingestion.
-## Troubleshoot ingestion (#troubleshoot-ingestion)
+OpenRAG includes some sample documents that you can use to see how the agent references documents in the [**Chat**](/chat).
-If an ingestion task fails, do the following:
+## OpenSearch authentication and document access {#auth}
-* Make sure you are uploading supported file types.
-* Split excessively large files into smaller files before uploading.
-* Remove unusual embedded content, such as videos or animations, before uploading. Although Docling can replace some non-text content with placeholders during ingestion, some embedded content might cause errors.
+When you [install OpenRAG](/install), you can choose between two setup modes: **Basic Setup** and **Advanced Setup**.
+The mode you choose determines how OpenRAG authenticates with OpenSearch and controls access to documents:
-If the OpenRAG **Chat** doesn't seem to use your documents correctly, [browse your knowledge base](#browse-knowledge) to confirm that the documents are uploaded in full, and the chunks are correct.
+* **Basic Setup (no-auth mode)**: If you choose **Basic Setup**, then OpenRAG is installed in no-auth mode.
+This mode uses a single anonymous JWT token for OpenSearch authentication.
+There is no differentiation between users.
+All users that access your OpenRAG instance can access all documents uploaded to your OpenSearch `documents` index.
-If the documents are present and well-formed, check your [knowledge filters](/knowledge-filters).
-If a global filter is applied, make sure the expected documents are included in the global filter.
-If the global filter excludes any documents, the agent cannot access those documents unless you apply a chat-level filter or change the global filter.
+* **Advanced Setup (OAuth mode)**: If you choose **Advanced Setup**, then OpenRAG is installed in OAuth mode.
+This mode uses a unique JWT token for each OpenRAG user, and each document is tagged with user ownership. Documents are filtered by owner.
+This means users see only the documents that they uploaded or have access to.
-If text is missing or incorrectly processed, you need to reupload the documents after modifying the ingestion parameters or the documents themselves.
-For example:
+You can enable OAuth mode after installation.
+For more information, see [Ingest files through OAuth connectors](/ingestion#oauth-ingestion).
-* Break combined documents into separate files for better metadata context.
-* Make sure scanned documents are legible enough for extraction, and enable the **OCR** option. Poorly scanned documents might require additional preparation or rescanning before ingestion.
-* Adjust the **Chunk Size** and **Chunk Overlap** settings to better suit your documents. Larger chunks provide more context but can include irrelevant information, while smaller chunks yield more precise semantic search but can lack context.
+## Set the embedding model and dimensions {#set-the-embedding-model-and-dimensions}
-For more information about modifying ingestion parameters and flows, see [Docling Serve for knowledge ingestion](/knowledge#docling-serve-for-knowledge-ingestion).
+When you [install OpenRAG](/install), you select an embedding model during **Application Onboarding**.
+OpenRAG automatically detects and configures the appropriate vector dimensions for your selected embedding model, ensuring optimal search performance and compatibility.
-## Docling Serve for knowledge ingestion {#docling-serve-for-knowledge-ingestion}
+In the OpenRAG repository, you can find the complete list of supported models in [`models_service.py`](https://github.com/langflow-ai/openrag/blob/main/src/services/models_service.py) and the corresponding vector dimensions in [`settings.py`](https://github.com/langflow-ai/openrag/blob/main/src/config/settings.py).
-
+The default embedding dimension is `1536` and the default model is `text-embedding-3-small`.
-OpenRAG uses [Docling](https://docling-project.github.io/docling/) for document ingestion.
-More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling serve` process on your local machine and runs Docling ingestion through an API service.
+You can use any supported or unsupported embedding model by specifying the model in your OpenRAG configuration during installation.
-Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch `documents` index.
+If you use an unsupported embedding model that doesn't have defined dimensions in `settings.py`, then OpenRAG falls back to the default dimensions (1536) and logs a warning. OpenRAG's OpenSearch instance and flows continue to work, but [similarity search](https://www.ibm.com/think/topics/vector-search) quality can be affected if the actual model dimensions aren't 1536.
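+
+If you aren't sure how many dimensions a model produces, you can check directly. The following is a minimal sketch using the OpenAI Python SDK, assuming `OPENAI_API_KEY` is set in your environment:
+
+```python
+from openai import OpenAI
+
+client = OpenAI()  # reads OPENAI_API_KEY from the environment
+response = client.embeddings.create(model="text-embedding-3-small", input="dimension check")
+print(len(response.data[0].embedding))  # 1536 for text-embedding-3-small
+```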
-OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.
+The embedding model setting is immutable.
+To change the embedding model, you must [reinstall OpenRAG](/install#reinstall).
-### Knowledge ingestion settings {#knowledge-ingestion-settings}
+## Set ingestion parameters
-To modify OpenRAG's ingestion settings, including the Docling settings and ingestion flows, **Settings**.
+For information about modifying ingestion parameters and flows, see [Knowledge ingestion settings](/ingestion#knowledge-ingestion-settings) and [Knowledge ingestion flows](/ingestion#knowledge-ingestion-flows).
-These settings configure the Docling ingestion parameters.
+## Delete knowledge
-OpenRAG will warn you if `docling serve` is not running.
-To start or stop `docling serve` or any other native services, in the TUI main menu, click **Start Native Services** or **Stop Native Services**.
+To clear your entire knowledge base, you can delete the contents of the `./opensearch-data` folder in your OpenRAG installation directory, or you can [reset the OpenRAG containers](/install#tui-container-management).
-**Embedding model** determines which AI model is used to create vector embeddings. The default is the OpenAI `text-embedding-3-small` model.
-
-**Chunk size** determines how large each text chunk is in number of characters.
-Larger chunks yield more context per chunk, but can include irrelevant information. Smaller chunks yield more precise semantic search, but can lack context.
-The default value of `1000` characters provides a good starting point that balances these considerations.
-
-**Chunk overlap** controls the number of characters that overlap over chunk boundaries.
-Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.
-The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents.
-
-**Table Structure** enables Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/) tool for parsing tables. Instead of treating tables as plain text, tables are output as structured table data with preserved relationships and metadata. **Table Structure** is enabled by default.
-
-**OCR** enables or disabled OCR processing when extracting text from images and scanned documents.
-OCR is disabled by default. This setting is best suited for processing text-based documents as quickly as possible with Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/). Images are ignored and not processed.
-
-Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance.
-
-If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the [ocrmac](https://www.piwheels.org/project/ocrmac/) OCR engine. Other platforms use [easyocr](https://www.jaided.ai/easyocr/).
-
-**Picture descriptions** adds image descriptions generated by the [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model to OCR processing. Enabling picture descriptions can slow ingestion performance.
-
-### Use OpenRAG default ingestion instead of Docling serve
-
-If you want to use OpenRAG's built-in pipeline instead of Docling serve, set `DISABLE_INGEST_WITH_LANGFLOW=true` in [Environment variables](/reference/configuration#document-processing).
-
-The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API.
-
-For more information, see [`processors.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58).
-
-## Ingestion performance expectations
-
-On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.
-This equates to approximately 2.4 documents per second.
-
-You can generally expect equal or better performance on developer laptops and significantly faster on servers.
-Throughput scales with CPU cores, memory, storage speed, and configuration choices such as embedding model, chunk size and overlap, and concurrency.
-
-This test returned 12 errors (approximately 1.1 percent).
-All errors were file-specific, and they didn't stop the pipeline.
-
-* Ingestion dataset:
-
- * Total files: 1,083 items mounted
- * Total size on disk: 5,026,474,862 bytes (approximately 5.03 GB)
-
-* Hardware specifications:
-
- * Machine: Apple M4 Pro
- * Podman VM:
- * Name: `podman-machine-default`
- * Type: `applehv`
- * vCPUs: 7
- * Memory: 8 GiB
- * Disk size: 100 GiB
-
-* Test results:
-
- ```text
- 2025-09-24T22:40:45.542190Z /app/src/main.py:231 Ingesting default documents when ready disable_langflow_ingest=False
- 2025-09-24T22:40:45.546385Z /app/src/main.py:270 Using Langflow ingestion pipeline for default documents file_count=1082
- ...
- 2025-09-24T23:19:44.866365Z /app/src/main.py:351 Langflow ingestion completed success_count=1070 error_count=12 total_files=1082
- ```
-
-* Elapsed time: Approximately 42 minutes 15 seconds (2,535 seconds)
-
-* Throughput: Approximately 2.4 documents/second
+Be aware that both of these operations are destructive and cannot be undone.
+In particular, resetting containers reverts your OpenRAG instance to its initial state, as though it were a fresh installation.
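+
+For example, assuming OpenRAG's services are stopped and you are in your OpenRAG installation directory, the following sketch clears the OpenSearch data:
+
+```bash
+# Destructive and irreversible: permanently deletes all ingested knowledge
+rm -rf ./opensearch-data/*
+```
+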
## See also
-* [Configure OpenSearch in OpenRAG](/knowledge-configure)
-* [Filter knowledge](/knowledge-filters)
\ No newline at end of file
+* [Ingest knowledge](/ingestion)
+* [Filter knowledge](/knowledge-filters)
+* [Chat with knowledge](/chat)
\ No newline at end of file
diff --git a/docs/docs/get-started/install.mdx b/docs/docs/get-started/install.mdx
index 3d6e4b73..99e9fb74 100644
--- a/docs/docs/get-started/install.mdx
+++ b/docs/docs/get-started/install.mdx
@@ -190,7 +190,7 @@ If the TUI detects OAuth credentials, it enforces the **Advanced Setup** path.
**Basic Setup** can generate all of the required values for OpenRAG. The OpenAI API key is optional and can be provided during onboarding.
**Basic Setup** does not set up OAuth connections for ingestion from cloud providers.
For OAuth setup, use **Advanced Setup**.
- For information about the difference between basic (no auth) and OAuth in OpenRAG, see [OpenSearch authentication and document access](/knowledge-configure#auth).
+ For information about the difference between basic (no auth) and OAuth in OpenRAG, see [OpenSearch authentication and document access](/knowledge#auth).
1. To install OpenRAG with **Basic Setup**, click **Basic Setup** or press 1.
2. Click **Generate Passwords** to generate passwords for OpenSearch and Langflow.
diff --git a/docs/docs/get-started/quickstart.mdx b/docs/docs/get-started/quickstart.mdx
index 4884de07..1546aca1 100644
--- a/docs/docs/get-started/quickstart.mdx
+++ b/docs/docs/get-started/quickstart.mdx
@@ -104,7 +104,7 @@ You can click a document to view the chunks of the document as they are stored i
**Folder** uploads an entire directory.
The default directory is the `/documents` subdirectory in your OpenRAG installation directory.
- For information about the cloud storage provider options, see [Ingest files through OAuth connectors](/knowledge#oauth-ingestion).
+ For information about the cloud storage provider options, see [Ingest files through OAuth connectors](/ingestion#oauth-ingestion).
5. Return to the **Chat** window, and then ask a question related to the documents that you just uploaded.
@@ -117,7 +117,7 @@ You can click a document to view the chunks of the document as they are stored i
* Click **Settings** to modify the knowledge ingestion settings.
- For more information, see [Configure OpenSearch in OpenRAG](/knowledge-configure) and [Ingest knowledge](/knowledge).
+ For more information, see [Configure knowledge](/knowledge) and [Ingest knowledge](/ingestion).
## Change the language model and chat settings {#change-components}
diff --git a/docs/docs/get-started/what-is-openrag.mdx b/docs/docs/get-started/what-is-openrag.mdx
index f23a57db..47201ad6 100644
--- a/docs/docs/get-started/what-is-openrag.mdx
+++ b/docs/docs/get-started/what-is-openrag.mdx
@@ -10,13 +10,18 @@ OpenRAG connects and amplifies three popular, proven open-source projects into o
* [Langflow](https://docs.langflow.org): Langflow is a versatile tool for building and deploying AI agents and MCP servers. It supports all major LLMs, vector databases, and a growing library of AI tools.
+ OpenRAG uses several built-in flows, and it provides full access to all Langflow features through the embedded Langflow visual editor.
+
+ By customizing the built-in flows or creating your own, you can make every part of the OpenRAG stack interchangeable. You can modify any aspect of the flows, from basic settings, like changing the language model, to replacing entire components. You can also write your own custom Langflow components, integrate MCP servers, call APIs, and leverage any other functionality provided by Langflow.
+
* [OpenSearch](https://docs.opensearch.org/latest/): OpenSearch is a community-driven, Apache 2.0-licensed open source search and analytics suite that makes it easy to ingest, search, visualize, and analyze data.
+It provides powerful hybrid search capabilities with enterprise-grade security and multi-tenancy support.
-* [Docling](https://docling-project.github.io/docling/): Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
+ OpenRAG uses OpenSearch as the underlying vector database for storing and retrieving your documents and associated vector data (embeddings). You can ingest documents from a variety of sources, including your local filesystem and OAuth authenticated connections to popular cloud storage services.
-OpenRAG builds on Langflow's familiar interface while adding OpenSearch for vector storage and Docling for simplified document parsing. It uses opinionated flows that serve as ready-to-use recipes for ingestion, retrieval, and generation from familiar sources like Google Drive, OneDrive, and SharePoint.
+* [Docling](https://docling-project.github.io/docling/): Docling simplifies document processing, supports many file formats and advanced PDF parsing, and provides seamless integrations with the generative AI ecosystem.
-What's more, every part of the stack is interchangeable: You can write your own custom components in Python, try different language models, and customize your flows to build a personalized agentic RAG system.
+ OpenRAG uses Docling to parse and chunk documents that are stored in your OpenSearch knowledge base.
:::tip
Ready to get started? Try the [quickstart](/quickstart) to install OpenRAG and start exploring in minutes.
@@ -52,12 +57,12 @@ flowchart TD
ext --> backend
```
-* The **OpenRAG Backend** is the central orchestration service that coordinates all other components.
+* **OpenRAG backend**: The central orchestration service that coordinates all other components.
-* **Langflow** provides a visual workflow engine for building AI agents, and connects to **OpenSearch** for vector storage and retrieval.
+* **Langflow**: This container runs a Langflow instance. It provides the embedded Langflow visual editor for editing and creating flows, and it connects to the **OpenSearch** container for vector storage and retrieval.
-* **Docling Serve** is a local document processing service managed by the **OpenRAG Backend**.
+* **Docling Serve**: This is a local document processing service managed by the **OpenRAG backend**.
-* **External connectors** integrate third-party cloud storage services through OAuth authenticated connections to the **OpenRAG Backend**, allowing synchronization of external storage with your OpenSearch knowledge base.
+* **External connectors**: Integrate third-party cloud storage services through OAuth authenticated connections to the **OpenRAG backend**, allowing synchronization of external storage with your OpenSearch knowledge base.
-* The **OpenRAG Frontend** provides the user interface for interacting with the platform.
\ No newline at end of file
+* **OpenRAG frontend**: Provides the user interface for interacting with the OpenRAG platform.
\ No newline at end of file
diff --git a/docs/docs/reference/configuration.mdx b/docs/docs/reference/configuration.mdx
index a5d6e8d9..36138188 100644
--- a/docs/docs/reference/configuration.mdx
+++ b/docs/docs/reference/configuration.mdx
@@ -71,7 +71,7 @@ For more information, see [Application onboarding](/install#application-onboardi
### Document processing
-Control how OpenRAG [processes and ingests documents](/knowledge) into your knowledge base.
+Control how OpenRAG [processes and ingests documents](/ingestion) into your knowledge base.
| Variable | Default | Description |
|----------|---------|-------------|
diff --git a/docs/sidebars.js b/docs/sidebars.js
index df240798..3549d62e 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -33,7 +33,6 @@ const sidebars = {
type: "category",
label: "Knowledge",
items: [
- "core-components/knowledge-configure",
"core-components/knowledge",
"core-components/ingestion",
"core-components/knowledge-filters",
From ae8638b0719e0ee178104e20bca780651f74b553 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Wed, 26 Nov 2025 09:10:32 -0800
Subject: [PATCH 10/13] working on knowledge pages still
---
docs/docs/_partial-temp-knowledge.mdx | 5 ++
docs/docs/core-components/agents.mdx | 2 +-
docs/docs/core-components/chat.mdx | 5 ++
docs/docs/core-components/ingestion.mdx | 108 ++++++++++++------------
docs/docs/core-components/knowledge.mdx | 8 +-
5 files changed, 68 insertions(+), 60 deletions(-)
create mode 100644 docs/docs/_partial-temp-knowledge.mdx
diff --git a/docs/docs/_partial-temp-knowledge.mdx b/docs/docs/_partial-temp-knowledge.mdx
new file mode 100644
index 00000000..7ecdf99c
--- /dev/null
+++ b/docs/docs/_partial-temp-knowledge.mdx
@@ -0,0 +1,5 @@
+import Icon from "@site/src/components/icon/icon";
+
+When using the OpenRAG **Chat**, click in the chat input field to upload a file to the current chat session.
+Files added this way are processed and made available to the agent for the current conversation only.
+These files aren't stored in the knowledge base permanently.
\ No newline at end of file
diff --git a/docs/docs/core-components/agents.mdx b/docs/docs/core-components/agents.mdx
index 97f48158..8def9af5 100644
--- a/docs/docs/core-components/agents.mdx
+++ b/docs/docs/core-components/agents.mdx
@@ -13,7 +13,7 @@ In a flow, the individual workflow steps are represented by [_components_](https
OpenRAG includes several built-in flows:
* The [**OpenRAG OpenSearch Agent** flow](/chat#flow) powers the **Chat** feature in OpenRAG.
-* The [**OpenSearch Ingestion** and **OpenSearch URL Ingestion** flows](/ingestion#knowledge-ingestion-flows) process documents and web content for storage in your OpenSearch knowledge base.
+* The [**OpenSearch Ingestion** and **OpenSearch URL Ingestion** flows](/ingestion) process documents and web content for storage in your OpenSearch knowledge base.
* The [**OpenRAG OpenSearch Nudges** flow](/chat#nudges) provides optional contextual suggestions in the OpenRAG **Chat**.
You can customize these flows and create your own flows using OpenRAG's embedded Langflow visual editor.
diff --git a/docs/docs/core-components/chat.mdx b/docs/docs/core-components/chat.mdx
index def7d8ba..728f3fb6 100644
--- a/docs/docs/core-components/chat.mdx
+++ b/docs/docs/core-components/chat.mdx
@@ -7,6 +7,7 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import PartialIntegrateChat from '@site/docs/_partial-integrate-chat.mdx';
+import PartialTempKnowledge from '@site/docs/_partial-temp-knowledge.mdx';
After you [upload documents to your knowledge base](/ingestion), you can use the OpenRAG **Chat** feature to interact with your knowledge through natural language queries.
@@ -71,6 +72,10 @@ Click a nudge to accept it and provide the nudge's context to the OpenRAG **Chat
Like OpenRAG's other built-in flows, you can [inspect the flow in Langflow](/agents#inspect-and-modify-flows), and you can customize it if you want to change the nudge behavior.
+## Upload documents to the chat
+
+<PartialTempKnowledge />
+
## Inspect tool calls and knowledge
During the chat, you'll see information about the agent's process. For more detail, you can inspect individual tool calls. This is helpful for troubleshooting because it shows you how the agent used particular tools. For example, click **Function Call: search_documents (tool_call)** to view the log of tool calls made by the agent to the **OpenSearch** component.
diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx
index 4f00be00..cb63a02f 100644
--- a/docs/docs/core-components/ingestion.mdx
+++ b/docs/docs/core-components/ingestion.mdx
@@ -6,57 +6,34 @@ slug: /ingestion
import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
+import PartialTempKnowledge from '@site/docs/_partial-temp-knowledge.mdx';
-The documents in your OpenRAG [OpenSearch knowledge base](/knowledge) provide specialized context in addition to the general knowledge available to the language model that you select when you [install OpenRAG](/install).
-Upload documents to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.
-Then, the [OpenRAG **Chat**](/chat) can retrieve relevant content from your knowledge base to provide context-aware responses.
+Upload documents to your [OpenRAG OpenSearch instance](/knowledge) to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.
+Documents are processed through OpenRAG's knowledge ingestion flows with Docling.
-
-
-
- To verify the agent's response, click **Knowledge** to view the documents stored in the OpenRAG OpenSearch vector database.
-You can click a document to view the chunks of the document as they are stored in the database.
-
-4. Click **Add Knowledge** to add your own documents to your OpenRAG knowledge base.
-
- For this quickstart, use either the **File** or **Folder** upload options to load documents from your local machine.
- **Folder** uploads an entire directory.
- The default directory is the `/documents` subdirectory in your OpenRAG installation directory.
-
- For information about the cloud storage provider options, see [Ingest files through OAuth connectors](/ingestion#oauth-ingestion).
-
-
-
-
-
-
-
-## ingest
-
-OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth connectors.
-
-Knowledge ingestion is powered by OpenRAG's built-in [knowledge ingestion flows](/ingestion#knowledge-ingestion-flows) that use Docling Serve to process documents before storing the documents in your OpenSearch database.
+OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth authenticated connections.
+Knowledge ingestion is powered by OpenRAG's built-in knowledge ingestion flows that use Docling to process documents before storing the documents in your OpenSearch database.
During ingestion, documents are broken into smaller chunks of content that are then embedded using your selected [embedding model](/knowledge#set-the-embedding-model-and-dimensions).
-The chunks, embeddings, and associated metadata (which connects chunks of the same document) are stored in your OpenSearch database.
+Then, the chunks, embeddings, and associated metadata (which connects chunks of the same document) are stored in your OpenSearch database.
-Like all [OpenRAG flows](/agents), you can [inspect the flows in Langflow](/agents#inspect-and-modify-flows), and you can customize them if you want to change the [ingestion settings](/ingestion#knowledge-ingestion-settings).
+Like all [OpenRAG flows](/agents), you can [inspect the flows in Langflow](/agents#inspect-and-modify-flows), and you can customize them if you want to change the knowledge ingestion settings.
-## Ingest local files and folders {#knowledge-ingestion-flows}
+## Ingest local files and folders
-
+You can upload files and folders from your local machine to your knowledge base. When you do this, the **OpenSearch Ingestion** flow runs in the background.
-The **OpenSearch Ingestion** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
+1. Click **Knowledge** to view your OpenSearch knowledge base.
-The default path to your local folder is mounted from the `./documents` folder in your OpenRAG project directory to the `/app/documents/` directory inside the Docker container. Files added to the host or the container will be visible in both locations. To configure this location, modify the **Documents Paths** variable in either the TUI's [Advanced Setup](/install#setup) menu or in the `.env` used by Docker Compose.
+2. Click **Add Knowledge** to add your own documents to your OpenRAG knowledge base.
-To load and process a single file from the mapped location, click **Add Knowledge**, and then click **File**.
-The file is loaded into your OpenSearch database, and appears in the Knowledge page.
+3. To upload one file, click **File**. To upload all documents in a folder, click **Folder**.
-To load and process a directory from the mapped location, click **Add Knowledge**, and then click **Folder**.
-The files are loaded into your OpenSearch database, and appear in the Knowledge page.
+ The default path for either **File** or **Folder** uploads is the `/documents` subdirectory in your OpenRAG installation directory.
-To add files directly to a chat session, click in the chat input and select the files you want to include. Files added this way are processed and made available to the agent for the current conversation, and are not permanently added to the knowledge base.
+### Ingest local files temporarily
+
+<PartialTempKnowledge />
+
### OpenSearch Ingestion flow
@@ -66,6 +43,8 @@ The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in Ope
If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll see that it is comprised of ten components that work together to process and store documents in your knowledge base:
+* The **OpenSearch Ingestion** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
+The default path to your local folder is mounted from the `./documents` folder in your OpenRAG project directory to the `/app/documents/` directory inside the Docker container. Files added to the host or the container will be visible in both locations. To configure this location, modify the **Documents Paths** variable in either the TUI's [Advanced Setup](/install#setup) menu or in the `.env` used by Docker Compose.
* The [**Docling Serve** component](https://docs.langflow.org/bundles-docling) processes input documents by connecting to your instance of Docling Serve.
* The [**Export DoclingDocument** component](https://docs.langflow.org/components-docling) exports the processed DoclingDocument to markdown format with image export mode set to placeholder. This conversion makes the structured document data into a standardized format for further processing.
* Three [**DataFrame Operations** components](https://docs.langflow.org/components-processing#dataframe-operations) sequentially add metadata columns to the document data of `filename`, `file_size`, and `mimetype`.
@@ -152,10 +131,11 @@ For each active task, depending on its state, you can find the task ID, start ti
* **Processing**: The task is performing ingestion operations.
* **Failed**: Something went wrong during ingestion, or the task was manually canceled.
+For troubleshooting advice, see [Troubleshoot ingestion](#troubleshoot-ingestion).
To stop an active task, click **Cancel**. Canceling a task stops processing immediately and marks the task as **Failed**.
-## Troubleshoot ingestion (#troubleshoot-ingestion)
+## Troubleshoot ingestion {#troubleshoot-ingestion}
If an ingestion task fails, do the following:
@@ -176,20 +156,24 @@ For example:
* Make sure scanned documents are legible enough for extraction, and enable the **OCR** option. Poorly scanned documents might require additional preparation or rescanning before ingestion.
* Adjust the **Chunk Size** and **Chunk Overlap** settings to better suit your documents. Larger chunks provide more context but can include irrelevant information, while smaller chunks yield more precise semantic search but can lack context.
-For more information about modifying ingestion parameters and flows, see [Docling Serve for knowledge ingestion](/knowledge#docling-serve-for-knowledge-ingestion).
+For more information about modifying ingestion parameters and flows, see [Knowledge ingestion settings](#knowledge-ingestion-settings).
-## Docling Serve for knowledge ingestion {#docling-serve-for-knowledge-ingestion}
-
-
+## Knowledge ingestion settings {#knowledge-ingestion-settings}
OpenRAG uses [Docling](https://docling-project.github.io/docling/) for document ingestion.
-More specifically, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), which starts a `docling serve` process on your local machine and runs Docling ingestion through an API service.
+
+You can use either Docling Serve or OpenRAG's built-in Docling ingestion pipeline to process documents.
+
+
+
+
+When OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), it starts a `docling serve` process on your local machine and runs Docling ingestion through an API service.
Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch `documents` index.
OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.
-### Knowledge ingestion settings {#knowledge-ingestion-settings}
+The following knowledge ingestion settings apply only to the Docling Serve option.
To modify OpenRAG's ingestion settings, including the Docling settings and ingestion flows, click **Settings**.
@@ -219,23 +203,29 @@ If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the
**Picture descriptions** adds image descriptions generated by the [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model to OCR processing. Enabling picture descriptions can slow ingestion performance.
-### Use OpenRAG default ingestion instead of Docling serve
+
+
-If you want to use OpenRAG's built-in pipeline instead of Docling serve, set `DISABLE_INGEST_WITH_LANGFLOW=true` in [Environment variables](/reference/configuration#document-processing).
+If you want to use OpenRAG's built-in Docling ingestion pipeline instead of the separate Docling Serve service, set `DISABLE_INGEST_WITH_LANGFLOW=true` in your [OpenRAG environment variables](/reference/configuration#document-processing).
-The built-in pipeline still uses the Docling processor, but uses it directly without the Docling Serve API.
+The built-in pipeline uses the Docling processor directly instead of through the Docling Serve API.
-For more information, see [`processors.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58).
+For the underlying functionality, see [`processors.py`](https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58) in the OpenRAG repository.
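+
+A minimal sketch of the corresponding `.env` entry follows; the variable name comes from the configuration reference, and the rest of the file is omitted:
+
+```env
+# Use the built-in Docling pipeline instead of the Docling Serve API
+DISABLE_INGEST_WITH_LANGFLOW=true
+```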
+
+
+
## Ingestion performance expectations
+The following performance test was conducted with Docling Serve.
+
On a local VM with 7 vCPUs and 8 GiB RAM, OpenRAG ingested approximately 5.03 GB across 1,083 files in about 42 minutes.
This equates to approximately 2.4 seconds per document, or about 0.43 documents per second.
-You can generally expect equal or better performance on developer laptops and significantly faster on servers.
-Throughput scales with CPU cores, memory, storage speed, and configuration choices such as embedding model, chunk size and overlap, and concurrency.
+You can generally expect equal or better performance on developer laptops, and significantly faster performance on servers.
+Throughput scales with CPU cores, memory, storage speed, and configuration choices, such as the embedding model, chunk size, overlap, and concurrency.
-This test returned 12 errors (approximately 1.1 percent).
+This test returned 12 errors, approximately 1.1 percent of the total files ingested.
All errors were file-specific, and they didn't stop the pipeline.
* Ingestion dataset:
@@ -247,8 +237,8 @@ All errors were file-specific, and they didn't stop the pipeline.
* Machine: Apple M4 Pro
* Podman VM:
- * Name: `podman-machine-default`
- * Type: `applehv`
+ * Name: podman-machine-default
+ * Type: applehv
* vCPUs: 7
* Memory: 8 GiB
* Disk size: 100 GiB
@@ -264,4 +254,10 @@ All errors were file-specific, and they didn't stop the pipeline.
* Elapsed time: Approximately 42 minutes 15 seconds (2,535 seconds)
-* Throughput: Approximately 2.4 documents/second
\ No newline at end of file
+* Throughput: Approximately 2.4 seconds per document (about 0.43 documents per second)
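+
+To sanity-check these figures yourself, a quick arithmetic sketch (`bc` is assumed to be available on your machine):
+
+```bash
+# 1,082 files processed in approximately 2,535 seconds
+echo "scale=4; 2535 / 1082" | bc   # prints 2.3428, about 2.34 seconds per document
+echo "scale=4; 1082 / 2535" | bc   # prints .4268, about 0.43 documents per second
+```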
+
+## See also
+
+* [Configure knowledge](/knowledge)
+* [Filter knowledge](/knowledge-filters)
+* [Chat with knowledge](/chat)
\ No newline at end of file
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index ddca2a8e..1fe9503a 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -10,10 +10,12 @@ import TabItem from '@theme/TabItem';
OpenRAG includes a built-in [OpenSearch](https://docs.opensearch.org/latest/) instance that serves as the underlying datastore for your _knowledge_ (documents).
This specialized database is used to store and retrieve your documents and the associated vector data (embeddings).
-You can [upload documents](/ingestion) from a variety of sources.
+The documents in your OpenSearch knowledge base provide specialized context in addition to the general knowledge available to the language model that you select when you [install OpenRAG](/install) or [edit a flow](/agents).
+
+You can [upload documents](/ingestion) from a variety of sources to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.
Documents are processed through OpenRAG's knowledge ingestion flows with Docling.
-The [OpenRAG **Chat**](/chat) runs [similarity searches](https://www.ibm.com/think/topics/vector-search) against your OpenSearch database to retrieve relevant information and generate context-aware responses.
+Then, the [OpenRAG **Chat**](/chat) can run [similarity searches](https://www.ibm.com/think/topics/vector-search) against your OpenSearch database to retrieve relevant information and generate context-aware responses.
You can configure how documents are ingested and how the **Chat** interacts with your knowledge base.
@@ -61,7 +63,7 @@ To change the embedding model, you must [reinstall OpenRAG](/install#reinstall).
## Set ingestion parameters
-For information about modifying ingestion parameters and flows, see [Knowledge ingestion settings](/ingestion#knowledge-ingestion-settings) and [Knowledge ingestion flows](/ingestion#knowledge-ingestion-flows).
+For information about modifying ingestion parameters and flows, see [Ingest knowledge](/ingestion).
## Delete knowledge
From 0a83ea2e6c405dfabf0849524a6e178462feb1c1 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Wed, 26 Nov 2025 16:22:59 -0800
Subject: [PATCH 11/13] finish knowledge and oauth
---
docs/docs/_partial-ingestion-flow.mdx | 24 ++
docs/docs/core-components/agents.mdx | 2 +-
docs/docs/core-components/ingestion.mdx | 331 ++++++++++++----------
docs/docs/core-components/knowledge.mdx | 104 ++++++-
docs/docs/get-started/install.mdx | 52 ++--
docs/docs/get-started/quickstart.mdx | 3 +-
docs/docs/get-started/what-is-openrag.mdx | 4 +-
docs/docs/support/troubleshoot.mdx | 6 +-
8 files changed, 332 insertions(+), 194 deletions(-)
create mode 100644 docs/docs/_partial-ingestion-flow.mdx
diff --git a/docs/docs/_partial-ingestion-flow.mdx b/docs/docs/_partial-ingestion-flow.mdx
new file mode 100644
index 00000000..f8a77630
--- /dev/null
+++ b/docs/docs/_partial-ingestion-flow.mdx
@@ -0,0 +1,24 @@
+<details>
+<summary>About the OpenSearch Ingestion flow</summary>
+
+When you upload documents locally or with OAuth connectors, the **OpenSearch Ingestion** flow runs in the background.
+By default, this flow uses Docling Serve to import and process documents.
+
+Like all [OpenRAG flows](/agents), you can [inspect the flow in Langflow](/agents#inspect-and-modify-flows), and you can customize it if you want to change the knowledge ingestion settings.
+
+The **OpenSearch Ingestion** flow comprises several components that work together to process and store documents in your knowledge base:
+
+* [**Docling Serve** component](https://docs.langflow.org/bundles-docling#docling-serve): Ingests files and processes them by connecting to OpenRAG's local Docling Serve service. The output is `DoclingDocument` data that contains the extracted text and metadata from the documents.
+* [**Export DoclingDocument** component](https://docs.langflow.org/bundles-docling#export-doclingdocument): Exports processed `DoclingDocument` data to Markdown format with image placeholders. This conversion standardizes the document data in preparation for further processing.
+* [**DataFrame Operations** component](https://docs.langflow.org/components-processing#dataframe-operations): Three of these components run sequentially to add metadata to the document data: `filename`, `file_size`, and `mimetype`.
+* [**Split Text** component](https://docs.langflow.org/components-processing#split-text): Splits the processed text into chunks, based on the configured [chunk size and overlap settings](/knowledge#knowledge-ingestion-settings).
+* **Secret Input** component: If needed, four of these components securely fetch the [OAuth authentication](/knowledge#auth) configuration variables: `CONNECTOR_TYPE`, `OWNER`, `OWNER_EMAIL`, and `OWNER_NAME`.
+* **Create Data** component: Combines the OAuth configuration values from the **Secret Input** components into a structured data object that is associated with the document embeddings.
+* [**Embedding Model** component](https://docs.langflow.org/components-embedding-models): Generates vector embeddings using your selected [embedding model](/knowledge#set-the-embedding-model-and-dimensions).
+* [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch): Stores the processed documents and their embeddings in a `documents` index of your OpenRAG [OpenSearch knowledge base](/knowledge).
+
+ The default address for the OpenSearch instance is `https://opensearch:9200`. To change this address, edit the `OPENSEARCH_PORT` [environment variable](/reference/configuration#opensearch-settings).
+
+ The default authentication method is JSON Web Token (JWT) authentication. If you [edit the flow](/agents#inspect-and-modify-flows), you can select `basic` auth mode, which uses the `OPENSEARCH_USERNAME` and `OPENSEARCH_PASSWORD` [environment variables](/reference/configuration#opensearch-settings) for authentication instead of JWT.
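+
+ For example, a minimal sketch of the related `.env` entries; the variable names come from the configuration reference, and the values are placeholders:
+
+ ```env
+ OPENSEARCH_PORT=9200
+ OPENSEARCH_USERNAME=admin
+ OPENSEARCH_PASSWORD=your-opensearch-password
+ ```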
+
+</details>
\ No newline at end of file
diff --git a/docs/docs/core-components/agents.mdx b/docs/docs/core-components/agents.mdx
index 8def9af5..405dbd42 100644
--- a/docs/docs/core-components/agents.mdx
+++ b/docs/docs/core-components/agents.mdx
@@ -46,7 +46,7 @@ For example, to view and edit the built-in **Chat** flow (the **OpenRAG OpenSear
If you modify the built-in **Chat** flow, make sure you click in the **Conversations** tab to start a new conversation. This ensures that the chat doesn't persist any context from the previous conversation with the original flow settings.
:::
-### Revert a built-in flow to its default state
+### Revert a built-in flow to its original configuration {#revert-a-built-in-flow-to-its-original-configuration}
After you edit a built-in flow, you can click **Restore flow** on the **Settings** page to revert the flow to its original state when you first installed OpenRAG.
This is a destructive action that discards all customizations to the flow.
diff --git a/docs/docs/core-components/ingestion.mdx b/docs/docs/core-components/ingestion.mdx
index cb63a02f..a745be0c 100644
--- a/docs/docs/core-components/ingestion.mdx
+++ b/docs/docs/core-components/ingestion.mdx
@@ -7,21 +7,22 @@ import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import PartialTempKnowledge from '@site/docs/_partial-temp-knowledge.mdx';
+import PartialIngestionFlow from '@site/docs/_partial-ingestion-flow.mdx';
Upload documents to your [OpenRAG OpenSearch instance](/knowledge) to populate your knowledge base with unique content, such as your own company documents, research papers, or websites.
Documents are processed through OpenRAG's knowledge ingestion flows with Docling.
-OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth authenticated connections.
+OpenRAG can ingest knowledge from direct file uploads, URLs, and OAuth authenticated connectors.
Knowledge ingestion is powered by OpenRAG's built-in knowledge ingestion flows that use Docling to process documents before storing the documents in your OpenSearch database.
During ingestion, documents are broken into smaller chunks of content that are then embedded using your selected [embedding model](/knowledge#set-the-embedding-model-and-dimensions).
Then, the chunks, embeddings, and associated metadata (which connects chunks of the same document) are stored in your OpenSearch database.
-Like all [OpenRAG flows](/agents), you can [inspect the flows in Langflow](/agents#inspect-and-modify-flows), and you can customize them if you want to change the knowledge ingestion settings.
+To modify chunking behavior and other ingestion settings, see [Knowledge ingestion settings](/knowledge#knowledge-ingestion-settings) and [Inspect and modify flows](/agents#inspect-and-modify-flows).
## Ingest local files and folders
-You can upload files and folders from your local machine to your knowledge base. When you do this, the **OpenSearch Ingestion** flow runs in the background.
+You can upload files and folders from your local machine to your knowledge base:
1. Click **Knowledge** to view your OpenSearch knowledge base.
@@ -29,32 +30,156 @@ You can upload files and folders from your local machine to your knowledge base.
3. To upload one file, click **File**. To upload all documents in a folder, click **Folder**.
- The default path for either **File** or **Folder** uploads is the `/documents` subdirectory in your OpenRAG installation directory.
+ The default path is the `./documents` subdirectory in your OpenRAG installation directory.
+ To change this path, see [Set the local documents path](/knowledge#set-the-local-documents-path).
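+
+ For example, to stage local files for a **Folder** upload from the command line, a sketch that assumes the default `./documents` path and a hypothetical source directory:
+
+ ```bash
+ # Copy files into the default documents directory, then select it with the Folder option
+ cp ~/reports/*.pdf ./documents/
+ ```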
-### Ingest local files temporarily
+The selected files are processed in the background through the **OpenSearch Ingestion** flow.
+
+<PartialIngestionFlow />
+
+You can [monitor ingestion](#monitor-ingestion) to see the progress of the uploads and check for failed uploads.
+
+## Ingest local files temporarily
-### OpenSearch Ingestion flow
+## Ingest files with OAuth connectors {#oauth-ingestion}
-
+OpenRAG can use OAuth authenticated connectors to ingest documents from the following external services:
-The **OpenSearch Ingestion** flow is the default knowledge ingestion flow in OpenRAG. When you **Add Knowledge** in OpenRAG, the **OpenSearch Ingestion** flow runs in the background. The flow ingests documents using Docling Serve to import and process documents.
+* AWS S3
+* Google Drive
+* Microsoft OneDrive
+* Microsoft SharePoint
-If you [inspect the flow in Langflow](/agents#inspect-and-modify-flows), you'll see that it is comprised of ten components that work together to process and store documents in your knowledge base:
+These connectors enable seamless ingestion of files from cloud storage to your OpenRAG knowledge base.
-* The **OpenSearch Ingestion** flow uses Langflow's [**File** component](https://docs.langflow.org/components-data#file) to split and embed files loaded from your local machine into the OpenSearch database.
-The default path to your local folder is mounted from the `./documents` folder in your OpenRAG project directory to the `/app/documents/` directory inside the Docker container. Files added to the host or the container will be visible in both locations. To configure this location, modify the **Documents Paths** variable in either the TUI's [Advanced Setup](/install#setup) menu or in the `.env` used by Docker Compose.
-* The [**Docling Serve** component](https://docs.langflow.org/bundles-docling) processes input documents by connecting to your instance of Docling Serve.
-* The [**Export DoclingDocument** component](https://docs.langflow.org/components-docling) exports the processed DoclingDocument to markdown format with image export mode set to placeholder. This conversion makes the structured document data into a standardized format for further processing.
-* Three [**DataFrame Operations** components](https://docs.langflow.org/components-processing#dataframe-operations) sequentially add metadata columns to the document data of `filename`, `file_size`, and `mimetype`.
-* The [**Split Text** component](https://docs.langflow.org/components-processing#split-text) splits the processed text into chunks with a chunk size of 1000 characters and an overlap of 200 characters.
-* Four **Secret Input** components provide secure access to configuration variables: `CONNECTOR_TYPE`, `OWNER`, `OWNER_EMAIL`, and `OWNER_NAME`. These are runtime variables populated from OAuth login.
-* The **Create Data** component combines the secret inputs into a structured data object that will be associated with the document embeddings.
-* The [**Embedding Model** component](https://docs.langflow.org/components-embedding-models) generates vector embeddings using OpenAI's `text-embedding-3-small` model. The embedding model is selected at [Application onboarding] and cannot be changed.
-* The [**OpenSearch** component](https://docs.langflow.org/bundles-elastic#opensearch) stores the processed documents and their embeddings in the `documents` index at `https://opensearch:9200`. By default, the component is authenticated with a JWT token, but you can also select `basic` auth mode, and enter your OpenSearch admin username and password.
+Individual users can connect their personal cloud storage accounts to OpenRAG, and each user must separately authorize OpenRAG to access their own account. When a user connects a cloud storage service, they are redirected to authenticate with that provider and grant OpenRAG permission to sync documents from their storage.
-To customize this flow, see [Inspect and modify flows](/agents#inspect-and-modify-flows).
+### Enable OAuth connectors
+
+Before users can connect their own cloud storage accounts, you must configure the provider's OAuth credentials in OpenRAG. Typically, this requires registering OpenRAG as an OAuth application with your cloud provider, and then obtaining the app's OAuth credentials, such as a client ID and secret key.
+To enable multiple connectors, you must register an app and generate credentials for each provider.
+
+
+
+
+If you use the TUI to manage your OpenRAG containers, provide OAuth credentials in the **Advanced Setup**.
+
+You can do this during [installation](/install#setup), or you can add the credentials afterwards:
+
+1. If OpenRAG is running, stop it: Go to [**Status**](/install#tui-container-management), and then click **Stop Services**.
+
+2. Click **Advanced Setup**, and then add the OAuth credentials for the cloud storage providers that you want to use:
+
+ * **Amazon**: Provide your AWS Access Key ID and AWS Secret Access Key with access to your S3 instance. For more information, see the AWS documentation on [Configuring access to AWS applications](https://docs.aws.amazon.com/singlesignon/latest/userguide/manage-your-applications.html).
+ * **Google**: Provide your Google OAuth Client ID and Google OAuth Client Secret. You can generate these in the [Google Cloud Console](https://console.cloud.google.com/apis/credentials). For more information, see the [Google OAuth client documentation](https://developers.google.com/identity/protocols/oauth2).
+ * **Microsoft**: For the Microsoft OAuth Client ID and Microsoft OAuth Client Secret, provide [Azure application registration credentials for SharePoint and OneDrive](https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/app-registration?view=odsp-graph-online). For more information, see the [Microsoft Graph OAuth client documentation](https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth).
+
+3. Register the redirect URIs shown in the OpenRAG TUI with your OAuth provider.
+These are the URLs that your OAuth provider redirects back to after users authenticate and grant access to their cloud storage.
+
+4. Click **Save Configuration**.
+
+ OpenRAG regenerates the [`.env`](/reference/configuration) file with the given credentials.
+
+5. Click **Start Container Services**.
+
+
+
+
+If you [install OpenRAG with self-managed containers](/docker), set OAuth credentials in the `.env` file for Docker Compose.
+
+You can do this during [initial set up](/docker#install-openrag-with-docker-compose), or you can add the credentials afterwards:
+
+1. Stop your OpenRAG deployment.
+
+
+
+
+ ```bash
+ podman stop --all
+ ```
+
+
+
+
+ ```bash
+ docker stop $(docker ps -q)
+ ```
+
+
+
+
+2. Edit the `.env` file for Docker Compose to add the OAuth credentials for the cloud storage providers that you want to use:
+
+ * **Amazon**: Provide your AWS Access Key ID and AWS Secret Access Key with access to your S3 instance. For more information, see the AWS documentation on [Configuring access to AWS applications](https://docs.aws.amazon.com/singlesignon/latest/userguide/manage-your-applications.html).
+
+ ```env
+ AWS_ACCESS_KEY_ID=
+ AWS_SECRET_ACCESS_KEY=
+ ```
+
+ * **Google**: Provide your Google OAuth Client ID and Google OAuth Client Secret. You can generate these in the [Google Cloud Console](https://console.cloud.google.com/apis/credentials). For more information, see the [Google OAuth client documentation](https://developers.google.com/identity/protocols/oauth2).
+
+ ```env
+ GOOGLE_OAUTH_CLIENT_ID=
+ GOOGLE_OAUTH_CLIENT_SECRET=
+ ```
+
+ * **Microsoft**: For the Microsoft OAuth Client ID and Microsoft OAuth Client Secret, provide [Azure application registration credentials for SharePoint and OneDrive](https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/app-registration?view=odsp-graph-online). For more information, see the [Microsoft Graph OAuth client documentation](https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth).
+
+ ```env
+ MICROSOFT_GRAPH_OAUTH_CLIENT_ID=
+ MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET=
+ ```
+
+3. Save the `.env` file.
+
+4. Restart your OpenRAG deployment:
+
+
+
+
+ ```bash
+ podman-compose up -d
+ ```
+
+
+
+
+ ```bash
+ docker-compose up -d
+ ```
+
+
+
+
+
+
+
+### Authenticate and ingest files from cloud storage
+
+After you start OpenRAG with OAuth connectors enabled, each user is prompted to authenticate with the OAuth provider upon accessing your OpenRAG instance.
+Individual authentication is required to access a user's cloud storage from your OpenRAG instance.
+For example, if a user navigates to the default OpenRAG URL at `http://localhost:3000`, they are redirected to the OAuth provider's sign-in page.
+After authenticating and granting the required permissions for OpenRAG, the user is redirected back to OpenRAG.
+
+To ingest knowledge with an OAuth connector, do the following:
+
+1. Click **Knowledge** to view your OpenSearch knowledge base.
+
+2. Click **Add Knowledge**, and then select a storage provider.
+
+3. On the **Add Cloud Knowledge** page, click **Add Files**, and then select the files and folders to ingest from the connected storage.
+
+4. Click **Ingest Files**.
+
+The selected files are processed in the background through the **OpenSearch Ingestion** flow.
+
+<PartialIngestionFlow />
+
+You can [monitor ingestion](#monitor-ingestion) to see the progress of the uploads and check for failed uploads.
## Ingest knowledge from URLs {#url-flow}
@@ -63,62 +188,15 @@ This flow isn't directly accessible from the OpenRAG user interface.
Instead, this flow is called by the [**OpenRAG OpenSearch Agent** flow](/chat#flow) as a Model Context Protocol (MCP) tool.
The agent can call this component to fetch web content from a given URL, and then ingest that content into your OpenSearch knowledge base.
+Like all OpenRAG flows, you can [inspect the flow in Langflow](/agents#inspect-and-modify-flows), and you can customize it.
+
For more information about MCP in Langflow, see the Langflow documentation on [MCP clients](https://docs.langflow.org/mcp-client) and [MCP servers](https://docs.langflow.org/mcp-tutorial).
-## Ingest files through OAuth connectors {#oauth-ingestion}
-
-OpenRAG supports Google Drive, OneDrive, and Sharepoint as OAuth connectors for seamless document synchronization.
-
-OAuth integration allows individual users to connect their personal cloud storage accounts to OpenRAG. Each user must separately authorize OpenRAG to access their own cloud storage files. When a user connects a cloud service, they are redirected to authenticate with that service provider and grant OpenRAG permission to sync documents from their personal cloud storage.
-
-Before users can connect their cloud storage accounts, you must configure OAuth credentials in OpenRAG. This requires registering OpenRAG as an OAuth application with a cloud provider and obtaining client ID and secret keys for each service you want to support.
-
-To add an OAuth connector to OpenRAG, do the following.
-This example uses Google OAuth.
-If you wish to use another provider, add the secrets to another provider.
-
-
-
- 1. If OpenRAG is running, stop it with **Status** > **Stop Services**.
- 2. Click **Advanced Setup**.
- 3. Add the OAuth provider's client and secret key in the [Advanced Setup](/install#setup) menu.
- 4. Click **Save Configuration**.
- The TUI generates a new `.env` file with your OAuth values.
- 5. Click **Start Container Services**.
-
-
- 1. Stop the Docker deployment.
- 2. Add the OAuth provider's client and secret key in the `.env` file for Docker Compose.
- ```bash
- GOOGLE_OAUTH_CLIENT_ID='YOUR_OAUTH_CLIENT_ID'
- GOOGLE_OAUTH_CLIENT_SECRET='YOUR_OAUTH_CLIENT_SECRET'
- ```
- 3. Save your `.env` file.
- 4. Start the Docker deployment.
-
-
-
-The OpenRAG frontend at `http://localhost:3000` now redirects to an OAuth callback login page for your OAuth provider.
-A successful authentication opens OpenRAG with the required scopes for your connected storage.
-
-To add knowledge from an OAuth-connected storage provider, do the following:
-
-1. Click **Add Knowledge**, and then select the storage provider, for example, **Google Drive**.
-The **Add Cloud Knowledge** page opens.
-2. To add files or folders from the connected storage, click **Add Files**.
-Select the files or folders you want and click **Select**.
-You can select multiple files.
-3. When your files are selected, click **Ingest Files**.
-The ingestion process can take some time depending on the size of your documents.
-4. When ingestion is complete, your documents are available in the Knowledge screen.
-
-If ingestion fails, click **Status** to view the logged error.
-
## Monitor ingestion
Document ingestion tasks run in the background.
-In the OpenRAG UI, a badge is shown on **Tasks** when OpenRAG tasks are active.
+In the OpenRAG user interface, a badge is shown on **Tasks** when OpenRAG tasks are active.
Click **Tasks** to inspect and cancel tasks:
* **Active Tasks**: All tasks that are **Pending**, **Running**, or **Processing**.
@@ -135,87 +213,7 @@ For troubleshooting advice, see [Troubleshoot ingestion](#troubleshoot-ingestion
To stop an active task, click **Cancel**. Canceling a task stops processing immediately and marks the task as **Failed**.
-## Troubleshoot ingestion {#troubleshoot-ingestion}
-
-If an ingestion task fails, do the following:
-
-* Make sure you are uploading supported file types.
-* Split excessively large files into smaller files before uploading.
-* Remove unusual embedded content, such as videos or animations, before uploading. Although Docling can replace some non-text content with placeholders during ingestion, some embedded content might cause errors.
-
-If the OpenRAG **Chat** doesn't seem to use your documents correctly, [browse your knowledge base](#browse-knowledge) to confirm that the documents are uploaded in full, and the chunks are correct.
-
-If the documents are present and well-formed, check your [knowledge filters](/knowledge-filters).
-If a global filter is applied, make sure the expected documents are included in the global filter.
-If the global filter excludes any documents, the agent cannot access those documents unless you apply a chat-level filter or change the global filter.
-
-If text is missing or incorrectly processed, you need to reupload the documents after modifying the ingestion parameters or the documents themselves.
-For example:
-
-* Break combined documents into separate files for better metadata context.
-* Make sure scanned documents are legible enough for extraction, and enable the **OCR** option. Poorly scanned documents might require additional preparation or rescanning before ingestion.
-* Adjust the **Chunk Size** and **Chunk Overlap** settings to better suit your documents. Larger chunks provide more context but can include irrelevant information, while smaller chunks yield more precise semantic search but can lack context.
-
-For more information about modifying ingestion parameters and flows, see [Knowledge ingestion settings](#knowledge-ingestion-settings).
-
-## Knowledge ingestion settings {#knowledge-ingestion-settings}
-
-OpenRAG uses [Docling](https://docling-project.github.io/docling/) for document ingestion.
-
-You can use either Docling Serve or OpenRAG's built-in Docling ingestion pipeline to process documents.
-
-
-
-
-When OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve), it starts a `docling serve` process on your local machine and runs Docling ingestion through an API service.
-
-Docling ingests documents from your local machine or OAuth connectors, splits them into chunks, and stores them as separate, structured documents in the OpenSearch `documents` index.
-
-OpenRAG chose Docling for its support for a wide variety of file formats, high performance, and advanced understanding of tables and images.
-
-The following knowledge ingestion settings only apply to the Docling Serve option:
-
-To modify OpenRAG's ingestion settings, including the Docling settings and ingestion flows, **Settings**.
-
-These settings configure the Docling ingestion parameters.
-
-OpenRAG will warn you if `docling serve` is not running.
-To start or stop `docling serve` or any other native services, in the TUI main menu, click **Start Native Services** or **Stop Native Services**.
-
-**Embedding model** determines which AI model is used to create vector embeddings. The default is the OpenAI `text-embedding-3-small` model.
-
-**Chunk size** determines how large each text chunk is in number of characters.
-Larger chunks yield more context per chunk, but can include irrelevant information. Smaller chunks yield more precise semantic search, but can lack context.
-The default value of `1000` characters provides a good starting point that balances these considerations.
-
-**Chunk overlap** controls the number of characters that overlap over chunk boundaries.
-Use larger overlap values for documents where context is most important, and use smaller overlap values for simpler documents, or when optimization is most important.
-The default value of 200 characters of overlap with a chunk size of 1000 (20% overlap) is suitable for general use cases. Decrease the overlap to 10% for a more efficient pipeline, or increase to 40% for more complex documents.
-
-**Table Structure** enables Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/) tool for parsing tables. Instead of treating tables as plain text, tables are output as structured table data with preserved relationships and metadata. **Table Structure** is enabled by default.
-
-**OCR** enables or disabled OCR processing when extracting text from images and scanned documents.
-OCR is disabled by default. This setting is best suited for processing text-based documents as quickly as possible with Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/). Images are ignored and not processed.
-
-Enable OCR when you are processing documents containing images with text that requires extraction, or for scanned documents. Enabling OCR can slow ingestion performance.
-
-If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the [ocrmac](https://www.piwheels.org/project/ocrmac/) OCR engine. Other platforms use [easyocr](https://www.jaided.ai/easyocr/).
-
-**Picture descriptions** adds image descriptions generated by the [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model to OCR processing. Enabling picture descriptions can slow ingestion performance.
-
-
-
-
-If you want to use OpenRAG's built-in Docling ingestion pipeline instead of the separate Docling Serve service, set `DISABLE_INGEST_WITH_LANGFLOW=true` in your [OpenRAG environment variables](/reference/configuration#document-processing).
-
-The built-in pipeline uses the Docling processor directly instead of through the Docling Serve API.
-
-For the underlying functionality, see [`processors.py`](https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58) in the OpenRAG repository.
-
-
-
-
-## Ingestion performance expectations
+### Ingestion performance expectations
The following performance test was conducted with Docling Serve.
@@ -228,6 +226,9 @@ Throughput scales with CPU cores, memory, storage speed, and configuration choic
This test returned 12 errors, approximately 1.1 percent of the total files ingested.
All errors were file-specific, and they didn't stop the pipeline.
+
+<details>
+<summary>Ingestion performance test details</summary>
+
* Ingestion dataset:
* Total files: 1,083 items mounted
@@ -256,8 +257,34 @@ All errors were file-specific, and they didn't stop the pipeline.
* Throughput: Approximately 2.4 seconds per document (about 0.43 documents per second)
+
+</details>
+
+## Troubleshoot ingestion {#troubleshoot-ingestion}
+
+If an ingestion task fails, do the following:
+
+* Make sure you are uploading supported file types.
+* Split excessively large files into smaller files before uploading.
+* Remove unusual embedded content, such as videos or animations, before uploading. Although Docling can replace some non-text content with placeholders during ingestion, some embedded content might cause errors.
+
+If the OpenRAG **Chat** doesn't seem to use your documents correctly, [browse your knowledge base](/knowledge#browse-knowledge) to confirm that the documents are uploaded in full and the chunks are correct.
+
+If the documents are present and well-formed, check your [knowledge filters](/knowledge-filters).
+If a global filter is applied, make sure the expected documents are included in the global filter.
+If the global filter excludes any documents, the agent cannot access those documents unless you apply a chat-level filter or change the global filter.
+
+If text is missing or incorrectly processed, you need to reupload the documents after modifying the ingestion parameters or the documents themselves.
+For example:
+
+* Break combined documents into separate files for better metadata context.
+* Make sure scanned documents are legible enough for extraction, and enable the **OCR** option. Poorly scanned documents might require additional preparation or rescanning before ingestion.
+* Adjust the **Chunk Size** and **Chunk Overlap** settings to better suit your documents. Larger chunks provide more context but can include irrelevant information, while smaller chunks yield more precise semantic search but can lack context.
+
+For more information about modifying ingestion parameters and flows, see [Knowledge ingestion settings](/knowledge#knowledge-ingestion-settings).
+
## See also
* [Configure knowledge](/knowledge)
* [Filter knowledge](/knowledge-filters)
-* [Chat with knowledge](/chat)
\ No newline at end of file
+* [Chat with knowledge](/chat)
+* [Inspect and modify flows](/agents#inspect-and-modify-flows)
\ No newline at end of file
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index 1fe9503a..ade572c6 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -21,12 +21,13 @@ You can configure how documents are ingested and how the **Chat** interacts with
## Browse knowledge {#browse-knowledge}
-The **Knowledge** page lists the documents OpenRAG has ingested into your OpenSearch database, specifically in the `documents` index.
+The **Knowledge** page lists the documents OpenRAG has ingested into your OpenSearch database, specifically in an [OpenSearch index](https://docs.opensearch.org/latest/getting-started/intro/#index) named `documents`.
To explore the raw contents of your knowledge base, click **Knowledge** to get a list of all ingested documents.
Click a document to view the chunks produced from splitting the document during ingestion.
OpenRAG includes some sample documents that you can use to see how the agent references documents in the [**Chat**](/chat).
+You might want to [delete these documents](#delete-knowledge) before uploading your own documents to avoid polluting the agent's context with these samples.
## OpenSearch authentication and document access {#auth}
@@ -36,38 +37,116 @@ The mode you choose determines how OpenRAG authenticates with OpenSearch and con
* **Basic Setup (no-auth mode)**: If you choose **Basic Setup**, then OpenRAG is installed in no-auth mode.
This mode uses one, anonymous JWT token for OpenSearch authentication.
There is no differentiation between users.
-All users that access your OpenRAG instance can access all documents uploaded to your OpenSearch `documents` index.
+All users that access your OpenRAG instance can access all documents uploaded to your OpenSearch knowledge base.
* **Advanced Setup (OAuth mode)**: If you choose **Advanced Setup**, then OpenRAG is installed in OAuth mode.
This mode uses a unique JWT token for each OpenRAG user, and each document is tagged with user ownership. Documents are filtered by user owner.
This means users see only the documents that they uploaded or have access to.
You can enable OAuth mode after installation.
-For more information, see [Ingest files through OAuth connectors](/ingestion#oauth-ingestion).
+For more information, see [Ingest files with OAuth connectors](/ingestion#oauth-ingestion).
-## Set the embedding model and dimensions {#set-the-embedding-model-and-dimensions}
+## OpenSearch indexes
+
+An [OpenSearch index](https://docs.opensearch.org/latest/getting-started/intro/#index) is a collection of documents in an OpenSearch database.
+
+By default, all documents you upload to your OpenRAG knowledge base are stored in an index named `documents`.
+
+You can change the index name by [editing the ingestion flow](/agents#inspect-and-modify-flows).
+However, this can impact dependent processes, such as the [filters](/knowledge-filters) and [**Chat**](/chat) flow, that reference the `documents` index by default.
+Make sure you edit other flows as needed to ensure all processes use the same index name.
+
+If you encounter errors or unexpected behavior after changing the index name, you can [revert the flows to their original configuration](/agents#revert-a-built-in-flow-to-its-original-configuration), or [delete knowledge](/knowledge#delete-knowledge) to clear the existing documents from your knowledge base.
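+
+To confirm what is stored in an index, you can query OpenSearch directly. The following sketch assumes OpenSearch's default local address and the `admin` user with the password generated during setup; adjust it for your deployment:
+
+```bash
+# Count the documents stored in the default `documents` index
+# (the address, user, and password variable are assumptions)
+curl -k -u "admin:$OPENSEARCH_PASSWORD" "https://localhost:9200/documents/_count"
+```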
+
+## Knowledge ingestion settings {#knowledge-ingestion-settings}
+
+:::warning
+Knowledge ingestion settings apply to documents you upload after making the changes.
+Documents uploaded before changing these settings aren't reprocessed.
+To ensure consistency across your knowledge base, you must reupload all documents after adjusting any of these settings.
+:::
+
+### Set the embedding model and dimensions {#set-the-embedding-model-and-dimensions}
When you [install OpenRAG](/install), you select an embedding model during **Application Onboarding**.
OpenRAG automatically detects and configures the appropriate vector dimensions for your selected embedding model, ensuring optimal search performance and compatibility.
In the OpenRAG repository, you can find the complete list of supported models in [`models_service.py`](https://github.com/langflow-ai/openrag/blob/main/src/services/models_service.py) and the corresponding vector dimensions in [`settings.py`](https://github.com/langflow-ai/openrag/blob/main/src/config/settings.py).
-The default embedding dimension is `1536` and the default model is `text-embedding-3-small`.
+The default embedding dimension is `1536`, and the default model is OpenAI's `text-embedding-3-small`.
You can use any supported or unsupported embedding model by specifying the model in your OpenRAG configuration during installation.
If you use an unsupported embedding model that doesn't have defined dimensions in `settings.py`, then OpenRAG falls back to the default dimensions (1536) and logs a warning. OpenRAG's OpenSearch instance and flows continue to work, but [similarity search](https://www.ibm.com/think/topics/vector-search) quality can be affected if the actual model dimensions aren't 1536.
-The embedding model setting is immutable.
-To change the embedding model, you must [reinstall OpenRAG](/install#reinstall).
+The embedding model you choose during **Application Onboarding** is immutable and can only be changed by [reinstalling OpenRAG](/install#reinstall).
+Alternatively, you can [edit the OpenRAG flows](/agents#inspect-and-modify-flows) for knowledge ingestion and chat. Make sure all flows use the same embedding model.
-## Set ingestion parameters
+### Set Docling parameters
-For information about modifying ingestion parameters and flows, see [Ingest knowledge](/ingestion).
+OpenRAG uses [Docling](https://docling-project.github.io/docling/) for document ingestion because it supports many file formats, processes tables and images well, and performs efficiently.
-## Delete knowledge
+When you [upload documents](/ingestion), Docling processes the files, splits them into chunks, and stores them as separate, structured documents in your OpenSearch knowledge base.
-To clear your entire knowledge base, you can delete the contents of the `./opensearch-data` folder in your OpenRAG installation directory, or you can [reset the OpenRAG containers](/install#tui-container-management).
+You can use either Docling Serve or OpenRAG's built-in Docling ingestion pipeline to process documents.
+
+
+
+
+By default, OpenRAG uses [Docling Serve](https://github.com/docling-project/docling-serve).
+This means that OpenRAG starts a `docling serve` process on your local machine and runs Docling ingestion through an API service.
+
+
+
+
+If you want to use OpenRAG's built-in Docling ingestion pipeline instead of the separate Docling Serve service, set `DISABLE_INGEST_WITH_LANGFLOW=true` in your [OpenRAG environment variables](/reference/configuration#document-processing).
+
+The built-in pipeline calls the Docling processor directly instead of going through the Docling Serve API.
+
+For the underlying functionality, see [`processors.py`](https://github.com/langflow-ai/openrag/blob/main/src/models/processors.py#L58) in the OpenRAG repository.
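+
+For example, a minimal `.env` change to switch pipelines (restart OpenRAG afterward for it to take effect):
+
+```bash
+# Bypass Docling Serve and run Docling directly in OpenRAG
+DISABLE_INGEST_WITH_LANGFLOW=true
+```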
+
+
+
+
+To modify the Docling ingestion and embedding parameters, click **Settings** in the OpenRAG user interface.
+
+:::tip
+OpenRAG warns you if `docling serve` isn't running.
+You can [start and stop OpenRAG services](/install#tui-container-management) from the TUI main menu with **Start Native Services** or **Stop Native Services**.
+:::
+
+* **Embedding model**: Select the model to use to generate vector embeddings for your documents. This is initially set during installation.
+The recommended way to change this setting is by [reinstalling OpenRAG](/install#reinstall).
+If you change this value by directly [editing the flow](/agents#inspect-and-modify-flows), you must also change the embedding model in other [OpenRAG flows](/agents) to ensure that similarity search results are consistent.
+If you uploaded documents prior to changing the embedding model, you must either [create filters](/knowledge-filters) to prevent mixing documents embedded with different models, or you must reupload all documents to regenerate embeddings with the new model.
+
+* **Chunk size**: Set the number of characters for each text chunk when breaking down a file.
+Larger chunks yield more context per chunk, but can include irrelevant information. Smaller chunks yield more precise semantic search, but can lack context.
+The default value is 1000 characters, which is usually a good balance between context and precision.
+
+* **Chunk overlap**: Set the number of characters that adjacent chunks share across chunk boundaries.
+Use larger overlap values for documents where context is most important. Use smaller overlap values for simpler documents or when optimization is most important.
+The default value is 200 characters, which represents an overlap of 20 percent if the **Chunk size** is 1000. This is suitable for general use. For faster processing, decrease the overlap to approximately 10 percent. For more complex documents where you need to preserve context across chunks, increase it to approximately 40 percent.
+
+* **Table Structure**: Enables Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/) tool for parsing tables. Instead of treating tables as plain text, tables are output as structured table data with preserved relationships and metadata. This option is enabled by default.
+
+* **OCR**: Enables Optical Character Recognition (OCR) processing to extract text from images and scanned documents. When this option is disabled, Docling's [`DocumentConverter`](https://docling-project.github.io/docling/reference/document_converter/) processes text-based documents faster, but images are ignored and not processed.
+
+ This option is disabled by default. Enabling OCR can slow ingestion performance.
+
+ If OpenRAG detects that the local machine is running on macOS, OpenRAG uses the [ocrmac](https://www.piwheels.org/project/ocrmac/) OCR engine. Other platforms use [easyocr](https://www.jaided.ai/easyocr/).
+
+* **Picture descriptions**: Only applicable if **OCR** is enabled. Adds image descriptions generated by the [`SmolVLM-256M-Instruct`](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) model. Enabling picture descriptions can slow ingestion performance.
+
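+Several of these options correspond to environment variables described in the [configuration reference](/reference/configuration#document-processing). For example, a `.env` sketch with illustrative values:
+
+```bash
+# Characters per chunk and overlap between adjacent chunks
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
+# OCR for scanned documents, plus generated image descriptions
+OCR_ENABLED=true
+PICTURE_DESCRIPTIONS_ENABLED=true
+```
+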
+### Set the local documents path {#set-the-local-documents-path}
+
+The default path for local uploads is the `./documents` subdirectory in your OpenRAG installation directory. This is mounted to the `/app/documents/` directory inside the OpenRAG container. Files added to the host or container directory are visible in both locations.
+
+To change this location, modify the **Documents Paths** variable in either the [**Advanced Setup** menu](/install#setup) or in the `.env` used by Docker Compose.
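+
+For example, in the `.env` file, using the `OPENRAG_DOCUMENTS_PATHS` variable from the [configuration reference](/reference/configuration#document-processing) (the path shown is hypothetical):
+
+```bash
+# Ingest local files from a custom directory
+OPENRAG_DOCUMENTS_PATHS=/home/user/my-documents
+```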
+
+## Delete knowledge {#delete-knowledge}
+
+To clear your entire knowledge base, you can delete the contents of the `./opensearch-data` folder in your OpenRAG installation directory, or you can [reset the containers](/install#tui-container-management).
Be aware that both of these operations are destructive and cannot be undone.
In particular, resetting containers reverts your OpenRAG instance to the initial state as though it were a fresh installation.
@@ -76,4 +155,5 @@ In particular, resetting containers reverts your OpenRAG instance to the initial
* [Ingest knowledge](/ingestion)
* [Filter knowledge](/knowledge-filters)
-* [Chat with knowledge](/chat)
\ No newline at end of file
+* [Chat with knowledge](/chat)
+* [Inspect and modify flows](/agents#inspect-and-modify-flows)
\ No newline at end of file
diff --git a/docs/docs/get-started/install.mdx b/docs/docs/get-started/install.mdx
index 99e9fb74..7e2c1e09 100644
--- a/docs/docs/get-started/install.mdx
+++ b/docs/docs/get-started/install.mdx
@@ -180,18 +180,15 @@ If you encounter errors during installation, see [Troubleshoot OpenRAG](/support
## Set up OpenRAG with the TUI {#setup}
-The TUI creates a `.env` file in your OpenRAG directory root and starts OpenRAG.
-If the TUI detects a `.env` file in the OpenRAG root directory, it sources any variables from the `.env` file.
-If the TUI detects OAuth credentials, it enforces the **Advanced Setup** path.
+The OpenRAG setup process creates a `.env` file at the root of your OpenRAG directory, and then starts OpenRAG.
+If the setup process detects an existing `.env` file in the OpenRAG root directory, it sources any variables from that file.
+
+The TUI offers two setup methods to populate the required values. **Basic Setup** can generate all of the minimum required values for OpenRAG; however, it doesn't enable [OAuth connectors for cloud storage](/knowledge#auth). If you want to use OAuth connectors to upload documents from cloud storage, select **Advanced Setup**.
+If OpenRAG detects OAuth credentials, it recommends **Advanced Setup**.
- **Basic Setup** can generate all of the required values for OpenRAG. The OpenAI API key is optional and can be provided during onboarding.
- **Basic Setup** does not set up OAuth connections for ingestion from cloud providers.
- For OAuth setup, use **Advanced Setup**.
- For information about the difference between basic (no auth) and OAuth in OpenRAG, see [OpenSearch authentication and document access](/knowledge#auth).
-
1. To install OpenRAG with **Basic Setup**, click **Basic Setup** or press 1.
2. Click **Generate Passwords** to generate passwords for OpenSearch and Langflow.
@@ -215,16 +212,21 @@ If the TUI detects OAuth credentials, it enforces the **Advanced Setup** path.
- 1. To install OpenRAG with **Advanced Setup**, click **Advanced Setup** or press 2.
+ 1. To install OpenRAG with **Advanced Setup**, click **Advanced Setup** or press 2.
2. Click **Generate Passwords** to generate passwords for OpenSearch and Langflow.
-
+
The OpenSearch password is required. The Langflow admin password is optional.
If no Langflow admin password is generated, Langflow runs in [autologin mode](https://docs.langflow.org/api-keys-and-authentication#langflow-auto-login) with no password required.
-
+
3. Paste your OpenAI API key in the OpenAI API key field.
- 4. Add your client and secret values for Google or Microsoft OAuth.
- These values can be found with your OAuth provider.
- For more information, see the [Google OAuth client](https://developers.google.com/identity/protocols/oauth2) or [Microsoft Graph OAuth client](https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth) documentation.
+ 4. If you want to upload documents from external storage, such as Google Drive, add the required OAuth credentials for the connectors that you want to use. These settings can be populated automatically if OpenRAG detects these credentials in a `.env` file in the OpenRAG installation directory.
+
+ * **Amazon**: Provide an AWS Access Key ID and AWS Secret Access Key with access to your S3 buckets. For more information, see the AWS documentation on [Configuring access to AWS applications](https://docs.aws.amazon.com/singlesignon/latest/userguide/manage-your-applications.html).
+ * **Google**: Provide your Google OAuth Client ID and Google OAuth Client Secret. You can generate these in the [Google Cloud Console](https://console.cloud.google.com/apis/credentials). For more information, see the [Google OAuth client documentation](https://developers.google.com/identity/protocols/oauth2).
+ * **Microsoft**: For the Microsoft OAuth Client ID and Microsoft OAuth Client Secret, provide [Azure application registration credentials for SharePoint and OneDrive](https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/app-registration?view=odsp-graph-online). For more information, see the [Microsoft Graph OAuth client documentation](https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth).
+
+ You can [manage OAuth credentials](/ingestion#oauth-ingestion) later, but it's recommended that you configure them during initial setup.
+
5. The OpenRAG TUI presents redirect URIs for your OAuth app.
These are the URLs your OAuth provider will redirect back to after user sign-in.
Register these redirect values with your OAuth provider as they are presented in the TUI.
@@ -239,21 +241,23 @@ If the TUI detects OAuth credentials, it enforces the **Advanced Setup** path.
8. To start the Docling service, under **Native Services**, click **Start**.
9. To open the OpenRAG application, navigate to the TUI main menu, and then click **Open App**.
Alternatively, in your browser, navigate to `localhost:3000`.
- You are presented with your provider's OAuth sign-in screen.
- After sign-in, you are redirected to the redirect URI.
- Two additional variables are available for Advanced Setup:
+ 10. If you enabled OAuth connectors, you must sign in to your OAuth provider before being redirected to your OpenRAG instance.
- The `LANGFLOW_PUBLIC_URL` controls where the Langflow web interface can be accessed. This is where users interact with their flows in a browser.
+ 11. Two additional variables are available for **Advanced Setup** at this point.
+ Only change these variables if you have a non-default network configuration for your deployment, such as using a reverse proxy or custom domain.
- The `WEBHOOK_BASE_URL` controls where the endpoint for `/connectors/CONNECTOR_TYPE/webhook` will be available.
- This connection enables real-time document synchronization with external services.
+ * `LANGFLOW_PUBLIC_URL`: Sets the base address to access the Langflow web interface. This is where users interact with flows in a browser.
+
+ * `WEBHOOK_BASE_URL`: Sets the base address of the OpenRAG OAuth connector endpoint.
Supported webhook endpoints:
- - Google Drive: `/connectors/google_drive/webhook`
- - OneDrive: `/connectors/onedrive/webhook`
- - SharePoint: `/connectors/sharepoint/webhook`
- 10. Continue with [Application Onboarding](#application-onboarding).
+ - Amazon S3: Not applicable.
+ - Google Drive: `/connectors/google_drive/webhook`
+ - OneDrive: `/connectors/onedrive/webhook`
+ - SharePoint: `/connectors/sharepoint/webhook`
+
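+   For example, with hypothetical public addresses in your `.env` file, the webhook endpoints are served under `WEBHOOK_BASE_URL`:
+
+   ```bash
+   # Hypothetical values; substitute your deployment's public URLs
+   LANGFLOW_PUBLIC_URL=https://langflow.example.com
+   WEBHOOK_BASE_URL=https://openrag.example.com
+   # Google Drive notifications would then arrive at:
+   #   https://openrag.example.com/connectors/google_drive/webhook
+   ```
+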
+ 12. Continue with [Application Onboarding](#application-onboarding).
diff --git a/docs/docs/get-started/quickstart.mdx b/docs/docs/get-started/quickstart.mdx
index 1546aca1..77f61b15 100644
--- a/docs/docs/get-started/quickstart.mdx
+++ b/docs/docs/get-started/quickstart.mdx
@@ -104,7 +104,7 @@ You can click a document to view the chunks of the document as they are stored i
**Folder** uploads an entire directory.
The default directory is the `/documents` subdirectory in your OpenRAG installation directory.
- For information about the cloud storage provider options, see [Ingest files through OAuth connectors](/ingestion#oauth-ingestion).
+ For information about the cloud storage provider options, see [Ingest files with OAuth connectors](/ingestion#oauth-ingestion).
5. Return to the **Chat** window, and then ask a question related to the documents that you just uploaded.
@@ -137,7 +137,6 @@ You can click a document to view the chunks of the document as they are stored i
Click the **Language Model** component, and then change the **Model Name** to a different OpenAI model.
After you edit a built-in flow, you can click **Restore flow** on the **Settings** page to revert the flow to its original state when you first installed OpenRAG.
- This is a destructive action that discards all customizations to the flow.
4. Press Command+S (Ctrl+S) to save your changes.
diff --git a/docs/docs/get-started/what-is-openrag.mdx b/docs/docs/get-started/what-is-openrag.mdx
index 47201ad6..1bb4e66e 100644
--- a/docs/docs/get-started/what-is-openrag.mdx
+++ b/docs/docs/get-started/what-is-openrag.mdx
@@ -17,7 +17,7 @@ OpenRAG connects and amplifies three popular, proven open-source projects into o
* [OpenSearch](https://docs.opensearch.org/latest/): OpenSearch is a community-driven, Apache 2.0-licensed open source search and analytics suite that makes it easy to ingest, search, visualize, and analyze data.
It provides powerful hybrid search capabilities with enterprise-grade security and multi-tenancy support.
- OpenRAG uses OpenSearch as the underlying vector database for storing and retrieving your documents and associated vector data (embeddings). You can ingest documents from a variety of sources, including your local filesystem and OAuth authenticated connections to popular cloud storage services.
+ OpenRAG uses OpenSearch as the underlying vector database for storing and retrieving your documents and associated vector data (embeddings). You can ingest documents from a variety of sources, including your local filesystem and OAuth authenticated connectors to popular cloud storage services.
* [Docling](https://docling-project.github.io/docling/): Docling simplifies document processing, supports many file formats and advanced PDF parsing, and provides seamless integrations with the generative AI ecosystem.
@@ -63,6 +63,6 @@ flowchart TD
* **Docling Serve**: This is a local document processing service managed by the **OpenRAG backend**.
-* **External connectors**: Integrate third-party cloud storage services through OAuth authenticated connections to the **OpenRAG backend**, allowing synchronization of external storage with your OpenSearch knowledge base.
+* **External connectors**: Integrate third-party cloud storage services with OAuth authenticated connectors to the **OpenRAG backend**, allowing you to load documents from external storage to your OpenSearch knowledge base.
* **OpenRAG frontend**: Provides the user interface for interacting with the OpenRAG platform.
\ No newline at end of file
diff --git a/docs/docs/support/troubleshoot.mdx b/docs/docs/support/troubleshoot.mdx
index 3fb26feb..bad30891 100644
--- a/docs/docs/support/troubleshoot.mdx
+++ b/docs/docs/support/troubleshoot.mdx
@@ -210,4 +210,8 @@ After removing the containers, retry the upgrade in the OpenRAG TUI by clicking
```
-
\ No newline at end of file
+
+
+## Document ingestion or similarity search issues
+
+See [Troubleshoot ingestion](/ingestion#troubleshoot-ingestion).
\ No newline at end of file
From 33234925733f7a653dc546149522a218c89cf9c8 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Mon, 1 Dec 2025 16:58:01 -0800
Subject: [PATCH 12/13] peer review pt 1
---
docs/docs/core-components/knowledge.mdx | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index cbf3a2b5..c17fb61a 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -26,8 +26,9 @@ The **Knowledge** page lists the documents OpenRAG has ingested into your OpenSe
To explore the raw contents of your knowledge base, click **Knowledge** to get a list of all ingested documents.
Click a document to view the chunks produced from splitting the document during ingestion.
-OpenRAG includes some sample documents that you can use to see how the agent references documents in the [**Chat**](/chat).
-You might want to [delete these documents](#delete-knowledge) before uploading your own documents to avoid polluting the agent's context with these samples.
+OpenRAG includes some initial documents about OpenRAG. You can use these documents to ask OpenRAG about itself, and to test the [**Chat**](/chat) feature before uploading your own documents.
+If you [delete these documents](#delete-knowledge), you won't be able to ask OpenRAG about itself and its own functionality.
+It is recommended that you keep these documents and use [filters](/knowledge-filters) to separate them from your other knowledge.
## OpenSearch authentication and document access {#auth}
@@ -63,9 +64,15 @@ If you encounter errors or unexpected behavior after changing the index name, yo
:::warning
Knowledge ingestion settings apply to documents you upload after making the changes.
Documents uploaded before changing these settings aren't reprocessed.
-To ensure consistency across your knowledge base, you must reupload all documents after adjusting any of these settings.
:::
+After changing knowledge ingestion settings, determine whether you need to reupload any documents to be consistent with the new settings.
+
+It isn't always necessary to reupload documents after changing knowledge ingestion settings.
+For example, it is typical to upload some documents with OCR enabled and others without OCR enabled.
+
+If needed, you can use [filters](/knowledge-filters) to separate documents that you uploaded with different settings, such as different embedding models.
+
### Set the embedding model and dimensions {#set-the-embedding-model-and-dimensions}
When you [install OpenRAG](/install), you select an embedding model during **Application Onboarding**.
@@ -146,10 +153,8 @@ To change this location, modify the **Documents Paths** variable in either the [
## Delete knowledge {#delete-knowledge}
-To clear your entire knowledge base, you can delete the contents of the `./opensearch-data` folder in your OpenRAG installation directory, or you can [reset the containers](/install#tui-container-management).
-
-Be aware that both of these operations are destructive and cannot be undone.
-In particular, resetting containers reverts your OpenRAG instance to the initial state as though it were a fresh installation.
+To clear your entire knowledge base, delete the contents of the `./opensearch-data` folder in your OpenRAG installation directory.
+This is a destructive operation that can't be undone.
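+
+For example, from your OpenRAG installation directory, with OpenRAG stopped (the exact command is a sketch; double-check the path before running it):
+
+```bash
+# Destructive: permanently removes all ingested knowledge from OpenSearch storage
+rm -rf ./opensearch-data/*
+```
+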
## See also
From dc7588bb7ecd40b41eb761a949650e442f10b862 Mon Sep 17 00:00:00 2001
From: April M <36110273+aimurphy@users.noreply.github.com>
Date: Tue, 2 Dec 2025 07:36:52 -0800
Subject: [PATCH 13/13] peer review pt 2
---
docs/docs/_partial-onboarding.mdx | 33 ++++----
docs/docs/core-components/knowledge.mdx | 24 +++---
docs/docs/get-started/docker.mdx | 6 +-
docs/docs/get-started/install.mdx | 10 +--
docs/docs/reference/configuration.mdx | 102 ++++++++++++++----------
docs/docs/support/troubleshoot.mdx | 2 +-
6 files changed, 100 insertions(+), 77 deletions(-)
diff --git a/docs/docs/_partial-onboarding.mdx b/docs/docs/_partial-onboarding.mdx
index be70e8bf..28680853 100644
--- a/docs/docs/_partial-onboarding.mdx
+++ b/docs/docs/_partial-onboarding.mdx
@@ -1,28 +1,28 @@
import Icon from "@site/src/components/icon/icon";
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
-import PartialOllama from '@site/docs/_partial-ollama.mdx';
+import PartialOllama from '@site/docs/_partial-ollama.mdx';
-## Application onboarding
+## Application onboarding
-The first time you start OpenRAG, whether using the TUI or a `.env` file, you must complete application onboarding.
+The first time you start OpenRAG, regardless of how you installed it, you must complete application onboarding.
-:::warning
-Most values from onboarding can be changed later in the OpenRAG **Settings** page, but there are important restrictions.
-
-The **language model provider** and **embeddings model provider** can only be selected at onboarding.
-To change your provider selection later, you must [reinstall OpenRAG](/install#reinstall).
+Some of the values you set during onboarding, such as the embedding models, can be changed seamlessly later.
+Others are immutable and require you to destroy and recreate the OpenRAG containers.
+For more information, see [Environment variables](/reference/configuration).
You can use different providers for your language model and embedding model, such as Anthropic for the language model and OpenAI for the embeddings model.
-:::
+Additionally, you can set multiple embedding models.
-Choose one LLM provider and complete these steps:
+You only need to complete onboarding for your preferred providers.
+
:::info
- Anthropic does not provide embedding models. If you select Anthropic for your language model, you must then select a different provider for embeddings.
+ Anthropic doesn't provide embedding models. If you select Anthropic for your language model, you must select a different provider for embeddings.
:::
+
1. Enable **Use environment Anthropic API key** to automatically use your key from the `.env` file.
Alternatively, paste an Anthropic API key into the field.
2. Under **Advanced settings**, select your **Language Model**.
@@ -34,6 +34,7 @@ Choose one LLM provider and complete these steps:
+
1. Enable **Get API key from environment variable** to automatically enter your key from the TUI-generated `.env` file.
Alternatively, paste an OpenAI API key into the field.
2. Under **Advanced settings**, select your **Language Model**.
@@ -45,6 +46,7 @@ Choose one LLM provider and complete these steps:
+
1. Complete the fields for **watsonx.ai API Endpoint**, **IBM Project ID**, and **IBM API key**.
These values are found in your IBM watsonx deployment.
2. Under **Advanced settings**, select your **Language Model**.
@@ -56,9 +58,11 @@ Choose one LLM provider and complete these steps:
- :::tip
- Ollama is not included with OpenRAG. To install Ollama, see the [Ollama documentation](https://docs.ollama.com/).
- :::
+
+ :::info
+ Ollama isn't installed with OpenRAG. To install Ollama, see the [Ollama documentation](https://docs.ollama.com/).
+ :::
+
1. To connect to an Ollama server running on your local machine, enter your Ollama server's base URL address.
The default Ollama server address is `http://localhost:11434`.
OpenRAG connects to the Ollama server and populates the model lists with the server's available models.
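+
+    To confirm that the server is reachable, you can list its available models (a sketch assuming the default address):
+
+    ```bash
+    # Ollama's model-list endpoint; adjust the address if your server differs
+    curl http://localhost:11434/api/tags
+    ```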
@@ -70,5 +74,6 @@ Choose one LLM provider and complete these steps:
3. Click **Complete**.
4. To complete the onboarding tasks, click **What is OpenRAG**, and then click **Add a Document**.
5. Continue with the [Quickstart](/quickstart).
+
\ No newline at end of file
diff --git a/docs/docs/core-components/knowledge.mdx b/docs/docs/core-components/knowledge.mdx
index c17fb61a..d299aab1 100644
--- a/docs/docs/core-components/knowledge.mdx
+++ b/docs/docs/core-components/knowledge.mdx
@@ -75,19 +75,19 @@ If needed, you can use [filters](/knowledge-filters) to separate documents that
### Set the embedding model and dimensions {#set-the-embedding-model-and-dimensions}
-When you [install OpenRAG](/install), you select an embedding model during **Application Onboarding**.
+When you [install OpenRAG](/install), you select at least one embedding model during [application onboarding](/install#application-onboarding).
OpenRAG automatically detects and configures the appropriate vector dimensions for your selected embedding model, ensuring optimal search performance and compatibility.
In the OpenRAG repository, you can find the complete list of supported models in [`models_service.py`](https://github.com/langflow-ai/openrag/blob/main/src/services/models_service.py) and the corresponding vector dimensions in [`settings.py`](https://github.com/langflow-ai/openrag/blob/main/src/config/settings.py).
-The default embedding dimension is `1536` and the default model is the OpenAI `text-embedding-3-small`.
-
-You can use any supported or unsupported embedding model by specifying the model in your OpenRAG configuration during installation.
+During application onboarding, you can select from the supported models.
+The default embedding dimension is `1536`, and the default model is OpenAI's `text-embedding-3-small`.
+If you want to use an unsupported model, you must manually set the model in your [OpenRAG configuration](/reference/configuration).
If you use an unsupported embedding model that doesn't have defined dimensions in `settings.py`, then OpenRAG falls back to the default dimensions (1536) and logs a warning. OpenRAG's OpenSearch instance and flows continue to work, but [similarity search](https://www.ibm.com/think/topics/vector-search) quality can be affected if the actual model dimensions aren't 1536.
-This embedding model you choose during **Application Onboarding** is immutable and can only be changed by [reinstalling OpenRAG](/install#reinstall).
-Alternatively, you can [edit the OpenRAG flows](/agents#inspect-and-modify-flows) for knowledge ingestion and chat. Make sure all flows use the same embedding model.
+To change the embedding model after onboarding, it is recommended that you modify the embedding model setting in the OpenRAG **Settings** page or in your [OpenRAG configuration](/reference/configuration).
+This will automatically update all relevant [OpenRAG flows](/agents) to use the new embedding model configuration.
### Set Docling parameters
@@ -122,10 +122,14 @@ OpenRAG warns you if `docling serve` isn't running.
You can [start and stop OpenRAG services](/install#tui-container-management) from the TUI main menu with **Start Native Services** or **Stop Native Services**.
:::
-* **Embedding model**: Select the model to use to generate vector embeddings for your documents. This is initially set during installation.
-The recommended way to change this setting is by [reinstalling OpenRAG](/install#reinstall).
-If you change this value by directly [editing the flow](/agents#inspect-and-modify-flows), you must also change the embedding model in other [OpenRAG flows](/agents) to ensure that similarity search results are consistent.
-If you uploaded documents prior to changing the embedding model, you must either [create filters](/knowledge-filters) to prevent mixing documents embedded with different models, or you must reupload all documents to regenerate embeddings with the new model.
+* **Embedding model**: Select the model to use to generate vector embeddings for your documents.
+
+ This is initially set during installation.
+ The recommended way to change this setting is in the OpenRAG **Settings** or your [OpenRAG configuration](/reference/configuration).
+ This will automatically update all relevant [OpenRAG flows](/agents) to use the new embedding model configuration.
+
+ If you uploaded documents prior to changing the embedding model, you can [create filters](/knowledge-filters) to separate documents embedded with different models, or you can reupload all documents to regenerate embeddings with the new model.
+  If you use multiple embedding models, similarity search in the **Chat** can take longer because each model's embeddings are searched separately.
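+
+  For example, you can change the model in your `.env` file. The model shown here is OpenAI's `text-embedding-3-large`; availability depends on your provider and the supported models in `models_service.py`:
+
+  ```bash
+  # Reupload documents afterward to regenerate embeddings with the new model
+  EMBEDDING_MODEL=text-embedding-3-large
+  ```
+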
* **Chunk size**: Set the number of characters for each text chunk when breaking down a file.
Larger chunks yield more context per chunk, but can include irrelevant information. Smaller chunks yield more precise semantic search, but can lack context.
diff --git a/docs/docs/get-started/docker.mdx b/docs/docs/get-started/docker.mdx
index af2e191f..c78bc64d 100644
--- a/docs/docs/get-started/docker.mdx
+++ b/docs/docs/get-started/docker.mdx
@@ -34,7 +34,7 @@ OpenRAG has two Docker Compose files. Both files deploy the same applications an
- Prepare model providers and credentials.
- During [Application Onboarding](#application-onboarding), you must select language model and embedding model providers.
+ During [application onboarding](#application-onboarding), you must select language model and embedding model providers.
If your chosen provider offers both types, you can use the same provider for both selections.
If your provider offers only one type, such as Anthropic, you must select two providers.
@@ -84,7 +84,7 @@ To install OpenRAG with Docker Compose, do the following:
LANGFLOW_SECRET_KEY=your_secret_key
```
- `OPENAI_API_KEY` is optional. You can provide it during [Application Onboarding](#application-onboarding) or choose a different model provider. If you want to set it in your `.env` file, you can find your OpenAI API key in your [OpenAI account](https://platform.openai.com/api-keys).
+ `OPENAI_API_KEY` is optional. You can provide it during [application onboarding](#application-onboarding) or choose a different model provider. If you want to set it in your `.env` file, you can find your OpenAI API key in your [OpenAI account](https://platform.openai.com/api-keys).
`LANGFLOW_SECRET_KEY` is optional. Langflow will auto-generate it if not set. For more information, see the [Langflow documentation](https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key).
@@ -159,7 +159,7 @@ To install OpenRAG with Docker Compose, do the following:
- **Backend API**: http://localhost:8000
- **Langflow**: http://localhost:7860
-9. Continue with [Application Onboarding](#application-onboarding).
+9. Continue with [application onboarding](#application-onboarding).
To stop `docling serve` when you're done with your OpenRAG deployment, run:
diff --git a/docs/docs/get-started/install.mdx b/docs/docs/get-started/install.mdx
index b30784b7..03f71637 100644
--- a/docs/docs/get-started/install.mdx
+++ b/docs/docs/get-started/install.mdx
@@ -41,7 +41,7 @@ If you prefer running Podman or Docker containers and manually editing `.env` fi
- Prepare model providers and credentials.
- During [Application Onboarding](#application-onboarding), you must select language model and embedding model providers.
+ During [application onboarding](#application-onboarding), you must select language model and embedding model providers.
If your chosen provider offers both types, you can use the same provider for both selections.
If your provider offers only one type, such as Anthropic, you must select two providers.
@@ -208,7 +208,7 @@ If OpenRAG detects OAuth credentials, it recommends **Advanced Setup**.
6. To start the Docling service, under **Native Services**, click **Start**.
7. To open the OpenRAG application, navigate to the TUI main menu, and then click **Open App**.
Alternatively, in your browser, navigate to `localhost:3000`.
- 8. Continue with [Application Onboarding](#application-onboarding).
+ 8. Continue with [application onboarding](#application-onboarding).
@@ -257,7 +257,7 @@ If OpenRAG detects OAuth credentials, it recommends **Advanced Setup**.
- OneDrive: `/connectors/onedrive/webhook`
- SharePoint: `/connectors/sharepoint/webhook`
- 12. Continue with [Application Onboarding](#application-onboarding).
+ 12. Continue with [application onboarding](#application-onboarding).
@@ -436,11 +436,11 @@ To reinstall OpenRAG with a completely fresh setup:
This removes all containers, volumes, and data.
2. Optional: Delete your project's `.env` file.
- The Reset operation does not remove your project's `.env` file, so your passwords, API keys, and OAuth settings can be preserved.
+ The Reset operation doesn't remove your project's `.env` file, so your passwords, API keys, and OAuth settings can be preserved.
If you delete the `.env` file, run the [Set up OpenRAG with the TUI](#setup) process again to create a new configuration.
3. In the TUI Setup menu, follow these steps from [Basic Setup](#setup):
1. Click **Start All Services** to pull container images and start them.
2. Under **Native Services**, click **Start** to start the Docling service.
3. Click **Open App** to open the OpenRAG application.
- 4. Continue with [Application Onboarding](#application-onboarding).
\ No newline at end of file
+ 4. Continue with [application onboarding](#application-onboarding).
\ No newline at end of file
diff --git a/docs/docs/reference/configuration.mdx b/docs/docs/reference/configuration.mdx
index 1a88fa3a..c573958e 100644
--- a/docs/docs/reference/configuration.mdx
+++ b/docs/docs/reference/configuration.mdx
@@ -23,32 +23,47 @@ The Docker Compose files are populated with values from your `.env`, so you don'
Environment variables always take precedence over other variables.
-### Set environment variables
+### Set environment variables {#set-environment-variables}
-To set environment variables, do the following.
+After you start OpenRAG, you must [stop and restart OpenRAG containers](/install#tui-container-management) to apply any changes you make to the `.env` file.
+
+To set mutable environment variables, do the following:
+
+1. Stop OpenRAG with the TUI or Docker Compose.
-1. Stop OpenRAG.
2. Set the values in the `.env` file:
+
```bash
LOG_LEVEL=DEBUG
LOG_FORMAT=json
SERVICE_NAME=openrag-dev
```
-3. Start OpenRAG.
-Updating provider API keys or provider endpoints in the `.env` file will not take effect after [Application onboarding](/install#application-onboarding). To change these values, you must:
+3. Start OpenRAG with the TUI or Docker Compose.
+
+Certain environment variables that you set during [application onboarding](/install#application-onboarding), such as provider API keys and provider endpoints, take effect only after you modify the `.env` file and reset the containers.
+
+To change immutable variables with TUI-managed containers, you must [reinstall OpenRAG](/install#reinstall) and either delete or modify the `.env` file before you repeat the setup and onboarding process in the TUI.
+
+To change immutable variables with self-managed containers, do the following:
+
+1. Stop OpenRAG with Docker Compose.
-1. Stop OpenRAG.
2. Remove the containers:
- ```
+
+ ```bash
docker-compose down
```
+
3. Update the values in your `.env` file.
-4. Start OpenRAG containers.
- ```
+
+4. Start OpenRAG with Docker Compose:
+
+ ```bash
docker-compose up -d
```
-5. Complete [Application onboarding](/install#application-onboarding) again.
+
+5. Repeat [application onboarding](/install#application-onboarding). The onboarding fields are automatically populated with the values from your `.env` file.
## Supported environment variables
@@ -56,18 +71,19 @@ All OpenRAG configuration can be controlled through environment variables.
### AI provider settings
-Configure which AI models and providers OpenRAG uses for language processing and embeddings.
-For more information, see [Application onboarding](/install#application-onboarding).
+Configure which models and providers OpenRAG uses to generate text and embeddings.
+These are initially set during [application onboarding](/install#application-onboarding).
+Some values are immutable and can only be changed by recreating the OpenRAG containers, as explained in [Set environment variables](#set-environment-variables).
| Variable | Default | Description |
|----------|---------|-------------|
-| `EMBEDDING_MODEL` | `text-embedding-3-small` | Embedding model for vector search. |
-| `LLM_MODEL` | `gpt-4o-mini` | Language model for the chat agent. |
+| `EMBEDDING_MODEL` | `text-embedding-3-small` | Embedding model for generating vector embeddings for documents in the knowledge base and similarity search queries. Can be changed after application onboarding. Accepts one or more models. |
+| `LLM_MODEL` | `gpt-4o-mini` | Language model for language processing and text generation in the **Chat** feature. |
| `MODEL_PROVIDER` | `openai` | Model provider, such as OpenAI or IBM watsonx.ai. |
-| `OPENAI_API_KEY` | - | Your OpenAI API key. Optional. Can be provided during application onboarding when installing OpenRAG. |
-| `PROVIDER_API_KEY` | - | API key for the model provider. |
-| `PROVIDER_ENDPOINT` | - | Custom provider endpoint. Only used for IBM or Ollama providers. |
-| `PROVIDER_PROJECT_ID` | - | Project ID for providers. Only required for the IBM watsonx.ai provider. |
+| `OPENAI_API_KEY` | Not set | Optional OpenAI API key for the default model. For other providers, use `PROVIDER_API_KEY`. |
+| `PROVIDER_API_KEY` | Not set | API key for the model provider. |
+| `PROVIDER_ENDPOINT` | Not set | Custom provider endpoint for the IBM and Ollama model providers. Leave unset for other model providers. |
+| `PROVIDER_PROJECT_ID` | Not set | Project ID for the IBM watsonx.ai model provider only. Leave unset for other model providers. |
### Document processing
@@ -78,7 +94,7 @@ Control how OpenRAG [processes and ingests documents](/ingestion) into your know
| `CHUNK_OVERLAP` | `200` | Overlap between chunks. |
| `CHUNK_SIZE` | `1000` | Text chunk size for document processing. |
| `DISABLE_INGEST_WITH_LANGFLOW` | `false` | Disable Langflow ingestion pipeline. |
-| `DOCLING_OCR_ENGINE` | - | OCR engine for document processing. |
+| `DOCLING_OCR_ENGINE` | Set by OS | OCR engine for document processing. For macOS, `ocrmac`. For any other OS, `easyocr`. |
| `OCR_ENABLED` | `false` | Enable OCR for image processing. |
| `OPENRAG_DOCUMENTS_PATHS` | `./openrag-documents` | Document paths for ingestion. |
| `PICTURE_DESCRIPTIONS_ENABLED` | `false` | Enable picture descriptions. |
@@ -90,18 +106,18 @@ Configure Langflow authentication.
| Variable | Default | Description |
|----------|---------|-------------|
| `LANGFLOW_AUTO_LOGIN` | `False` | Enable auto-login for Langflow. |
-| `LANGFLOW_CHAT_FLOW_ID` | pre-filled | This value is pre-filled. The default value is found in [.env.example](https://github.com/langflow-ai/openrag/blob/main/.env.example). |
-| `LANGFLOW_ENABLE_SUPERUSER_CLI` | `False` | Enable superuser CLI. |
-| `LANGFLOW_INGEST_FLOW_ID` | pre-filled | This value is pre-filled. The default value is found in [.env.example](https://github.com/langflow-ai/openrag/blob/main/.env.example). |
-| `LANGFLOW_KEY` | auto-generated | Explicit Langflow API key. |
-| `LANGFLOW_NEW_USER_IS_ACTIVE` | `False` | New users are active by default. |
-| `LANGFLOW_PUBLIC_URL` | `http://localhost:7860` | Public URL for Langflow. |
-| `LANGFLOW_SECRET_KEY` | - | Secret key for Langflow internal operations. |
-| `LANGFLOW_SUPERUSER` | - | Langflow admin username. Required. |
-| `LANGFLOW_SUPERUSER_PASSWORD` | - | Langflow admin password. Required. |
-| `LANGFLOW_URL` | `http://localhost:7860` | Langflow URL. |
-| `NUDGES_FLOW_ID` | pre-filled | This value is pre-filled. The default value is found in [.env.example](https://github.com/langflow-ai/openrag/blob/main/.env.example). |
-| `SYSTEM_PROMPT` | "You are a helpful AI assistant with access to a knowledge base. Answer questions based on the provided context." | System prompt for the Langflow agent. |
+| `LANGFLOW_CHAT_FLOW_ID` | Built-in flow ID | This value is automatically set to the ID of the chat [flow](/agents). The default value is found in [`.env.example`](https://github.com/langflow-ai/openrag/blob/main/.env.example). Only change this value if you explicitly don't want to use this built-in flow. |
+| `LANGFLOW_ENABLE_SUPERUSER_CLI` | `False` | Enable superuser privileges for Langflow CLI commands. |
+| `LANGFLOW_INGEST_FLOW_ID` | Built-in flow ID | This value is automatically set to the ID of the ingestion [flow](/agents). The default value is found in [`.env.example`](https://github.com/langflow-ai/openrag/blob/main/.env.example). Only change this value if you explicitly don't want to use this built-in flow. |
+| `LANGFLOW_KEY` | Automatically generated | Explicit Langflow API key. |
+| `LANGFLOW_NEW_USER_IS_ACTIVE` | `False` | Whether new Langflow users are active by default. |
+| `LANGFLOW_PUBLIC_URL` | `http://localhost:7860` | Public URL for the Langflow instance. |
+| `LANGFLOW_SECRET_KEY` | Not set | Secret key for Langflow internal operations. |
+| `LANGFLOW_SUPERUSER` | None, must be explicitly set | Langflow admin username. Required. |
+| `LANGFLOW_SUPERUSER_PASSWORD` | None, must be explicitly set | Langflow admin password. Required. |
+| `LANGFLOW_URL` | `http://localhost:7860` | URL for the Langflow instance. |
+| `NUDGES_FLOW_ID` | Built-in flow ID | This value is automatically set to the ID of the nudges [flow](/agents). The default value is found in [`.env.example`](https://github.com/langflow-ai/openrag/blob/main/.env.example). Only change this value if you explicitly don't want to use this built-in flow. |
+| `SYSTEM_PROMPT` | `You are a helpful AI assistant with access to a knowledge base. Answer questions based on the provided context.` | System prompt instructions for the agent driving the **Chat** flow. |
### OAuth provider settings
@@ -134,30 +150,28 @@ Configure general system components, session management, and logging.
| `LANGFLOW_KEY_RETRIES` | `15` | Number of retries for Langflow key generation. |
| `LANGFLOW_KEY_RETRY_DELAY` | `2.0` | Delay between retries in seconds. |
| `LANGFLOW_VERSION` | `latest` | Langflow Docker image version. |
-| `LOG_FORMAT` | - | Log format (set to "json" for JSON output). |
+| `LOG_FORMAT` | Disabled | Set to `json` to enable JSON-formatted log output. |
| `LOG_LEVEL` | `INFO` | Logging level (DEBUG, INFO, WARNING, ERROR). |
-| `MAX_WORKERS` | - | Maximum number of workers for document processing. |
+| `MAX_WORKERS` | `1` | Maximum number of workers for document processing. |
| `OPENRAG_VERSION` | `latest` | OpenRAG Docker image version. |
| `SERVICE_NAME` | `openrag` | Service name for logging. |
-| `SESSION_SECRET` | auto-generated | Session management. |
+| `SESSION_SECRET` | Automatically generated | Session management. |
## Langflow runtime overrides
-Langflow runtime overrides allow you to modify component settings at runtime without changing the base configuration.
+You can modify [flow](/agents) settings at runtime without permanently changing the flow's configuration.
-Runtime overrides are implemented through **tweaks** - parameter modifications that are passed to specific Langflow components during flow execution.
+Runtime overrides are implemented through _tweaks_, which are one-time parameter modifications that are passed to specific Langflow components during flow execution.
-For more information on tweaks, see [Input schema (tweaks)](https://docs.langflow.org/concepts-publish#input-schema).
+For more information on tweaks, see the Langflow documentation on [Input schema (tweaks)](https://docs.langflow.org/concepts-publish#input-schema).
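+
+For example, a sketch of a one-time tweak passed through the Langflow run API. The flow ID, API key, and component ID are placeholders; the endpoint and payload shape follow the Langflow documentation linked above:
+
+```bash
+# Override the agent's model name for a single run via a tweak
+curl -s -X POST "$LANGFLOW_URL/api/v1/run/$LANGFLOW_CHAT_FLOW_ID" \
+  -H "Content-Type: application/json" \
+  -H "x-api-key: $LANGFLOW_KEY" \
+  -d '{
+    "input_value": "Summarize my latest documents.",
+    "tweaks": { "Agent-abc12": { "model_name": "gpt-4o" } }
+  }'
+```
+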
## Default values and fallbacks
-When no environment variables or configuration file values are provided, OpenRAG uses default values.
-These values can be found in the code base at the following locations.
+If a variable isn't set by environment variables or a configuration file, OpenRAG can use a default value if one is defined in the codebase.
+Default values can be found in the OpenRAG repository:
-### OpenRAG configuration defaults
+* OpenRAG configuration: [`config_manager.py`](https://github.com/langflow-ai/openrag/blob/main/src/config/config_manager.py)
-These values are defined in [`config_manager.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/config/config_manager.py).
+* System configuration: [`settings.py`](https://github.com/langflow-ai/openrag/blob/main/src/config/settings.py)
-### System configuration defaults
-
-These fallback values are defined in [`settings.py` in the OpenRAG repository](https://github.com/langflow-ai/openrag/blob/main/src/config/settings.py).
\ No newline at end of file
+* Logging configuration: [`logging_config.py`](https://github.com/langflow-ai/openrag/blob/main/src/utils/logging_config.py)
\ No newline at end of file
diff --git a/docs/docs/support/troubleshoot.mdx b/docs/docs/support/troubleshoot.mdx
index bad30891..4f04a281 100644
--- a/docs/docs/support/troubleshoot.mdx
+++ b/docs/docs/support/troubleshoot.mdx
@@ -77,7 +77,7 @@ On macOS, this cache directory is typically a user cache directory such as `/Use
uvx openrag
```
-If you do not need OCR, you can disable OCR-based processing in your ingestion settings to avoid requiring `easyocr`.
+If you don't need OCR, you can disable OCR-based processing in your ingestion settings to avoid requiring `easyocr`.
## Upgrade fails due to Langflow container already exists {#langflow-container-already-exists-during-upgrade}