diff --git a/.env.example b/.env.example
index fe908795..ee2a838c 100644
--- a/.env.example
+++ b/.env.example
@@ -9,7 +9,7 @@ LANGFLOW_SECRET_KEY=
LANGFLOW_CHAT_FLOW_ID=1098eea1-6649-4e1d-aed1-b77249fb8dd0
LANGFLOW_INGEST_FLOW_ID=5488df7c-b93f-4f87-a446-b67028bc0813
# Ingest flow using docling
-LANGFLOW_INGEST_FLOW_ID=1402618b-e6d1-4ff2-9a11-d6ce71186915
+# LANGFLOW_INGEST_FLOW_ID=1402618b-e6d1-4ff2-9a11-d6ce71186915
NUDGES_FLOW_ID=ebc01d31-1976-46ce-a385-b0240327226c
# Set a strong admin password for OpenSearch; a bcrypt hash is generated at
diff --git a/docs/docs/get-started/quickstart.mdx b/docs/docs/get-started/quickstart.mdx
new file mode 100644
index 00000000..993d9739
--- /dev/null
+++ b/docs/docs/get-started/quickstart.mdx
@@ -0,0 +1,390 @@
+---
+title: Quickstart
+slug: /quickstart
+---
+
+import Icon from "@site/src/components/icon/icon";
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+Get started with OpenRAG by loading your knowledge, swapping out your language model, and then chatting with the OpenRAG API.
+
+## Prerequisites
+
+- Install and start OpenRAG
+
+## Find your way around
+
+1. In OpenRAG, click **Chat**.
+2. Ask `What documents are available to you?`
+ The agent responds with a message summarizing the documents that OpenRAG loads by default, which are PDFs about evaluating data quality when using LLMs in health care.
+3. To confirm the agent is correct, click **Knowledge**.
+   The **Knowledge** page lists the documents OpenRAG has ingested into the OpenSearch vector database. Click a document to display the chunks it was split into when it was ingested.
+
+## Add your own knowledge
+
+1. To add documents to your knowledge base, click **Add Knowledge**.
+   * Select **Add File** to add a single file from your local machine (mapped into the container with the Docker volume mount; see the sketch after these steps).
+   * Select **Process Folder** to process an entire folder of documents from your local machine (mapped the same way).
+2. Return to the Chat window and ask a question about your loaded data.
+   For example, with a manual about a PC tablet loaded, ask `How do I connect this device to Wi-Fi?`
+ The agent responds with a message indicating it now has your knowledge as context for answering questions.
+3. Click the **Function Call: search_documents (tool_call)** entry printed in the chat.
+   These events log the agent's request to the tool and the tool's response, giving you direct visibility into how the agent works.
+   If you aren't getting the results you need, you can further tune knowledge ingestion and agent behavior, as described in the next section.
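+
+For **Add File** and **Process Folder** to see your documents, the files must be mapped into the `openrag-backend` container. A minimal `docker-compose.yml` sketch; the `./documents` and `/app/documents` paths are illustrative placeholders, not OpenRAG's defaults:
+
+```yaml
+openrag-backend:
+  volumes:
+    # Hypothetical mapping: expose a local documents folder inside the
+    # container so Add File and Process Folder can read from it.
+    - ./documents:/app/documents
+```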
+
+## Swap out the language model to modify agent behavior
+
+To modify knowledge ingestion or agent behavior, click **Settings**.
+
+In this example, you'll try a different LLM to demonstrate how the agent's response changes.
+
+1. To edit the agent's behavior, click **Edit in Langflow**.
+2. OpenRAG warns you that you're entering Langflow. Click **Proceed**.
+3. The OpenRAG OpenSearch Agent flow appears.
+
+![OpenRAG OpenSearch Agent flow](/img/opensearch-agent-flow.png)
+
+4. In the **Language Model** component, under **Model Provider**, select **Anthropic**.
+ :::note
+ This guide uses an Anthropic model for demonstration purposes. If you want to use a different provider, change the **Model Provider** and **Model Name** fields, and then provide credentials for your selected provider.
+ :::
+5. Save your flow with Command+S (Ctrl+S on Windows and Linux).
+6. In OpenRAG, start a new conversation in the **Conversations** tab.
+7. Ask the same question as before to demonstrate how a different language model changes the results.
+
+## Integrate OpenRAG into your application
+
+:::tip
+Ensure the `openrag-backend` container has port 8000 exposed in your `docker-compose.yml`:
+
+```yaml
+openrag-backend:
+ ports:
+ - "8000:8000"
+```
+:::
+
+OpenRAG provides a REST API that you can call from Python, TypeScript, or any HTTP client to chat with your documents.
+
+These example requests assume OpenRAG is running in "no-auth" mode.
+For authentication, request and response parameters, and more example requests, see the API documentation.
+
+### Chat with your documents
+
+Prompt OpenRAG at the `/chat` API endpoint.
+
+<Tabs>
+<TabItem value="python" label="Python">
+
+```python
+import requests
+
+url = "http://localhost:8000/chat"
+payload = {
+ "prompt": "What documents are available to you?",
+ "previous_response_id": None
+}
+
+response = requests.post(url, json=payload)
+print("OpenRAG Response:", response.json())
+```
+
+</TabItem>
+<TabItem value="typescript" label="TypeScript">
+
+```typescript
+import fetch from 'node-fetch';
+
+const response = await fetch("http://localhost:8000/chat", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ prompt: "What documents are available to you?",
+ previous_response_id: null
+ })
+});
+
+const data = await response.json();
+console.log("OpenRAG Response:", data);
+```
+
+</TabItem>
+<TabItem value="curl" label="curl">
+
+```bash
+curl -X POST "http://localhost:8000/chat" \
+ -H "Content-Type: application/json" \
+ -d '{
+ "prompt": "What documents are available to you?",
+ "previous_response_id": null
+ }'
+```
+
+</TabItem>
+</Tabs>
+
+Example response
+
+```json
+{
+ "response": "I have access to a wide range of documents depending on the context and the tools enabled in this environment. Specifically, I can search for and retrieve documents related to various topics such as technical papers, articles, manuals, guides, knowledge base entries, and other text-based resources. If you specify a particular subject or type of document you're interested in, I can try to locate relevant materials for you. Let me know what you need!",
+ "response_id": "resp_68d3fdbac93081958b8781b97919fe7007f98bd83932fa1a"
+}
+```
+
+
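+Each `/chat` response includes a `response_id`. To continue the same conversation, pass that value back as `previous_response_id` in your next request. A minimal Python sketch; the follow-up prompt is illustrative:
+
+```python
+import requests
+
+url = "http://localhost:8000/chat"
+
+# First turn: no previous response to reference.
+first = requests.post(url, json={
+    "prompt": "What documents are available to you?",
+    "previous_response_id": None,
+}).json()
+
+# Follow-up turn: pass the returned response_id to keep the conversation context.
+follow_up = requests.post(url, json={
+    "prompt": "Summarize the first document in two sentences.",
+    "previous_response_id": first["response_id"],
+}).json()
+
+print(follow_up["response"])
+```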
+
+### Search your documents
+
+Search your document knowledge base at the `/search` endpoint.
+
+<Tabs>
+<TabItem value="python" label="Python">
+
+```python
+import requests
+
+url = "http://localhost:8000/search"
+payload = {"query": "healthcare data quality", "limit": 5}
+
+response = requests.post(url, json=payload)
+results = response.json()
+
+print("Search Results:")
+for result in results.get("results", []):
+ print(f"- {result.get('filename')}: {result.get('text', '')[:100]}...")
+```
+
+</TabItem>
+<TabItem value="typescript" label="TypeScript">
+
+```typescript
+import fetch from 'node-fetch';
+
+const response = await fetch("http://localhost:8000/search", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ query: "healthcare data quality",
+ limit: 5
+ })
+});
+
+const results = await response.json();
+console.log("Search Results:");
+results.results?.forEach((result, index) => {
+ const filename = result.filename || 'Unknown';
+ const text = result.text?.substring(0, 100) || '';
+ console.log(`${index + 1}. ${filename}: ${text}...`);
+});
+```
+
+</TabItem>
+<TabItem value="curl" label="curl">
+
+```bash
+curl -X POST "http://localhost:8000/search" \
+ -H "Content-Type: application/json" \
+ -d '{"query": "healthcare data quality", "limit": 5}'
+```
+
+</TabItem>
+</Tabs>
+
+Example response
+
+```
+Found 5 results
+1. 2506.08231v1.pdf: variables with high performance metrics. These variables might also require fewer replication analys...
+2. 2506.08231v1.pdf: on EHR data and may lack the clinical domain knowledge needed to perform well on the tasks where EHR...
+3. 2506.08231v1.pdf: Abstract Large language models (LLMs) are increasingly used to extract clinical data from electronic...
+4. 2506.08231v1.pdf: these multidimensional assessments, the framework not only quantifies accuracy, but can also be appl...
+5. 2506.08231v1.pdf: observed in only the model metrics, but not the abstractor metrics, it indicates that model errors m...
+```
+
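+Each result is a chunk, so one file can appear several times, as in the example above. A minimal Python sketch that groups hits by the `filename` field, using only the fields shown in the response:
+
+```python
+import requests
+from collections import defaultdict
+
+response = requests.post(
+    "http://localhost:8000/search",
+    json={"query": "healthcare data quality", "limit": 5},
+)
+
+# Group chunk-level hits by their source file so each document is listed once.
+by_file = defaultdict(list)
+for result in response.json().get("results", []):
+    by_file[result.get("filename", "Unknown")].append(result.get("text", ""))
+
+for filename, chunks in by_file.items():
+    print(f"{filename}: {len(chunks)} matching chunks")
+```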
+
+
+### Use chat and search together
+
+Create an interactive terminal chat application that combines session continuity with direct document search.
+
+<Tabs>
+<TabItem value="python" label="Python">
+
+```python
+import requests
+
+# Configuration
+OPENRAG_BASE_URL = "http://localhost:8000"
+CHAT_URL = f"{OPENRAG_BASE_URL}/chat"
+SEARCH_URL = f"{OPENRAG_BASE_URL}/search"
+DEFAULT_SEARCH_LIMIT = 5
+
+def chat_with_openrag(message, previous_response_id=None):
+ try:
+ response = requests.post(CHAT_URL, json={
+ "prompt": message,
+ "previous_response_id": previous_response_id
+ })
+ response.raise_for_status()
+ data = response.json()
+ return data.get("response"), data.get("response_id")
+ except Exception as e:
+ return f"Error: {str(e)}", None
+
+def search_documents(query, limit=DEFAULT_SEARCH_LIMIT):
+ try:
+ response = requests.post(SEARCH_URL, json={
+ "query": query,
+ "limit": limit
+ })
+ response.raise_for_status()
+ data = response.json()
+ return data.get("results", [])
+    except Exception:
+        return []
+
+# Interactive chat with session continuity and search
+previous_response_id = None
+while True:
+    question = input("Your question (or 'search <query>' to search): ").strip()
+ if question.lower() in ['quit', 'exit', 'q']:
+ break
+ if not question:
+ continue
+
+ if question.lower().startswith('search '):
+ query = question[7:].strip()
+ print("Searching documents...")
+ results = search_documents(query)
+ print(f"\nFound {len(results)} results:")
+ for i, result in enumerate(results, 1):
+ filename = result.get('filename', 'Unknown')
+ text = result.get('text', '')[:100]
+ print(f"{i}. {filename}: {text}...")
+ print()
+ else:
+ print("OpenRAG is thinking...")
+ result, response_id = chat_with_openrag(question, previous_response_id)
+ print(f"OpenRAG: {result}\n")
+ previous_response_id = response_id
+```
+
+</TabItem>
+<TabItem value="typescript" label="TypeScript">
+
+```typescript
+import fetch from 'node-fetch';
+import * as readline from 'node:readline';
+
+// Configuration
+const OPENRAG_BASE_URL = "http://localhost:8000";
+const CHAT_URL = `${OPENRAG_BASE_URL}/chat`;
+const SEARCH_URL = `${OPENRAG_BASE_URL}/search`;
+const DEFAULT_SEARCH_LIMIT = 5;
+
+async function chatWithOpenRAG(message: string, previousResponseId?: string | null) {
+ try {
+ const response = await fetch(CHAT_URL, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ prompt: message,
+ previous_response_id: previousResponseId
+ })
+ });
+ const data = await response.json();
+ return [data.response || "No response received", data.response_id || null];
+ } catch (error) {
+ return [`Error: ${error}`, null];
+ }
+}
+
+async function searchDocuments(query: string, limit: number = DEFAULT_SEARCH_LIMIT) {
+ try {
+ const response = await fetch(SEARCH_URL, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ query, limit })
+ });
+ const data = await response.json();
+ return data.results || [];
+ } catch (error) {
+ return [];
+ }
+}
+
+// Interactive chat with session continuity and search
+let previousResponseId: string | null = null;
+const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
+
+const askQuestion = () => {
+  rl.question("Your question (or 'search <query>' to search): ", async (question: string) => {
+ if (question.toLowerCase() === 'quit' || question.toLowerCase() === 'exit' || question.toLowerCase() === 'q') {
+ console.log("Goodbye!");
+ rl.close();
+ return;
+ }
+ if (!question.trim()) {
+ askQuestion();
+ return;
+ }
+
+ if (question.toLowerCase().startsWith('search ')) {
+ const query = question.substring(7).trim();
+ console.log("Searching documents...");
+ const results = await searchDocuments(query);
+ console.log(`\nFound ${results.length} results:`);
+ results.forEach((result, i) => {
+ const filename = result.filename || 'Unknown';
+ const text = result.text?.substring(0, 100) || '';
+ console.log(`${i + 1}. ${filename}: ${text}...`);
+ });
+ console.log();
+ } else {
+ console.log("OpenRAG is thinking...");
+ const [result, responseId] = await chatWithOpenRAG(question, previousResponseId);
+ console.log(`\nOpenRAG: ${result}\n`);
+ previousResponseId = responseId;
+ }
+ askQuestion();
+ });
+};
+
+console.log("OpenRAG Chat Interface");
+console.log("Ask questions about your documents. Type 'quit' to exit.");
+console.log("Use 'search <query>' to search documents directly.\n");
+askQuestion();
+```
+
+</TabItem>
+</Tabs>
+
+Example response
+
+```
+Your question (or 'search <query>' to search): search healthcare
+Searching documents...
+
+Found 5 results:
+1. 2506.08231v1.pdf: variables with high performance metrics. These variables might also require fewer replication analys...
+2. 2506.08231v1.pdf: on EHR data and may lack the clinical domain knowledge needed to perform well on the tasks where EHR...
+3. 2506.08231v1.pdf: Abstract Large language models (LLMs) are increasingly used to extract clinical data from electronic...
+4. 2506.08231v1.pdf: Acknowledgements Darren Johnson for support in publication planning and management. The authors used...
+5. 2506.08231v1.pdf: Ensuring Reliability of Curated EHR-Derived Data: The Validation of Accuracy for LLM/ML-Extracted In...
+
+Your question (or 'search <query>' to search): what's the weather today?
+OpenRAG is thinking...
+OpenRAG: I don't have access to real-time weather data. Could you please provide me with your location? Then I can help you find the weather information.
+
+Your question (or 'search <query>' to search): newark nj
+OpenRAG is thinking...
+```
+
+
+## Next steps
+
+TBD
\ No newline at end of file
diff --git a/docs/docs/get-started/what-is-openrag.mdx b/docs/docs/get-started/what-is-openrag.mdx
index 7b411617..d52fe12f 100644
--- a/docs/docs/get-started/what-is-openrag.mdx
+++ b/docs/docs/get-started/what-is-openrag.mdx
@@ -1,6 +1,6 @@
---
title: What is OpenRAG?
-slug: /what-is-openrag
+slug: /
---
OpenRAG is an open-source package for building agentic RAG systems.
diff --git a/docs/docusaurus.config.js b/docs/docusaurus.config.js
index 4248c3e2..c4175c09 100644
--- a/docs/docusaurus.config.js
+++ b/docs/docusaurus.config.js
@@ -71,7 +71,7 @@ const config = {
logo: {
alt: 'OpenRAG Logo',
src: 'img/logo.svg',
- href: 'what-is-openrag',
+ href: '/',
},
items: [
{
@@ -89,7 +89,7 @@ const config = {
items: [
{
label: 'Getting Started',
- to: 'what-is-openrag',
+ to: '/',
},
],
},
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 568989e5..6d6db6b3 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -25,6 +25,12 @@ const sidebars = {
id: "get-started/what-is-openrag",
label: "Introduction"
},
+ {
+ type: "doc",
+ id: "get-started/quickstart",
+ label: "Quickstart"
+ },
+
{
type: "doc",
id: "get-started/docker",
diff --git a/docs/static/img/opensearch-agent-flow.png b/docs/static/img/opensearch-agent-flow.png
new file mode 100644
index 00000000..d201aef9
Binary files /dev/null and b/docs/static/img/opensearch-agent-flow.png differ
diff --git a/flows/openrag_ingest_docling.json b/flows/openrag_ingest_docling.json
index cd6d7d39..889f8425 100644
--- a/flows/openrag_ingest_docling.json
+++ b/flows/openrag_ingest_docling.json
@@ -30,34 +30,6 @@
"target": "OpenSearchHybrid-XtKoA",
"targetHandle": "{œfieldNameœ:œingest_dataœ,œidœ:œOpenSearchHybrid-XtKoAœ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}"
},
- {
- "animated": false,
- "className": "",
- "data": {
- "sourceHandle": {
- "dataType": "OpenAIEmbeddings",
- "id": "OpenAIEmbeddings-mP45L",
- "name": "embeddings",
- "output_types": [
- "Embeddings"
- ]
- },
- "targetHandle": {
- "fieldName": "embedding",
- "id": "OpenSearchHybrid-XtKoA",
- "inputTypes": [
- "Embeddings"
- ],
- "type": "other"
- }
- },
- "id": "reactflow__edge-OpenAIEmbeddings-mP45L{œdataTypeœ:œOpenAIEmbeddingsœ,œidœ:œOpenAIEmbeddings-mP45Lœ,œnameœ:œembeddingsœ,œoutput_typesœ:[œEmbeddingsœ]}-OpenSearchHybrid-XtKoA{œfieldNameœ:œembeddingœ,œidœ:œOpenSearchHybrid-XtKoAœ,œinputTypesœ:[œEmbeddingsœ],œtypeœ:œotherœ}",
- "selected": false,
- "source": "OpenAIEmbeddings-mP45L",
- "sourceHandle": "{œdataTypeœ:œOpenAIEmbeddingsœ,œidœ:œOpenAIEmbeddings-mP45Lœ,œnameœ:œembeddingsœ,œoutput_typesœ:[œEmbeddingsœ]}",
- "target": "OpenSearchHybrid-XtKoA",
- "targetHandle": "{œfieldNameœ:œembeddingœ,œidœ:œOpenSearchHybrid-XtKoAœ,œinputTypesœ:[œEmbeddingsœ],œtypeœ:œotherœ}"
- },
{
"animated": false,
"className": "",
@@ -116,6 +88,34 @@
"sourceHandle": "{œdataTypeœ:œExportDoclingDocumentœ,œidœ:œExportDoclingDocument-xFoCIœ,œnameœ:œdataœ,œoutput_typesœ:[œDataœ]}",
"target": "SplitText-3ZI5B",
"targetHandle": "{œfieldNameœ:œdata_inputsœ,œidœ:œSplitText-3ZI5Bœ,œinputTypesœ:[œDataœ,œDataFrameœ,œMessageœ],œtypeœ:œotherœ}"
+ },
+ {
+ "animated": false,
+ "className": "",
+ "data": {
+ "sourceHandle": {
+ "dataType": "EmbeddingModel",
+ "id": "EmbeddingModel-cxG9r",
+ "name": "embeddings",
+ "output_types": [
+ "Embeddings"
+ ]
+ },
+ "targetHandle": {
+ "fieldName": "embedding",
+ "id": "OpenSearchHybrid-XtKoA",
+ "inputTypes": [
+ "Embeddings"
+ ],
+ "type": "other"
+ }
+ },
+ "id": "xy-edge__EmbeddingModel-cxG9r{œdataTypeœ:œEmbeddingModelœ,œidœ:œEmbeddingModel-cxG9rœ,œnameœ:œembeddingsœ,œoutput_typesœ:[œEmbeddingsœ]}-OpenSearchHybrid-XtKoA{œfieldNameœ:œembeddingœ,œidœ:œOpenSearchHybrid-XtKoAœ,œinputTypesœ:[œEmbeddingsœ],œtypeœ:œotherœ}",
+ "selected": false,
+ "source": "EmbeddingModel-cxG9r",
+ "sourceHandle": "{œdataTypeœ:œEmbeddingModelœ,œidœ:œEmbeddingModel-cxG9rœ,œnameœ:œembeddingsœ,œoutput_typesœ:[œEmbeddingsœ]}",
+ "target": "OpenSearchHybrid-XtKoA",
+ "targetHandle": "{œfieldNameœ:œembeddingœ,œidœ:œOpenSearchHybrid-XtKoAœ,œinputTypesœ:[œEmbeddingsœ],œtypeœ:œotherœ}"
}
],
"nodes": [
@@ -361,585 +361,6 @@
"type": "genericNode",
"width": 320
},
- {
- "data": {
- "id": "OpenAIEmbeddings-mP45L",
- "node": {
- "base_classes": [
- "Embeddings"
- ],
- "beta": false,
- "conditional_paths": [],
- "custom_fields": {},
- "description": "Generate embeddings using OpenAI models.",
- "display_name": "OpenAI Embeddings",
- "documentation": "",
- "edited": false,
- "field_order": [
- "default_headers",
- "default_query",
- "chunk_size",
- "client",
- "deployment",
- "embedding_ctx_length",
- "max_retries",
- "model",
- "model_kwargs",
- "openai_api_key",
- "openai_api_base",
- "openai_api_type",
- "openai_api_version",
- "openai_organization",
- "openai_proxy",
- "request_timeout",
- "show_progress_bar",
- "skip_empty",
- "tiktoken_model_name",
- "tiktoken_enable",
- "dimensions"
- ],
- "frozen": false,
- "icon": "OpenAI",
- "legacy": false,
- "metadata": {
- "code_hash": "8a658ed6d4c9",
- "dependencies": {
- "dependencies": [
- {
- "name": "langchain_openai",
- "version": "0.3.23"
- },
- {
- "name": "lfx",
- "version": null
- }
- ],
- "total_dependencies": 2
- },
- "module": "custom_components.openai_embeddings"
- },
- "minimized": false,
- "output_types": [],
- "outputs": [
- {
- "allows_loop": false,
- "cache": true,
- "display_name": "Embedding Model",
- "group_outputs": false,
- "method": "build_embeddings",
- "name": "embeddings",
- "options": null,
- "required_inputs": null,
- "selected": "Embeddings",
- "tool_mode": true,
- "types": [
- "Embeddings"
- ],
- "value": "__UNDEFINED__"
- }
- ],
- "pinned": false,
- "template": {
- "_type": "Component",
- "chunk_size": {
- "_input_type": "IntInput",
- "advanced": true,
- "display_name": "Chunk Size",
- "dynamic": false,
- "info": "",
- "list": false,
- "list_add_label": "Add More",
- "name": "chunk_size",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_metadata": true,
- "type": "int",
- "value": 1000
- },
- "client": {
- "_input_type": "MessageTextInput",
- "advanced": true,
- "display_name": "Client",
- "dynamic": false,
- "info": "",
- "input_types": [
- "Message"
- ],
- "list": false,
- "list_add_label": "Add More",
- "load_from_db": false,
- "name": "client",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "trace_as_metadata": true,
- "type": "str",
- "value": ""
- },
- "code": {
- "advanced": true,
- "dynamic": true,
- "fileTypes": [],
- "file_path": "",
- "info": "",
- "list": false,
- "load_from_db": false,
- "multiline": true,
- "name": "code",
- "password": false,
- "placeholder": "",
- "required": true,
- "show": true,
- "title_case": false,
- "type": "code",
- "value": "from langchain_openai import OpenAIEmbeddings\n\nfrom lfx.base.embeddings.model import LCEmbeddingsModel\nfrom lfx.base.models.openai_constants import OPENAI_EMBEDDING_MODEL_NAMES\nfrom lfx.field_typing import Embeddings\nfrom lfx.io import BoolInput, DictInput, DropdownInput, FloatInput, IntInput, MessageTextInput, SecretStrInput\n\n\nclass OpenAIEmbeddingsComponent(LCEmbeddingsModel):\n display_name = \"OpenAI Embeddings\"\n description = \"Generate embeddings using OpenAI models.\"\n icon = \"OpenAI\"\n name = \"OpenAIEmbeddings\"\n\n inputs = [\n DictInput(\n name=\"default_headers\",\n display_name=\"Default Headers\",\n advanced=True,\n info=\"Default headers to use for the API request.\",\n ),\n DictInput(\n name=\"default_query\",\n display_name=\"Default Query\",\n advanced=True,\n info=\"Default query parameters to use for the API request.\",\n ),\n IntInput(name=\"chunk_size\", display_name=\"Chunk Size\", advanced=True, value=1000),\n MessageTextInput(name=\"client\", display_name=\"Client\", advanced=True),\n MessageTextInput(name=\"deployment\", display_name=\"Deployment\", advanced=True),\n IntInput(name=\"embedding_ctx_length\", display_name=\"Embedding Context Length\", advanced=True, value=1536),\n IntInput(name=\"max_retries\", display_name=\"Max Retries\", value=3, advanced=True),\n DropdownInput(\n name=\"model\",\n display_name=\"Model\",\n advanced=False,\n options=OPENAI_EMBEDDING_MODEL_NAMES,\n value=\"text-embedding-3-small\",\n ),\n DictInput(name=\"model_kwargs\", display_name=\"Model Kwargs\", advanced=True),\n SecretStrInput(name=\"openai_api_key\", display_name=\"OpenAI API Key\", value=\"OPENAI_API_KEY\", required=True),\n MessageTextInput(name=\"openai_api_base\", display_name=\"OpenAI API Base\", advanced=True),\n MessageTextInput(name=\"openai_api_type\", display_name=\"OpenAI API Type\", advanced=True),\n MessageTextInput(name=\"openai_api_version\", display_name=\"OpenAI API Version\", advanced=True),\n MessageTextInput(\n name=\"openai_organization\",\n display_name=\"OpenAI Organization\",\n advanced=True,\n ),\n MessageTextInput(name=\"openai_proxy\", display_name=\"OpenAI Proxy\", advanced=True),\n FloatInput(name=\"request_timeout\", display_name=\"Request Timeout\", advanced=True),\n BoolInput(name=\"show_progress_bar\", display_name=\"Show Progress Bar\", advanced=True),\n BoolInput(name=\"skip_empty\", display_name=\"Skip Empty\", advanced=True),\n MessageTextInput(\n name=\"tiktoken_model_name\",\n display_name=\"TikToken Model Name\",\n advanced=True,\n ),\n BoolInput(\n name=\"tiktoken_enable\",\n display_name=\"TikToken Enable\",\n advanced=True,\n value=True,\n info=\"If False, you must have transformers installed.\",\n ),\n IntInput(\n name=\"dimensions\",\n display_name=\"Dimensions\",\n info=\"The number of dimensions the resulting output embeddings should have. 
\"\n \"Only supported by certain models.\",\n advanced=True,\n ),\n ]\n\n def build_embeddings(self) -> Embeddings:\n return OpenAIEmbeddings(\n client=self.client or None,\n model=self.model,\n dimensions=self.dimensions or None,\n deployment=self.deployment or None,\n api_version=self.openai_api_version or None,\n base_url=self.openai_api_base or None,\n openai_api_type=self.openai_api_type or None,\n openai_proxy=self.openai_proxy or None,\n embedding_ctx_length=self.embedding_ctx_length,\n api_key=self.openai_api_key or None,\n organization=self.openai_organization or None,\n allowed_special=\"all\",\n disallowed_special=\"all\",\n chunk_size=self.chunk_size,\n max_retries=self.max_retries,\n timeout=self.request_timeout or None,\n tiktoken_enabled=self.tiktoken_enable,\n tiktoken_model_name=self.tiktoken_model_name or None,\n show_progress_bar=self.show_progress_bar,\n model_kwargs=self.model_kwargs,\n skip_empty=self.skip_empty,\n default_headers=self.default_headers or None,\n default_query=self.default_query or None,\n )\n"
- },
- "default_headers": {
- "_input_type": "DictInput",
- "advanced": true,
- "display_name": "Default Headers",
- "dynamic": false,
- "info": "Default headers to use for the API request.",
- "list": false,
- "list_add_label": "Add More",
- "name": "default_headers",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "type": "dict",
- "value": {}
- },
- "default_query": {
- "_input_type": "DictInput",
- "advanced": true,
- "display_name": "Default Query",
- "dynamic": false,
- "info": "Default query parameters to use for the API request.",
- "list": false,
- "list_add_label": "Add More",
- "name": "default_query",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "type": "dict",
- "value": {}
- },
- "deployment": {
- "_input_type": "MessageTextInput",
- "advanced": true,
- "display_name": "Deployment",
- "dynamic": false,
- "info": "",
- "input_types": [
- "Message"
- ],
- "list": false,
- "list_add_label": "Add More",
- "load_from_db": false,
- "name": "deployment",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "trace_as_metadata": true,
- "type": "str",
- "value": ""
- },
- "dimensions": {
- "_input_type": "IntInput",
- "advanced": true,
- "display_name": "Dimensions",
- "dynamic": false,
- "info": "The number of dimensions the resulting output embeddings should have. Only supported by certain models.",
- "list": false,
- "list_add_label": "Add More",
- "name": "dimensions",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_metadata": true,
- "type": "int",
- "value": ""
- },
- "embedding_ctx_length": {
- "_input_type": "IntInput",
- "advanced": true,
- "display_name": "Embedding Context Length",
- "dynamic": false,
- "info": "",
- "list": false,
- "list_add_label": "Add More",
- "name": "embedding_ctx_length",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_metadata": true,
- "type": "int",
- "value": 1536
- },
- "max_retries": {
- "_input_type": "IntInput",
- "advanced": true,
- "display_name": "Max Retries",
- "dynamic": false,
- "info": "",
- "list": false,
- "list_add_label": "Add More",
- "name": "max_retries",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_metadata": true,
- "type": "int",
- "value": 3
- },
- "model": {
- "_input_type": "DropdownInput",
- "advanced": false,
- "combobox": false,
- "dialog_inputs": {},
- "display_name": "Model",
- "dynamic": false,
- "info": "",
- "name": "model",
- "options": [
- "text-embedding-3-small",
- "text-embedding-3-large",
- "text-embedding-ada-002"
- ],
- "options_metadata": [],
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "toggle": false,
- "tool_mode": false,
- "trace_as_metadata": true,
- "type": "str",
- "value": "text-embedding-3-small"
- },
- "model_kwargs": {
- "_input_type": "DictInput",
- "advanced": true,
- "display_name": "Model Kwargs",
- "dynamic": false,
- "info": "",
- "list": false,
- "list_add_label": "Add More",
- "name": "model_kwargs",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "type": "dict",
- "value": {}
- },
- "openai_api_base": {
- "_input_type": "MessageTextInput",
- "advanced": true,
- "display_name": "OpenAI API Base",
- "dynamic": false,
- "info": "",
- "input_types": [
- "Message"
- ],
- "list": false,
- "list_add_label": "Add More",
- "load_from_db": false,
- "name": "openai_api_base",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "trace_as_metadata": true,
- "type": "str",
- "value": ""
- },
- "openai_api_key": {
- "_input_type": "SecretStrInput",
- "advanced": false,
- "display_name": "OpenAI API Key",
- "dynamic": false,
- "info": "",
- "input_types": [],
- "load_from_db": false,
- "name": "openai_api_key",
- "password": true,
- "placeholder": "",
- "required": true,
- "show": true,
- "title_case": false,
- "type": "str",
- "value": ""
- },
- "openai_api_type": {
- "_input_type": "MessageTextInput",
- "advanced": true,
- "display_name": "OpenAI API Type",
- "dynamic": false,
- "info": "",
- "input_types": [
- "Message"
- ],
- "list": false,
- "list_add_label": "Add More",
- "load_from_db": false,
- "name": "openai_api_type",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "trace_as_metadata": true,
- "type": "str",
- "value": ""
- },
- "openai_api_version": {
- "_input_type": "MessageTextInput",
- "advanced": true,
- "display_name": "OpenAI API Version",
- "dynamic": false,
- "info": "",
- "input_types": [
- "Message"
- ],
- "list": false,
- "list_add_label": "Add More",
- "load_from_db": false,
- "name": "openai_api_version",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "trace_as_metadata": true,
- "type": "str",
- "value": ""
- },
- "openai_organization": {
- "_input_type": "MessageTextInput",
- "advanced": true,
- "display_name": "OpenAI Organization",
- "dynamic": false,
- "info": "",
- "input_types": [
- "Message"
- ],
- "list": false,
- "list_add_label": "Add More",
- "load_from_db": false,
- "name": "openai_organization",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "trace_as_metadata": true,
- "type": "str",
- "value": ""
- },
- "openai_proxy": {
- "_input_type": "MessageTextInput",
- "advanced": true,
- "display_name": "OpenAI Proxy",
- "dynamic": false,
- "info": "",
- "input_types": [
- "Message"
- ],
- "list": false,
- "list_add_label": "Add More",
- "load_from_db": false,
- "name": "openai_proxy",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "trace_as_metadata": true,
- "type": "str",
- "value": ""
- },
- "request_timeout": {
- "_input_type": "FloatInput",
- "advanced": true,
- "display_name": "Request Timeout",
- "dynamic": false,
- "info": "",
- "list": false,
- "list_add_label": "Add More",
- "name": "request_timeout",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_metadata": true,
- "type": "float",
- "value": ""
- },
- "show_progress_bar": {
- "_input_type": "BoolInput",
- "advanced": true,
- "display_name": "Show Progress Bar",
- "dynamic": false,
- "info": "",
- "list": false,
- "list_add_label": "Add More",
- "name": "show_progress_bar",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_metadata": true,
- "type": "bool",
- "value": false
- },
- "skip_empty": {
- "_input_type": "BoolInput",
- "advanced": true,
- "display_name": "Skip Empty",
- "dynamic": false,
- "info": "",
- "list": false,
- "list_add_label": "Add More",
- "name": "skip_empty",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_metadata": true,
- "type": "bool",
- "value": false
- },
- "tiktoken_enable": {
- "_input_type": "BoolInput",
- "advanced": true,
- "display_name": "TikToken Enable",
- "dynamic": false,
- "info": "If False, you must have transformers installed.",
- "list": false,
- "list_add_label": "Add More",
- "name": "tiktoken_enable",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_metadata": true,
- "type": "bool",
- "value": true
- },
- "tiktoken_model_name": {
- "_input_type": "MessageTextInput",
- "advanced": true,
- "display_name": "TikToken Model Name",
- "dynamic": false,
- "info": "",
- "input_types": [
- "Message"
- ],
- "list": false,
- "list_add_label": "Add More",
- "load_from_db": false,
- "name": "tiktoken_model_name",
- "placeholder": "",
- "required": false,
- "show": true,
- "title_case": false,
- "tool_mode": false,
- "trace_as_input": true,
- "trace_as_metadata": true,
- "type": "str",
- "value": ""
- }
- },
- "tool_mode": false
- },
- "selected_output": "embeddings",
- "type": "OpenAIEmbeddings"
- },
- "dragging": false,
- "height": 320,
- "id": "OpenAIEmbeddings-mP45L",
- "measured": {
- "height": 320,
- "width": 320
- },
- "position": {
- "x": 1704.8491676318172,
- "y": 1879.144249471858
- },
- "positionAbsolute": {
- "x": 1690.9220896443658,
- "y": 1866.483269483266
- },
- "selected": false,
- "type": "genericNode",
- "width": 320
- },
- {
- "data": {
- "id": "note-59mzY",
- "node": {
- "description": "### 💡 Add your OpenAI API key here 👇",
- "display_name": "",
- "documentation": "",
- "template": {
- "backgroundColor": "transparent"
- }
- },
- "type": "note"
- },
- "dragging": false,
- "height": 324,
- "id": "note-59mzY",
- "measured": {
- "height": 324,
- "width": 324
- },
- "position": {
- "x": 1692.2322233423606,
- "y": 1821.9077961087607
- },
- "positionAbsolute": {
- "x": 1692.2322233423606,
- "y": 1821.9077961087607
- },
- "selected": false,
- "type": "noteNode",
- "width": 324
- },
{
"data": {
"id": "OpenSearchHybrid-XtKoA",
@@ -1327,7 +748,7 @@
"dynamic": false,
"info": "Paste a valid JWT (sent as a header).",
"input_types": [],
- "load_from_db": false,
+ "load_from_db": true,
"name": "jwt_token",
"password": true,
"placeholder": "",
@@ -1562,7 +983,7 @@
"dragging": false,
"id": "OpenSearchHybrid-XtKoA",
"measured": {
- "height": 765,
+ "height": 760,
"width": 320
},
"position": {
@@ -1574,6 +995,8 @@
},
{
"data": {
+ "description": "Uses Docling to process input documents connecting to your instance of Docling Serve.",
+ "display_name": "Docling Serve",
"id": "DoclingRemote-78KoX",
"node": {
"base_classes": [
@@ -1603,9 +1026,8 @@
"frozen": false,
"icon": "Docling",
"legacy": false,
- "lf_version": "1.6.0",
"metadata": {
- "code_hash": "930312ffe40c",
+ "code_hash": "880538860431",
"dependencies": {
"dependencies": [
{
@@ -1621,13 +1043,13 @@
"version": "2.10.6"
},
{
- "name": "lfx",
+ "name": "langflow",
"version": null
}
],
"total_dependencies": 4
},
- "module": "lfx.components.docling.docling_remote.DoclingRemoteComponent"
+ "module": "custom_components.docling_serve"
},
"minimized": false,
"output_types": [],
@@ -1639,6 +1061,8 @@
"group_outputs": false,
"method": "load_files",
"name": "dataframe",
+ "options": null,
+ "required_inputs": null,
"selected": "DataFrame",
"tool_mode": true,
"types": [
@@ -1704,7 +1128,7 @@
"show": true,
"title_case": false,
"type": "code",
- "value": "import base64\nimport time\nfrom concurrent.futures import Future, ThreadPoolExecutor\nfrom pathlib import Path\nfrom typing import Any\n\nimport httpx\nfrom docling_core.types.doc import DoclingDocument\nfrom pydantic import ValidationError\n\nfrom lfx.base.data import BaseFileComponent\nfrom lfx.inputs import IntInput, NestedDictInput, StrInput\nfrom lfx.inputs.inputs import FloatInput\nfrom lfx.schema import Data\n\n\nclass DoclingRemoteComponent(BaseFileComponent):\n display_name = \"Docling Serve\"\n description = \"Uses Docling to process input documents connecting to your instance of Docling Serve.\"\n documentation = \"https://docling-project.github.io/docling/\"\n trace_type = \"tool\"\n icon = \"Docling\"\n name = \"DoclingRemote\"\n\n MAX_500_RETRIES = 5\n\n # https://docling-project.github.io/docling/usage/supported_formats/\n VALID_EXTENSIONS = [\n \"adoc\",\n \"asciidoc\",\n \"asc\",\n \"bmp\",\n \"csv\",\n \"dotx\",\n \"dotm\",\n \"docm\",\n \"docx\",\n \"htm\",\n \"html\",\n \"jpeg\",\n \"json\",\n \"md\",\n \"pdf\",\n \"png\",\n \"potx\",\n \"ppsx\",\n \"pptm\",\n \"potm\",\n \"ppsm\",\n \"pptx\",\n \"tiff\",\n \"txt\",\n \"xls\",\n \"xlsx\",\n \"xhtml\",\n \"xml\",\n \"webp\",\n ]\n\n inputs = [\n *BaseFileComponent.get_base_inputs(),\n StrInput(\n name=\"api_url\",\n display_name=\"Server address\",\n info=\"URL of the Docling Serve instance.\",\n required=True,\n ),\n IntInput(\n name=\"max_concurrency\",\n display_name=\"Concurrency\",\n info=\"Maximum number of concurrent requests for the server.\",\n advanced=True,\n value=2,\n ),\n FloatInput(\n name=\"max_poll_timeout\",\n display_name=\"Maximum poll time\",\n info=\"Maximum waiting time for the document conversion to complete.\",\n advanced=True,\n value=3600,\n ),\n NestedDictInput(\n name=\"api_headers\",\n display_name=\"HTTP headers\",\n advanced=True,\n required=False,\n info=(\"Optional dictionary of additional headers required for connecting to Docling Serve.\"),\n ),\n NestedDictInput(\n name=\"docling_serve_opts\",\n display_name=\"Docling options\",\n advanced=True,\n required=False,\n info=(\n \"Optional dictionary of additional options. 
\"\n \"See https://github.com/docling-project/docling-serve/blob/main/docs/usage.md for more information.\"\n ),\n ),\n ]\n\n outputs = [\n *BaseFileComponent.get_base_outputs(),\n ]\n\n def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:\n base_url = f\"{self.api_url}/v1\"\n\n def _convert_document(client: httpx.Client, file_path: Path, options: dict[str, Any]) -> Data | None:\n encoded_doc = base64.b64encode(file_path.read_bytes()).decode()\n payload = {\n \"options\": options,\n \"sources\": [{\"kind\": \"file\", \"base64_string\": encoded_doc, \"filename\": file_path.name}],\n }\n\n response = client.post(f\"{base_url}/convert/source/async\", json=payload)\n response.raise_for_status()\n task = response.json()\n\n http_failures = 0\n retry_status_start = 500\n retry_status_end = 600\n start_wait_time = time.monotonic()\n while task[\"task_status\"] not in (\"success\", \"failure\"):\n # Check if processing exceeds the maximum poll timeout\n processing_time = time.monotonic() - start_wait_time\n if processing_time >= self.max_poll_timeout:\n msg = (\n f\"Processing time {processing_time=} exceeds the maximum poll timeout {self.max_poll_timeout=}.\"\n \"Please increase the max_poll_timeout parameter or review why the processing \"\n \"takes long on the server.\"\n )\n self.log(msg)\n raise RuntimeError(msg)\n\n # Call for a new status update\n time.sleep(2)\n response = client.get(f\"{base_url}/status/poll/{task['task_id']}\")\n\n # Check if the status call gets into 5xx errors and retry\n if retry_status_start <= response.status_code < retry_status_end:\n http_failures += 1\n if http_failures > self.MAX_500_RETRIES:\n self.log(f\"The status requests got a http response {response.status_code} too many times.\")\n return None\n continue\n\n # Update task status\n task = response.json()\n\n result_resp = client.get(f\"{base_url}/result/{task['task_id']}\")\n result_resp.raise_for_status()\n result = result_resp.json()\n\n if \"json_content\" not in result[\"document\"] or result[\"document\"][\"json_content\"] is None:\n self.log(\"No JSON DoclingDocument found in the result.\")\n return None\n\n try:\n doc = DoclingDocument.model_validate(result[\"document\"][\"json_content\"])\n return Data(data={\"doc\": doc, \"file_path\": str(file_path)})\n except ValidationError as e:\n self.log(f\"Error validating the document. {e}\")\n return None\n\n docling_options = {\n \"to_formats\": [\"json\"],\n \"image_export_mode\": \"placeholder\",\n **(self.docling_serve_opts or {}),\n }\n\n processed_data: list[Data | None] = []\n with (\n httpx.Client(headers=self.api_headers) as client,\n ThreadPoolExecutor(max_workers=self.max_concurrency) as executor,\n ):\n futures: list[tuple[int, Future]] = []\n for i, file in enumerate(file_list):\n if file.path is None:\n processed_data.append(None)\n continue\n\n futures.append((i, executor.submit(_convert_document, client, file.path, docling_options)))\n\n for _index, future in futures:\n try:\n result_data = future.result()\n processed_data.append(result_data)\n except (httpx.HTTPStatusError, httpx.RequestError, KeyError, ValueError) as exc:\n self.log(f\"Docling remote processing failed: {exc}\")\n raise\n\n return self.rollup_data(file_list, processed_data)\n"
+ "value": "import base64\nimport time\nfrom concurrent.futures import Future, ThreadPoolExecutor\nfrom pathlib import Path\nfrom typing import Any\n\nimport httpx\nfrom docling_core.types.doc import DoclingDocument\nfrom pydantic import ValidationError\n\nfrom langflow.base.data import BaseFileComponent\nfrom langflow.inputs import IntInput, NestedDictInput, StrInput\nfrom langflow.inputs.inputs import FloatInput\nfrom langflow.schema import Data\n\n\nclass DoclingRemoteComponent(BaseFileComponent):\n display_name = \"Docling Serve\"\n description = \"Uses Docling to process input documents connecting to your instance of Docling Serve.\"\n documentation = \"https://docling-project.github.io/docling/\"\n trace_type = \"tool\"\n icon = \"Docling\"\n name = \"DoclingRemote\"\n\n MAX_500_RETRIES = 5\n\n # https://docling-project.github.io/docling/usage/supported_formats/\n VALID_EXTENSIONS = [\n \"adoc\",\n \"asciidoc\",\n \"asc\",\n \"bmp\",\n \"csv\",\n \"dotx\",\n \"dotm\",\n \"docm\",\n \"docx\",\n \"htm\",\n \"html\",\n \"jpeg\",\n \"json\",\n \"md\",\n \"pdf\",\n \"png\",\n \"potx\",\n \"ppsx\",\n \"pptm\",\n \"potm\",\n \"ppsm\",\n \"pptx\",\n \"tiff\",\n \"txt\",\n \"xls\",\n \"xlsx\",\n \"xhtml\",\n \"xml\",\n \"webp\",\n ]\n\n inputs = [\n *BaseFileComponent._base_inputs,\n StrInput(\n name=\"api_url\",\n display_name=\"Server address\",\n info=\"URL of the Docling Serve instance.\",\n required=True,\n ),\n IntInput(\n name=\"max_concurrency\",\n display_name=\"Concurrency\",\n info=\"Maximum number of concurrent requests for the server.\",\n advanced=True,\n value=2,\n ),\n FloatInput(\n name=\"max_poll_timeout\",\n display_name=\"Maximum poll time\",\n info=\"Maximum waiting time for the document conversion to complete.\",\n advanced=True,\n value=3600,\n ),\n NestedDictInput(\n name=\"api_headers\",\n display_name=\"HTTP headers\",\n advanced=True,\n required=False,\n info=(\"Optional dictionary of additional headers required for connecting to Docling Serve.\"),\n ),\n NestedDictInput(\n name=\"docling_serve_opts\",\n display_name=\"Docling options\",\n advanced=True,\n required=False,\n info=(\n \"Optional dictionary of additional options. 
\"\n \"See https://github.com/docling-project/docling-serve/blob/main/docs/usage.md for more information.\"\n ),\n ),\n ]\n\n outputs = [\n *BaseFileComponent._base_outputs,\n ]\n\n def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:\n base_url = f\"{self.api_url}/v1alpha\"\n\n def _convert_document(client: httpx.Client, file_path: Path, options: dict[str, Any]) -> Data | None:\n encoded_doc = base64.b64encode(file_path.read_bytes()).decode()\n payload = {\n \"options\": options,\n \"file_sources\": [{\"base64_string\": encoded_doc, \"filename\": file_path.name}],\n }\n\n response = client.post(f\"{base_url}/convert/source/async\", json=payload)\n response.raise_for_status()\n task = response.json()\n\n http_failures = 0\n retry_status_start = 500\n retry_status_end = 600\n start_wait_time = time.monotonic()\n while task[\"task_status\"] not in (\"success\", \"failure\"):\n # Check if processing exceeds the maximum poll timeout\n processing_time = time.monotonic() - start_wait_time\n if processing_time >= self.max_poll_timeout:\n msg = (\n f\"Processing time {processing_time=} exceeds the maximum poll timeout {self.max_poll_timeout=}.\"\n \"Please increase the max_poll_timeout parameter or review why the processing \"\n \"takes long on the server.\"\n )\n self.log(msg)\n raise RuntimeError(msg)\n\n # Call for a new status update\n time.sleep(2)\n response = client.get(f\"{base_url}/status/poll/{task['task_id']}\")\n\n # Check if the status call gets into 5xx errors and retry\n if retry_status_start <= response.status_code < retry_status_end:\n http_failures += 1\n if http_failures > self.MAX_500_RETRIES:\n self.log(f\"The status requests got a http response {response.status_code} too many times.\")\n return None\n continue\n\n # Update task status\n task = response.json()\n\n result_resp = client.get(f\"{base_url}/result/{task['task_id']}\")\n result_resp.raise_for_status()\n result = result_resp.json()\n\n if \"json_content\" not in result[\"document\"] or result[\"document\"][\"json_content\"] is None:\n self.log(\"No JSON DoclingDocument found in the result.\")\n return None\n\n try:\n doc = DoclingDocument.model_validate(result[\"document\"][\"json_content\"])\n return Data(data={\"doc\": doc, \"file_path\": str(file_path)})\n except ValidationError as e:\n self.log(f\"Error validating the document. {e}\")\n return None\n\n docling_options = {\n \"to_formats\": [\"json\"],\n \"image_export_mode\": \"placeholder\",\n \"return_as_file\": False,\n **(self.docling_serve_opts or {}),\n }\n\n processed_data: list[Data | None] = []\n with (\n httpx.Client(headers=self.api_headers) as client,\n ThreadPoolExecutor(max_workers=self.max_concurrency) as executor,\n ):\n futures: list[tuple[int, Future]] = []\n for i, file in enumerate(file_list):\n if file.path is None:\n processed_data.append(None)\n continue\n\n futures.append((i, executor.submit(_convert_document, client, file.path, docling_options)))\n\n for _index, future in futures:\n try:\n result_data = future.result()\n processed_data.append(result_data)\n except (httpx.HTTPStatusError, httpx.RequestError, KeyError, ValueError) as exc:\n self.log(f\"Docling remote processing failed: {exc}\")\n raise\n\n return self.rollup_data(file_list, processed_data)\n"
},
"delete_server_file_after_processing": {
"_input_type": "BoolInput",
@@ -1732,6 +1156,7 @@
"info": "Optional dictionary of additional options. See https://github.com/docling-project/docling-serve/blob/main/docs/usage.md for more information.",
"list": false,
"list_add_label": "Add More",
+ "load_from_db": false,
"name": "docling_serve_opts",
"placeholder": "",
"required": false,
@@ -1939,18 +1364,20 @@
"dragging": false,
"id": "DoclingRemote-78KoX",
"measured": {
- "height": 475,
+ "height": 472,
"width": 320
},
"position": {
"x": 974.2998232996713,
"y": 1337.9345348080217
},
- "selected": true,
+ "selected": false,
"type": "genericNode"
},
{
"data": {
+ "description": "Export DoclingDocument to markdown, html or other formats.",
+ "display_name": "Export DoclingDocument",
"id": "ExportDoclingDocument-xFoCI",
"node": {
"base_classes": [
@@ -1975,9 +1402,8 @@
"frozen": false,
"icon": "Docling",
"legacy": false,
- "lf_version": "1.6.0",
"metadata": {
- "code_hash": "4de16ddd37ac",
+ "code_hash": "451c9673bd4c",
"dependencies": {
"dependencies": [
{
@@ -1985,13 +1411,13 @@
"version": "2.45.0"
},
{
- "name": "lfx",
+ "name": "langflow",
"version": null
}
],
"total_dependencies": 2
},
- "module": "lfx.components.docling.export_docling_document.ExportDoclingDocumentComponent"
+ "module": "custom_components.export_doclingdocument"
},
"minimized": false,
"output_types": [],
@@ -2003,6 +1429,8 @@
"group_outputs": false,
"method": "export_document",
"name": "data",
+ "options": null,
+ "required_inputs": null,
"selected": "Data",
"tool_mode": true,
"types": [
@@ -2017,6 +1445,9 @@
"group_outputs": false,
"method": "as_dataframe",
"name": "dataframe",
+ "options": null,
+ "required_inputs": null,
+ "selected": "DataFrame",
"tool_mode": true,
"types": [
"DataFrame"
@@ -2043,7 +1474,7 @@
"show": true,
"title_case": false,
"type": "code",
- "value": "from typing import Any\n\nfrom docling_core.types.doc import ImageRefMode\n\nfrom lfx.base.data.docling_utils import extract_docling_documents\nfrom lfx.custom import Component\nfrom lfx.io import DropdownInput, HandleInput, MessageTextInput, Output, StrInput\nfrom lfx.schema import Data, DataFrame\n\n\nclass ExportDoclingDocumentComponent(Component):\n display_name: str = \"Export DoclingDocument\"\n description: str = \"Export DoclingDocument to markdown, html or other formats.\"\n documentation = \"https://docling-project.github.io/docling/\"\n icon = \"Docling\"\n name = \"ExportDoclingDocument\"\n\n inputs = [\n HandleInput(\n name=\"data_inputs\",\n display_name=\"Data or DataFrame\",\n info=\"The data with documents to export.\",\n input_types=[\"Data\", \"DataFrame\"],\n required=True,\n ),\n DropdownInput(\n name=\"export_format\",\n display_name=\"Export format\",\n options=[\"Markdown\", \"HTML\", \"Plaintext\", \"DocTags\"],\n info=\"Select the export format to convert the input.\",\n value=\"Markdown\",\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"image_mode\",\n display_name=\"Image export mode\",\n options=[\"placeholder\", \"embedded\"],\n info=(\n \"Specify how images are exported in the output. Placeholder will replace the images with a string, \"\n \"whereas Embedded will include them as base64 encoded images.\"\n ),\n value=\"placeholder\",\n ),\n StrInput(\n name=\"md_image_placeholder\",\n display_name=\"Image placeholder\",\n info=\"Specify the image placeholder for markdown exports.\",\n value=\"\",\n advanced=True,\n ),\n StrInput(\n name=\"md_page_break_placeholder\",\n display_name=\"Page break placeholder\",\n info=\"Add this placeholder betweek pages in the markdown output.\",\n value=\"\",\n advanced=True,\n ),\n MessageTextInput(\n name=\"doc_key\",\n display_name=\"Doc Key\",\n info=\"The key to use for the DoclingDocument column.\",\n value=\"doc\",\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Exported data\", name=\"data\", method=\"export_document\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n def update_build_config(self, build_config: dict, field_value: Any, field_name: str | None = None) -> dict:\n if field_name == \"export_format\" and field_value == \"Markdown\":\n build_config[\"md_image_placeholder\"][\"show\"] = True\n build_config[\"md_page_break_placeholder\"][\"show\"] = True\n build_config[\"image_mode\"][\"show\"] = True\n elif field_name == \"export_format\" and field_value == \"HTML\":\n build_config[\"md_image_placeholder\"][\"show\"] = False\n build_config[\"md_page_break_placeholder\"][\"show\"] = False\n build_config[\"image_mode\"][\"show\"] = True\n elif field_name == \"export_format\" and field_value in {\"Plaintext\", \"DocTags\"}:\n build_config[\"md_image_placeholder\"][\"show\"] = False\n build_config[\"md_page_break_placeholder\"][\"show\"] = False\n build_config[\"image_mode\"][\"show\"] = False\n\n return build_config\n\n def export_document(self) -> list[Data]:\n documents = extract_docling_documents(self.data_inputs, self.doc_key)\n\n results: list[Data] = []\n try:\n image_mode = ImageRefMode(self.image_mode)\n for doc in documents:\n content = \"\"\n if self.export_format == \"Markdown\":\n content = doc.export_to_markdown(\n image_mode=image_mode,\n image_placeholder=self.md_image_placeholder,\n page_break_placeholder=self.md_page_break_placeholder,\n )\n elif self.export_format == \"HTML\":\n content = 
doc.export_to_html(image_mode=image_mode)\n elif self.export_format == \"Plaintext\":\n content = doc.export_to_text()\n elif self.export_format == \"DocTags\":\n content = doc.export_to_doctags()\n\n results.append(Data(text=content))\n except Exception as e:\n msg = f\"Error splitting text: {e}\"\n raise TypeError(msg) from e\n\n return results\n\n def as_dataframe(self) -> DataFrame:\n return DataFrame(self.export_document())\n"
+ "value": "from typing import Any\n\nfrom docling_core.types.doc import ImageRefMode\n\nfrom langflow.base.data.docling_utils import extract_docling_documents\nfrom langflow.custom import Component\nfrom langflow.io import DropdownInput, HandleInput, MessageTextInput, Output, StrInput\nfrom langflow.schema import Data, DataFrame\n\n\nclass ExportDoclingDocumentComponent(Component):\n display_name: str = \"Export DoclingDocument\"\n description: str = \"Export DoclingDocument to markdown, html or other formats.\"\n documentation = \"https://docling-project.github.io/docling/\"\n icon = \"Docling\"\n name = \"ExportDoclingDocument\"\n\n inputs = [\n HandleInput(\n name=\"data_inputs\",\n display_name=\"Data or DataFrame\",\n info=\"The data with documents to export.\",\n input_types=[\"Data\", \"DataFrame\"],\n required=True,\n ),\n DropdownInput(\n name=\"export_format\",\n display_name=\"Export format\",\n options=[\"Markdown\", \"HTML\", \"Plaintext\", \"DocTags\"],\n info=\"Select the export format to convert the input.\",\n value=\"Markdown\",\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"image_mode\",\n display_name=\"Image export mode\",\n options=[\"placeholder\", \"embedded\"],\n info=(\n \"Specify how images are exported in the output. Placeholder will replace the images with a string, \"\n \"whereas Embedded will include them as base64 encoded images.\"\n ),\n value=\"placeholder\",\n ),\n StrInput(\n name=\"md_image_placeholder\",\n display_name=\"Image placeholder\",\n info=\"Specify the image placeholder for markdown exports.\",\n value=\"\",\n advanced=True,\n ),\n StrInput(\n name=\"md_page_break_placeholder\",\n display_name=\"Page break placeholder\",\n info=\"Add this placeholder betweek pages in the markdown output.\",\n value=\"\",\n advanced=True,\n ),\n MessageTextInput(\n name=\"doc_key\",\n display_name=\"Doc Key\",\n info=\"The key to use for the DoclingDocument column.\",\n value=\"doc\",\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Exported data\", name=\"data\", method=\"export_document\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n def update_build_config(self, build_config: dict, field_value: Any, field_name: str | None = None) -> dict:\n if field_name == \"export_format\" and field_value == \"Markdown\":\n build_config[\"md_image_placeholder\"][\"show\"] = True\n build_config[\"md_page_break_placeholder\"][\"show\"] = True\n build_config[\"image_mode\"][\"show\"] = True\n elif field_name == \"export_format\" and field_value == \"HTML\":\n build_config[\"md_image_placeholder\"][\"show\"] = False\n build_config[\"md_page_break_placeholder\"][\"show\"] = False\n build_config[\"image_mode\"][\"show\"] = True\n elif field_name == \"export_format\" and field_value in {\"Plaintext\", \"DocTags\"}:\n build_config[\"md_image_placeholder\"][\"show\"] = False\n build_config[\"md_page_break_placeholder\"][\"show\"] = False\n build_config[\"image_mode\"][\"show\"] = False\n\n return build_config\n\n def export_document(self) -> list[Data]:\n documents = extract_docling_documents(self.data_inputs, self.doc_key)\n\n results: list[Data] = []\n try:\n image_mode = ImageRefMode(self.image_mode)\n for doc in documents:\n content = \"\"\n if self.export_format == \"Markdown\":\n content = doc.export_to_markdown(\n image_mode=image_mode,\n image_placeholder=self.md_image_placeholder,\n page_break_placeholder=self.md_page_break_placeholder,\n )\n elif self.export_format == \"HTML\":\n content = 
doc.export_to_html(image_mode=image_mode)\n elif self.export_format == \"Plaintext\":\n content = doc.export_to_text()\n elif self.export_format == \"DocTags\":\n content = doc.export_to_doctags()\n\n results.append(Data(text=content))\n except Exception as e:\n msg = f\"Error splitting text: {e}\"\n raise TypeError(msg) from e\n\n return results\n\n def as_dataframe(self) -> DataFrame:\n return DataFrame(self.export_document())\n"
},
"data_inputs": {
"_input_type": "HandleInput",
@@ -2188,7 +1619,7 @@
"dragging": false,
"id": "ExportDoclingDocument-xFoCI",
"measured": {
- "height": 347,
+ "height": 344,
"width": 320
},
"position": {
@@ -2197,19 +1628,328 @@
},
"selected": false,
"type": "genericNode"
+ },
+ {
+ "data": {
+ "id": "EmbeddingModel-cxG9r",
+ "node": {
+ "base_classes": [
+ "Embeddings"
+ ],
+ "beta": false,
+ "conditional_paths": [],
+ "custom_fields": {},
+ "description": "Generate embeddings using a specified provider.",
+ "display_name": "Embedding Model",
+ "documentation": "https://docs.langflow.org/components-embedding-models",
+ "edited": false,
+ "field_order": [
+ "provider",
+ "model",
+ "api_key",
+ "api_base",
+ "dimensions",
+ "chunk_size",
+ "request_timeout",
+ "max_retries",
+ "show_progress_bar",
+ "model_kwargs"
+ ],
+ "frozen": false,
+ "icon": "binary",
+ "last_updated": "2025-09-24T16:02:07.998Z",
+ "legacy": false,
+ "metadata": {
+ "code_hash": "93faf11517da",
+ "dependencies": {
+ "dependencies": [
+ {
+ "name": "langchain_openai",
+ "version": "0.3.23"
+ },
+ {
+ "name": "langflow",
+ "version": null
+ }
+ ],
+ "total_dependencies": 2
+ },
+ "module": "langflow.components.models.embedding_model.EmbeddingModelComponent"
+ },
+ "minimized": false,
+ "output_types": [],
+ "outputs": [
+ {
+ "allows_loop": false,
+ "cache": true,
+ "display_name": "Embedding Model",
+ "group_outputs": false,
+ "method": "build_embeddings",
+ "name": "embeddings",
+ "options": null,
+ "required_inputs": null,
+ "selected": "Embeddings",
+ "tool_mode": true,
+ "types": [
+ "Embeddings"
+ ],
+ "value": "__UNDEFINED__"
+ }
+ ],
+ "pinned": false,
+ "template": {
+ "_type": "Component",
+ "api_base": {
+ "_input_type": "MessageTextInput",
+ "advanced": true,
+ "display_name": "API Base URL",
+ "dynamic": false,
+ "info": "Base URL for the API. Leave empty for default.",
+ "input_types": [
+ "Message"
+ ],
+ "list": false,
+ "list_add_label": "Add More",
+ "load_from_db": false,
+ "name": "api_base",
+ "placeholder": "",
+ "required": false,
+ "show": true,
+ "title_case": false,
+ "tool_mode": false,
+ "trace_as_input": true,
+ "trace_as_metadata": true,
+ "type": "str",
+ "value": ""
+ },
+ "api_key": {
+ "_input_type": "SecretStrInput",
+ "advanced": false,
+ "display_name": "OpenAI API Key",
+ "dynamic": false,
+ "info": "Model Provider API key",
+ "input_types": [],
+ "load_from_db": true,
+ "name": "api_key",
+ "password": true,
+ "placeholder": "",
+ "real_time_refresh": true,
+ "required": true,
+ "show": true,
+ "title_case": false,
+ "type": "str",
+ "value": ""
+ },
+ "chunk_size": {
+ "_input_type": "IntInput",
+ "advanced": true,
+ "display_name": "Chunk Size",
+ "dynamic": false,
+ "info": "",
+ "list": false,
+ "list_add_label": "Add More",
+ "name": "chunk_size",
+ "placeholder": "",
+ "required": false,
+ "show": true,
+ "title_case": false,
+ "tool_mode": false,
+ "trace_as_metadata": true,
+ "type": "int",
+ "value": 1000
+ },
+ "code": {
+ "advanced": true,
+ "dynamic": true,
+ "fileTypes": [],
+ "file_path": "",
+ "info": "",
+ "list": false,
+ "load_from_db": false,
+ "multiline": true,
+ "name": "code",
+ "password": false,
+ "placeholder": "",
+ "required": true,
+ "show": true,
+ "title_case": false,
+ "type": "code",
+            "value": "from typing import Any\n\nfrom langchain_openai import OpenAIEmbeddings\n\nfrom langflow.base.embeddings.model import LCEmbeddingsModel\nfrom langflow.base.models.openai_constants import OPENAI_EMBEDDING_MODEL_NAMES\nfrom langflow.field_typing import Embeddings\nfrom langflow.io import (\n    BoolInput,\n    DictInput,\n    DropdownInput,\n    FloatInput,\n    IntInput,\n    MessageTextInput,\n    SecretStrInput,\n)\nfrom langflow.schema.dotdict import dotdict\n\n\nclass EmbeddingModelComponent(LCEmbeddingsModel):\n    display_name = \"Embedding Model\"\n    description = \"Generate embeddings using a specified provider.\"\n    documentation: str = \"https://docs.langflow.org/components-embedding-models\"\n    icon = \"binary\"\n    name = \"EmbeddingModel\"\n    category = \"models\"\n\n    inputs = [\n        DropdownInput(\n            name=\"provider\",\n            display_name=\"Model Provider\",\n            options=[\"OpenAI\"],\n            value=\"OpenAI\",\n            info=\"Select the embedding model provider\",\n            real_time_refresh=True,\n            options_metadata=[{\"icon\": \"OpenAI\"}],\n        ),\n        DropdownInput(\n            name=\"model\",\n            display_name=\"Model Name\",\n            options=OPENAI_EMBEDDING_MODEL_NAMES,\n            value=OPENAI_EMBEDDING_MODEL_NAMES[0],\n            info=\"Select the embedding model to use\",\n        ),\n        SecretStrInput(\n            name=\"api_key\",\n            display_name=\"OpenAI API Key\",\n            info=\"Model Provider API key\",\n            required=True,\n            show=True,\n            real_time_refresh=True,\n        ),\n        MessageTextInput(\n            name=\"api_base\",\n            display_name=\"API Base URL\",\n            info=\"Base URL for the API. Leave empty for default.\",\n            advanced=True,\n        ),\n        IntInput(\n            name=\"dimensions\",\n            display_name=\"Dimensions\",\n            info=\"The number of dimensions the resulting output embeddings should have. \"\n            \"Only supported by certain models.\",\n            advanced=True,\n        ),\n        IntInput(name=\"chunk_size\", display_name=\"Chunk Size\", advanced=True, value=1000),\n        FloatInput(name=\"request_timeout\", display_name=\"Request Timeout\", advanced=True),\n        IntInput(name=\"max_retries\", display_name=\"Max Retries\", advanced=True, value=3),\n        BoolInput(name=\"show_progress_bar\", display_name=\"Show Progress Bar\", advanced=True),\n        DictInput(\n            name=\"model_kwargs\",\n            display_name=\"Model Kwargs\",\n            advanced=True,\n            info=\"Additional keyword arguments to pass to the model.\",\n        ),\n    ]\n\n    def build_embeddings(self) -> Embeddings:\n        provider = self.provider\n        model = self.model\n        api_key = self.api_key\n        api_base = self.api_base\n        dimensions = self.dimensions\n        chunk_size = self.chunk_size\n        request_timeout = self.request_timeout\n        max_retries = self.max_retries\n        show_progress_bar = self.show_progress_bar\n        model_kwargs = self.model_kwargs or {}\n\n        if provider == \"OpenAI\":\n            if not api_key:\n                msg = \"OpenAI API key is required when using OpenAI provider\"\n                raise ValueError(msg)\n            return OpenAIEmbeddings(\n                model=model,\n                dimensions=dimensions or None,\n                base_url=api_base or None,\n                api_key=api_key,\n                chunk_size=chunk_size,\n                max_retries=max_retries,\n                timeout=request_timeout or None,\n                show_progress_bar=show_progress_bar,\n                model_kwargs=model_kwargs,\n            )\n        msg = f\"Unknown provider: {provider}\"\n        raise ValueError(msg)\n\n    def update_build_config(self, build_config: dotdict, field_value: Any, field_name: str | None = None) -> dotdict:\n        if field_name == \"provider\" and field_value == \"OpenAI\":\n            build_config[\"model\"][\"options\"] = OPENAI_EMBEDDING_MODEL_NAMES\n            build_config[\"model\"][\"value\"] = OPENAI_EMBEDDING_MODEL_NAMES[0]\n            build_config[\"api_key\"][\"display_name\"] = \"OpenAI API Key\"\n            build_config[\"api_base\"][\"display_name\"] = \"OpenAI API Base URL\"\n        return build_config\n"
+ },
+ "dimensions": {
+ "_input_type": "IntInput",
+ "advanced": true,
+ "display_name": "Dimensions",
+ "dynamic": false,
+ "info": "The number of dimensions the resulting output embeddings should have. Only supported by certain models.",
+ "list": false,
+ "list_add_label": "Add More",
+ "name": "dimensions",
+ "placeholder": "",
+ "required": false,
+ "show": true,
+ "title_case": false,
+ "tool_mode": false,
+ "trace_as_metadata": true,
+ "type": "int",
+ "value": ""
+ },
+ "max_retries": {
+ "_input_type": "IntInput",
+ "advanced": true,
+ "display_name": "Max Retries",
+ "dynamic": false,
+ "info": "",
+ "list": false,
+ "list_add_label": "Add More",
+ "name": "max_retries",
+ "placeholder": "",
+ "required": false,
+ "show": true,
+ "title_case": false,
+ "tool_mode": false,
+ "trace_as_metadata": true,
+ "type": "int",
+ "value": 3
+ },
+ "model": {
+ "_input_type": "DropdownInput",
+ "advanced": false,
+ "combobox": false,
+ "dialog_inputs": {},
+ "display_name": "Model Name",
+ "dynamic": false,
+ "info": "Select the embedding model to use",
+ "name": "model",
+ "options": [
+ "text-embedding-3-small",
+ "text-embedding-3-large",
+ "text-embedding-ada-002"
+ ],
+ "options_metadata": [],
+ "placeholder": "",
+ "required": false,
+ "show": true,
+ "title_case": false,
+ "toggle": false,
+ "tool_mode": false,
+ "trace_as_metadata": true,
+ "type": "str",
+ "value": "text-embedding-3-small"
+ },
+ "model_kwargs": {
+ "_input_type": "DictInput",
+ "advanced": true,
+ "display_name": "Model Kwargs",
+ "dynamic": false,
+ "info": "Additional keyword arguments to pass to the model.",
+ "list": false,
+ "list_add_label": "Add More",
+ "name": "model_kwargs",
+ "placeholder": "",
+ "required": false,
+ "show": true,
+ "title_case": false,
+ "tool_mode": false,
+ "trace_as_input": true,
+ "type": "dict",
+ "value": {}
+ },
+ "provider": {
+ "_input_type": "DropdownInput",
+ "advanced": false,
+ "combobox": false,
+ "dialog_inputs": {},
+ "display_name": "Model Provider",
+ "dynamic": false,
+ "info": "Select the embedding model provider",
+ "name": "provider",
+ "options": [
+ "OpenAI"
+ ],
+ "options_metadata": [
+ {
+ "icon": "OpenAI"
+ }
+ ],
+ "placeholder": "",
+ "real_time_refresh": true,
+ "required": false,
+ "show": true,
+ "title_case": false,
+ "toggle": false,
+ "tool_mode": false,
+ "trace_as_metadata": true,
+ "type": "str",
+ "value": "OpenAI"
+ },
+ "request_timeout": {
+ "_input_type": "FloatInput",
+ "advanced": true,
+ "display_name": "Request Timeout",
+ "dynamic": false,
+ "info": "",
+ "list": false,
+ "list_add_label": "Add More",
+ "name": "request_timeout",
+ "placeholder": "",
+ "required": false,
+ "show": true,
+ "title_case": false,
+ "tool_mode": false,
+ "trace_as_metadata": true,
+ "type": "float",
+ "value": ""
+ },
+ "show_progress_bar": {
+ "_input_type": "BoolInput",
+ "advanced": true,
+ "display_name": "Show Progress Bar",
+ "dynamic": false,
+ "info": "",
+ "list": false,
+ "list_add_label": "Add More",
+ "name": "show_progress_bar",
+ "placeholder": "",
+ "required": false,
+ "show": true,
+ "title_case": false,
+ "tool_mode": false,
+ "trace_as_metadata": true,
+ "type": "bool",
+ "value": false
+ }
+ },
+ "tool_mode": false
+ },
+ "showNode": true,
+ "type": "EmbeddingModel"
+ },
+ "dragging": false,
+ "id": "EmbeddingModel-cxG9r",
+ "measured": {
+ "height": 366,
+ "width": 320
+ },
+ "position": {
+ "x": 1743.8608432729177,
+ "y": 1808.780792406514
+ },
+ "selected": false,
+ "type": "genericNode"
}
],
"viewport": {
- "x": -708.9707113557265,
- "y": -965.7967428241175,
- "zoom": 0.7967811989815704
+ "x": -767.6929603556041,
+ "y": -1196.6455082358875,
+ "zoom": 0.9277466102702023
}
},
"description": "Load your data for chat context with Retrieval Augmented Generation.",
"endpoint_name": null,
"id": "1402618b-e6d1-4ff2-9a11-d6ce71186915",
"is_component": false,
- "last_tested_version": "1.6.0",
+ "last_tested_version": "1.5.0.post2",
"name": "OpenSearch Ingestion Flow Docling Serve",
"tags": [
"openai",
diff --git a/src/api/settings.py b/src/api/settings.py
index 560eb400..37072c63 100644
--- a/src/api/settings.py
+++ b/src/api/settings.py
@@ -182,6 +182,7 @@ async def update_settings(request, session_manager):
"chunk_size",
"chunk_overlap",
"doclingPresets",
+ "embedding_model",
}
# Check for invalid fields
@@ -202,11 +203,53 @@ async def update_settings(request, session_manager):
current_config.agent.llm_model = body["llm_model"]
config_updated = True
+ # Also update the chat flow with the new model
+ try:
+ flows_service = _get_flows_service()
+ await flows_service.update_chat_flow_model(body["llm_model"])
+ logger.info(f"Successfully updated chat flow model to '{body['llm_model']}'")
+ except Exception as e:
+ logger.error(f"Failed to update chat flow model: {str(e)}")
+ # Don't fail the entire settings update if flow update fails
+ # The config will still be saved
+
if "system_prompt" in body:
current_config.agent.system_prompt = body["system_prompt"]
config_updated = True
+ # Also update the chat flow with the new system prompt
+ try:
+ flows_service = _get_flows_service()
+ await flows_service.update_chat_flow_system_prompt(body["system_prompt"])
+                logger.info("Successfully updated chat flow system prompt")
+ except Exception as e:
+ logger.error(f"Failed to update chat flow system prompt: {str(e)}")
+ # Don't fail the entire settings update if flow update fails
+ # The config will still be saved
+
# Update knowledge settings
+ if "embedding_model" in body:
+ if (
+ not isinstance(body["embedding_model"], str)
+ or not body["embedding_model"].strip()
+ ):
+ return JSONResponse(
+ {"error": "embedding_model must be a non-empty string"},
+ status_code=400,
+ )
+ current_config.knowledge.embedding_model = body["embedding_model"].strip()
+ config_updated = True
+
+ # Also update the ingest flow with the new embedding model
+ try:
+ flows_service = _get_flows_service()
+ await flows_service.update_ingest_flow_embedding_model(body["embedding_model"].strip())
+ logger.info(f"Successfully updated ingest flow embedding model to '{body['embedding_model'].strip()}'")
+ except Exception as e:
+ logger.error(f"Failed to update ingest flow embedding model: {str(e)}")
+ # Don't fail the entire settings update if flow update fails
+ # The config will still be saved
+
if "doclingPresets" in body:
preset_configs = get_docling_preset_configs()
valid_presets = list(preset_configs.keys())
@@ -222,7 +265,8 @@ async def update_settings(request, session_manager):
# Also update the flow with the new docling preset
try:
- await _update_flow_docling_preset(body["doclingPresets"], preset_configs[body["doclingPresets"]])
+ flows_service = _get_flows_service()
+ await flows_service.update_flow_docling_preset(body["doclingPresets"], preset_configs[body["doclingPresets"]])
logger.info(f"Successfully updated docling preset in flow to '{body['doclingPresets']}'")
except Exception as e:
logger.error(f"Failed to update docling preset in flow: {str(e)}")
@@ -237,6 +281,16 @@ async def update_settings(request, session_manager):
current_config.knowledge.chunk_size = body["chunk_size"]
config_updated = True
+ # Also update the ingest flow with the new chunk size
+ try:
+ flows_service = _get_flows_service()
+ await flows_service.update_ingest_flow_chunk_size(body["chunk_size"])
+ logger.info(f"Successfully updated ingest flow chunk size to {body['chunk_size']}")
+ except Exception as e:
+ logger.error(f"Failed to update ingest flow chunk size: {str(e)}")
+ # Don't fail the entire settings update if flow update fails
+ # The config will still be saved
+
if "chunk_overlap" in body:
if not isinstance(body["chunk_overlap"], int) or body["chunk_overlap"] < 0:
return JSONResponse(
@@ -246,6 +300,16 @@ async def update_settings(request, session_manager):
current_config.knowledge.chunk_overlap = body["chunk_overlap"]
config_updated = True
+ # Also update the ingest flow with the new chunk overlap
+ try:
+ flows_service = _get_flows_service()
+ await flows_service.update_ingest_flow_chunk_overlap(body["chunk_overlap"])
+ logger.info(f"Successfully updated ingest flow chunk overlap to {body['chunk_overlap']}")
+ except Exception as e:
+ logger.error(f"Failed to update ingest flow chunk overlap: {str(e)}")
+ # Don't fail the entire settings update if flow update fails
+ # The config will still be saved
+
if not config_updated:
return JSONResponse(
{"error": "No valid fields provided for update"}, status_code=400
@@ -524,48 +588,12 @@ async def onboarding(request, flows_service):
)
-async def _update_flow_docling_preset(preset: str, preset_config: dict):
- """Helper function to update docling preset in the ingest flow"""
- if not LANGFLOW_INGEST_FLOW_ID:
- raise ValueError("LANGFLOW_INGEST_FLOW_ID is not configured")
- # Get the current flow data from Langflow
- response = await clients.langflow_request(
- "GET", f"/api/v1/flows/{LANGFLOW_INGEST_FLOW_ID}"
- )
- if response.status_code != 200:
- raise Exception(f"Failed to get ingest flow: HTTP {response.status_code} - {response.text}")
-
- flow_data = response.json()
-
- # Find the target node in the flow using environment variable
- nodes = flow_data.get("data", {}).get("nodes", [])
- target_node = None
- target_node_index = None
-
- for i, node in enumerate(nodes):
- if node.get("id") == DOCLING_COMPONENT_ID:
- target_node = node
- target_node_index = i
- break
-
- if target_node is None:
- raise Exception(f"Docling component '{DOCLING_COMPONENT_ID}' not found in ingest flow")
-
- # Update the docling_serve_opts value directly in the existing node
- if (target_node.get("data", {}).get("node", {}).get("template", {}).get("docling_serve_opts")):
- flow_data["data"]["nodes"][target_node_index]["data"]["node"]["template"]["docling_serve_opts"]["value"] = preset_config
- else:
- raise Exception(f"docling_serve_opts field not found in node '{DOCLING_COMPONENT_ID}'")
-
- # Update the flow via PATCH request
- patch_response = await clients.langflow_request(
- "PATCH", f"/api/v1/flows/{LANGFLOW_INGEST_FLOW_ID}", json=flow_data
- )
-
- if patch_response.status_code != 200:
- raise Exception(f"Failed to update ingest flow: HTTP {patch_response.status_code} - {patch_response.text}")
+def _get_flows_service():
+ """Helper function to get flows service instance"""
+ from services.flows_service import FlowsService
+ return FlowsService()
async def update_docling_preset(request, session_manager):
@@ -595,7 +623,8 @@ async def update_docling_preset(request, session_manager):
preset_config = preset_configs[preset]
# Use the helper function to update the flow
- await _update_flow_docling_preset(preset, preset_config)
+ flows_service = _get_flows_service()
+ await flows_service.update_flow_docling_preset(preset, preset_config)
logger.info(f"Successfully updated docling preset to '{preset}' in ingest flow")
diff --git a/src/services/flows_service.py b/src/services/flows_service.py
index 4c3872ca..8993025a 100644
--- a/src/services/flows_service.py
+++ b/src/services/flows_service.py
@@ -400,6 +400,128 @@ class FlowsService:
return node
return None
+ def _find_node_in_flow(self, flow_data, node_id=None, display_name=None):
+ """
+ Helper function to find a node in flow data by ID or display name.
+ Returns tuple of (node, node_index) or (None, None) if not found.
+ """
+ nodes = flow_data.get("data", {}).get("nodes", [])
+
+ for i, node in enumerate(nodes):
+ node_data = node.get("data", {})
+ node_template = node_data.get("node", {})
+
+ # Check by ID if provided
+ if node_id and node_data.get("id") == node_id:
+ return node, i
+
+ # Check by display_name if provided
+ if display_name and node_template.get("display_name") == display_name:
+ return node, i
+
+ return None, None
+
+    async def _update_flow_field(self, flow_id: str, field_name: str, field_value: str | int | dict, node_display_name: str | None = None, node_id: str | None = None):
+ """
+ Generic helper function to update any field in any Langflow component.
+
+ Args:
+ flow_id: The ID of the flow to update
+ field_name: The name of the field to update (e.g., 'model_name', 'system_message', 'docling_serve_opts')
+ field_value: The new value to set
+ node_display_name: The display name to search for (optional)
+ node_id: The node ID to search for (optional, used as fallback or primary)
+ """
+ if not flow_id:
+ raise ValueError("flow_id is required")
+
+ # Get the current flow data from Langflow
+ response = await clients.langflow_request(
+ "GET", f"/api/v1/flows/{flow_id}"
+ )
+
+ if response.status_code != 200:
+ raise Exception(f"Failed to get flow: HTTP {response.status_code} - {response.text}")
+
+ flow_data = response.json()
+
+ # Find the target component by display name first, then by ID as fallback
+ target_node, target_node_index = None, None
+ if node_display_name:
+ target_node, target_node_index = self._find_node_in_flow(flow_data, display_name=node_display_name)
+
+ if target_node is None and node_id:
+ target_node, target_node_index = self._find_node_in_flow(flow_data, node_id=node_id)
+
+ if target_node is None:
+ identifier = node_display_name or node_id
+ raise Exception(f"Component '{identifier}' not found in flow {flow_id}")
+
+ # Update the field value directly in the existing node
+ template = target_node.get("data", {}).get("node", {}).get("template", {})
+ if template.get(field_name):
+ flow_data["data"]["nodes"][target_node_index]["data"]["node"]["template"][field_name]["value"] = field_value
+ else:
+ identifier = node_display_name or node_id
+ raise Exception(f"{field_name} field not found in {identifier} component")
+
+ # Update the flow via PATCH request
+ patch_response = await clients.langflow_request(
+ "PATCH", f"/api/v1/flows/{flow_id}", json=flow_data
+ )
+
+ if patch_response.status_code != 200:
+ raise Exception(f"Failed to update flow: HTTP {patch_response.status_code} - {patch_response.text}")
+
+ async def update_chat_flow_model(self, model_name: str):
+ """Helper function to update the model in the chat flow"""
+ if not LANGFLOW_CHAT_FLOW_ID:
+ raise ValueError("LANGFLOW_CHAT_FLOW_ID is not configured")
+ await self._update_flow_field(LANGFLOW_CHAT_FLOW_ID, "model_name", model_name,
+ node_display_name="Language Model",
+ node_id="LanguageModelComponent-0YME7")
+
+ async def update_chat_flow_system_prompt(self, system_prompt: str):
+ """Helper function to update the system prompt in the chat flow"""
+ if not LANGFLOW_CHAT_FLOW_ID:
+ raise ValueError("LANGFLOW_CHAT_FLOW_ID is not configured")
+ await self._update_flow_field(LANGFLOW_CHAT_FLOW_ID, "system_message", system_prompt,
+ node_display_name="Language Model",
+ node_id="LanguageModelComponent-0YME7")
+
+ async def update_flow_docling_preset(self, preset: str, preset_config: dict):
+ """Helper function to update docling preset in the ingest flow"""
+ if not LANGFLOW_INGEST_FLOW_ID:
+ raise ValueError("LANGFLOW_INGEST_FLOW_ID is not configured")
+
+ from config.settings import DOCLING_COMPONENT_ID
+ await self._update_flow_field(LANGFLOW_INGEST_FLOW_ID, "docling_serve_opts", preset_config,
+ node_id=DOCLING_COMPONENT_ID)
+
+ async def update_ingest_flow_chunk_size(self, chunk_size: int):
+ """Helper function to update chunk size in the ingest flow"""
+ if not LANGFLOW_INGEST_FLOW_ID:
+ raise ValueError("LANGFLOW_INGEST_FLOW_ID is not configured")
+ await self._update_flow_field(LANGFLOW_INGEST_FLOW_ID, "chunk_size", chunk_size,
+ node_display_name="Split Text",
+ node_id="SplitText-3ZI5B")
+
+ async def update_ingest_flow_chunk_overlap(self, chunk_overlap: int):
+ """Helper function to update chunk overlap in the ingest flow"""
+ if not LANGFLOW_INGEST_FLOW_ID:
+ raise ValueError("LANGFLOW_INGEST_FLOW_ID is not configured")
+ await self._update_flow_field(LANGFLOW_INGEST_FLOW_ID, "chunk_overlap", chunk_overlap,
+ node_display_name="Split Text",
+ node_id="SplitText-3ZI5B")
+
+ async def update_ingest_flow_embedding_model(self, embedding_model: str):
+ """Helper function to update embedding model in the ingest flow"""
+ if not LANGFLOW_INGEST_FLOW_ID:
+ raise ValueError("LANGFLOW_INGEST_FLOW_ID is not configured")
+ await self._update_flow_field(LANGFLOW_INGEST_FLOW_ID, "model", embedding_model,
+ node_display_name="Embedding Model",
+ node_id="EmbeddingModel-eZ6bT")
+
def _replace_node_in_flow(self, flow_data, old_id, new_node):
"""Replace a node in the flow data"""
nodes = flow_data.get("data", {}).get("nodes", [])
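
Since `_find_node_in_flow` is a pure lookup over the flow payload, its contract is easy to check against a toy flow shaped like the JSON earlier in this patch. The self-contained sketch below re-implements the method's logic as a plain function for illustration; it is not part of the patch:

```python
# Standalone re-implementation of FlowsService._find_node_in_flow; the toy
# payload mimics the EmbeddingModel node added in the flow JSON above.
def find_node(flow_data, node_id=None, display_name=None):
    for i, node in enumerate(flow_data.get("data", {}).get("nodes", [])):
        node_data = node.get("data", {})
        if node_id and node_data.get("id") == node_id:
            return node, i
        if display_name and node_data.get("node", {}).get("display_name") == display_name:
            return node, i
    return None, None


toy_flow = {
    "data": {
        "nodes": [
            {
                "data": {
                    "id": "EmbeddingModel-cxG9r",
                    "node": {
                        "display_name": "Embedding Model",
                        "template": {"model": {"value": "text-embedding-3-small"}},
                    },
                }
            }
        ]
    }
}

# Display-name lookup, the primary path used by the update_* helpers.
node, idx = find_node(toy_flow, display_name="Embedding Model")
assert idx == 0 and node["data"]["id"] == "EmbeddingModel-cxG9r"

# ID lookup, the fallback path when the display name is not found.
node, idx = find_node(toy_flow, node_id="EmbeddingModel-cxG9r")
assert node is not None
```
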