From f0744b153d8db820988c4d2d76fc9b841921345a Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Tue, 2 Sep 2025 23:10:20 -0300
Subject: [PATCH 01/67] Add LangflowFileService for file upload and ingestion
 flow

This commit introduces the LangflowFileService class, which provides methods
for uploading user files, deleting user files, and triggering an ingestion
flow using the Langflow Files API. The service is designed to handle
asynchronous operations and includes error handling for API requests.
Documentation for each method is included to ensure clarity on usage.
---
 src/services/langflow_file_service.py | 75 +++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 src/services/langflow_file_service.py

diff --git a/src/services/langflow_file_service.py b/src/services/langflow_file_service.py
new file mode 100644
index 00000000..5945d5b7
--- /dev/null
+++ b/src/services/langflow_file_service.py
@@ -0,0 +1,75 @@
+from typing import Any, Dict, List, Optional
+
+import httpx
+
+from config.settings import FLOW_ID_INGEST, LANGFLOW_BASE_URL, LANGFLOW_KEY
+
+
+class LangflowFileService:
+    def __init__(self):
+        self.base_url = LANGFLOW_BASE_URL.rstrip("/")
+        self.api_key = LANGFLOW_KEY
+        self.flow_id_ingest = FLOW_ID_INGEST
+
+    def _headers(self, extra: Optional[Dict[str, str]] = None) -> Dict[str, str]:
+        headers = {"x-api-key": self.api_key} if self.api_key else {}
+        if extra:
+            headers.update(extra)
+        return headers
+
+    async def upload_user_file(self, file_tuple) -> Dict[str, Any]:
+        """Upload a file for the current user using the Langflow Files API."""
+        url = f"{self.base_url}/files/user/upload"
+        async with httpx.AsyncClient(timeout=60.0) as client:
+            files = {"file": file_tuple}
+            resp = await client.post(url, headers=self._headers(), files=files)
+            resp.raise_for_status()
+            return resp.json()
+
+    async def delete_user_file(self, file_id: str) -> None:
+        url = f"{self.base_url}/files/user/{file_id}"
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            resp = await client.delete(url, headers=self._headers())
+            resp.raise_for_status()
+
+    async def run_ingestion_flow(
+        self,
+        file_paths: List[str],
+        session_id: Optional[str] = None,
+        tweaks: Optional[Dict[str, Any]] = None,
+        jwt_token: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """
+        Trigger the ingestion flow with the provided file paths.
+        The flow must expose a File component path in its input schema or accept a files parameter.
+        """
+        if not self.flow_id_ingest:
+            raise ValueError("FLOW_ID_INGEST is not configured")
+
+        url = f"{self.base_url}/run/{self.flow_id_ingest}"
+
+        payload: Dict[str, Any] = {
+            "input_value": "Ingest files",
+            "input_type": "chat",
+            "output_type": "json",
+        }
+
+        # Prefer passing files via 'files' if the flow supports it, otherwise via tweaks
+        if file_paths:
+            payload["files"] = file_paths
+        if tweaks:
+            payload["tweaks"] = tweaks
+        if session_id:
+            payload["session_id"] = session_id
+
+        extra_headers = {}
+        if jwt_token:
+            # Provide user context if the flow needs it
+            extra_headers["X-LANGFLOW-GLOBAL-VAR-JWT"] = jwt_token
+
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            resp = await client.post(
+                url, headers=self._headers(extra_headers), json=payload
+            )
+            resp.raise_for_status()
+            return resp.json()

From 50f1663374f97191451b01e5082c851b745161a5 Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Tue, 2 Sep 2025 23:10:28 -0300
Subject: [PATCH 02/67] Add API endpoints for file upload, ingestion, and
 deletion

This commit introduces three asynchronous API endpoints in langflow_files.py:
upload_user_file, run_ingestion, and delete_user_files. Each endpoint handles
file operations with appropriate error handling and returns JSON responses.
The implementation ensures robust interaction with the LangflowFileService
for managing user files.
---
 src/api/langflow_files.py | 88 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 src/api/langflow_files.py

diff --git a/src/api/langflow_files.py b/src/api/langflow_files.py
new file mode 100644
index 00000000..2d2cfd42
--- /dev/null
+++ b/src/api/langflow_files.py
@@ -0,0 +1,88 @@
+from starlette.requests import Request
+from starlette.responses import JSONResponse
+
+from services.langflow_file_service import LangflowFileService
+
+
+async def upload_user_file(
+    request: Request, langflow_file_service: LangflowFileService, session_manager
+):
+    try:
+        form = await request.form()
+        upload_file = form.get("file")
+        if upload_file is None:
+            return JSONResponse({"error": "Missing file"}, status_code=400)
+
+        # Starlette's UploadFile is file-like; httpx needs (filename, file, content_type)
+        file_tuple = (
+            upload_file.filename,
+            await upload_file.read(),
+            upload_file.content_type or "application/octet-stream",
+        )
+
+        result = await langflow_file_service.upload_user_file(file_tuple)
+        return JSONResponse(result, status_code=201)
+    except Exception as e:
+        return JSONResponse({"error": str(e)}, status_code=500)
+
+
+async def run_ingestion(
+    request: Request, langflow_file_service: LangflowFileService, session_manager
+):
+    try:
+        payload = await request.json()
+        file_ids = payload.get("file_ids")
+        file_paths = payload.get("file_paths") or []
+        session_id = payload.get("session_id")
+        tweaks = payload.get("tweaks")
+
+        # We assume file_paths is provided. If only file_ids are provided, the client
+        # would need to resolve them to paths via the Files API (not implemented here).
+        if not file_paths and not file_ids:
+            return JSONResponse(
+                {"error": "Provide file_paths or file_ids"}, status_code=400
+            )
+
+        # Include the user JWT if available
+        jwt_token = getattr(request.state, "jwt_token", None)
+
+        result = await langflow_file_service.run_ingestion_flow(
+            file_paths=file_paths or [],
+            session_id=session_id,
+            tweaks=tweaks,
+            jwt_token=jwt_token,
+        )
+        return JSONResponse(result)
+    except Exception as e:
+        return JSONResponse({"error": str(e)}, status_code=500)
+
+
+async def delete_user_files(
+    request: Request, langflow_file_service: LangflowFileService, session_manager
+):
+    try:
+        payload = await request.json()
+        file_ids = payload.get("file_ids")
+        if not file_ids or not isinstance(file_ids, list):
+            return JSONResponse(
+                {"error": "file_ids must be a non-empty list"}, status_code=400
+            )
+
+        errors = []
+        for fid in file_ids:
+            try:
+                await langflow_file_service.delete_user_file(fid)
+            except Exception as e:
+                errors.append({"file_id": fid, "error": str(e)})
+
+        status = 207 if errors else 200
+        return JSONResponse(
+            {
+                "deleted": [
+                    fid for fid in file_ids if fid not in [e["file_id"] for e in errors]
+                ],
+                "errors": errors,
+            },
+            status_code=status,
+        )
+    except Exception as e:
+        return JSONResponse({"error": str(e)}, status_code=500)

From 003cb1a443641cc5ef25a41491f7753645a22d2b Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Tue, 2 Sep 2025 23:10:37 -0300
Subject: [PATCH 03/67] Refactor imports and enhance service organization in
 main.py

This commit reorganizes the import statements in main.py for better clarity
and structure. It consolidates API endpoint imports and service imports,
ensuring a cleaner and more maintainable codebase. Additionally, it updates
the print statement for connection loading to improve logging consistency.
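A minimal sketch of how the service from PATCH 01 is driven end to end by the
handlers from PATCH 02. The file name, file bytes, and the "path" key in the
upload response are illustrative assumptions, not guaranteed by these patches:

    import asyncio

    from services.langflow_file_service import LangflowFileService

    async def demo():
        service = LangflowFileService()
        # Mirrors the (filename, bytes, content_type) tuple the API layer
        # builds from a Starlette UploadFile before calling the service.
        file_tuple = ("example.pdf", b"%PDF-1.4 ...", "application/pdf")
        uploaded = await service.upload_user_file(file_tuple)
        # Assumes the upload response exposes the server-side file path.
        result = await service.run_ingestion_flow(file_paths=[uploaded["path"]])
        print(result)

    asyncio.run(demo())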
---
 src/main.py | 45 +++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/src/main.py b/src/main.py
index 5f0aeedc..7ea5a197 100644
--- a/src/main.py
+++ b/src/main.py
@@ -4,6 +4,7 @@ import multiprocessing
 import os
 import subprocess
 from functools import partial
+
 from starlette.applications import Starlette
 from starlette.routing import Route
 
@@ -11,30 +12,38 @@ from starlette.routing import Route
 multiprocessing.set_start_method('spawn', force=True)
 
 # Create process pool FIRST, before any torch/CUDA imports
-from utils.process_pool import process_pool
-
 import torch
 
-# Configuration and setup
-from config.settings import clients, INDEX_NAME, INDEX_BODY, SESSION_SECRET
-from utils.gpu_detection import detect_gpu_devices
+# API endpoints
+from api import (
+    auth,
+    chat,
+    connectors,
+    knowledge_filter,
+    oidc,
+    search,
+    settings,
+    tasks,
+    upload,
+)
+from auth_middleware import optional_auth, require_auth
 
-# Services
-from services.document_service import DocumentService
-from services.search_service import SearchService
-from services.task_service import TaskService
-from services.auth_service import AuthService
-from services.chat_service import ChatService
-from services.knowledge_filter_service import KnowledgeFilterService
-from services.monitor_service import MonitorService
+# Configuration and setup
+from config.settings import INDEX_BODY, INDEX_NAME, SESSION_SECRET, clients
 
 # Existing services
 from connectors.service import ConnectorService
-from session_manager import SessionManager
-from auth_middleware import require_auth, optional_auth
+from services.auth_service import AuthService
+from services.chat_service import ChatService
 
-# API endpoints
-from api import upload, search, chat, auth, connectors, tasks, oidc, knowledge_filter, settings
+# Services
+from services.document_service import DocumentService
+from services.knowledge_filter_service import KnowledgeFilterService
+from services.monitor_service import MonitorService
+from services.search_service import SearchService
+from services.task_service import TaskService
+from session_manager import SessionManager
+from utils.process_pool import process_pool
 
 print("CUDA available:", torch.cuda.is_available())
 print("CUDA version PyTorch was built with:", torch.version.cuda)
@@ -202,7 +211,7 @@ async def initialize_services():
         except Exception as e:
             print(f"[WARNING] Failed to load persisted connections on startup: {e}")
     else:
-        print(f"[CONNECTORS] Skipping connection loading in no-auth mode")
+        print("[CONNECTORS] Skipping connection loading in no-auth mode")
 
     return {
         'document_service': document_service,

From 31cab2e6d2d0b8692faca2a75da2932f3cf7a921 Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Tue, 2 Sep 2025 23:19:15 -0300
Subject: [PATCH 04/67] Reorganize import statements in settings.py for
 improved clarity

This commit refactors the import statements in settings.py, enhancing the
organization and readability of the code. The changes include consolidating
and reordering imports, which contributes to a cleaner and more maintainable
codebase.
---
 src/config/settings.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/config/settings.py b/src/config/settings.py
index 3a42fa1c..814513c7 100644
--- a/src/config/settings.py
+++ b/src/config/settings.py
@@ -1,13 +1,13 @@
 import os
-import requests
-import asyncio
 import time
+
+import requests
+from agentd.patch import patch_openai_with_mcp
+from docling.document_converter import DocumentConverter
 from dotenv import load_dotenv
+from openai import AsyncOpenAI
 from opensearchpy import AsyncOpenSearch
 from opensearchpy._async.http_aiohttp import AIOHttpConnection
-from docling.document_converter import DocumentConverter
-from agentd.patch import patch_openai_with_mcp
-from openai import AsyncOpenAI
 
 load_dotenv()
 load_dotenv("../")

From a10b35f6315d26d40c729632d43ae416c219754c Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Wed, 3 Sep 2025 10:36:46 -0300
Subject: [PATCH 05/67] Add LangflowFileService integration and new API
 endpoints in main.py

This commit integrates the LangflowFileService into main.py, enabling the
management of user files through new asynchronous API endpoints for file
upload, ingestion, and deletion. The changes enhance the application's
functionality and maintainability by providing structured access to Langflow
file operations, while ensuring proper authentication and session management
for each endpoint.
---
 src/main.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/main.py b/src/main.py
index 7ea5a197..dc7366d5 100644
--- a/src/main.py
+++ b/src/main.py
@@ -20,6 +20,7 @@ from api import (
     chat,
     connectors,
     knowledge_filter,
+    langflow_files,
     oidc,
     search,
     settings,
@@ -30,13 +31,13 @@ from auth_middleware import optional_auth, require_auth
 
 # Configuration and setup
 from config.settings import INDEX_BODY, INDEX_NAME, SESSION_SECRET, clients
-
 # Existing services
 from connectors.service import ConnectorService
 from services.auth_service import AuthService
 from services.chat_service import ChatService
 
 # Services
+from services.langflow_file_service import LangflowFileService
 from services.document_service import DocumentService
 from services.knowledge_filter_service import KnowledgeFilterService
 from services.monitor_service import MonitorService
@@ -213,11 +214,16 @@ async def initialize_services():
     else:
         print("[CONNECTORS] Skipping connection loading in no-auth mode")
 
+    # New: Langflow file service
+
+    langflow_file_service = LangflowFileService()
+
     return {
         'document_service': document_service,
         'search_service': search_service,
         'task_service': task_service,
         'chat_service': chat_service,
+        'langflow_file_service': langflow_file_service,
         'auth_service': auth_service,
         'connector_service': connector_service,
         'knowledge_filter_service': knowledge_filter_service,
@@ -238,6 +244,28 @@ async def create_app():
                           document_service=services['document_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
+
+        # Langflow Files endpoints
+        Route("/langflow/files/upload",
+              optional_auth(services['session_manager'])(
+                  partial(langflow_files.upload_user_file,
+                          langflow_file_service=services['langflow_file_service'],
+                          session_manager=services['session_manager'])
+              ), methods=["POST"]),
+
+        Route("/langflow/ingest",
+              require_auth(services['session_manager'])(
+                  partial(langflow_files.run_ingestion,
+                          langflow_file_service=services['langflow_file_service'],
+                          session_manager=services['session_manager'])
+              ), methods=["POST"]),
+
+        Route("/langflow/files",
+              require_auth(services['session_manager'])(
+                  partial(langflow_files.delete_user_files,
+                          langflow_file_service=services['langflow_file_service'],
+                          session_manager=services['session_manager'])
+              ), methods=["DELETE"]),
 
         Route("/upload_context",
               require_auth(services['session_manager'])(
@@ -530,6 +558,7 @@ async def cleanup_subscriptions_proper(services):
     except Exception as e:
         print(f"[ERROR] Failed to cleanup subscriptions: {e}")
 
+
 if __name__ == "__main__":
     import uvicorn

From 4be48270b774f970ee4c679d0cc0921d69e96e37 Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Wed, 3 Sep 2025 10:36:54 -0300
Subject: [PATCH 06/67] Refactor main.py for improved organization and clarity

This commit reorganizes the import statements in main.py, enhancing the
structure and readability of the code. It also includes minor formatting
adjustments for consistency. The changes contribute to a cleaner codebase,
facilitating easier maintenance and future development.
---
 src/main.py | 213 ++++++++++++++++++++++++++--------------------------
 1 file changed, 107 insertions(+), 106 deletions(-)

diff --git a/src/main.py b/src/main.py
index dc7366d5..ecf40a78 100644
--- a/src/main.py
+++ b/src/main.py
@@ -31,15 +31,16 @@ from auth_middleware import optional_auth, require_auth
 
 # Configuration and setup
 from config.settings import INDEX_BODY, INDEX_NAME, SESSION_SECRET, clients
+
 # Existing services
 from connectors.service import ConnectorService
 from services.auth_service import AuthService
 from services.chat_service import ChatService
+from services.document_service import DocumentService
+from services.knowledge_filter_service import KnowledgeFilterService
 
 # Services
 from services.langflow_file_service import LangflowFileService
-from services.document_service import DocumentService
-from services.knowledge_filter_service import KnowledgeFilterService
 from services.monitor_service import MonitorService
 from services.search_service import SearchService
 from services.task_service import TaskService
@@ -53,7 +54,7 @@ async def wait_for_opensearch():
     """Wait for OpenSearch to be ready with retries"""
     max_retries = 30
     retry_delay = 2
-    
+
     for attempt in range(max_retries):
         try:
             await clients.opensearch.info()
@@ -74,11 +75,11 @@ async def configure_alerting_security():
         alerting_settings = {
             "persistent": {
                 "plugins.alerting.filter_by_backend_roles": "false",
-                "opendistro.alerting.filter_by_backend_roles": "false", 
+                "opendistro.alerting.filter_by_backend_roles": "false",
                 "opensearch.notifications.general.filter_by_backend_roles": "false"
             }
         }
-        
+
         # Use admin client (clients.opensearch uses admin credentials)
         response = await clients.opensearch.cluster.put_settings(body=alerting_settings)
         print("Alerting security settings configured successfully")
@@ -90,14 +91,14 @@ async def configure_alerting_security():
 async def init_index():
     """Initialize OpenSearch index and security roles"""
     await wait_for_opensearch()
-    
+
     # Create documents index
     if not await clients.opensearch.indices.exists(index=INDEX_NAME):
         await clients.opensearch.indices.create(index=INDEX_NAME, body=INDEX_BODY)
         print(f"Created index '{INDEX_NAME}'")
     else:
         print(f"Index '{INDEX_NAME}' already exists, skipping creation.")
-    
+
     # Create knowledge filters index
     knowledge_filter_index_name = "knowledge_filters"
     knowledge_filter_index_body = {
@@ -116,13 +117,13 @@ async def init_index():
             }
         }
     }
-    
+
     if not await clients.opensearch.indices.exists(index=knowledge_filter_index_name):
         await clients.opensearch.indices.create(index=knowledge_filter_index_name, body=knowledge_filter_index_body)
         print(f"Created index '{knowledge_filter_index_name}'")
     else:
         print(f"Index '{knowledge_filter_index_name}' already exists, skipping creation.")
-    
+
     # Configure alerting plugin security settings
     await configure_alerting_security()
@@ -131,10 +132,10 @@ def generate_jwt_keys():
     keys_dir = "keys"
     private_key_path = os.path.join(keys_dir, "private_key.pem")
     public_key_path = os.path.join(keys_dir, "public_key.pem")
-    
+
    # Create keys directory if it doesn't exist
     os.makedirs(keys_dir, exist_ok=True)
-    
+
     # Generate keys if they don't exist
     if not os.path.exists(private_key_path):
         try:
@@ -142,12 +143,12 @@ def generate_jwt_keys():
             subprocess.run([
                 "openssl", "genrsa", "-out", private_key_path, "2048"
             ], check=True, capture_output=True)
-            
+
             # Generate public key
             subprocess.run([
                 "openssl", "rsa", "-in", private_key_path, "-pubout", "-out", public_key_path
             ], check=True, capture_output=True)
-            
+
             print("Generated RSA keys for JWT signing")
         except subprocess.CalledProcessError as e:
             print(f"Failed to generate RSA keys: {e}")
@@ -163,19 +164,19 @@ async def init_index_when_ready():
     except Exception as e:
         print(f"OpenSearch index initialization failed: {e}")
         print("OIDC endpoints will still work, but document operations may fail until OpenSearch is ready")
-    
+
 
 async def initialize_services():
     """Initialize all services and their dependencies"""
     # Generate JWT keys if they don't exist
     generate_jwt_keys()
-    
+
     # Initialize clients (now async to generate Langflow API key)
     await clients.initialize()
-    
+
     # Initialize session manager
     session_manager = SessionManager(SESSION_SECRET)
-    
+
     # Initialize services
     document_service = DocumentService(session_manager=session_manager)
     search_service = SearchService(session_manager)
@@ -183,10 +184,10 @@ async def initialize_services():
     chat_service = ChatService()
     knowledge_filter_service = KnowledgeFilterService(session_manager)
     monitor_service = MonitorService(session_manager)
-    
+
     # Set process pool for document service
     document_service.process_pool = process_pool
-    
+
     # Initialize connector service
     connector_service = ConnectorService(
         patched_async_client=clients.patched_async_client,
@@ -196,10 +197,10 @@ async def initialize_services():
         task_service=task_service,
         session_manager=session_manager
     )
-    
+
     # Initialize auth service
     auth_service = AuthService(session_manager, connector_service)
-    
+
     # Load persisted connector connections at startup so webhooks and syncs
     # can resolve existing subscriptions immediately after server boot
     # Skip in no-auth mode since connectors require OAuth
@@ -213,7 +214,7 @@ async def initialize_services():
         print(f"[WARNING] Failed to load persisted connections on startup: {e}")
     else:
         print("[CONNECTORS] Skipping connection loading in no-auth mode")
-    
+
     # New: Langflow file service
 
     langflow_file_service = LangflowFileService()
@@ -234,13 +235,13 @@ async def initialize_services():
 async def create_app():
     """Create and configure the Starlette application"""
     services = await initialize_services()
-    
+
     # Create route handlers with service dependencies injected
     routes = [
         # Upload endpoints
-        Route("/upload", 
+        Route("/upload",
               require_auth(services['session_manager'])(
-                  partial(upload.upload, 
+                  partial(upload.upload,
                           document_service=services['document_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
@@ -266,15 +267,15 @@ async def create_app():
                           langflow_file_service=services['langflow_file_service'],
                           session_manager=services['session_manager'])
              ), methods=["DELETE"]),
-        
-        Route("/upload_context", 
+
+        Route("/upload_context",
               require_auth(services['session_manager'])(
                   partial(upload.upload_context,
                           document_service=services['document_service'],
                           chat_service=services['chat_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
+
         Route("/upload_path",
               require_auth(services['session_manager'])(
                   partial(upload.upload_path,
@@ -294,227 +295,227 @@ async def create_app():
                           task_service=services['task_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
-        Route("/tasks/{task_id}", 
+
+        Route("/tasks/{task_id}",
               require_auth(services['session_manager'])(
                   partial(tasks.task_status,
                           task_service=services['task_service'],
                           session_manager=services['session_manager'])
              ), methods=["GET"]),
-        
-        Route("/tasks", 
+
+        Route("/tasks",
               require_auth(services['session_manager'])(
                   partial(tasks.all_tasks,
                           task_service=services['task_service'],
                           session_manager=services['session_manager'])
              ), methods=["GET"]),
-        
-        Route("/tasks/{task_id}/cancel", 
+
+        Route("/tasks/{task_id}/cancel",
               require_auth(services['session_manager'])(
                   partial(tasks.cancel_task,
                           task_service=services['task_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
+
         # Search endpoint
-        Route("/search", 
+        Route("/search",
               require_auth(services['session_manager'])(
                   partial(search.search,
                           search_service=services['search_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
+
         # Knowledge Filter endpoints
-        Route("/knowledge-filter", 
+        Route("/knowledge-filter",
               require_auth(services['session_manager'])(
                   partial(knowledge_filter.create_knowledge_filter,
                           knowledge_filter_service=services['knowledge_filter_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
-        Route("/knowledge-filter/search", 
+
+        Route("/knowledge-filter/search",
               require_auth(services['session_manager'])(
                   partial(knowledge_filter.search_knowledge_filters,
                           knowledge_filter_service=services['knowledge_filter_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
-        Route("/knowledge-filter/{filter_id}", 
+
+        Route("/knowledge-filter/{filter_id}",
               require_auth(services['session_manager'])(
                   partial(knowledge_filter.get_knowledge_filter,
                           knowledge_filter_service=services['knowledge_filter_service'],
                           session_manager=services['session_manager'])
              ), methods=["GET"]),
-        
-        Route("/knowledge-filter/{filter_id}", 
+
+        Route("/knowledge-filter/{filter_id}",
               require_auth(services['session_manager'])(
                   partial(knowledge_filter.update_knowledge_filter,
                           knowledge_filter_service=services['knowledge_filter_service'],
                           session_manager=services['session_manager'])
              ), methods=["PUT"]),
-        
-        Route("/knowledge-filter/{filter_id}", 
+
+        Route("/knowledge-filter/{filter_id}",
               require_auth(services['session_manager'])(
                   partial(knowledge_filter.delete_knowledge_filter,
                           knowledge_filter_service=services['knowledge_filter_service'],
                           session_manager=services['session_manager'])
              ), methods=["DELETE"]),
-        
+
         # Knowledge Filter Subscription endpoints
-        Route("/knowledge-filter/{filter_id}/subscribe", 
+        Route("/knowledge-filter/{filter_id}/subscribe",
               require_auth(services['session_manager'])(
                   partial(knowledge_filter.subscribe_to_knowledge_filter,
                           knowledge_filter_service=services['knowledge_filter_service'],
                           monitor_service=services['monitor_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
-        Route("/knowledge-filter/{filter_id}/subscriptions", 
+
+        Route("/knowledge-filter/{filter_id}/subscriptions",
               require_auth(services['session_manager'])(
                   partial(knowledge_filter.list_knowledge_filter_subscriptions,
                           knowledge_filter_service=services['knowledge_filter_service'],
                           session_manager=services['session_manager'])
              ), methods=["GET"]),
-        
-        Route("/knowledge-filter/{filter_id}/subscribe/{subscription_id}", 
+
+        Route("/knowledge-filter/{filter_id}/subscribe/{subscription_id}",
               require_auth(services['session_manager'])(
                   partial(knowledge_filter.cancel_knowledge_filter_subscription,
                           knowledge_filter_service=services['knowledge_filter_service'],
                           monitor_service=services['monitor_service'],
                           session_manager=services['session_manager'])
              ), methods=["DELETE"]),
-        
+
         # Knowledge Filter Webhook endpoint (no auth required - called by OpenSearch)
-        Route("/knowledge-filter/{filter_id}/webhook/{subscription_id}", 
+        Route("/knowledge-filter/{filter_id}/webhook/{subscription_id}",
               partial(knowledge_filter.knowledge_filter_webhook,
                       knowledge_filter_service=services['knowledge_filter_service'],
-                      session_manager=services['session_manager']), 
+                      session_manager=services['session_manager']),
              methods=["POST"]),
-        
+
         # Chat endpoints
-        Route("/chat", 
+        Route("/chat",
               require_auth(services['session_manager'])(
                   partial(chat.chat_endpoint,
                           chat_service=services['chat_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
-        Route("/langflow", 
+
+        Route("/langflow",
               require_auth(services['session_manager'])(
                   partial(chat.langflow_endpoint,
                           chat_service=services['chat_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
+
         # Chat history endpoints
-        Route("/chat/history", 
+        Route("/chat/history",
               require_auth(services['session_manager'])(
                   partial(chat.chat_history_endpoint,
                           chat_service=services['chat_service'],
                           session_manager=services['session_manager'])
              ), methods=["GET"]),
-        
-        Route("/langflow/history", 
+
+        Route("/langflow/history",
               require_auth(services['session_manager'])(
                   partial(chat.langflow_history_endpoint,
                           chat_service=services['chat_service'],
                           session_manager=services['session_manager'])
              ), methods=["GET"]),
-        
+
         # Authentication endpoints
-        Route("/auth/init", 
+        Route("/auth/init",
               optional_auth(services['session_manager'])(
                   partial(auth.auth_init,
                           auth_service=services['auth_service'],
                           session_manager=services['session_manager'])
              ), methods=["POST"]),
-        
-        Route("/auth/callback", 
+
+        Route("/auth/callback",
              partial(auth.auth_callback,
                      auth_service=services['auth_service'],
-                     session_manager=services['session_manager']), 
+                     session_manager=services['session_manager']),
             methods=["POST"]),
-        
-        Route("/auth/me", 
+
+        Route("/auth/me",
              optional_auth(services['session_manager'])(
                   partial(auth.auth_me,
                           auth_service=services['auth_service'],
                           session_manager=services['session_manager'])
             ), methods=["GET"]),
-        
-        Route("/auth/logout", 
+
+        Route("/auth/logout",
              require_auth(services['session_manager'])(
                   partial(auth.auth_logout,
                           auth_service=services['auth_service'],
                           session_manager=services['session_manager'])
             ), methods=["POST"]),
-        
+
         # Connector endpoints
-        Route("/connectors", 
+        Route("/connectors",
              require_auth(services['session_manager'])(
                   partial(connectors.list_connectors,
                           connector_service=services['connector_service'],
                           session_manager=services['session_manager'])
             ), methods=["GET"]),
-        
-        Route("/connectors/{connector_type}/sync", 
+
+        Route("/connectors/{connector_type}/sync",
              require_auth(services['session_manager'])(
                   partial(connectors.connector_sync,
                          connector_service=services['connector_service'],
                          session_manager=services['session_manager'])
            ), methods=["POST"]),
-        
-        Route("/connectors/{connector_type}/status", 
+
+        Route("/connectors/{connector_type}/status",
             require_auth(services['session_manager'])(
                 partial(connectors.connector_status,
                         connector_service=services['connector_service'],
                         session_manager=services['session_manager'])
            ), methods=["GET"]),
-        
-        Route("/connectors/{connector_type}/webhook", 
+
+        Route("/connectors/{connector_type}/webhook",
             partial(connectors.connector_webhook,
                     connector_service=services['connector_service'],
-                    session_manager=services['session_manager']), 
+                    session_manager=services['session_manager']),
             methods=["POST", "GET"]),
-        
+
         # OIDC endpoints
-        Route("/.well-known/openid-configuration", 
+        Route("/.well-known/openid-configuration",
             partial(oidc.oidc_discovery,
-                    session_manager=services['session_manager']), 
+                    session_manager=services['session_manager']),
             methods=["GET"]),
-        
-        Route("/auth/jwks", 
+
+        Route("/auth/jwks",
             partial(oidc.jwks_endpoint,
-                    session_manager=services['session_manager']), 
+                    session_manager=services['session_manager']),
            methods=["GET"]),
-        
-        Route("/auth/introspect", 
+
+        Route("/auth/introspect",
            partial(oidc.token_introspection,
-                    session_manager=services['session_manager']), 
+                    session_manager=services['session_manager']),
            methods=["POST"]),
-        
+
         # Settings endpoint
-        Route("/settings", 
+        Route("/settings",
            require_auth(services['session_manager'])(
                partial(settings.get_settings,
                        session_manager=services['session_manager'])
           ), methods=["GET"]),
     ]
-    
+
     app = Starlette(debug=True, routes=routes)
     app.state.services = services  # Store services for cleanup
-    
+
     # Add startup event handler
-    @app.on_event("startup") 
+    @app.on_event("startup")
     async def startup_event():
         # Start index initialization in background to avoid blocking OIDC endpoints
         asyncio.create_task(init_index_when_ready())
-    
+
     # Add shutdown event handler
     @app.on_event("shutdown")
     async def shutdown_event():
         await cleanup_subscriptions_proper(services)
-    
+
     return app
 
 async def startup():
@@ -533,15 +534,15 @@ def cleanup():
 async def cleanup_subscriptions_proper(services):
     """Cancel all active webhook subscriptions"""
     print("[CLEANUP] Cancelling active webhook subscriptions...")
-    
+
     try:
         connector_service = services['connector_service']
         await connector_service.connection_manager.load_connections()
-        
-        # Get all active connections with webhook subscriptions 
+
+        # Get all active connections with webhook subscriptions
         all_connections = await connector_service.connection_manager.list_connections()
         active_connections = [c for c in all_connections if c.is_active and c.config.get('webhook_channel_id')]
-        
+
         for connection in active_connections:
             try:
                 print(f"[CLEANUP] Cancelling subscription for connection {connection.connection_id}")
@@ -552,22 +553,22 @@ async def cleanup_subscriptions_proper(services):
                         print(f"[CLEANUP] Cancelled subscription {subscription_id}")
             except Exception as e:
                 print(f"[ERROR] Failed to cancel subscription for {connection.connection_id}: {e}")
-        
+
         print(f"[CLEANUP] Finished cancelling {len(active_connections)} subscriptions")
-        
+
     except Exception as e:
         print(f"[ERROR] Failed to cleanup subscriptions: {e}")
 
 
 if __name__ == "__main__":
     import uvicorn
-    
+
     # Register cleanup function
     atexit.register(cleanup)
-    
+
     # Create app asynchronously
     app = asyncio.run(create_app())
-    
+
     # Run the server (startup tasks now handled by Starlette startup event)
     uvicorn.run(
         app,

From 531ca7cd497f6b0bc1f2ac18388043bc26fce268 Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Thu, 4 Sep 2025 08:44:00 -0300
Subject: [PATCH 07/67] Refactor TaskService for improved readability and
 maintainability

This commit enhances the TaskService class by reorganizing import statements,
updating type hints to use `dict` instead of `Dict`, and improving the
formatting of method definitions for better clarity. Additionally, minor
adjustments were made to comments and error handling, contributing to a more
robust and well-documented codebase.
---
 src/services/task_service.py | 153 ++++++++++++++++++-----------------
 1 file changed, 79 insertions(+), 74 deletions(-)

diff --git a/src/services/task_service.py b/src/services/task_service.py
index 8fa1ed2b..6f088100 100644
--- a/src/services/task_service.py
+++ b/src/services/task_service.py
@@ -1,60 +1,69 @@
 import asyncio
-import uuid
-import time
 import random
+import time
+import uuid
 
-from typing import Dict
-
-from models.tasks import TaskStatus, UploadTask, FileTask
-
-from src.utils.gpu_detection import get_worker_count
+from models.tasks import FileTask, TaskStatus, UploadTask
+from utils.gpu_detection import get_worker_count
 
 
 class TaskService:
     def __init__(self, document_service=None, process_pool=None):
         self.document_service = document_service
         self.process_pool = process_pool
-        self.task_store: Dict[str, Dict[str, UploadTask]] = {}  # user_id -> {task_id -> UploadTask}
+        self.task_store: dict[str, dict[str, UploadTask]] = {}  # user_id -> {task_id -> UploadTask}
         self.background_tasks = set()
-        
+
         if self.process_pool is None:
             raise ValueError("TaskService requires a process_pool parameter")
 
-    async def exponential_backoff_delay(self, retry_count: int, base_delay: float = 1.0, max_delay: float = 60.0) -> None:
+    async def exponential_backoff_delay(
+        self, retry_count: int, base_delay: float = 1.0, max_delay: float = 60.0
+    ) -> None:
         """Apply exponential backoff with jitter"""
-        delay = min(base_delay * (2 ** retry_count) + random.uniform(0, 1), max_delay)
+        delay = min(base_delay * (2**retry_count) + random.uniform(0, 1), max_delay)
         await asyncio.sleep(delay)
 
-    async def create_upload_task(self, user_id: str, file_paths: list, jwt_token: str = None, owner_name: str = None, owner_email: str = None) -> str:
+    async def create_upload_task(
+        self, user_id: str, file_paths: list, jwt_token: str = None, owner_name: str = None, owner_email: str = None
+    ) -> str:
         """Create a new upload task for bulk file processing"""
         # Use default DocumentFileProcessor with user context
         from models.processors import DocumentFileProcessor
-        processor = DocumentFileProcessor(self.document_service, owner_user_id=user_id, jwt_token=jwt_token, owner_name=owner_name, owner_email=owner_email)
+
+        processor = DocumentFileProcessor(
+            self.document_service,
+            owner_user_id=user_id,
+            jwt_token=jwt_token,
+            owner_name=owner_name,
+            owner_email=owner_email,
+        )
         return await self.create_custom_task(user_id, file_paths, processor)
-    
+
     async def create_custom_task(self, user_id: str, items: list, processor) -> str:
         """Create a new task with custom processor for any type of items"""
         task_id = str(uuid.uuid4())
         upload_task = UploadTask(
             task_id=task_id,
             total_files=len(items),
-            file_tasks={str(item): FileTask(file_path=str(item)) for item in items}
+            file_tasks={str(item): FileTask(file_path=str(item)) for item in items},
         )
-        
+
         # Attach the custom processor to the task
         upload_task.processor = processor
-        
+
         if user_id not in self.task_store:
             self.task_store[user_id] = {}
         self.task_store[user_id][task_id] = upload_task
-        
+
         # Start background processing
         background_task = asyncio.create_task(self.background_custom_processor(user_id, task_id, items))
         self.background_tasks.add(background_task)
         background_task.add_done_callback(self.background_tasks.discard)
-        
+
         # Store reference to background task for cancellation
         upload_task.background_task = background_task
-        
+
         return task_id
 
     async def background_upload_processor(self, user_id: str, task_id: str) -> None:
@@ -63,25 +72,23 @@ class TaskService:
             upload_task = self.task_store[user_id][task_id]
             upload_task.status = TaskStatus.RUNNING
             upload_task.updated_at = time.time()
-            
+
             # Process files with limited concurrency to avoid overwhelming the system
             max_workers = get_worker_count()
             semaphore = asyncio.Semaphore(max_workers * 2)  # Allow 2x process pool size for async I/O
-            
+
             async def process_with_semaphore(file_path: str):
                 async with semaphore:
                     await self.document_service.process_single_file_task(upload_task, file_path)
-            
-            tasks = [
-                process_with_semaphore(file_path)
-                for file_path in upload_task.file_tasks.keys()
-            ]
-            
+
+            tasks = [process_with_semaphore(file_path) for file_path in upload_task.file_tasks.keys()]
+
             await asyncio.gather(*tasks, return_exceptions=True)
-            
+
         except Exception as e:
             print(f"[ERROR] Background upload processor failed for task {task_id}: {e}")
             import traceback
+
             traceback.print_exc()
             if user_id in self.task_store and task_id in self.task_store[user_id]:
                 self.task_store[user_id][task_id].status = TaskStatus.FAILED
@@ -93,24 +100,25 @@ class TaskService:
             upload_task = self.task_store[user_id][task_id]
             upload_task.status = TaskStatus.RUNNING
             upload_task.updated_at = time.time()
-            
+
             processor = upload_task.processor
-            
+
             # Process items with limited concurrency
             max_workers = get_worker_count()
             semaphore = asyncio.Semaphore(max_workers * 2)
-            
+
             async def process_with_semaphore(item, item_key: str):
                 async with semaphore:
                     file_task = upload_task.file_tasks[item_key]
                     file_task.status = TaskStatus.RUNNING
                     file_task.updated_at = time.time()
-                    
+
                     try:
                         await processor.process_item(upload_task, item, file_task)
                     except Exception as e:
                         print(f"[ERROR] Failed to process item {item}: {e}")
                         import traceback
+
                         traceback.print_exc()
                         file_task.status = TaskStatus.FAILED
                         file_task.error = str(e)
@@ -119,18 +127,15 @@ class TaskService:
                         file_task.updated_at = time.time()
                         upload_task.processed_files += 1
                         upload_task.updated_at = time.time()
-            
-            tasks = [
-                process_with_semaphore(item, str(item))
-                for item in items
-            ]
-            
+
+            tasks = [process_with_semaphore(item, str(item)) for item in items]
+
             await asyncio.gather(*tasks, return_exceptions=True)
-            
+
             # Mark task as completed
             upload_task.status = TaskStatus.COMPLETED
             upload_task.updated_at = time.time()
-            
+
         except asyncio.CancelledError:
             print(f"[INFO] Background processor for task {task_id} was cancelled")
             if user_id in self.task_store and task_id in self.task_store[user_id]:
@@ -140,6 +145,7 @@ class TaskService:
         except Exception as e:
             print(f"[ERROR] Background custom processor failed for task {task_id}: {e}")
             import traceback
+
             traceback.print_exc()
             if user_id in self.task_store and task_id in self.task_store[user_id]:
                 self.task_store[user_id][task_id].status = TaskStatus.FAILED
@@ -147,13 +153,11 @@ class TaskService:
 
     def get_task_status(self, user_id: str, task_id: str) -> dict:
         """Get the status of a specific upload task"""
-        if (not task_id or
-            user_id not in self.task_store or
-            task_id not in self.task_store[user_id]):
+        if not task_id or user_id not in self.task_store or task_id not in self.task_store[user_id]:
             return None
-        
+
         upload_task = self.task_store[user_id][task_id]
-        
+
         file_statuses = {}
         for file_path, file_task in upload_task.file_tasks.items():
             file_statuses[file_path] = {
@@ -162,9 +166,9 @@ class TaskService:
                 "error": file_task.error,
                 "retry_count": file_task.retry_count,
                 "created_at": file_task.created_at,
-                "updated_at": file_task.updated_at
+                "updated_at": file_task.updated_at,
             }
-        
+
         return {
             "task_id": upload_task.task_id,
             "status": upload_task.status.value,
@@ -174,51 +178,52 @@ class TaskService:
             "failed_files": upload_task.failed_files,
             "created_at": upload_task.created_at,
             "updated_at": upload_task.updated_at,
-            "files": file_statuses
+            "files": file_statuses,
         }
-    
+
     def get_all_tasks(self, user_id: str) -> list:
         """Get all tasks for a user"""
         if user_id not in self.task_store:
             return []
-        
+
         tasks = []
         for task_id, upload_task in self.task_store[user_id].items():
-            tasks.append({
-                "task_id": upload_task.task_id,
-                "status": upload_task.status.value,
-                "total_files": upload_task.total_files,
-                "processed_files": upload_task.processed_files,
-                "successful_files": upload_task.successful_files,
-                "failed_files": upload_task.failed_files,
-                "created_at": upload_task.created_at,
-                "updated_at": upload_task.updated_at
-            })
-        
+            tasks.append(
+                {
+                    "task_id": upload_task.task_id,
+                    "status": upload_task.status.value,
+                    "total_files": upload_task.total_files,
+                    "processed_files": upload_task.processed_files,
+                    "successful_files": upload_task.successful_files,
+                    "failed_files": upload_task.failed_files,
+                    "created_at": upload_task.created_at,
+                    "updated_at": upload_task.updated_at,
+                }
+            )
+
         # Sort by creation time, most recent first
         tasks.sort(key=lambda x: x["created_at"], reverse=True)
         return tasks
-    
+
     def cancel_task(self, user_id: str, task_id: str) -> bool:
         """Cancel a task if it exists and is not already completed"""
-        if (user_id not in self.task_store or
-            task_id not in self.task_store[user_id]):
+        if user_id not in self.task_store or task_id not in self.task_store[user_id]:
             return False
-        
+
         upload_task = self.task_store[user_id][task_id]
-        
+
         # Can only cancel pending or running tasks
         if upload_task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED]:
             return False
-        
+
         # Cancel the background task to stop scheduling new work
-        if hasattr(upload_task, 'background_task') and not upload_task.background_task.done():
+        if hasattr(upload_task, "background_task") and not upload_task.background_task.done():
             upload_task.background_task.cancel()
-        
+
         # Mark task as failed (cancelled)
         upload_task.status = TaskStatus.FAILED
         upload_task.updated_at = time.time()
-        
+
         # Mark all pending file tasks as failed
         for file_task in upload_task.file_tasks.values():
             if file_task.status == TaskStatus.PENDING:
@@ -226,10 +231,10 @@ class TaskService:
                 file_task.error = "Task cancelled by user"
                 file_task.updated_at = time.time()
                 upload_task.failed_files += 1
-        
+
         return True
-    
+
     def shutdown(self):
         """Cleanup process pool"""
-        if hasattr(self, 'process_pool'):
-            self.process_pool.shutdown(wait=True)
\ No newline at end of file
+        if hasattr(self, "process_pool"):
+            self.process_pool.shutdown(wait=True)

From 8354e24591c8093c54f2eea148bf2e3f93ced1e5 Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Thu, 4 Sep 2025 08:59:34 -0300
Subject: [PATCH 08/67] Add development Makefile with comprehensive commands

---
 Makefile | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 210 insertions(+)
 create mode 100644 Makefile

diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..74df8d40
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,210 @@
+# OpenRAG Development Makefile
+# Provides easy commands for development workflow
+
+.PHONY: help dev dev-cpu dev-local infra stop clean build logs shell-backend shell-frontend install test backend frontend install-be install-fe build-be build-fe logs-be logs-fe logs-lf logs-os shell-be shell-lf shell-os restart status health db-reset flow-upload quick setup
+
+# Default target
+help:
+	@echo "OpenRAG Development Commands"
+	@echo ""
+	@echo "Development:"
+	@echo "  dev        - Start full stack with GPU support (docker compose)"
+	@echo "  dev-cpu    - Start full stack with CPU only (docker compose)"
+	@echo "  dev-local  - Start infrastructure only, run backend/frontend locally"
+	@echo "  infra      - Start infrastructure services only (alias for dev-local)"
+	@echo "  stop       - Stop all containers"
+	@echo "  restart    - Restart all containers"
+	@echo ""
+	@echo "Local Development:"
+	@echo "  backend    - Run backend locally (requires infrastructure)"
+	@echo "  frontend   - Run frontend locally"
+	@echo "  install    - Install all dependencies"
+	@echo "  install-be - Install backend dependencies (uv)"
+	@echo "  install-fe - Install frontend dependencies (npm)"
+	@echo ""
+	@echo "Utilities:"
+	@echo "  build      - Build all Docker images"
+	@echo "  clean      - Stop containers and remove volumes"
+	@echo "  logs       - Show logs from all containers"
+	@echo "  logs-be    - Show backend container logs"
+	@echo "  logs-lf    - Show langflow container logs"
+	@echo "  shell-be   - Shell into backend container"
+	@echo "  shell-lf   - Shell into langflow container"
+	@echo ""
+	@echo "Testing:"
+	@echo "  test       - Run backend tests"
+	@echo "  lint       - Run linting checks"
+	@echo ""
+
+# Development environments
+dev:
+	@echo "πŸš€ Starting OpenRAG with GPU support..."
+	docker-compose up -d
+	@echo "βœ… Services started!"
+	@echo "  Backend:    http://localhost:8000"
+	@echo "  Frontend:   http://localhost:3000"
+	@echo "  Langflow:   http://localhost:7860"
+	@echo "  OpenSearch: http://localhost:9200"
+	@echo "  Dashboards: http://localhost:5601"
+
+dev-cpu:
+	@echo "πŸš€ Starting OpenRAG with CPU only..."
+	docker-compose -f docker-compose-cpu.yml up -d
+	@echo "βœ… Services started!"
+	@echo "  Backend:    http://localhost:8000"
+	@echo "  Frontend:   http://localhost:3000"
+	@echo "  Langflow:   http://localhost:7860"
+	@echo "  OpenSearch: http://localhost:9200"
+	@echo "  Dashboards: http://localhost:5601"
+
+dev-local:
+	@echo "πŸ”§ Starting infrastructure only (for local development)..."
+	docker-compose up -d opensearch dashboards langflow
+	@echo "βœ… Infrastructure started!"
+	@echo "  Langflow:   http://localhost:7860"
+	@echo "  OpenSearch: http://localhost:9200"
+	@echo "  Dashboards: http://localhost:5601"
+	@echo ""
+	@echo "Now run 'make backend' and 'make frontend' in separate terminals"
+
+infra:
+	@echo "πŸ”§ Starting infrastructure services only..."
+	docker-compose up -d opensearch dashboards langflow
+	@echo "βœ… Infrastructure services started!"
+	@echo "  Langflow:   http://localhost:7860"
+	@echo "  OpenSearch: http://localhost:9200"
+	@echo "  Dashboards: http://localhost:5601"
+
+# Container management
+stop:
+	@echo "πŸ›‘ Stopping all containers..."
+	docker-compose down
+	docker-compose -f docker-compose-cpu.yml down 2>/dev/null || true
+
+restart: stop dev
+
+clean: stop
+	@echo "🧹 Cleaning up containers and volumes..."
+	docker-compose down -v --remove-orphans
+	docker-compose -f docker-compose-cpu.yml down -v --remove-orphans 2>/dev/null || true
+	docker system prune -f
+
+# Local development
+backend:
+	@echo "🐍 Starting backend locally..."
+	@if [ ! -f .env ]; then echo "⚠️ .env file not found. Copy .env.example to .env first"; exit 1; fi
+	cd src && uv run python main.py
+
+frontend:
+	@echo "βš›οΈ Starting frontend locally..."
+	@if [ ! -d "frontend/node_modules" ]; then echo "πŸ“¦ Installing frontend dependencies first..."; cd frontend && npm install; fi
+	cd frontend && npx next dev
+
+# Installation
+install: install-be install-fe
+	@echo "βœ… All dependencies installed!"
+
+install-be:
+	@echo "πŸ“¦ Installing backend dependencies..."
+	uv sync
+
+install-fe:
+	@echo "πŸ“¦ Installing frontend dependencies..."
+	cd frontend && npm install
+
+# Building
+build:
+	@echo "πŸ”¨ Building Docker images..."
+	docker-compose build
+
+build-be:
+	@echo "πŸ”¨ Building backend image..."
+	docker build -t openrag-backend -f Dockerfile.backend .
+
+build-fe:
+	@echo "πŸ”¨ Building frontend image..."
+	docker build -t openrag-frontend -f Dockerfile.frontend .
+
+# Logging and debugging
+logs:
+	@echo "πŸ“‹ Showing all container logs..."
+	docker-compose logs -f
+
+logs-be:
+	@echo "πŸ“‹ Showing backend logs..."
+	docker-compose logs -f openrag-backend
+
+logs-fe:
+	@echo "πŸ“‹ Showing frontend logs..."
+	docker-compose logs -f openrag-frontend
+
+logs-lf:
+	@echo "πŸ“‹ Showing langflow logs..."
+	docker-compose logs -f langflow
+
+logs-os:
+	@echo "πŸ“‹ Showing opensearch logs..."
+	docker-compose logs -f opensearch
+
+# Shell access
+shell-be:
+	@echo "🐚 Opening shell in backend container..."
+	docker-compose exec openrag-backend /bin/bash
+
+shell-lf:
+	@echo "🐚 Opening shell in langflow container..."
+	docker-compose exec langflow /bin/bash
+
+shell-os:
+	@echo "🐚 Opening shell in opensearch container..."
+	docker-compose exec opensearch /bin/bash
+
+# Testing and quality
+test:
+	@echo "πŸ§ͺ Running backend tests..."
+	uv run pytest
+
+lint:
+	@echo "πŸ” Running linting checks..."
+	cd frontend && npm run lint
+	@echo "Frontend linting complete"
+
+# Service status
+status:
+	@echo "πŸ“Š Container status:"
+	@docker-compose ps 2>/dev/null || echo "No containers running"
+
+health:
+	@echo "πŸ₯ Health check:"
+	@echo "Backend:    $$(curl -s http://localhost:8000/health 2>/dev/null || echo 'Not responding')"
+	@echo "Langflow:   $$(curl -s http://localhost:7860/health 2>/dev/null || echo 'Not responding')"
+	@echo "OpenSearch: $$(curl -s -k -u admin:$(shell grep OPENSEARCH_PASSWORD .env | cut -d= -f2) https://localhost:9200 2>/dev/null | jq -r .tagline 2>/dev/null || echo 'Not responding')"
+
+# Database operations
+db-reset:
+	@echo "πŸ—„οΈ Resetting OpenSearch indices..."
+	curl -X DELETE "http://localhost:9200/documents" -u admin:$$(grep OPENSEARCH_PASSWORD .env | cut -d= -f2) || true
+	curl -X DELETE "http://localhost:9200/knowledge_filters" -u admin:$$(grep OPENSEARCH_PASSWORD .env | cut -d= -f2) || true
+	@echo "Indices reset. Restart backend to recreate."
+
+# Flow management
+flow-upload:
+	@echo "πŸ“ Uploading flow to Langflow..."
+	@if [ -z "$(FLOW_FILE)" ]; then echo "Usage: make flow-upload FLOW_FILE=path/to/flow.json"; exit 1; fi
+	curl -X POST "http://localhost:7860/api/v1/flows" \
+		-H "Content-Type: application/json" \
+		-d @$(FLOW_FILE)
+
+# Quick development shortcuts
+quick: dev-local
+	@echo "πŸš€ Quick start: infrastructure running!"
+	@echo "Run these in separate terminals:"
+	@echo "  make backend"
+	@echo "  make frontend"
+
+# Environment setup
+setup:
+	@echo "βš™οΈ Setting up development environment..."
+	@if [ ! -f .env ]; then cp .env.example .env && echo "πŸ“ Created .env from template"; fi
+	@$(MAKE) install
+	@echo "βœ… Setup complete! Run 'make dev' to start."
\ No newline at end of file

From 8f69eab5c9a6f2391a386cc537482052408d4301 Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Thu, 4 Sep 2025 08:59:54 -0300
Subject: [PATCH 09/67] Update Docker Compose files to include
 OPENSEARCH_PASSWORD in environment variables

This commit modifies both docker-compose.yml and docker-compose-cpu.yml to
add OPENSEARCH_PASSWORD to the LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT.
Additionally, unnecessary whitespace has been removed for improved
readability.
---
 docker-compose-cpu.yml | 7 ++++---
 docker-compose.yml     | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/docker-compose-cpu.yml b/docker-compose-cpu.yml
index 6f27042a..e9831aa6 100644
--- a/docker-compose-cpu.yml
+++ b/docker-compose-cpu.yml
@@ -14,10 +14,10 @@ services:
       bash -c "
         # Start OpenSearch in background
         /usr/share/opensearch/opensearch-docker-entrypoint.sh opensearch &
-        
+
         # Wait a bit for OpenSearch to start, then apply security config
         sleep 10 && /usr/share/opensearch/setup-security.sh &
-        
+
         # Wait for background processes
         wait
       "
@@ -96,7 +96,8 @@ services:
       - LANGFLOW_SECRET_KEY=${LANGFLOW_SECRET_KEY}
      - JWT="dummy"
       - OPENRAG-QUERY-FILTER="{}"
-      - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER
+      - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD}
+      - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER,OPENSEARCH_PASSWORD
       - LANGFLOW_LOG_LEVEL=DEBUG
       - LANGFLOW_AUTO_LOGIN=${LANGFLOW_AUTO_LOGIN}
       - LANGFLOW_SUPERUSER=${LANGFLOW_SUPERUSER}
diff --git a/docker-compose.yml b/docker-compose.yml
index 8e2fdee2..997cc3b8 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -15,10 +15,10 @@ services:
       bash -c "
         # Start OpenSearch in background
         /usr/share/opensearch/opensearch-docker-entrypoint.sh opensearch &
-        
+
         # Wait a bit for OpenSearch to start, then apply security config
         sleep 10 && /usr/share/opensearch/setup-security.sh &
-        
+
         # Wait for background processes
         wait
       "
@@ -97,7 +97,8 @@ services:
       - LANGFLOW_SECRET_KEY=${LANGFLOW_SECRET_KEY}
       - JWT="dummy"
       - OPENRAG-QUERY-FILTER="{}"
-      - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER
+      - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD}
+      - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER,OPENSEARCH_PASSWORD
       - LANGFLOW_LOG_LEVEL=DEBUG
       - LANGFLOW_AUTO_LOGIN=${LANGFLOW_AUTO_LOGIN}
       - LANGFLOW_SUPERUSER=${LANGFLOW_SUPERUSER}

From 45e9c60af1ca23837158212e696c402e57647a25 Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Thu, 4 Sep 2025 10:21:35 -0300
Subject: [PATCH 10/67] Enhance LangflowFileService with API key management
 and v2 endpoint support

This commit refactors the LangflowFileService to include asynchronous API key
retrieval and updates the file upload and deletion methods to use the new v2
endpoints. The flow ID constant has been renamed for clarity, and additional
logging has been added for better debugging and error handling. The payload
structure for the ingestion flow has also been modified to improve
functionality and maintainability.
---
 src/services/langflow_file_service.py | 116 +++++++++++++++++++++-----
 1 file changed, 97 insertions(+), 19 deletions(-)

diff --git a/src/services/langflow_file_service.py b/src/services/langflow_file_service.py
index 5945d5b7..213228a0 100644
--- a/src/services/langflow_file_service.py
+++ b/src/services/langflow_file_service.py
@@ -1,35 +1,89 @@
+import logging
 from typing import Any, Dict, List, Optional
 
 import httpx
 
-from config.settings import FLOW_ID_INGEST, LANGFLOW_BASE_URL, LANGFLOW_KEY
+from config.settings import (
+    LANGFLOW_BASE_URL,
+    LANGFLOW_INGEST_FLOW_ID,
+    LANGFLOW_URL,
+)
 
 
 class LangflowFileService:
     def __init__(self):
         self.base_url = LANGFLOW_BASE_URL.rstrip("/")
-        self.api_key = LANGFLOW_KEY
-        self.flow_id_ingest = FLOW_ID_INGEST
+        self.flow_id_ingest = LANGFLOW_INGEST_FLOW_ID
+        self.logger = logging.getLogger(__name__)
 
-    def _headers(self, extra: Optional[Dict[str, str]] = None) -> Dict[str, str]:
-        headers = {"x-api-key": self.api_key} if self.api_key else {}
+    async def _get_api_key(self) -> Optional[str]:
+        """Get the Langflow API key, ensuring it is generated if needed"""
+        from config.settings import generate_langflow_api_key
+
+        api_key = await generate_langflow_api_key()
+        print(f"[LF] _get_api_key returning: {'present' if api_key else 'None'}")
+        if api_key:
+            print(f"[LF] API key prefix: {api_key[:8]}...")
+        return api_key
+
+    async def _headers(self, extra: Optional[Dict[str, str]] = None) -> Dict[str, str]:
+        api_key = await self._get_api_key()
+        headers = {"x-api-key": api_key} if api_key else {}
         if extra:
             headers.update(extra)
         return headers
 
-    async def upload_user_file(self, file_tuple) -> Dict[str, Any]:
-        """Upload a file for the current user using the Langflow Files API."""
-        url = f"{self.base_url}/files/user/upload"
+    async def upload_user_file(
+        self, file_tuple, jwt_token: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Upload a file using Langflow Files API v2: POST /api/v2/files.
+        Returns JSON with keys: id, name, path, size, provider.
+        """
+        # NOTE: base_url points to /api/v1; v2 endpoints must not be prefixed with /api/v1
+        url = f"{LANGFLOW_URL}/api/v2/files"
+        api_key = await self._get_api_key()
+        self.logger.debug("[LF] Upload (v2) -> %s (key_present=%s)", url, bool(api_key))
+        if api_key:
+            self.logger.debug(f"[LF] Using API key: {api_key[:12]}...")
+        else:
+            self.logger.error("[LF] No API key available for upload!")
         async with httpx.AsyncClient(timeout=60.0) as client:
             files = {"file": file_tuple}
-            resp = await client.post(url, headers=self._headers(), files=files)
+            headers = await self._headers()
+            print(f"[LF] Upload headers: {headers}")
+            # Note: jwt_token is for OpenSearch, not for the Langflow API - only use x-api-key
+            resp = await client.post(url, headers=headers, files=files)
+            self.logger.debug(
+                "[LF] Upload response: %s %s", resp.status_code, resp.reason_phrase
+            )
+            if resp.status_code >= 400:
+                self.logger.error(
+                    "[LF] Upload failed: %s %s | body=%s",
+                    resp.status_code,
+                    resp.reason_phrase,
+                    resp.text[:500],
+                )
             resp.raise_for_status()
             return resp.json()
 
     async def delete_user_file(self, file_id: str) -> None:
-        url = f"{self.base_url}/files/user/{file_id}"
+        """Delete a file by id using v2: DELETE /api/v2/files/{id}."""
+        # NOTE: use the v2 root, not /api/v1
+        url = f"{LANGFLOW_URL}/api/v2/files/{file_id}"
+        self.logger.debug("[LF] Delete (v2) -> %s (id=%s)", url, file_id)
         async with httpx.AsyncClient(timeout=30.0) as client:
-            resp = await client.delete(url, headers=self._headers())
+            headers = await self._headers()
+            resp = await client.delete(url, headers=headers)
+            self.logger.debug(
+                "[LF] Delete response: %s %s", resp.status_code, resp.reason_phrase
+            )
+            if resp.status_code >= 400:
+                self.logger.error(
+                    "[LF] Delete failed: %s %s | body=%s",
+                    resp.status_code,
+                    resp.reason_phrase,
+                    resp.text[:500],
+                )
             resp.raise_for_status()
 
     async def run_ingestion_flow(
@@ -44,32 +98,56 @@ class LangflowFileService:
         The flow must expose a File component path in its input schema or accept a files parameter.
         """
         if not self.flow_id_ingest:
-            raise ValueError("FLOW_ID_INGEST is not configured")
+            raise ValueError("LANGFLOW_INGEST_FLOW_ID is not configured")
 
         url = f"{self.base_url}/run/{self.flow_id_ingest}"
 
         payload: Dict[str, Any] = {
             "input_value": "Ingest files",
             "input_type": "chat",
-            "output_type": "json",
+            "output_type": "text",  # Changed from "json" to "text"
         }
 
-        # Prefer passing files via 'files' if the flow supports it, otherwise via tweaks
+        # Pass files via tweaks to the File component (File-PSU37 from the flow)
         if file_paths:
-            payload["files"] = file_paths
+            if not tweaks:
+                tweaks = {}
+            tweaks["File-PSU37"] = {"path": file_paths}
+
         if tweaks:
             payload["tweaks"] = tweaks
         if session_id:
             payload["session_id"] = session_id
 
+        self.logger.debug(
+            "[LF] Run ingestion -> %s | files=%s session_id=%s tweaks_keys=%s jwt_present=%s",
+            url,
+            len(file_paths) if file_paths else 0,
+            session_id,
+            list(tweaks.keys()) if isinstance(tweaks, dict) else None,
+            bool(jwt_token),
+        )
+
+        # Log the full payload for debugging
+        self.logger.debug("[LF] Request payload: %s", payload)
+
         extra_headers = {}
-        if jwt_token:
-            # Provide user context if the flow needs it
-            extra_headers["X-LANGFLOW-GLOBAL-VAR-JWT"] = jwt_token
+        # Note: The ingestion flow doesn't need JWT authentication context
+        # Removed the X-LANGFLOW-GLOBAL-VAR-JWT header
 
         async with httpx.AsyncClient(timeout=120.0) as client:
             resp = await client.post(
-                url, headers=self._headers(extra_headers), json=payload
+                url, headers=await self._headers(extra_headers), json=payload
             )
+            self.logger.debug(
+                "[LF] Run response: %s %s", resp.status_code, resp.reason_phrase
+            )
+            if resp.status_code >= 400:
+                self.logger.error(
+                    "[LF] Run failed: %s %s | body=%s",
+                    resp.status_code,
+                    resp.reason_phrase,
+                    resp.text[:1000],
+                )
             resp.raise_for_status()
             return resp.json()

From e1d58c742106adfad1cf0a6df0e30e1901e157bb Mon Sep 17 00:00:00 2001
From: Gabriel Luiz Freitas Almeida
Date: Thu, 4 Sep 2025 10:23:10 -0300
Subject: [PATCH 11/67] Implement backwards compatibility for flow ID handling
 and enhance API key generation process

This commit introduces backwards compatibility for flow ID handling by
allowing the use of the deprecated FLOW_ID and FLOW_ID_INGEST environment
variables while issuing deprecation warnings. Additionally, the API key
generation process has been improved with validation checks for cached keys
and enhanced error handling. The Langflow client initialization has been
updated to ensure proper handling of environment variables, contributing to a
more robust and well-documented codebase.
---
 src/config/settings.py | 210 +++++++++++++++++++++++++++++------------
 1 file changed, 149 insertions(+), 61 deletions(-)

diff --git a/src/config/settings.py b/src/config/settings.py
index 814513c7..2603c80b 100644
--- a/src/config/settings.py
+++ b/src/config/settings.py
@@ -20,7 +20,18 @@ OPENSEARCH_PASSWORD = os.getenv("OPENSEARCH_PASSWORD")
 LANGFLOW_URL = os.getenv("LANGFLOW_URL", "http://localhost:7860")
 # Optional: public URL for browser links (e.g., http://localhost:7860)
 LANGFLOW_PUBLIC_URL = os.getenv("LANGFLOW_PUBLIC_URL")
-FLOW_ID = os.getenv("FLOW_ID")
+# Backwards compatible flow ID handling with deprecation warnings
+_legacy_flow_id = os.getenv("FLOW_ID")
+_legacy_flow_id_ingest = os.getenv("FLOW_ID_INGEST")
+
+LANGFLOW_CHAT_FLOW_ID = os.getenv("LANGFLOW_CHAT_FLOW_ID") or _legacy_flow_id
+LANGFLOW_INGEST_FLOW_ID = os.getenv("LANGFLOW_INGEST_FLOW_ID") or _legacy_flow_id_ingest
+
+if _legacy_flow_id and not os.getenv("LANGFLOW_CHAT_FLOW_ID"):
+    print("[WARNING] FLOW_ID is deprecated. Please use LANGFLOW_CHAT_FLOW_ID instead")
+    LANGFLOW_CHAT_FLOW_ID = _legacy_flow_id
+
+
 # Langflow superuser credentials for API key generation
 LANGFLOW_SUPERUSER = os.getenv("LANGFLOW_SUPERUSER")
 LANGFLOW_SUPERUSER_PASSWORD = os.getenv("LANGFLOW_SUPERUSER_PASSWORD")
@@ -30,14 +41,20 @@ SESSION_SECRET = os.getenv("SESSION_SECRET", "your-secret-key-change-in-producti
 GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID")
 GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET")
 
+
 def is_no_auth_mode():
     """Check if we're running in no-auth mode (OAuth credentials missing)"""
     result = not (GOOGLE_OAUTH_CLIENT_ID and GOOGLE_OAUTH_CLIENT_SECRET)
-    print(f"[DEBUG] is_no_auth_mode() = {result}, CLIENT_ID={GOOGLE_OAUTH_CLIENT_ID is not None}, CLIENT_SECRET={GOOGLE_OAUTH_CLIENT_SECRET is not None}")
+    print(
+        f"[DEBUG] is_no_auth_mode() = {result}, CLIENT_ID={GOOGLE_OAUTH_CLIENT_ID is not None}, CLIENT_SECRET={GOOGLE_OAUTH_CLIENT_SECRET is not None}"
+    )
     return result
 
+
 # Webhook configuration - must be set to enable webhooks
-WEBHOOK_BASE_URL = os.getenv("WEBHOOK_BASE_URL")  # No default - must be explicitly configured
+WEBHOOK_BASE_URL = os.getenv(
+    "WEBHOOK_BASE_URL"
+)  # No default - must be explicitly configured
 
 # OpenSearch configuration
 INDEX_NAME = "documents"
@@ -48,62 +65,85 @@ INDEX_BODY = {
     "settings": {
         "index": {"knn": True},
         "number_of_shards": 1,
-        "number_of_replicas": 1
+        "number_of_replicas": 1,
     },
     "mappings": {
         "properties": {
-            "document_id": { "type": "keyword" },
-            "filename": { "type": "keyword" },
-            "mimetype": { "type": "keyword" },
-            "page": { "type": "integer" },
-            "text": { "type": "text" },
+            "document_id": {"type": "keyword"},
+            "filename": {"type": "keyword"},
+            "mimetype": {"type": "keyword"},
+            "page": {"type": "integer"},
+            "text": {"type": "text"},
             "chunk_embedding": {
                 "type": "knn_vector",
                 "dimension": VECTOR_DIM,
                 "method": {
-                    "name": "disk_ann",
-                    "engine": "jvector",
+                    "name": "hnsw",
+                    "engine": "nmslib",
                     "space_type": "l2",
-                    "parameters": {
-                        "ef_construction": 100,
-                        "m": 16
-                    }
-                }
+                    "parameters": {"ef_construction": 100, "m": 16},
+                },
             },
-            "source_url": { "type": "keyword" },
-            "connector_type": { "type": "keyword" },
-            "owner": { "type": "keyword" },
-            "allowed_users": { "type": "keyword" },
-            "allowed_groups": { "type": "keyword" },
-            "user_permissions": { "type": "object" },
-            "group_permissions": { "type": "object" },
-            "created_time": { "type": "date" },
-            "modified_time": { "type": "date" },
-            "indexed_time": { "type": "date" },
-            "metadata": { "type": "object" }
+            "source_url": {"type": "keyword"},
+            "connector_type": {"type": "keyword"},
+            "owner": {"type": "keyword"},
+            "allowed_users": {"type": "keyword"},
+            "allowed_groups": {"type": "keyword"},
+            "user_permissions": {"type": "object"},
+            "group_permissions": {"type": "object"},
+            "created_time": {"type": "date"},
+            "modified_time": {"type": "date"},
+            "indexed_time": {"type": "date"},
+            "metadata": {"type": "object"},
         }
-    }
+    },
 }
 
+# Convenience base URL for Langflow REST API
+LANGFLOW_BASE_URL = f"{LANGFLOW_URL}/api/v1"
+
+
 async def generate_langflow_api_key():
     """Generate Langflow API key using superuser credentials at startup"""
     global LANGFLOW_KEY
-    
+
+    print(f"[DEBUG] generate_langflow_api_key called - current LANGFLOW_KEY: {'present' if LANGFLOW_KEY else 'None'}")
+
     # If key already provided via env, do not attempt generation
     if LANGFLOW_KEY:
-        print("[INFO] Using LANGFLOW_KEY from environment; skipping generation")
-        return LANGFLOW_KEY
-    
+        if os.getenv("LANGFLOW_KEY"):
+            print("[INFO] Using LANGFLOW_KEY from environment; skipping generation")
+            return LANGFLOW_KEY
+        else:
+            # We have a cached key, but let's validate it first
+            print(f"[DEBUG] Validating cached LANGFLOW_KEY: {LANGFLOW_KEY[:8]}...")
+            try:
+                validation_response = requests.get(
+                    f"{LANGFLOW_URL}/api/v1/users/whoami",
+                    headers={"x-api-key": LANGFLOW_KEY},
+                    timeout=5
+                )
+                if validation_response.status_code == 200:
+                    print(f"[DEBUG] Cached API key is valid, returning: {LANGFLOW_KEY[:8]}...")
+                    return LANGFLOW_KEY
+                else:
+                    print(f"[WARNING] Cached API key is invalid ({validation_response.status_code}), generating fresh key")
+                    LANGFLOW_KEY = None  # Clear invalid key
+            except Exception as e:
+                print(f"[WARNING] Cached API key validation failed ({e}), generating fresh key")
+                LANGFLOW_KEY = None  # Clear invalid key
+
     if not LANGFLOW_SUPERUSER or not LANGFLOW_SUPERUSER_PASSWORD:
-        print("[WARNING] LANGFLOW_SUPERUSER and LANGFLOW_SUPERUSER_PASSWORD not set, skipping API key generation")
+        print(
+            "[WARNING] LANGFLOW_SUPERUSER and LANGFLOW_SUPERUSER_PASSWORD not set, skipping API key generation"
+        )
        return None
-    
+
     try:
         print("[INFO] Generating Langflow API key using superuser credentials...")
         max_attempts = int(os.getenv("LANGFLOW_KEY_RETRIES", "15"))
         delay_seconds = float(os.getenv("LANGFLOW_KEY_RETRY_DELAY", "2.0"))
-        last_error = None
         for attempt in range(1, max_attempts + 1):
             try:
                 # Login to get access token
@@ -112,9 +152,9 @@ async def generate_langflow_api_key():
                     headers={"Content-Type": "application/x-www-form-urlencoded"},
                     data={
                         "username": LANGFLOW_SUPERUSER,
-                        "password": LANGFLOW_SUPERUSER_PASSWORD
+                        "password": LANGFLOW_SUPERUSER_PASSWORD,
                     },
-                    timeout=10
+                    timeout=10,
                 )
                 login_response.raise_for_status()
                 access_token = login_response.json().get("access_token")
@@ -126,27 +166,38 @@ async def generate_langflow_api_key():
                     f"{LANGFLOW_URL}/api/v1/api_key/",
                     headers={
                         "Content-Type": "application/json",
-                        "Authorization": f"Bearer {access_token}"
+                        "Authorization": f"Bearer {access_token}",
                     },
                     json={"name": "openrag-auto-generated"},
-                    timeout=10
+                    timeout=10,
                 )
                 api_key_response.raise_for_status()
                 api_key = api_key_response.json().get("api_key")
                 if not api_key:
                     raise KeyError("api_key")
-                LANGFLOW_KEY = api_key
-                print(f"[INFO] Successfully generated Langflow API key: {api_key[:8]}...")
-                return api_key
+                # Validate the API key works
+                validation_response = requests.get(
+                    f"{LANGFLOW_URL}/api/v1/users/whoami",
+                    headers={"x-api-key": api_key},
+                    timeout=10
+                )
+                if validation_response.status_code == 200:
+                    LANGFLOW_KEY = api_key
+                    print(f"[INFO] Successfully generated and validated Langflow API key: {api_key[:8]}...")
+                    return api_key
+                else:
+                    print(f"[ERROR] Generated API key validation failed: {validation_response.status_code}")
+                    raise ValueError(f"API key validation failed: {validation_response.status_code}")
             except (requests.exceptions.RequestException, KeyError) as e:
-                last_error = e
-                print(f"[WARN] Attempt {attempt}/{max_attempts} to generate Langflow API key failed: {e}")
+                print(
+                    f"[WARN] Attempt {attempt}/{max_attempts} to generate Langflow API key failed: {e}"
+                )
                 if attempt < max_attempts:
                     time.sleep(delay_seconds)
                 else:
                     raise
-    
+
     except requests.exceptions.RequestException as e:
         print(f"[ERROR] Failed to generate Langflow API key: {e}")
         return None
@@ -157,17 +208,18 @@ async def generate_langflow_api_key():
         print(f"[ERROR] Unexpected error generating Langflow API key: {e}")
         return None
 
+
 class AppClients:
     def __init__(self):
self.opensearch = None self.langflow_client = None self.patched_async_client = None self.converter = None - + async def initialize(self): # Generate Langflow API key first await generate_langflow_api_key() - + # Initialize OpenSearch client self.opensearch = AsyncOpenSearch( hosts=[{"host": OPENSEARCH_HOST, "port": OPENSEARCH_PORT}], @@ -179,26 +230,33 @@ class AppClients: http_auth=(OPENSEARCH_USERNAME, OPENSEARCH_PASSWORD), http_compress=True, ) - + # Initialize Langflow client with generated/provided API key if LANGFLOW_KEY and self.langflow_client is None: try: - self.langflow_client = AsyncOpenAI( - base_url=f"{LANGFLOW_URL}/api/v1", - api_key=LANGFLOW_KEY - ) + if not OPENSEARCH_PASSWORD: + raise ValueError("OPENSEARCH_PASSWORD is not set") + else: + await self.ensure_langflow_client() + # Note: OPENSEARCH_PASSWORD global variable should be created automatically + # via LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT in docker-compose + print( + "[INFO] Langflow client initialized - OPENSEARCH_PASSWORD should be available via environment variables" + ) except Exception as e: print(f"[WARNING] Failed to initialize Langflow client: {e}") self.langflow_client = None if self.langflow_client is None: - print("[WARNING] No Langflow client initialized yet; will attempt later on first use") - + print( + "[WARNING] No Langflow client initialized yet; will attempt later on first use" + ) + # Initialize patched OpenAI client self.patched_async_client = patch_openai_with_mcp(AsyncOpenAI()) - + # Initialize document converter self.converter = DocumentConverter() - + return self async def ensure_langflow_client(self): @@ -210,19 +268,47 @@ class AppClients: if LANGFLOW_KEY and self.langflow_client is None: try: self.langflow_client = AsyncOpenAI( - base_url=f"{LANGFLOW_URL}/api/v1", - api_key=LANGFLOW_KEY + base_url=f"{LANGFLOW_URL}/api/v1", api_key=LANGFLOW_KEY ) print("[INFO] Langflow client initialized on-demand") except Exception as e: print(f"[ERROR] Failed to initialize Langflow client on-demand: {e}") self.langflow_client = None return self.langflow_client - + + async def _create_langflow_global_variable(self, name: str, value: str): + """Create a global variable in Langflow via API""" + api_key = await generate_langflow_api_key() + if not api_key: + print(f"[WARNING] Cannot create Langflow global variable {name}: No API key") + return + + url = f"{LANGFLOW_URL}/api/v1/variables/" + payload = { + "name": name, + "value": value, + "default_fields": [], + "type": "Credential", + } + headers = {"x-api-key": api_key, "Content-Type": "application/json"} + + try: + async with httpx.AsyncClient() as client: + response = await client.post(url, headers=headers, json=payload) + + if response.status_code in [200, 201]: + print(f"[INFO] Successfully created Langflow global variable: {name}") + elif response.status_code == 400 and "already exists" in response.text: + print(f"[INFO] Langflow global variable {name} already exists") + else: + print(f"[WARNING] Failed to create Langflow global variable {name}: {response.status_code}") + except Exception as e: + print(f"[ERROR] Exception creating Langflow global variable {name}: {e}") + def create_user_opensearch_client(self, jwt_token: str): """Create OpenSearch client with user's JWT token for OIDC auth""" - headers = {'Authorization': f'Bearer {jwt_token}'} - + headers = {"Authorization": f"Bearer {jwt_token}"} + return AsyncOpenSearch( hosts=[{"host": OPENSEARCH_HOST, "port": OPENSEARCH_PORT}], connection_class=AIOHttpConnection, @@ -234,5 +320,6 @@ class 
AppClients: http_compress=True, ) + # Global clients instance clients = AppClients() From d77ebb5f37f83a7f4e4b282eb525125fd9f166b0 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 4 Sep 2025 10:23:34 -0300 Subject: [PATCH 12/67] update FLOW_ID to LANGFLOW_CHAT_FLOW_ID --- src/services/chat_service.py | 270 +++++++++++++++++++++++++---------- 1 file changed, 193 insertions(+), 77 deletions(-) diff --git a/src/services/chat_service.py b/src/services/chat_service.py index 689a626c..f778d25c 100644 --- a/src/services/chat_service.py +++ b/src/services/chat_service.py @@ -1,43 +1,75 @@ -from config.settings import clients, LANGFLOW_URL, FLOW_ID -from agent import async_chat, async_langflow, async_chat_stream, async_langflow_stream -from auth_context import set_auth_context import json +from agent import async_chat, async_chat_stream, async_langflow +from auth_context import set_auth_context +from config.settings import LANGFLOW_CHAT_FLOW_ID, LANGFLOW_URL, clients + + class ChatService: - - async def chat(self, prompt: str, user_id: str = None, jwt_token: str = None, previous_response_id: str = None, stream: bool = False): + async def chat( + self, + prompt: str, + user_id: str = None, + jwt_token: str = None, + previous_response_id: str = None, + stream: bool = False, + ): """Handle chat requests using the patched OpenAI client""" if not prompt: raise ValueError("Prompt is required") - + # Set authentication context for this request so tools can access it if user_id and jwt_token: set_auth_context(user_id, jwt_token) - + if stream: - return async_chat_stream(clients.patched_async_client, prompt, user_id, previous_response_id=previous_response_id) + return async_chat_stream( + clients.patched_async_client, + prompt, + user_id, + previous_response_id=previous_response_id, + ) else: - response_text, response_id = await async_chat(clients.patched_async_client, prompt, user_id, previous_response_id=previous_response_id) + response_text, response_id = await async_chat( + clients.patched_async_client, + prompt, + user_id, + previous_response_id=previous_response_id, + ) response_data = {"response": response_text} if response_id: response_data["response_id"] = response_id return response_data - async def langflow_chat(self, prompt: str, user_id: str = None, jwt_token: str = None, previous_response_id: str = None, stream: bool = False): + async def langflow_chat( + self, + prompt: str, + user_id: str = None, + jwt_token: str = None, + previous_response_id: str = None, + stream: bool = False, + ): """Handle Langflow chat requests""" if not prompt: raise ValueError("Prompt is required") - if not LANGFLOW_URL or not FLOW_ID: - raise ValueError("LANGFLOW_URL and FLOW_ID environment variables are required") + if not LANGFLOW_URL or not LANGFLOW_CHAT_FLOW_ID: + raise ValueError( + "LANGFLOW_URL and LANGFLOW_CHAT_FLOW_ID environment variables are required" + ) # Prepare extra headers for JWT authentication extra_headers = {} if jwt_token: - extra_headers['X-LANGFLOW-GLOBAL-VAR-JWT'] = jwt_token + extra_headers["X-LANGFLOW-GLOBAL-VAR-JWT"] = jwt_token # Get context variables for filters, limit, and threshold - from auth_context import get_search_filters, get_search_limit, get_score_threshold + from auth_context import ( + get_score_threshold, + get_search_filters, + get_search_limit, + ) + filters = get_search_filters() limit = get_search_limit() score_threshold = get_score_threshold() @@ -49,86 +81,130 @@ class ChatService: # Map frontend filter names to backend field names field_mapping 
= { "data_sources": "filename", - "document_types": "mimetype", - "owners": "owner" + "document_types": "mimetype", + "owners": "owner", } - + for filter_key, values in filters.items(): if values is not None and isinstance(values, list) and len(values) > 0: # Map frontend key to backend field name field_name = field_mapping.get(filter_key, filter_key) - + if len(values) == 1: # Single value filter filter_clauses.append({"term": {field_name: values[0]}}) else: # Multiple values filter filter_clauses.append({"terms": {field_name: values}}) - + if filter_clauses: filter_expression["filter"] = filter_clauses - + # Add limit and score threshold to the filter expression (only if different from defaults) if limit and limit != 10: # 10 is the default limit filter_expression["limit"] = limit - + if score_threshold and score_threshold != 0: # 0 is the default threshold filter_expression["score_threshold"] = score_threshold # Pass the complete filter expression as a single header to Langflow (only if we have something to send) if filter_expression: - print(f"Sending OpenRAG query filter to Langflow: {json.dumps(filter_expression, indent=2)}") - extra_headers['X-LANGFLOW-GLOBAL-VAR-OPENRAG-QUERY-FILTER'] = json.dumps(filter_expression) + print( + f"Sending OpenRAG query filter to Langflow: {json.dumps(filter_expression, indent=2)}" + ) + extra_headers["X-LANGFLOW-GLOBAL-VAR-OPENRAG-QUERY-FILTER"] = json.dumps( + filter_expression + ) # Ensure the Langflow client exists; try lazy init if needed langflow_client = await clients.ensure_langflow_client() if not langflow_client: - raise ValueError("Langflow client not initialized. Ensure LANGFLOW is reachable or set LANGFLOW_KEY.") + raise ValueError( + "Langflow client not initialized. Ensure LANGFLOW is reachable or set LANGFLOW_KEY." + ) if stream: from agent import async_langflow_chat_stream - return async_langflow_chat_stream(langflow_client, FLOW_ID, prompt, user_id, extra_headers=extra_headers, previous_response_id=previous_response_id) + + return async_langflow_chat_stream( + langflow_client, + LANGFLOW_CHAT_FLOW_ID, + prompt, + user_id, + extra_headers=extra_headers, + previous_response_id=previous_response_id, + ) else: from agent import async_langflow_chat - response_text, response_id = await async_langflow_chat(langflow_client, FLOW_ID, prompt, user_id, extra_headers=extra_headers, previous_response_id=previous_response_id) + + response_text, response_id = await async_langflow_chat( + langflow_client, + LANGFLOW_CHAT_FLOW_ID, + prompt, + user_id, + extra_headers=extra_headers, + previous_response_id=previous_response_id, + ) response_data = {"response": response_text} if response_id: response_data["response_id"] = response_id return response_data - async def upload_context_chat(self, document_content: str, filename: str, - user_id: str = None, jwt_token: str = None, previous_response_id: str = None, endpoint: str = "langflow"): + async def upload_context_chat( + self, + document_content: str, + filename: str, + user_id: str = None, + jwt_token: str = None, + previous_response_id: str = None, + endpoint: str = "langflow", + ): """Send document content as user message to get proper response_id""" document_prompt = f"I'm uploading a document called '{filename}'. Here is its content:\n\n{document_content}\n\nPlease confirm you've received this document and are ready to answer questions about it." 
- + if endpoint == "langflow": # Prepare extra headers for JWT authentication extra_headers = {} if jwt_token: - extra_headers['X-LANGFLOW-GLOBAL-VAR-JWT'] = jwt_token + extra_headers["X-LANGFLOW-GLOBAL-VAR-JWT"] = jwt_token # Ensure the Langflow client exists; try lazy init if needed langflow_client = await clients.ensure_langflow_client() if not langflow_client: - raise ValueError("Langflow client not initialized. Ensure LANGFLOW is reachable or set LANGFLOW_KEY.") - response_text, response_id = await async_langflow(langflow_client, FLOW_ID, document_prompt, extra_headers=extra_headers, previous_response_id=previous_response_id) + raise ValueError( + "Langflow client not initialized. Ensure LANGFLOW is reachable or set LANGFLOW_KEY." + ) + response_text, response_id = await async_langflow( + langflow_client, + LANGFLOW_CHAT_FLOW_ID, + document_prompt, + extra_headers=extra_headers, + previous_response_id=previous_response_id, + ) else: # chat # Set auth context for chat tools and provide user_id if user_id and jwt_token: set_auth_context(user_id, jwt_token) - response_text, response_id = await async_chat(clients.patched_async_client, document_prompt, user_id, previous_response_id=previous_response_id) - + response_text, response_id = await async_chat( + clients.patched_async_client, + document_prompt, + user_id, + previous_response_id=previous_response_id, + ) + return response_text, response_id async def get_chat_history(self, user_id: str): """Get chat conversation history for a user""" from agent import get_user_conversations - + if not user_id: return {"error": "User ID is required", "conversations": []} - + conversations_dict = get_user_conversations(user_id) - print(f"[DEBUG] get_chat_history for user {user_id}: found {len(conversations_dict)} conversations") - + print( + f"[DEBUG] get_chat_history for user {user_id}: found {len(conversations_dict)} conversations" + ) + # Convert conversations dict to list format with metadata conversations = [] for response_id, conversation_state in conversations_dict.items(): @@ -139,47 +215,67 @@ class ChatService: message_data = { "role": msg["role"], "content": msg["content"], - "timestamp": msg.get("timestamp").isoformat() if msg.get("timestamp") else None + "timestamp": msg.get("timestamp").isoformat() + if msg.get("timestamp") + else None, } if msg.get("response_id"): message_data["response_id"] = msg["response_id"] messages.append(message_data) - + if messages: # Only include conversations with actual messages # Generate title from first user message - first_user_msg = next((msg for msg in messages if msg["role"] == "user"), None) - title = first_user_msg["content"][:50] + "..." if first_user_msg and len(first_user_msg["content"]) > 50 else first_user_msg["content"] if first_user_msg else "New chat" - - conversations.append({ - "response_id": response_id, - "title": title, - "endpoint": "chat", - "messages": messages, - "created_at": conversation_state.get("created_at").isoformat() if conversation_state.get("created_at") else None, - "last_activity": conversation_state.get("last_activity").isoformat() if conversation_state.get("last_activity") else None, - "previous_response_id": conversation_state.get("previous_response_id"), - "total_messages": len(messages) - }) - + first_user_msg = next( + (msg for msg in messages if msg["role"] == "user"), None + ) + title = ( + first_user_msg["content"][:50] + "..." 
+ if first_user_msg and len(first_user_msg["content"]) > 50 + else first_user_msg["content"] + if first_user_msg + else "New chat" + ) + + conversations.append( + { + "response_id": response_id, + "title": title, + "endpoint": "chat", + "messages": messages, + "created_at": conversation_state.get("created_at").isoformat() + if conversation_state.get("created_at") + else None, + "last_activity": conversation_state.get( + "last_activity" + ).isoformat() + if conversation_state.get("last_activity") + else None, + "previous_response_id": conversation_state.get( + "previous_response_id" + ), + "total_messages": len(messages), + } + ) + # Sort by last activity (most recent first) conversations.sort(key=lambda c: c["last_activity"], reverse=True) - + return { "user_id": user_id, "endpoint": "chat", "conversations": conversations, - "total_conversations": len(conversations) + "total_conversations": len(conversations), } async def get_langflow_history(self, user_id: str): """Get langflow conversation history for a user""" from agent import get_user_conversations - + if not user_id: return {"error": "User ID is required", "conversations": []} - + conversations_dict = get_user_conversations(user_id) - + # Convert conversations dict to list format with metadata conversations = [] for response_id, conversation_state in conversations_dict.items(): @@ -190,34 +286,54 @@ class ChatService: message_data = { "role": msg["role"], "content": msg["content"], - "timestamp": msg.get("timestamp").isoformat() if msg.get("timestamp") else None + "timestamp": msg.get("timestamp").isoformat() + if msg.get("timestamp") + else None, } if msg.get("response_id"): message_data["response_id"] = msg["response_id"] messages.append(message_data) - + if messages: # Only include conversations with actual messages # Generate title from first user message - first_user_msg = next((msg for msg in messages if msg["role"] == "user"), None) - title = first_user_msg["content"][:50] + "..." if first_user_msg and len(first_user_msg["content"]) > 50 else first_user_msg["content"] if first_user_msg else "New chat" - - conversations.append({ - "response_id": response_id, - "title": title, - "endpoint": "langflow", - "messages": messages, - "created_at": conversation_state.get("created_at").isoformat() if conversation_state.get("created_at") else None, - "last_activity": conversation_state.get("last_activity").isoformat() if conversation_state.get("last_activity") else None, - "previous_response_id": conversation_state.get("previous_response_id"), - "total_messages": len(messages) - }) - + first_user_msg = next( + (msg for msg in messages if msg["role"] == "user"), None + ) + title = ( + first_user_msg["content"][:50] + "..." 
+ if first_user_msg and len(first_user_msg["content"]) > 50 + else first_user_msg["content"] + if first_user_msg + else "New chat" + ) + + conversations.append( + { + "response_id": response_id, + "title": title, + "endpoint": "langflow", + "messages": messages, + "created_at": conversation_state.get("created_at").isoformat() + if conversation_state.get("created_at") + else None, + "last_activity": conversation_state.get( + "last_activity" + ).isoformat() + if conversation_state.get("last_activity") + else None, + "previous_response_id": conversation_state.get( + "previous_response_id" + ), + "total_messages": len(messages), + } + ) + # Sort by last activity (most recent first) conversations.sort(key=lambda c: c["last_activity"], reverse=True) - + return { "user_id": user_id, "endpoint": "langflow", "conversations": conversations, - "total_conversations": len(conversations) + "total_conversations": len(conversations), } From c82c74d76c3c0dcc3102e230ac49645345ed91fa Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 4 Sep 2025 10:23:45 -0300 Subject: [PATCH 13/67] Enhance upload_user_file endpoint with improved logging and error handling This commit adds detailed debug and error logging to the upload_user_file endpoint, facilitating better tracking of file upload processes and issues. It includes checks for the presence of a file and JWT token, and captures exceptions with traceback information for enhanced debugging. These changes contribute to a more robust and well-documented codebase. --- src/api/langflow_files.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/api/langflow_files.py b/src/api/langflow_files.py index 2d2cfd42..be0a0293 100644 --- a/src/api/langflow_files.py +++ b/src/api/langflow_files.py @@ -8,21 +8,39 @@ async def upload_user_file( request: Request, langflow_file_service: LangflowFileService, session_manager ): try: + print("[DEBUG] upload_user_file endpoint called") form = await request.form() upload_file = form.get("file") if upload_file is None: + print("[ERROR] No file provided in upload request") return JSONResponse({"error": "Missing file"}, status_code=400) + print( + f"[DEBUG] Processing file: {upload_file.filename}, size: {upload_file.size}" + ) + # starlette UploadFile provides file-like; httpx needs (filename, file, content_type) + content = await upload_file.read() file_tuple = ( upload_file.filename, - await upload_file.read(), + content, upload_file.content_type or "application/octet-stream", ) - result = await langflow_file_service.upload_user_file(file_tuple) + jwt_token = getattr(request.state, "jwt_token", None) + print(f"[DEBUG] JWT token present: {jwt_token is not None}") + + print("[DEBUG] Calling langflow_file_service.upload_user_file...") + result = await langflow_file_service.upload_user_file( + file_tuple, jwt_token=jwt_token + ) + print(f"[DEBUG] Upload successful: {result}") return JSONResponse(result, status_code=201) except Exception as e: + print(f"[ERROR] upload_user_file endpoint failed: {type(e).__name__}: {e}") + import traceback + + print(f"[ERROR] Traceback: {traceback.format_exc()}") return JSONResponse({"error": str(e)}, status_code=500) From 83438a7c93852cbb82ecef62419f94bfd0dccc87 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 4 Sep 2025 10:23:54 -0300 Subject: [PATCH 14/67] Refactor settings.py to enhance settings retrieval and improve flow ID handling This commit updates the settings retrieval function to include new flow IDs for chat and 
ingestion, replacing the deprecated FLOW_ID. It also improves the logic for exposing edit URLs based on the availability of public URLs, contributing to a more robust and well-documented codebase. --- src/api/settings.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/api/settings.py b/src/api/settings.py index e718e927..43813e99 100644 --- a/src/api/settings.py +++ b/src/api/settings.py @@ -1,6 +1,11 @@ -import os from starlette.responses import JSONResponse -from config.settings import LANGFLOW_URL, FLOW_ID, LANGFLOW_PUBLIC_URL +from config.settings import ( + LANGFLOW_URL, + LANGFLOW_CHAT_FLOW_ID, + LANGFLOW_INGEST_FLOW_ID, + LANGFLOW_PUBLIC_URL, +) + async def get_settings(request, session_manager): """Get application settings""" @@ -8,18 +13,25 @@ async def get_settings(request, session_manager): # Return public settings that are safe to expose to frontend settings = { "langflow_url": LANGFLOW_URL, - "flow_id": FLOW_ID, + "flow_id": LANGFLOW_CHAT_FLOW_ID, + "ingest_flow_id": LANGFLOW_INGEST_FLOW_ID, "langflow_public_url": LANGFLOW_PUBLIC_URL, } - - # Only expose edit URL when a public URL is configured - if LANGFLOW_PUBLIC_URL and FLOW_ID: - settings["langflow_edit_url"] = f"{LANGFLOW_PUBLIC_URL.rstrip('/')}/flow/{FLOW_ID}" - + + # Only expose edit URLs when a public URL is configured + if LANGFLOW_PUBLIC_URL and LANGFLOW_CHAT_FLOW_ID: + settings["langflow_edit_url"] = ( + f"{LANGFLOW_PUBLIC_URL.rstrip('/')}/flow/{LANGFLOW_CHAT_FLOW_ID}" + ) + + if LANGFLOW_PUBLIC_URL and LANGFLOW_INGEST_FLOW_ID: + settings["langflow_ingest_edit_url"] = ( + f"{LANGFLOW_PUBLIC_URL.rstrip('/')}/flow/{LANGFLOW_INGEST_FLOW_ID}" + ) + return JSONResponse(settings) - + except Exception as e: return JSONResponse( - {"error": f"Failed to retrieve settings: {str(e)}"}, - status_code=500 + {"error": f"Failed to retrieve settings: {str(e)}"}, status_code=500 ) From 3118e54b69c52fb18270c0295cf0e672a2a73936 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 4 Sep 2025 10:24:11 -0300 Subject: [PATCH 15/67] Add ingest flow handling and UI updates in KnowledgeSourcesPage This commit introduces state management for ingest flow IDs and corresponding edit URLs in the KnowledgeSourcesPage component. It enhances the user interface by adding a new section for file ingestion, allowing users to customize their file processing pipeline. The changes improve the overall functionality and maintainability of the settings page, contributing to a more robust and well-documented codebase. --- frontend/src/app/admin/page.tsx | 12 +++++----- frontend/src/app/settings/page.tsx | 37 ++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/frontend/src/app/admin/page.tsx b/frontend/src/app/admin/page.tsx index 121a460b..c3262156 100644 --- a/frontend/src/app/admin/page.tsx +++ b/frontend/src/app/admin/page.tsx @@ -57,7 +57,7 @@ function AdminPage() { }) const result = await response.json() - + if (response.ok) { setUploadStatus(`File uploaded successfully! 
ID: ${result.id}`)
         setSelectedFile(null)
@@ -132,23 +132,23 @@ function AdminPage() {
       })
 
       const result = await response.json()
-      
+
       if (response.status === 201) {
         // New flow: Got task ID, use centralized tracking
         const taskId = result.task_id || result.id
         const totalFiles = result.total_files || 0
-        
+
         if (!taskId) {
           throw new Error("No task ID received from server")
         }
-        
+
         // Add task to centralized tracking
         addTask(taskId)
-        
+
         setUploadStatus(`🔄 Processing started for ${totalFiles} files. Check the task notification panel for real-time progress. (Task ID: ${taskId})`)
         setFolderPath("")
         setPathUploadLoading(false)
-        
+
       } else if (response.ok) {
         // Original flow: Direct response with results
         const successful = result.results?.filter((r: {status: string}) => r.status === "indexed").length || 0
diff --git a/frontend/src/app/settings/page.tsx b/frontend/src/app/settings/page.tsx
index c42cbeb8..e1352434 100644
--- a/frontend/src/app/settings/page.tsx
+++ b/frontend/src/app/settings/page.tsx
@@ -57,7 +57,9 @@ function KnowledgeSourcesPage() {
   // Settings state
   // Note: backend internal Langflow URL is not needed on the frontend
   const [flowId, setFlowId] = useState('1098eea1-6649-4e1d-aed1-b77249fb8dd0')
+  const [ingestFlowId, setIngestFlowId] = useState('')
   const [langflowEditUrl, setLangflowEditUrl] = useState('')
+  const [langflowIngestEditUrl, setLangflowIngestEditUrl] = useState('')
   const [publicLangflowUrl, setPublicLangflowUrl] = useState('')
 
   // Fetch settings from backend
@@ -69,9 +71,18 @@ function KnowledgeSourcesPage() {
       if (settings.flow_id) {
         setFlowId(settings.flow_id)
       }
+      if (settings.ingest_flow_id) {
+        console.log('Setting ingestFlowId to:', settings.ingest_flow_id)
+        setIngestFlowId(settings.ingest_flow_id)
+      } else {
+        console.log('No ingest_flow_id in settings:', settings)
+      }
       if (settings.langflow_edit_url) {
         setLangflowEditUrl(settings.langflow_edit_url)
       }
+      if (settings.langflow_ingest_edit_url) {
+        setLangflowIngestEditUrl(settings.langflow_ingest_edit_url)
+      }
       if (settings.langflow_public_url) {
         setPublicLangflowUrl(settings.langflow_public_url)
       }
@@ -344,6 +355,32 @@ function KnowledgeSourcesPage() {
 
 
+          {/* Ingest Flow Section */}
+          <div>
+            <h3>File ingestion</h3>
+            <p>Customize your file processing and indexing pipeline</p>
+            {langflowIngestEditUrl && (
+              <a href={langflowIngestEditUrl} target="_blank" rel="noopener noreferrer">
+                Edit flow
+              </a>
+            )}
+          </div>
+
From 2bb74d89bb6406e802be8fee377122b198207288 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 4 Sep 2025 10:26:27 -0300 Subject: [PATCH 16/67] Implement file upload, ingestion, and deletion flow in KnowledgeDropdown component This commit enhances the KnowledgeDropdown component by integrating a complete file handling process that includes uploading files to Langflow, running an ingestion flow, and deleting the uploaded files. It introduces error handling for each step and dispatches appropriate events to notify the UI of the upload and ingestion results. These changes improve the robustness and maintainability of the component, contributing to a well-documented codebase. --- frontend/components/knowledge-dropdown.tsx | 54 ++++++++++++++++------ 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/frontend/components/knowledge-dropdown.tsx b/frontend/components/knowledge-dropdown.tsx index e73db5e9..c30d5420 100644 --- a/frontend/components/knowledge-dropdown.tsx +++ b/frontend/components/knowledge-dropdown.tsx @@ -80,24 +80,50 @@ export function KnowledgeDropdown({ active, variant = 'navigation' }: KnowledgeD const formData = new FormData() formData.append('file', files[0]) - const response = await fetch('/api/upload', { + // 1) Upload to Langflow + const upRes = await fetch('/api/langflow/files/upload', { method: 'POST', body: formData, }) - - const result = await response.json() - - if (response.ok) { - window.dispatchEvent(new CustomEvent('fileUploaded', { - detail: { file: files[0], result } - })) - // Trigger search refresh after successful upload - window.dispatchEvent(new CustomEvent('knowledgeUpdated')) - } else { - window.dispatchEvent(new CustomEvent('fileUploadError', { - detail: { filename: files[0].name, error: result.error || 'Upload failed' } - })) + const upJson = await upRes.json() + if (!upRes.ok) { + throw new Error(upJson?.error || 'Upload to Langflow failed') } + + const fileId = upJson?.id + const filePath = upJson?.path + if (!fileId || !filePath) { + throw new Error('Langflow did not return file id/path') + } + + // 2) Run ingestion flow + const runRes = await fetch('/api/langflow/ingest', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ file_paths: [filePath] }), + }) + const runJson = await runRes.json() + if (!runRes.ok) { + throw new Error(runJson?.error || 'Langflow ingestion failed') + } + + // 3) Delete file from Langflow + const delRes = await fetch('/api/langflow/files', { + method: 'DELETE', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ file_ids: [fileId] }), + }) + const delJson = await delRes.json().catch(() => ({})) + if (!delRes.ok) { + throw new Error(delJson?.error || 'Langflow file delete failed') + } + + // Notify UI + window.dispatchEvent(new CustomEvent('fileUploaded', { + detail: { file: files[0], result: { file_id: fileId, file_path: filePath, run: runJson } } + })) + // Trigger search refresh after successful ingestion + window.dispatchEvent(new CustomEvent('knowledgeUpdated')) } catch (error) { window.dispatchEvent(new CustomEvent('fileUploadError', { detail: { filename: files[0].name, error: error instanceof Error ? 
error.message : 'Upload failed' } From e4603706fe355b751b0332e583c41bab0b8a2251 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 4 Sep 2025 10:27:08 -0300 Subject: [PATCH 17/67] Update Docker Compose files to replace FLOW_ID with LANGFLOW_CHAT_FLOW_ID This commit modifies both docker-compose.yml and docker-compose-cpu.yml to update the environment variable from FLOW_ID to LANGFLOW_CHAT_FLOW_ID, ensuring consistency across configurations. These changes contribute to a more robust and well-documented codebase. --- .env.example | 7 ++++--- docker-compose-cpu.yml | 2 +- docker-compose.yml | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.env.example b/.env.example index a1fd6326..0537dc51 100644 --- a/.env.example +++ b/.env.example @@ -1,10 +1,11 @@ # make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key LANGFLOW_SECRET_KEY= -# flow id from the the openrag flow json -FLOW_ID=1098eea1-6649-4e1d-aed1-b77249fb8dd0 +# flow ids for chat and ingestion flows +LANGFLOW_CHAT_FLOW_ID=1098eea1-6649-4e1d-aed1-b77249fb8dd0 +LANGFLOW_INGEST_FLOW_ID=5488df7c-b93f-4f87-a446-b67028bc0813 # must match the hashed password in secureconfig, must change for secure deployment!!! OPENSEARCH_PASSWORD=OSisgendb1! -# make here https://console.cloud.google.com/apis/credentials +# make here https://console.cloud.google.com/apis/credentials GOOGLE_OAUTH_CLIENT_ID= GOOGLE_OAUTH_CLIENT_SECRET= # Azure app registration credentials for SharePoint/OneDrive diff --git a/docker-compose-cpu.yml b/docker-compose-cpu.yml index e9831aa6..d22c2491 100644 --- a/docker-compose-cpu.yml +++ b/docker-compose-cpu.yml @@ -52,7 +52,7 @@ services: - LANGFLOW_SECRET_KEY=${LANGFLOW_SECRET_KEY} - LANGFLOW_SUPERUSER=${LANGFLOW_SUPERUSER} - LANGFLOW_SUPERUSER_PASSWORD=${LANGFLOW_SUPERUSER_PASSWORD} - - FLOW_ID=${FLOW_ID} + - LANGFLOW_CHAT_FLOW_ID=${LANGFLOW_CHAT_FLOW_ID} - OPENSEARCH_PORT=9200 - OPENSEARCH_USERNAME=admin - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} diff --git a/docker-compose.yml b/docker-compose.yml index 997cc3b8..78059a46 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -52,7 +52,7 @@ services: - LANGFLOW_PUBLIC_URL=${LANGFLOW_PUBLIC_URL} - LANGFLOW_SUPERUSER=${LANGFLOW_SUPERUSER} - LANGFLOW_SUPERUSER_PASSWORD=${LANGFLOW_SUPERUSER_PASSWORD} - - FLOW_ID=${FLOW_ID} + - LANGFLOW_CHAT_FLOW_ID=${LANGFLOW_CHAT_FLOW_ID} - OPENSEARCH_PORT=9200 - OPENSEARCH_USERNAME=admin - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} From ec5092a54a2f29fe1d092b18b1aea724c70c7c53 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 4 Sep 2025 18:03:16 -0300 Subject: [PATCH 18/67] Add ingestion flow for OpenSearch integration This commit introduces a new JSON configuration file for the OpenSearch ingestion flow, detailing the data processing pipeline. The flow includes components for splitting text, generating embeddings, and ingesting data into OpenSearch, enhancing the capabilities for Retrieval Augmented Generation (RAG) tasks. The configuration is designed to support various input types and provides detailed metadata for each component, ensuring robust and well-documented integration. 
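For reference, a minimal sketch of how the backend is expected to trigger this
flow (mirroring run_ingestion_flow in LangflowFileService; the helper name and
arguments below are illustrative, not part of this change):

    import httpx

    async def run_ingest(base_url: str, flow_id: str, api_key: str, path: str) -> dict:
        # Point the flow's File component at the uploaded file via tweaks, then
        # run the flow: File -> Split Text -> OpenAI Embeddings -> OpenSearch.
        payload = {
            "input_value": "Ingest files",
            "input_type": "chat",
            "output_type": "text",
            "tweaks": {"File-PSU37": {"path": [path]}},
        }
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{base_url}/run/{flow_id}",
                headers={"x-api-key": api_key},
                json=payload,
            )
            resp.raise_for_status()
            return resp.json()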
--- flows/ingestion_flow.json | 1588 +++++++++++++++++++++++++++++++++++++ 1 file changed, 1588 insertions(+) create mode 100644 flows/ingestion_flow.json diff --git a/flows/ingestion_flow.json b/flows/ingestion_flow.json new file mode 100644 index 00000000..eff0552d --- /dev/null +++ b/flows/ingestion_flow.json @@ -0,0 +1,1588 @@ +{ + "data": { + "edges": [ + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "File", + "id": "File-PSU37", + "name": "message", + "output_types": [ + "Message" + ] + }, + "targetHandle": { + "fieldName": "data_inputs", + "id": "SplitText-QIKhg", + "inputTypes": [ + "Data", + "DataFrame", + "Message" + ], + "type": "other" + } + }, + "id": "reactflow__edge-File-PSU37{\u0153dataType\u0153:\u0153File\u0153,\u0153id\u0153:\u0153File-PSU37\u0153,\u0153name\u0153:\u0153message\u0153,\u0153output_types\u0153:[\u0153Message\u0153]}-SplitText-QIKhg{\u0153fieldName\u0153:\u0153data_inputs\u0153,\u0153id\u0153:\u0153SplitText-QIKhg\u0153,\u0153inputTypes\u0153:[\u0153Data\u0153,\u0153DataFrame\u0153,\u0153Message\u0153],\u0153type\u0153:\u0153other\u0153}", + "selected": false, + "source": "File-PSU37", + "sourceHandle": "{\u0153dataType\u0153:\u0153File\u0153,\u0153id\u0153:\u0153File-PSU37\u0153,\u0153name\u0153:\u0153message\u0153,\u0153output_types\u0153:[\u0153Message\u0153]}", + "target": "SplitText-QIKhg", + "targetHandle": "{\u0153fieldName\u0153:\u0153data_inputs\u0153,\u0153id\u0153:\u0153SplitText-QIKhg\u0153,\u0153inputTypes\u0153:[\u0153Data\u0153,\u0153DataFrame\u0153,\u0153Message\u0153],\u0153type\u0153:\u0153other\u0153}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "OpenAIEmbeddings", + "id": "OpenAIEmbeddings-joRJ6", + "name": "embeddings", + "output_types": [ + "Embeddings" + ] + }, + "targetHandle": { + "fieldName": "embedding", + "id": "OpenSearch-Mkw1W", + "inputTypes": [ + "Embeddings" + ], + "type": "other" + } + }, + "id": "xy-edge__OpenAIEmbeddings-joRJ6{\u0153dataType\u0153:\u0153OpenAIEmbeddings\u0153,\u0153id\u0153:\u0153OpenAIEmbeddings-joRJ6\u0153,\u0153name\u0153:\u0153embeddings\u0153,\u0153output_types\u0153:[\u0153Embeddings\u0153]}-OpenSearch-Mkw1W{\u0153fieldName\u0153:\u0153embedding\u0153,\u0153id\u0153:\u0153OpenSearch-Mkw1W\u0153,\u0153inputTypes\u0153:[\u0153Embeddings\u0153],\u0153type\u0153:\u0153other\u0153}", + "selected": false, + "source": "OpenAIEmbeddings-joRJ6", + "sourceHandle": "{\u0153dataType\u0153:\u0153OpenAIEmbeddings\u0153,\u0153id\u0153:\u0153OpenAIEmbeddings-joRJ6\u0153,\u0153name\u0153:\u0153embeddings\u0153,\u0153output_types\u0153:[\u0153Embeddings\u0153]}", + "target": "OpenSearch-Mkw1W", + "targetHandle": "{\u0153fieldName\u0153:\u0153embedding\u0153,\u0153id\u0153:\u0153OpenSearch-Mkw1W\u0153,\u0153inputTypes\u0153:[\u0153Embeddings\u0153],\u0153type\u0153:\u0153other\u0153}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "SplitText", + "id": "SplitText-QIKhg", + "name": "dataframe", + "output_types": [ + "DataFrame" + ] + }, + "targetHandle": { + "fieldName": "ingest_data", + "id": "OpenSearch-Mkw1W", + "inputTypes": [ + "Data", + "DataFrame" + ], + "type": "other" + } + }, + "id": 
"xy-edge__SplitText-QIKhg{\u0153dataType\u0153:\u0153SplitText\u0153,\u0153id\u0153:\u0153SplitText-QIKhg\u0153,\u0153name\u0153:\u0153dataframe\u0153,\u0153output_types\u0153:[\u0153DataFrame\u0153]}-OpenSearch-Mkw1W{\u0153fieldName\u0153:\u0153ingest_data\u0153,\u0153id\u0153:\u0153OpenSearch-Mkw1W\u0153,\u0153inputTypes\u0153:[\u0153Data\u0153,\u0153DataFrame\u0153],\u0153type\u0153:\u0153other\u0153}", + "selected": false, + "source": "SplitText-QIKhg", + "sourceHandle": "{\u0153dataType\u0153:\u0153SplitText\u0153,\u0153id\u0153:\u0153SplitText-QIKhg\u0153,\u0153name\u0153:\u0153dataframe\u0153,\u0153output_types\u0153:[\u0153DataFrame\u0153]}", + "target": "OpenSearch-Mkw1W", + "targetHandle": "{\u0153fieldName\u0153:\u0153ingest_data\u0153,\u0153id\u0153:\u0153OpenSearch-Mkw1W\u0153,\u0153inputTypes\u0153:[\u0153Data\u0153,\u0153DataFrame\u0153],\u0153type\u0153:\u0153other\u0153}" + } + ], + "nodes": [ + { + "data": { + "description": "Split text into chunks based on specified criteria.", + "display_name": "Split Text", + "id": "SplitText-QIKhg", + "node": { + "base_classes": [ + "Data" + ], + "beta": false, + "conditional_paths": [], + "custom_fields": {}, + "description": "Split text into chunks based on specified criteria.", + "display_name": "Split Text", + "documentation": "", + "edited": false, + "field_order": [ + "data_inputs", + "chunk_overlap", + "chunk_size", + "separator" + ], + "frozen": false, + "icon": "scissors-line-dashed", + "legacy": false, + "lf_version": "1.1.1", + "metadata": { + "code_hash": "dbf2e9d2319d", + "dependencies": { + "dependencies": [ + { + "name": "langchain_text_splitters", + "version": "0.3.9" + }, + { + "name": "langflow", + "version": "1.5.0.post2" + } + ], + "total_dependencies": 2 + }, + "module": "langflow.components.processing.split_text.SplitTextComponent" + }, + "output_types": [], + "outputs": [ + { + "allows_loop": false, + "cache": true, + "display_name": "Chunks", + "group_outputs": false, + "method": "split_text", + "name": "dataframe", + "selected": "DataFrame", + "tool_mode": true, + "types": [ + "DataFrame" + ], + "value": "__UNDEFINED__" + } + ], + "pinned": false, + "template": { + "_type": "Component", + "chunk_overlap": { + "advanced": false, + "display_name": "Chunk Overlap", + "dynamic": false, + "info": "Number of characters to overlap between chunks.", + "list": false, + "name": "chunk_overlap", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "int", + "value": 200 + }, + "chunk_size": { + "advanced": false, + "display_name": "Chunk Size", + "dynamic": false, + "info": "The maximum length of each chunk. Text is first split by separator, then chunks are merged up to this size. 
Individual splits larger than this won't be further divided.", + "list": false, + "name": "chunk_size", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "int", + "value": 1000 + }, + "code": { + "advanced": true, + "dynamic": true, + "fileTypes": [], + "file_path": "", + "info": "", + "list": false, + "load_from_db": false, + "multiline": true, + "name": "code", + "password": false, + "placeholder": "", + "required": true, + "show": true, + "title_case": false, + "type": "code", + "value": "from langchain_text_splitters import CharacterTextSplitter\n\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.io import DropdownInput, HandleInput, IntInput, MessageTextInput, Output\nfrom langflow.schema.data import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\nfrom langflow.utils.util import unescape_string\n\n\nclass SplitTextComponent(Component):\n display_name: str = \"Split Text\"\n description: str = \"Split text into chunks based on specified criteria.\"\n documentation: str = \"https://docs.langflow.org/components-processing#split-text\"\n icon = \"scissors-line-dashed\"\n name = \"SplitText\"\n\n inputs = [\n HandleInput(\n name=\"data_inputs\",\n display_name=\"Input\",\n info=\"The data with texts to split in chunks.\",\n input_types=[\"Data\", \"DataFrame\", \"Message\"],\n required=True,\n ),\n IntInput(\n name=\"chunk_overlap\",\n display_name=\"Chunk Overlap\",\n info=\"Number of characters to overlap between chunks.\",\n value=200,\n ),\n IntInput(\n name=\"chunk_size\",\n display_name=\"Chunk Size\",\n info=(\n \"The maximum length of each chunk. Text is first split by separator, \"\n \"then chunks are merged up to this size. \"\n \"Individual splits larger than this won't be further divided.\"\n ),\n value=1000,\n ),\n MessageTextInput(\n name=\"separator\",\n display_name=\"Separator\",\n info=(\n \"The character to split on. Use \\\\n for newline. \"\n \"Examples: \\\\n\\\\n for paragraphs, \\\\n for lines, . 
for sentences\"\n ),\n value=\"\\n\",\n ),\n MessageTextInput(\n name=\"text_key\",\n display_name=\"Text Key\",\n info=\"The key to use for the text column.\",\n value=\"text\",\n advanced=True,\n ),\n DropdownInput(\n name=\"keep_separator\",\n display_name=\"Keep Separator\",\n info=\"Whether to keep the separator in the output chunks and where to place it.\",\n options=[\"False\", \"True\", \"Start\", \"End\"],\n value=\"False\",\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Chunks\", name=\"dataframe\", method=\"split_text\"),\n ]\n\n def _docs_to_data(self, docs) -> list[Data]:\n return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]\n\n def _fix_separator(self, separator: str) -> str:\n \"\"\"Fix common separator issues and convert to proper format.\"\"\"\n if separator == \"/n\":\n return \"\\n\"\n if separator == \"/t\":\n return \"\\t\"\n return separator\n\n def split_text_base(self):\n separator = self._fix_separator(self.separator)\n separator = unescape_string(separator)\n\n if isinstance(self.data_inputs, DataFrame):\n if not len(self.data_inputs):\n msg = \"DataFrame is empty\"\n raise TypeError(msg)\n\n self.data_inputs.text_key = self.text_key\n try:\n documents = self.data_inputs.to_lc_documents()\n except Exception as e:\n msg = f\"Error converting DataFrame to documents: {e}\"\n raise TypeError(msg) from e\n elif isinstance(self.data_inputs, Message):\n self.data_inputs = [self.data_inputs.to_data()]\n return self.split_text_base()\n else:\n if not self.data_inputs:\n msg = \"No data inputs provided\"\n raise TypeError(msg)\n\n documents = []\n if isinstance(self.data_inputs, Data):\n self.data_inputs.text_key = self.text_key\n documents = [self.data_inputs.to_lc_document()]\n else:\n try:\n documents = [input_.to_lc_document() for input_ in self.data_inputs if isinstance(input_, Data)]\n if not documents:\n msg = f\"No valid Data inputs found in {type(self.data_inputs)}\"\n raise TypeError(msg)\n except AttributeError as e:\n msg = f\"Invalid input type in collection: {e}\"\n raise TypeError(msg) from e\n try:\n # Convert string 'False'/'True' to boolean\n keep_sep = self.keep_separator\n if isinstance(keep_sep, str):\n if keep_sep.lower() == \"false\":\n keep_sep = False\n elif keep_sep.lower() == \"true\":\n keep_sep = True\n # 'start' and 'end' are kept as strings\n\n splitter = CharacterTextSplitter(\n chunk_overlap=self.chunk_overlap,\n chunk_size=self.chunk_size,\n separator=separator,\n keep_separator=keep_sep,\n )\n return splitter.split_documents(documents)\n except Exception as e:\n msg = f\"Error splitting text: {e}\"\n raise TypeError(msg) from e\n\n def split_text(self) -> DataFrame:\n return DataFrame(self._docs_to_data(self.split_text_base()))\n" + }, + "data_inputs": { + "advanced": false, + "display_name": "Input", + "dynamic": false, + "info": "The data with texts to split in chunks.", + "input_types": [ + "Data", + "DataFrame", + "Message" + ], + "list": false, + "name": "data_inputs", + "placeholder": "", + "required": true, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "other", + "value": "" + }, + "keep_separator": { + "_input_type": "DropdownInput", + "advanced": true, + "combobox": false, + "dialog_inputs": {}, + "display_name": "Keep Separator", + "dynamic": false, + "info": "Whether to keep the separator in the output chunks and where to place it.", + "name": "keep_separator", + "options": [ + "False", + "True", + "Start", + "End" + ], + "options_metadata": [], + 
"placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "type": "str", + "value": "False" + }, + "separator": { + "advanced": false, + "display_name": "Separator", + "dynamic": false, + "info": "The character to split on. Use \\n for newline. Examples: \\n\\n for paragraphs, \\n for lines, . for sentences", + "input_types": [ + "Message" + ], + "list": false, + "load_from_db": false, + "name": "separator", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "\n" + }, + "text_key": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "Text Key", + "dynamic": false, + "info": "The key to use for the text column.", + "input_types": [ + "Message" + ], + "list": false, + "list_add_label": "Add More", + "load_from_db": false, + "name": "text_key", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "text" + } + } + }, + "selected_output": "chunks", + "type": "SplitText" + }, + "dragging": false, + "height": 475, + "id": "SplitText-QIKhg", + "measured": { + "height": 475, + "width": 320 + }, + "position": { + "x": 1692.461995335383, + "y": 1328.2681481569232 + }, + "positionAbsolute": { + "x": 1683.4543896546102, + "y": 1350.7871623588553 + }, + "selected": false, + "type": "genericNode", + "width": 320 + }, + { + "data": { + "id": "OpenAIEmbeddings-joRJ6", + "node": { + "base_classes": [ + "Embeddings" + ], + "beta": false, + "conditional_paths": [], + "custom_fields": {}, + "description": "Generate embeddings using OpenAI models.", + "display_name": "OpenAI Embeddings", + "documentation": "", + "edited": false, + "field_order": [ + "default_headers", + "default_query", + "chunk_size", + "client", + "deployment", + "embedding_ctx_length", + "max_retries", + "model", + "model_kwargs", + "openai_api_key", + "openai_api_base", + "openai_api_type", + "openai_api_version", + "openai_organization", + "openai_proxy", + "request_timeout", + "show_progress_bar", + "skip_empty", + "tiktoken_model_name", + "tiktoken_enable", + "dimensions" + ], + "frozen": false, + "icon": "OpenAI", + "legacy": false, + "lf_version": "1.1.1", + "metadata": { + "code_hash": "2691dee277c9", + "dependencies": { + "dependencies": [ + { + "name": "langchain_openai", + "version": "0.3.23" + }, + { + "name": "langflow", + "version": "1.5.0.post2" + } + ], + "total_dependencies": 2 + }, + "module": "langflow.components.openai.openai.OpenAIEmbeddingsComponent" + }, + "output_types": [], + "outputs": [ + { + "allows_loop": false, + "cache": true, + "display_name": "Embedding Model", + "group_outputs": false, + "method": "build_embeddings", + "name": "embeddings", + "selected": "Embeddings", + "tool_mode": true, + "types": [ + "Embeddings" + ], + "value": "__UNDEFINED__" + } + ], + "pinned": false, + "template": { + "_type": "Component", + "chunk_size": { + "_input_type": "IntInput", + "advanced": true, + "display_name": "Chunk Size", + "dynamic": false, + "info": "", + "list": false, + "name": "chunk_size", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "int", + "value": 1000 + }, + "client": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "Client", + "dynamic": false, + "info": "", + 
"input_types": [ + "Message" + ], + "list": false, + "load_from_db": false, + "name": "client", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, + "code": { + "advanced": true, + "dynamic": true, + "fileTypes": [], + "file_path": "", + "info": "", + "list": false, + "load_from_db": false, + "multiline": true, + "name": "code", + "password": false, + "placeholder": "", + "required": true, + "show": true, + "title_case": false, + "type": "code", + "value": "from langchain_openai import OpenAIEmbeddings\n\nfrom langflow.base.embeddings.model import LCEmbeddingsModel\nfrom langflow.base.models.openai_constants import OPENAI_EMBEDDING_MODEL_NAMES\nfrom langflow.field_typing import Embeddings\nfrom langflow.io import BoolInput, DictInput, DropdownInput, FloatInput, IntInput, MessageTextInput, SecretStrInput\n\n\nclass OpenAIEmbeddingsComponent(LCEmbeddingsModel):\n display_name = \"OpenAI Embeddings\"\n description = \"Generate embeddings using OpenAI models.\"\n icon = \"OpenAI\"\n name = \"OpenAIEmbeddings\"\n\n inputs = [\n DictInput(\n name=\"default_headers\",\n display_name=\"Default Headers\",\n advanced=True,\n info=\"Default headers to use for the API request.\",\n ),\n DictInput(\n name=\"default_query\",\n display_name=\"Default Query\",\n advanced=True,\n info=\"Default query parameters to use for the API request.\",\n ),\n IntInput(name=\"chunk_size\", display_name=\"Chunk Size\", advanced=True, value=1000),\n MessageTextInput(name=\"client\", display_name=\"Client\", advanced=True),\n MessageTextInput(name=\"deployment\", display_name=\"Deployment\", advanced=True),\n IntInput(name=\"embedding_ctx_length\", display_name=\"Embedding Context Length\", advanced=True, value=1536),\n IntInput(name=\"max_retries\", display_name=\"Max Retries\", value=3, advanced=True),\n DropdownInput(\n name=\"model\",\n display_name=\"Model\",\n advanced=False,\n options=OPENAI_EMBEDDING_MODEL_NAMES,\n value=\"text-embedding-3-small\",\n ),\n DictInput(name=\"model_kwargs\", display_name=\"Model Kwargs\", advanced=True),\n SecretStrInput(name=\"openai_api_key\", display_name=\"OpenAI API Key\", value=\"OPENAI_API_KEY\", required=True),\n MessageTextInput(name=\"openai_api_base\", display_name=\"OpenAI API Base\", advanced=True),\n MessageTextInput(name=\"openai_api_type\", display_name=\"OpenAI API Type\", advanced=True),\n MessageTextInput(name=\"openai_api_version\", display_name=\"OpenAI API Version\", advanced=True),\n MessageTextInput(\n name=\"openai_organization\",\n display_name=\"OpenAI Organization\",\n advanced=True,\n ),\n MessageTextInput(name=\"openai_proxy\", display_name=\"OpenAI Proxy\", advanced=True),\n FloatInput(name=\"request_timeout\", display_name=\"Request Timeout\", advanced=True),\n BoolInput(name=\"show_progress_bar\", display_name=\"Show Progress Bar\", advanced=True),\n BoolInput(name=\"skip_empty\", display_name=\"Skip Empty\", advanced=True),\n MessageTextInput(\n name=\"tiktoken_model_name\",\n display_name=\"TikToken Model Name\",\n advanced=True,\n ),\n BoolInput(\n name=\"tiktoken_enable\",\n display_name=\"TikToken Enable\",\n advanced=True,\n value=True,\n info=\"If False, you must have transformers installed.\",\n ),\n IntInput(\n name=\"dimensions\",\n display_name=\"Dimensions\",\n info=\"The number of dimensions the resulting output embeddings should have. 
\"\n \"Only supported by certain models.\",\n advanced=True,\n ),\n ]\n\n def build_embeddings(self) -> Embeddings:\n return OpenAIEmbeddings(\n client=self.client or None,\n model=self.model,\n dimensions=self.dimensions or None,\n deployment=self.deployment or None,\n api_version=self.openai_api_version or None,\n base_url=self.openai_api_base or None,\n openai_api_type=self.openai_api_type or None,\n openai_proxy=self.openai_proxy or None,\n embedding_ctx_length=self.embedding_ctx_length,\n api_key=self.openai_api_key or None,\n organization=self.openai_organization or None,\n allowed_special=\"all\",\n disallowed_special=\"all\",\n chunk_size=self.chunk_size,\n max_retries=self.max_retries,\n timeout=self.request_timeout or None,\n tiktoken_enabled=self.tiktoken_enable,\n tiktoken_model_name=self.tiktoken_model_name or None,\n show_progress_bar=self.show_progress_bar,\n model_kwargs=self.model_kwargs,\n skip_empty=self.skip_empty,\n default_headers=self.default_headers or None,\n default_query=self.default_query or None,\n )\n" + }, + "default_headers": { + "_input_type": "DictInput", + "advanced": true, + "display_name": "Default Headers", + "dynamic": false, + "info": "Default headers to use for the API request.", + "list": false, + "name": "default_headers", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_input": true, + "type": "dict", + "value": {} + }, + "default_query": { + "_input_type": "DictInput", + "advanced": true, + "display_name": "Default Query", + "dynamic": false, + "info": "Default query parameters to use for the API request.", + "list": false, + "name": "default_query", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_input": true, + "type": "dict", + "value": {} + }, + "deployment": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "Deployment", + "dynamic": false, + "info": "", + "input_types": [ + "Message" + ], + "list": false, + "load_from_db": false, + "name": "deployment", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, + "dimensions": { + "_input_type": "IntInput", + "advanced": true, + "display_name": "Dimensions", + "dynamic": false, + "info": "The number of dimensions the resulting output embeddings should have. 
Only supported by certain models.", + "list": false, + "name": "dimensions", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "int", + "value": "" + }, + "embedding_ctx_length": { + "_input_type": "IntInput", + "advanced": true, + "display_name": "Embedding Context Length", + "dynamic": false, + "info": "", + "list": false, + "name": "embedding_ctx_length", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "int", + "value": 1536 + }, + "max_retries": { + "_input_type": "IntInput", + "advanced": true, + "display_name": "Max Retries", + "dynamic": false, + "info": "", + "list": false, + "name": "max_retries", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "int", + "value": 3 + }, + "model": { + "_input_type": "DropdownInput", + "advanced": false, + "combobox": false, + "display_name": "Model", + "dynamic": false, + "info": "", + "name": "model", + "options": [ + "text-embedding-3-small", + "text-embedding-3-large", + "text-embedding-ada-002" + ], + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "type": "str", + "value": "text-embedding-3-small" + }, + "model_kwargs": { + "_input_type": "DictInput", + "advanced": true, + "display_name": "Model Kwargs", + "dynamic": false, + "info": "", + "list": false, + "name": "model_kwargs", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_input": true, + "type": "dict", + "value": {} + }, + "openai_api_base": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "OpenAI API Base", + "dynamic": false, + "info": "", + "input_types": [ + "Message" + ], + "list": false, + "load_from_db": false, + "name": "openai_api_base", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, + "openai_api_key": { + "_input_type": "SecretStrInput", + "advanced": false, + "display_name": "OpenAI API Key", + "dynamic": false, + "info": "", + "input_types": [], + "load_from_db": true, + "name": "openai_api_key", + "password": true, + "placeholder": "", + "required": true, + "show": true, + "title_case": false, + "type": "str", + "value": "OPENAI_API_KEY" + }, + "openai_api_type": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "OpenAI API Type", + "dynamic": false, + "info": "", + "input_types": [ + "Message" + ], + "list": false, + "load_from_db": false, + "name": "openai_api_type", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, + "openai_api_version": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "OpenAI API Version", + "dynamic": false, + "info": "", + "input_types": [ + "Message" + ], + "list": false, + "load_from_db": false, + "name": "openai_api_version", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, + "openai_organization": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "OpenAI Organization", + 
"dynamic": false, + "info": "", + "input_types": [ + "Message" + ], + "list": false, + "load_from_db": false, + "name": "openai_organization", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, + "openai_proxy": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "OpenAI Proxy", + "dynamic": false, + "info": "", + "input_types": [ + "Message" + ], + "list": false, + "load_from_db": false, + "name": "openai_proxy", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, + "request_timeout": { + "_input_type": "FloatInput", + "advanced": true, + "display_name": "Request Timeout", + "dynamic": false, + "info": "", + "list": false, + "name": "request_timeout", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "float", + "value": "" + }, + "show_progress_bar": { + "_input_type": "BoolInput", + "advanced": true, + "display_name": "Show Progress Bar", + "dynamic": false, + "info": "", + "list": false, + "name": "show_progress_bar", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "bool", + "value": false + }, + "skip_empty": { + "_input_type": "BoolInput", + "advanced": true, + "display_name": "Skip Empty", + "dynamic": false, + "info": "", + "list": false, + "name": "skip_empty", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "bool", + "value": false + }, + "tiktoken_enable": { + "_input_type": "BoolInput", + "advanced": true, + "display_name": "TikToken Enable", + "dynamic": false, + "info": "If False, you must have transformers installed.", + "list": false, + "name": "tiktoken_enable", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "type": "bool", + "value": true + }, + "tiktoken_model_name": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "TikToken Model Name", + "dynamic": false, + "info": "", + "input_types": [ + "Message" + ], + "list": false, + "load_from_db": false, + "name": "tiktoken_model_name", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "type": "str", + "value": "" + } + }, + "tool_mode": false + }, + "selected_output": "embeddings", + "type": "OpenAIEmbeddings" + }, + "dragging": false, + "height": 320, + "id": "OpenAIEmbeddings-joRJ6", + "measured": { + "height": 320, + "width": 320 + }, + "position": { + "x": 1690.9220896443658, + "y": 1866.483269483266 + }, + "positionAbsolute": { + "x": 1690.9220896443658, + "y": 1866.483269483266 + }, + "selected": false, + "type": "genericNode", + "width": 320 + }, + { + "data": { + "id": "note-Bm5Xw", + "node": { + "description": "### \ud83d\udca1 Add your OpenAI API key here \ud83d\udc47", + "display_name": "", + "documentation": "", + "template": { + "backgroundColor": "transparent" + } + }, + "type": "note" + }, + "dragging": false, + "height": 324, + "id": "note-Bm5Xw", + "measured": { + "height": 324, + "width": 324 + }, + "position": { + "x": 1692.2322233423606, + "y": 1821.9077961087607 + }, + "positionAbsolute": { + 
"x": 1692.2322233423606, + "y": 1821.9077961087607 + }, + "selected": false, + "type": "noteNode", + "width": 324 + }, + { + "data": { + "id": "File-PSU37", + "node": { + "base_classes": [ + "Message" + ], + "beta": false, + "conditional_paths": [], + "custom_fields": {}, + "description": "Loads content from one or more files as a DataFrame.", + "display_name": "File", + "documentation": "", + "edited": false, + "field_order": [ + "path", + "file_path", + "separator", + "silent_errors", + "delete_server_file_after_processing", + "ignore_unsupported_extensions", + "ignore_unspecified_files", + "use_multithreading", + "concurrency_multithreading" + ], + "frozen": false, + "icon": "file-text", + "last_updated": "2025-09-03T06:37:14.082Z", + "legacy": false, + "metadata": {}, + "minimized": false, + "output_types": [], + "outputs": [ + { + "allows_loop": false, + "cache": true, + "display_name": "Raw Content", + "group_outputs": false, + "method": "load_files_message", + "name": "message", + "options": null, + "required_inputs": null, + "selected": "Message", + "tool_mode": true, + "types": [ + "Message" + ], + "value": "__UNDEFINED__" + } + ], + "pinned": false, + "template": { + "_type": "Component", + "code": { + "advanced": true, + "dynamic": true, + "fileTypes": [], + "file_path": "", + "info": "", + "list": false, + "load_from_db": false, + "multiline": true, + "name": "code", + "password": false, + "placeholder": "", + "required": true, + "show": true, + "title_case": false, + "type": "code", + "value": "\"\"\"Enhanced file component with clearer structure and Docling isolation.\n\nNotes:\n-----\n- Functionality is preserved with minimal behavioral changes.\n- ALL Docling parsing/export runs in a separate OS process to prevent memory\n growth and native library state from impacting the main Langflow process.\n- Standard text/structured parsing continues to use existing BaseFileComponent\n utilities (and optional threading via `parallel_load_data`).\n\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport subprocess\nimport sys\nimport textwrap\nfrom copy import deepcopy\nfrom typing import TYPE_CHECKING, Any\n\nfrom langflow.base.data.base_file import BaseFileComponent\nfrom langflow.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data\nfrom langflow.io import (\n BoolInput,\n DropdownInput,\n FileInput,\n IntInput,\n MessageTextInput,\n Output,\n StrInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.message import Message\n\nif TYPE_CHECKING:\n from langflow.schema import DataFrame\n\n\nclass FileComponent(BaseFileComponent):\n \"\"\"File component with optional Docling processing (isolated in a subprocess).\"\"\"\n\n display_name = \"File\"\n description = \"Loads content from files with optional advanced document processing and export using Docling.\"\n documentation: str = \"https://docs.langflow.org/components-data#file\"\n icon = \"file-text\"\n name = \"File\"\n\n # Docling-supported/compatible extensions; TEXT_FILE_TYPES are supported by the base loader.\n VALID_EXTENSIONS = [\n \"adoc\",\n \"asciidoc\",\n \"asc\",\n \"bmp\",\n \"csv\",\n \"dotx\",\n \"dotm\",\n \"docm\",\n \"docx\",\n \"htm\",\n \"html\",\n \"jpeg\",\n \"json\",\n \"md\",\n \"pdf\",\n \"png\",\n \"potx\",\n \"ppsx\",\n \"pptm\",\n \"potm\",\n \"ppsm\",\n \"pptx\",\n \"tiff\",\n \"txt\",\n \"xls\",\n \"xlsx\",\n \"xhtml\",\n \"xml\",\n \"webp\",\n *TEXT_FILE_TYPES,\n ]\n\n # Fixed export settings used when markdown export is requested.\n 
EXPORT_FORMAT = \"Markdown\"\n IMAGE_MODE = \"placeholder\"\n\n # ---- Inputs / Outputs (kept as close to original as possible) -------------------\n _base_inputs = deepcopy(BaseFileComponent._base_inputs)\n for input_item in _base_inputs:\n if isinstance(input_item, FileInput) and input_item.name == \"path\":\n input_item.real_time_refresh = True\n break\n\n inputs = [\n *_base_inputs,\n BoolInput(\n name=\"advanced_mode\",\n display_name=\"Advanced Parser\",\n value=False,\n real_time_refresh=True,\n info=(\n \"Enable advanced document processing and export with Docling for PDFs, images, and office documents. \"\n \"Available only for single file processing.\"\n ),\n show=False,\n ),\n DropdownInput(\n name=\"pipeline\",\n display_name=\"Pipeline\",\n info=\"Docling pipeline to use\",\n options=[\"standard\", \"vlm\"],\n value=\"standard\",\n advanced=True,\n ),\n DropdownInput(\n name=\"ocr_engine\",\n display_name=\"OCR Engine\",\n info=\"OCR engine to use. Only available when pipeline is set to 'standard'.\",\n options=[\"\", \"easyocr\"],\n value=\"\",\n show=False,\n advanced=True,\n ),\n StrInput(\n name=\"md_image_placeholder\",\n display_name=\"Image placeholder\",\n info=\"Specify the image placeholder for markdown exports.\",\n value=\"\",\n advanced=True,\n show=False,\n ),\n StrInput(\n name=\"md_page_break_placeholder\",\n display_name=\"Page break placeholder\",\n info=\"Add this placeholder between pages in the markdown output.\",\n value=\"\",\n advanced=True,\n show=False,\n ),\n MessageTextInput(\n name=\"doc_key\",\n display_name=\"Doc Key\",\n info=\"The key to use for the DoclingDocument column.\",\n value=\"doc\",\n advanced=True,\n show=False,\n ),\n # Deprecated input retained for backward-compatibility.\n BoolInput(\n name=\"use_multithreading\",\n display_name=\"[Deprecated] Use Multithreading\",\n advanced=True,\n value=True,\n info=\"Set 'Processing Concurrency' greater than 1 to enable multithreading.\",\n ),\n IntInput(\n name=\"concurrency_multithreading\",\n display_name=\"Processing Concurrency\",\n advanced=True,\n info=\"When multiple files are being processed, the number of files to process concurrently.\",\n value=1,\n ),\n BoolInput(\n name=\"markdown\",\n display_name=\"Markdown Export\",\n info=\"Export processed documents to Markdown format. 
Only available when advanced mode is enabled.\",\n value=False,\n show=False,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Raw Content\", name=\"message\", method=\"load_files_message\"),\n ]\n\n # ------------------------------ UI helpers --------------------------------------\n\n def _path_value(self, template: dict) -> list[str]:\n \"\"\"Return the list of currently selected file paths from the template.\"\"\"\n return template.get(\"path\", {}).get(\"file_path\", [])\n\n def update_build_config(\n self,\n build_config: dict[str, Any],\n field_value: Any,\n field_name: str | None = None,\n ) -> dict[str, Any]:\n \"\"\"Show/hide Advanced Parser and related fields based on selection context.\"\"\"\n if field_name == \"path\":\n paths = self._path_value(build_config)\n file_path = paths[0] if paths else \"\"\n file_count = len(field_value) if field_value else 0\n\n # Advanced mode only for single (non-tabular) file\n allow_advanced = file_count == 1 and not file_path.endswith((\".csv\", \".xlsx\", \".parquet\"))\n build_config[\"advanced_mode\"][\"show\"] = allow_advanced\n if not allow_advanced:\n build_config[\"advanced_mode\"][\"value\"] = False\n for f in (\"pipeline\", \"ocr_engine\", \"doc_key\", \"md_image_placeholder\", \"md_page_break_placeholder\"):\n if f in build_config:\n build_config[f][\"show\"] = False\n\n elif field_name == \"advanced_mode\":\n for f in (\"pipeline\", \"ocr_engine\", \"doc_key\", \"md_image_placeholder\", \"md_page_break_placeholder\"):\n if f in build_config:\n build_config[f][\"show\"] = bool(field_value)\n\n return build_config\n\n def update_outputs(self, frontend_node: dict[str, Any], field_name: str, field_value: Any) -> dict[str, Any]: # noqa: ARG002\n \"\"\"Dynamically show outputs based on file count/type and advanced mode.\"\"\"\n if field_name not in [\"path\", \"advanced_mode\"]:\n return frontend_node\n\n template = frontend_node.get(\"template\", {})\n paths = self._path_value(template)\n if not paths:\n return frontend_node\n\n frontend_node[\"outputs\"] = []\n if len(paths) == 1:\n file_path = paths[0] if field_name == \"path\" else frontend_node[\"template\"][\"path\"][\"file_path\"][0]\n if file_path.endswith((\".csv\", \".xlsx\", \".parquet\")):\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Structured Content\", name=\"dataframe\", method=\"load_files_structured\"),\n )\n elif file_path.endswith(\".json\"):\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Structured Content\", name=\"json\", method=\"load_files_json\"),\n )\n\n advanced_mode = frontend_node.get(\"template\", {}).get(\"advanced_mode\", {}).get(\"value\", False)\n if advanced_mode:\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Structured Output\", name=\"advanced\", method=\"load_files_advanced\"),\n )\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Markdown\", name=\"markdown\", method=\"load_files_markdown\"),\n )\n frontend_node[\"outputs\"].append(\n Output(display_name=\"File Path\", name=\"path\", method=\"load_files_path\"),\n )\n else:\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Raw Content\", name=\"message\", method=\"load_files_message\"),\n )\n frontend_node[\"outputs\"].append(\n Output(display_name=\"File Path\", name=\"path\", method=\"load_files_path\"),\n )\n else:\n # Multiple files => DataFrame output; advanced parser disabled\n frontend_node[\"outputs\"].append(Output(display_name=\"Files\", name=\"dataframe\", method=\"load_files\"))\n\n return frontend_node\n\n # 
------------------------------ Core processing ----------------------------------\n\n def _is_docling_compatible(self, file_path: str) -> bool:\n \"\"\"Lightweight extension gate for Docling-compatible types.\"\"\"\n docling_exts = (\n \".adoc\",\n \".asciidoc\",\n \".asc\",\n \".bmp\",\n \".csv\",\n \".dotx\",\n \".dotm\",\n \".docm\",\n \".docx\",\n \".htm\",\n \".html\",\n \".jpeg\",\n \".json\",\n \".md\",\n \".pdf\",\n \".png\",\n \".potx\",\n \".ppsx\",\n \".pptm\",\n \".potm\",\n \".ppsm\",\n \".pptx\",\n \".tiff\",\n \".txt\",\n \".xls\",\n \".xlsx\",\n \".xhtml\",\n \".xml\",\n \".webp\",\n )\n return file_path.lower().endswith(docling_exts)\n\n def _process_docling_in_subprocess(self, file_path: str) -> Data | None:\n \"\"\"Run Docling in a separate OS process and map the result to a Data object.\n\n We avoid multiprocessing pickling by launching `python -c \"