diff --git a/.env.example b/.env.example index 681280f2..8d412670 100644 --- a/.env.example +++ b/.env.example @@ -1,60 +1,43 @@ # Ingestion Configuration -# Set to true to disable Langflow ingestion and use the traditional OpenRAG processor. -# If unset or false, the Langflow pipeline is used (default: upload -> ingest -> delete). +# Set to true to disable Langflow ingestion and use traditional OpenRAG processor +# If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete) DISABLE_INGEST_WITH_LANGFLOW=false - -# Create a Langflow secret key: -# https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key +# make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key LANGFLOW_SECRET_KEY= - -# Flow IDs for chat and ingestion +# flow ids for chat and ingestion flows LANGFLOW_CHAT_FLOW_ID=1098eea1-6649-4e1d-aed1-b77249fb8dd0 LANGFLOW_INGEST_FLOW_ID=5488df7c-b93f-4f87-a446-b67028bc0813 -# Ingest flow using Docling +LANGFLOW_URL_INGEST_FLOW_ID=72c3d17c-2dac-4a73-b48a-6518473d7830 +# Ingest flow using docling # LANGFLOW_INGEST_FLOW_ID=1402618b-e6d1-4ff2-9a11-d6ce71186915 NUDGES_FLOW_ID=ebc01d31-1976-46ce-a385-b0240327226c - -# OpenSearch Auth -# Set a strong admin password for OpenSearch. -# A bcrypt hash is generated at container startup from this value. -# Do not commit real secrets. -# Must be changed for secure deployments. +# Set a strong admin password for OpenSearch; a bcrypt hash is generated at +# container startup from this value. Do not commit real secrets. +# must match the hashed password in secureconfig, must change for secure deployment!!! OPENSEARCH_PASSWORD= - -# Google OAuth -# Create credentials here: -# https://console.cloud.google.com/apis/credentials +# make here https://console.cloud.google.com/apis/credentials GOOGLE_OAUTH_CLIENT_ID= GOOGLE_OAUTH_CLIENT_SECRET= - -# Microsoft (SharePoint/OneDrive) OAuth -# Azure app registration credentials. 
+# Azure app registration credentials for SharePoint/OneDrive MICROSOFT_GRAPH_OAUTH_CLIENT_ID= MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET= - -# Webhooks (optional) -# Public, DNS-resolvable base URL (e.g., via ngrok) for continuous ingestion. +# OPTIONAL: dns routable from google (etc.) to handle continuous ingest (something like ngrok works). This enables continuous ingestion WEBHOOK_BASE_URL= - -# API Keys OPENAI_API_KEY= AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= - -# Langflow UI URL (optional) -# Public URL to link OpenRAG to Langflow in the UI. +# OPTIONAL url for openrag link to langflow in the UI LANGFLOW_PUBLIC_URL= - -# Langflow Auth +# Langflow auth LANGFLOW_AUTO_LOGIN=False LANGFLOW_SUPERUSER= LANGFLOW_SUPERUSER_PASSWORD= diff --git a/.gitignore b/.gitignore index 970b5bec..9c99e617 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ wheels/ 1001*.pdf *.json +!flows/*.json .DS_Store config/ diff --git a/Dockerfile.langflow b/Dockerfile.langflow index 86ee0ea5..71baf447 100644 --- a/Dockerfile.langflow +++ b/Dockerfile.langflow @@ -1,49 +1,5 @@ -FROM python:3.12-slim +FROM langflowai/langflow-nightly:1.6.3.dev0 -# Set environment variables -ENV DEBIAN_FRONTEND=noninteractive -ENV PYTHONUNBUFFERED=1 -ENV RUSTFLAGS="--cfg reqwest_unstable" - -# Accept build arguments for git repository and branch -ARG GIT_REPO=https://github.com/langflow-ai/langflow.git -ARG GIT_BRANCH=test-openai-responses - -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - build-essential \ - curl \ - git \ - ca-certificates \ - gnupg \ - npm \ - rustc cargo pkg-config libssl-dev \ - && rm -rf /var/lib/apt/lists/* - -# Install uv for faster Python package management -RUN pip install uv - -# Clone the repository and checkout the specified branch -RUN git clone --depth 1 --branch ${GIT_BRANCH} ${GIT_REPO} /app - -# Install backend dependencies -RUN uv sync --frozen --no-install-project --no-editable --extra postgresql - -# Build frontend -WORKDIR 
/app/src/frontend -RUN NODE_OPTIONS=--max_old_space_size=4096 npm ci && \ - NODE_OPTIONS=--max_old_space_size=4096 npm run build && \ - mkdir -p /app/src/backend/base/langflow/frontend && \ - cp -r build/* /app/src/backend/base/langflow/frontend/ - -# Return to app directory and install the project -WORKDIR /app -RUN uv sync --frozen --no-dev --no-editable --extra postgresql - -# Expose ports EXPOSE 7860 -# Start the backend server -CMD ["uv", "run", "langflow", "run", "--host", "0.0.0.0", "--port", "7860"] +CMD ["langflow", "run", "--host", "0.0.0.0", "--port", "7860"] \ No newline at end of file diff --git a/docker-compose-cpu.yml b/docker-compose-cpu.yml index 9c121f89..570bc3b8 100644 --- a/docker-compose-cpu.yml +++ b/docker-compose-cpu.yml @@ -40,10 +40,10 @@ services: openrag-backend: image: phact/openrag-backend:${OPENRAG_VERSION:-latest} - #build: - #context: . - #dockerfile: Dockerfile.backend - container_name: openrag-backend + # build: + # context: . + # dockerfile: Dockerfile.backend + # container_name: openrag-backend depends_on: - langflow environment: @@ -55,6 +55,7 @@ services: - LANGFLOW_SUPERUSER_PASSWORD=${LANGFLOW_SUPERUSER_PASSWORD} - LANGFLOW_CHAT_FLOW_ID=${LANGFLOW_CHAT_FLOW_ID} - LANGFLOW_INGEST_FLOW_ID=${LANGFLOW_INGEST_FLOW_ID} + - LANGFLOW_URL_INGEST_FLOW_ID=${LANGFLOW_URL_INGEST_FLOW_ID} - DISABLE_INGEST_WITH_LANGFLOW=${DISABLE_INGEST_WITH_LANGFLOW:-false} - NUDGES_FLOW_ID=${NUDGES_FLOW_ID} - OPENSEARCH_PORT=9200 @@ -77,9 +78,9 @@ services: openrag-frontend: image: phact/openrag-frontend:${OPENRAG_VERSION:-latest} - #build: - #context: . - #dockerfile: Dockerfile.frontend + # build: + # context: . + # dockerfile: Dockerfile.frontend container_name: openrag-frontend depends_on: - openrag-backend @@ -92,6 +93,9 @@ services: volumes: - ./flows:/app/flows:Z image: phact/openrag-langflow:${LANGFLOW_VERSION:-latest} + # build: + # context: . 
+ # dockerfile: Dockerfile.langflow container_name: langflow ports: - "7860:7860" @@ -99,15 +103,23 @@ services: - OPENAI_API_KEY=${OPENAI_API_KEY} - LANGFLOW_LOAD_FLOWS_PATH=/app/flows - LANGFLOW_SECRET_KEY=${LANGFLOW_SECRET_KEY} - - JWT="dummy" + - JWT=None + - OWNER=None + - OWNER_NAME=None + - OWNER_EMAIL=None + - CONNECTOR_TYPE=system - OPENRAG-QUERY-FILTER="{}" - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} - - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER,OPENSEARCH_PASSWORD + - FILENAME=None + - MIMETYPE=None + - FILESIZE=0 + - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER,OPENSEARCH_PASSWORD,OWNER,OWNER_NAME,OWNER_EMAIL,CONNECTOR_TYPE,FILENAME,MIMETYPE,FILESIZE - LANGFLOW_LOG_LEVEL=DEBUG - LANGFLOW_AUTO_LOGIN=${LANGFLOW_AUTO_LOGIN} - LANGFLOW_SUPERUSER=${LANGFLOW_SUPERUSER} - LANGFLOW_SUPERUSER_PASSWORD=${LANGFLOW_SUPERUSER_PASSWORD} - LANGFLOW_NEW_USER_IS_ACTIVE=${LANGFLOW_NEW_USER_IS_ACTIVE} - LANGFLOW_ENABLE_SUPERUSER_CLI=${LANGFLOW_ENABLE_SUPERUSER_CLI} - - DEFAULT_FOLDER_NAME="OpenRAG" + # - DEFAULT_FOLDER_NAME=OpenRAG - HIDE_GETTING_STARTED_PROGRESS=true + diff --git a/docker-compose.yml b/docker-compose.yml index 64226fd5..b97f7cca 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -43,7 +43,7 @@ services: # build: # context: . # dockerfile: Dockerfile.backend - container_name: openrag-backend + # container_name: openrag-backend depends_on: - langflow environment: @@ -54,6 +54,7 @@ services: - LANGFLOW_SUPERUSER_PASSWORD=${LANGFLOW_SUPERUSER_PASSWORD} - LANGFLOW_CHAT_FLOW_ID=${LANGFLOW_CHAT_FLOW_ID} - LANGFLOW_INGEST_FLOW_ID=${LANGFLOW_INGEST_FLOW_ID} + - LANGFLOW_URL_INGEST_FLOW_ID=${LANGFLOW_URL_INGEST_FLOW_ID} - DISABLE_INGEST_WITH_LANGFLOW=${DISABLE_INGEST_WITH_LANGFLOW:-false} - NUDGES_FLOW_ID=${NUDGES_FLOW_ID} - OPENSEARCH_PORT=9200 @@ -80,7 +81,7 @@ services: # build: # context: . 
# dockerfile: Dockerfile.frontend - # #dockerfile: Dockerfile.frontend + #dockerfile: Dockerfile.frontend container_name: openrag-frontend depends_on: - openrag-backend @@ -109,13 +110,16 @@ services: - OWNER_EMAIL=None - CONNECTOR_TYPE=system - OPENRAG-QUERY-FILTER="{}" + - FILENAME=None + - MIMETYPE=None + - FILESIZE=0 - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} - - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER,OPENSEARCH_PASSWORD,OWNER,OWNER_NAME,OWNER_EMAIL,CONNECTOR_TYPE + - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER,OPENSEARCH_PASSWORD,OWNER,OWNER_NAME,OWNER_EMAIL,CONNECTOR_TYPE,FILENAME,MIMETYPE,FILESIZE - LANGFLOW_LOG_LEVEL=DEBUG - LANGFLOW_AUTO_LOGIN=${LANGFLOW_AUTO_LOGIN} - LANGFLOW_SUPERUSER=${LANGFLOW_SUPERUSER} - LANGFLOW_SUPERUSER_PASSWORD=${LANGFLOW_SUPERUSER_PASSWORD} - LANGFLOW_NEW_USER_IS_ACTIVE=${LANGFLOW_NEW_USER_IS_ACTIVE} - LANGFLOW_ENABLE_SUPERUSER_CLI=${LANGFLOW_ENABLE_SUPERUSER_CLI} - - DEFAULT_FOLDER_NAME="OpenRAG" + # - DEFAULT_FOLDER_NAME=OpenRAG - HIDE_GETTING_STARTED_PROGRESS=true diff --git a/flows/ingestion_flow.json b/flows/ingestion_flow.json index 12cf5b63..911c3e38 100644 --- a/flows/ingestion_flow.json +++ b/flows/ingestion_flow.json @@ -30,36 +30,6 @@ "target": "OpenSearchHybrid-Ve6bS", "targetHandle": "{œfieldNameœ:œingest_dataœ,œidœ:œOpenSearchHybrid-Ve6bSœ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}" }, - { - "animated": false, - "className": "", - "data": { - "sourceHandle": { - "dataType": "File", - "id": "File-PSU37", - "name": "message", - "output_types": [ - "Message" - ] - }, - "targetHandle": { - "fieldName": "data_inputs", - "id": "SplitText-QIKhg", - "inputTypes": [ - "Data", - "DataFrame", - "Message" - ], - "type": "other" - } - }, - "id": 
"xy-edge__File-PSU37{œdataTypeœ:œFileœ,œidœ:œFile-PSU37œ,œnameœ:œmessageœ,œoutput_typesœ:[œMessageœ]}-SplitText-QIKhg{œfieldNameœ:œdata_inputsœ,œidœ:œSplitText-QIKhgœ,œinputTypesœ:[œDataœ,œDataFrameœ,œMessageœ],œtypeœ:œotherœ}", - "selected": false, - "source": "File-PSU37", - "sourceHandle": "{œdataTypeœ:œFileœ,œidœ:œFile-PSU37œ,œnameœ:œmessageœ,œoutput_typesœ:[œMessageœ]}", - "target": "SplitText-QIKhg", - "targetHandle": "{œfieldNameœ:œdata_inputsœ,œidœ:œSplitText-QIKhgœ,œinputTypesœ:[œDataœ,œDataFrameœ,œMessageœ],œtypeœ:œotherœ}" - }, { "animated": false, "className": "", @@ -206,7 +176,7 @@ }, { "animated": false, - "className": "not-running", + "className": "", "data": { "sourceHandle": { "dataType": "AdvancedDynamicFormBuilder", @@ -231,6 +201,149 @@ "sourceHandle": "{œdataTypeœ:œAdvancedDynamicFormBuilderœ,œidœ:œAdvancedDynamicFormBuilder-81Exwœ,œnameœ:œform_dataœ,œoutput_typesœ:[œDataœ]}", "target": "OpenSearchHybrid-Ve6bS", "targetHandle": "{œfieldNameœ:œdocs_metadataœ,œidœ:œOpenSearchHybrid-Ve6bSœ,œinputTypesœ:[œDataœ],œtypeœ:œtableœ}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "DoclingRemote", + "id": "DoclingRemote-Dp3PX", + "name": "dataframe", + "output_types": [ + "DataFrame" + ] + }, + "targetHandle": { + "fieldName": "data_inputs", + "id": "ExportDoclingDocument-zZdRg", + "inputTypes": [ + "Data", + "DataFrame" + ], + "type": "other" + } + }, + "id": "xy-edge__DoclingRemote-Dp3PX{œdataTypeœ:œDoclingRemoteœ,œidœ:œDoclingRemote-Dp3PXœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}-ExportDoclingDocument-zZdRg{œfieldNameœ:œdata_inputsœ,œidœ:œExportDoclingDocument-zZdRgœ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}", + "selected": false, + "source": "DoclingRemote-Dp3PX", + "sourceHandle": "{œdataTypeœ:œDoclingRemoteœ,œidœ:œDoclingRemote-Dp3PXœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}", + "target": "ExportDoclingDocument-zZdRg", + "targetHandle": 
"{œfieldNameœ:œdata_inputsœ,œidœ:œExportDoclingDocument-zZdRgœ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "ExportDoclingDocument", + "id": "ExportDoclingDocument-zZdRg", + "name": "dataframe", + "output_types": [ + "DataFrame" + ] + }, + "targetHandle": { + "fieldName": "df", + "id": "DataFrameOperations-1BWXB", + "inputTypes": [ + "DataFrame" + ], + "type": "other" + } + }, + "id": "xy-edge__ExportDoclingDocument-zZdRg{œdataTypeœ:œExportDoclingDocumentœ,œidœ:œExportDoclingDocument-zZdRgœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}-DataFrameOperations-1BWXB{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-1BWXBœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}", + "selected": false, + "source": "ExportDoclingDocument-zZdRg", + "sourceHandle": "{œdataTypeœ:œExportDoclingDocumentœ,œidœ:œExportDoclingDocument-zZdRgœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}", + "target": "DataFrameOperations-1BWXB", + "targetHandle": "{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-1BWXBœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "DataFrameOperations", + "id": "DataFrameOperations-N80fC", + "name": "output", + "output_types": [ + "DataFrame" + ] + }, + "targetHandle": { + "fieldName": "data_inputs", + "id": "SplitText-QIKhg", + "inputTypes": [ + "Data", + "DataFrame", + "Message" + ], + "type": "other" + } + }, + "id": "xy-edge__DataFrameOperations-N80fC{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-N80fCœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}-SplitText-QIKhg{œfieldNameœ:œdata_inputsœ,œidœ:œSplitText-QIKhgœ,œinputTypesœ:[œDataœ,œDataFrameœ,œMessageœ],œtypeœ:œotherœ}", + "selected": false, + "source": "DataFrameOperations-N80fC", + "sourceHandle": "{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-N80fCœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}", + "target": 
"SplitText-QIKhg", + "targetHandle": "{œfieldNameœ:œdata_inputsœ,œidœ:œSplitText-QIKhgœ,œinputTypesœ:[œDataœ,œDataFrameœ,œMessageœ],œtypeœ:œotherœ}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "DataFrameOperations", + "id": "DataFrameOperations-1BWXB", + "name": "output", + "output_types": [ + "DataFrame" + ] + }, + "targetHandle": { + "fieldName": "df", + "id": "DataFrameOperations-9vMrp", + "inputTypes": [ + "DataFrame" + ], + "type": "other" + } + }, + "id": "xy-edge__DataFrameOperations-1BWXB{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-1BWXBœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}-DataFrameOperations-9vMrp{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-9vMrpœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}", + "selected": false, + "source": "DataFrameOperations-1BWXB", + "sourceHandle": "{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-1BWXBœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}", + "target": "DataFrameOperations-9vMrp", + "targetHandle": "{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-9vMrpœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "DataFrameOperations", + "id": "DataFrameOperations-9vMrp", + "name": "output", + "output_types": [ + "DataFrame" + ] + }, + "targetHandle": { + "fieldName": "df", + "id": "DataFrameOperations-N80fC", + "inputTypes": [ + "DataFrame" + ], + "type": "other" + } + }, + "id": "xy-edge__DataFrameOperations-9vMrp{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-9vMrpœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}-DataFrameOperations-N80fC{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-N80fCœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}", + "selected": false, + "source": "DataFrameOperations-9vMrp", + "sourceHandle": "{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-9vMrpœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}", + "target": 
"DataFrameOperations-N80fC", + "targetHandle": "{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-N80fCœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}" } ], "nodes": [ @@ -261,7 +374,7 @@ "frozen": false, "icon": "scissors-line-dashed", "legacy": false, - "lf_version": "1.6.0", + "lf_version": "1.6.3.dev0", "metadata": { "code_hash": "f2867efda61f", "dependencies": { @@ -466,8 +579,8 @@ "width": 320 }, "position": { - "x": 1729.1788373023007, - "y": 1330.8003441546418 + "x": 1704.25352249077, + "y": 1199.364065218893 }, "positionAbsolute": { "x": 1683.4543896546102, @@ -477,481 +590,6 @@ "type": "genericNode", "width": 320 }, - { - "data": { - "id": "File-PSU37", - "node": { - "base_classes": [ - "Message" - ], - "beta": false, - "conditional_paths": [], - "custom_fields": {}, - "description": "Loads content from one or more files.", - "display_name": "File", - "documentation": "https://docs.langflow.org/components-data#file", - "edited": true, - "field_order": [ - "path", - "file_path", - "separator", - "silent_errors", - "delete_server_file_after_processing", - "ignore_unsupported_extensions", - "ignore_unspecified_files", - "advanced_mode", - "pipeline", - "ocr_engine", - "md_image_placeholder", - "md_page_break_placeholder", - "doc_key", - "use_multithreading", - "concurrency_multithreading", - "markdown" - ], - "frozen": false, - "icon": "file-text", - "last_updated": "2025-09-26T14:37:42.811Z", - "legacy": false, - "lf_version": "1.6.0", - "metadata": { - "code_hash": "9a1d497f4f91", - "dependencies": { - "dependencies": [ - { - "name": "lfx", - "version": null - } - ], - "total_dependencies": 1 - }, - "module": "custom_components.file" - }, - "minimized": false, - "output_types": [], - "outputs": [ - { - "allows_loop": false, - "cache": true, - "display_name": "Raw Content", - "group_outputs": false, - "hidden": null, - "method": "load_files_message", - "name": "message", - "options": null, - "required_inputs": null, - "selected": "Message", - "tool_mode": true, - 
"types": [ - "Message" - ], - "value": "__UNDEFINED__" - } - ], - "pinned": false, - "template": { - "_type": "Component", - "advanced_mode": { - "_input_type": "BoolInput", - "advanced": false, - "display_name": "Advanced Parser", - "dynamic": false, - "info": "Enable advanced document processing and export with Docling for PDFs, images, and office documents. Available only for single file processing.Note that advanced document processing can consume significant resources.", - "list": false, - "list_add_label": "Add More", - "name": "advanced_mode", - "placeholder": "", - "real_time_refresh": true, - "required": false, - "show": false, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "type": "bool", - "value": false - }, - "code": { - "advanced": true, - "dynamic": true, - "fileTypes": [], - "file_path": "", - "info": "", - "list": false, - "load_from_db": false, - "multiline": true, - "name": "code", - "password": false, - "placeholder": "", - "required": true, - "show": true, - "title_case": false, - "type": "code", - "value": "\"\"\"Enhanced file component with Docling support and process isolation.\n\nNotes:\n-----\n- ALL Docling parsing/export runs in a separate OS process to prevent memory\n growth and native library state from impacting the main Langflow process.\n- Standard text/structured parsing continues to use existing BaseFileComponent\n utilities (and optional threading via `parallel_load_data`).\n\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nimport subprocess\nimport sys\nimport textwrap\nfrom copy import deepcopy\nfrom typing import Any\n\nfrom lfx.base.data.base_file import BaseFileComponent\nfrom lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data\nfrom lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput\nfrom lfx.io import BoolInput, FileInput, IntInput, Output\nfrom lfx.schema import DataFrame # noqa: TC001\nfrom lfx.schema.data import Data\nfrom 
lfx.schema.message import Message\n\n\nclass FileComponent(BaseFileComponent):\n \"\"\"File component with optional Docling processing (isolated in a subprocess).\"\"\"\n\n display_name = \"File\"\n description = \"Loads content from one or more files.\"\n documentation: str = \"https://docs.langflow.org/components-data#file\"\n icon = \"file-text\"\n name = \"File\"\n\n # Docling-supported/compatible extensions; TEXT_FILE_TYPES are supported by the base loader.\n VALID_EXTENSIONS = [\n *TEXT_FILE_TYPES,\n \"adoc\",\n \"asciidoc\",\n \"asc\",\n \"bmp\",\n \"dotx\",\n \"dotm\",\n \"docm\",\n \"jpeg\",\n \"png\",\n \"potx\",\n \"ppsx\",\n \"pptm\",\n \"potm\",\n \"ppsm\",\n \"pptx\",\n \"tiff\",\n \"xls\",\n \"xlsx\",\n \"xhtml\",\n \"webp\",\n ]\n\n # Fixed export settings used when markdown export is requested.\n EXPORT_FORMAT = \"Markdown\"\n IMAGE_MODE = \"placeholder\"\n\n _base_inputs = deepcopy(BaseFileComponent.get_base_inputs())\n\n for input_item in _base_inputs:\n if isinstance(input_item, FileInput) and input_item.name == \"path\":\n input_item.real_time_refresh = True\n break\n\n inputs = [\n *_base_inputs,\n BoolInput(\n name=\"advanced_mode\",\n display_name=\"Advanced Parser\",\n value=False,\n real_time_refresh=True,\n info=(\n \"Enable advanced document processing and export with Docling for PDFs, images, and office documents. \"\n \"Available only for single file processing.\"\n \"Note that advanced document processing can consume significant resources.\"\n ),\n show=False,\n ),\n DropdownInput(\n name=\"pipeline\",\n display_name=\"Pipeline\",\n info=\"Docling pipeline to use\",\n options=[\"standard\", \"vlm\"],\n value=\"standard\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"ocr_engine\",\n display_name=\"OCR Engine\",\n info=\"OCR engine to use. 
Only available when pipeline is set to 'standard'.\",\n options=[\"None\", \"easyocr\"],\n value=\"easyocr\",\n show=False,\n advanced=True,\n ),\n StrInput(\n name=\"md_image_placeholder\",\n display_name=\"Image placeholder\",\n info=\"Specify the image placeholder for markdown exports.\",\n value=\"\",\n advanced=True,\n show=False,\n ),\n StrInput(\n name=\"md_page_break_placeholder\",\n display_name=\"Page break placeholder\",\n info=\"Add this placeholder between pages in the markdown output.\",\n value=\"\",\n advanced=True,\n show=False,\n ),\n MessageTextInput(\n name=\"doc_key\",\n display_name=\"Doc Key\",\n info=\"The key to use for the DoclingDocument column.\",\n value=\"doc\",\n advanced=True,\n show=False,\n ),\n # Deprecated input retained for backward-compatibility.\n BoolInput(\n name=\"use_multithreading\",\n display_name=\"[Deprecated] Use Multithreading\",\n advanced=True,\n value=True,\n info=\"Set 'Processing Concurrency' greater than 1 to enable multithreading.\",\n ),\n IntInput(\n name=\"concurrency_multithreading\",\n display_name=\"Processing Concurrency\",\n advanced=True,\n info=\"When multiple files are being processed, the number of files to process concurrently.\",\n value=1,\n ),\n BoolInput(\n name=\"markdown\",\n display_name=\"Markdown Export\",\n info=\"Export processed documents to Markdown format. 
Only available when advanced mode is enabled.\",\n value=False,\n show=False,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Raw Content\", name=\"message\", method=\"load_files_message\"),\n ]\n\n # ------------------------------ UI helpers --------------------------------------\n\n def _path_value(self, template: dict) -> list[str]:\n \"\"\"Return the list of currently selected file paths from the template.\"\"\"\n return template.get(\"path\", {}).get(\"file_path\", [])\n\n def update_build_config(\n self,\n build_config: dict[str, Any],\n field_value: Any,\n field_name: str | None = None,\n ) -> dict[str, Any]:\n \"\"\"Show/hide Advanced Parser and related fields based on selection context.\"\"\"\n if field_name == \"path\":\n paths = self._path_value(build_config)\n file_path = paths[0] if paths else \"\"\n file_count = len(field_value) if field_value else 0\n\n # Advanced mode only for single (non-tabular) file\n allow_advanced = file_count == 1 and not file_path.endswith((\".csv\", \".xlsx\", \".parquet\"))\n build_config[\"advanced_mode\"][\"show\"] = allow_advanced\n if not allow_advanced:\n build_config[\"advanced_mode\"][\"value\"] = False\n for f in (\"pipeline\", \"ocr_engine\", \"doc_key\", \"md_image_placeholder\", \"md_page_break_placeholder\"):\n if f in build_config:\n build_config[f][\"show\"] = False\n\n # Docling Processing\n elif field_name == \"advanced_mode\":\n for f in (\"pipeline\", \"ocr_engine\", \"doc_key\", \"md_image_placeholder\", \"md_page_break_placeholder\"):\n if f in build_config:\n build_config[f][\"show\"] = bool(field_value)\n\n elif field_name == \"pipeline\":\n if field_value == \"standard\":\n build_config[\"ocr_engine\"][\"show\"] = True\n build_config[\"ocr_engine\"][\"value\"] = \"easyocr\"\n else:\n build_config[\"ocr_engine\"][\"show\"] = False\n build_config[\"ocr_engine\"][\"value\"] = \"None\"\n\n return build_config\n\n def update_outputs(self, frontend_node: dict[str, Any], field_name: str, field_value: Any) 
-> dict[str, Any]: # noqa: ARG002\n \"\"\"Dynamically show outputs based on file count/type and advanced mode.\"\"\"\n if field_name not in [\"path\", \"advanced_mode\", \"pipeline\"]:\n return frontend_node\n\n template = frontend_node.get(\"template\", {})\n paths = self._path_value(template)\n if not paths:\n return frontend_node\n\n frontend_node[\"outputs\"] = []\n if len(paths) == 1:\n file_path = paths[0] if field_name == \"path\" else frontend_node[\"template\"][\"path\"][\"file_path\"][0]\n if file_path.endswith((\".csv\", \".xlsx\", \".parquet\")):\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Structured Content\", name=\"dataframe\", method=\"load_files_structured\"),\n )\n elif file_path.endswith(\".json\"):\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Structured Content\", name=\"json\", method=\"load_files_json\"),\n )\n\n advanced_mode = frontend_node.get(\"template\", {}).get(\"advanced_mode\", {}).get(\"value\", False)\n if advanced_mode:\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Structured Output\", name=\"advanced_dataframe\", method=\"load_files_dataframe\"),\n )\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Markdown\", name=\"advanced_markdown\", method=\"load_files_markdown\"),\n )\n frontend_node[\"outputs\"].append(\n Output(display_name=\"File Path\", name=\"path\", method=\"load_files_path\"),\n )\n else:\n frontend_node[\"outputs\"].append(\n Output(display_name=\"Raw Content\", name=\"message\", method=\"load_files_message\"),\n )\n frontend_node[\"outputs\"].append(\n Output(display_name=\"File Path\", name=\"path\", method=\"load_files_path\"),\n )\n else:\n # Multiple files => DataFrame output; advanced parser disabled\n frontend_node[\"outputs\"].append(Output(display_name=\"Files\", name=\"dataframe\", method=\"load_files\"))\n\n return frontend_node\n\n # ------------------------------ Core processing ----------------------------------\n\n def 
_is_docling_compatible(self, file_path: str) -> bool:\n \"\"\"Lightweight extension gate for Docling-compatible types.\"\"\"\n docling_exts = (\n \".adoc\",\n \".asciidoc\",\n \".asc\",\n \".bmp\",\n \".csv\",\n \".dotx\",\n \".dotm\",\n \".docm\",\n \".docx\",\n \".htm\",\n \".html\",\n \".jpeg\",\n \".json\",\n \".md\",\n \".pdf\",\n \".png\",\n \".potx\",\n \".ppsx\",\n \".pptm\",\n \".potm\",\n \".ppsm\",\n \".pptx\",\n \".tiff\",\n \".txt\",\n \".xls\",\n \".xlsx\",\n \".xhtml\",\n \".xml\",\n \".webp\",\n )\n return file_path.lower().endswith(docling_exts)\n\n def _process_docling_in_subprocess(self, file_path: str) -> Data | None:\n \"\"\"Run Docling in a separate OS process and map the result to a Data object.\n\n We avoid multiprocessing pickling by launching `python -c \"