From 392322077f6b4eb5951a1ee05f1e8a5f7810968a Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Fri, 26 Sep 2025 03:08:26 -0400 Subject: [PATCH] Enable local build for OpenRAG services and update metadata handling Switched OpenRAG backend and frontend in docker-compose.yml to use local Dockerfile builds instead of remote images. Updated environment variables for better clarity and system integration. In flows/openrag_agent.json and langflow_file_service, improved handling of docs_metadata to support Data objects and added logging for metadata ingestion. Added agent_llm edge to agent node in flow definition. --- docker-compose.yml | 27 +-- flows/openrag_agent.json | 296 +++++--------------------- src/main.py | 4 + src/services/langflow_file_service.py | 2 +- 4 files changed, 67 insertions(+), 262 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index d21cbb59..93fcbcdf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -39,10 +39,10 @@ services: - "5601:5601" openrag-backend: - image: phact/openrag-backend:${OPENRAG_VERSION:-latest} - #build: - #context: . - #dockerfile: Dockerfile.backend + # image: phact/openrag-backend:${OPENRAG_VERSION:-latest} + build: + context: . + dockerfile: Dockerfile.backend container_name: openrag-backend depends_on: - langflow @@ -76,10 +76,11 @@ services: gpus: all openrag-frontend: - image: phact/openrag-frontend:${OPENRAG_VERSION:-latest} - #build: - #context: . - #dockerfile: Dockerfile.frontend + # image: phact/openrag-frontend:${OPENRAG_VERSION:-latest} + build: + context: . + dockerfile: Dockerfile.frontend + #dockerfile: Dockerfile.frontend container_name: openrag-frontend depends_on: - openrag-backend @@ -102,11 +103,11 @@ services: - OPENAI_API_KEY=${OPENAI_API_KEY} - LANGFLOW_LOAD_FLOWS_PATH=/app/flows - LANGFLOW_SECRET_KEY=${LANGFLOW_SECRET_KEY} - - JWT="dummy" - - OWNER="dummy" - - OWNER_NAME="dummy" - - OWNER_EMAIL="dummy" - - CONNECTOR_TYPE="dummy" + - JWT=None + - OWNER=None + - OWNER_NAME=None + - OWNER_EMAIL=None + - CONNECTOR_TYPE=system - OPENRAG-QUERY-FILTER="{}" - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER,OPENSEARCH_PASSWORD,OWNER,OWNER_NAME,OWNER_EMAIL,CONNECTOR_TYPE diff --git a/flows/openrag_agent.json b/flows/openrag_agent.json index 8a76a475..0b2f5859 100644 --- a/flows/openrag_agent.json +++ b/flows/openrag_agent.json @@ -114,6 +114,33 @@ "sourceHandle": "{œdataTypeœ:œOpenSearchVectorStoreComponentœ,œidœ:œOpenSearch-iYfjfœ,œnameœ:œcomponent_as_toolœ,œoutput_typesœ:[œToolœ]}", "target": "Agent-crjWf", "targetHandle": "{œfieldNameœ:œtoolsœ,œidœ:œAgent-crjWfœ,œinputTypesœ:[œToolœ],œtypeœ:œotherœ}" + }, + { + "animated": false, + "data": { + "sourceHandle": { + "dataType": "LanguageModelComponent", + "id": "LanguageModelComponent-0YME7", + "name": "model_output", + "output_types": [ + "LanguageModel" + ] + }, + "targetHandle": { + "fieldName": "agent_llm", + "id": "Agent-crjWf", + "inputTypes": [ + "LanguageModel" + ], + "type": "str" + } + }, + "id": "xy-edge__LanguageModelComponent-0YME7{œdataTypeœ:œLanguageModelComponentœ,œidœ:œLanguageModelComponent-0YME7œ,œnameœ:œmodel_outputœ,œoutput_typesœ:[œLanguageModelœ]}-Agent-crjWf{œfieldNameœ:œagent_llmœ,œidœ:œAgent-crjWfœ,œinputTypesœ:[œLanguageModelœ],œtypeœ:œstrœ}", + "selected": false, + "source": "LanguageModelComponent-0YME7", + "sourceHandle": "{œdataTypeœ:œLanguageModelComponentœ,œidœ:œLanguageModelComponent-0YME7œ,œnameœ:œmodel_outputœ,œoutput_typesœ:[œLanguageModelœ]}", + "target": 
"Agent-crjWf", + "targetHandle": "{œfieldNameœ:œagent_llmœ,œidœ:œAgent-crjWfœ,œinputTypesœ:[œLanguageModelœ],œtypeœ:œstrœ}" } ], "nodes": [ @@ -674,11 +701,9 @@ ], "frozen": false, "icon": "OpenSearch", - "last_updated": "2025-09-26T05:15:05.779Z", "legacy": false, - "lf_version": "1.6.0", "metadata": { - "code_hash": "b19e82f1314a", + "code_hash": "07eef12db820", "dependencies": { "dependencies": [ { @@ -779,14 +804,17 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport json\nimport uuid\nfrom typing import Any\n\nfrom opensearchpy import OpenSearch, helpers\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports document\n ingestion, vector embeddings, and advanced filtering with authentication options.\n\n Features:\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Hybrid search combining KNN vector similarity and keyword matching\n - Flexible authentication (Basic auth, JWT tokens)\n - Advanced filtering and aggregations\n - Metadata injection during document ingestion\n \"\"\"\n\n display_name: str = \"OpenSearch\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with hybrid semantic and keyword search capabilities.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. \"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n advanced=True,\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. 
\"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"vector_field\",\n display_name=\"Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=\"Name of the field in OpenSearch documents that stores the vector embeddings for similarity search.\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. 
\"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n }\n }\n },\n }\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or 
\"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our “vector store.”\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings\n - Bulk inserts documents with vectors\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings\n vectors = self.embedding.embed_documents(texts)\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=self.vector_field,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with proper KNN mapping...\")\n\n # Use the LangChain-style bulk ingestion\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n 
vector_field=self.vector_field,\n text_field=\"text\",\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n # ---------- search (single hybrid path matching your tool) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform hybrid search combining vector similarity and keyword matching.\n\n This method executes a sophisticated search that combines:\n - K-nearest neighbor (KNN) vector similarity search (70% weight)\n - Multi-field keyword search with fuzzy matching (30% weight)\n - Optional filtering and score thresholds\n - Aggregations for faceted search results\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = 
self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression (can be either A or B shape; see _coerce_filter_clauses)\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Embed the query\n vec = self.embedding.embed_query(q)\n\n # Build filter clauses (accept both shapes)\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Respect the tool's limit/threshold defaults\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build the same hybrid body as your SearchService\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"knn\": {\n self.vector_field: {\n \"vector\": vec,\n \"k\": 10, # fixed to match the tool\n \"boost\": 0.7,\n }\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3,\n }\n },\n ],\n \"minimum_should_match\": 1,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n if filter_clauses:\n body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n # top-level min_score (matches your tool)\n body[\"min_score\"] = score_threshold\n\n resp = client.search(index=self.index_name, body=body)\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field 
visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": "from __future__ import annotations\n\nimport json\nimport uuid\nfrom typing import Any\n\nfrom opensearchpy import OpenSearch, helpers\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports document\n ingestion, vector embeddings, and advanced filtering with authentication options.\n\n Features:\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Hybrid search combining KNN vector similarity and keyword matching\n - Flexible authentication (Basic auth, JWT tokens)\n - Advanced filtering and aggregations\n - Metadata injection during document ingestion\n \"\"\"\n\n display_name: str = \"OpenSearch\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with hybrid semantic and keyword search capabilities.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n # advanced=True,\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. \"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. 
\"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"vector_field\",\n display_name=\"Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=\"Name of the field in OpenSearch documents that stores the vector embeddings for similarity search.\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. 
\"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n }\n }\n },\n }\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for 
storing document text\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our “vector store.”\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings\n - Bulk inserts documents with vectors\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs 
metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings\n vectors = self.embedding.embed_documents(texts)\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=self.vector_field,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with proper KNN mapping...\")\n\n # Use the LangChain-style bulk ingestion\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=self.vector_field,\n text_field=\"text\",\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except 
json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n # ---------- search (single hybrid path matching your tool) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform hybrid search combining vector similarity and keyword matching.\n\n This method executes a sophisticated search that combines:\n - K-nearest neighbor (KNN) vector similarity search (70% weight)\n - Multi-field keyword search with fuzzy matching (30% weight)\n - Optional filtering and score thresholds\n - Aggregations for faceted search results\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression (can be either A or B shape; see _coerce_filter_clauses)\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Embed the query\n vec = self.embedding.embed_query(q)\n\n # Build filter clauses (accept both shapes)\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Respect the tool's limit/threshold defaults\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build the same hybrid body as your SearchService\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"knn\": {\n self.vector_field: {\n \"vector\": vec,\n \"k\": 10, # fixed to match the tool\n \"boost\": 0.7,\n }\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3,\n }\n },\n ],\n \"minimum_should_match\": 1,\n }\n },\n \"aggs\": {\n 
\"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n if filter_clauses:\n body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n # top-level min_score (matches your tool)\n body[\"min_score\"] = score_threshold\n\n resp = client.search(index=self.index_name, body=body)\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, "docs_metadata": { "_input_type": "TableInput", - "advanced": true, + "advanced": false, "display_name": "Document Metadata", "dynamic": false, "info": "Additional metadata key-value pairs to be added to all ingested documents. 
Useful for tagging documents with source information, categories, or other custom attributes.", + "input_types": [ + "Data" + ], "is_list": true, "list_add_label": "Add More", "name": "docs_metadata", @@ -798,12 +826,14 @@ { "description": "Key name", "display_name": "Key", + "formatter": "text", "name": "key", "type": "str" }, { "description": "Value of the metadata", "display_name": "Value", + "formatter": "text", "name": "value", "type": "str" } @@ -1285,7 +1315,7 @@ "dragging": false, "id": "OpenSearch-iYfjf", "measured": { - "height": 763, + "height": 848, "width": 320 }, "position": { @@ -1323,7 +1353,7 @@ ], "frozen": false, "icon": "binary", - "last_updated": "2025-09-26T05:15:05.781Z", + "last_updated": "2025-09-26T06:57:25.121Z", "legacy": false, "lf_version": "1.6.0", "metadata": { @@ -1648,7 +1678,7 @@ ], "frozen": false, "icon": "bot", - "last_updated": "2025-09-26T05:15:05.782Z", + "last_updated": "2025-09-26T07:03:41.442Z", "legacy": false, "lf_version": "1.6.0", "metadata": { @@ -1744,7 +1774,7 @@ "advanced": false, "combobox": false, "dialog_inputs": {}, - "display_name": "Model Provider", + "display_name": "Language Model", "dynamic": false, "external_options": { "fields": { @@ -1758,8 +1788,9 @@ } }, "info": "The provider of the language model that the agent will use to generate responses.", - "input_types": [], - "load_from_db": false, + "input_types": [ + "LanguageModel" + ], "name": "agent_llm", "options": [ "Anthropic", @@ -1777,7 +1808,7 @@ "icon": "OpenAI" } ], - "placeholder": "", + "placeholder": "Awaiting model input.", "real_time_refresh": true, "refresh_button": false, "required": false, @@ -1787,25 +1818,7 @@ "tool_mode": false, "trace_as_metadata": true, "type": "str", - "value": "OpenAI" - }, - "api_key": { - "_input_type": "SecretStrInput", - "advanced": false, - "display_name": "OpenAI API Key", - "dynamic": false, - "info": "The OpenAI API Key to use for the OpenAI model.", - "input_types": [], - "load_from_db": true, - "name": "api_key", - "password": true, - "placeholder": "", - "real_time_refresh": true, - "required": false, - "show": true, - "title_case": false, - "type": "str", - "value": "OPENAI_API_KEY" + "value": "" }, "code": { "advanced": true, @@ -1893,25 +1906,6 @@ "type": "str", "value": "" }, - "json_mode": { - "_input_type": "BoolInput", - "advanced": true, - "display_name": "JSON Mode", - "dynamic": false, - "info": "If True, it will output JSON regardless of passing a schema.", - "input_types": [], - "list": false, - "list_add_label": "Add More", - "name": "json_mode", - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "type": "bool", - "value": false - }, "max_iterations": { "_input_type": "IntInput", "advanced": true, @@ -1931,113 +1925,6 @@ "type": "int", "value": 15 }, - "max_retries": { - "_input_type": "IntInput", - "advanced": true, - "display_name": "Max Retries", - "dynamic": false, - "info": "The maximum number of retries to make when generating.", - "input_types": [], - "list": false, - "list_add_label": "Add More", - "name": "max_retries", - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "type": "int", - "value": 5 - }, - "max_tokens": { - "_input_type": "IntInput", - "advanced": true, - "display_name": "Max Tokens", - "dynamic": false, - "info": "The maximum number of tokens to generate. 
Set to 0 for unlimited tokens.", - "input_types": [], - "list": false, - "list_add_label": "Add More", - "name": "max_tokens", - "placeholder": "", - "range_spec": { - "max": 128000, - "min": 0, - "step": 0.1, - "step_type": "float" - }, - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "type": "int", - "value": "" - }, - "model_kwargs": { - "_input_type": "DictInput", - "advanced": true, - "display_name": "Model Kwargs", - "dynamic": false, - "info": "Additional keyword arguments to pass to the model.", - "input_types": [], - "list": false, - "list_add_label": "Add More", - "name": "model_kwargs", - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_input": true, - "type": "dict", - "value": {} - }, - "model_name": { - "_input_type": "DropdownInput", - "advanced": false, - "combobox": true, - "dialog_inputs": {}, - "display_name": "Model Name", - "dynamic": false, - "external_options": {}, - "info": "To see the model names, first choose a provider. Then, enter your API key and click the refresh button next to the model name.", - "input_types": [], - "name": "model_name", - "options": [ - "gpt-4o-mini", - "gpt-4o", - "gpt-4.1", - "gpt-4.1-mini", - "gpt-4.1-nano", - "gpt-4-turbo", - "gpt-4-turbo-preview", - "gpt-4", - "gpt-3.5-turbo", - "gpt-5", - "gpt-5-mini", - "gpt-5-nano", - "gpt-5-chat-latest", - "o1", - "o3-mini", - "o3", - "o3-pro", - "o4-mini", - "o4-mini-high" - ], - "options_metadata": [], - "placeholder": "", - "real_time_refresh": false, - "required": false, - "show": true, - "title_case": false, - "toggle": false, - "tool_mode": false, - "trace_as_metadata": true, - "type": "str", - "value": "gpt-4.1" - }, "n_messages": { "_input_type": "IntInput", "advanced": true, @@ -2057,26 +1944,6 @@ "type": "int", "value": 100 }, - "openai_api_base": { - "_input_type": "StrInput", - "advanced": true, - "display_name": "OpenAI API Base", - "dynamic": false, - "info": "The base URL of the OpenAI API. Defaults to https://api.openai.com/v1. You can change this to use other APIs like JinaChat, LocalAI and Prem.", - "input_types": [], - "list": false, - "list_add_label": "Add More", - "load_from_db": false, - "name": "openai_api_base", - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "type": "str", - "value": "" - }, "output_schema": { "_input_type": "TableInput", "advanced": true, @@ -2140,25 +2007,6 @@ "type": "table", "value": [] }, - "seed": { - "_input_type": "IntInput", - "advanced": true, - "display_name": "Seed", - "dynamic": false, - "info": "The seed controls the reproducibility of the job.", - "input_types": [], - "list": false, - "list_add_label": "Add More", - "name": "seed", - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "type": "int", - "value": 1 - }, "system_prompt": { "_input_type": "MultilineInput", "advanced": false, @@ -2184,54 +2032,6 @@ "type": "str", "value": "You are a helpful assistant that can use tools to answer questions and perform tasks." 
}, - "temperature": { - "_input_type": "SliderInput", - "advanced": true, - "display_name": "Temperature", - "dynamic": false, - "info": "", - "input_types": [], - "max_label": "", - "max_label_icon": "", - "min_label": "", - "min_label_icon": "", - "name": "temperature", - "placeholder": "", - "range_spec": { - "max": 1, - "min": 0, - "step": 0.01, - "step_type": "float" - }, - "required": false, - "show": true, - "slider_buttons": false, - "slider_buttons_options": [], - "slider_input": false, - "title_case": false, - "tool_mode": false, - "type": "slider", - "value": 0.1 - }, - "timeout": { - "_input_type": "IntInput", - "advanced": true, - "display_name": "Timeout", - "dynamic": false, - "info": "The timeout for requests to OpenAI completion API.", - "input_types": [], - "list": false, - "list_add_label": "Add More", - "name": "timeout", - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "type": "int", - "value": 700 - }, "tools": { "_input_type": "HandleInput", "advanced": false, @@ -2281,14 +2081,14 @@ "dragging": false, "id": "Agent-crjWf", "measured": { - "height": 594, + "height": 429, "width": 320 }, "position": { "x": 1686.5732118555798, "y": 317.94354236557473 }, - "selected": false, + "selected": true, "type": "genericNode" }, { @@ -2417,7 +2217,7 @@ ], "frozen": false, "icon": "brain-circuit", - "last_updated": "2025-09-26T05:15:05.784Z", + "last_updated": "2025-09-26T06:57:25.122Z", "legacy": false, "metadata": { "code_hash": "bb5f8714781b", @@ -2722,7 +2522,7 @@ } ], "viewport": { - "x": -234.8770309457758, + "x": -337.8770309457759, "y": 153.7254076573895, "zoom": 0.6026322796158203 } diff --git a/src/main.py b/src/main.py index c4039b45..10592e47 100644 --- a/src/main.py +++ b/src/main.py @@ -335,6 +335,10 @@ async def _ingest_default_documents_langflow(services, file_paths): settings=None, # Use default ingestion settings jwt_token=effective_jwt, # Use JWT token (anonymous if needed) delete_after_ingest=True, # Clean up after ingestion + owner=None, + owner_name=anonymous_user.name, + owner_email=anonymous_user.email, + connector_type="system_default", ) logger.info( diff --git a/src/services/langflow_file_service.py b/src/services/langflow_file_service.py index 41971226..7ffdd3aa 100644 --- a/src/services/langflow_file_service.py +++ b/src/services/langflow_file_service.py @@ -98,7 +98,7 @@ class LangflowFileService: # Pass metadata via tweaks to OpenSearch component metadata_tweaks = [] - if owner: + if owner or owner is None: metadata_tweaks.append({"key": "owner", "value": owner}) if owner_name: metadata_tweaks.append({"key": "owner_name", "value": owner_name})
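
For reference, a sketch of the docs_metadata behavior the commit message describes: the OpenSearch component now accepts either the manual key/value table rows or a Data object delivered by an upstream component, flattening both into one metadata dict before ingestion. The snippet below is a minimal standalone approximation of that normalization, not code from the patch; the Data class here is a stand-in assumed only for this example (the real component uses lfx.schema.data.Data and logs each step via logger.info).

    from dataclasses import dataclass, field
    from typing import Any


    @dataclass
    class Data:
        # Stand-in for lfx.schema.data.Data: just a payload dict.
        data: dict[str, Any] = field(default_factory=dict)


    def normalize_docs_metadata(docs_metadata: list[Any]) -> dict[str, Any]:
        """Flatten docs_metadata into extra fields applied to every ingested document."""
        additional_metadata: dict[str, Any] = {}
        if not docs_metadata:
            return additional_metadata
        if isinstance(docs_metadata[-1], Data):
            # An upstream component delivered a Data object: use its payload directly.
            additional_metadata.update(docs_metadata[-1].data)
        else:
            # Manual table input: rows shaped like {"key": ..., "value": ...}.
            for item in docs_metadata:
                if isinstance(item, dict) and "key" in item and "value" in item:
                    additional_metadata[item["key"]] = item["value"]
        return additional_metadata


    print(normalize_docs_metadata([{"key": "owner", "value": "user123"}]))
    print(normalize_docs_metadata([Data(data={"connector_type": "system_default"})]))

With the docker-compose.yml changes above, both services build from the local Dockerfiles rather than pulling remote images; a typical invocation is docker compose build openrag-backend openrag-frontend followed by docker compose up.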