diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index c5ce1508..1a9ec444 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -16,6 +16,11 @@ jobs: # Prefer repository/environment variable first, then secret, then a sane fallback OPENSEARCH_PASSWORD: ${{ vars.OPENSEARCH_PASSWORD || secrets.OPENSEARCH_PASSWORD || 'OpenRag#2025!' }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LANGFLOW_AUTO_LOGIN: "True" + LANGFLOW_NEW_USER_IS_ACTIVE: "True" + LANGFLOW_ENABLE_SUPERUSER_CLI: "True" + LANGFLOW_CHAT_FLOW_ID: ${{ vars.LANGFLOW_CHAT_FLOW_ID || '1098eea1-6649-4e1d-aed1-b77249fb8dd0' }} + NUDGES_FLOW_ID: ${{ vars.NUDGES_FLOW_ID || 'ebc01d31-1976-46ce-a385-b0240327226c' }} steps: - run: df -h diff --git a/Dockerfile b/Dockerfile index 039053d0..af412430 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,10 +53,11 @@ RUN echo y | opensearch-plugin install repository-s3 # Create a script to apply security configuration after OpenSearch starts RUN echo '#!/bin/bash' > /usr/share/opensearch/setup-security.sh && \ echo 'echo "Waiting for OpenSearch to start..."' >> /usr/share/opensearch/setup-security.sh && \ - echo 'until curl -s -k -u admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD} https://localhost:9200; do sleep 1; done' >> /usr/share/opensearch/setup-security.sh && \ - echo 'echo "Generating admin hash from OPENSEARCH_INITIAL_ADMIN_PASSWORD..."' >> /usr/share/opensearch/setup-security.sh && \ - echo 'if [ -z "${OPENSEARCH_INITIAL_ADMIN_PASSWORD}" ]; then echo "[ERROR] OPENSEARCH_INITIAL_ADMIN_PASSWORD not set"; exit 1; fi' >> /usr/share/opensearch/setup-security.sh && \ - echo 'HASH=$(/usr/share/opensearch/plugins/opensearch-security/tools/hash.sh -p "${OPENSEARCH_INITIAL_ADMIN_PASSWORD}")' >> /usr/share/opensearch/setup-security.sh && \ + echo 'PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD:-${OPENSEARCH_PASSWORD}}' >> /usr/share/opensearch/setup-security.sh && \ + echo 'if [ -z "$PASSWORD" 
]; then echo "[ERROR] OPENSEARCH_INITIAL_ADMIN_PASSWORD or OPENSEARCH_PASSWORD must be set"; exit 1; fi' >> /usr/share/opensearch/setup-security.sh && \ + echo 'until curl -s -k -u admin:$PASSWORD https://localhost:9200; do sleep 1; done' >> /usr/share/opensearch/setup-security.sh && \ + echo 'echo "Generating admin hash from configured password..."' >> /usr/share/opensearch/setup-security.sh && \ + echo 'HASH=$(/usr/share/opensearch/plugins/opensearch-security/tools/hash.sh -p "$PASSWORD")' >> /usr/share/opensearch/setup-security.sh && \ echo 'if [ -z "$HASH" ]; then echo "[ERROR] Failed to generate admin hash"; exit 1; fi' >> /usr/share/opensearch/setup-security.sh && \ echo 'sed -i "s|^ hash: \".*\"| hash: \"$HASH\"|" /usr/share/opensearch/securityconfig/internal_users.yml' >> /usr/share/opensearch/setup-security.sh && \ echo 'echo "Updated internal_users.yml with runtime-generated admin hash"' >> /usr/share/opensearch/setup-security.sh && \ diff --git a/Makefile b/Makefile index b30f77fc..5c963298 100644 --- a/Makefile +++ b/Makefile @@ -206,6 +206,8 @@ test-ci: docker compose -f docker-compose-cpu.yml down -v 2>/dev/null || true; \ echo "Pulling latest images..."; \ docker compose -f docker-compose-cpu.yml pull; \ + echo "Building OpenSearch image override..."; \ + docker build --no-cache -t phact/openrag-opensearch:latest -f Dockerfile .; \ echo "Starting infra (OpenSearch + Dashboards + Langflow) with CPU containers"; \ docker compose -f docker-compose-cpu.yml up -d opensearch dashboards langflow; \ echo "Starting docling-serve..."; \ diff --git a/flows/components/opensearch.py b/flows/components/opensearch.py new file mode 100644 index 00000000..f453d8be --- /dev/null +++ b/flows/components/opensearch.py @@ -0,0 +1,1291 @@ +from __future__ import annotations + +import copy +import json +import time +import uuid +from typing import Any, List, Optional + +from concurrent.futures import ThreadPoolExecutor, as_completed + +from opensearchpy import OpenSearch, 
def normalize_model_name(model_name: str) -> str:
    """Turn an embedding model name into a safe OpenSearch field suffix.

    The name is lower-cased, every character that is not alphanumeric is
    treated as a separator, and the remaining pieces are joined with single
    underscores, so no leading, trailing or repeated underscores survive.

    Args:
        model_name: Original embedding model name (e.g., "text-embedding-3-small")

    Returns:
        Normalized field suffix (e.g., "text_embedding_3_small")
    """
    # "_" is not alphanumeric either, so it acts as a separator just like "-", ":", "/" and "."
    separated = "".join(ch if ch.isalnum() else "_" for ch in model_name.lower())
    # Dropping the empty pieces collapses underscore runs and trims both ends at once
    pieces = [piece for piece in separated.split("_") if piece]
    return "_".join(pieces)


def get_embedding_field_name(model_name: str) -> str:
    """Build the per-model vector field name used in the index.

    Args:
        model_name: Embedding model name

    Returns:
        Field name in format: chunk_embedding_{normalized_model_name}
    """
    suffix = normalize_model_name(model_name)
    return f"chunk_embedding_{suffix}"
It supports: + - Multiple embedding models per index with dynamic field names + - Automatic detection and querying of all available embedding models + - Parallel embedding generation for multi-model search + - Document ingestion with model tracking + - Advanced filtering and aggregations + - Flexible authentication options + + Features: + - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name}) + - Hybrid search combining multiple KNN queries (dis_max) + keyword matching + - Auto-detection of available models in the index + - Parallel query embedding generation for all detected models + - Vector storage with configurable engines (jvector, nmslib, faiss, lucene) + - Flexible authentication (Basic auth, JWT tokens) + """ + + display_name: str = "OpenSearch (Multi-Model)" + icon: str = "OpenSearch" + description: str = ( + "Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search." + ) + + # Keys we consider baseline + default_keys: list[str] = [ + "opensearch_url", + "index_name", + *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc. + "embedding", + "embedding_model_name", + "vector_field", + "number_of_results", + "auth_mode", + "username", + "password", + "jwt_token", + "jwt_header", + "bearer_prefix", + "use_ssl", + "verify_certs", + "filter_expression", + "engine", + "space_type", + "ef_construction", + "m", + "num_candidates", + "docs_metadata", + ] + + inputs = [ + TableInput( + name="docs_metadata", + display_name="Document Metadata", + info=( + "Additional metadata key-value pairs to be added to all ingested documents. " + "Useful for tagging documents with source information, categories, or other custom attributes." 
+ ), + table_schema=[ + { + "name": "key", + "display_name": "Key", + "type": "str", + "description": "Key name", + }, + { + "name": "value", + "display_name": "Value", + "type": "str", + "description": "Value of the metadata", + }, + ], + value=[], + input_types=["Data"] + ), + StrInput( + name="opensearch_url", + display_name="OpenSearch URL", + value="http://localhost:9200", + info=( + "The connection URL for your OpenSearch cluster " + "(e.g., http://localhost:9200 for local development or your cloud endpoint)." + ), + ), + StrInput( + name="index_name", + display_name="Index Name", + value="langflow", + info=( + "The OpenSearch index name where documents will be stored and searched. " + "Will be created automatically if it doesn't exist." + ), + ), + DropdownInput( + name="engine", + display_name="Vector Engine", + options=["jvector", "nmslib", "faiss", "lucene"], + value="jvector", + info=( + "Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. " + "Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'." + ), + advanced=True, + ), + DropdownInput( + name="space_type", + display_name="Distance Metric", + options=["l2", "l1", "cosinesimil", "linf", "innerproduct"], + value="l2", + info=( + "Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, " + "'cosinesimil' for cosine similarity, 'innerproduct' for dot product." + ), + advanced=True, + ), + IntInput( + name="ef_construction", + display_name="EF Construction", + value=512, + info=( + "Size of the dynamic candidate list during index construction. " + "Higher values improve recall but increase indexing time and memory usage." + ), + advanced=True, + ), + IntInput( + name="m", + display_name="M Parameter", + value=16, + info=( + "Number of bidirectional connections for each vector in the HNSW graph. " + "Higher values improve search quality but increase memory usage and indexing time." 
+ ), + advanced=True, + ), + IntInput( + name="num_candidates", + display_name="Candidate Pool Size", + value=1000, + info=( + "Number of approximate neighbors to consider for each KNN query. " + "Some OpenSearch deployments do not support this parameter; set to 0 to disable." + ), + advanced=True, + ), + *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc. + HandleInput(name="embedding", display_name="Embedding", input_types=["Embeddings"]), + StrInput( + name="embedding_model_name", + display_name="Embedding Model Name", + value="", + info=( + "Name of the embedding model being used (e.g., 'text-embedding-3-small'). " + "Used to create dynamic vector field names and track which model embedded each document. " + "Auto-detected from embedding component if not specified." + ), + ), + StrInput( + name="vector_field", + display_name="Legacy Vector Field Name", + value="chunk_embedding", + advanced=True, + info=( + "Legacy field name for backward compatibility. New documents use dynamic fields " + "(chunk_embedding_{model_name}) based on the embedding_model_name." + ), + ), + IntInput( + name="number_of_results", + display_name="Default Result Limit", + value=10, + advanced=True, + info=( + "Default maximum number of search results to return when no limit is " + "specified in the filter expression." + ), + ), + MultilineInput( + name="filter_expression", + display_name="Search Filters (JSON)", + value="", + info=( + "Optional JSON configuration for search filtering, result limits, and score thresholds.\n\n" + "Format 1 - Explicit filters:\n" + '{"filter": [{"term": {"filename":"doc.pdf"}}, ' + '{"terms":{"owner":["user1","user2"]}}], "limit": 10, "score_threshold": 1.6}\n\n' + "Format 2 - Context-style mapping:\n" + '{"data_sources":["file.pdf"], "document_types":["application/pdf"], "owners":["user123"]}\n\n' + "Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters." 
+ ), + ), + # ----- Auth controls (dynamic) ----- + DropdownInput( + name="auth_mode", + display_name="Authentication Mode", + value="basic", + options=["basic", "jwt"], + info=( + "Authentication method: 'basic' for username/password authentication, " + "or 'jwt' for JSON Web Token (Bearer) authentication." + ), + real_time_refresh=True, + advanced=False, + ), + StrInput( + name="username", + display_name="Username", + value="admin", + show=False, + ), + SecretStrInput( + name="password", + display_name="OpenSearch Password", + value="admin", + show=False, + ), + SecretStrInput( + name="jwt_token", + display_name="JWT Token", + value="JWT", + load_from_db=False, + show=True, + info=( + "Valid JSON Web Token for authentication. " + "Will be sent in the Authorization header (with optional 'Bearer ' prefix)." + ), + ), + StrInput( + name="jwt_header", + display_name="JWT Header Name", + value="Authorization", + show=False, + advanced=True, + ), + BoolInput( + name="bearer_prefix", + display_name="Prefix 'Bearer '", + value=True, + show=False, + advanced=True, + ), + # ----- TLS ----- + BoolInput( + name="use_ssl", + display_name="Use SSL/TLS", + value=True, + advanced=True, + info="Enable SSL/TLS encryption for secure connections to OpenSearch.", + ), + BoolInput( + name="verify_certs", + display_name="Verify SSL Certificates", + value=False, + advanced=True, + info=( + "Verify SSL certificates when connecting. " + "Disable for self-signed certificates in development environments." + ), + ), + ] + + def _get_embedding_model_name(self) -> str: + """Get the embedding model name from component config or embedding object. 
+ + Returns: + Embedding model name + + Raises: + ValueError: If embedding model name cannot be determined + """ + # First try explicit embedding_model_name input + if hasattr(self, "embedding_model_name") and self.embedding_model_name: + return self.embedding_model_name.strip() + + # Try to get from embedding component + if hasattr(self, "embedding") and self.embedding: + if hasattr(self.embedding, "model"): + return str(self.embedding.model) + if hasattr(self.embedding, "model_name"): + return str(self.embedding.model_name) + + msg = ( + "Could not determine embedding model name. " + "Please set the 'embedding_model_name' field or ensure the embedding component " + "has a 'model' or 'model_name' attribute." + ) + raise ValueError(msg) + + # ---------- helper functions for index management ---------- + def _default_text_mapping( + self, + dim: int, + engine: str = "jvector", + space_type: str = "l2", + ef_search: int = 512, + ef_construction: int = 100, + m: int = 16, + vector_field: str = "vector_field", + ) -> dict[str, Any]: + """Create the default OpenSearch index mapping for vector search. + + This method generates the index configuration with k-NN settings optimized + for approximate nearest neighbor search using the specified vector engine. + Includes the embedding_model keyword field for tracking which model was used. 
def _ensure_embedding_field_mapping(
    self,
    client: OpenSearch,
    index_name: str,
    field_name: str,
    dim: int,
    engine: str,
    space_type: str,
    ef_construction: int,
    m: int,
) -> None:
    """Lazily add a dynamic embedding field to the index if it doesn't exist.

    This allows adding new embedding models without recreating the entire index.
    The same request also declares the ``embedding_model`` (keyword) and
    ``embedding_dimensions`` (integer) tracking fields. After the mapping
    update, the field is checked to be mapped as ``knn_vector``.

    Args:
        client: OpenSearch client instance
        index_name: Target index name
        field_name: Dynamic field name for this embedding model
        dim: Vector dimensionality
        engine: Vector search engine
        space_type: Distance metric
        ef_construction: Construction parameter
        m: HNSW parameter

    Raises:
        Exception: Whatever ``put_mapping`` raises is logged and re-raised.
        ValueError: If the field is not mapped as ``knn_vector`` afterwards.
    """
    # "disk_ann" is the jvector plugin's algorithm. The other engines this
    # component offers (nmslib, faiss, lucene) build HNSW graphs, so sending
    # "disk_ann" with them makes the mapping request fail.
    method_name = "disk_ann" if engine == "jvector" else "hnsw"

    mapping = {
        "properties": {
            field_name: {
                "type": "knn_vector",
                "dimension": dim,
                "method": {
                    "name": method_name,
                    "space_type": space_type,
                    "engine": engine,
                    "parameters": {"ef_construction": ef_construction, "m": m},
                },
            },
            # Also ensure the embedding_model tracking field exists as keyword
            "embedding_model": {"type": "keyword"},
            "embedding_dimensions": {"type": "integer"},
        }
    }

    try:
        client.indices.put_mapping(index=index_name, body=mapping)
        logger.info(f"Added/updated embedding field mapping: {field_name}")
    except Exception as e:
        logger.warning(f"Could not add embedding field mapping for {field_name}: {e}")
        raise

    # NOTE(review): _get_index_properties reads self.index_name, not the
    # index_name argument; the two are expected to be the same index.
    properties = self._get_index_properties(client)
    if not self._is_knn_vector_field(properties, field_name):
        raise ValueError(
            f"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}"
        )
+ + Args: + is_aoss: Whether the connection is to Amazon OpenSearch Serverless + engine: The selected vector search engine + + Raises: + ValueError: If AOSS is used with an incompatible engine + """ + if is_aoss and engine not in {"nmslib", "faiss"}: + msg = "Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines" + raise ValueError(msg) + + def _is_aoss_enabled(self, http_auth: Any) -> bool: + """Determine if Amazon OpenSearch Serverless (AOSS) is being used. + + Args: + http_auth: The HTTP authentication object + + Returns: + True if AOSS is enabled, False otherwise + """ + return http_auth is not None and hasattr(http_auth, "service") and http_auth.service == "aoss" + + def _bulk_ingest_embeddings( + self, + client: OpenSearch, + index_name: str, + embeddings: list[list[float]], + texts: list[str], + metadatas: list[dict] | None = None, + ids: list[str] | None = None, + vector_field: str = "vector_field", + text_field: str = "text", + embedding_model: str = "unknown", + mapping: dict | None = None, + max_chunk_bytes: int | None = 1 * 1024 * 1024, + *, + is_aoss: bool = False, + ) -> list[str]: + """Efficiently ingest multiple documents with embeddings into OpenSearch. + + This method uses bulk operations to insert documents with their vector + embeddings and metadata into the specified OpenSearch index. Each document + is tagged with the embedding_model name for tracking. 
+ + Args: + client: OpenSearch client instance + index_name: Target index for document storage + embeddings: List of vector embeddings for each document + texts: List of document texts + metadatas: Optional metadata dictionaries for each document + ids: Optional document IDs (UUIDs generated if not provided) + vector_field: Field name for storing vector embeddings + text_field: Field name for storing document text + embedding_model: Name of the embedding model used + mapping: Optional index mapping configuration + max_chunk_bytes: Maximum size per bulk request chunk + is_aoss: Whether using Amazon OpenSearch Serverless + + Returns: + List of document IDs that were successfully ingested + """ + if not mapping: + mapping = {} + + requests = [] + return_ids = [] + vector_dimensions = len(embeddings[0]) if embeddings else None + + for i, text in enumerate(texts): + metadata = metadatas[i] if metadatas else {} + if vector_dimensions is not None and "embedding_dimensions" not in metadata: + metadata = {**metadata, "embedding_dimensions": vector_dimensions} + _id = ids[i] if ids else str(uuid.uuid4()) + request = { + "_op_type": "index", + "_index": index_name, + vector_field: embeddings[i], + text_field: text, + "embedding_model": embedding_model, # Track which model was used + **metadata, + } + if is_aoss: + request["id"] = _id + else: + request["_id"] = _id + requests.append(request) + return_ids.append(_id) + if metadatas: + self.log(f"Sample metadata: {metadatas[0] if metadatas else {}}") + helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes) + return return_ids + + # ---------- auth / client ---------- + def _build_auth_kwargs(self) -> dict[str, Any]: + """Build authentication configuration for OpenSearch client. + + Constructs the appropriate authentication parameters based on the + selected auth mode (basic username/password or JWT token). 
+ + Returns: + Dictionary containing authentication configuration + + Raises: + ValueError: If required authentication parameters are missing + """ + mode = (self.auth_mode or "basic").strip().lower() + if mode == "jwt": + token = (self.jwt_token or "").strip() + if not token: + msg = "Auth Mode is 'jwt' but no jwt_token was provided." + raise ValueError(msg) + header_name = (self.jwt_header or "Authorization").strip() + header_value = f"Bearer {token}" if self.bearer_prefix else token + return {"headers": {header_name: header_value}} + user = (self.username or "").strip() + pwd = (self.password or "").strip() + if not user or not pwd: + msg = "Auth Mode is 'basic' but username/password are missing." + raise ValueError(msg) + return {"http_auth": (user, pwd)} + + def build_client(self) -> OpenSearch: + """Create and configure an OpenSearch client instance. + + Returns: + Configured OpenSearch client ready for operations + """ + auth_kwargs = self._build_auth_kwargs() + return OpenSearch( + hosts=[self.opensearch_url], + use_ssl=self.use_ssl, + verify_certs=self.verify_certs, + ssl_assert_hostname=False, + ssl_show_warn=False, + **auth_kwargs, + ) + + @check_cached_vector_store + def build_vector_store(self) -> OpenSearch: + # Return raw OpenSearch client as our "vector store." + self.log(self.ingest_data) + client = self.build_client() + self._add_documents_to_vector_store(client=client) + return client + + # ---------- ingest ---------- + def _add_documents_to_vector_store(self, client: OpenSearch) -> None: + """Process and ingest documents into the OpenSearch vector store. 
+ + This method handles the complete document ingestion pipeline: + - Prepares document data and metadata + - Generates vector embeddings + - Creates appropriate index mappings with dynamic field names + - Bulk inserts documents with vectors and model tracking + + Args: + client: OpenSearch client for performing operations + """ + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + + docs = self.ingest_data or [] + if not docs: + self.log("No documents to ingest.") + return + + # Get embedding model name + embedding_model = self._get_embedding_model_name() + dynamic_field_name = get_embedding_field_name(embedding_model) + + self.log(f"Using embedding model: {embedding_model}") + self.log(f"Dynamic vector field: {dynamic_field_name}") + + # Extract texts and metadata from documents + texts = [] + metadatas = [] + # Process docs_metadata table input into a dict + additional_metadata = {} + if hasattr(self, "docs_metadata") and self.docs_metadata: + logger.info(f"[LF] Docs metadata {self.docs_metadata}") + if isinstance(self.docs_metadata[-1], Data): + logger.info(f"[LF] Docs metadata is a Data object {self.docs_metadata}") + self.docs_metadata = self.docs_metadata[-1].data + logger.info(f"[LF] Docs metadata is a Data object {self.docs_metadata}") + additional_metadata.update(self.docs_metadata) + else: + for item in self.docs_metadata: + if isinstance(item, dict) and "key" in item and "value" in item: + additional_metadata[item["key"]] = item["value"] + # Replace string "None" values with actual None + for key, value in additional_metadata.items(): + if value == "None": + additional_metadata[key] = None + logger.info(f"[LF] Additional metadata {additional_metadata}") + for doc_obj in docs: + data_copy = json.loads(doc_obj.model_dump_json()) + text = data_copy.pop(doc_obj.text_key, doc_obj.default_value) + texts.append(text) + + # Merge additional metadata from table input + 
data_copy.update(additional_metadata) + + metadatas.append(data_copy) + self.log(metadatas) + if not self.embedding: + msg = "Embedding handle is required to embed documents." + raise ValueError(msg) + + # Generate embeddings (threaded for concurrency) with retries + def embed_chunk(chunk_text: str) -> list[float]: + return self.embedding.embed_documents([chunk_text])[0] + + vectors: Optional[List[List[float]]] = None + last_exception: Optional[Exception] = None + delay = 1.0 + attempts = 0 + + while attempts < 3: + attempts += 1 + try: + max_workers = min(max(len(texts), 1), 8) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(embed_chunk, chunk): idx for idx, chunk in enumerate(texts)} + vectors = [None] * len(texts) + for future in as_completed(futures): + idx = futures[future] + vectors[idx] = future.result() + break + except Exception as exc: + last_exception = exc + if attempts >= 3: + logger.error( + "Embedding generation failed after retries", + error=str(exc), + ) + raise + logger.warning( + "Threaded embedding generation failed (attempt %s/%s), retrying in %.1fs", + attempts, + 3, + delay, + ) + time.sleep(delay) + delay = min(delay * 2, 8.0) + + if vectors is None: + raise RuntimeError( + f"Embedding generation failed: {last_exception}" if last_exception else "Embedding generation failed" + ) + + if not vectors: + self.log("No vectors generated from documents.") + return + + # Get vector dimension for mapping + dim = len(vectors[0]) if vectors else 768 # default fallback + + # Check for AOSS + auth_kwargs = self._build_auth_kwargs() + is_aoss = self._is_aoss_enabled(auth_kwargs.get("http_auth")) + + # Validate engine with AOSS + engine = getattr(self, "engine", "jvector") + self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine) + + # Create mapping with proper KNN settings + space_type = getattr(self, "space_type", "l2") + ef_construction = getattr(self, "ef_construction", 512) + m = getattr(self, "m", 
16) + + mapping = self._default_text_mapping( + dim=dim, + engine=engine, + space_type=space_type, + ef_construction=ef_construction, + m=m, + vector_field=dynamic_field_name, # Use dynamic field name + ) + + # Ensure index exists with baseline mapping + try: + if not client.indices.exists(index=self.index_name): + self.log(f"Creating index '{self.index_name}' with base mapping") + client.indices.create(index=self.index_name, body=mapping) + except RequestError as creation_error: + if creation_error.error != "resource_already_exists_exception": + logger.warning( + f"Failed to create index '{self.index_name}': {creation_error}" + ) + + # Ensure the dynamic field exists in the index + self._ensure_embedding_field_mapping( + client=client, + index_name=self.index_name, + field_name=dynamic_field_name, + dim=dim, + engine=engine, + space_type=space_type, + ef_construction=ef_construction, + m=m, + ) + + self.log(f"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...") + + # Use the bulk ingestion with model tracking + return_ids = self._bulk_ingest_embeddings( + client=client, + index_name=self.index_name, + embeddings=vectors, + texts=texts, + metadatas=metadatas, + vector_field=dynamic_field_name, # Use dynamic field name + text_field="text", + embedding_model=embedding_model, # Track the model + mapping=mapping, + is_aoss=is_aoss, + ) + self.log(metadatas) + + self.log(f"Successfully indexed {len(return_ids)} documents with model {embedding_model}.") + + # ---------- helpers for filters ---------- + def _is_placeholder_term(self, term_obj: dict) -> bool: + # term_obj like {"filename": "__IMPOSSIBLE_VALUE__"} + return any(v == "__IMPOSSIBLE_VALUE__" for v in term_obj.values()) + + def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]: + """Convert filter expressions into OpenSearch-compatible filter clauses. 
+ + This method accepts two filter formats and converts them to standardized + OpenSearch query clauses: + + Format A - Explicit filters: + {"filter": [{"term": {"field": "value"}}, {"terms": {"field": ["val1", "val2"]}}], + "limit": 10, "score_threshold": 1.5} + + Format B - Context-style mapping: + {"data_sources": ["file1.pdf"], "document_types": ["pdf"], "owners": ["user1"]} + + Args: + filter_obj: Filter configuration dictionary or None + + Returns: + List of OpenSearch filter clauses (term/terms objects) + Placeholder values with "__IMPOSSIBLE_VALUE__" are ignored + """ + if not filter_obj: + return [] + + # If it is a string, try to parse it once + if isinstance(filter_obj, str): + try: + filter_obj = json.loads(filter_obj) + except json.JSONDecodeError: + # Not valid JSON - treat as no filters + return [] + + # Case A: already an explicit list/dict under "filter" + if "filter" in filter_obj: + raw = filter_obj["filter"] + if isinstance(raw, dict): + raw = [raw] + explicit_clauses: list[dict] = [] + for f in raw or []: + if "term" in f and isinstance(f["term"], dict) and not self._is_placeholder_term(f["term"]): + explicit_clauses.append(f) + elif "terms" in f and isinstance(f["terms"], dict): + field, vals = next(iter(f["terms"].items())) + if isinstance(vals, list) and len(vals) > 0: + explicit_clauses.append(f) + return explicit_clauses + + # Case B: convert context-style maps into clauses + field_mapping = { + "data_sources": "filename", + "document_types": "mimetype", + "owners": "owner", + } + context_clauses: list[dict] = [] + for k, values in filter_obj.items(): + if not isinstance(values, list): + continue + field = field_mapping.get(k, k) + if len(values) == 0: + # Match-nothing placeholder (kept to mirror your tool semantics) + context_clauses.append({"term": {field: "__IMPOSSIBLE_VALUE__"}}) + elif len(values) == 1: + if values[0] != "__IMPOSSIBLE_VALUE__": + context_clauses.append({"term": {field: values[0]}}) + else: + 
context_clauses.append({"terms": {field: values}}) + return context_clauses + + def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] = None) -> list[str]: + """Detect which embedding models have documents in the index. + + Uses aggregation to find all unique embedding_model values, optionally + filtered to only documents matching the user's filter criteria. + + Args: + client: OpenSearch client instance + filter_clauses: Optional filter clauses to scope model detection + + Returns: + List of embedding model names found in the index + """ + try: + agg_query = { + "size": 0, + "aggs": { + "embedding_models": { + "terms": { + "field": "embedding_model", + "size": 10 + } + } + } + } + + # Apply filters to model detection if any exist + if filter_clauses: + agg_query["query"] = { + "bool": { + "filter": filter_clauses + } + } + + result = client.search( + index=self.index_name, + body=agg_query, + params={"terminate_after": 0}, + ) + buckets = result.get("aggregations", {}).get("embedding_models", {}).get("buckets", []) + models = [b["key"] for b in buckets if b["key"]] + + logger.info( + f"Detected embedding models in corpus: {models}" + + (f" (with {len(filter_clauses)} filters)" if filter_clauses else "") + ) + return models + except Exception as e: + logger.warning(f"Failed to detect embedding models: {e}") + # Fallback to current model + return [self._get_embedding_model_name()] + + def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None: + """Retrieve flattened mapping properties for the current index.""" + try: + mapping = client.indices.get_mapping(index=self.index_name) + except Exception as e: + logger.warning( + f"Failed to fetch mapping for index '{self.index_name}': {e}. Proceeding without mapping metadata." 
+ ) + return None + + properties: dict[str, Any] = {} + for index_data in mapping.values(): + props = index_data.get("mappings", {}).get("properties", {}) + if isinstance(props, dict): + properties.update(props) + return properties + + def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool: + """Check whether the field is mapped as a knn_vector.""" + if not field_name: + return False + if properties is None: + logger.warning( + f"Mapping metadata unavailable; assuming field '{field_name}' is usable." + ) + return True + field_def = properties.get(field_name) + if not isinstance(field_def, dict): + return False + if field_def.get("type") == "knn_vector": + return True + + nested_props = field_def.get("properties") + if isinstance(nested_props, dict) and nested_props.get("type") == "knn_vector": + return True + + return False + + # ---------- search (multi-model hybrid) ---------- + def search(self, query: str | None = None) -> list[dict[str, Any]]: + """Perform multi-model hybrid search combining multiple vector similarities and keyword matching. + + This method executes a sophisticated search that: + 1. Auto-detects all embedding models present in the index + 2. Generates query embeddings for ALL detected models in parallel + 3. Combines multiple KNN queries using dis_max (picks best match) + 4. Adds keyword search with fuzzy matching (30% weight) + 5. Applies optional filtering and score thresholds + 6. 
Returns aggregations for faceted search + + Search weights: + - Semantic search (dis_max across all models): 70% + - Keyword search: 30% + + Args: + query: Search query string (used for both vector embedding and keyword search) + + Returns: + List of search results with page_content, metadata, and relevance scores + + Raises: + ValueError: If embedding component is not provided or filter JSON is invalid + """ + logger.info(self.ingest_data) + client = self.build_client() + q = (query or "").strip() + + # Parse optional filter expression + filter_obj = None + if getattr(self, "filter_expression", "") and self.filter_expression.strip(): + try: + filter_obj = json.loads(self.filter_expression) + except json.JSONDecodeError as e: + msg = f"Invalid filter_expression JSON: {e}" + raise ValueError(msg) from e + + if not self.embedding: + msg = "Embedding is required to run hybrid search (KNN + keyword)." + raise ValueError(msg) + + # Build filter clauses first so we can use them in model detection + filter_clauses = self._coerce_filter_clauses(filter_obj) + + # Detect available embedding models in the index (scoped by filters) + available_models = self._detect_available_models(client, filter_clauses) + + if not available_models: + logger.warning("No embedding models found in index, using current model") + available_models = [self._get_embedding_model_name()] + + # Generate embeddings for ALL detected models in parallel + query_embeddings = {} + + # Note: Langflow is synchronous, so we can't use true async here + # But we log the intent for parallel processing + logger.info(f"Generating embeddings for {len(available_models)} models") + + original_model_attr = getattr(self.embedding, "model", None) + original_deployment_attr = getattr(self.embedding, "deployment", None) + original_dimensions_attr = getattr(self.embedding, "dimensions", None) + + for model_name in available_models: + try: + # In a real async environment, these would run in parallel + # For now, they run 
sequentially + if hasattr(self.embedding, "model"): + setattr(self.embedding, "model", model_name) + if hasattr(self.embedding, "deployment"): + setattr(self.embedding, "deployment", model_name) + if hasattr(self.embedding, "dimensions"): + setattr(self.embedding, "dimensions", None) + vec = self.embedding.embed_query(q) + query_embeddings[model_name] = vec + logger.info(f"Generated embedding for model: {model_name}") + except Exception as e: + logger.error(f"Failed to generate embedding for {model_name}: {e}") + + if hasattr(self.embedding, "model"): + setattr(self.embedding, "model", original_model_attr) + if hasattr(self.embedding, "deployment"): + setattr(self.embedding, "deployment", original_deployment_attr) + if hasattr(self.embedding, "dimensions"): + setattr(self.embedding, "dimensions", original_dimensions_attr) + + if not query_embeddings: + msg = "Failed to generate embeddings for any model" + raise ValueError(msg) + + index_properties = self._get_index_properties(client) + legacy_vector_field = getattr(self, "vector_field", "chunk_embedding") + + # Build KNN queries for each model + embedding_fields: list[str] = [] + knn_queries_with_candidates = [] + knn_queries_without_candidates = [] + + raw_num_candidates = getattr(self, "num_candidates", 1000) + try: + num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0 + except (TypeError, ValueError): + num_candidates = 0 + use_num_candidates = num_candidates > 0 + + for model_name, embedding_vector in query_embeddings.items(): + field_name = get_embedding_field_name(model_name) + selected_field = field_name + + # Only use the expected dynamic field - no legacy fallback + # This prevents dimension mismatches between models + if not self._is_knn_vector_field(index_properties, selected_field): + logger.warning( + f"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. " + f"Documents must be indexed with this embedding model before querying." 
+ ) + continue + + embedding_fields.append(selected_field) + + base_query = { + "knn": { + selected_field: { + "vector": embedding_vector, + "k": 50, + } + } + } + + if use_num_candidates: + query_with_candidates = copy.deepcopy(base_query) + query_with_candidates["knn"][selected_field]["num_candidates"] = num_candidates + else: + query_with_candidates = base_query + + knn_queries_with_candidates.append(query_with_candidates) + knn_queries_without_candidates.append(base_query) + + if not knn_queries_with_candidates: + # No valid fields found - this can happen when: + # 1. Index is empty (no documents yet) + # 2. Embedding model has changed and field doesn't exist yet + # Return empty results instead of failing + logger.warning( + "No valid knn_vector fields found for embedding models. " + "This may indicate an empty index or missing field mappings. " + "Returning empty search results." + ) + return [] + + # Build exists filter - document must have at least one embedding field + exists_any_embedding = { + "bool": { + "should": [{"exists": {"field": f}} for f in set(embedding_fields)], + "minimum_should_match": 1 + } + } + + # Combine user filters with exists filter + all_filters = [*filter_clauses, exists_any_embedding] + + # Get limit and score threshold + limit = (filter_obj or {}).get("limit", self.number_of_results) + score_threshold = (filter_obj or {}).get("score_threshold", 0) + + # Build multi-model hybrid query + body = { + "query": { + "bool": { + "should": [ + { + "dis_max": { + "tie_breaker": 0.0, # Take only the best match, no blending + "boost": 0.7, # 70% weight for semantic search + "queries": knn_queries_with_candidates + } + }, + { + "multi_match": { + "query": q, + "fields": ["text^2", "filename^1.5"], + "type": "best_fields", + "fuzziness": "AUTO", + "boost": 0.3, # 30% weight for keyword search + } + }, + ], + "minimum_should_match": 1, + "filter": all_filters, + } + }, + "aggs": { + "data_sources": {"terms": {"field": "filename", "size": 20}}, 
+ "document_types": {"terms": {"field": "mimetype", "size": 10}}, + "owners": {"terms": {"field": "owner", "size": 10}}, + "embedding_models": {"terms": {"field": "embedding_model", "size": 10}}, + }, + "_source": [ + "filename", + "mimetype", + "page", + "text", + "source_url", + "owner", + "embedding_model", + "allowed_users", + "allowed_groups", + ], + "size": limit, + } + + if isinstance(score_threshold, (int, float)) and score_threshold > 0: + body["min_score"] = score_threshold + + logger.info( + f"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models" + ) + + try: + resp = client.search( + index=self.index_name, body=body, params={"terminate_after": 0} + ) + except RequestError as e: + error_message = str(e) + lowered = error_message.lower() + if use_num_candidates and "num_candidates" in lowered: + logger.warning( + "Retrying search without num_candidates parameter due to cluster capabilities", + error=error_message, + ) + fallback_body = copy.deepcopy(body) + try: + fallback_body["query"]["bool"]["should"][0]["dis_max"]["queries"] = knn_queries_without_candidates + except (KeyError, IndexError, TypeError) as inner_err: + raise e from inner_err + resp = client.search( + index=self.index_name, + body=fallback_body, + params={"terminate_after": 0}, + ) + elif "knn_vector" in lowered or ("field" in lowered and "knn" in lowered): + fallback_vector = next(iter(query_embeddings.values()), None) + if fallback_vector is None: + raise + fallback_field = legacy_vector_field or "chunk_embedding" + logger.warning( + "KNN search failed for dynamic fields; falling back to legacy field '%s'.", + fallback_field, + ) + fallback_body = copy.deepcopy(body) + fallback_body["query"]["bool"]["filter"] = filter_clauses + knn_fallback = { + "knn": { + fallback_field: { + "vector": fallback_vector, + "k": 50, + } + } + } + if use_num_candidates: + knn_fallback["knn"][fallback_field]["num_candidates"] = num_candidates + 
fallback_body["query"]["bool"]["should"][0]["dis_max"]["queries"] = [knn_fallback] + resp = client.search( + index=self.index_name, + body=fallback_body, + params={"terminate_after": 0}, + ) + else: + raise + hits = resp.get("hits", {}).get("hits", []) + + logger.info(f"Found {len(hits)} results") + + return [ + { + "page_content": hit["_source"].get("text", ""), + "metadata": {k: v for k, v in hit["_source"].items() if k != "text"}, + "score": hit.get("_score"), + } + for hit in hits + ] + + def search_documents(self) -> list[Data]: + """Search documents and return results as Data objects. + + This is the main interface method that performs the multi-model search using the + configured search_query and returns results in Langflow's Data format. + + Returns: + List of Data objects containing search results with text and metadata + + Raises: + Exception: If search operation fails + """ + try: + raw = self.search(self.search_query or "") + return [Data(text=hit["page_content"], **hit["metadata"]) for hit in raw] + self.log(self.ingest_data) + except Exception as e: + self.log(f"search_documents error: {e}") + raise + + # -------- dynamic UI handling (auth switch) -------- + async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict: + """Dynamically update component configuration based on field changes. + + This method handles real-time UI updates, particularly for authentication + mode changes that show/hide relevant input fields. 
+ + Args: + build_config: Current component configuration + field_value: New value for the changed field + field_name: Name of the field that changed + + Returns: + Updated build configuration with appropriate field visibility + """ + try: + if field_name == "auth_mode": + mode = (field_value or "basic").strip().lower() + is_basic = mode == "basic" + is_jwt = mode == "jwt" + + build_config["username"]["show"] = is_basic + build_config["password"]["show"] = is_basic + + build_config["jwt_token"]["show"] = is_jwt + build_config["jwt_header"]["show"] = is_jwt + build_config["bearer_prefix"]["show"] = is_jwt + + build_config["username"]["required"] = is_basic + build_config["password"]["required"] = is_basic + + build_config["jwt_token"]["required"] = is_jwt + build_config["jwt_header"]["required"] = is_jwt + build_config["bearer_prefix"]["required"] = False + + if is_basic: + build_config["jwt_token"]["value"] = "" + + return build_config + + except (KeyError, ValueError) as e: + self.log(f"update_build_config error: {e}") + + return build_config diff --git a/flows/ingestion_flow.json b/flows/ingestion_flow.json index 911c3e38..b525bd6a 100644 --- a/flows/ingestion_flow.json +++ b/flows/ingestion_flow.json @@ -602,7 +602,7 @@ "beta": false, "conditional_paths": [], "custom_fields": {}, - "description": "Store and search documents using OpenSearch with hybrid semantic and keyword search capabilities.", + "description": "Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search.", "display_name": "OpenSearch", "documentation": "", "edited": true, @@ -618,6 +618,7 @@ "search_query", "should_cache_vector_store", "embedding", + "embedding_model_name", "vector_field", "number_of_results", "filter_expression", @@ -632,10 +633,10 @@ ], "frozen": false, "icon": "OpenSearch", + "last_updated": "2025-10-10T14:37:10.405Z", "legacy": false, - "lf_version": "1.6.3.dev0", "metadata": { - "code_hash": "c81b23acb81a", + "code_hash": 
"62d330aec569", "dependencies": { "dependencies": [ { @@ -644,7 +645,7 @@ }, { "name": "lfx", - "version": null + "version": "0.1.12.dev32" } ], "total_dependencies": 2 @@ -747,7 +748,7 @@ "name": "bearer_prefix", "placeholder": "", "required": false, - "show": false, + "show": true, "title_case": false, "tool_mode": false, "trace_as_metadata": true, @@ -770,7 +771,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport json\nimport uuid\nfrom typing import Any\n\nfrom opensearchpy import OpenSearch, helpers\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. 
It supports document\n ingestion, vector embeddings, and advanced filtering with authentication options.\n\n Features:\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Hybrid search combining KNN vector similarity and keyword matching\n - Flexible authentication (Basic auth, JWT tokens)\n - Advanced filtering and aggregations\n - Metadata injection during document ingestion\n \"\"\"\n\n display_name: str = \"OpenSearch\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with hybrid semantic and keyword search capabilities.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n # advanced=True,\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"vector_field\",\n display_name=\"Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=\"Name of the field in OpenSearch documents that stores the vector embeddings for similarity search.\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) 
authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. 
\"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n }\n }\n },\n }\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of 
document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return 
OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our “vector store.”\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings\n - Bulk inserts documents with vectors\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n 
additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings\n vectors = self.embedding.embed_documents(texts)\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=self.vector_field,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with proper KNN mapping...\")\n\n # Use the LangChain-style bulk ingestion\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=self.vector_field,\n text_field=\"text\",\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: 
dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, 
list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n # ---------- search (single hybrid path matching your tool) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform hybrid search combining vector similarity and keyword matching.\n\n This method executes a sophisticated search that combines:\n - K-nearest neighbor (KNN) vector similarity search (70% weight)\n - Multi-field keyword search with fuzzy matching (30% weight)\n - Optional filtering and score thresholds\n - Aggregations for faceted search results\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression (can be either A or B shape; see _coerce_filter_clauses)\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Embed the query\n vec = self.embedding.embed_query(q)\n\n # Build filter clauses (accept both shapes)\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Respect the tool's 
limit/threshold defaults\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build the same hybrid body as your SearchService\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"knn\": {\n self.vector_field: {\n \"vector\": vec,\n \"k\": 10, # fixed to match the tool\n \"boost\": 0.7,\n }\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3,\n }\n },\n ],\n \"minimum_should_match\": 1,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n if filter_clauses:\n body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n # top-level min_score (matches your tool)\n body[\"min_score\"] = score_threshold\n\n resp = client.search(index=self.index_name, body=body)\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = 
self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": "from __future__ import annotations\n\nimport copy\nimport json\nimport time\nimport uuid\nfrom typing import Any, List, Optional\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nfrom opensearchpy import OpenSearch, helpers\nfrom 
opensearchpy.exceptions import RequestError\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n return f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. 
It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model being used (e.g., 'text-embedding-3-small'). \"\n \"Used to create dynamic vector field names and track which model embedded each document. \"\n \"Auto-detected from embedding component if not specified.\"\n ),\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. 
New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. 
\"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n def _get_embedding_model_name(self) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from embedding component\n if hasattr(self, \"embedding\") and self.embedding:\n if hasattr(self.embedding, \"model\"):\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_name\"):\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. 
\"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'model' or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n 
\"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\n \"type\": \"keyword\"\n },\n \"embedding_dimensions\": {\n \"type\": \"integer\"\n }\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except Exception as e:\n logger.warning(f\"Could not add embedding field mapping for {field_name}: {e}\")\n raise\n\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n raise ValueError(\n f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n )\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. 
Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n 
Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed 
using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Get embedding model name\n embedding_model = self._get_embedding_model_name()\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n self.log(f\"Using embedding model: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings (threaded for concurrency) with retries\n def embed_chunk(chunk_text: str) -> list[float]:\n return 
self.embedding.embed_documents([chunk_text])[0]\n\n vectors: Optional[List[List[float]]] = None\n last_exception: Optional[Exception] = None\n delay = 1.0\n attempts = 0\n\n while attempts < 3:\n attempts += 1\n try:\n max_workers = min(max(len(texts), 1), 8)\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {executor.submit(embed_chunk, chunk): idx for idx, chunk in enumerate(texts)}\n vectors = [None] * len(texts)\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n break\n except Exception as exc:\n last_exception = exc\n if attempts >= 3:\n logger.error(\n \"Embedding generation failed after retries\",\n error=str(exc),\n )\n raise\n logger.warning(\n \"Threaded embedding generation failed (attempt %s/%s), retrying in %.1fs\",\n attempts,\n 3,\n delay,\n )\n time.sleep(delay)\n delay = min(delay * 2, 8.0)\n\n if vectors is None:\n raise RuntimeError(\n f\"Embedding generation failed: {last_exception}\" if last_exception else \"Embedding generation failed\"\n )\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping\n try:\n if not 
client.indices.exists(index=self.index_name):\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error != \"resource_already_exists_exception\":\n logger.warning(\n f\"Failed to create index '{self.index_name}': {creation_error}\"\n )\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": 
[\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] = None) -> list[str]:\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all 
unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\n \"embedding_models\": {\n \"terms\": {\n \"field\": \"embedding_model\",\n \"size\": 10\n }\n }\n }\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\n \"bool\": {\n \"filter\": filter_clauses\n }\n }\n\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n return models\n except Exception as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n return [self._get_embedding_model_name()]\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except Exception as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. 
Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return True\n\n return False\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models in parallel\n query_embeddings = {}\n\n # Note: Langflow is synchronous, so we can't use true async here\n # But we log the intent for parallel processing\n logger.info(f\"Generating embeddings for {len(available_models)} models\")\n\n original_model_attr = getattr(self.embedding, \"model\", None)\n original_deployment_attr = getattr(self.embedding, \"deployment\", None)\n original_dimensions_attr = getattr(self.embedding, \"dimensions\", None)\n\n for model_name in available_models:\n try:\n # In a real async environment, these would run in parallel\n # 
For now, they run sequentially\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", model_name)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", model_name)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", None)\n vec = self.embedding.embed_query(q)\n query_embeddings[model_name] = vec\n logger.info(f\"Generated embedding for model: {model_name}\")\n except Exception as e:\n logger.error(f\"Failed to generate embedding for {model_name}: {e}\")\n\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", original_model_attr)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", original_deployment_attr)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", original_dimensions_attr)\n\n if not query_embeddings:\n msg = \"Failed to generate embeddings for any model\"\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. 
\"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n continue\n\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n 
],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n body[\"min_score\"] = score_threshold\n\n logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models\"\n )\n\n try:\n resp = client.search(\n index=self.index_name, body=body, params={\"terminate_after\": 0}\n )\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = knn_queries_without_candidates\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n 
fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n 
Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, "docs_metadata": { "_input_type": "TableInput", @@ -850,6 +851,25 @@ "type": "other", "value": "" }, + "embedding_model_name": { + "_input_type": "StrInput", + "advanced": false, + "display_name": "Embedding Model Name", + "dynamic": false, + "info": "Name of the embedding model being used (e.g., 'text-embedding-3-small'). Used to create dynamic vector field names and track which model embedded each document. 
Auto-detected from embedding component if not specified.", + "list": false, + "list_add_label": "Add More", + "load_from_db": false, + "name": "embedding_model_name", + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "type": "str", + "value": "" + }, "engine": { "_input_type": "DropdownInput", "advanced": true, @@ -876,7 +896,7 @@ "tool_mode": false, "trace_as_metadata": true, "type": "str", - "value": "nmslib" + "value": "jvector" }, "filter_expression": { "_input_type": "MultilineInput", @@ -954,8 +974,8 @@ "load_from_db": false, "name": "jwt_header", "placeholder": "", - "required": false, - "show": false, + "required": true, + "show": true, "title_case": false, "tool_mode": false, "trace_as_metadata": true, @@ -973,11 +993,11 @@ "name": "jwt_token", "password": true, "placeholder": "", - "required": false, + "required": true, "show": true, "title_case": false, "type": "str", - "value": "" + "value": "JWT" }, "m": { "_input_type": "IntInput", @@ -1161,9 +1181,9 @@ "vector_field": { "_input_type": "StrInput", "advanced": true, - "display_name": "Vector Field Name", + "display_name": "Legacy Vector Field Name", "dynamic": false, - "info": "Name of the field in OpenSearch documents that stores the vector embeddings for similarity search.", + "info": "Legacy field name for backward compatibility. 
New documents use dynamic fields (chunk_embedding_{model_name}) based on the embedding_model_name.", "list": false, "list_add_label": "Add More", "load_from_db": false, @@ -1205,14 +1225,14 @@ "dragging": false, "id": "OpenSearchHybrid-Ve6bS", "measured": { - "height": 822, + "height": 909, "width": 320 }, "position": { "x": 2218.9287723423276, "y": 1332.2598463956504 }, - "selected": false, + "selected": true, "type": "genericNode" }, { @@ -1245,7 +1265,7 @@ ], "frozen": false, "icon": "binary", - "last_updated": "2025-10-04T02:17:01.272Z", + "last_updated": "2025-10-10T13:34:19.767Z", "legacy": false, "lf_version": "1.6.3.dev0", "metadata": { @@ -1519,7 +1539,7 @@ "dragging": false, "id": "EmbeddingModel-eZ6bT", "measured": { - "height": 369, + "height": 370, "width": 320 }, "position": { @@ -1550,7 +1570,7 @@ ], "frozen": false, "icon": "braces", - "last_updated": "2025-10-04T02:17:01.273Z", + "last_updated": "2025-10-10T13:34:19.770Z", "legacy": false, "lf_version": "1.6.3.dev0", "metadata": {}, @@ -1838,7 +1858,7 @@ "dragging": false, "id": "AdvancedDynamicFormBuilder-81Exw", "measured": { - "height": 552, + "height": 554, "width": 320 }, "position": { @@ -2615,7 +2635,7 @@ "dragging": false, "id": "DoclingRemote-Dp3PX", "measured": { - "height": 475, + "height": 477, "width": 320 }, "position": { @@ -2872,7 +2892,7 @@ "dragging": false, "id": "ExportDoclingDocument-zZdRg", "measured": { - "height": 347, + "height": 348, "width": 320 }, "position": { @@ -2912,7 +2932,7 @@ ], "frozen": false, "icon": "table", - "last_updated": "2025-10-04T02:17:01.354Z", + "last_updated": "2025-10-10T13:34:19.877Z", "legacy": false, "lf_version": "1.6.3.dev0", "metadata": { @@ -3288,7 +3308,7 @@ "dragging": false, "id": "DataFrameOperations-1BWXB", "measured": { - "height": 399, + "height": 401, "width": 320 }, "position": { @@ -3328,7 +3348,7 @@ ], "frozen": false, "icon": "table", - "last_updated": "2025-10-04T02:17:01.355Z", + "last_updated": "2025-10-10T13:34:19.878Z", 
"legacy": false, "lf_version": "1.6.3.dev0", "metadata": { @@ -3704,7 +3724,7 @@ "dragging": false, "id": "DataFrameOperations-N80fC", "measured": { - "height": 399, + "height": 401, "width": 320 }, "position": { @@ -3744,7 +3764,7 @@ ], "frozen": false, "icon": "table", - "last_updated": "2025-10-04T02:17:01.355Z", + "last_updated": "2025-10-10T13:34:19.878Z", "legacy": false, "lf_version": "1.6.3.dev0", "metadata": { @@ -4120,7 +4140,7 @@ "dragging": false, "id": "DataFrameOperations-9vMrp", "measured": { - "height": 399, + "height": 401, "width": 320 }, "position": { @@ -4132,16 +4152,16 @@ } ], "viewport": { - "x": 227.3737875665738, - "y": -299.1651660660417, - "zoom": 0.43587407227641217 + "x": -418.8241631881149, + "y": -563.2891507884635, + "zoom": 0.6194861362488232 } }, "description": "Load your data for chat context with Retrieval Augmented Generation.", "endpoint_name": null, "id": "5488df7c-b93f-4f87-a446-b67028bc0813", "is_component": false, - "last_tested_version": "1.6.3.dev0", + "last_tested_version": "1.6.3.dev1", "name": "OpenSearch Ingestion Flow", "tags": [ "openai", @@ -4149,4 +4169,4 @@ "rag", "q-a" ] -} \ No newline at end of file +} diff --git a/flows/openrag_agent.json b/flows/openrag_agent.json index bb02b425..fd82caba 100644 --- a/flows/openrag_agent.json +++ b/flows/openrag_agent.json @@ -860,7 +860,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport json\nimport uuid\nfrom typing import Any\n\nfrom opensearchpy import OpenSearch, helpers\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\n@vector_store_connection\nclass 
OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports document\n ingestion, vector embeddings, and advanced filtering with authentication options.\n\n Features:\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Hybrid search combining KNN vector similarity and keyword matching\n - Flexible authentication (Basic auth, JWT tokens)\n - Advanced filtering and aggregations\n - Metadata injection during document ingestion\n \"\"\"\n\n display_name: str = \"OpenSearch\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with hybrid semantic and keyword search capabilities.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n # advanced=True,\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"vector_field\",\n display_name=\"Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=\"Name of the field in OpenSearch documents that stores the vector embeddings for similarity search.\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) 
authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. 
\"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n }\n }\n },\n }\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of 
document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return 
OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our “vector store.”\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings\n - Bulk inserts documents with vectors\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = 
json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings\n vectors = self.embedding.embed_documents(texts)\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=self.vector_field,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with proper KNN mapping...\")\n\n # Use the LangChain-style bulk ingestion\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=self.vector_field,\n text_field=\"text\",\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in 
term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your 
tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n # ---------- search (single hybrid path matching your tool) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform hybrid search combining vector similarity and keyword matching.\n\n This method executes a sophisticated search that combines:\n - K-nearest neighbor (KNN) vector similarity search (70% weight)\n - Multi-field keyword search with fuzzy matching (30% weight)\n - Optional filtering and score thresholds\n - Aggregations for faceted search results\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression (can be either A or B shape; see _coerce_filter_clauses)\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Embed the query\n vec = self.embedding.embed_query(q)\n\n # Build filter clauses (accept both shapes)\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Respect the tool's limit/threshold defaults\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or 
{}).get(\"score_threshold\", 0)\n\n # Build the same hybrid body as your SearchService\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"knn\": {\n self.vector_field: {\n \"vector\": vec,\n \"k\": 10, # fixed to match the tool\n \"boost\": 0.7,\n }\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3,\n }\n },\n ],\n \"minimum_should_match\": 1,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n if filter_clauses:\n body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n # top-level min_score (matches your tool)\n body[\"min_score\"] = score_threshold\n\n resp = client.search(index=self.index_name, body=body)\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n 
self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": "from __future__ import annotations\n\nimport copy\nimport json\nimport time\nimport uuid\nfrom typing import Any, List, Optional\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions import RequestError\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, 
check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n return f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. 
It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model being used (e.g., 'text-embedding-3-small'). \"\n \"Used to create dynamic vector field names and track which model embedded each document. \"\n \"Auto-detected from embedding component if not specified.\"\n ),\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. 
New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. 
\"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n def _get_embedding_model_name(self) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from embedding component\n if hasattr(self, \"embedding\") and self.embedding:\n if hasattr(self.embedding, \"model\"):\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_name\"):\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. 
\"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'model' or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n 
\"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\n \"type\": \"keyword\"\n },\n \"embedding_dimensions\": {\n \"type\": \"integer\"\n }\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except Exception as e:\n logger.warning(f\"Could not add embedding field mapping for {field_name}: {e}\")\n raise\n\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n raise ValueError(\n f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n )\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. 
Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n 
Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed 
using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Get embedding model name\n embedding_model = self._get_embedding_model_name()\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n self.log(f\"Using embedding model: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings (threaded for concurrency) with retries\n def embed_chunk(chunk_text: str) -> list[float]:\n return 
self.embedding.embed_documents([chunk_text])[0]\n\n vectors: Optional[List[List[float]]] = None\n last_exception: Optional[Exception] = None\n delay = 1.0\n attempts = 0\n\n while attempts < 3:\n attempts += 1\n try:\n max_workers = min(max(len(texts), 1), 8)\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {executor.submit(embed_chunk, chunk): idx for idx, chunk in enumerate(texts)}\n vectors = [None] * len(texts)\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n break\n except Exception as exc:\n last_exception = exc\n if attempts >= 3:\n logger.error(\n \"Embedding generation failed after retries\",\n error=str(exc),\n )\n raise\n logger.warning(\n \"Threaded embedding generation failed (attempt %s/%s), retrying in %.1fs\",\n attempts,\n 3,\n delay,\n )\n time.sleep(delay)\n delay = min(delay * 2, 8.0)\n\n if vectors is None:\n raise RuntimeError(\n f\"Embedding generation failed: {last_exception}\" if last_exception else \"Embedding generation failed\"\n )\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping\n try:\n if not 
client.indices.exists(index=self.index_name):\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error != \"resource_already_exists_exception\":\n logger.warning(\n f\"Failed to create index '{self.index_name}': {creation_error}\"\n )\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": 
[\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] = None) -> list[str]:\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all 
unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\n \"embedding_models\": {\n \"terms\": {\n \"field\": \"embedding_model\",\n \"size\": 10\n }\n }\n }\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\n \"bool\": {\n \"filter\": filter_clauses\n }\n }\n\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n return models\n except Exception as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n return [self._get_embedding_model_name()]\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except Exception as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. 
Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return True\n\n return False\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models in parallel\n query_embeddings = {}\n\n # Note: Langflow is synchronous, so we can't use true async here\n # But we log the intent for parallel processing\n logger.info(f\"Generating embeddings for {len(available_models)} models\")\n\n original_model_attr = getattr(self.embedding, \"model\", None)\n original_deployment_attr = getattr(self.embedding, \"deployment\", None)\n original_dimensions_attr = getattr(self.embedding, \"dimensions\", None)\n\n for model_name in available_models:\n try:\n # In a real async environment, these would run in parallel\n # 
For now, they run sequentially\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", model_name)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", model_name)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", None)\n vec = self.embedding.embed_query(q)\n query_embeddings[model_name] = vec\n logger.info(f\"Generated embedding for model: {model_name}\")\n except Exception as e:\n logger.error(f\"Failed to generate embedding for {model_name}: {e}\")\n\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", original_model_attr)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", original_deployment_attr)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", original_dimensions_attr)\n\n if not query_embeddings:\n msg = \"Failed to generate embeddings for any model\"\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. 
\"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n continue\n\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n 
],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n body[\"min_score\"] = score_threshold\n\n logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models\"\n )\n\n try:\n resp = client.search(\n index=self.index_name, body=body, params={\"terminate_after\": 0}\n )\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = knn_queries_without_candidates\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n 
fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n 
Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, "docs_metadata": { "_input_type": "TableInput", @@ -2842,4 +2842,4 @@ "assistants", "agents" ] -} \ No newline at end of file +} diff --git a/flows/openrag_ingest_docling.json b/flows/openrag_ingest_docling.json index f0e8b164..a4a83c06 100644 --- a/flows/openrag_ingest_docling.json +++ b/flows/openrag_ingest_docling.json @@ -677,7 +677,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport json\nimport uuid\nfrom typing import Any\n\nfrom opensearchpy import OpenSearch, helpers\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch 
Vector Store Component with Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports document\n ingestion, vector embeddings, and advanced filtering with authentication options.\n\n Features:\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Hybrid search combining KNN vector similarity and keyword matching\n - Flexible authentication (Basic auth, JWT tokens)\n - Advanced filtering and aggregations\n - Metadata injection during document ingestion\n \"\"\"\n\n display_name: str = \"OpenSearch\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with hybrid semantic and keyword search capabilities.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n # advanced=True,\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"vector_field\",\n display_name=\"Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=\"Name of the field in OpenSearch documents that stores the vector embeddings for similarity search.\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) 
authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. 
\"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n }\n }\n },\n }\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of 
document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return 
OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our “vector store.”\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings\n - Bulk inserts documents with vectors\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n 
additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings\n vectors = self.embedding.embed_documents(texts)\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=self.vector_field,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with proper KNN mapping...\")\n\n # Use the LangChain-style bulk ingestion\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=self.vector_field,\n text_field=\"text\",\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: 
dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, 
list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n # ---------- search (single hybrid path matching your tool) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform hybrid search combining vector similarity and keyword matching.\n\n This method executes a sophisticated search that combines:\n - K-nearest neighbor (KNN) vector similarity search (70% weight)\n - Multi-field keyword search with fuzzy matching (30% weight)\n - Optional filtering and score thresholds\n - Aggregations for faceted search results\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression (can be either A or B shape; see _coerce_filter_clauses)\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Embed the query\n vec = self.embedding.embed_query(q)\n\n # Build filter clauses (accept both shapes)\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Respect the tool's 
limit/threshold defaults\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build the same hybrid body as your SearchService\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"knn\": {\n self.vector_field: {\n \"vector\": vec,\n \"k\": 10, # fixed to match the tool\n \"boost\": 0.7,\n }\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3,\n }\n },\n ],\n \"minimum_should_match\": 1,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n if filter_clauses:\n body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n # top-level min_score (matches your tool)\n body[\"min_score\"] = score_threshold\n\n resp = client.search(index=self.index_name, body=body)\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = 
self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": "from __future__ import annotations\n\nimport copy\nimport json\nimport time\nimport uuid\nfrom typing import Any, List, Optional\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nfrom opensearchpy import OpenSearch, helpers\nfrom 
opensearchpy.exceptions import RequestError\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n return f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. 
It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model being used (e.g., 'text-embedding-3-small'). \"\n \"Used to create dynamic vector field names and track which model embedded each document. \"\n \"Auto-detected from embedding component if not specified.\"\n ),\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. 
New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. 
\"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n def _get_embedding_model_name(self) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from embedding component\n if hasattr(self, \"embedding\") and self.embedding:\n if hasattr(self.embedding, \"model\"):\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_name\"):\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. 
\"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'model' or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n 
\"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\n \"type\": \"keyword\"\n },\n \"embedding_dimensions\": {\n \"type\": \"integer\"\n }\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except Exception as e:\n logger.warning(f\"Could not add embedding field mapping for {field_name}: {e}\")\n raise\n\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n raise ValueError(\n f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n )\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. 
Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n 
Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed 
using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Get embedding model name\n embedding_model = self._get_embedding_model_name()\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n self.log(f\"Using embedding model: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings (threaded for concurrency) with retries\n def embed_chunk(chunk_text: str) -> list[float]:\n return 
self.embedding.embed_documents([chunk_text])[0]\n\n vectors: Optional[List[List[float]]] = None\n last_exception: Optional[Exception] = None\n delay = 1.0\n attempts = 0\n\n while attempts < 3:\n attempts += 1\n try:\n max_workers = min(max(len(texts), 1), 8)\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {executor.submit(embed_chunk, chunk): idx for idx, chunk in enumerate(texts)}\n vectors = [None] * len(texts)\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n break\n except Exception as exc:\n last_exception = exc\n if attempts >= 3:\n logger.error(\n \"Embedding generation failed after retries\",\n error=str(exc),\n )\n raise\n logger.warning(\n \"Threaded embedding generation failed (attempt %s/%s), retrying in %.1fs\",\n attempts,\n 3,\n delay,\n )\n time.sleep(delay)\n delay = min(delay * 2, 8.0)\n\n if vectors is None:\n raise RuntimeError(\n f\"Embedding generation failed: {last_exception}\" if last_exception else \"Embedding generation failed\"\n )\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping\n try:\n if not 
client.indices.exists(index=self.index_name):\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error != \"resource_already_exists_exception\":\n logger.warning(\n f\"Failed to create index '{self.index_name}': {creation_error}\"\n )\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": 
[\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] = None) -> list[str]:\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all 
unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\n \"embedding_models\": {\n \"terms\": {\n \"field\": \"embedding_model\",\n \"size\": 10\n }\n }\n }\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\n \"bool\": {\n \"filter\": filter_clauses\n }\n }\n\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n return models\n except Exception as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n return [self._get_embedding_model_name()]\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except Exception as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. 
Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return True\n\n return False\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models in parallel\n query_embeddings = {}\n\n # Note: Langflow is synchronous, so we can't use true async here\n # But we log the intent for parallel processing\n logger.info(f\"Generating embeddings for {len(available_models)} models\")\n\n original_model_attr = getattr(self.embedding, \"model\", None)\n original_deployment_attr = getattr(self.embedding, \"deployment\", None)\n original_dimensions_attr = getattr(self.embedding, \"dimensions\", None)\n\n for model_name in available_models:\n try:\n # In a real async environment, these would run in parallel\n # 
For now, they run sequentially\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", model_name)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", model_name)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", None)\n vec = self.embedding.embed_query(q)\n query_embeddings[model_name] = vec\n logger.info(f\"Generated embedding for model: {model_name}\")\n except Exception as e:\n logger.error(f\"Failed to generate embedding for {model_name}: {e}\")\n\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", original_model_attr)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", original_deployment_attr)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", original_dimensions_attr)\n\n if not query_embeddings:\n msg = \"Failed to generate embeddings for any model\"\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. 
\"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n continue\n\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n 
],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n body[\"min_score\"] = score_threshold\n\n logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models\"\n )\n\n try:\n resp = client.search(\n index=self.index_name, body=body, params={\"terminate_after\": 0}\n )\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = knn_queries_without_candidates\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n 
fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n 
Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, "docs_metadata": { "_input_type": "TableInput", @@ -783,7 +783,7 @@ "tool_mode": false, "trace_as_metadata": true, "type": "str", - "value": "nmslib" + "value": "jvector" }, "filter_expression": { "_input_type": "MultilineInput", @@ -2807,4 +2807,4 @@ "rag", "q-a" ] -} \ No newline at end of file +} diff --git a/flows/openrag_nudges.json b/flows/openrag_nudges.json index 7ed390d7..a463dfb3 100644 --- a/flows/openrag_nudges.json +++ b/flows/openrag_nudges.json @@ -871,7 +871,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport json\nimport uuid\nfrom typing import Any\n\nfrom opensearchpy import OpenSearch, helpers\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom 
lfx.schema.data import Data\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports document\n ingestion, vector embeddings, and advanced filtering with authentication options.\n\n Features:\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Hybrid search combining KNN vector similarity and keyword matching\n - Flexible authentication (Basic auth, JWT tokens)\n - Advanced filtering and aggregations\n - Metadata injection during document ingestion\n \"\"\"\n\n display_name: str = \"OpenSearch\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with hybrid semantic and keyword search capabilities.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n # advanced=True,\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"vector_field\",\n display_name=\"Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=\"Name of the field in OpenSearch documents that stores the vector embeddings for similarity search.\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) 
authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. 
\"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n }\n }\n },\n }\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of 
document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return 
OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our “vector store.”\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings\n - Bulk inserts documents with vectors\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n 
additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings\n vectors = self.embedding.embed_documents(texts)\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=self.vector_field,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with proper KNN mapping...\")\n\n # Use the LangChain-style bulk ingestion\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=self.vector_field,\n text_field=\"text\",\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: 
dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, 
list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n # ---------- search (single hybrid path matching your tool) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform hybrid search combining vector similarity and keyword matching.\n\n This method executes a sophisticated search that combines:\n - K-nearest neighbor (KNN) vector similarity search (70% weight)\n - Multi-field keyword search with fuzzy matching (30% weight)\n - Optional filtering and score thresholds\n - Aggregations for faceted search results\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression (can be either A or B shape; see _coerce_filter_clauses)\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Embed the query\n vec = self.embedding.embed_query(q)\n\n # Build filter clauses (accept both shapes)\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Respect the tool's 
limit/threshold defaults\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build the same hybrid body as your SearchService\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"knn\": {\n self.vector_field: {\n \"vector\": vec,\n \"k\": 10, # fixed to match the tool\n \"boost\": 0.7,\n }\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3,\n }\n },\n ],\n \"minimum_should_match\": 1,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n if filter_clauses:\n body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n # top-level min_score (matches your tool)\n body[\"min_score\"] = score_threshold\n\n resp = client.search(index=self.index_name, body=body)\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = 
self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": "from __future__ import annotations\n\nimport copy\nimport json\nimport time\nimport uuid\nfrom typing import Any, List, Optional\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nfrom opensearchpy import OpenSearch, helpers\nfrom 
opensearchpy.exceptions import RequestError\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n return f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. 
It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model being used (e.g., 'text-embedding-3-small'). \"\n \"Used to create dynamic vector field names and track which model embedded each document. \"\n \"Auto-detected from embedding component if not specified.\"\n ),\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. 
New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. 
\"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n def _get_embedding_model_name(self) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from embedding component\n if hasattr(self, \"embedding\") and self.embedding:\n if hasattr(self.embedding, \"model\"):\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_name\"):\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. 
\"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'model' or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n 
\"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\n \"type\": \"keyword\"\n },\n \"embedding_dimensions\": {\n \"type\": \"integer\"\n }\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except Exception as e:\n logger.warning(f\"Could not add embedding field mapping for {field_name}: {e}\")\n raise\n\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n raise ValueError(\n f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n )\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. 
Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n 
Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed 
using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Get embedding model name\n embedding_model = self._get_embedding_model_name()\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n self.log(f\"Using embedding model: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings (threaded for concurrency) with retries\n def embed_chunk(chunk_text: str) -> list[float]:\n return 
self.embedding.embed_documents([chunk_text])[0]\n\n vectors: Optional[List[List[float]]] = None\n last_exception: Optional[Exception] = None\n delay = 1.0\n attempts = 0\n\n while attempts < 3:\n attempts += 1\n try:\n max_workers = min(max(len(texts), 1), 8)\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {executor.submit(embed_chunk, chunk): idx for idx, chunk in enumerate(texts)}\n vectors = [None] * len(texts)\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n break\n except Exception as exc:\n last_exception = exc\n if attempts >= 3:\n logger.error(\n \"Embedding generation failed after retries\",\n error=str(exc),\n )\n raise\n logger.warning(\n \"Threaded embedding generation failed (attempt %s/%s), retrying in %.1fs\",\n attempts,\n 3,\n delay,\n )\n time.sleep(delay)\n delay = min(delay * 2, 8.0)\n\n if vectors is None:\n raise RuntimeError(\n f\"Embedding generation failed: {last_exception}\" if last_exception else \"Embedding generation failed\"\n )\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping\n try:\n if not 
client.indices.exists(index=self.index_name):\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error != \"resource_already_exists_exception\":\n logger.warning(\n f\"Failed to create index '{self.index_name}': {creation_error}\"\n )\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": 
[\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] = None) -> list[str]:\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all 
unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\n \"embedding_models\": {\n \"terms\": {\n \"field\": \"embedding_model\",\n \"size\": 10\n }\n }\n }\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\n \"bool\": {\n \"filter\": filter_clauses\n }\n }\n\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n return models\n except Exception as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n return [self._get_embedding_model_name()]\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except Exception as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. 
Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return True\n\n return False\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models in parallel\n query_embeddings = {}\n\n # Note: Langflow is synchronous, so we can't use true async here\n # But we log the intent for parallel processing\n logger.info(f\"Generating embeddings for {len(available_models)} models\")\n\n original_model_attr = getattr(self.embedding, \"model\", None)\n original_deployment_attr = getattr(self.embedding, \"deployment\", None)\n original_dimensions_attr = getattr(self.embedding, \"dimensions\", None)\n\n for model_name in available_models:\n try:\n # In a real async environment, these would run in parallel\n # 
For now, they run sequentially\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", model_name)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", model_name)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", None)\n vec = self.embedding.embed_query(q)\n query_embeddings[model_name] = vec\n logger.info(f\"Generated embedding for model: {model_name}\")\n except Exception as e:\n logger.error(f\"Failed to generate embedding for {model_name}: {e}\")\n\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", original_model_attr)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", original_deployment_attr)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", original_dimensions_attr)\n\n if not query_embeddings:\n msg = \"Failed to generate embeddings for any model\"\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. 
\"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n continue\n\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n 
],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n body[\"min_score\"] = score_threshold\n\n logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models\"\n )\n\n try:\n resp = client.search(\n index=self.index_name, body=body, params={\"terminate_after\": 0}\n )\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = knn_queries_without_candidates\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n 
fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n 
Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, "docs_metadata": { "_input_type": "TableInput", @@ -2347,4 +2347,4 @@ "assistants", "agents" ] -} \ No newline at end of file +} diff --git a/flows/openrag_url_mcp.json b/flows/openrag_url_mcp.json index 9cab0fed..c66ac1ac 100644 --- a/flows/openrag_url_mcp.json +++ b/flows/openrag_url_mcp.json @@ -681,7 +681,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nimport json\nimport uuid\nfrom typing import Any\n\nfrom opensearchpy import OpenSearch, helpers\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with 
Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. It supports document\n ingestion, vector embeddings, and advanced filtering with authentication options.\n\n Features:\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Hybrid search combining KNN vector similarity and keyword matching\n - Flexible authentication (Basic auth, JWT tokens)\n - Advanced filtering and aggregations\n - Metadata injection during document ingestion\n \"\"\"\n\n display_name: str = \"OpenSearch\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with hybrid semantic and keyword search capabilities.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n \"load_from_db\": True\n },\n ],\n value=[],\n # advanced=True,\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"vector_field\",\n display_name=\"Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=\"Name of the field in OpenSearch documents that stores the vector embeddings for similarity search.\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) 
authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. \"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. 
\"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n }\n }\n },\n }\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of 
document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return 
OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our “vector store.”\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings\n - Bulk inserts documents with vectors\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = 
json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings\n vectors = self.embedding.embed_documents(texts)\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=self.vector_field,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with proper KNN mapping...\")\n\n # Use the LangChain-style bulk ingestion\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=self.vector_field,\n text_field=\"text\",\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in 
term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": [\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your 
tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n # ---------- search (single hybrid path matching your tool) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform hybrid search combining vector similarity and keyword matching.\n\n This method executes a sophisticated search that combines:\n - K-nearest neighbor (KNN) vector similarity search (70% weight)\n - Multi-field keyword search with fuzzy matching (30% weight)\n - Optional filtering and score thresholds\n - Aggregations for faceted search results\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression (can be either A or B shape; see _coerce_filter_clauses)\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Embed the query\n vec = self.embedding.embed_query(q)\n\n # Build filter clauses (accept both shapes)\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Respect the tool's limit/threshold defaults\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or 
{}).get(\"score_threshold\", 0)\n\n # Build the same hybrid body as your SearchService\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"knn\": {\n self.vector_field: {\n \"vector\": vec,\n \"k\": 10, # fixed to match the tool\n \"boost\": 0.7,\n }\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3,\n }\n },\n ],\n \"minimum_should_match\": 1,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n if filter_clauses:\n body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n # top-level min_score (matches your tool)\n body[\"min_score\"] = score_threshold\n\n resp = client.search(index=self.index_name, body=body)\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n 
self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" + "value": "from __future__ import annotations\n\nimport copy\nimport json\nimport time\nimport uuid\nfrom typing import Any, List, Optional\n\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nfrom opensearchpy import OpenSearch, helpers\nfrom opensearchpy.exceptions import RequestError\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, 
check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MultilineInput, SecretStrInput, StrInput, TableInput\nfrom lfx.log import logger\nfrom lfx.schema.data import Data\n\n\ndef normalize_model_name(model_name: str) -> str:\n \"\"\"Normalize embedding model name for use as field suffix.\n\n Converts model names to valid OpenSearch field names by replacing\n special characters and ensuring alphanumeric format.\n\n Args:\n model_name: Original embedding model name (e.g., \"text-embedding-3-small\")\n\n Returns:\n Normalized field suffix (e.g., \"text_embedding_3_small\")\n \"\"\"\n normalized = model_name.lower()\n # Replace common separators with underscores\n normalized = normalized.replace(\"-\", \"_\").replace(\":\", \"_\").replace(\"/\", \"_\").replace(\".\", \"_\")\n # Remove any non-alphanumeric characters except underscores\n normalized = \"\".join(c if c.isalnum() or c == \"_\" else \"_\" for c in normalized)\n # Remove duplicate underscores\n while \"__\" in normalized:\n normalized = normalized.replace(\"__\", \"_\")\n return normalized.strip(\"_\")\n\n\ndef get_embedding_field_name(model_name: str) -> str:\n \"\"\"Get the dynamic embedding field name for a model.\n\n Args:\n model_name: Embedding model name\n\n Returns:\n Field name in format: chunk_embedding_{normalized_model_name}\n \"\"\"\n return f\"chunk_embedding_{normalize_model_name(model_name)}\"\n\n\n@vector_store_connection\nclass OpenSearchVectorStoreComponent(LCVectorStoreComponent):\n \"\"\"OpenSearch Vector Store Component with Multi-Model Hybrid Search Capabilities.\n\n This component provides vector storage and retrieval using OpenSearch, combining semantic\n similarity search (KNN) with keyword-based search for optimal results. 
It supports:\n - Multiple embedding models per index with dynamic field names\n - Automatic detection and querying of all available embedding models\n - Parallel embedding generation for multi-model search\n - Document ingestion with model tracking\n - Advanced filtering and aggregations\n - Flexible authentication options\n\n Features:\n - Multi-model vector storage with dynamic fields (chunk_embedding_{model_name})\n - Hybrid search combining multiple KNN queries (dis_max) + keyword matching\n - Auto-detection of available models in the index\n - Parallel query embedding generation for all detected models\n - Vector storage with configurable engines (jvector, nmslib, faiss, lucene)\n - Flexible authentication (Basic auth, JWT tokens)\n \"\"\"\n\n display_name: str = \"OpenSearch (Multi-Model)\"\n icon: str = \"OpenSearch\"\n description: str = (\n \"Store and search documents using OpenSearch with multi-model hybrid semantic and keyword search.\"\n )\n\n # Keys we consider baseline\n default_keys: list[str] = [\n \"opensearch_url\",\n \"index_name\",\n *[i.name for i in LCVectorStoreComponent.inputs], # search_query, add_documents, etc.\n \"embedding\",\n \"embedding_model_name\",\n \"vector_field\",\n \"number_of_results\",\n \"auth_mode\",\n \"username\",\n \"password\",\n \"jwt_token\",\n \"jwt_header\",\n \"bearer_prefix\",\n \"use_ssl\",\n \"verify_certs\",\n \"filter_expression\",\n \"engine\",\n \"space_type\",\n \"ef_construction\",\n \"m\",\n \"num_candidates\",\n \"docs_metadata\",\n ]\n\n inputs = [\n TableInput(\n name=\"docs_metadata\",\n display_name=\"Document Metadata\",\n info=(\n \"Additional metadata key-value pairs to be added to all ingested documents. 
\"\n \"Useful for tagging documents with source information, categories, or other custom attributes.\"\n ),\n table_schema=[\n {\n \"name\": \"key\",\n \"display_name\": \"Key\",\n \"type\": \"str\",\n \"description\": \"Key name\",\n },\n {\n \"name\": \"value\",\n \"display_name\": \"Value\",\n \"type\": \"str\",\n \"description\": \"Value of the metadata\",\n },\n ],\n value=[],\n input_types=[\"Data\"]\n ),\n StrInput(\n name=\"opensearch_url\",\n display_name=\"OpenSearch URL\",\n value=\"http://localhost:9200\",\n info=(\n \"The connection URL for your OpenSearch cluster \"\n \"(e.g., http://localhost:9200 for local development or your cloud endpoint).\"\n ),\n ),\n StrInput(\n name=\"index_name\",\n display_name=\"Index Name\",\n value=\"langflow\",\n info=(\n \"The OpenSearch index name where documents will be stored and searched. \"\n \"Will be created automatically if it doesn't exist.\"\n ),\n ),\n DropdownInput(\n name=\"engine\",\n display_name=\"Vector Engine\",\n options=[\"jvector\", \"nmslib\", \"faiss\", \"lucene\"],\n value=\"jvector\",\n info=(\n \"Vector search engine for similarity calculations. 'jvector' is recommended for most use cases. \"\n \"Note: Amazon OpenSearch Serverless only supports 'nmslib' or 'faiss'.\"\n ),\n advanced=True,\n ),\n DropdownInput(\n name=\"space_type\",\n display_name=\"Distance Metric\",\n options=[\"l2\", \"l1\", \"cosinesimil\", \"linf\", \"innerproduct\"],\n value=\"l2\",\n info=(\n \"Distance metric for calculating vector similarity. 'l2' (Euclidean) is most common, \"\n \"'cosinesimil' for cosine similarity, 'innerproduct' for dot product.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"ef_construction\",\n display_name=\"EF Construction\",\n value=512,\n info=(\n \"Size of the dynamic candidate list during index construction. 
\"\n \"Higher values improve recall but increase indexing time and memory usage.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"m\",\n display_name=\"M Parameter\",\n value=16,\n info=(\n \"Number of bidirectional connections for each vector in the HNSW graph. \"\n \"Higher values improve search quality but increase memory usage and indexing time.\"\n ),\n advanced=True,\n ),\n IntInput(\n name=\"num_candidates\",\n display_name=\"Candidate Pool Size\",\n value=1000,\n info=(\n \"Number of approximate neighbors to consider for each KNN query. \"\n \"Some OpenSearch deployments do not support this parameter; set to 0 to disable.\"\n ),\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs, # includes search_query, add_documents, etc.\n HandleInput(name=\"embedding\", display_name=\"Embedding\", input_types=[\"Embeddings\"]),\n StrInput(\n name=\"embedding_model_name\",\n display_name=\"Embedding Model Name\",\n value=\"\",\n info=(\n \"Name of the embedding model being used (e.g., 'text-embedding-3-small'). \"\n \"Used to create dynamic vector field names and track which model embedded each document. \"\n \"Auto-detected from embedding component if not specified.\"\n ),\n ),\n StrInput(\n name=\"vector_field\",\n display_name=\"Legacy Vector Field Name\",\n value=\"chunk_embedding\",\n advanced=True,\n info=(\n \"Legacy field name for backward compatibility. 
New documents use dynamic fields \"\n \"(chunk_embedding_{model_name}) based on the embedding_model_name.\"\n ),\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Default Result Limit\",\n value=10,\n advanced=True,\n info=(\n \"Default maximum number of search results to return when no limit is \"\n \"specified in the filter expression.\"\n ),\n ),\n MultilineInput(\n name=\"filter_expression\",\n display_name=\"Search Filters (JSON)\",\n value=\"\",\n info=(\n \"Optional JSON configuration for search filtering, result limits, and score thresholds.\\n\\n\"\n \"Format 1 - Explicit filters:\\n\"\n '{\"filter\": [{\"term\": {\"filename\":\"doc.pdf\"}}, '\n '{\"terms\":{\"owner\":[\"user1\",\"user2\"]}}], \"limit\": 10, \"score_threshold\": 1.6}\\n\\n'\n \"Format 2 - Context-style mapping:\\n\"\n '{\"data_sources\":[\"file.pdf\"], \"document_types\":[\"application/pdf\"], \"owners\":[\"user123\"]}\\n\\n'\n \"Use __IMPOSSIBLE_VALUE__ as placeholder to ignore specific filters.\"\n ),\n ),\n # ----- Auth controls (dynamic) -----\n DropdownInput(\n name=\"auth_mode\",\n display_name=\"Authentication Mode\",\n value=\"basic\",\n options=[\"basic\", \"jwt\"],\n info=(\n \"Authentication method: 'basic' for username/password authentication, \"\n \"or 'jwt' for JSON Web Token (Bearer) authentication.\"\n ),\n real_time_refresh=True,\n advanced=False,\n ),\n StrInput(\n name=\"username\",\n display_name=\"Username\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"password\",\n display_name=\"OpenSearch Password\",\n value=\"admin\",\n show=False,\n ),\n SecretStrInput(\n name=\"jwt_token\",\n display_name=\"JWT Token\",\n value=\"JWT\",\n load_from_db=False,\n show=True,\n info=(\n \"Valid JSON Web Token for authentication. 
\"\n \"Will be sent in the Authorization header (with optional 'Bearer ' prefix).\"\n ),\n ),\n StrInput(\n name=\"jwt_header\",\n display_name=\"JWT Header Name\",\n value=\"Authorization\",\n show=False,\n advanced=True,\n ),\n BoolInput(\n name=\"bearer_prefix\",\n display_name=\"Prefix 'Bearer '\",\n value=True,\n show=False,\n advanced=True,\n ),\n # ----- TLS -----\n BoolInput(\n name=\"use_ssl\",\n display_name=\"Use SSL/TLS\",\n value=True,\n advanced=True,\n info=\"Enable SSL/TLS encryption for secure connections to OpenSearch.\",\n ),\n BoolInput(\n name=\"verify_certs\",\n display_name=\"Verify SSL Certificates\",\n value=False,\n advanced=True,\n info=(\n \"Verify SSL certificates when connecting. \"\n \"Disable for self-signed certificates in development environments.\"\n ),\n ),\n ]\n\n def _get_embedding_model_name(self) -> str:\n \"\"\"Get the embedding model name from component config or embedding object.\n\n Returns:\n Embedding model name\n\n Raises:\n ValueError: If embedding model name cannot be determined\n \"\"\"\n # First try explicit embedding_model_name input\n if hasattr(self, \"embedding_model_name\") and self.embedding_model_name:\n return self.embedding_model_name.strip()\n\n # Try to get from embedding component\n if hasattr(self, \"embedding\") and self.embedding:\n if hasattr(self.embedding, \"model\"):\n return str(self.embedding.model)\n if hasattr(self.embedding, \"model_name\"):\n return str(self.embedding.model_name)\n\n msg = (\n \"Could not determine embedding model name. 
\"\n \"Please set the 'embedding_model_name' field or ensure the embedding component \"\n \"has a 'model' or 'model_name' attribute.\"\n )\n raise ValueError(msg)\n\n # ---------- helper functions for index management ----------\n def _default_text_mapping(\n self,\n dim: int,\n engine: str = \"jvector\",\n space_type: str = \"l2\",\n ef_search: int = 512,\n ef_construction: int = 100,\n m: int = 16,\n vector_field: str = \"vector_field\",\n ) -> dict[str, Any]:\n \"\"\"Create the default OpenSearch index mapping for vector search.\n\n This method generates the index configuration with k-NN settings optimized\n for approximate nearest neighbor search using the specified vector engine.\n Includes the embedding_model keyword field for tracking which model was used.\n\n Args:\n dim: Dimensionality of the vector embeddings\n engine: Vector search engine (jvector, nmslib, faiss, lucene)\n space_type: Distance metric for similarity calculation\n ef_search: Size of dynamic list used during search\n ef_construction: Size of dynamic list used during index construction\n m: Number of bidirectional links for each vector\n vector_field: Name of the field storing vector embeddings\n\n Returns:\n Dictionary containing OpenSearch index mapping configuration\n \"\"\"\n return {\n \"settings\": {\"index\": {\"knn\": True, \"knn.algo_param.ef_search\": ef_search}},\n \"mappings\": {\n \"properties\": {\n vector_field: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n \"embedding_model\": {\"type\": \"keyword\"}, # Track which model was used\n \"embedding_dimensions\": {\"type\": \"integer\"},\n }\n },\n }\n\n def _ensure_embedding_field_mapping(\n self,\n client: OpenSearch,\n index_name: str,\n field_name: str,\n dim: int,\n engine: str,\n space_type: str,\n ef_construction: int,\n m: int,\n ) -> None:\n 
\"\"\"Lazily add a dynamic embedding field to the index if it doesn't exist.\n\n This allows adding new embedding models without recreating the entire index.\n Also ensures the embedding_model tracking field exists.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index name\n field_name: Dynamic field name for this embedding model\n dim: Vector dimensionality\n engine: Vector search engine\n space_type: Distance metric\n ef_construction: Construction parameter\n m: HNSW parameter\n \"\"\"\n try:\n mapping = {\n \"properties\": {\n field_name: {\n \"type\": \"knn_vector\",\n \"dimension\": dim,\n \"method\": {\n \"name\": \"disk_ann\",\n \"space_type\": space_type,\n \"engine\": engine,\n \"parameters\": {\"ef_construction\": ef_construction, \"m\": m},\n },\n },\n # Also ensure the embedding_model tracking field exists as keyword\n \"embedding_model\": {\n \"type\": \"keyword\"\n },\n \"embedding_dimensions\": {\n \"type\": \"integer\"\n }\n }\n }\n client.indices.put_mapping(index=index_name, body=mapping)\n logger.info(f\"Added/updated embedding field mapping: {field_name}\")\n except Exception as e:\n logger.warning(f\"Could not add embedding field mapping for {field_name}: {e}\")\n raise\n\n properties = self._get_index_properties(client)\n if not self._is_knn_vector_field(properties, field_name):\n raise ValueError(\n f\"Field '{field_name}' is not mapped as knn_vector. Current mapping: {properties.get(field_name)}\"\n )\n\n def _validate_aoss_with_engines(self, *, is_aoss: bool, engine: str) -> None:\n \"\"\"Validate engine compatibility with Amazon OpenSearch Serverless (AOSS).\n\n Amazon OpenSearch Serverless has restrictions on which vector engines\n can be used. 
This method ensures the selected engine is compatible.\n\n Args:\n is_aoss: Whether the connection is to Amazon OpenSearch Serverless\n engine: The selected vector search engine\n\n Raises:\n ValueError: If AOSS is used with an incompatible engine\n \"\"\"\n if is_aoss and engine not in {\"nmslib\", \"faiss\"}:\n msg = \"Amazon OpenSearch Service Serverless only supports `nmslib` or `faiss` engines\"\n raise ValueError(msg)\n\n def _is_aoss_enabled(self, http_auth: Any) -> bool:\n \"\"\"Determine if Amazon OpenSearch Serverless (AOSS) is being used.\n\n Args:\n http_auth: The HTTP authentication object\n\n Returns:\n True if AOSS is enabled, False otherwise\n \"\"\"\n return http_auth is not None and hasattr(http_auth, \"service\") and http_auth.service == \"aoss\"\n\n def _bulk_ingest_embeddings(\n self,\n client: OpenSearch,\n index_name: str,\n embeddings: list[list[float]],\n texts: list[str],\n metadatas: list[dict] | None = None,\n ids: list[str] | None = None,\n vector_field: str = \"vector_field\",\n text_field: str = \"text\",\n embedding_model: str = \"unknown\",\n mapping: dict | None = None,\n max_chunk_bytes: int | None = 1 * 1024 * 1024,\n *,\n is_aoss: bool = False,\n ) -> list[str]:\n \"\"\"Efficiently ingest multiple documents with embeddings into OpenSearch.\n\n This method uses bulk operations to insert documents with their vector\n embeddings and metadata into the specified OpenSearch index. 
Each document\n is tagged with the embedding_model name for tracking.\n\n Args:\n client: OpenSearch client instance\n index_name: Target index for document storage\n embeddings: List of vector embeddings for each document\n texts: List of document texts\n metadatas: Optional metadata dictionaries for each document\n ids: Optional document IDs (UUIDs generated if not provided)\n vector_field: Field name for storing vector embeddings\n text_field: Field name for storing document text\n embedding_model: Name of the embedding model used\n mapping: Optional index mapping configuration\n max_chunk_bytes: Maximum size per bulk request chunk\n is_aoss: Whether using Amazon OpenSearch Serverless\n\n Returns:\n List of document IDs that were successfully ingested\n \"\"\"\n if not mapping:\n mapping = {}\n\n requests = []\n return_ids = []\n vector_dimensions = len(embeddings[0]) if embeddings else None\n\n for i, text in enumerate(texts):\n metadata = metadatas[i] if metadatas else {}\n if vector_dimensions is not None and \"embedding_dimensions\" not in metadata:\n metadata = {**metadata, \"embedding_dimensions\": vector_dimensions}\n _id = ids[i] if ids else str(uuid.uuid4())\n request = {\n \"_op_type\": \"index\",\n \"_index\": index_name,\n vector_field: embeddings[i],\n text_field: text,\n \"embedding_model\": embedding_model, # Track which model was used\n **metadata,\n }\n if is_aoss:\n request[\"id\"] = _id\n else:\n request[\"_id\"] = _id\n requests.append(request)\n return_ids.append(_id)\n if metadatas:\n self.log(f\"Sample metadata: {metadatas[0] if metadatas else {}}\")\n helpers.bulk(client, requests, max_chunk_bytes=max_chunk_bytes)\n return return_ids\n\n # ---------- auth / client ----------\n def _build_auth_kwargs(self) -> dict[str, Any]:\n \"\"\"Build authentication configuration for OpenSearch client.\n\n Constructs the appropriate authentication parameters based on the\n selected auth mode (basic username/password or JWT token).\n\n Returns:\n 
Dictionary containing authentication configuration\n\n Raises:\n ValueError: If required authentication parameters are missing\n \"\"\"\n mode = (self.auth_mode or \"basic\").strip().lower()\n if mode == \"jwt\":\n token = (self.jwt_token or \"\").strip()\n if not token:\n msg = \"Auth Mode is 'jwt' but no jwt_token was provided.\"\n raise ValueError(msg)\n header_name = (self.jwt_header or \"Authorization\").strip()\n header_value = f\"Bearer {token}\" if self.bearer_prefix else token\n return {\"headers\": {header_name: header_value}}\n user = (self.username or \"\").strip()\n pwd = (self.password or \"\").strip()\n if not user or not pwd:\n msg = \"Auth Mode is 'basic' but username/password are missing.\"\n raise ValueError(msg)\n return {\"http_auth\": (user, pwd)}\n\n def build_client(self) -> OpenSearch:\n \"\"\"Create and configure an OpenSearch client instance.\n\n Returns:\n Configured OpenSearch client ready for operations\n \"\"\"\n auth_kwargs = self._build_auth_kwargs()\n return OpenSearch(\n hosts=[self.opensearch_url],\n use_ssl=self.use_ssl,\n verify_certs=self.verify_certs,\n ssl_assert_hostname=False,\n ssl_show_warn=False,\n **auth_kwargs,\n )\n\n @check_cached_vector_store\n def build_vector_store(self) -> OpenSearch:\n # Return raw OpenSearch client as our \"vector store.\"\n self.log(self.ingest_data)\n client = self.build_client()\n self._add_documents_to_vector_store(client=client)\n return client\n\n # ---------- ingest ----------\n def _add_documents_to_vector_store(self, client: OpenSearch) -> None:\n \"\"\"Process and ingest documents into the OpenSearch vector store.\n\n This method handles the complete document ingestion pipeline:\n - Prepares document data and metadata\n - Generates vector embeddings\n - Creates appropriate index mappings with dynamic field names\n - Bulk inserts documents with vectors and model tracking\n\n Args:\n client: OpenSearch client for performing operations\n \"\"\"\n # Convert DataFrame to Data if needed 
using parent's method\n self.ingest_data = self._prepare_ingest_data()\n\n docs = self.ingest_data or []\n if not docs:\n self.log(\"No documents to ingest.\")\n return\n\n # Get embedding model name\n embedding_model = self._get_embedding_model_name()\n dynamic_field_name = get_embedding_field_name(embedding_model)\n\n self.log(f\"Using embedding model: {embedding_model}\")\n self.log(f\"Dynamic vector field: {dynamic_field_name}\")\n\n # Extract texts and metadata from documents\n texts = []\n metadatas = []\n # Process docs_metadata table input into a dict\n additional_metadata = {}\n if hasattr(self, \"docs_metadata\") and self.docs_metadata:\n logger.info(f\"[LF] Docs metadata {self.docs_metadata}\")\n if isinstance(self.docs_metadata[-1], Data):\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n self.docs_metadata = self.docs_metadata[-1].data\n logger.info(f\"[LF] Docs metadata is a Data object {self.docs_metadata}\")\n additional_metadata.update(self.docs_metadata)\n else:\n for item in self.docs_metadata:\n if isinstance(item, dict) and \"key\" in item and \"value\" in item:\n additional_metadata[item[\"key\"]] = item[\"value\"]\n # Replace string \"None\" values with actual None\n for key, value in additional_metadata.items():\n if value == \"None\":\n additional_metadata[key] = None\n logger.info(f\"[LF] Additional metadata {additional_metadata}\")\n for doc_obj in docs:\n data_copy = json.loads(doc_obj.model_dump_json())\n text = data_copy.pop(doc_obj.text_key, doc_obj.default_value)\n texts.append(text)\n\n # Merge additional metadata from table input\n data_copy.update(additional_metadata)\n\n metadatas.append(data_copy)\n self.log(metadatas)\n if not self.embedding:\n msg = \"Embedding handle is required to embed documents.\"\n raise ValueError(msg)\n\n # Generate embeddings (threaded for concurrency) with retries\n def embed_chunk(chunk_text: str) -> list[float]:\n return 
self.embedding.embed_documents([chunk_text])[0]\n\n vectors: Optional[List[List[float]]] = None\n last_exception: Optional[Exception] = None\n delay = 1.0\n attempts = 0\n\n while attempts < 3:\n attempts += 1\n try:\n max_workers = min(max(len(texts), 1), 8)\n with ThreadPoolExecutor(max_workers=max_workers) as executor:\n futures = {executor.submit(embed_chunk, chunk): idx for idx, chunk in enumerate(texts)}\n vectors = [None] * len(texts)\n for future in as_completed(futures):\n idx = futures[future]\n vectors[idx] = future.result()\n break\n except Exception as exc:\n last_exception = exc\n if attempts >= 3:\n logger.error(\n \"Embedding generation failed after retries\",\n error=str(exc),\n )\n raise\n logger.warning(\n \"Threaded embedding generation failed (attempt %s/%s), retrying in %.1fs\",\n attempts,\n 3,\n delay,\n )\n time.sleep(delay)\n delay = min(delay * 2, 8.0)\n\n if vectors is None:\n raise RuntimeError(\n f\"Embedding generation failed: {last_exception}\" if last_exception else \"Embedding generation failed\"\n )\n\n if not vectors:\n self.log(\"No vectors generated from documents.\")\n return\n\n # Get vector dimension for mapping\n dim = len(vectors[0]) if vectors else 768 # default fallback\n\n # Check for AOSS\n auth_kwargs = self._build_auth_kwargs()\n is_aoss = self._is_aoss_enabled(auth_kwargs.get(\"http_auth\"))\n\n # Validate engine with AOSS\n engine = getattr(self, \"engine\", \"jvector\")\n self._validate_aoss_with_engines(is_aoss=is_aoss, engine=engine)\n\n # Create mapping with proper KNN settings\n space_type = getattr(self, \"space_type\", \"l2\")\n ef_construction = getattr(self, \"ef_construction\", 512)\n m = getattr(self, \"m\", 16)\n\n mapping = self._default_text_mapping(\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n vector_field=dynamic_field_name, # Use dynamic field name\n )\n\n # Ensure index exists with baseline mapping\n try:\n if not 
client.indices.exists(index=self.index_name):\n self.log(f\"Creating index '{self.index_name}' with base mapping\")\n client.indices.create(index=self.index_name, body=mapping)\n except RequestError as creation_error:\n if creation_error.error != \"resource_already_exists_exception\":\n logger.warning(\n f\"Failed to create index '{self.index_name}': {creation_error}\"\n )\n\n # Ensure the dynamic field exists in the index\n self._ensure_embedding_field_mapping(\n client=client,\n index_name=self.index_name,\n field_name=dynamic_field_name,\n dim=dim,\n engine=engine,\n space_type=space_type,\n ef_construction=ef_construction,\n m=m,\n )\n\n self.log(f\"Indexing {len(texts)} documents into '{self.index_name}' with model '{embedding_model}'...\")\n\n # Use the bulk ingestion with model tracking\n return_ids = self._bulk_ingest_embeddings(\n client=client,\n index_name=self.index_name,\n embeddings=vectors,\n texts=texts,\n metadatas=metadatas,\n vector_field=dynamic_field_name, # Use dynamic field name\n text_field=\"text\",\n embedding_model=embedding_model, # Track the model\n mapping=mapping,\n is_aoss=is_aoss,\n )\n self.log(metadatas)\n\n self.log(f\"Successfully indexed {len(return_ids)} documents with model {embedding_model}.\")\n\n # ---------- helpers for filters ----------\n def _is_placeholder_term(self, term_obj: dict) -> bool:\n # term_obj like {\"filename\": \"__IMPOSSIBLE_VALUE__\"}\n return any(v == \"__IMPOSSIBLE_VALUE__\" for v in term_obj.values())\n\n def _coerce_filter_clauses(self, filter_obj: dict | None) -> list[dict]:\n \"\"\"Convert filter expressions into OpenSearch-compatible filter clauses.\n\n This method accepts two filter formats and converts them to standardized\n OpenSearch query clauses:\n\n Format A - Explicit filters:\n {\"filter\": [{\"term\": {\"field\": \"value\"}}, {\"terms\": {\"field\": [\"val1\", \"val2\"]}}],\n \"limit\": 10, \"score_threshold\": 1.5}\n\n Format B - Context-style mapping:\n {\"data_sources\": 
[\"file1.pdf\"], \"document_types\": [\"pdf\"], \"owners\": [\"user1\"]}\n\n Args:\n filter_obj: Filter configuration dictionary or None\n\n Returns:\n List of OpenSearch filter clauses (term/terms objects)\n Placeholder values with \"__IMPOSSIBLE_VALUE__\" are ignored\n \"\"\"\n if not filter_obj:\n return []\n\n # If it is a string, try to parse it once\n if isinstance(filter_obj, str):\n try:\n filter_obj = json.loads(filter_obj)\n except json.JSONDecodeError:\n # Not valid JSON - treat as no filters\n return []\n\n # Case A: already an explicit list/dict under \"filter\"\n if \"filter\" in filter_obj:\n raw = filter_obj[\"filter\"]\n if isinstance(raw, dict):\n raw = [raw]\n explicit_clauses: list[dict] = []\n for f in raw or []:\n if \"term\" in f and isinstance(f[\"term\"], dict) and not self._is_placeholder_term(f[\"term\"]):\n explicit_clauses.append(f)\n elif \"terms\" in f and isinstance(f[\"terms\"], dict):\n field, vals = next(iter(f[\"terms\"].items()))\n if isinstance(vals, list) and len(vals) > 0:\n explicit_clauses.append(f)\n return explicit_clauses\n\n # Case B: convert context-style maps into clauses\n field_mapping = {\n \"data_sources\": \"filename\",\n \"document_types\": \"mimetype\",\n \"owners\": \"owner\",\n }\n context_clauses: list[dict] = []\n for k, values in filter_obj.items():\n if not isinstance(values, list):\n continue\n field = field_mapping.get(k, k)\n if len(values) == 0:\n # Match-nothing placeholder (kept to mirror your tool semantics)\n context_clauses.append({\"term\": {field: \"__IMPOSSIBLE_VALUE__\"}})\n elif len(values) == 1:\n if values[0] != \"__IMPOSSIBLE_VALUE__\":\n context_clauses.append({\"term\": {field: values[0]}})\n else:\n context_clauses.append({\"terms\": {field: values}})\n return context_clauses\n\n def _detect_available_models(self, client: OpenSearch, filter_clauses: list[dict] = None) -> list[str]:\n \"\"\"Detect which embedding models have documents in the index.\n\n Uses aggregation to find all 
unique embedding_model values, optionally\n filtered to only documents matching the user's filter criteria.\n\n Args:\n client: OpenSearch client instance\n filter_clauses: Optional filter clauses to scope model detection\n\n Returns:\n List of embedding model names found in the index\n \"\"\"\n try:\n agg_query = {\n \"size\": 0,\n \"aggs\": {\n \"embedding_models\": {\n \"terms\": {\n \"field\": \"embedding_model\",\n \"size\": 10\n }\n }\n }\n }\n\n # Apply filters to model detection if any exist\n if filter_clauses:\n agg_query[\"query\"] = {\n \"bool\": {\n \"filter\": filter_clauses\n }\n }\n\n result = client.search(\n index=self.index_name,\n body=agg_query,\n params={\"terminate_after\": 0},\n )\n buckets = result.get(\"aggregations\", {}).get(\"embedding_models\", {}).get(\"buckets\", [])\n models = [b[\"key\"] for b in buckets if b[\"key\"]]\n\n logger.info(\n f\"Detected embedding models in corpus: {models}\"\n + (f\" (with {len(filter_clauses)} filters)\" if filter_clauses else \"\")\n )\n return models\n except Exception as e:\n logger.warning(f\"Failed to detect embedding models: {e}\")\n # Fallback to current model\n return [self._get_embedding_model_name()]\n\n def _get_index_properties(self, client: OpenSearch) -> dict[str, Any] | None:\n \"\"\"Retrieve flattened mapping properties for the current index.\"\"\"\n try:\n mapping = client.indices.get_mapping(index=self.index_name)\n except Exception as e:\n logger.warning(\n f\"Failed to fetch mapping for index '{self.index_name}': {e}. 
Proceeding without mapping metadata.\"\n )\n return None\n\n properties: dict[str, Any] = {}\n for index_data in mapping.values():\n props = index_data.get(\"mappings\", {}).get(\"properties\", {})\n if isinstance(props, dict):\n properties.update(props)\n return properties\n\n def _is_knn_vector_field(self, properties: dict[str, Any] | None, field_name: str) -> bool:\n \"\"\"Check whether the field is mapped as a knn_vector.\"\"\"\n if not field_name:\n return False\n if properties is None:\n logger.warning(\n f\"Mapping metadata unavailable; assuming field '{field_name}' is usable.\"\n )\n return True\n field_def = properties.get(field_name)\n if not isinstance(field_def, dict):\n return False\n if field_def.get(\"type\") == \"knn_vector\":\n return True\n\n nested_props = field_def.get(\"properties\")\n if isinstance(nested_props, dict) and nested_props.get(\"type\") == \"knn_vector\":\n return True\n\n return False\n\n # ---------- search (multi-model hybrid) ----------\n def search(self, query: str | None = None) -> list[dict[str, Any]]:\n \"\"\"Perform multi-model hybrid search combining multiple vector similarities and keyword matching.\n\n This method executes a sophisticated search that:\n 1. Auto-detects all embedding models present in the index\n 2. Generates query embeddings for ALL detected models in parallel\n 3. Combines multiple KNN queries using dis_max (picks best match)\n 4. Adds keyword search with fuzzy matching (30% weight)\n 5. Applies optional filtering and score thresholds\n 6. 
Returns aggregations for faceted search\n\n Search weights:\n - Semantic search (dis_max across all models): 70%\n - Keyword search: 30%\n\n Args:\n query: Search query string (used for both vector embedding and keyword search)\n\n Returns:\n List of search results with page_content, metadata, and relevance scores\n\n Raises:\n ValueError: If embedding component is not provided or filter JSON is invalid\n \"\"\"\n logger.info(self.ingest_data)\n client = self.build_client()\n q = (query or \"\").strip()\n\n # Parse optional filter expression\n filter_obj = None\n if getattr(self, \"filter_expression\", \"\") and self.filter_expression.strip():\n try:\n filter_obj = json.loads(self.filter_expression)\n except json.JSONDecodeError as e:\n msg = f\"Invalid filter_expression JSON: {e}\"\n raise ValueError(msg) from e\n\n if not self.embedding:\n msg = \"Embedding is required to run hybrid search (KNN + keyword).\"\n raise ValueError(msg)\n\n # Build filter clauses first so we can use them in model detection\n filter_clauses = self._coerce_filter_clauses(filter_obj)\n\n # Detect available embedding models in the index (scoped by filters)\n available_models = self._detect_available_models(client, filter_clauses)\n\n if not available_models:\n logger.warning(\"No embedding models found in index, using current model\")\n available_models = [self._get_embedding_model_name()]\n\n # Generate embeddings for ALL detected models in parallel\n query_embeddings = {}\n\n # Note: Langflow is synchronous, so we can't use true async here\n # But we log the intent for parallel processing\n logger.info(f\"Generating embeddings for {len(available_models)} models\")\n\n original_model_attr = getattr(self.embedding, \"model\", None)\n original_deployment_attr = getattr(self.embedding, \"deployment\", None)\n original_dimensions_attr = getattr(self.embedding, \"dimensions\", None)\n\n for model_name in available_models:\n try:\n # In a real async environment, these would run in parallel\n # 
For now, they run sequentially\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", model_name)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", model_name)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", None)\n vec = self.embedding.embed_query(q)\n query_embeddings[model_name] = vec\n logger.info(f\"Generated embedding for model: {model_name}\")\n except Exception as e:\n logger.error(f\"Failed to generate embedding for {model_name}: {e}\")\n\n if hasattr(self.embedding, \"model\"):\n setattr(self.embedding, \"model\", original_model_attr)\n if hasattr(self.embedding, \"deployment\"):\n setattr(self.embedding, \"deployment\", original_deployment_attr)\n if hasattr(self.embedding, \"dimensions\"):\n setattr(self.embedding, \"dimensions\", original_dimensions_attr)\n\n if not query_embeddings:\n msg = \"Failed to generate embeddings for any model\"\n raise ValueError(msg)\n\n index_properties = self._get_index_properties(client)\n legacy_vector_field = getattr(self, \"vector_field\", \"chunk_embedding\")\n\n # Build KNN queries for each model\n embedding_fields: list[str] = []\n knn_queries_with_candidates = []\n knn_queries_without_candidates = []\n\n raw_num_candidates = getattr(self, \"num_candidates\", 1000)\n try:\n num_candidates = int(raw_num_candidates) if raw_num_candidates is not None else 0\n except (TypeError, ValueError):\n num_candidates = 0\n use_num_candidates = num_candidates > 0\n\n for model_name, embedding_vector in query_embeddings.items():\n field_name = get_embedding_field_name(model_name)\n selected_field = field_name\n\n # Only use the expected dynamic field - no legacy fallback\n # This prevents dimension mismatches between models\n if not self._is_knn_vector_field(index_properties, selected_field):\n logger.warning(\n f\"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. 
\"\n f\"Documents must be indexed with this embedding model before querying.\"\n )\n continue\n\n embedding_fields.append(selected_field)\n\n base_query = {\n \"knn\": {\n selected_field: {\n \"vector\": embedding_vector,\n \"k\": 50,\n }\n }\n }\n\n if use_num_candidates:\n query_with_candidates = copy.deepcopy(base_query)\n query_with_candidates[\"knn\"][selected_field][\"num_candidates\"] = num_candidates\n else:\n query_with_candidates = base_query\n\n knn_queries_with_candidates.append(query_with_candidates)\n knn_queries_without_candidates.append(base_query)\n\n if not knn_queries_with_candidates:\n # No valid fields found - this can happen when:\n # 1. Index is empty (no documents yet)\n # 2. Embedding model has changed and field doesn't exist yet\n # Return empty results instead of failing\n logger.warning(\n \"No valid knn_vector fields found for embedding models. \"\n \"This may indicate an empty index or missing field mappings. \"\n \"Returning empty search results.\"\n )\n return []\n\n # Build exists filter - document must have at least one embedding field\n exists_any_embedding = {\n \"bool\": {\n \"should\": [{\"exists\": {\"field\": f}} for f in set(embedding_fields)],\n \"minimum_should_match\": 1\n }\n }\n\n # Combine user filters with exists filter\n all_filters = [*filter_clauses, exists_any_embedding]\n\n # Get limit and score threshold\n limit = (filter_obj or {}).get(\"limit\", self.number_of_results)\n score_threshold = (filter_obj or {}).get(\"score_threshold\", 0)\n\n # Build multi-model hybrid query\n body = {\n \"query\": {\n \"bool\": {\n \"should\": [\n {\n \"dis_max\": {\n \"tie_breaker\": 0.0, # Take only the best match, no blending\n \"boost\": 0.7, # 70% weight for semantic search\n \"queries\": knn_queries_with_candidates\n }\n },\n {\n \"multi_match\": {\n \"query\": q,\n \"fields\": [\"text^2\", \"filename^1.5\"],\n \"type\": \"best_fields\",\n \"fuzziness\": \"AUTO\",\n \"boost\": 0.3, # 30% weight for keyword search\n }\n },\n 
],\n \"minimum_should_match\": 1,\n \"filter\": all_filters,\n }\n },\n \"aggs\": {\n \"data_sources\": {\"terms\": {\"field\": \"filename\", \"size\": 20}},\n \"document_types\": {\"terms\": {\"field\": \"mimetype\", \"size\": 10}},\n \"owners\": {\"terms\": {\"field\": \"owner\", \"size\": 10}},\n \"embedding_models\": {\"terms\": {\"field\": \"embedding_model\", \"size\": 10}},\n },\n \"_source\": [\n \"filename\",\n \"mimetype\",\n \"page\",\n \"text\",\n \"source_url\",\n \"owner\",\n \"embedding_model\",\n \"allowed_users\",\n \"allowed_groups\",\n ],\n \"size\": limit,\n }\n\n if isinstance(score_threshold, (int, float)) and score_threshold > 0:\n body[\"min_score\"] = score_threshold\n\n logger.info(\n f\"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models\"\n )\n\n try:\n resp = client.search(\n index=self.index_name, body=body, params={\"terminate_after\": 0}\n )\n except RequestError as e:\n error_message = str(e)\n lowered = error_message.lower()\n if use_num_candidates and \"num_candidates\" in lowered:\n logger.warning(\n \"Retrying search without num_candidates parameter due to cluster capabilities\",\n error=error_message,\n )\n fallback_body = copy.deepcopy(body)\n try:\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = knn_queries_without_candidates\n except (KeyError, IndexError, TypeError) as inner_err:\n raise e from inner_err\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n elif \"knn_vector\" in lowered or (\"field\" in lowered and \"knn\" in lowered):\n fallback_vector = next(iter(query_embeddings.values()), None)\n if fallback_vector is None:\n raise\n fallback_field = legacy_vector_field or \"chunk_embedding\"\n logger.warning(\n \"KNN search failed for dynamic fields; falling back to legacy field '%s'.\",\n fallback_field,\n )\n fallback_body = copy.deepcopy(body)\n 
fallback_body[\"query\"][\"bool\"][\"filter\"] = filter_clauses\n knn_fallback = {\n \"knn\": {\n fallback_field: {\n \"vector\": fallback_vector,\n \"k\": 50,\n }\n }\n }\n if use_num_candidates:\n knn_fallback[\"knn\"][fallback_field][\"num_candidates\"] = num_candidates\n fallback_body[\"query\"][\"bool\"][\"should\"][0][\"dis_max\"][\"queries\"] = [knn_fallback]\n resp = client.search(\n index=self.index_name,\n body=fallback_body,\n params={\"terminate_after\": 0},\n )\n else:\n raise\n hits = resp.get(\"hits\", {}).get(\"hits\", [])\n\n logger.info(f\"Found {len(hits)} results\")\n\n return [\n {\n \"page_content\": hit[\"_source\"].get(\"text\", \"\"),\n \"metadata\": {k: v for k, v in hit[\"_source\"].items() if k != \"text\"},\n \"score\": hit.get(\"_score\"),\n }\n for hit in hits\n ]\n\n def search_documents(self) -> list[Data]:\n \"\"\"Search documents and return results as Data objects.\n\n This is the main interface method that performs the multi-model search using the\n configured search_query and returns results in Langflow's Data format.\n\n Returns:\n List of Data objects containing search results with text and metadata\n\n Raises:\n Exception: If search operation fails\n \"\"\"\n try:\n raw = self.search(self.search_query or \"\")\n return [Data(text=hit[\"page_content\"], **hit[\"metadata\"]) for hit in raw]\n self.log(self.ingest_data)\n except Exception as e:\n self.log(f\"search_documents error: {e}\")\n raise\n\n # -------- dynamic UI handling (auth switch) --------\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update component configuration based on field changes.\n\n This method handles real-time UI updates, particularly for authentication\n mode changes that show/hide relevant input fields.\n\n Args:\n build_config: Current component configuration\n field_value: New value for the changed field\n field_name: Name of the field that changed\n\n 
Returns:\n Updated build configuration with appropriate field visibility\n \"\"\"\n try:\n if field_name == \"auth_mode\":\n mode = (field_value or \"basic\").strip().lower()\n is_basic = mode == \"basic\"\n is_jwt = mode == \"jwt\"\n\n build_config[\"username\"][\"show\"] = is_basic\n build_config[\"password\"][\"show\"] = is_basic\n\n build_config[\"jwt_token\"][\"show\"] = is_jwt\n build_config[\"jwt_header\"][\"show\"] = is_jwt\n build_config[\"bearer_prefix\"][\"show\"] = is_jwt\n\n build_config[\"username\"][\"required\"] = is_basic\n build_config[\"password\"][\"required\"] = is_basic\n\n build_config[\"jwt_token\"][\"required\"] = is_jwt\n build_config[\"jwt_header\"][\"required\"] = is_jwt\n build_config[\"bearer_prefix\"][\"required\"] = False\n\n if is_basic:\n build_config[\"jwt_token\"][\"value\"] = \"\"\n\n return build_config\n\n except (KeyError, ValueError) as e:\n self.log(f\"update_build_config error: {e}\")\n\n return build_config\n" }, "docs_metadata": { "_input_type": "TableInput", @@ -805,7 +805,7 @@ "tool_mode": false, "trace_as_metadata": true, "type": "str", - "value": "nmslib" + "value": "jvector" }, "filter_expression": { "_input_type": "MultilineInput", @@ -3618,4 +3618,4 @@ "rag", "q-a" ] -} \ No newline at end of file +} diff --git a/frontend/components/ui/inputs/embedding-model.tsx b/frontend/components/ui/inputs/embedding-model.tsx index ded138ad..33bda350 100644 --- a/frontend/components/ui/inputs/embedding-model.tsx +++ b/frontend/components/ui/inputs/embedding-model.tsx @@ -34,20 +34,25 @@ export const EmbeddingModelInput = ({ modelsData, currentProvider = "openai", }: EmbeddingModelInputProps) => { + const isDisabled = Boolean(disabled); + const tooltipMessage = isDisabled + ? 
"Locked to keep embeddings consistent" + : "Choose the embedding model for ingest and retrieval"; + return ( - - + - Locked to keep embeddings consistent + {tooltipMessage} (); @@ -141,6 +148,15 @@ export const useGetSearchQuery = ( if (existing) { existing.chunks.push(chunk); existing.totalScore += chunk.score; + if (!existing.embedding_model && chunk.embedding_model) { + existing.embedding_model = chunk.embedding_model; + } + if ( + existing.embedding_dimensions == null && + typeof chunk.embedding_dimensions === "number" + ) { + existing.embedding_dimensions = chunk.embedding_dimensions; + } } else { fileMap.set(chunk.filename, { filename: chunk.filename, @@ -153,6 +169,8 @@ export const useGetSearchQuery = ( owner_email: chunk.owner_email, file_size: chunk.file_size, connector_type: chunk.connector_type, + embedding_model: chunk.embedding_model, + embedding_dimensions: chunk.embedding_dimensions, }); } }); @@ -168,6 +186,8 @@ export const useGetSearchQuery = ( owner_email: file.owner_email || "", size: file.file_size || 0, connector_type: file.connector_type || "local", + embedding_model: file.embedding_model, + embedding_dimensions: file.embedding_dimensions, chunks: file.chunks, })); diff --git a/frontend/src/app/api/queries/useGetTasksQuery.ts b/frontend/src/app/api/queries/useGetTasksQuery.ts index 1ea59d26..b8cdba01 100644 --- a/frontend/src/app/api/queries/useGetTasksQuery.ts +++ b/frontend/src/app/api/queries/useGetTasksQuery.ts @@ -4,6 +4,26 @@ import { useQueryClient, } from "@tanstack/react-query"; +export interface TaskFileEntry { + status?: + | "pending" + | "running" + | "processing" + | "completed" + | "failed" + | "error"; + result?: unknown; + error?: string; + retry_count?: number; + created_at?: string; + updated_at?: string; + duration_seconds?: number; + filename?: string; + embedding_model?: string; + embedding_dimensions?: number; + [key: string]: unknown; +} + export interface Task { task_id: string; status: @@ -24,7 +44,7 @@ export 
interface Task { duration_seconds?: number; result?: Record; error?: string; - files?: Record>; + files?: Record; } export interface TasksResponse { diff --git a/frontend/src/app/globals.css b/frontend/src/app/globals.css index 5b786b7b..d8460fd6 100644 --- a/frontend/src/app/globals.css +++ b/frontend/src/app/globals.css @@ -168,7 +168,7 @@ } .header-notifications { - @apply absolute right-[0px] top-[-4px] h-1 w-1 rounded-full bg-destructive; + @apply absolute right-1 top-1 h-2 w-2 rounded-full bg-destructive; } .header-menu-bar { diff --git a/frontend/src/app/knowledge/page.tsx b/frontend/src/app/knowledge/page.tsx index 334f8e6f..9cd28433 100644 --- a/frontend/src/app/knowledge/page.tsx +++ b/frontend/src/app/knowledge/page.tsx @@ -26,6 +26,14 @@ import GoogleDriveIcon from "../settings/icons/google-drive-icon"; import OneDriveIcon from "../settings/icons/one-drive-icon"; import SharePointIcon from "../settings/icons/share-point-icon"; import { KnowledgeSearchInput } from "@/components/knowledge-search-input"; +import { + Dialog, + DialogContent, + DialogDescription, + DialogHeader, + DialogTitle, + DialogTrigger, +} from "@/components/ui/dialog"; // Function to get the appropriate icon for a connector type function getSourceIcon(connectorType?: string) { @@ -77,6 +85,9 @@ function SearchPage() { size: taskFile.size, connector_type: taskFile.connector_type, status: taskFile.status, + error: taskFile.error, + embedding_model: taskFile.embedding_model, + embedding_dimensions: taskFile.embedding_dimensions, }; }); @@ -115,7 +126,7 @@ function SearchPage() { const gridRef = useRef(null); - const columnDefs = [ + const columnDefs: ColDef[] = [ { field: "filename", headerName: "Source", @@ -128,7 +139,6 @@ function SearchPage() { // Read status directly from data on each render const status = data?.status || "active"; const isActive = status === "active"; - console.log(data?.filename, status, "a"); return (
) => ( + + {data?.embedding_model || "—"} + + ), + }, + { + field: "embedding_dimensions", + headerName: "Dimensions", + width: 110, + cellRenderer: ({ data }: CustomCellRendererProps) => ( + + {typeof data?.embedding_dimensions === "number" + ? data.embedding_dimensions.toString() + : "—"} + + ), + }, { field: "status", headerName: "Status", cellRenderer: ({ data }: CustomCellRendererProps) => { - console.log(data?.filename, data?.status, "b"); - // Default to 'active' status if no status is provided const status = data?.status || "active"; + const error = + typeof data?.error === "string" && data.error.trim().length > 0 + ? data.error.trim() + : undefined; + if (status === "failed" && error) { + return ( + + + + + + + Ingestion failed + + {data?.filename || "Unknown file"} + + +
+ {error} +
+
+
+ ); + } return ; }, }, diff --git a/frontend/src/app/onboarding/components/advanced.tsx b/frontend/src/app/onboarding/components/advanced.tsx index 20764aed..5872cadf 100644 --- a/frontend/src/app/onboarding/components/advanced.tsx +++ b/frontend/src/app/onboarding/components/advanced.tsx @@ -77,7 +77,7 @@ export function AdvancedOnboarding({ {(hasLanguageModels || hasEmbeddingModels) && } diff --git a/frontend/src/app/settings/page.tsx b/frontend/src/app/settings/page.tsx index 148da3bd..728a7861 100644 --- a/frontend/src/app/settings/page.tsx +++ b/frontend/src/app/settings/page.tsx @@ -243,6 +243,8 @@ function KnowledgeSourcesPage() { updateFlowSettingMutation.mutate({ embedding_model: newModel }); }; + const isEmbeddingModelSelectDisabled = updateFlowSettingMutation.isPending; + // Update chunk size setting with debounce const handleChunkSizeChange = (value: string) => { const numValue = Math.max(0, parseInt(value) || 0); @@ -1029,8 +1031,7 @@ function KnowledgeSourcesPage() { label="Embedding model" > {}} + onValueChange={(value) => handleSettingsChange({ embeddingModel: value })} > - + - Locked to keep embeddings consistent + Choose the embedding model for this upload diff --git a/frontend/src/components/layout-wrapper.tsx b/frontend/src/components/layout-wrapper.tsx index fda7c181..d6061384 100644 --- a/frontend/src/components/layout-wrapper.tsx +++ b/frontend/src/components/layout-wrapper.tsx @@ -129,7 +129,7 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) { {/* Task Notification Bell */} -
- )} - - )} - {/* Cancel button for tasks without progress */} - {!formatTaskProgress(task) && (task.status === 'pending' || task.status === 'running' || task.status === 'processing') && ( - -
- -
-
- )} - - ))} + {showCancel && ( +
+ +
+ )} + + )} + + ) + })}
)} @@ -282,43 +274,47 @@ export function TaskNotificationMenu() { {isExpanded && (
- {recentTasks.map((task) => ( -
- {getTaskIcon(task.status)} -
-
- Task {task.task_id.substring(0, 8)}... -
-
- {formatRelativeTime(task.updated_at)} - {formatDuration(task.duration_seconds) && ( - - • {formatDuration(task.duration_seconds)} - - )} -
- {/* Show final results for completed tasks */} - {task.status === 'completed' && formatTaskProgress(task)?.detailed && ( -
- {formatTaskProgress(task)?.detailed.successful} success, {' '} - {formatTaskProgress(task)?.detailed.failed} failed - {(formatTaskProgress(task)?.detailed.running || 0) > 0 && ( - , {formatTaskProgress(task)?.detailed.running} running + {recentTasks.map((task) => { + const progress = formatTaskProgress(task) + + return ( +
+ {getTaskIcon(task.status)} +
+
+ Task {task.task_id.substring(0, 8)}... +
+
+ {formatRelativeTime(task.updated_at)} + {formatDuration(task.duration_seconds) && ( + + • {formatDuration(task.duration_seconds)} + )}
- )} - {task.status === 'failed' && task.error && ( -
- {task.error} -
- )} + {/* Show final results for completed tasks */} + {task.status === 'completed' && progress?.detailed && ( +
+ {progress.detailed.successful} success,{' '} + {progress.detailed.failed} failed + {(progress.detailed.running || 0) > 0 && ( + , {progress.detailed.running} running + )} +
+ )} + {task.status === 'failed' && task.error && ( +
+ {task.error} +
+ )} +
+ {getStatusBadge(task.status)}
- {getStatusBadge(task.status)} -
- ))} + ) + })}
)}
@@ -338,4 +334,4 @@ export function TaskNotificationMenu() {
) -} \ No newline at end of file +} diff --git a/frontend/src/contexts/task-context.tsx b/frontend/src/contexts/task-context.tsx index 9b3d9908..8bb2bb24 100644 --- a/frontend/src/contexts/task-context.tsx +++ b/frontend/src/contexts/task-context.tsx @@ -14,6 +14,7 @@ import { toast } from "sonner"; import { useCancelTaskMutation } from "@/app/api/mutations/useCancelTaskMutation"; import { type Task, + type TaskFileEntry, useGetTasksQuery, } from "@/app/api/queries/useGetTasksQuery"; import { useAuth } from "@/contexts/auth-context"; @@ -31,6 +32,9 @@ export interface TaskFile { task_id: string; created_at: string; updated_at: string; + error?: string; + embedding_model?: string; + embedding_dimensions?: number; } interface TaskContextType { tasks: Task[]; @@ -105,6 +109,9 @@ export function TaskProvider({ children }: { children: React.ReactNode }) { task_id: taskId, created_at: now, updated_at: now, + error: file.error, + embedding_model: file.embedding_model, + embedding_dimensions: file.embedding_dimensions, })); setFiles((prevFiles) => [...prevFiles, ...filesToAdd]); @@ -138,12 +145,13 @@ export function TaskProvider({ children }: { children: React.ReactNode }) { taskFileEntries.forEach(([filePath, fileInfo]) => { if (typeof fileInfo === "object" && fileInfo) { + const fileInfoEntry = fileInfo as TaskFileEntry; // Use the filename from backend if available, otherwise extract from path const fileName = - (fileInfo as any).filename || + fileInfoEntry.filename || filePath.split("/").pop() || filePath; - const fileStatus = fileInfo.status as string; + const fileStatus = fileInfoEntry.status ?? 
"processing"; // Map backend file status to our TaskFile status let mappedStatus: TaskFile["status"]; @@ -162,6 +170,23 @@ export function TaskProvider({ children }: { children: React.ReactNode }) { mappedStatus = "processing"; } + const fileError = (() => { + if ( + typeof fileInfoEntry.error === "string" && + fileInfoEntry.error.trim().length > 0 + ) { + return fileInfoEntry.error.trim(); + } + if ( + mappedStatus === "failed" && + typeof currentTask.error === "string" && + currentTask.error.trim().length > 0 + ) { + return currentTask.error.trim(); + } + return undefined; + })(); + setFiles((prevFiles) => { const existingFileIndex = prevFiles.findIndex( (f) => @@ -185,13 +210,22 @@ export function TaskProvider({ children }: { children: React.ReactNode }) { status: mappedStatus, task_id: currentTask.task_id, created_at: - typeof fileInfo.created_at === "string" - ? fileInfo.created_at + typeof fileInfoEntry.created_at === "string" + ? fileInfoEntry.created_at : now, updated_at: - typeof fileInfo.updated_at === "string" - ? fileInfo.updated_at + typeof fileInfoEntry.updated_at === "string" + ? fileInfoEntry.updated_at : now, + error: fileError, + embedding_model: + typeof fileInfoEntry.embedding_model === "string" + ? fileInfoEntry.embedding_model + : undefined, + embedding_dimensions: + typeof fileInfoEntry.embedding_dimensions === "number" + ? fileInfoEntry.embedding_dimensions + : undefined, }; if (existingFileIndex >= 0) { diff --git a/scripts/extract_flow_component.py b/scripts/extract_flow_component.py new file mode 100644 index 00000000..949bdae9 --- /dev/null +++ b/scripts/extract_flow_component.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +Extract embedded component code from a Langflow JSON flow. 
+ +Example: + python scripts/extract_flow_component.py \\ + --flow-file flows/ingestion_flow.json \\ + --display-name "OpenSearch (Multi-Model)" \\ + --output flows/components/opensearch_multimodel.py +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Optional + + +def should_select_component( + node: dict, + *, + display_name: Optional[str], + metadata_module: Optional[str], +) -> bool: + """Return True if the node matches the requested component filters.""" + node_data = node.get("data", {}) + component = node_data.get("node", {}) + + if display_name and component.get("display_name") != display_name: + return False + + if metadata_module: + metadata = component.get("metadata", {}) + if metadata.get("module") != metadata_module: + return False + + template = component.get("template", {}) + code_entry = template.get("code") + return isinstance(code_entry, dict) and "value" in code_entry + + +def extract_code_from_flow( + flow_path: Path, + *, + display_name: Optional[str], + metadata_module: Optional[str], + match_index: int, +) -> str: + """Fetch the embedded code string from the matching component node.""" + try: + flow_data = json.loads(flow_path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise SystemExit(f"[error] failed to parse {flow_path}: {exc}") from exc + + matches = [] + for node in flow_data.get("data", {}).get("nodes", []): + if should_select_component( + node, + display_name=display_name, + metadata_module=metadata_module, + ): + matches.append(node) + + if not matches: + raise SystemExit( + "[error] no component found matching the supplied filters " + f"in {flow_path}" + ) + + if match_index < 0 or match_index >= len(matches): + raise SystemExit( + f"[error] match index {match_index} out of range " + f"(found {len(matches)} matches)" + ) + + target = matches[match_index] + return target["data"]["node"]["template"]["code"]["value"] + + +def parse_args() -> 
argparse.Namespace: + parser = argparse.ArgumentParser( + description="Extract component code from a Langflow JSON flow." + ) + parser.add_argument( + "--flow-file", + required=True, + type=Path, + help="Path to the flow JSON file.", + ) + parser.add_argument( + "--display-name", + help="Component display_name to match (e.g. 'OpenSearch (Multi-Model)').", + ) + parser.add_argument( + "--metadata-module", + help="Component metadata.module value to match.", + ) + parser.add_argument( + "--match-index", + type=int, + default=0, + help="Index of the matched component when multiple exist (default: 0).", + ) + parser.add_argument( + "--output", + type=Path, + help="Destination file for the extracted code (stdout if omitted).", + ) + + args = parser.parse_args() + + if not args.display_name and not args.metadata_module: + # Offer an interactive selection of component display names + if not args.flow_file.exists(): + parser.error(f"Flow file not found: {args.flow_file}") + + try: + flow_data = json.loads(args.flow_file.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise SystemExit(f"[error] failed to parse {args.flow_file}: {exc}") from exc + + nodes = flow_data.get("data", {}).get("nodes", []) + display_names = sorted( + { + node.get("data", {}) + .get("node", {}) + .get("display_name", "") + for node in nodes + } + ) + + if not display_names: + parser.error( + "Unable to locate any components in the flow; supply --metadata-module instead." 
+ ) + + print("Select a component display name:") + for idx, name in enumerate(display_names): + print(f" [{idx}] {name}") + + while True: + choice = input(f"Enter choice (0-{len(display_names)-1}): ").strip() or "0" + if choice.isdigit(): + index = int(choice) + if 0 <= index < len(display_names): + args.display_name = display_names[index] + break + print("Invalid selection, please try again.") + + return args + + +def main() -> None: + args = parse_args() + + if not args.flow_file.exists(): + raise SystemExit(f"[error] flow file not found: {args.flow_file}") + + code = extract_code_from_flow( + args.flow_file, + display_name=args.display_name, + metadata_module=args.metadata_module, + match_index=args.match_index, + ) + + if args.output: + args.output.write_text(code, encoding="utf-8") + else: + print(code, end="") + + +if __name__ == "__main__": + main() diff --git a/scripts/migrate_embedding_model_field.py b/scripts/migrate_embedding_model_field.py new file mode 100644 index 00000000..d90f270e --- /dev/null +++ b/scripts/migrate_embedding_model_field.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python3 +""" +Migration script to migrate legacy embeddings to multi-model setup. + +This script migrates documents from the legacy single-field embedding system +to the new multi-model system with dynamic field names. 
+ +Legacy format: + { + "chunk_embedding": [0.1, 0.2, ...], + // no embedding_model field + } + +New format: + { + "chunk_embedding_text_embedding_3_small": [0.1, 0.2, ...], + "embedding_model": "text-embedding-3-small" + } + +Usage: + uv run python scripts/migrate_embedding_model_field.py --model + +Example: + uv run python scripts/migrate_embedding_model_field.py --model text-embedding-3-small + +Options: + --model MODEL The embedding model name to assign to legacy embeddings + (e.g., "text-embedding-3-small", "nomic-embed-text") + --batch-size SIZE Number of documents to process per batch (default: 100) + --dry-run Show what would be migrated without making changes + --index INDEX Index name (default: documents) + +What it does: + 1. Finds all documents with legacy "chunk_embedding" field but no "embedding_model" field + 2. For each document: + - Copies the vector from "chunk_embedding" to "chunk_embedding_{model_name}" + - Adds "embedding_model" field with the specified model name + - Optionally removes the legacy "chunk_embedding" field + 3. Uses bulk updates for efficiency + +Note: This script does NOT re-embed documents. It simply tags existing embeddings +with the model name you specify. Make sure to specify the correct model that was +actually used to create those embeddings. 
+""" +import asyncio +import sys +import os +import argparse +from typing import List, Dict, Any + +from opensearchpy import AsyncOpenSearch, helpers +from opensearchpy._async.http_aiohttp import AIOHttpConnection + +# Add src directory to path to import config +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) + +from config.settings import ( + OPENSEARCH_HOST, + OPENSEARCH_PORT, + OPENSEARCH_USERNAME, + OPENSEARCH_PASSWORD, + INDEX_NAME, +) +from utils.logging_config import get_logger +from utils.embedding_fields import get_embedding_field_name + +logger = get_logger(__name__) + + +async def ensure_new_field_exists( + client: AsyncOpenSearch, + index_name: str, + field_name: str, + dimensions: int +) -> None: + """Ensure the new embedding field exists in the index.""" + mapping = { + "properties": { + field_name: { + "type": "knn_vector", + "dimension": dimensions, + "method": { + "name": "disk_ann", + "engine": "jvector", + "space_type": "l2", + "parameters": {"ef_construction": 100, "m": 16}, + }, + }, + "embedding_model": { + "type": "keyword" + } + } + } + + try: + await client.indices.put_mapping(index=index_name, body=mapping) + logger.info(f"Ensured field exists: {field_name}") + except Exception as e: + error_msg = str(e).lower() + if "already" in error_msg or "exists" in error_msg: + logger.debug(f"Field already exists: {field_name}") + else: + logger.error(f"Failed to add field mapping: {e}") + raise + + +async def find_legacy_documents( + client: AsyncOpenSearch, + index_name: str, + batch_size: int = 100 +) -> List[Dict[str, Any]]: + """Find all documents with legacy chunk_embedding but no embedding_model field.""" + query = { + "query": { + "bool": { + "must": [ + {"exists": {"field": "chunk_embedding"}} + ], + "must_not": [ + {"exists": {"field": "embedding_model"}} + ] + } + }, + "size": batch_size, + "_source": True + } + + try: + response = await client.search(index=index_name, body=query, scroll='5m') + scroll_id = 
response['_scroll_id'] + hits = response['hits']['hits'] + + all_docs = hits + + # Continue scrolling until no more results + while len(hits) > 0: + response = await client.scroll(scroll_id=scroll_id, scroll='5m') + scroll_id = response['_scroll_id'] + hits = response['hits']['hits'] + all_docs.extend(hits) + + # Clean up scroll + await client.clear_scroll(scroll_id=scroll_id) + + return all_docs + except Exception as e: + logger.error(f"Error finding legacy documents: {e}") + raise + + +async def migrate_documents( + client: AsyncOpenSearch, + index_name: str, + documents: List[Dict[str, Any]], + model_name: str, + new_field_name: str, + dry_run: bool = False +) -> Dict[str, int]: + """Migrate legacy documents to new format.""" + if not documents: + return {"migrated": 0, "errors": 0} + + if dry_run: + logger.info(f"DRY RUN: Would migrate {len(documents)} documents") + for doc in documents[:5]: # Show first 5 as sample + doc_id = doc['_id'] + has_legacy = 'chunk_embedding' in doc['_source'] + logger.info(f" Document {doc_id}: has_legacy={has_legacy}") + if len(documents) > 5: + logger.info(f" ... 
and {len(documents) - 5} more documents") + return {"migrated": len(documents), "errors": 0} + + # Prepare bulk update actions + actions = [] + for doc in documents: + doc_id = doc['_id'] + source = doc['_source'] + + # Copy the legacy embedding to the new field + legacy_embedding = source.get('chunk_embedding') + if not legacy_embedding: + logger.warning(f"Document {doc_id} missing chunk_embedding, skipping") + continue + + # Build update document + update_doc = { + new_field_name: legacy_embedding, + "embedding_model": model_name + } + + action = { + "_op_type": "update", + "_index": index_name, + "_id": doc_id, + "doc": update_doc + } + actions.append(action) + + # Execute bulk update + migrated = 0 + errors = 0 + + try: + success, failed = await helpers.async_bulk( + client, + actions, + raise_on_error=False, + raise_on_exception=False + ) + migrated = success + errors = len(failed) if isinstance(failed, list) else 0 + + if errors > 0: + logger.error(f"Failed to migrate {errors} documents") + for failure in (failed if isinstance(failed, list) else [])[:5]: + logger.error(f" Error: {failure}") + + logger.info(f"Successfully migrated {migrated} documents") + except Exception as e: + logger.error(f"Bulk migration failed: {e}") + raise + + return {"migrated": migrated, "errors": errors} + + +async def migrate_legacy_embeddings( + model_name: str, + batch_size: int = 100, + dry_run: bool = False, + index_name: str = None +) -> bool: + """Main migration function.""" + if index_name is None: + index_name = INDEX_NAME + + new_field_name = get_embedding_field_name(model_name) + + logger.info("=" * 60) + logger.info("Legacy Embedding Migration") + logger.info("=" * 60) + logger.info(f"Index: {index_name}") + logger.info(f"Model: {model_name}") + logger.info(f"New field: {new_field_name}") + logger.info(f"Batch size: {batch_size}") + logger.info(f"Dry run: {dry_run}") + logger.info("=" * 60) + + # Create admin OpenSearch client + client = AsyncOpenSearch( + 
hosts=[{"host": OPENSEARCH_HOST, "port": OPENSEARCH_PORT}], + connection_class=AIOHttpConnection, + scheme="https", + use_ssl=True, + verify_certs=False, + ssl_assert_fingerprint=None, + http_auth=(OPENSEARCH_USERNAME, OPENSEARCH_PASSWORD), + http_compress=True, + ) + + try: + # Check if index exists + exists = await client.indices.exists(index=index_name) + if not exists: + logger.error(f"Index '{index_name}' does not exist") + return False + + # Find legacy documents + logger.info("Searching for legacy documents...") + legacy_docs = await find_legacy_documents(client, index_name, batch_size) + + if not legacy_docs: + logger.info("No legacy documents found. Migration not needed.") + return True + + logger.info(f"Found {len(legacy_docs)} legacy documents to migrate") + + # Get vector dimension from first document + first_doc = legacy_docs[0] + legacy_embedding = first_doc['_source'].get('chunk_embedding', []) + dimensions = len(legacy_embedding) + logger.info(f"Detected vector dimensions: {dimensions}") + + # Ensure new field exists + if not dry_run: + logger.info(f"Ensuring new field exists: {new_field_name}") + await ensure_new_field_exists(client, index_name, new_field_name, dimensions) + + # Migrate documents + logger.info("Starting migration...") + result = await migrate_documents( + client, + index_name, + legacy_docs, + model_name, + new_field_name, + dry_run + ) + + logger.info("=" * 60) + logger.info("Migration Summary") + logger.info("=" * 60) + logger.info(f"Total documents: {len(legacy_docs)}") + logger.info(f"Successfully migrated: {result['migrated']}") + logger.info(f"Errors: {result['errors']}") + logger.info("=" * 60) + + if result['errors'] > 0: + logger.warning("Migration completed with errors") + return False + + if dry_run: + logger.info("DRY RUN completed. 
No changes were made.") + logger.info(f"Run without --dry-run to perform the migration") + else: + logger.info("Migration completed successfully!") + + return True + + except Exception as e: + logger.error(f"Migration failed: {e}") + return False + finally: + await client.close() + + +def main(): + parser = argparse.ArgumentParser( + description="Migrate legacy embeddings to multi-model setup", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Dry run to see what would be migrated + uv run python scripts/migrate_embedding_model_field.py --model text-embedding-3-small --dry-run + + # Perform actual migration + uv run python scripts/migrate_embedding_model_field.py --model text-embedding-3-small + + # Migrate with custom batch size + uv run python scripts/migrate_embedding_model_field.py --model nomic-embed-text --batch-size 500 + """ + ) + + parser.add_argument( + '--model', + required=True, + help='Embedding model name to assign to legacy embeddings (e.g., "text-embedding-3-small")' + ) + parser.add_argument( + '--batch-size', + type=int, + default=100, + help='Number of documents to process per batch (default: 100)' + ) + parser.add_argument( + '--dry-run', + action='store_true', + help='Show what would be migrated without making changes' + ) + parser.add_argument( + '--index', + default=None, + help=f'Index name (default: {INDEX_NAME})' + ) + + args = parser.parse_args() + + # Run migration + success = asyncio.run(migrate_legacy_embeddings( + model_name=args.model, + batch_size=args.batch_size, + dry_run=args.dry_run, + index_name=args.index + )) + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/scripts/update_flow_components.py b/scripts/update_flow_components.py new file mode 100644 index 00000000..8aaafd3a --- /dev/null +++ b/scripts/update_flow_components.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Utility to sync embedded component code inside Langflow JSON files. 
+ +Given a Python source file (e.g. the OpenSearch component implementation) and +a target selector, this script updates every flow definition in ``./flows`` so +that the component's ``template.code.value`` matches the supplied file. + +Example: + python scripts/update_flow_components.py \\ + --code-file flows/components/opensearch_multimodel.py \\ + --display-name \"OpenSearch (Multi-Model)\" +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Iterable + + +def load_code(source_path: Path) -> str: + try: + return source_path.read_text(encoding="utf-8") + except FileNotFoundError as exc: + raise SystemExit(f"[error] code file not found: {source_path}") from exc + + +def should_update_component(node: dict, *, display_name: str | None, metadata_module: str | None) -> bool: + node_data = node.get("data", {}) + component = node_data.get("node", {}) + + if display_name and component.get("display_name") != display_name: + return False + + if metadata_module: + metadata = component.get("metadata", {}) + module_name = metadata.get("module") + if module_name != metadata_module: + return False + + template = component.get("template", {}) + code_entry = template.get("code") + return isinstance(code_entry, dict) and "value" in code_entry + + +def update_flow(flow_path: Path, code: str, *, display_name: str | None, metadata_module: str | None, dry_run: bool) -> bool: + with flow_path.open(encoding="utf-8") as fh: + try: + data = json.load(fh) + except json.JSONDecodeError as exc: + raise SystemExit(f"[error] failed to parse {flow_path}: {exc}") from exc + + changed = False + + for node in data.get("data", {}).get("nodes", []): + if not should_update_component(node, display_name=display_name, metadata_module=metadata_module): + continue + + template = node["data"]["node"]["template"] + if template["code"]["value"] != code: + if dry_run: + changed = True + else: + template["code"]["value"] = code + changed = 
True + + if changed and not dry_run: + flow_path.write_text( + json.dumps(data, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + return changed + + +def iter_flow_files(flows_dir: Path) -> Iterable[Path]: + for path in sorted(flows_dir.glob("*.json")): + if path.is_file(): + yield path + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Update embedded component code in Langflow JSON files.") + parser.add_argument("--code-file", required=True, type=Path, help="Path to the Python file containing the component code.") + parser.add_argument("--flows-dir", type=Path, default=Path("flows"), help="Directory containing Langflow JSON files.") + parser.add_argument("--display-name", help="Component display_name to match (e.g. 'OpenSearch (Multi-Model)').") + parser.add_argument("--metadata-module", help="Component metadata.module value to match.") + parser.add_argument("--dry-run", action="store_true", help="Report which files would change without modifying them.") + + args = parser.parse_args() + + if not args.display_name and not args.metadata_module: + parser.error("At least one of --display-name or --metadata-module must be provided.") + + return args + + +def main() -> None: + args = parse_args() + + flows_dir: Path = args.flows_dir + if not flows_dir.exists(): + raise SystemExit(f"[error] flows directory not found: {flows_dir}") + + code = load_code(args.code_file) + + updated_files = [] + for flow_path in iter_flow_files(flows_dir): + changed = update_flow( + flow_path, + code, + display_name=args.display_name, + metadata_module=args.metadata_module, + dry_run=args.dry_run, + ) + if changed: + updated_files.append(flow_path) + + if args.dry_run: + if updated_files: + print("[dry-run] files that would be updated:") + for path in updated_files: + print(f" - {path}") + else: + print("[dry-run] no files would change.") + else: + if updated_files: + print("Updated component code in:") + for path in updated_files: + 
print(f" - {path}") + else: + print("No updates were necessary.") + + +if __name__ == "__main__": + main() diff --git a/securityconfig/roles.yml b/securityconfig/roles.yml index 07532bfb..163bcb4c 100644 --- a/securityconfig/roles.yml +++ b/securityconfig/roles.yml @@ -21,10 +21,13 @@ openrag_user_role: allowed_actions: - crud - create_index + - indices:admin/mappings/get + - indices:admin/mappings/put + - indices:admin/exists + - indices:admin/get dls: > {"bool":{"should":[ {"term":{"owner":"${user.name}"}}, {"term":{"allowed_users":"${user.name}"}}, {"bool":{"must_not":{"exists":{"field":"owner"}}}} ],"minimum_should_match":1}} - diff --git a/src/api/settings.py b/src/api/settings.py index f60afe69..17c57a60 100644 --- a/src/api/settings.py +++ b/src/api/settings.py @@ -241,19 +241,49 @@ async def update_settings(request, session_manager): {"error": "embedding_model must be a non-empty string"}, status_code=400, ) - current_config.knowledge.embedding_model = body["embedding_model"].strip() + new_embedding_model = body["embedding_model"].strip() + current_config.knowledge.embedding_model = new_embedding_model config_updated = True # Also update the ingest flow with the new embedding model try: flows_service = _get_flows_service() await flows_service.update_ingest_flow_embedding_model( - body["embedding_model"].strip(), + new_embedding_model, current_config.provider.model_provider.lower() ) logger.info( f"Successfully updated ingest flow embedding model to '{body['embedding_model'].strip()}'" ) + + provider = ( + current_config.provider.model_provider.lower() + if current_config.provider.model_provider + else "openai" + ) + endpoint = current_config.provider.endpoint or None + llm_model = current_config.agent.llm_model + + change_result = await flows_service.change_langflow_model_value( + provider=provider, + embedding_model=new_embedding_model, + llm_model=llm_model, + endpoint=endpoint, + ) + + if not change_result.get("success", False): + logger.warning( + 
"Change embedding model across flows completed with issues", + provider=provider, + embedding_model=new_embedding_model, + change_result=change_result, + ) + else: + logger.info( + "Successfully updated embedding model across Langflow flows", + provider=provider, + embedding_model=new_embedding_model, + ) except Exception as e: logger.error(f"Failed to update ingest flow embedding model: {str(e)}") # Don't fail the entire settings update if flow update fails diff --git a/src/config/settings.py b/src/config/settings.py index 415516e8..53025937 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -107,6 +107,8 @@ INDEX_BODY = { "mimetype": {"type": "keyword"}, "page": {"type": "integer"}, "text": {"type": "text"}, + # Legacy field - kept for backward compatibility + # New documents will use chunk_embedding_{model_name} fields "chunk_embedding": { "type": "knn_vector", "dimension": VECTOR_DIM, @@ -117,6 +119,8 @@ INDEX_BODY = { "parameters": {"ef_construction": 100, "m": 16}, }, }, + # Track which embedding model was used for this chunk + "embedding_model": {"type": "keyword"}, "source_url": {"type": "keyword"}, "connector_type": {"type": "keyword"}, "owner": {"type": "keyword"}, @@ -322,7 +326,7 @@ class AppClients: # Initialize Langflow HTTP client self.langflow_http_client = httpx.AsyncClient( - base_url=LANGFLOW_URL, timeout=60.0 + base_url=LANGFLOW_URL, timeout=300.0 ) return self @@ -591,3 +595,8 @@ def get_knowledge_config(): def get_agent_config(): """Get agent configuration.""" return get_openrag_config().agent + + +def get_embedding_model() -> str: + """Return the currently configured embedding model.""" + return get_openrag_config().knowledge.embedding_model diff --git a/src/connectors/service.py b/src/connectors/service.py index 278743d3..96daaf77 100644 --- a/src/connectors/service.py +++ b/src/connectors/service.py @@ -271,6 +271,7 @@ class ConnectorService: # Create custom processor for connector files from models.processors import 
ConnectorFileProcessor + from services.document_service import DocumentService processor = ConnectorFileProcessor( self, @@ -280,6 +281,11 @@ class ConnectorService: jwt_token=jwt_token, owner_name=owner_name, owner_email=owner_email, + document_service=( + self.task_service.document_service + if self.task_service and self.task_service.document_service + else DocumentService(session_manager=self.session_manager) + ), ) # Use file IDs as items (no more fake file paths!) @@ -366,6 +372,7 @@ class ConnectorService: # Create custom processor for specific connector files from models.processors import ConnectorFileProcessor + from services.document_service import DocumentService # Use expanded_file_ids which has folders already expanded processor = ConnectorFileProcessor( @@ -376,6 +383,11 @@ class ConnectorService: jwt_token=jwt_token, owner_name=owner_name, owner_email=owner_email, + document_service=( + self.task_service.document_service + if self.task_service and self.task_service.document_service + else DocumentService(session_manager=self.session_manager) + ), ) # Create custom task using TaskService diff --git a/src/main.py b/src/main.py index b16a50d5..45cd05f9 100644 --- a/src/main.py +++ b/src/main.py @@ -53,11 +53,11 @@ from auth_middleware import optional_auth, require_auth # Configuration and setup from config.settings import ( DISABLE_INGEST_WITH_LANGFLOW, - EMBED_MODEL, INDEX_BODY, INDEX_NAME, SESSION_SECRET, clients, + get_embedding_model, is_no_auth_mode, get_openrag_config, ) @@ -505,7 +505,7 @@ async def initialize_services(): openrag_connector_service = ConnectorService( patched_async_client=clients.patched_async_client, process_pool=process_pool, - embed_model=EMBED_MODEL, + embed_model=get_embedding_model(), index_name=INDEX_NAME, task_service=task_service, session_manager=session_manager, @@ -567,18 +567,6 @@ async def create_app(): # Create route handlers with service dependencies injected routes = [ - # Upload endpoints - Route( - "/upload", - 
require_auth(services["session_manager"])( - partial( - upload.upload, - document_service=services["document_service"], - session_manager=services["session_manager"], - ) - ), - methods=["POST"], - ), # Langflow Files endpoints Route( "/langflow/files/upload", @@ -1228,4 +1216,4 @@ if __name__ == "__main__": host="0.0.0.0", port=8000, reload=False, # Disable reload since we're running from main - ) \ No newline at end of file + ) diff --git a/src/models/processors.py b/src/models/processors.py index 4a5d96b5..d6fb7a72 100644 --- a/src/models/processors.py +++ b/src/models/processors.py @@ -156,15 +156,24 @@ class TaskProcessor: owner_email: str = None, file_size: int = None, connector_type: str = "local", + embedding_model: str = None, ): """ Standard processing pipeline for non-Langflow processors: docling conversion + embeddings + OpenSearch indexing. + + Args: + embedding_model: Embedding model to use (defaults to the current + embedding model from settings) """ import datetime - from config.settings import INDEX_NAME, EMBED_MODEL, clients + from config.settings import INDEX_NAME, clients, get_embedding_model from services.document_service import chunk_texts_for_embeddings from utils.document_processing import extract_relevant + from utils.embedding_fields import get_embedding_field_name, ensure_embedding_field_exists + + # Use provided embedding model or fall back to default + embedding_model = embedding_model or get_embedding_model() # Get user's OpenSearch client with JWT for OIDC auth opensearch_client = self.document_service.session_manager.get_user_opensearch_client( @@ -175,6 +184,18 @@ class TaskProcessor: if await self.check_document_exists(file_hash, opensearch_client): return {"status": "unchanged", "id": file_hash} + # Ensure the embedding field exists for this model + embedding_field_name = await ensure_embedding_field_exists( + opensearch_client, embedding_model, INDEX_NAME + ) + + logger.info( + "Processing document with embedding model", + 
embedding_model=embedding_model, + embedding_field=embedding_field_name, + file_hash=file_hash, + ) + # Convert and extract result = clients.converter.convert(file_path) full_doc = result.document.export_to_dict() @@ -188,7 +209,7 @@ class TaskProcessor: for batch in text_batches: resp = await clients.patched_async_client.embeddings.create( - model=EMBED_MODEL, input=batch + model=embedding_model, input=batch ) embeddings.extend([d.embedding for d in resp.data]) @@ -202,7 +223,11 @@ class TaskProcessor: "mimetype": slim_doc["mimetype"], "page": chunk["page"], "text": chunk["text"], - "chunk_embedding": vect, + # Store embedding in model-specific field + embedding_field_name: vect, + # Track which model was used + "embedding_model": embedding_model, + "embedding_dimensions": len(vect), "file_size": file_size, "connector_type": connector_type, "indexed_time": datetime.datetime.now().isoformat(), @@ -331,8 +356,9 @@ class ConnectorFileProcessor(TaskProcessor): jwt_token: str = None, owner_name: str = None, owner_email: str = None, + document_service=None, ): - super().__init__() + super().__init__(document_service=document_service) self.connector_service = connector_service self.connection_id = connection_id self.files_to_process = files_to_process @@ -550,7 +576,7 @@ class S3FileProcessor(TaskProcessor): import time import asyncio import datetime - from config.settings import INDEX_NAME, EMBED_MODEL, clients + from config.settings import INDEX_NAME, clients, get_embedding_model from services.document_service import chunk_texts_for_embeddings from utils.document_processing import process_document_sync @@ -740,4 +766,4 @@ class LangflowFileProcessor(TaskProcessor): file_task.error_message = str(e) file_task.updated_at = time.time() upload_task.failed_files += 1 - raise \ No newline at end of file + raise diff --git a/src/services/document_service.py b/src/services/document_service.py index d596fb25..882b5eaf 100644 --- a/src/services/document_service.py +++ 
b/src/services/document_service.py @@ -12,12 +12,13 @@ from utils.logging_config import get_logger logger = get_logger(__name__) -from config.settings import clients, INDEX_NAME, EMBED_MODEL +from config.settings import clients, INDEX_NAME, get_embedding_model from utils.document_processing import extract_relevant, process_document_sync -def get_token_count(text: str, model: str = EMBED_MODEL) -> int: +def get_token_count(text: str, model: str = None) -> int: """Get accurate token count using tiktoken""" + model = model or get_embedding_model() try: encoding = tiktoken.encoding_for_model(model) return len(encoding.encode(text)) @@ -28,12 +29,14 @@ def get_token_count(text: str, model: str = EMBED_MODEL) -> int: def chunk_texts_for_embeddings( - texts: List[str], max_tokens: int = None, model: str = EMBED_MODEL + texts: List[str], max_tokens: int = None, model: str = None ) -> List[List[str]]: """ Split texts into batches that won't exceed token limits. If max_tokens is None, returns texts as single batch (no splitting). 
""" + model = model or get_embedding_model() + if max_tokens is None: return [texts] diff --git a/src/services/search_service.py b/src/services/search_service.py index 230c052f..8d12d375 100644 --- a/src/services/search_service.py +++ b/src/services/search_service.py @@ -1,27 +1,50 @@ +import copy from typing import Any, Dict from agentd.tool_decorator import tool -from config.settings import clients, INDEX_NAME, EMBED_MODEL +from config.settings import clients, INDEX_NAME, get_embedding_model from auth_context import get_auth_context from utils.logging_config import get_logger logger = get_logger(__name__) +MAX_EMBED_RETRIES = 3 +EMBED_RETRY_INITIAL_DELAY = 1.0 +EMBED_RETRY_MAX_DELAY = 8.0 + class SearchService: def __init__(self, session_manager=None): self.session_manager = session_manager @tool - async def search_tool(self, query: str) -> Dict[str, Any]: + async def search_tool(self, query: str, embedding_model: str = None) -> Dict[str, Any]: """ Use this tool to search for documents relevant to the query. Args: query (str): query string to search the corpus + embedding_model (str): Optional override for embedding model. + If not provided, uses the current embedding + model from configuration. Returns: dict (str, Any): {"results": [chunks]} on success """ + from utils.embedding_fields import get_embedding_field_name + + # Strategy: Use provided model, or default to the configured embedding + # model. This assumes documents are embedded with that model by default. 
+ # Future enhancement: Could auto-detect available models in corpus + embedding_model = embedding_model or get_embedding_model() + embedding_field_name = get_embedding_field_name(embedding_model) + + logger.info( + "Search with embedding model", + embedding_model=embedding_model, + embedding_field=embedding_field_name, + query_preview=query[:50] if query else None, + ) + # Get authentication context from the current async context user_id, jwt_token = get_auth_context() # Get search filters, limit, and score threshold from context @@ -37,40 +60,176 @@ class SearchService: # Detect wildcard request ("*") to return global facets/stats without semantic search is_wildcard_match_all = isinstance(query, str) and query.strip() == "*" - # Only embed when not doing match_all + # Get available embedding models from corpus + query_embeddings = {} + available_models = [] + + opensearch_client = self.session_manager.get_user_opensearch_client( + user_id, jwt_token + ) + if not is_wildcard_match_all: - resp = await clients.patched_async_client.embeddings.create( - model=EMBED_MODEL, input=[query] - ) - query_embedding = resp.data[0].embedding + # Build filter clauses first so we can use them in model detection + filter_clauses = [] + if filters: + # Map frontend filter names to backend field names + field_mapping = { + "data_sources": "filename", + "document_types": "mimetype", + "owners": "owner_name.keyword", + "connector_types": "connector_type", + } - # Build filter clauses - filter_clauses = [] - if filters: - # Map frontend filter names to backend field names - field_mapping = { - "data_sources": "filename", - "document_types": "mimetype", - "owners": "owner_name.keyword", - "connector_types": "connector_type", - } + for filter_key, values in filters.items(): + if values is not None and isinstance(values, list): + # Map frontend key to backend field name + field_name = field_mapping.get(filter_key, filter_key) - for filter_key, values in filters.items(): - if values is not 
None and isinstance(values, list): - # Map frontend key to backend field name - field_name = field_mapping.get(filter_key, filter_key) + if len(values) == 0: + # Empty array means "match nothing" - use impossible filter + filter_clauses.append( + {"term": {field_name: "__IMPOSSIBLE_VALUE__"}} + ) + elif len(values) == 1: + # Single value filter + filter_clauses.append({"term": {field_name: values[0]}}) + else: + # Multiple values filter + filter_clauses.append({"terms": {field_name: values}}) - if len(values) == 0: - # Empty array means "match nothing" - use impossible filter - filter_clauses.append( - {"term": {field_name: "__IMPOSSIBLE_VALUE__"}} + try: + # Build aggregation query with filters applied + agg_query = { + "size": 0, + "aggs": { + "embedding_models": { + "terms": { + "field": "embedding_model", + "size": 10 + } + } + } + } + + # Apply filters to model detection if any exist + if filter_clauses: + agg_query["query"] = { + "bool": { + "filter": filter_clauses + } + } + + agg_result = await opensearch_client.search( + index=INDEX_NAME, body=agg_query, params={"terminate_after": 0} + ) + buckets = agg_result.get("aggregations", {}).get("embedding_models", {}).get("buckets", []) + available_models = [b["key"] for b in buckets if b["key"]] + + if not available_models: + # Fallback to configured model if no documents indexed yet + available_models = [embedding_model] + + logger.info( + "Detected embedding models in corpus", + available_models=available_models, + model_counts={b["key"]: b["doc_count"] for b in buckets}, + with_filters=len(filter_clauses) > 0 + ) + except Exception as e: + logger.warning("Failed to detect embedding models, using configured model", error=str(e)) + available_models = [embedding_model] + + # Parallelize embedding generation for all models + import asyncio + + async def embed_with_model(model_name): + delay = EMBED_RETRY_INITIAL_DELAY + attempts = 0 + last_exception = None + + while attempts < MAX_EMBED_RETRIES: + attempts += 1 + 
try: + resp = await clients.patched_async_client.embeddings.create( + model=model_name, input=[query] ) - elif len(values) == 1: - # Single value filter - filter_clauses.append({"term": {field_name: values[0]}}) - else: - # Multiple values filter - filter_clauses.append({"terms": {field_name: values}}) + return model_name, resp.data[0].embedding + except Exception as e: + last_exception = e + if attempts >= MAX_EMBED_RETRIES: + logger.error( + "Failed to embed with model after retries", + model=model_name, + attempts=attempts, + error=str(e), + ) + raise RuntimeError( + f"Failed to embed with model {model_name}" + ) from e + + logger.warning( + "Retrying embedding generation", + model=model_name, + attempt=attempts, + max_attempts=MAX_EMBED_RETRIES, + error=str(e), + ) + await asyncio.sleep(delay) + delay = min(delay * 2, EMBED_RETRY_MAX_DELAY) + + # Should not reach here, but guard in case + raise RuntimeError( + f"Failed to embed with model {model_name}" + ) from last_exception + + # Run all embeddings in parallel + try: + embedding_results = await asyncio.gather( + *[embed_with_model(model) for model in available_models] + ) + except Exception as e: + logger.error("Embedding generation failed", error=str(e)) + raise + + # Collect successful embeddings + for result in embedding_results: + if isinstance(result, tuple) and result[1] is not None: + model_name, embedding = result + query_embeddings[model_name] = embedding + + logger.info( + "Generated query embeddings", + models=list(query_embeddings.keys()), + query_preview=query[:50] + ) + else: + # Wildcard query - no embedding needed + filter_clauses = [] + if filters: + # Map frontend filter names to backend field names + field_mapping = { + "data_sources": "filename", + "document_types": "mimetype", + "owners": "owner_name.keyword", + "connector_types": "connector_type", + } + + for filter_key, values in filters.items(): + if values is not None and isinstance(values, list): + # Map frontend key to backend field 
name + field_name = field_mapping.get(filter_key, filter_key) + + if len(values) == 0: + # Empty array means "match nothing" - use impossible filter + filter_clauses.append( + {"term": {field_name: "__IMPOSSIBLE_VALUE__"}} + ) + elif len(values) == 1: + # Single value filter + filter_clauses.append({"term": {field_name: values[0]}}) + else: + # Multiple values filter + filter_clauses.append({"terms": {field_name: values}}) # Build query body if is_wildcard_match_all: @@ -80,17 +239,51 @@ class SearchService: else: query_block = {"match_all": {}} else: + # Build multi-model KNN queries + knn_queries = [] + embedding_fields_to_check = [] + + for model_name, embedding_vector in query_embeddings.items(): + field_name = get_embedding_field_name(model_name) + embedding_fields_to_check.append(field_name) + knn_queries.append({ + "knn": { + field_name: { + "vector": embedding_vector, + "k": 50, + "num_candidates": 1000, + } + } + }) + + # Build exists filter - doc must have at least one embedding field + exists_any_embedding = { + "bool": { + "should": [{"exists": {"field": f}} for f in embedding_fields_to_check], + "minimum_should_match": 1 + } + } + + # Add exists filter to existing filters + all_filters = [*filter_clauses, exists_any_embedding] + + logger.debug( + "Building hybrid query with filters", + user_filters_count=len(filter_clauses), + total_filters_count=len(all_filters), + filter_types=[type(f).__name__ for f in all_filters] + ) + # Hybrid search query structure (semantic + keyword) + # Use dis_max to pick best score across multiple embedding fields query_block = { "bool": { "should": [ { - "knn": { - "chunk_embedding": { - "vector": query_embedding, - "k": 10, - "boost": 0.7, - } + "dis_max": { + "tie_breaker": 0.0, # Take only the best match, no blending + "boost": 0.7, # 70% weight for semantic search + "queries": knn_queries } }, { @@ -99,12 +292,12 @@ class SearchService: "fields": ["text^2", "filename^1.5"], "type": "best_fields", "fuzziness": "AUTO", - 
"boost": 0.3, + "boost": 0.3, # 30% weight for keyword search } }, ], "minimum_should_match": 1, - **({"filter": filter_clauses} if filter_clauses else {}), + "filter": all_filters, } } @@ -115,6 +308,7 @@ class SearchService: "document_types": {"terms": {"field": "mimetype", "size": 10}}, "owners": {"terms": {"field": "owner_name.keyword", "size": 10}}, "connector_types": {"terms": {"field": "connector_type", "size": 10}}, + "embedding_models": {"terms": {"field": "embedding_model", "size": 10}}, }, "_source": [ "filename", @@ -127,6 +321,8 @@ class SearchService: "owner_email", "file_size", "connector_type", + "embedding_model", # Include embedding model in results + "embedding_dimensions", "allowed_users", "allowed_groups", ], @@ -137,6 +333,23 @@ class SearchService: if not is_wildcard_match_all and score_threshold > 0: search_body["min_score"] = score_threshold + # Prepare fallback search body without num_candidates for clusters that don't support it + fallback_search_body = None + if not is_wildcard_match_all: + try: + fallback_search_body = copy.deepcopy(search_body) + knn_query_blocks = ( + fallback_search_body["query"]["bool"]["should"][0]["dis_max"]["queries"] + ) + for query_candidate in knn_query_blocks: + knn_section = query_candidate.get("knn") + if isinstance(knn_section, dict): + for params in knn_section.values(): + if isinstance(params, dict): + params.pop("num_candidates", None) + except (KeyError, IndexError, AttributeError, TypeError): + fallback_search_body = None + # Authentication required - DLS will handle document filtering automatically logger.debug( "search_service authentication info", @@ -152,8 +365,41 @@ class SearchService: user_id, jwt_token ) + from opensearchpy.exceptions import RequestError + + search_params = {"terminate_after": 0} + try: - results = await opensearch_client.search(index=INDEX_NAME, body=search_body) + results = await opensearch_client.search( + index=INDEX_NAME, body=search_body, params=search_params + ) + 
except RequestError as e: + error_message = str(e) + if ( + fallback_search_body is not None + and "unknown field [num_candidates]" in error_message.lower() + ): + logger.warning( + "OpenSearch cluster does not support num_candidates; retrying without it" + ) + try: + results = await opensearch_client.search( + index=INDEX_NAME, + body=fallback_search_body, + params=search_params, + ) + except RequestError as retry_error: + logger.error( + "OpenSearch retry without num_candidates failed", + error=str(retry_error), + search_body=fallback_search_body, + ) + raise + else: + logger.error( + "OpenSearch query failed", error=error_message, search_body=search_body + ) + raise except Exception as e: logger.error( "OpenSearch query failed", error=str(e), search_body=search_body @@ -177,6 +423,8 @@ class SearchService: "owner_email": hit["_source"].get("owner_email"), "file_size": hit["_source"].get("file_size"), "connector_type": hit["_source"].get("connector_type"), + "embedding_model": hit["_source"].get("embedding_model"), # Include in results + "embedding_dimensions": hit["_source"].get("embedding_dimensions"), } ) @@ -199,8 +447,14 @@ class SearchService: filters: Dict[str, Any] = None, limit: int = 10, score_threshold: float = 0, + embedding_model: str = None, ) -> Dict[str, Any]: - """Public search method for API endpoints""" + """Public search method for API endpoints + + Args: + embedding_model: Embedding model to use for search (defaults to the + currently configured embedding model) + """ # Set auth context if provided (for direct API calls) from config.settings import is_no_auth_mode @@ -220,4 +474,4 @@ class SearchService: set_search_limit(limit) set_score_threshold(score_threshold) - return await self.search_tool(query) + return await self.search_tool(query, embedding_model=embedding_model) diff --git a/src/tui/managers/container_manager.py b/src/tui/managers/container_manager.py index 41dd7178..bdf42865 100644 --- a/src/tui/managers/container_manager.py +++ 
b/src/tui/managers/container_manager.py @@ -505,36 +505,116 @@ class ContainerManager: digests[image] = stdout.strip().splitlines()[0] return digests - def _parse_compose_images(self) -> list[str]: - """Best-effort parse of image names from compose files without YAML dependency.""" + def _extract_images_from_compose_config(self, text: str, tried_json: bool) -> set[str]: + """ + Try JSON first (if we asked for it or it looks like JSON), then YAML if available. + Returns a set of image names. + """ images: set[str] = set() - for compose in [self.compose_file, self.cpu_compose_file]: + + # Try JSON parse + if tried_json or (text.lstrip().startswith("{") and text.rstrip().endswith("}")): try: - if not compose.exists(): + cfg = json.loads(text) + services = cfg.get("services", {}) + for _, svc in services.items(): + image = svc.get("image") + if image: + images.add(str(image)) + if images: + return images + except json.JSONDecodeError: + pass + + # Try YAML (if available) - import here to avoid hard dependency + try: + import yaml + cfg = yaml.safe_load(text) or {} + services = cfg.get("services", {}) + if isinstance(services, dict): + for _, svc in services.items(): + if isinstance(svc, dict): + image = svc.get("image") + if image: + images.add(str(image)) + if images: + return images + except Exception: + pass + + return images + + async def _parse_compose_images(self) -> list[str]: + """Get resolved image names from compose files using docker/podman compose, with robust fallbacks.""" + images: set[str] = set() + + compose_files = [self.compose_file, self.cpu_compose_file] + for compose_file in compose_files: + try: + if not compose_file or not compose_file.exists(): continue - for line in compose.read_text().splitlines(): - line = line.strip() - if not line or line.startswith("#"): + + cpu_mode = (compose_file == self.cpu_compose_file) + + # Try JSON format first + success, stdout, _ = await self._run_compose_command( + ["config", "--format", "json"], + 
cpu_mode=cpu_mode + ) + + if success and stdout.strip(): + from_cfg = self._extract_images_from_compose_config(stdout, tried_json=True) + if from_cfg: + images.update(from_cfg) + continue # this compose file succeeded; move to next file + + # Fallback to YAML output (for older compose versions) + success, stdout, _ = await self._run_compose_command( + ["config"], + cpu_mode=cpu_mode + ) + + if success and stdout.strip(): + from_cfg = self._extract_images_from_compose_config(stdout, tried_json=False) + if from_cfg: + images.update(from_cfg) continue - if line.startswith("image:"): - # image: repo/name:tag - val = line.split(":", 1)[1].strip() - # Remove quotes if present - if (val.startswith('"') and val.endswith('"')) or ( - val.startswith("'") and val.endswith("'") - ): - val = val[1:-1] - images.add(val) + except Exception: + # Keep behavior resilient—just continue to next file continue - return list(images) + + # Fallback: manual parsing if compose config didn't work + if not images: + for compose in compose_files: + try: + if not compose.exists(): + continue + for line in compose.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#"): + continue + if line.startswith("image:"): + # image: repo/name:tag + val = line.split(":", 1)[1].strip() + # Remove quotes if present + if (val.startswith('"') and val.endswith('"')) or ( + val.startswith("'") and val.endswith("'") + ): + val = val[1:-1] + if val: + images.add(val) + except Exception: + continue + + return sorted(images) async def get_project_images_info(self) -> list[tuple[str, str]]: """ Return list of (image, digest_or_id) for images referenced by compose files. If an image isn't present locally, returns '-' for its digest. 
""" - expected = self._parse_compose_images() + expected = await self._parse_compose_images() results: list[tuple[str, str]] = [] for image in expected: digest = "-" diff --git a/src/utils/embedding_fields.py b/src/utils/embedding_fields.py new file mode 100644 index 00000000..990cb116 --- /dev/null +++ b/src/utils/embedding_fields.py @@ -0,0 +1,178 @@ +""" +Utility functions for managing dynamic embedding field names in OpenSearch. + +This module provides helpers for: +- Normalizing embedding model names to valid OpenSearch field names +- Generating dynamic field names based on embedding models +- Ensuring embedding fields exist in the OpenSearch index +""" + +from typing import Dict, Any + +from utils.logging_config import get_logger + +logger = get_logger(__name__) + + +def normalize_model_name(model_name: str) -> str: + """ + Convert an embedding model name to a valid OpenSearch field suffix. + + Examples: + - "text-embedding-3-small" -> "text_embedding_3_small" + - "nomic-embed-text:latest" -> "nomic_embed_text_latest" + - "ibm/slate-125m-english-rtrvr" -> "ibm_slate_125m_english_rtrvr" + + Args: + model_name: The embedding model name (e.g., from OpenAI, Ollama, Watsonx) + + Returns: + Normalized string safe for use as OpenSearch field name suffix + """ + normalized = model_name.lower() + # Replace common separators with underscores + normalized = normalized.replace("-", "_") + normalized = normalized.replace(":", "_") + normalized = normalized.replace("/", "_") + normalized = normalized.replace(".", "_") + # Remove any other non-alphanumeric characters + normalized = "".join(c if c.isalnum() or c == "_" else "_" for c in normalized) + # Remove duplicate underscores + while "__" in normalized: + normalized = normalized.replace("__", "_") + # Remove leading/trailing underscores + normalized = normalized.strip("_") + + return normalized + + +def get_embedding_field_name(model_name: str) -> str: + """ + Get the OpenSearch field name for storing embeddings from a 
specific model. + + Args: + model_name: The embedding model name + + Returns: + Field name in format: chunk_embedding_{normalized_model_name} + + Examples: + >>> get_embedding_field_name("text-embedding-3-small") + 'chunk_embedding_text_embedding_3_small' + >>> get_embedding_field_name("nomic-embed-text") + 'chunk_embedding_nomic_embed_text' + """ + normalized = normalize_model_name(model_name) + return f"chunk_embedding_{normalized}" + + +async def ensure_embedding_field_exists( + opensearch_client, + model_name: str, + index_name: str = None, +) -> str: + """ + Ensure that an embedding field for the specified model exists in the OpenSearch index. + If the field doesn't exist, it will be added dynamically using PUT mapping API. + + Args: + opensearch_client: OpenSearch client instance + model_name: The embedding model name + index_name: OpenSearch index name (defaults to INDEX_NAME from settings) + + Returns: + The field name that was ensured to exist + + Raises: + Exception: If unable to add the field mapping + """ + from config.settings import INDEX_NAME + from utils.embeddings import get_embedding_dimensions + + if index_name is None: + index_name = INDEX_NAME + + field_name = get_embedding_field_name(model_name) + dimensions = await get_embedding_dimensions(model_name) + + logger.info( + "Ensuring embedding field exists", + field_name=field_name, + model_name=model_name, + dimensions=dimensions, + ) + + async def _get_field_definition() -> Dict[str, Any]: + try: + mapping = await opensearch_client.indices.get_mapping(index=index_name) + except Exception as e: + logger.debug( + "Failed to fetch mapping before ensuring embedding field", + index=index_name, + error=str(e), + ) + return {} + + properties = mapping.get(index_name, {}).get("mappings", {}).get("properties", {}) + return properties.get(field_name, {}) if isinstance(properties, dict) else {} + + existing_definition = await _get_field_definition() + if existing_definition: + if 
existing_definition.get("type") != "knn_vector": + raise RuntimeError( + f"Field '{field_name}' already exists with incompatible type '{existing_definition.get('type')}'" + ) + return field_name + + # Define the field mapping for both the vector field and the tracking field + mapping = { + "properties": { + field_name: { + "type": "knn_vector", + "dimension": dimensions, + "method": { + "name": "disk_ann", + "engine": "jvector", + "space_type": "l2", + "parameters": {"ef_construction": 100, "m": 16}, + }, + }, + # Also ensure the embedding_model tracking field exists as keyword + "embedding_model": { + "type": "keyword" + }, + "embedding_dimensions": { + "type": "integer" + }, + } + } + + try: + # Try to add the mapping + # OpenSearch will ignore if field already exists + await opensearch_client.indices.put_mapping( + index=index_name, + body=mapping + ) + logger.info( + "Successfully ensured embedding field exists", + field_name=field_name, + model_name=model_name, + ) + except Exception as e: + logger.error( + "Failed to add embedding field mapping", + field_name=field_name, + model_name=model_name, + error=str(e), + ) + raise + + # Verify mapping was applied correctly + new_definition = await _get_field_definition() + if new_definition.get("type") != "knn_vector": + raise RuntimeError( + f"Failed to ensure '{field_name}' is mapped as knn_vector. 
Current definition: {new_definition}" + ) + + return field_name diff --git a/src/utils/embeddings.py b/src/utils/embeddings.py index 46c53509..20acda9e 100644 --- a/src/utils/embeddings.py +++ b/src/utils/embeddings.py @@ -167,6 +167,8 @@ async def create_dynamic_index_body( "mimetype": {"type": "keyword"}, "page": {"type": "integer"}, "text": {"type": "text"}, + # Legacy field - kept for backward compatibility + # New documents will use chunk_embedding_{model_name} fields "chunk_embedding": { "type": "knn_vector", "dimension": dimensions, @@ -177,6 +179,9 @@ async def create_dynamic_index_body( "parameters": {"ef_construction": 100, "m": 16}, }, }, + # Track which embedding model was used for this chunk + "embedding_model": {"type": "keyword"}, + "embedding_dimensions": {"type": "integer"}, "source_url": {"type": "keyword"}, "connector_type": {"type": "keyword"}, "owner": {"type": "keyword"}, diff --git a/tests/conftest.py b/tests/conftest.py index 7c2ffc1d..eadca98b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,6 +20,55 @@ from src.session_manager import SessionManager from src.main import generate_jwt_keys +@pytest_asyncio.fixture(scope="session", autouse=True) +async def onboard_system(): + """Perform initial onboarding once for all tests in the session. + + This ensures the OpenRAG config is marked as edited and properly initialized + so that tests can use the /settings endpoint. 
+ """ + from pathlib import Path + + # Delete any existing config to ensure clean onboarding + config_file = Path("config/config.yaml") + if config_file.exists(): + config_file.unlink() + + # Initialize clients + await clients.initialize() + + # Create app and perform onboarding via API + from src.main import create_app, startup_tasks + import httpx + + app = await create_app() + await startup_tasks(app.state.services) + + transport = httpx.ASGITransport(app=app) + async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client: + onboarding_payload = { + "model_provider": "openai", + "embedding_model": "text-embedding-3-small", + "llm_model": "gpt-4o-mini", + "endpoint": "https://api.openai.com/v1", + "sample_data": False, + } + resp = await client.post("/onboarding", json=onboarding_payload) + if resp.status_code not in (200, 204): + # If it fails, it might already be onboarded, which is fine + print(f"[DEBUG] Onboarding returned {resp.status_code}: {resp.text}") + else: + print(f"[DEBUG] Session onboarding completed successfully") + + yield + + # Cleanup after all tests + try: + await clients.close() + except Exception: + pass + + @pytest.fixture(scope="session") def event_loop(): """Create an instance of the default event loop for the test session.""" diff --git a/tests/integration/test_api_endpoints.py b/tests/integration/test_api_endpoints.py index 869928fe..4dbe02fc 100644 --- a/tests/integration/test_api_endpoints.py +++ b/tests/integration/test_api_endpoints.py @@ -1,11 +1,43 @@ import asyncio import os +import subprocess from pathlib import Path import httpx import pytest +def dump_docker_logs(container_name_pattern: str = "langflow", tail: int = 100): + """Dump Docker container logs for debugging.""" + try: + # Find container ID by name pattern + find_cmd = ["docker", "ps", "-a", "--filter", f"name={container_name_pattern}", "--format", "{{.ID}}"] + result = subprocess.run(find_cmd, capture_output=True, text=True, timeout=5) + 
container_ids = result.stdout.strip().split('\n') + + if not container_ids or not container_ids[0]: + print(f"[DEBUG] No Docker containers found matching pattern: {container_name_pattern}") + return + + for container_id in container_ids: + if not container_id: + continue + print(f"\n{'='*80}") + print(f"[DEBUG] Docker logs for container {container_id} (last {tail} lines):") + print(f"{'='*80}") + + logs_cmd = ["docker", "logs", "--tail", str(tail), container_id] + logs_result = subprocess.run(logs_cmd, capture_output=True, text=True, timeout=10) + print(logs_result.stdout) + if logs_result.stderr: + print("[STDERR]:", logs_result.stderr) + print(f"{'='*80}\n") + except subprocess.TimeoutExpired: + print(f"[DEBUG] Timeout while fetching docker logs for {container_name_pattern}") + except Exception as e: + print(f"[DEBUG] Failed to fetch docker logs for {container_name_pattern}: {e}") + + async def wait_for_service_ready(client: httpx.AsyncClient, timeout_s: float = 30.0): """Poll existing endpoints until the app and OpenSearch are ready. 
@@ -160,11 +192,22 @@ async def test_upload_and_search_endpoint(tmp_path: Path, disable_langflow_inges "text/markdown", ) } - upload_resp = await client.post("/upload", files=files) + upload_resp = await client.post("/router/upload_ingest", files=files) body = upload_resp.json() - assert upload_resp.status_code == 201, upload_resp.text - assert body.get("status") in {"indexed", "unchanged"} - assert isinstance(body.get("id"), str) + assert upload_resp.status_code in (201, 202), upload_resp.text + + # Handle different response formats based on whether Langflow is used + if disable_langflow_ingest: + # Traditional OpenRAG response (201) + assert body.get("status") in {"indexed", "unchanged"} + assert isinstance(body.get("id"), str) + else: + # Langflow task response (202) + task_id = body.get("task_id") + assert isinstance(task_id, str) + assert body.get("file_count") == 1 + # Wait for task completion before searching + await _wait_for_task_completion(client, task_id) # Poll search for the specific content until it's indexed async def _wait_for_indexed(timeout_s: float = 30.0): @@ -204,6 +247,353 @@ async def test_upload_and_search_endpoint(tmp_path: Path, disable_langflow_inges pass +async def _wait_for_langflow_chat( + client: httpx.AsyncClient, payload: dict, timeout_s: float = 120.0 +) -> dict: + deadline = asyncio.get_event_loop().time() + timeout_s + last_payload = None + while asyncio.get_event_loop().time() < deadline: + resp = await client.post("/langflow", json=payload) + if resp.status_code == 200: + try: + data = resp.json() + except Exception: + last_payload = resp.text + else: + response_text = data.get("response") + if isinstance(response_text, str) and response_text.strip(): + return data + last_payload = data + else: + last_payload = resp.text + await asyncio.sleep(1.0) + + # Dump Langflow logs before raising error + print(f"\n[DEBUG] /langflow timed out. 
Dumping Langflow container logs...") + dump_docker_logs(container_name_pattern="langflow", tail=200) + raise AssertionError(f"/langflow never returned a usable response. Last payload: {last_payload}") + + +async def _wait_for_nudges( + client: httpx.AsyncClient, chat_id: str | None = None, timeout_s: float = 90.0 +) -> dict: + endpoint = "/nudges" if not chat_id else f"/nudges/{chat_id}" + deadline = asyncio.get_event_loop().time() + timeout_s + last_payload = None + while asyncio.get_event_loop().time() < deadline: + resp = await client.get(endpoint) + if resp.status_code == 200: + try: + data = resp.json() + except Exception: + last_payload = resp.text + else: + response_text = data.get("response") + if isinstance(response_text, str) and response_text.strip(): + return data + last_payload = data + else: + last_payload = resp.text + await asyncio.sleep(1.0) + + # Dump Langflow logs before raising error + print(f"\n[DEBUG] {endpoint} timed out. Dumping Langflow container logs...") + dump_docker_logs(container_name_pattern="langflow", tail=200) + raise AssertionError(f"{endpoint} never returned a usable response. Last payload: {last_payload}") + + +async def _wait_for_task_completion( + client: httpx.AsyncClient, task_id: str, timeout_s: float = 180.0 +) -> dict: + deadline = asyncio.get_event_loop().time() + timeout_s + last_payload = None + while asyncio.get_event_loop().time() < deadline: + resp = await client.get(f"/tasks/{task_id}") + if resp.status_code == 200: + try: + data = resp.json() + except Exception: + last_payload = resp.text + else: + status = (data.get("status") or "").lower() + if status == "completed": + return data + if status == "failed": + raise AssertionError(f"Task {task_id} failed: {data}") + last_payload = data + elif resp.status_code == 404: + last_payload = resp.text + else: + last_payload = resp.text + await asyncio.sleep(1.0) + raise AssertionError( + f"Task {task_id} did not complete in time. 
@pytest.mark.asyncio
@pytest.mark.skip
async def test_langflow_chat_and_nudges_endpoints(tmp_path: Path):
    """Exercise /langflow and /nudges endpoints against a live Langflow backend.

    Flow: configure models through ``/settings``, ingest one small seed
    document, request a chat completion from ``/langflow``, then fetch nudges
    both globally and (when a ``response_id`` was returned) for that thread.

    Args:
        tmp_path: pytest-provided temporary directory; holds the seed document
            so the test leaves no file behind in the working directory.
    """
    required_env = ["LANGFLOW_CHAT_FLOW_ID", "NUDGES_FLOW_ID"]
    missing = [var for var in required_env if not os.getenv(var)]
    assert not missing, f"Missing required Langflow configuration: {missing}"

    os.environ["DISABLE_INGEST_WITH_LANGFLOW"] = "true"
    os.environ["DISABLE_STARTUP_INGEST"] = "true"
    os.environ["GOOGLE_OAUTH_CLIENT_ID"] = ""
    os.environ["GOOGLE_OAUTH_CLIENT_SECRET"] = ""

    import sys

    # Drop cached modules so the imports below execute them again
    # (presumably so the environment overrides above take effect -- confirm).
    for mod in [
        "src.api.chat",
        "api.chat",
        "src.api.nudges",
        "api.nudges",
        "src.api.router",
        "api.router",
        "src.api.connector_router",
        "api.connector_router",
        "src.config.settings",
        "config.settings",
        "src.auth_middleware",
        "auth_middleware",
        "src.main",
        "api",
        "src.api",
        "services",
        "src.services",
        "services.search_service",
        "src.services.search_service",
        "services.chat_service",
        "src.services.chat_service",
    ]:
        sys.modules.pop(mod, None)

    from src.main import create_app, startup_tasks
    from src.config.settings import clients, LANGFLOW_CHAT_FLOW_ID, NUDGES_FLOW_ID

    assert LANGFLOW_CHAT_FLOW_ID, "LANGFLOW_CHAT_FLOW_ID must be configured for integration test"
    assert NUDGES_FLOW_ID, "NUDGES_FLOW_ID must be configured for integration test"

    # Everything that may hold client resources runs inside the try block so
    # that a failure during startup still reaches the cleanup in `finally`.
    try:
        await clients.initialize()
        app = await create_app()
        await startup_tasks(app.state.services)

        # Give the Langflow client up to 60 seconds to become available.
        loop = asyncio.get_running_loop()
        langflow_client = None
        deadline = loop.time() + 60.0
        while loop.time() < deadline:
            langflow_client = await clients.ensure_langflow_client()
            if langflow_client is not None:
                break
            await asyncio.sleep(1.0)
        assert langflow_client is not None, "Langflow client not initialized. Provide LANGFLOW_KEY or enable superuser auto-login."

        transport = httpx.ASGITransport(app=app)
        async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client:
            await wait_for_service_ready(client)

            # Ensure embedding model is configured via settings
            resp = await client.post(
                "/settings",
                json={
                    "embedding_model": "text-embedding-3-small",
                    "llm_model": "gpt-4o-mini",
                },
            )
            assert resp.status_code == 200, resp.text

            # Seed document is ingested before chat and nudges are requested.
            warmup_file = tmp_path / "nudges_seed.md"
            warmup_file.write_text(
                "The user may care about different fruits including apples, hardy kiwi, and bananas"
            )
            files = {
                "file": (
                    warmup_file.name,
                    warmup_file.read_bytes(),
                    "text/plain",
                )
            }
            upload_resp = await client.post("/router/upload_ingest", files=files)
            assert upload_resp.status_code in (201, 202), upload_resp.text
            payload = upload_resp.json()
            task_id = payload.get("task_id")
            if task_id:
                # A task id means ingestion is asynchronous; wait for it to finish.
                await _wait_for_task_completion(client, task_id)

            prompt = "Respond with a brief acknowledgement for the OpenRAG integration test."
            langflow_payload = {"prompt": prompt, "limit": 5, "scoreThreshold": 0}
            langflow_data = await _wait_for_langflow_chat(client, langflow_payload)

            assert isinstance(langflow_data.get("response"), str)
            assert langflow_data["response"].strip()

            response_id = langflow_data.get("response_id")

            nudges_data = await _wait_for_nudges(client)
            assert isinstance(nudges_data.get("response"), str)
            assert nudges_data["response"].strip()

            if response_id:
                nudges_thread_data = await _wait_for_nudges(client, response_id)
                assert isinstance(nudges_thread_data.get("response"), str)
                assert nudges_thread_data["response"].strip()
    finally:
        # Best-effort cleanup: a failing close must not hide the test outcome.
        try:
            await clients.close()
        except Exception:
            pass
@pytest.mark.skip
@pytest.mark.asyncio
async def test_search_multi_embedding_models(
    tmp_path: Path
):
    """Ensure /search fans out across multiple embedding models when present.

    Flow: recreate the index, ingest one document under
    ``text-embedding-3-small``, switch the embedding model to
    ``text-embedding-3-large`` through ``/settings``, ingest a second
    document, then check that both models appear in the ``/search``
    aggregations and results.

    Args:
        tmp_path: pytest-provided temporary directory for the uploaded documents.
    """
    os.environ["DISABLE_INGEST_WITH_LANGFLOW"] = "true"
    os.environ["DISABLE_STARTUP_INGEST"] = "true"
    os.environ["GOOGLE_OAUTH_CLIENT_ID"] = ""
    os.environ["GOOGLE_OAUTH_CLIENT_SECRET"] = ""

    import sys

    # Drop cached modules so the imports below execute them again
    # (presumably so the environment overrides above take effect -- confirm).
    for mod in [
        "src.api.router",
        "api.router",
        "src.api.connector_router",
        "api.connector_router",
        "src.config.settings",
        "config.settings",
        "src.auth_middleware",
        "auth_middleware",
        "src.main",
        "services.search_service",
        "src.services.search_service",
    ]:
        sys.modules.pop(mod, None)

    from src.main import create_app, startup_tasks
    from src.config.settings import clients, INDEX_NAME

    # Everything that may hold client resources runs inside the try block so
    # that a failure during startup still reaches the cleanup in `finally`.
    try:
        await clients.initialize()
        try:
            # Start from an empty index; a failing delete is ignored.
            await clients.opensearch.indices.delete(index=INDEX_NAME)
            await asyncio.sleep(1)
        except Exception:
            pass

        app = await create_app()
        await startup_tasks(app.state.services)

        from src.main import _ensure_opensearch_index

        await _ensure_opensearch_index()

        transport = httpx.ASGITransport(app=app)

        async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client:
            await wait_for_service_ready(client)

            async def _upload_doc(name: str, text: str) -> None:
                """Write ``text`` to ``tmp_path/name``, upload it, and wait for any ingest task."""
                file_path = tmp_path / name
                file_path.write_text(text)
                files = {
                    "file": (
                        name,
                        file_path.read_bytes(),
                        "text/markdown",
                    )
                }
                resp = await client.post("/router/upload_ingest", files=files)
                assert resp.status_code in (201, 202), resp.text
                payload = resp.json()
                task_id = payload.get("task_id")
                if task_id:
                    await _wait_for_task_completion(client, task_id)

            async def _wait_for_models(expected_models: set[str], query: str = "*"):
                """Poll /search for up to 60 s until every expected model has an aggregation bucket."""
                loop = asyncio.get_running_loop()
                deadline = loop.time() + 60.0
                last_payload = None
                while loop.time() < deadline:
                    resp = await client.post(
                        "/search",
                        json={"query": query, "limit": 0, "scoreThreshold": 0},
                    )
                    if resp.status_code != 200:
                        last_payload = resp.text
                        await asyncio.sleep(0.5)
                        continue
                    payload = resp.json()
                    buckets = (
                        payload.get("aggregations", {})
                        .get("embedding_models", {})
                        .get("buckets", [])
                    )
                    models = {b.get("key") for b in buckets if b.get("key")}
                    if expected_models <= models:
                        return payload
                    last_payload = payload
                    await asyncio.sleep(0.5)
                raise AssertionError(
                    f"Embedding models not detected. Last payload: {last_payload}"
                )

            # Start with explicit small embedding model
            resp = await client.post(
                "/settings",
                json={
                    "embedding_model": "text-embedding-3-small",
                    "llm_model": "gpt-4o-mini",
                },
            )
            assert resp.status_code == 200, resp.text

            # Ingest first document (small model)
            await _upload_doc("doc-small.md", "Physics basics and fundamental principles.")
            payload_small = await _wait_for_models({"text-embedding-3-small"})
            result_models_small = {
                r.get("embedding_model")
                for r in (payload_small.get("results") or [])
                if r.get("embedding_model")
            }
            # An empty result set is tolerated here; only a wrong model would fail.
            assert "text-embedding-3-small" in result_models_small or not result_models_small

            # Update embedding model via settings
            resp = await client.post(
                "/settings",
                json={"embedding_model": "text-embedding-3-large"},
            )
            assert resp.status_code == 200, resp.text

            # Ingest second document which should use the large embedding model
            await _upload_doc("doc-large.md", "Advanced physics covers quantum topics extensively.")

            payload = await _wait_for_models({"text-embedding-3-small", "text-embedding-3-large"})
            buckets = payload.get("aggregations", {}).get("embedding_models", {}).get("buckets", [])
            models = {b.get("key") for b in buckets}
            assert {"text-embedding-3-small", "text-embedding-3-large"} <= models

            result_models = {
                r.get("embedding_model")
                for r in (payload.get("results") or [])
                if r.get("embedding_model")
            }
            assert {"text-embedding-3-small", "text-embedding-3-large"} <= result_models
    finally:
        # Best-effort cleanup: a failing close must not hide the test outcome.
        try:
            await clients.close()
        except Exception:
            pass