Merge branch 'main' into prune-image-tui

Edwin Jose 2025-12-10 20:54:43 -05:00 committed by GitHub
commit 823e5925ee
14 changed files with 383 additions and 46 deletions


@@ -2,6 +2,14 @@
 # Set to true to disable Langflow ingestion and use traditional OpenRAG processor
 # If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
 DISABLE_INGEST_WITH_LANGFLOW=false
+
+# Langflow HTTP timeout configuration (in seconds)
+# For large documents (300+ pages), ingestion can take 30+ minutes
+# Increase these values if you experience timeouts with very large PDFs
+# Default: 2400 seconds (40 minutes) total timeout, 30 seconds connection timeout
+# LANGFLOW_TIMEOUT=2400
+# LANGFLOW_CONNECT_TIMEOUT=30
+
 # make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
 LANGFLOW_SECRET_KEY=

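As a rough guide for choosing these values, assuming ingestion time scales roughly linearly with page count (the only data point given above is ~30 minutes for a 300-page document, so treat the rate and the 2x safety margin below as assumptions), a small sizing helper might look like this:

# Back-of-the-envelope sizing for LANGFLOW_TIMEOUT.
# Assumption: ingestion time grows roughly linearly with page count,
# at about 30 minutes per 300 pages (the figure quoted above).
def suggested_langflow_timeout(pages: int, minutes_per_300_pages: float = 30.0,
                               safety_factor: float = 2.0) -> int:
    minutes = (pages / 300.0) * minutes_per_300_pages * safety_factor
    return max(2400, int(minutes * 60))  # never below the 40-minute default

print(suggested_langflow_timeout(300))   # 3600  -> LANGFLOW_TIMEOUT=3600
print(suggested_langflow_timeout(1000))  # 12000 -> LANGFLOW_TIMEOUT=12000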
.github/dependabot.yml (new file, +11 lines)

@ -0,0 +1,11 @@
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "monthly"
commit-message:
prefix: "build(deps):"
include: scope

.pre-commit-config.yaml (new file, +7 lines)

@@ -0,0 +1,7 @@
repos:
  - repo: https://github.com/Yelp/detect-secrets
    rev: v1.5.0
    hooks:
      - id: detect-secrets
        args: ["--baseline", ".secrets.baseline", "--exclude-lines", "code_hash"]

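To run this hook outside of a normal commit, or to refresh the baseline after intentional changes, one option is to drive the two CLIs from a short script (a sketch; it assumes pre-commit and detect-secrets are installed and uses only their documented commands):

# Sketch: run the detect-secrets hook and regenerate the baseline.
# Assumes the pre-commit and detect-secrets CLIs are installed.
import subprocess

# Run just the detect-secrets hook across the whole repository.
subprocess.run(["pre-commit", "run", "detect-secrets", "--all-files"], check=False)

# Rebuild the baseline, then review it with `detect-secrets audit .secrets.baseline`.
with open(".secrets.baseline", "w") as fh:
    subprocess.run(
        ["detect-secrets", "scan", "--exclude-lines", "code_hash"],
        stdout=fh,
        check=True,
    )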
.secrets.baseline (new file, +180 lines)

@@ -0,0 +1,180 @@
{
"version": "1.5.0",
"plugins_used": [
{
"name": "ArtifactoryDetector"
},
{
"name": "AWSKeyDetector"
},
{
"name": "AzureStorageKeyDetector"
},
{
"name": "Base64HighEntropyString",
"limit": 4.5
},
{
"name": "BasicAuthDetector"
},
{
"name": "CloudantDetector"
},
{
"name": "DiscordBotTokenDetector"
},
{
"name": "GitHubTokenDetector"
},
{
"name": "GitLabTokenDetector"
},
{
"name": "HexHighEntropyString",
"limit": 3.0
},
{
"name": "IbmCloudIamDetector"
},
{
"name": "IbmCosHmacDetector"
},
{
"name": "IPPublicDetector"
},
{
"name": "JwtTokenDetector"
},
{
"name": "KeywordDetector",
"keyword_exclude": ""
},
{
"name": "MailchimpDetector"
},
{
"name": "NpmDetector"
},
{
"name": "OpenAIDetector"
},
{
"name": "PrivateKeyDetector"
},
{
"name": "PypiTokenDetector"
},
{
"name": "SendGridDetector"
},
{
"name": "SlackDetector"
},
{
"name": "SoftlayerDetector"
},
{
"name": "SquareOAuthDetector"
},
{
"name": "StripeDetector"
},
{
"name": "TelegramBotTokenDetector"
},
{
"name": "TwilioKeyDetector"
}
],
"filters_used": [
{
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
},
{
"path": "detect_secrets.filters.common.is_baseline_file",
"filename": ".secrets.baseline"
},
{
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
"min_level": 2
},
{
"path": "detect_secrets.filters.heuristic.is_indirect_reference"
},
{
"path": "detect_secrets.filters.heuristic.is_likely_id_string"
},
{
"path": "detect_secrets.filters.heuristic.is_lock_file"
},
{
"path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
},
{
"path": "detect_secrets.filters.heuristic.is_potential_uuid"
},
{
"path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
},
{
"path": "detect_secrets.filters.heuristic.is_sequential_string"
},
{
"path": "detect_secrets.filters.heuristic.is_swagger_file"
},
{
"path": "detect_secrets.filters.heuristic.is_templated_secret"
},
{
"path": "detect_secrets.filters.regex.should_exclude_file",
"pattern": [
"flows/.*\\.json$"
]
},
{
"path": "detect_secrets.filters.regex.should_exclude_line",
"pattern": [
"code_hash"
]
}
],
"results": {
"docs/docs/_partial-integrate-chat.mdx": [
{
"type": "Secret Keyword",
"filename": "docs/docs/_partial-integrate-chat.mdx",
"hashed_secret": "e42fd8b9ad15d8fa5f4718cad7cf19b522807996",
"is_verified": false,
"line_number": 30
}
],
"src/main.py": [
{
"type": "Base64 High Entropy String",
"filename": "src/main.py",
"hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
"is_verified": false,
"line_number": 404
}
],
"src/models/processors.py": [
{
"type": "Base64 High Entropy String",
"filename": "src/models/processors.py",
"hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
"is_verified": false,
"line_number": 763
}
],
"src/services/langflow_file_service.py": [
{
"type": "Base64 High Entropy String",
"filename": "src/services/langflow_file_service.py",
"hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
"is_verified": false,
"line_number": 97
}
]
},
"generated_at": "2025-12-09T20:33:13Z"
}


@@ -4787,7 +4787,7 @@
   "is_component": false,
   "locked": true,
   "last_tested_version": "1.7.0.dev21",
-  "name": "OpenRAG OpenSearch Agent",
+  "name": "OpenRAG OpenSearch Agent Flow",
   "tags": [
     "assistants",
     "agents"


@@ -4114,7 +4114,7 @@
   "is_component": false,
   "locked": true,
   "last_tested_version": "1.7.0.dev21",
-  "name": "OpenRAG OpenSearch Nudges",
+  "name": "OpenRAG OpenSearch Nudges Flow",
   "tags": [
     "assistants",
     "agents"


@@ -15,6 +15,7 @@ import {
 } from "@/components/provider-health-banner";
 import { TaskNotificationMenu } from "@/components/task-notification-menu";
 import { useAuth } from "@/contexts/auth-context";
+import { useChat } from "@/contexts/chat-context";
 import { useKnowledgeFilter } from "@/contexts/knowledge-filter-context";
 import { useTask } from "@/contexts/task-context";
 import { cn } from "@/lib/utils";
@@ -27,6 +28,7 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) {
   const { isMenuOpen } = useTask();
   const { isPanelOpen } = useKnowledgeFilter();
   const { isLoading, isAuthenticated, isNoAuthMode } = useAuth();
+  const { isOnboardingComplete } = useChat();

   // List of paths that should not show navigation
   const authPaths = ["/login", "/auth/callback"];
@@ -91,17 +93,17 @@
isOpen={isDoclingUnhealthy}
className="w-full"
>
<DoclingHealthBanner />
<DoclingHealthBanner />
</AnimatedConditional>
{settings?.edited && isOnboardingComplete && (
<AnimatedConditional
vertical
isOpen={isProviderUnhealthy}
className="w-full"
>
<ProviderHealthBanner />
</AnimatedConditional>
{settings?.edited && (
<AnimatedConditional
vertical
isOpen={isProviderUnhealthy}
className="w-full"
>
<ProviderHealthBanner />
</AnimatedConditional>
)}
)}
</div>
<ChatRenderer settings={settings}>{children}</ChatRenderer>


@@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
     "DISABLE_INGEST_WITH_LANGFLOW", "false"
 ).lower() in ("true", "1", "yes")
 
+# Langflow HTTP timeout configuration (in seconds)
+# For large documents (300+ pages), ingestion can take 30+ minutes
+# Default: 40 minutes total, 40 minutes read timeout
+LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400"))  # 40 minutes
+LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30"))  # 30 seconds
+
 
 def is_no_auth_mode():
     """Check if we're running in no-auth mode (OAuth credentials missing)"""
@@ -317,9 +323,22 @@ class AppClients:
         # Initialize document converter
         self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
 
-        # Initialize Langflow HTTP client
+        # Initialize Langflow HTTP client with extended timeouts for large documents
+        # Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
         self.langflow_http_client = httpx.AsyncClient(
-            base_url=LANGFLOW_URL, timeout=1200.0
+            base_url=LANGFLOW_URL,
+            timeout=httpx.Timeout(
+                timeout=LANGFLOW_TIMEOUT,  # Total timeout
+                connect=LANGFLOW_CONNECT_TIMEOUT,  # Connection timeout
+                read=LANGFLOW_TIMEOUT,  # Read timeout (most important for large PDFs)
+                write=LANGFLOW_CONNECT_TIMEOUT,  # Write timeout
+                pool=LANGFLOW_CONNECT_TIMEOUT,  # Pool timeout
+            ),
         )
+        logger.info(
+            "Initialized Langflow HTTP client with extended timeouts",
+            timeout_seconds=LANGFLOW_TIMEOUT,
+            connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
+        )
 
         return self

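For code that calls this client, the practical effect is that a slow ingestion request is now bounded by LANGFLOW_TIMEOUT instead of the previous hard-coded 1200 seconds. A minimal sketch of what handling that looks like (the endpoint path and payload are illustrative placeholders, not taken from this diff):

# Minimal sketch of a long-running call through the shared client.
# The /api/v1/run/{flow_id} path and payload are illustrative placeholders.
import httpx

async def run_ingest_flow(client: httpx.AsyncClient, flow_id: str, payload: dict) -> dict:
    try:
        # The read timeout (LANGFLOW_TIMEOUT) is what matters while Langflow
        # works through a large PDF before returning a response.
        resp = await client.post(f"/api/v1/run/{flow_id}", json=payload)
        resp.raise_for_status()
        return resp.json()
    except httpx.ReadTimeout:
        # Raised once LANGFLOW_TIMEOUT elapses without a response.
        raise RuntimeError("Langflow ingestion exceeded LANGFLOW_TIMEOUT") from None
    except httpx.ConnectTimeout:
        # Raised if the connection itself takes longer than LANGFLOW_CONNECT_TIMEOUT.
        raise RuntimeError("Langflow unreachable within LANGFLOW_CONNECT_TIMEOUT") from None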

@@ -15,7 +15,7 @@ class OneDriveConnector(BaseConnector):
     # Required BaseConnector class attributes
     CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
-    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret
 
     # Connector metadata
     CONNECTOR_NAME = "OneDrive"


@@ -16,7 +16,7 @@ class SharePointConnector(BaseConnector):
     # Required BaseConnector class attributes
     CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
-    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret
 
     # Connector metadata
     CONNECTOR_NAME = "SharePoint"


@@ -197,10 +197,27 @@ class TaskProcessor:
                 file_hash=file_hash,
             )
 
-            # Convert and extract
-            result = clients.converter.convert(file_path)
-            full_doc = result.document.export_to_dict()
-            slim_doc = extract_relevant(full_doc)
+            # Check if this is a .txt file - use simple processing instead of docling
+            import os
+
+            file_ext = os.path.splitext(file_path)[1].lower()
+
+            if file_ext == '.txt':
+                # Simple text file processing without docling
+                from utils.document_processing import process_text_file
+
+                logger.info(
+                    "Processing as plain text file (bypassing docling)",
+                    file_path=file_path,
+                    file_hash=file_hash,
+                )
+                slim_doc = process_text_file(file_path)
+                # Override filename with original_filename if provided
+                if original_filename:
+                    slim_doc["filename"] = original_filename
+            else:
+                # Convert and extract using docling for other file types
+                result = clients.converter.convert(file_path)
+                full_doc = result.document.export_to_dict()
+                slim_doc = extract_relevant(full_doc)
 
             texts = [c["text"] for c in slim_doc["chunks"]]

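Both branches are expected to hand back the same slim_doc shape, which is what the texts = [...] context line above relies on. Roughly (key names taken from process_text_file later in this diff, values illustrative):

# Common shape produced by both the docling path (extract_relevant) and the
# plain-text path (process_text_file); values here are illustrative only.
slim_doc = {
    "id": "<file hash>",
    "filename": "notes.txt",
    "mimetype": "text/plain",
    "chunks": [
        {"page": 1, "type": "text", "text": "First ~1000 characters of text..."},
        {"page": 2, "type": "text", "text": "Next group of paragraphs..."},
    ],
}
texts = [c["text"] for c in slim_doc["chunks"]]  # as in the context line above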

@@ -181,6 +181,7 @@ class DocumentService:
     async def process_upload_context(self, upload_file, filename: str = None):
         """Process uploaded file and return content for context"""
         import io
+        import os
 
         if not filename:
             filename = upload_file.filename or "uploaded_document"
@@ -194,22 +195,37 @@
             content.write(chunk)
         content.seek(0)  # Reset to beginning for reading
 
-        # Create DocumentStream and process with docling
-        doc_stream = DocumentStream(name=filename, stream=content)
-        result = clients.converter.convert(doc_stream)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing
+        file_ext = os.path.splitext(filename)[1].lower()
+
+        if file_ext == '.txt':
+            # Simple text file processing for chat context
+            text_content = content.read().decode('utf-8', errors='replace')
+            # For context, we don't need to chunk - just return the full content
+            return {
+                "filename": filename,
+                "content": text_content,
+                "pages": 1,  # Text files don't have pages
+                "content_length": len(text_content),
+            }
+        else:
+            # Create DocumentStream and process with docling
+            doc_stream = DocumentStream(name=filename, stream=content)
+            result = clients.converter.convert(doc_stream)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)
 
-        # Extract all text content
-        all_text = []
-        for chunk in slim_doc["chunks"]:
-            all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
+            # Extract all text content
+            all_text = []
+            for chunk in slim_doc["chunks"]:
+                all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
 
-        full_content = "\n\n".join(all_text)
+            full_content = "\n\n".join(all_text)
 
-        return {
-            "filename": filename,
-            "content": full_content,
-            "pages": len(slim_doc["chunks"]),
-            "content_length": len(full_content),
-        }
+            return {
+                "filename": filename,
+                "content": full_content,
+                "pages": len(slim_doc["chunks"]),
+                "content_length": len(full_content),
+            }

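The return value keeps the same keys on both branches, so downstream chat-context code does not need to care which path ran. Illustrative payloads (values are made up to show the shape only):

# Illustrative payloads returned by process_upload_context.
txt_context = {
    "filename": "notes.txt",
    "content": "raw file text ...",    # full decoded text, no chunking
    "pages": 1,                         # .txt files are treated as one page
    "content_length": 17,
}
pdf_context = {
    "filename": "report.pdf",
    "content": "Page 1:\n...\n\nPage 2:\n...",
    "pages": 12,                        # number of docling chunks, not physical pages
    "content_length": 48210,            # len(full_content)
}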

@@ -123,28 +123,29 @@ class EnvManager:
         import os
 
         # Map env vars to config attributes
-        attr_map = {
-            "OPENAI_API_KEY": "openai_api_key",
-            "ANTHROPIC_API_KEY": "anthropic_api_key",
+        # These are environment variable names, not actual secrets
+        attr_map = {  # pragma: allowlist secret
+            "OPENAI_API_KEY": "openai_api_key",  # pragma: allowlist secret
+            "ANTHROPIC_API_KEY": "anthropic_api_key",  # pragma: allowlist secret
             "OLLAMA_ENDPOINT": "ollama_endpoint",
-            "WATSONX_API_KEY": "watsonx_api_key",
+            "WATSONX_API_KEY": "watsonx_api_key",  # pragma: allowlist secret
             "WATSONX_ENDPOINT": "watsonx_endpoint",
             "WATSONX_PROJECT_ID": "watsonx_project_id",
-            "OPENSEARCH_PASSWORD": "opensearch_password",
-            "LANGFLOW_SECRET_KEY": "langflow_secret_key",
+            "OPENSEARCH_PASSWORD": "opensearch_password",  # pragma: allowlist secret
+            "LANGFLOW_SECRET_KEY": "langflow_secret_key",  # pragma: allowlist secret
             "LANGFLOW_SUPERUSER": "langflow_superuser",
-            "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",
+            "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",  # pragma: allowlist secret
             "LANGFLOW_CHAT_FLOW_ID": "langflow_chat_flow_id",
             "LANGFLOW_INGEST_FLOW_ID": "langflow_ingest_flow_id",
             "LANGFLOW_URL_INGEST_FLOW_ID": "langflow_url_ingest_flow_id",
             "NUDGES_FLOW_ID": "nudges_flow_id",
             "GOOGLE_OAUTH_CLIENT_ID": "google_oauth_client_id",
-            "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",
+            "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",  # pragma: allowlist secret
             "MICROSOFT_GRAPH_OAUTH_CLIENT_ID": "microsoft_graph_oauth_client_id",
-            "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",
+            "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",  # pragma: allowlist secret
             "WEBHOOK_BASE_URL": "webhook_base_url",
             "AWS_ACCESS_KEY_ID": "aws_access_key_id",
-            "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
+            "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",  # pragma: allowlist secret
             "LANGFLOW_PUBLIC_URL": "langflow_public_url",
             "OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths",
             "OPENSEARCH_DATA_PATH": "opensearch_data_path",


@@ -119,6 +119,82 @@ def get_worker_converter():
     return _worker_converter
 
 
+def process_text_file(file_path: str) -> dict:
+    """
+    Process a plain text file without using docling.
+    Returns the same structure as extract_relevant() for consistency.
+
+    Args:
+        file_path: Path to the .txt file
+
+    Returns:
+        dict with keys: id, filename, mimetype, chunks
+    """
+    import os
+
+    from utils.hash_utils import hash_id
+
+    # Read the file
+    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+        content = f.read()
+
+    # Compute hash
+    file_hash = hash_id(file_path)
+    filename = os.path.basename(file_path)
+
+    # Split content into chunks of ~1000 characters to match typical docling chunk sizes
+    # This ensures embeddings stay within reasonable token limits
+    chunk_size = 1000
+    chunks = []
+
+    # Split by paragraphs first (double newline)
+    paragraphs = content.split('\n\n')
+
+    current_chunk = ""
+    chunk_index = 0
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # If adding this paragraph would exceed chunk size, save current chunk
+        if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
+            chunks.append({
+                "page": chunk_index + 1,  # Use chunk_index + 1 as "page" number
+                "type": "text",
+                "text": current_chunk.strip()
+            })
+            chunk_index += 1
+            current_chunk = para
+        else:
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+
+    # Add the last chunk if any
+    if current_chunk.strip():
+        chunks.append({
+            "page": chunk_index + 1,
+            "type": "text",
+            "text": current_chunk.strip()
+        })
+
+    # If no chunks were created (empty file), create a single empty chunk
+    if not chunks:
+        chunks.append({
+            "page": 1,
+            "type": "text",
+            "text": ""
+        })
+
+    return {
+        "id": file_hash,
+        "filename": filename,
+        "mimetype": "text/plain",
+        "chunks": chunks,
+    }
+
+
 def extract_relevant(doc_dict: dict) -> dict:
     """
     Given the full export_to_dict() result:
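A small usage sketch for process_text_file (it assumes the module is importable as utils.document_processing, mirroring the import in the task-processor hunk above):

# Usage sketch: chunking behaviour of process_text_file on a throwaway file.
import tempfile

from utils.document_processing import process_text_file

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write("First paragraph.\n\nSecond paragraph.\n\n" + "x" * 1500)
    path = f.name

doc = process_text_file(path)
print(doc["mimetype"])            # text/plain
print(len(doc["chunks"]))         # 2 -- a single oversized paragraph is not split further
print(doc["chunks"][0]["page"])   # 1 (chunk_index + 1 stands in for a page number)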