Merge branch 'main' into prune-image-tui

2025-12-10 20:54:43 -05:00 · 2025-12-10 20:54:43 -05:00 · 823e5925ee
commit 823e5925ee
parent 7390e515a9 a78926dd37
14 changed files with 383 additions and 46 deletions
--- a/.env.example
+++ b/.env.example
@ -2,6 +2,14 @@
 # Set to true to disable Langflow ingestion and use traditional OpenRAG processor
 # If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
 DISABLE_INGEST_WITH_LANGFLOW=false
 # Langflow HTTP timeout configuration (in seconds)
 # For large documents (300+ pages), ingestion can take 30+ minutes
 # Increase these values if you experience timeouts with very large PDFs
 # Defaults: LANGFLOW_TIMEOUT=2400 (40-minute read timeout), LANGFLOW_CONNECT_TIMEOUT=30 (30-second connect, write, and pool timeout)
 # LANGFLOW_TIMEOUT=2400
 # LANGFLOW_CONNECT_TIMEOUT=30
 # Generate one as described at https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
 LANGFLOW_SECRET_KEY=
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -0,0 +1,11 @@
 version: 2
 updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "monthly"
    commit-message:
      prefix: "build(deps):"
      include: scope
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,7 @@
 repos:
  - repo: https://github.com/Yelp/detect-secrets
    rev: v1.5.0
    hooks:
      - id: detect-secrets
        args: ["--baseline", ".secrets.baseline", "--exclude-lines", "code_hash"]
--- a/.secrets.baseline
+++ b/.secrets.baseline
@ -0,0 +1,180 @@
 {
  "version": "1.5.0",
  "plugins_used": [
    {
      "name": "ArtifactoryDetector"
    },
    {
      "name": "AWSKeyDetector"
    },
    {
      "name": "AzureStorageKeyDetector"
    },
    {
      "name": "Base64HighEntropyString",
      "limit": 4.5
    },
    {
      "name": "BasicAuthDetector"
    },
    {
      "name": "CloudantDetector"
    },
    {
      "name": "DiscordBotTokenDetector"
    },
    {
      "name": "GitHubTokenDetector"
    },
    {
      "name": "GitLabTokenDetector"
    },
    {
      "name": "HexHighEntropyString",
      "limit": 3.0
    },
    {
      "name": "IbmCloudIamDetector"
    },
    {
      "name": "IbmCosHmacDetector"
    },
    {
      "name": "IPPublicDetector"
    },
    {
      "name": "JwtTokenDetector"
    },
    {
      "name": "KeywordDetector",
      "keyword_exclude": ""
    },
    {
      "name": "MailchimpDetector"
    },
    {
      "name": "NpmDetector"
    },
    {
      "name": "OpenAIDetector"
    },
    {
      "name": "PrivateKeyDetector"
    },
    {
      "name": "PypiTokenDetector"
    },
    {
      "name": "SendGridDetector"
    },
    {
      "name": "SlackDetector"
    },
    {
      "name": "SoftlayerDetector"
    },
    {
      "name": "SquareOAuthDetector"
    },
    {
      "name": "StripeDetector"
    },
    {
      "name": "TelegramBotTokenDetector"
    },
    {
      "name": "TwilioKeyDetector"
    }
  ],
  "filters_used": [
    {
      "path": "detect_secrets.filters.allowlist.is_line_allowlisted"
    },
    {
      "path": "detect_secrets.filters.common.is_baseline_file",
      "filename": ".secrets.baseline"
    },
    {
      "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
      "min_level": 2
    },
    {
      "path": "detect_secrets.filters.heuristic.is_indirect_reference"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_likely_id_string"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_lock_file"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_potential_uuid"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_sequential_string"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_swagger_file"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_templated_secret"
    },
    {
      "path": "detect_secrets.filters.regex.should_exclude_file",
      "pattern": [
        "flows/.*\\.json$"
      ]
    },
    {
      "path": "detect_secrets.filters.regex.should_exclude_line",
      "pattern": [
        "code_hash"
      ]
    }
  ],
  "results": {
    "docs/docs/_partial-integrate-chat.mdx": [
      {
        "type": "Secret Keyword",
        "filename": "docs/docs/_partial-integrate-chat.mdx",
        "hashed_secret": "e42fd8b9ad15d8fa5f4718cad7cf19b522807996",
        "is_verified": false,
        "line_number": 30
      }
    ],
    "src/main.py": [
      {
        "type": "Base64 High Entropy String",
        "filename": "src/main.py",
        "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
        "is_verified": false,
        "line_number": 404
      }
    ],
    "src/models/processors.py": [
      {
        "type": "Base64 High Entropy String",
        "filename": "src/models/processors.py",
        "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
        "is_verified": false,
        "line_number": 763
      }
    ],
    "src/services/langflow_file_service.py": [
      {
        "type": "Base64 High Entropy String",
        "filename": "src/services/langflow_file_service.py",
        "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
        "is_verified": false,
        "line_number": 97
      }
    ]
  },
  "generated_at": "2025-12-09T20:33:13Z"
 }
--- a/flows/openrag_agent.json
+++ b/flows/openrag_agent.json
@ -4787,7 +4787,7 @@
  "is_component": false,
  "locked": true,
  "last_tested_version": "1.7.0.dev21",
-  "name": "OpenRAG OpenSearch Agent",
+  "name": "OpenRAG OpenSearch Agent Flow",
  "tags": [
    "assistants",
    "agents"
--- a/flows/openrag_nudges.json
+++ b/flows/openrag_nudges.json
@ -4114,7 +4114,7 @@
  "is_component": false,
  "locked": true,
  "last_tested_version": "1.7.0.dev21",
-  "name": "OpenRAG OpenSearch Nudges",
+  "name": "OpenRAG OpenSearch Nudges Flow",
  "tags": [
    "assistants",
    "agents"
--- a/frontend/components/layout-wrapper.tsx
+++ b/frontend/components/layout-wrapper.tsx
@ -15,6 +15,7 @@ import {
 } from "@/components/provider-health-banner";
 import { TaskNotificationMenu } from "@/components/task-notification-menu";
 import { useAuth } from "@/contexts/auth-context";
 import { useChat } from "@/contexts/chat-context";
 import { useKnowledgeFilter } from "@/contexts/knowledge-filter-context";
 import { useTask } from "@/contexts/task-context";
 import { cn } from "@/lib/utils";
@ -27,6 +28,7 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) {
  const { isMenuOpen } = useTask();
  const { isPanelOpen } = useKnowledgeFilter();
  const { isLoading, isAuthenticated, isNoAuthMode } = useAuth();
  const { isOnboardingComplete } = useChat();
  // List of paths that should not show navigation
  const authPaths = ["/login", "/auth/callback"];
@ -91,17 +93,17 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) {
            isOpen={isDoclingUnhealthy}
            className="w-full"
          >
-            <DoclingHealthBanner />
+          <DoclingHealthBanner />
        </AnimatedConditional>
        {settings?.edited && isOnboardingComplete && (
          <AnimatedConditional
            vertical
            isOpen={isProviderUnhealthy}
            className="w-full"
          >
            <ProviderHealthBanner />
          </AnimatedConditional>
-          {settings?.edited && (
+        )}
            <AnimatedConditional
              vertical
              isOpen={isProviderUnhealthy}
              className="w-full"
            >
              <ProviderHealthBanner />
            </AnimatedConditional>
          )}
        </div>
        <ChatRenderer settings={settings}>{children}</ChatRenderer>
--- a/src/config/settings.py
+++ b/src/config/settings.py
@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
    "DISABLE_INGEST_WITH_LANGFLOW", "false"
 ).lower() in ("true", "1", "yes")
 # Langflow HTTP timeout configuration (in seconds)
 # For large documents (300+ pages), ingestion can take 30+ minutes
 # Defaults: 40-minute read timeout; 30-second connect, write, and pool timeouts
 LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400"))  # 40 minutes
 LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30"))  # 30 seconds
 def is_no_auth_mode():
    """Check if we're running in no-auth mode (OAuth credentials missing)"""
@ -317,9 +323,22 @@ class AppClients:
        # Initialize document converter
        self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
-        # Initialize Langflow HTTP client
+        # Initialize Langflow HTTP client with extended timeouts for large documents
        # Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
        self.langflow_http_client = httpx.AsyncClient(
-            base_url=LANGFLOW_URL, timeout=1200.0
+            base_url=LANGFLOW_URL,
            timeout=httpx.Timeout(
                timeout=LANGFLOW_TIMEOUT,  # Total timeout
                connect=LANGFLOW_CONNECT_TIMEOUT,  # Connection timeout
                read=LANGFLOW_TIMEOUT,  # Read timeout (most important for large PDFs)
                write=LANGFLOW_CONNECT_TIMEOUT,  # Write timeout
                pool=LANGFLOW_CONNECT_TIMEOUT,  # Pool timeout
            )
        )
        logger.info(
            "Initialized Langflow HTTP client with extended timeouts",
            timeout_seconds=LANGFLOW_TIMEOUT,
            connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
        )
        return self
--- a/src/connectors/onedrive/connector.py
+++ b/src/connectors/onedrive/connector.py
@ -15,7 +15,7 @@ class OneDriveConnector(BaseConnector):
    # Required BaseConnector class attributes
    CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
-    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret
    # Connector metadata
    CONNECTOR_NAME = "OneDrive"
--- a/src/connectors/sharepoint/connector.py
+++ b/src/connectors/sharepoint/connector.py
@ -16,7 +16,7 @@ class SharePointConnector(BaseConnector):
    # Required BaseConnector class attributes
    CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
-    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret
    # Connector metadata
    CONNECTOR_NAME = "SharePoint"
--- a/src/models/processors.py
+++ b/src/models/processors.py
@ -197,10 +197,27 @@ class TaskProcessor:
            file_hash=file_hash,
        )
-        # Convert and extract
+        # Check if this is a .txt file - use simple processing instead of docling
-        result = clients.converter.convert(file_path)
+        import os
-        full_doc = result.document.export_to_dict()
+        file_ext = os.path.splitext(file_path)[1].lower()
-        slim_doc = extract_relevant(full_doc)
+        
        if file_ext == '.txt':
            # Simple text file processing without docling
            from utils.document_processing import process_text_file
            logger.info(
                "Processing as plain text file (bypassing docling)",
                file_path=file_path,
                file_hash=file_hash,
            )
            slim_doc = process_text_file(file_path)
            # Override filename with original_filename if provided
            if original_filename:
                slim_doc["filename"] = original_filename
        else:
            # Convert and extract using docling for other file types
            result = clients.converter.convert(file_path)
            full_doc = result.document.export_to_dict()
            slim_doc = extract_relevant(full_doc)
        texts = [c["text"] for c in slim_doc["chunks"]]
--- a/src/services/document_service.py
+++ b/src/services/document_service.py
@ -181,6 +181,7 @@ class DocumentService:
    async def process_upload_context(self, upload_file, filename: str = None):
        """Process uploaded file and return content for context"""
        import io
        import os
        if not filename:
            filename = upload_file.filename or "uploaded_document"
@ -194,22 +195,37 @@ class DocumentService:
            content.write(chunk)
        content.seek(0)  # Reset to beginning for reading
-        # Create DocumentStream and process with docling
+        # Check if this is a .txt file - use simple processing
-        doc_stream = DocumentStream(name=filename, stream=content)
+        file_ext = os.path.splitext(filename)[1].lower()
-        result = clients.converter.convert(doc_stream)
+        
-        full_doc = result.document.export_to_dict()
+        if file_ext == '.txt':
-        slim_doc = extract_relevant(full_doc)
+            # Simple text file processing for chat context
            text_content = content.read().decode('utf-8', errors='replace')
            # For context, we don't need to chunk - just return the full content
            return {
                "filename": filename,
                "content": text_content,
                "pages": 1,  # Text files don't have pages
                "content_length": len(text_content),
            }
        else:
            # Create DocumentStream and process with docling
            doc_stream = DocumentStream(name=filename, stream=content)
            result = clients.converter.convert(doc_stream)
            full_doc = result.document.export_to_dict()
            slim_doc = extract_relevant(full_doc)
-        # Extract all text content
+            # Extract all text content
-        all_text = []
+            all_text = []
-        for chunk in slim_doc["chunks"]:
+            for chunk in slim_doc["chunks"]:
-            all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
+                all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
-        full_content = "\n\n".join(all_text)
+            full_content = "\n\n".join(all_text)
-        return {
+            return {
-            "filename": filename,
+                "filename": filename,
-            "content": full_content,
+                "content": full_content,
-            "pages": len(slim_doc["chunks"]),
+                "pages": len(slim_doc["chunks"]),
-            "content_length": len(full_content),
+                "content_length": len(full_content),
-        }
+            }
--- a/src/tui/managers/env_manager.py
+++ b/src/tui/managers/env_manager.py
@ -123,28 +123,29 @@ class EnvManager:
        import os
        # Map env vars to config attributes
-        attr_map = {
+        # These are environment variable names, not actual secrets
-            "OPENAI_API_KEY": "openai_api_key",
+        attr_map = {  # pragma: allowlist secret
-            "ANTHROPIC_API_KEY": "anthropic_api_key",
+            "OPENAI_API_KEY": "openai_api_key",  # pragma: allowlist secret
            "ANTHROPIC_API_KEY": "anthropic_api_key",  # pragma: allowlist secret
            "OLLAMA_ENDPOINT": "ollama_endpoint",
-            "WATSONX_API_KEY": "watsonx_api_key",
+            "WATSONX_API_KEY": "watsonx_api_key",  # pragma: allowlist secret
            "WATSONX_ENDPOINT": "watsonx_endpoint",
            "WATSONX_PROJECT_ID": "watsonx_project_id",
-            "OPENSEARCH_PASSWORD": "opensearch_password",
+            "OPENSEARCH_PASSWORD": "opensearch_password",  # pragma: allowlist secret
-            "LANGFLOW_SECRET_KEY": "langflow_secret_key",
+            "LANGFLOW_SECRET_KEY": "langflow_secret_key",  # pragma: allowlist secret
            "LANGFLOW_SUPERUSER": "langflow_superuser",
-            "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",
+            "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",  # pragma: allowlist secret
            "LANGFLOW_CHAT_FLOW_ID": "langflow_chat_flow_id",
            "LANGFLOW_INGEST_FLOW_ID": "langflow_ingest_flow_id",
            "LANGFLOW_URL_INGEST_FLOW_ID": "langflow_url_ingest_flow_id",
            "NUDGES_FLOW_ID": "nudges_flow_id",
            "GOOGLE_OAUTH_CLIENT_ID": "google_oauth_client_id",
-            "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",
+            "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",  # pragma: allowlist secret
            "MICROSOFT_GRAPH_OAUTH_CLIENT_ID": "microsoft_graph_oauth_client_id",
-            "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",
+            "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",  # pragma: allowlist secret
            "WEBHOOK_BASE_URL": "webhook_base_url",
            "AWS_ACCESS_KEY_ID": "aws_access_key_id",
-            "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
+            "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",  # pragma: allowlist secret
            "LANGFLOW_PUBLIC_URL": "langflow_public_url",
            "OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths",
            "OPENSEARCH_DATA_PATH": "opensearch_data_path",
--- a/src/utils/document_processing.py
+++ b/src/utils/document_processing.py
@ -119,6 +119,82 @@ def get_worker_converter():
    return _worker_converter
 def _split_long_text(text: str, limit: int) -> list:
    """
    Break text into pieces of at most `limit` characters.
    Each cut is made at the last space, tab or newline that keeps the piece
    within the limit; when no such whitespace exists the text is cut at
    exactly `limit` characters.
    Args:
        text: Text to split
        limit: Maximum length of each piece (must be positive)
    Returns:
        list of non-empty pieces, in their original order
    """
    pieces = []
    remaining = text
    while len(remaining) > limit:
        # Look one character past the limit so whitespace sitting right at
        # the boundary still yields a full-length piece
        window = remaining[:limit + 1]
        cut = max(window.rfind(sep) for sep in (" ", "\t", "\n"))
        if cut <= 0:
            # No usable whitespace in the window: fall back to a hard cut
            cut = limit
        piece = remaining[:cut].rstrip()
        if piece:
            pieces.append(piece)
        remaining = remaining[cut:].lstrip()
    if remaining:
        pieces.append(remaining)
    return pieces
 def process_text_file(file_path: str, *, chunk_size: int = 1000) -> dict:
    """
    Process a plain text file without using docling.
    The file is read as UTF-8 (undecodable bytes are replaced), split on
    blank lines into paragraphs, and the paragraphs are packed into chunks
    of at most `chunk_size` characters. A paragraph that is itself longer
    than `chunk_size` is split further at whitespace so that no chunk
    exceeds the limit.
    The result is shaped like the output of extract_relevant() so callers
    can handle both the same way.
    Args:
        file_path: Path to the .txt file
        chunk_size: Maximum number of characters per chunk (default 1000)
    Returns:
        dict with keys: id, filename, mimetype, chunks. Each chunk is a dict
        with "page" (1-based chunk number), "type" ("text") and "text". An
        empty file yields a single chunk whose text is "".
    Raises:
        ValueError: If chunk_size is not positive
    """
    import os
    from utils.hash_utils import hash_id
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Read the file
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
    # Compute hash
    file_hash = hash_id(file_path)
    filename = os.path.basename(file_path)
    chunks = []
    current_chunk = ""
    # Split by paragraphs first (double newline)
    for para in content.split('\n\n'):
        para = para.strip()
        if not para:
            continue
        # A paragraph that cannot fit in any chunk is broken up first;
        # otherwise it would be emitted as one arbitrarily large chunk
        if len(para) > chunk_size:
            pieces = _split_long_text(para, chunk_size)
        else:
            pieces = [para]
        for piece in pieces:
            # The +2 accounts for the "\n\n" separator added when joining
            if current_chunk and len(current_chunk) + len(piece) + 2 > chunk_size:
                chunks.append({
                    "page": len(chunks) + 1,  # Use the chunk number as "page"
                    "type": "text",
                    "text": current_chunk
                })
                current_chunk = piece
            elif current_chunk:
                current_chunk += "\n\n" + piece
            else:
                current_chunk = piece
    # Add the last chunk if any
    if current_chunk:
        chunks.append({
            "page": len(chunks) + 1,
            "type": "text",
            "text": current_chunk
        })
    # If no chunks were created (empty file), create a single empty chunk
    if not chunks:
        chunks.append({
            "page": 1,
            "type": "text",
            "text": ""
        })
    return {
        "id": file_hash,
        "filename": filename,
        "mimetype": "text/plain",
        "chunks": chunks,
    }
 def extract_relevant(doc_dict: dict) -> dict:
    """
    Given the full export_to_dict() result: