diff --git a/.env.example b/.env.example
index b4b1b88b..081c9026 100644
--- a/.env.example
+++ b/.env.example
@@ -2,6 +2,14 @@
# Set to true to disable Langflow ingestion and use traditional OpenRAG processor
# If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
DISABLE_INGEST_WITH_LANGFLOW=false
+
+# Langflow HTTP timeout configuration (in seconds)
+# For large documents (300+ pages), ingestion can take 30+ minutes
+# Increase these values if you experience timeouts with very large PDFs
+# Default: 2400 seconds (40 minutes) total timeout, 30 seconds connection timeout
+# LANGFLOW_TIMEOUT=2400
+# LANGFLOW_CONNECT_TIMEOUT=30
+
# make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
LANGFLOW_SECRET_KEY=
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..3b871ae9
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,11 @@
+version: 2
+
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
+    commit-message:
+      prefix: "build(deps):"
+      include: scope
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..b6c7a6fc
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/Yelp/detect-secrets
+    rev: v1.5.0
+    hooks:
+      - id: detect-secrets
+        args: ["--baseline", ".secrets.baseline", "--exclude-lines", "code_hash"]
+
diff --git a/.secrets.baseline b/.secrets.baseline
new file mode 100644
index 00000000..28837d45
--- /dev/null
+++ b/.secrets.baseline
@@ -0,0 +1,180 @@
+{
+ "version": "1.5.0",
+ "plugins_used": [
+ {
+ "name": "ArtifactoryDetector"
+ },
+ {
+ "name": "AWSKeyDetector"
+ },
+ {
+ "name": "AzureStorageKeyDetector"
+ },
+ {
+ "name": "Base64HighEntropyString",
+ "limit": 4.5
+ },
+ {
+ "name": "BasicAuthDetector"
+ },
+ {
+ "name": "CloudantDetector"
+ },
+ {
+ "name": "DiscordBotTokenDetector"
+ },
+ {
+ "name": "GitHubTokenDetector"
+ },
+ {
+ "name": "GitLabTokenDetector"
+ },
+ {
+ "name": "HexHighEntropyString",
+ "limit": 3.0
+ },
+ {
+ "name": "IbmCloudIamDetector"
+ },
+ {
+ "name": "IbmCosHmacDetector"
+ },
+ {
+ "name": "IPPublicDetector"
+ },
+ {
+ "name": "JwtTokenDetector"
+ },
+ {
+ "name": "KeywordDetector",
+ "keyword_exclude": ""
+ },
+ {
+ "name": "MailchimpDetector"
+ },
+ {
+ "name": "NpmDetector"
+ },
+ {
+ "name": "OpenAIDetector"
+ },
+ {
+ "name": "PrivateKeyDetector"
+ },
+ {
+ "name": "PypiTokenDetector"
+ },
+ {
+ "name": "SendGridDetector"
+ },
+ {
+ "name": "SlackDetector"
+ },
+ {
+ "name": "SoftlayerDetector"
+ },
+ {
+ "name": "SquareOAuthDetector"
+ },
+ {
+ "name": "StripeDetector"
+ },
+ {
+ "name": "TelegramBotTokenDetector"
+ },
+ {
+ "name": "TwilioKeyDetector"
+ }
+ ],
+ "filters_used": [
+ {
+ "path": "detect_secrets.filters.allowlist.is_line_allowlisted"
+ },
+ {
+ "path": "detect_secrets.filters.common.is_baseline_file",
+ "filename": ".secrets.baseline"
+ },
+ {
+ "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
+ "min_level": 2
+ },
+ {
+ "path": "detect_secrets.filters.heuristic.is_indirect_reference"
+ },
+ {
+ "path": "detect_secrets.filters.heuristic.is_likely_id_string"
+ },
+ {
+ "path": "detect_secrets.filters.heuristic.is_lock_file"
+ },
+ {
+ "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
+ },
+ {
+ "path": "detect_secrets.filters.heuristic.is_potential_uuid"
+ },
+ {
+ "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
+ },
+ {
+ "path": "detect_secrets.filters.heuristic.is_sequential_string"
+ },
+ {
+ "path": "detect_secrets.filters.heuristic.is_swagger_file"
+ },
+ {
+ "path": "detect_secrets.filters.heuristic.is_templated_secret"
+ },
+ {
+ "path": "detect_secrets.filters.regex.should_exclude_file",
+ "pattern": [
+ "flows/.*\\.json$"
+ ]
+ },
+ {
+ "path": "detect_secrets.filters.regex.should_exclude_line",
+ "pattern": [
+ "code_hash"
+ ]
+ }
+ ],
+ "results": {
+ "docs/docs/_partial-integrate-chat.mdx": [
+ {
+ "type": "Secret Keyword",
+ "filename": "docs/docs/_partial-integrate-chat.mdx",
+ "hashed_secret": "e42fd8b9ad15d8fa5f4718cad7cf19b522807996",
+ "is_verified": false,
+ "line_number": 30
+ }
+ ],
+ "src/main.py": [
+ {
+ "type": "Base64 High Entropy String",
+ "filename": "src/main.py",
+ "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
+ "is_verified": false,
+ "line_number": 404
+ }
+ ],
+ "src/models/processors.py": [
+ {
+ "type": "Base64 High Entropy String",
+ "filename": "src/models/processors.py",
+ "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
+ "is_verified": false,
+ "line_number": 763
+ }
+ ],
+ "src/services/langflow_file_service.py": [
+ {
+ "type": "Base64 High Entropy String",
+ "filename": "src/services/langflow_file_service.py",
+ "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
+ "is_verified": false,
+ "line_number": 97
+ }
+ ]
+ },
+ "generated_at": "2025-12-09T20:33:13Z"
+}
diff --git a/flows/openrag_agent.json b/flows/openrag_agent.json
index d9475aac..bb1adc71 100644
--- a/flows/openrag_agent.json
+++ b/flows/openrag_agent.json
@@ -4787,7 +4787,7 @@
"is_component": false,
"locked": true,
"last_tested_version": "1.7.0.dev21",
- "name": "OpenRAG OpenSearch Agent",
+ "name": "OpenRAG OpenSearch Agent Flow",
"tags": [
"assistants",
"agents"
diff --git a/flows/openrag_nudges.json b/flows/openrag_nudges.json
index d9d79e60..475833f9 100644
--- a/flows/openrag_nudges.json
+++ b/flows/openrag_nudges.json
@@ -4114,7 +4114,7 @@
"is_component": false,
"locked": true,
"last_tested_version": "1.7.0.dev21",
- "name": "OpenRAG OpenSearch Nudges",
+ "name": "OpenRAG OpenSearch Nudges Flow",
"tags": [
"assistants",
"agents"
diff --git a/frontend/components/layout-wrapper.tsx b/frontend/components/layout-wrapper.tsx
index 08eea73d..dbaf42da 100644
--- a/frontend/components/layout-wrapper.tsx
+++ b/frontend/components/layout-wrapper.tsx
@@ -15,6 +15,7 @@ import {
} from "@/components/provider-health-banner";
import { TaskNotificationMenu } from "@/components/task-notification-menu";
import { useAuth } from "@/contexts/auth-context";
+import { useChat } from "@/contexts/chat-context";
import { useKnowledgeFilter } from "@/contexts/knowledge-filter-context";
import { useTask } from "@/contexts/task-context";
import { cn } from "@/lib/utils";
@@ -27,6 +28,7 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) {
const { isMenuOpen } = useTask();
const { isPanelOpen } = useKnowledgeFilter();
const { isLoading, isAuthenticated, isNoAuthMode } = useAuth();
+ const { isOnboardingComplete } = useChat();
// List of paths that should not show navigation
const authPaths = ["/login", "/auth/callback"];
@@ -91,17 +93,17 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) {
isOpen={isDoclingUnhealthy}
className="w-full"
>
-
+
+
+ {settings?.edited && isOnboardingComplete && (
+
+
- {settings?.edited && (
-
-
-
- )}
+ )}
{children}
diff --git a/src/config/settings.py b/src/config/settings.py
index b590ab8b..f3e334b4 100644
--- a/src/config/settings.py
+++ b/src/config/settings.py
@@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
"DISABLE_INGEST_WITH_LANGFLOW", "false"
).lower() in ("true", "1", "yes")
+# Langflow HTTP timeout configuration (in seconds)
+# For large documents (300+ pages), ingestion can take 30+ minutes
+# Defaults: 2400 seconds (40 minutes) total/read timeout, 30 seconds connect timeout
+LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400")) # 40 minutes
+LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30")) # 30 seconds
+
def is_no_auth_mode():
"""Check if we're running in no-auth mode (OAuth credentials missing)"""
@@ -317,9 +323,22 @@ class AppClients:
# Initialize document converter
self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
- # Initialize Langflow HTTP client
+ # Initialize Langflow HTTP client with extended timeouts for large documents
+ # Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
self.langflow_http_client = httpx.AsyncClient(
- base_url=LANGFLOW_URL, timeout=1200.0
+ base_url=LANGFLOW_URL,
+ timeout=httpx.Timeout(
+ timeout=LANGFLOW_TIMEOUT, # Total timeout
+ connect=LANGFLOW_CONNECT_TIMEOUT, # Connection timeout
+ read=LANGFLOW_TIMEOUT, # Read timeout (most important for large PDFs)
+ write=LANGFLOW_CONNECT_TIMEOUT, # Write timeout
+ pool=LANGFLOW_CONNECT_TIMEOUT, # Pool timeout
+ )
+ )
+ logger.info(
+ "Initialized Langflow HTTP client with extended timeouts",
+ timeout_seconds=LANGFLOW_TIMEOUT,
+ connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
)
return self
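For reference, a minimal sketch (not part of this patch) of how the granular httpx.Timeout above behaves: connect, write, and pool problems fail fast, the long read timeout covers slow ingestion responses, and a single request can still override the client-wide default. The base URL and /health path are placeholders.

```python
# Minimal sketch, not part of the patch: granular httpx timeouts plus a
# per-request override. Base URL and endpoint path are placeholders.
import asyncio

import httpx

LANGFLOW_TIMEOUT = 2400.0        # matches the new default above (40 minutes)
LANGFLOW_CONNECT_TIMEOUT = 30.0  # fail fast on connect/write/pool issues

async def main() -> None:
    timeout = httpx.Timeout(
        timeout=LANGFLOW_TIMEOUT,          # fallback for any phase not set explicitly
        connect=LANGFLOW_CONNECT_TIMEOUT,
        read=LANGFLOW_TIMEOUT,             # long reads cover slow ingestion responses
        write=LANGFLOW_CONNECT_TIMEOUT,
        pool=LANGFLOW_CONNECT_TIMEOUT,
    )
    async with httpx.AsyncClient(base_url="http://localhost:7860", timeout=timeout) as client:
        # A single call can still tighten the timeout, e.g. for a quick health probe.
        resp = await client.get("/health", timeout=httpx.Timeout(5.0))
        resp.raise_for_status()

asyncio.run(main())
```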
diff --git a/src/connectors/onedrive/connector.py b/src/connectors/onedrive/connector.py
index a88321d3..796e4310 100644
--- a/src/connectors/onedrive/connector.py
+++ b/src/connectors/onedrive/connector.py
@@ -15,7 +15,7 @@ class OneDriveConnector(BaseConnector):
# Required BaseConnector class attributes
CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
- CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+ CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET" # pragma: allowlist secret
# Connector metadata
CONNECTOR_NAME = "OneDrive"
diff --git a/src/connectors/sharepoint/connector.py b/src/connectors/sharepoint/connector.py
index f84d3575..df6dc102 100644
--- a/src/connectors/sharepoint/connector.py
+++ b/src/connectors/sharepoint/connector.py
@@ -16,7 +16,7 @@ class SharePointConnector(BaseConnector):
# Required BaseConnector class attributes
CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
- CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+ CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET" # pragma: allowlist secret
# Connector metadata
CONNECTOR_NAME = "SharePoint"
diff --git a/src/models/processors.py b/src/models/processors.py
index 9731adb7..8f84c3dc 100644
--- a/src/models/processors.py
+++ b/src/models/processors.py
@@ -197,10 +197,27 @@ class TaskProcessor:
file_hash=file_hash,
)
- # Convert and extract
- result = clients.converter.convert(file_path)
- full_doc = result.document.export_to_dict()
- slim_doc = extract_relevant(full_doc)
+ # Check if this is a .txt file - use simple processing instead of docling
+ import os
+ file_ext = os.path.splitext(file_path)[1].lower()
+
+ if file_ext == '.txt':
+ # Simple text file processing without docling
+ from utils.document_processing import process_text_file
+ logger.info(
+ "Processing as plain text file (bypassing docling)",
+ file_path=file_path,
+ file_hash=file_hash,
+ )
+ slim_doc = process_text_file(file_path)
+ # Override filename with original_filename if provided
+ if original_filename:
+ slim_doc["filename"] = original_filename
+ else:
+ # Convert and extract using docling for other file types
+ result = clients.converter.convert(file_path)
+ full_doc = result.document.export_to_dict()
+ slim_doc = extract_relevant(full_doc)
texts = [c["text"] for c in slim_doc["chunks"]]
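The new branch above keys off the file extension: .txt files skip docling and go through process_text_file, while everything else keeps the existing convert-and-extract path. A rough sketch of that dispatch factored into a helper, for illustration only (the helper name is hypothetical; the patch keeps the branch inline):

```python
# Rough sketch of the dispatch added above, factored into a helper for
# illustration only; `slim_doc_for_path` is a hypothetical name.
import os

from utils.document_processing import extract_relevant, process_text_file

def slim_doc_for_path(clients, file_path: str, original_filename: str | None = None) -> dict:
    """Return the slim document dict used downstream, regardless of input type."""
    if os.path.splitext(file_path)[1].lower() == ".txt":
        # Plain text bypasses docling entirely.
        slim_doc = process_text_file(file_path)
        if original_filename:
            slim_doc["filename"] = original_filename
        return slim_doc
    # Everything else goes through the docling converter as before.
    result = clients.converter.convert(file_path)
    return extract_relevant(result.document.export_to_dict())
```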
diff --git a/src/services/document_service.py b/src/services/document_service.py
index de1b3cf6..f40c3d82 100644
--- a/src/services/document_service.py
+++ b/src/services/document_service.py
@@ -181,6 +181,7 @@ class DocumentService:
async def process_upload_context(self, upload_file, filename: str = None):
"""Process uploaded file and return content for context"""
import io
+ import os
if not filename:
filename = upload_file.filename or "uploaded_document"
@@ -194,22 +195,37 @@ class DocumentService:
content.write(chunk)
content.seek(0) # Reset to beginning for reading
- # Create DocumentStream and process with docling
- doc_stream = DocumentStream(name=filename, stream=content)
- result = clients.converter.convert(doc_stream)
- full_doc = result.document.export_to_dict()
- slim_doc = extract_relevant(full_doc)
+ # Check if this is a .txt file - use simple processing
+ file_ext = os.path.splitext(filename)[1].lower()
+
+ if file_ext == '.txt':
+ # Simple text file processing for chat context
+ text_content = content.read().decode('utf-8', errors='replace')
+
+ # For context, we don't need to chunk - just return the full content
+ return {
+ "filename": filename,
+ "content": text_content,
+ "pages": 1, # Text files don't have pages
+ "content_length": len(text_content),
+ }
+ else:
+ # Create DocumentStream and process with docling
+ doc_stream = DocumentStream(name=filename, stream=content)
+ result = clients.converter.convert(doc_stream)
+ full_doc = result.document.export_to_dict()
+ slim_doc = extract_relevant(full_doc)
- # Extract all text content
- all_text = []
- for chunk in slim_doc["chunks"]:
- all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
+ # Extract all text content
+ all_text = []
+ for chunk in slim_doc["chunks"]:
+ all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
- full_content = "\n\n".join(all_text)
+ full_content = "\n\n".join(all_text)
- return {
- "filename": filename,
- "content": full_content,
- "pages": len(slim_doc["chunks"]),
- "content_length": len(full_content),
- }
+ return {
+ "filename": filename,
+ "content": full_content,
+ "pages": len(slim_doc["chunks"]),
+ "content_length": len(full_content),
+ }
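Both branches now return the same four keys; only the meaning of pages differs (always 1 for plain text, chunk count for docling output). A hypothetical TypedDict, included here purely to document that shared shape:

```python
# Hypothetical type for the payload returned by process_upload_context;
# not part of the patch, just documents the contract both branches satisfy.
from typing import TypedDict

class UploadContext(TypedDict):
    filename: str
    content: str          # full text handed to the chat as context
    pages: int            # 1 for .txt uploads, number of chunks for docling output
    content_length: int   # len(content)
```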
diff --git a/src/tui/managers/env_manager.py b/src/tui/managers/env_manager.py
index a3d3ea6f..51e2a11f 100644
--- a/src/tui/managers/env_manager.py
+++ b/src/tui/managers/env_manager.py
@@ -123,28 +123,29 @@ class EnvManager:
import os
# Map env vars to config attributes
- attr_map = {
- "OPENAI_API_KEY": "openai_api_key",
- "ANTHROPIC_API_KEY": "anthropic_api_key",
+ # These are environment variable names, not actual secrets
+ attr_map = { # pragma: allowlist secret
+ "OPENAI_API_KEY": "openai_api_key", # pragma: allowlist secret
+ "ANTHROPIC_API_KEY": "anthropic_api_key", # pragma: allowlist secret
"OLLAMA_ENDPOINT": "ollama_endpoint",
- "WATSONX_API_KEY": "watsonx_api_key",
+ "WATSONX_API_KEY": "watsonx_api_key", # pragma: allowlist secret
"WATSONX_ENDPOINT": "watsonx_endpoint",
"WATSONX_PROJECT_ID": "watsonx_project_id",
- "OPENSEARCH_PASSWORD": "opensearch_password",
- "LANGFLOW_SECRET_KEY": "langflow_secret_key",
+ "OPENSEARCH_PASSWORD": "opensearch_password", # pragma: allowlist secret
+ "LANGFLOW_SECRET_KEY": "langflow_secret_key", # pragma: allowlist secret
"LANGFLOW_SUPERUSER": "langflow_superuser",
- "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",
+ "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password", # pragma: allowlist secret
"LANGFLOW_CHAT_FLOW_ID": "langflow_chat_flow_id",
"LANGFLOW_INGEST_FLOW_ID": "langflow_ingest_flow_id",
"LANGFLOW_URL_INGEST_FLOW_ID": "langflow_url_ingest_flow_id",
"NUDGES_FLOW_ID": "nudges_flow_id",
"GOOGLE_OAUTH_CLIENT_ID": "google_oauth_client_id",
- "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",
+ "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret", # pragma: allowlist secret
"MICROSOFT_GRAPH_OAUTH_CLIENT_ID": "microsoft_graph_oauth_client_id",
- "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",
+ "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret", # pragma: allowlist secret
"WEBHOOK_BASE_URL": "webhook_base_url",
"AWS_ACCESS_KEY_ID": "aws_access_key_id",
- "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
+ "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key", # pragma: allowlist secret
"LANGFLOW_PUBLIC_URL": "langflow_public_url",
"OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths",
"OPENSEARCH_DATA_PATH": "opensearch_data_path",
diff --git a/src/utils/document_processing.py b/src/utils/document_processing.py
index fcb458fb..9619cf74 100644
--- a/src/utils/document_processing.py
+++ b/src/utils/document_processing.py
@@ -119,6 +119,82 @@ def get_worker_converter():
return _worker_converter
+def process_text_file(file_path: str) -> dict:
+    """
+    Process a plain text file without using docling.
+    Returns the same structure as extract_relevant() for consistency.
+
+    Args:
+        file_path: Path to the .txt file
+
+    Returns:
+        dict with keys: id, filename, mimetype, chunks
+    """
+    import os
+    from utils.hash_utils import hash_id
+
+    # Read the file
+    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+        content = f.read()
+
+    # Compute hash
+    file_hash = hash_id(file_path)
+    filename = os.path.basename(file_path)
+
+    # Group paragraphs into ~1000-character chunks (roughly docling-sized) to keep
+    # embeddings within token limits; paragraphs themselves are never split.
+    chunk_size = 1000
+    chunks = []
+
+    # Split by paragraphs first (double newline)
+    paragraphs = content.split('\n\n')
+    current_chunk = ""
+    chunk_index = 0
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # If adding this paragraph would exceed the chunk size, save the current chunk
+        if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
+            chunks.append({
+                "page": chunk_index + 1,  # use chunk_index + 1 as the "page" number
+                "type": "text",
+                "text": current_chunk.strip()
+            })
+            chunk_index += 1
+            current_chunk = para
+        else:
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+
+    # Add the last chunk if any
+    if current_chunk.strip():
+        chunks.append({
+            "page": chunk_index + 1,
+            "type": "text",
+            "text": current_chunk.strip()
+        })
+
+    # If no chunks were created (empty file), create a single empty chunk
+    if not chunks:
+        chunks.append({
+            "page": 1,
+            "type": "text",
+            "text": ""
+        })
+
+    return {
+        "id": file_hash,
+        "filename": filename,
+        "mimetype": "text/plain",
+        "chunks": chunks,
+    }
+
+
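A quick usage sketch for process_text_file, illustration only (the temporary file is just to keep the example self-contained):

```python
# Illustration only: expected output shape of process_text_file for a tiny input.
import tempfile

from utils.document_processing import process_text_file

with tempfile.NamedTemporaryFile(
    "w", suffix=".txt", delete=False, encoding="utf-8"
) as f:
    f.write("First paragraph.\n\nSecond paragraph.")
    path = f.name

doc = process_text_file(path)
# Both paragraphs fit well under chunk_size, so they land in a single chunk:
# {
#     "id": <value of hash_id(path)>,
#     "filename": <basename of the temp file>,
#     "mimetype": "text/plain",
#     "chunks": [
#         {"page": 1, "type": "text", "text": "First paragraph.\n\nSecond paragraph."}
#     ],
# }
```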
def extract_relevant(doc_dict: dict) -> dict:
"""
Given the full export_to_dict() result: