diff --git a/.env.example b/.env.example
index b4b1b88b..081c9026 100644
--- a/.env.example
+++ b/.env.example
@@ -2,6 +2,14 @@
 # Set to true to disable Langflow ingestion and use traditional OpenRAG processor
 # If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
 DISABLE_INGEST_WITH_LANGFLOW=false
+
+# Langflow HTTP timeout configuration (in seconds)
+# For large documents (300+ pages), ingestion can take 30+ minutes
+# Increase these values if you experience timeouts with very large PDFs
+# Default: 2400 seconds (40 minutes) total timeout, 30 seconds connection timeout
+# LANGFLOW_TIMEOUT=2400
+# LANGFLOW_CONNECT_TIMEOUT=30
+
 # make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
 LANGFLOW_SECRET_KEY=
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..3b871ae9
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,11 @@
+version: 2
+
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
+    commit-message:
+      prefix: "build(deps):"
+      include: scope
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..b6c7a6fc
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/Yelp/detect-secrets
+    rev: v1.5.0
+    hooks:
+      - id: detect-secrets
+        args: ["--baseline", ".secrets.baseline", "--exclude-lines", "code_hash"]
+
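A note on how the two new `.env` variables are consumed: the sketch below mirrors the parsing added to `src/config/settings.py` later in this diff. Values are plain seconds, and unset variables fall back to the documented defaults.

```python
import os

# Mirrors the pattern used in src/config/settings.py below: float seconds with defaults.
LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400"))  # 40 minutes total/read
LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30"))  # 30 seconds connect

print(f"total={LANGFLOW_TIMEOUT}s connect={LANGFLOW_CONNECT_TIMEOUT}s")
```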
"detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" + }, + { + "path": "detect_secrets.filters.heuristic.is_sequential_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_swagger_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_templated_secret" + }, + { + "path": "detect_secrets.filters.regex.should_exclude_file", + "pattern": [ + "flows/.*\\.json$" + ] + }, + { + "path": "detect_secrets.filters.regex.should_exclude_line", + "pattern": [ + "code_hash" + ] + } + ], + "results": { + "docs/docs/_partial-integrate-chat.mdx": [ + { + "type": "Secret Keyword", + "filename": "docs/docs/_partial-integrate-chat.mdx", + "hashed_secret": "e42fd8b9ad15d8fa5f4718cad7cf19b522807996", + "is_verified": false, + "line_number": 30 + } + ], + "src/main.py": [ + { + "type": "Base64 High Entropy String", + "filename": "src/main.py", + "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee", + "is_verified": false, + "line_number": 404 + } + ], + "src/models/processors.py": [ + { + "type": "Base64 High Entropy String", + "filename": "src/models/processors.py", + "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee", + "is_verified": false, + "line_number": 763 + } + ], + "src/services/langflow_file_service.py": [ + { + "type": "Base64 High Entropy String", + "filename": "src/services/langflow_file_service.py", + "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee", + "is_verified": false, + "line_number": 97 + } + ] + }, + "generated_at": "2025-12-09T20:33:13Z" +} diff --git a/flows/openrag_agent.json b/flows/openrag_agent.json index d9475aac..bb1adc71 100644 --- a/flows/openrag_agent.json +++ b/flows/openrag_agent.json @@ -4787,7 +4787,7 @@ "is_component": false, "locked": true, "last_tested_version": "1.7.0.dev21", - "name": "OpenRAG OpenSearch Agent", + "name": "OpenRAG OpenSearch Agent Flow", "tags": [ "assistants", "agents" diff --git a/flows/openrag_nudges.json b/flows/openrag_nudges.json index d9d79e60..475833f9 100644 --- a/flows/openrag_nudges.json +++ b/flows/openrag_nudges.json @@ -4114,7 +4114,7 @@ "is_component": false, "locked": true, "last_tested_version": "1.7.0.dev21", - "name": "OpenRAG OpenSearch Nudges", + "name": "OpenRAG OpenSearch Nudges Flow", "tags": [ "assistants", "agents" diff --git a/frontend/components/layout-wrapper.tsx b/frontend/components/layout-wrapper.tsx index 08eea73d..dbaf42da 100644 --- a/frontend/components/layout-wrapper.tsx +++ b/frontend/components/layout-wrapper.tsx @@ -15,6 +15,7 @@ import { } from "@/components/provider-health-banner"; import { TaskNotificationMenu } from "@/components/task-notification-menu"; import { useAuth } from "@/contexts/auth-context"; +import { useChat } from "@/contexts/chat-context"; import { useKnowledgeFilter } from "@/contexts/knowledge-filter-context"; import { useTask } from "@/contexts/task-context"; import { cn } from "@/lib/utils"; @@ -27,6 +28,7 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) { const { isMenuOpen } = useTask(); const { isPanelOpen } = useKnowledgeFilter(); const { isLoading, isAuthenticated, isNoAuthMode } = useAuth(); + const { isOnboardingComplete } = useChat(); // List of paths that should not show navigation const authPaths = ["/login", "/auth/callback"]; @@ -91,17 +93,17 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) { isOpen={isDoclingUnhealthy} className="w-full" > - + + + {settings?.edited && isOnboardingComplete && ( + + - {settings?.edited && ( - - - - )} + )} {children} diff --git 
diff --git a/src/config/settings.py b/src/config/settings.py
index b590ab8b..f3e334b4 100644
--- a/src/config/settings.py
+++ b/src/config/settings.py
@@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
     "DISABLE_INGEST_WITH_LANGFLOW", "false"
 ).lower() in ("true", "1", "yes")
 
+# Langflow HTTP timeout configuration (in seconds)
+# For large documents (300+ pages), ingestion can take 30+ minutes
+# Default: 40 minutes total, 40 minutes read timeout
+LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400"))  # 40 minutes
+LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30"))  # 30 seconds
+
 
 def is_no_auth_mode():
     """Check if we're running in no-auth mode (OAuth credentials missing)"""
@@ -317,9 +323,22 @@ class AppClients:
         # Initialize document converter
         self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
 
-        # Initialize Langflow HTTP client
+        # Initialize Langflow HTTP client with extended timeouts for large documents
+        # Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
         self.langflow_http_client = httpx.AsyncClient(
-            base_url=LANGFLOW_URL, timeout=1200.0
+            base_url=LANGFLOW_URL,
+            timeout=httpx.Timeout(
+                timeout=LANGFLOW_TIMEOUT,  # Total timeout
+                connect=LANGFLOW_CONNECT_TIMEOUT,  # Connection timeout
+                read=LANGFLOW_TIMEOUT,  # Read timeout (most important for large PDFs)
+                write=LANGFLOW_CONNECT_TIMEOUT,  # Write timeout
+                pool=LANGFLOW_CONNECT_TIMEOUT,  # Pool timeout
+            )
+        )
+        logger.info(
+            "Initialized Langflow HTTP client with extended timeouts",
+            timeout_seconds=LANGFLOW_TIMEOUT,
+            connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
         )
 
         return self
diff --git a/src/connectors/onedrive/connector.py b/src/connectors/onedrive/connector.py
index a88321d3..796e4310 100644
--- a/src/connectors/onedrive/connector.py
+++ b/src/connectors/onedrive/connector.py
@@ -15,7 +15,7 @@ class OneDriveConnector(BaseConnector):
 
     # Required BaseConnector class attributes
     CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
-    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret
 
     # Connector metadata
     CONNECTOR_NAME = "OneDrive"
diff --git a/src/connectors/sharepoint/connector.py b/src/connectors/sharepoint/connector.py
index f84d3575..df6dc102 100644
--- a/src/connectors/sharepoint/connector.py
+++ b/src/connectors/sharepoint/connector.py
@@ -16,7 +16,7 @@ class SharePointConnector(BaseConnector):
 
     # Required BaseConnector class attributes
     CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
-    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret
 
     # Connector metadata
     CONNECTOR_NAME = "SharePoint"
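For context on the `httpx.Timeout` change in `src/config/settings.py` above, here is a hedged usage sketch of a client configured the same way. The base URL, flow id, and payload are placeholders, not the project's actual ingest call; the point is that only a read stall longer than `LANGFLOW_TIMEOUT` now raises.

```python
import asyncio
import httpx

LANGFLOW_TIMEOUT = 2400.0
LANGFLOW_CONNECT_TIMEOUT = 30.0

async def main() -> None:
    # Same shape as the client built in AppClients above.
    timeout = httpx.Timeout(
        timeout=LANGFLOW_TIMEOUT,
        connect=LANGFLOW_CONNECT_TIMEOUT,
        read=LANGFLOW_TIMEOUT,  # long reads are what large-PDF ingestion needs
        write=LANGFLOW_CONNECT_TIMEOUT,
        pool=LANGFLOW_CONNECT_TIMEOUT,
    )
    async with httpx.AsyncClient(base_url="http://localhost:7860", timeout=timeout) as client:
        try:
            resp = await client.post("/api/v1/run/example-flow-id", json={"input_value": "..."})
            resp.raise_for_status()
        except httpx.ReadTimeout:
            # With the defaults above this only fires after ~40 minutes of silence.
            print("Langflow did not respond within LANGFLOW_TIMEOUT")

asyncio.run(main())
```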
diff --git a/src/models/processors.py b/src/models/processors.py
index 9731adb7..8f84c3dc 100644
--- a/src/models/processors.py
+++ b/src/models/processors.py
@@ -197,10 +197,27 @@ class TaskProcessor:
                 file_hash=file_hash,
             )
 
-            # Convert and extract
-            result = clients.converter.convert(file_path)
-            full_doc = result.document.export_to_dict()
-            slim_doc = extract_relevant(full_doc)
+            # Check if this is a .txt file - use simple processing instead of docling
+            import os
+            file_ext = os.path.splitext(file_path)[1].lower()
+
+            if file_ext == '.txt':
+                # Simple text file processing without docling
+                from utils.document_processing import process_text_file
+                logger.info(
+                    "Processing as plain text file (bypassing docling)",
+                    file_path=file_path,
+                    file_hash=file_hash,
+                )
+                slim_doc = process_text_file(file_path)
+                # Override filename with original_filename if provided
+                if original_filename:
+                    slim_doc["filename"] = original_filename
+            else:
+                # Convert and extract using docling for other file types
+                result = clients.converter.convert(file_path)
+                full_doc = result.document.export_to_dict()
+                slim_doc = extract_relevant(full_doc)
 
             texts = [c["text"] for c in slim_doc["chunks"]]
diff --git a/src/services/document_service.py b/src/services/document_service.py
index de1b3cf6..f40c3d82 100644
--- a/src/services/document_service.py
+++ b/src/services/document_service.py
@@ -181,6 +181,7 @@ class DocumentService:
     async def process_upload_context(self, upload_file, filename: str = None):
         """Process uploaded file and return content for context"""
         import io
+        import os
 
         if not filename:
             filename = upload_file.filename or "uploaded_document"
@@ -194,22 +195,37 @@
             content.write(chunk)
         content.seek(0)  # Reset to beginning for reading
 
-        # Create DocumentStream and process with docling
-        doc_stream = DocumentStream(name=filename, stream=content)
-        result = clients.converter.convert(doc_stream)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing
+        file_ext = os.path.splitext(filename)[1].lower()
+
+        if file_ext == '.txt':
+            # Simple text file processing for chat context
+            text_content = content.read().decode('utf-8', errors='replace')
+
+            # For context, we don't need to chunk - just return the full content
+            return {
+                "filename": filename,
+                "content": text_content,
+                "pages": 1,  # Text files don't have pages
+                "content_length": len(text_content),
+            }
+        else:
+            # Create DocumentStream and process with docling
+            doc_stream = DocumentStream(name=filename, stream=content)
+            result = clients.converter.convert(doc_stream)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)
 
-        # Extract all text content
-        all_text = []
-        for chunk in slim_doc["chunks"]:
-            all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
+            # Extract all text content
+            all_text = []
+            for chunk in slim_doc["chunks"]:
+                all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
 
-        full_content = "\n\n".join(all_text)
+            full_content = "\n\n".join(all_text)
 
-        return {
-            "filename": filename,
-            "content": full_content,
-            "pages": len(slim_doc["chunks"]),
-            "content_length": len(full_content),
-        }
+            return {
+                "filename": filename,
+                "content": full_content,
+                "pages": len(slim_doc["chunks"]),
+                "content_length": len(full_content),
+            }
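The two hunks above route `.txt` uploads around docling and into the new `process_text_file()` helper (defined in `src/utils/document_processing.py` below). A small usage sketch, assuming the `src` directory is on `PYTHONPATH`:

```python
import tempfile

from utils.document_processing import process_text_file  # added in this diff

# Write a throwaway .txt file with two short paragraphs and one oversized one.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write("First paragraph.\n\nSecond paragraph.\n\n" + "x" * 1500)
    path = f.name

slim_doc = process_text_file(path)
print(slim_doc["mimetype"])     # text/plain
print(len(slim_doc["chunks"]))  # 2: the short paragraphs pack together, the long one stands alone
print(slim_doc["chunks"][0]["page"], slim_doc["chunks"][0]["type"])  # 1 text
```

Note that a single paragraph longer than the 1000-character target is kept whole rather than split, so chunk sizes are approximate.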
"langflow_secret_key", + "OPENSEARCH_PASSWORD": "opensearch_password", # pragma: allowlist secret + "LANGFLOW_SECRET_KEY": "langflow_secret_key", # pragma: allowlist secret "LANGFLOW_SUPERUSER": "langflow_superuser", - "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password", + "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password", # pragma: allowlist secret "LANGFLOW_CHAT_FLOW_ID": "langflow_chat_flow_id", "LANGFLOW_INGEST_FLOW_ID": "langflow_ingest_flow_id", "LANGFLOW_URL_INGEST_FLOW_ID": "langflow_url_ingest_flow_id", "NUDGES_FLOW_ID": "nudges_flow_id", "GOOGLE_OAUTH_CLIENT_ID": "google_oauth_client_id", - "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret", + "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret", # pragma: allowlist secret "MICROSOFT_GRAPH_OAUTH_CLIENT_ID": "microsoft_graph_oauth_client_id", - "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret", + "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret", # pragma: allowlist secret "WEBHOOK_BASE_URL": "webhook_base_url", "AWS_ACCESS_KEY_ID": "aws_access_key_id", - "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key", + "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key", # pragma: allowlist secret "LANGFLOW_PUBLIC_URL": "langflow_public_url", "OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths", "OPENSEARCH_DATA_PATH": "opensearch_data_path", diff --git a/src/utils/document_processing.py b/src/utils/document_processing.py index fcb458fb..9619cf74 100644 --- a/src/utils/document_processing.py +++ b/src/utils/document_processing.py @@ -119,6 +119,82 @@ def get_worker_converter(): return _worker_converter +def process_text_file(file_path: str) -> dict: + """ + Process a plain text file without using docling. + Returns the same structure as extract_relevant() for consistency. + + Args: + file_path: Path to the .txt file + + Returns: + dict with keys: id, filename, mimetype, chunks + """ + import os + from utils.hash_utils import hash_id + + # Read the file + with open(file_path, 'r', encoding='utf-8', errors='replace') as f: + content = f.read() + + # Compute hash + file_hash = hash_id(file_path) + filename = os.path.basename(file_path) + + # Split content into chunks of ~1000 characters to match typical docling chunk sizes + # This ensures embeddings stay within reasonable token limits + chunk_size = 1000 + chunks = [] + + # Split by paragraphs first (double newline) + paragraphs = content.split('\n\n') + current_chunk = "" + chunk_index = 0 + + for para in paragraphs: + para = para.strip() + if not para: + continue + + # If adding this paragraph would exceed chunk size, save current chunk + if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk: + chunks.append({ + "page": chunk_index + 1, # Use chunk_index + 1 as "page" number + "type": "text", + "text": current_chunk.strip() + }) + chunk_index += 1 + current_chunk = para + else: + if current_chunk: + current_chunk += "\n\n" + para + else: + current_chunk = para + + # Add the last chunk if any + if current_chunk.strip(): + chunks.append({ + "page": chunk_index + 1, + "type": "text", + "text": current_chunk.strip() + }) + + # If no chunks were created (empty file), create a single empty chunk + if not chunks: + chunks.append({ + "page": 1, + "type": "text", + "text": "" + }) + + return { + "id": file_hash, + "filename": filename, + "mimetype": "text/plain", + "chunks": chunks, + } + + def extract_relevant(doc_dict: dict) -> dict: """ Given the full export_to_dict() result: