Merge branch 'main' into prune-image-tui

Edwin Jose 2025-12-10 20:54:43 -05:00 committed by GitHub
commit 823e5925ee
14 changed files with 383 additions and 46 deletions


@@ -2,6 +2,14 @@
 # Set to true to disable Langflow ingestion and use traditional OpenRAG processor
 # If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
 DISABLE_INGEST_WITH_LANGFLOW=false
+
+# Langflow HTTP timeout configuration (in seconds)
+# For large documents (300+ pages), ingestion can take 30+ minutes
+# Increase these values if you experience timeouts with very large PDFs
+# Default: 2400 seconds (40 minutes) total timeout, 30 seconds connection timeout
+# LANGFLOW_TIMEOUT=2400
+# LANGFLOW_CONNECT_TIMEOUT=30
+
 # make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
 LANGFLOW_SECRET_KEY=

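As a rough guide for choosing these values, assuming ingestion time scales roughly linearly with page count (the only data point given above is ~30 minutes for a 300-page document, so treat the rate and the 2x safety margin below as assumptions), a small sizing helper might look like this:

# Back-of-the-envelope sizing for LANGFLOW_TIMEOUT.
# Assumption: ingestion time grows roughly linearly with page count,
# at about 30 minutes per 300 pages (the figure quoted above).
def suggested_langflow_timeout(pages: int, minutes_per_300_pages: float = 30.0,
                               safety_factor: float = 2.0) -> int:
    minutes = (pages / 300.0) * minutes_per_300_pages * safety_factor
    return max(2400, int(minutes * 60))  # never below the 40-minute default

print(suggested_langflow_timeout(300))   # 3600  -> LANGFLOW_TIMEOUT=3600
print(suggested_langflow_timeout(1000))  # 12000 -> LANGFLOW_TIMEOUT=12000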
.github/dependabot.yml (new file, +11 lines)

@ -0,0 +1,11 @@
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "monthly"
commit-message:
prefix: "build(deps):"
include: scope

.pre-commit-config.yaml (new file, +7 lines)

@@ -0,0 +1,7 @@
repos:
  - repo: https://github.com/Yelp/detect-secrets
    rev: v1.5.0
    hooks:
      - id: detect-secrets
        args: ["--baseline", ".secrets.baseline", "--exclude-lines", "code_hash"]

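To run this hook outside of a normal commit, or to refresh the baseline after intentional changes, one option is to drive the two CLIs from a short script (a sketch; it assumes pre-commit and detect-secrets are installed and uses only their documented commands):

# Sketch: run the detect-secrets hook and regenerate the baseline.
# Assumes the pre-commit and detect-secrets CLIs are installed.
import subprocess

# Run just the detect-secrets hook across the whole repository.
subprocess.run(["pre-commit", "run", "detect-secrets", "--all-files"], check=False)

# Rebuild the baseline, then review it with `detect-secrets audit .secrets.baseline`.
with open(".secrets.baseline", "w") as fh:
    subprocess.run(
        ["detect-secrets", "scan", "--exclude-lines", "code_hash"],
        stdout=fh,
        check=True,
    )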
.secrets.baseline (new file, +180 lines)

@@ -0,0 +1,180 @@
{
"version": "1.5.0",
"plugins_used": [
{
"name": "ArtifactoryDetector"
},
{
"name": "AWSKeyDetector"
},
{
"name": "AzureStorageKeyDetector"
},
{
"name": "Base64HighEntropyString",
"limit": 4.5
},
{
"name": "BasicAuthDetector"
},
{
"name": "CloudantDetector"
},
{
"name": "DiscordBotTokenDetector"
},
{
"name": "GitHubTokenDetector"
},
{
"name": "GitLabTokenDetector"
},
{
"name": "HexHighEntropyString",
"limit": 3.0
},
{
"name": "IbmCloudIamDetector"
},
{
"name": "IbmCosHmacDetector"
},
{
"name": "IPPublicDetector"
},
{
"name": "JwtTokenDetector"
},
{
"name": "KeywordDetector",
"keyword_exclude": ""
},
{
"name": "MailchimpDetector"
},
{
"name": "NpmDetector"
},
{
"name": "OpenAIDetector"
},
{
"name": "PrivateKeyDetector"
},
{
"name": "PypiTokenDetector"
},
{
"name": "SendGridDetector"
},
{
"name": "SlackDetector"
},
{
"name": "SoftlayerDetector"
},
{
"name": "SquareOAuthDetector"
},
{
"name": "StripeDetector"
},
{
"name": "TelegramBotTokenDetector"
},
{
"name": "TwilioKeyDetector"
}
],
"filters_used": [
{
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
},
{
"path": "detect_secrets.filters.common.is_baseline_file",
"filename": ".secrets.baseline"
},
{
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
"min_level": 2
},
{
"path": "detect_secrets.filters.heuristic.is_indirect_reference"
},
{
"path": "detect_secrets.filters.heuristic.is_likely_id_string"
},
{
"path": "detect_secrets.filters.heuristic.is_lock_file"
},
{
"path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
},
{
"path": "detect_secrets.filters.heuristic.is_potential_uuid"
},
{
"path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
},
{
"path": "detect_secrets.filters.heuristic.is_sequential_string"
},
{
"path": "detect_secrets.filters.heuristic.is_swagger_file"
},
{
"path": "detect_secrets.filters.heuristic.is_templated_secret"
},
{
"path": "detect_secrets.filters.regex.should_exclude_file",
"pattern": [
"flows/.*\\.json$"
]
},
{
"path": "detect_secrets.filters.regex.should_exclude_line",
"pattern": [
"code_hash"
]
}
],
"results": {
"docs/docs/_partial-integrate-chat.mdx": [
{
"type": "Secret Keyword",
"filename": "docs/docs/_partial-integrate-chat.mdx",
"hashed_secret": "e42fd8b9ad15d8fa5f4718cad7cf19b522807996",
"is_verified": false,
"line_number": 30
}
],
"src/main.py": [
{
"type": "Base64 High Entropy String",
"filename": "src/main.py",
"hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
"is_verified": false,
"line_number": 404
}
],
"src/models/processors.py": [
{
"type": "Base64 High Entropy String",
"filename": "src/models/processors.py",
"hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
"is_verified": false,
"line_number": 763
}
],
"src/services/langflow_file_service.py": [
{
"type": "Base64 High Entropy String",
"filename": "src/services/langflow_file_service.py",
"hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
"is_verified": false,
"line_number": 97
}
]
},
"generated_at": "2025-12-09T20:33:13Z"
}


@@ -4787,7 +4787,7 @@
   "is_component": false,
   "locked": true,
   "last_tested_version": "1.7.0.dev21",
-  "name": "OpenRAG OpenSearch Agent",
+  "name": "OpenRAG OpenSearch Agent Flow",
   "tags": [
     "assistants",
     "agents"


@@ -4114,7 +4114,7 @@
   "is_component": false,
   "locked": true,
   "last_tested_version": "1.7.0.dev21",
-  "name": "OpenRAG OpenSearch Nudges",
+  "name": "OpenRAG OpenSearch Nudges Flow",
   "tags": [
     "assistants",
     "agents"


@@ -15,6 +15,7 @@ import {
 } from "@/components/provider-health-banner";
 import { TaskNotificationMenu } from "@/components/task-notification-menu";
 import { useAuth } from "@/contexts/auth-context";
+import { useChat } from "@/contexts/chat-context";
 import { useKnowledgeFilter } from "@/contexts/knowledge-filter-context";
 import { useTask } from "@/contexts/task-context";
 import { cn } from "@/lib/utils";
@@ -27,6 +28,7 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) {
   const { isMenuOpen } = useTask();
   const { isPanelOpen } = useKnowledgeFilter();
   const { isLoading, isAuthenticated, isNoAuthMode } = useAuth();
+  const { isOnboardingComplete } = useChat();

   // List of paths that should not show navigation
   const authPaths = ["/login", "/auth/callback"];
@@ -91,17 +93,17 @@
isOpen={isDoclingUnhealthy}
className="w-full"
>
<DoclingHealthBanner />
<DoclingHealthBanner />
</AnimatedConditional>
{settings?.edited && isOnboardingComplete && (
<AnimatedConditional
vertical
isOpen={isProviderUnhealthy}
className="w-full"
>
<ProviderHealthBanner />
</AnimatedConditional>
{settings?.edited && (
<AnimatedConditional
vertical
isOpen={isProviderUnhealthy}
className="w-full"
>
<ProviderHealthBanner />
</AnimatedConditional>
)}
)}
</div>
<ChatRenderer settings={settings}>{children}</ChatRenderer>


@@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
     "DISABLE_INGEST_WITH_LANGFLOW", "false"
 ).lower() in ("true", "1", "yes")
 
+# Langflow HTTP timeout configuration (in seconds)
+# For large documents (300+ pages), ingestion can take 30+ minutes
+# Default: 40 minutes total, 40 minutes read timeout
+LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400"))  # 40 minutes
+LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30"))  # 30 seconds
+
 
 def is_no_auth_mode():
     """Check if we're running in no-auth mode (OAuth credentials missing)"""
@@ -317,9 +323,22 @@ class AppClients:
         # Initialize document converter
         self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
 
-        # Initialize Langflow HTTP client
+        # Initialize Langflow HTTP client with extended timeouts for large documents
+        # Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
         self.langflow_http_client = httpx.AsyncClient(
-            base_url=LANGFLOW_URL, timeout=1200.0
+            base_url=LANGFLOW_URL,
+            timeout=httpx.Timeout(
+                timeout=LANGFLOW_TIMEOUT,  # Total timeout
+                connect=LANGFLOW_CONNECT_TIMEOUT,  # Connection timeout
+                read=LANGFLOW_TIMEOUT,  # Read timeout (most important for large PDFs)
+                write=LANGFLOW_CONNECT_TIMEOUT,  # Write timeout
+                pool=LANGFLOW_CONNECT_TIMEOUT,  # Pool timeout
+            ),
         )
+        logger.info(
+            "Initialized Langflow HTTP client with extended timeouts",
+            timeout_seconds=LANGFLOW_TIMEOUT,
+            connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
+        )
 
         return self

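For code that calls this client, the practical effect is that a slow ingestion request is now bounded by LANGFLOW_TIMEOUT instead of the previous hard-coded 1200 seconds. A minimal sketch of what handling that looks like (the endpoint path and payload are illustrative placeholders, not taken from this diff):

# Minimal sketch of a long-running call through the shared client.
# The /api/v1/run/{flow_id} path and payload are illustrative placeholders.
import httpx

async def run_ingest_flow(client: httpx.AsyncClient, flow_id: str, payload: dict) -> dict:
    try:
        # The read timeout (LANGFLOW_TIMEOUT) is what matters while Langflow
        # works through a large PDF before returning a response.
        resp = await client.post(f"/api/v1/run/{flow_id}", json=payload)
        resp.raise_for_status()
        return resp.json()
    except httpx.ReadTimeout:
        # Raised once LANGFLOW_TIMEOUT elapses without a response.
        raise RuntimeError("Langflow ingestion exceeded LANGFLOW_TIMEOUT") from None
    except httpx.ConnectTimeout:
        # Raised if the connection itself takes longer than LANGFLOW_CONNECT_TIMEOUT.
        raise RuntimeError("Langflow unreachable within LANGFLOW_CONNECT_TIMEOUT") from None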

@@ -15,7 +15,7 @@ class OneDriveConnector(BaseConnector):
     # Required BaseConnector class attributes
     CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
-    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret
 
     # Connector metadata
     CONNECTOR_NAME = "OneDrive"


@@ -16,7 +16,7 @@ class SharePointConnector(BaseConnector):
     # Required BaseConnector class attributes
     CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
-    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
+    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret
 
     # Connector metadata
     CONNECTOR_NAME = "SharePoint"


@@ -197,10 +197,27 @@ class TaskProcessor:
                 file_hash=file_hash,
             )
 
-            # Convert and extract
-            result = clients.converter.convert(file_path)
-            full_doc = result.document.export_to_dict()
-            slim_doc = extract_relevant(full_doc)
+            # Check if this is a .txt file - use simple processing instead of docling
+            import os
+
+            file_ext = os.path.splitext(file_path)[1].lower()
+
+            if file_ext == '.txt':
+                # Simple text file processing without docling
+                from utils.document_processing import process_text_file
+
+                logger.info(
+                    "Processing as plain text file (bypassing docling)",
+                    file_path=file_path,
+                    file_hash=file_hash,
+                )
+                slim_doc = process_text_file(file_path)
+                # Override filename with original_filename if provided
+                if original_filename:
+                    slim_doc["filename"] = original_filename
+            else:
+                # Convert and extract using docling for other file types
+                result = clients.converter.convert(file_path)
+                full_doc = result.document.export_to_dict()
+                slim_doc = extract_relevant(full_doc)
 
             texts = [c["text"] for c in slim_doc["chunks"]]

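Both branches are expected to hand back the same slim_doc shape, which is what the texts = [...] context line above relies on. Roughly (key names taken from process_text_file later in this diff, values illustrative):

# Common shape produced by both the docling path (extract_relevant) and the
# plain-text path (process_text_file); values here are illustrative only.
slim_doc = {
    "id": "<file hash>",
    "filename": "notes.txt",
    "mimetype": "text/plain",
    "chunks": [
        {"page": 1, "type": "text", "text": "First ~1000 characters of text..."},
        {"page": 2, "type": "text", "text": "Next group of paragraphs..."},
    ],
}
texts = [c["text"] for c in slim_doc["chunks"]]  # as in the context line above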

@@ -181,6 +181,7 @@ class DocumentService:
     async def process_upload_context(self, upload_file, filename: str = None):
         """Process uploaded file and return content for context"""
         import io
+        import os
 
         if not filename:
             filename = upload_file.filename or "uploaded_document"
@@ -194,22 +195,37 @@
             content.write(chunk)
         content.seek(0)  # Reset to beginning for reading
 
-        # Create DocumentStream and process with docling
-        doc_stream = DocumentStream(name=filename, stream=content)
-        result = clients.converter.convert(doc_stream)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing
+        file_ext = os.path.splitext(filename)[1].lower()
+
+        if file_ext == '.txt':
+            # Simple text file processing for chat context
+            text_content = content.read().decode('utf-8', errors='replace')
+            # For context, we don't need to chunk - just return the full content
+            return {
+                "filename": filename,
+                "content": text_content,
+                "pages": 1,  # Text files don't have pages
+                "content_length": len(text_content),
+            }
+        else:
+            # Create DocumentStream and process with docling
+            doc_stream = DocumentStream(name=filename, stream=content)
+            result = clients.converter.convert(doc_stream)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)
 
-        # Extract all text content
-        all_text = []
-        for chunk in slim_doc["chunks"]:
-            all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
+            # Extract all text content
+            all_text = []
+            for chunk in slim_doc["chunks"]:
+                all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
 
-        full_content = "\n\n".join(all_text)
+            full_content = "\n\n".join(all_text)
 
-        return {
-            "filename": filename,
-            "content": full_content,
-            "pages": len(slim_doc["chunks"]),
-            "content_length": len(full_content),
-        }
+            return {
+                "filename": filename,
+                "content": full_content,
+                "pages": len(slim_doc["chunks"]),
+                "content_length": len(full_content),
+            }

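The return value keeps the same keys on both branches, so downstream chat-context code does not need to care which path ran. Illustrative payloads (values are made up to show the shape only):

# Illustrative payloads returned by process_upload_context.
txt_context = {
    "filename": "notes.txt",
    "content": "raw file text ...",    # full decoded text, no chunking
    "pages": 1,                         # .txt files are treated as one page
    "content_length": 17,
}
pdf_context = {
    "filename": "report.pdf",
    "content": "Page 1:\n...\n\nPage 2:\n...",
    "pages": 12,                        # number of docling chunks, not physical pages
    "content_length": 48210,            # len(full_content)
}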

@@ -123,28 +123,29 @@ class EnvManager:
         import os
 
         # Map env vars to config attributes
-        attr_map = {
-            "OPENAI_API_KEY": "openai_api_key",
-            "ANTHROPIC_API_KEY": "anthropic_api_key",
+        # These are environment variable names, not actual secrets
+        attr_map = {  # pragma: allowlist secret
+            "OPENAI_API_KEY": "openai_api_key",  # pragma: allowlist secret
+            "ANTHROPIC_API_KEY": "anthropic_api_key",  # pragma: allowlist secret
             "OLLAMA_ENDPOINT": "ollama_endpoint",
-            "WATSONX_API_KEY": "watsonx_api_key",
+            "WATSONX_API_KEY": "watsonx_api_key",  # pragma: allowlist secret
             "WATSONX_ENDPOINT": "watsonx_endpoint",
             "WATSONX_PROJECT_ID": "watsonx_project_id",
-            "OPENSEARCH_PASSWORD": "opensearch_password",
-            "LANGFLOW_SECRET_KEY": "langflow_secret_key",
+            "OPENSEARCH_PASSWORD": "opensearch_password",  # pragma: allowlist secret
+            "LANGFLOW_SECRET_KEY": "langflow_secret_key",  # pragma: allowlist secret
             "LANGFLOW_SUPERUSER": "langflow_superuser",
-            "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",
+            "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",  # pragma: allowlist secret
             "LANGFLOW_CHAT_FLOW_ID": "langflow_chat_flow_id",
             "LANGFLOW_INGEST_FLOW_ID": "langflow_ingest_flow_id",
             "LANGFLOW_URL_INGEST_FLOW_ID": "langflow_url_ingest_flow_id",
             "NUDGES_FLOW_ID": "nudges_flow_id",
             "GOOGLE_OAUTH_CLIENT_ID": "google_oauth_client_id",
-            "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",
+            "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",  # pragma: allowlist secret
             "MICROSOFT_GRAPH_OAUTH_CLIENT_ID": "microsoft_graph_oauth_client_id",
-            "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",
+            "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",  # pragma: allowlist secret
             "WEBHOOK_BASE_URL": "webhook_base_url",
             "AWS_ACCESS_KEY_ID": "aws_access_key_id",
-            "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
+            "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",  # pragma: allowlist secret
             "LANGFLOW_PUBLIC_URL": "langflow_public_url",
             "OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths",
             "OPENSEARCH_DATA_PATH": "opensearch_data_path",


@@ -119,6 +119,82 @@ def get_worker_converter():
     return _worker_converter
 
 
+def process_text_file(file_path: str) -> dict:
+    """
+    Process a plain text file without using docling.
+    Returns the same structure as extract_relevant() for consistency.
+
+    Args:
+        file_path: Path to the .txt file
+
+    Returns:
+        dict with keys: id, filename, mimetype, chunks
+    """
+    import os
+
+    from utils.hash_utils import hash_id
+
+    # Read the file
+    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+        content = f.read()
+
+    # Compute hash
+    file_hash = hash_id(file_path)
+    filename = os.path.basename(file_path)
+
+    # Split content into chunks of ~1000 characters to match typical docling chunk sizes
+    # This ensures embeddings stay within reasonable token limits
+    chunk_size = 1000
+    chunks = []
+
+    # Split by paragraphs first (double newline)
+    paragraphs = content.split('\n\n')
+
+    current_chunk = ""
+    chunk_index = 0
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # If adding this paragraph would exceed chunk size, save current chunk
+        if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
+            chunks.append({
+                "page": chunk_index + 1,  # Use chunk_index + 1 as "page" number
+                "type": "text",
+                "text": current_chunk.strip()
+            })
+            chunk_index += 1
+            current_chunk = para
+        else:
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+
+    # Add the last chunk if any
+    if current_chunk.strip():
+        chunks.append({
+            "page": chunk_index + 1,
+            "type": "text",
+            "text": current_chunk.strip()
+        })
+
+    # If no chunks were created (empty file), create a single empty chunk
+    if not chunks:
+        chunks.append({
+            "page": 1,
+            "type": "text",
+            "text": ""
+        })
+
+    return {
+        "id": file_hash,
+        "filename": filename,
+        "mimetype": "text/plain",
+        "chunks": chunks,
+    }
+
+
 def extract_relevant(doc_dict: dict) -> dict:
     """
     Given the full export_to_dict() result:
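A small usage sketch for process_text_file (it assumes the module is importable as utils.document_processing, mirroring the import in the task-processor hunk above):

# Usage sketch: chunking behaviour of process_text_file on a throwaway file.
import tempfile

from utils.document_processing import process_text_file

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write("First paragraph.\n\nSecond paragraph.\n\n" + "x" * 1500)
    path = f.name

doc = process_text_file(path)
print(doc["mimetype"])            # text/plain
print(len(doc["chunks"]))         # 2 -- a single oversized paragraph is not split further
print(doc["chunks"][0]["page"])   # 1 (chunk_index + 1 stands in for a page number)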