Merge branch 'main' into prune-image-tui
This commit is contained in:
commit
823e5925ee
14 changed files with 383 additions and 46 deletions
|
|
@ -2,6 +2,14 @@
|
||||||
# Set to true to disable Langflow ingestion and use traditional OpenRAG processor
|
# Set to true to disable Langflow ingestion and use traditional OpenRAG processor
|
||||||
# If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
|
# If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
|
||||||
DISABLE_INGEST_WITH_LANGFLOW=false
|
DISABLE_INGEST_WITH_LANGFLOW=false
|
||||||
|
|
||||||
|
# Langflow HTTP timeout configuration (in seconds)
|
||||||
|
# For large documents (300+ pages), ingestion can take 30+ minutes
|
||||||
|
# Increase these values if you experience timeouts with very large PDFs
|
||||||
|
# Default: 2400 seconds (40 minutes) read timeout, 30 seconds connect/write/pool timeout
|
||||||
|
# LANGFLOW_TIMEOUT=2400
|
||||||
|
# LANGFLOW_CONNECT_TIMEOUT=30
|
||||||
|
|
||||||
# make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
|
# make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
|
||||||
LANGFLOW_SECRET_KEY=
|
LANGFLOW_SECRET_KEY=
|
||||||
|
|
||||||
|
|
|
||||||
11
.github/dependabot.yml
vendored
Normal file
11
.github/dependabot.yml
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
version: 2
|
||||||
|
|
||||||
|
updates:
|
||||||
|
- package-ecosystem: "github-actions"
|
||||||
|
directory: "/"
|
||||||
|
schedule:
|
||||||
|
interval: "monthly"
|
||||||
|
commit-message:
|
||||||
|
prefix: "build(deps):"
|
||||||
|
include: scope
|
||||||
|
|
||||||
7
.pre-commit-config.yaml
Normal file
7
.pre-commit-config.yaml
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
repos:
|
||||||
|
- repo: https://github.com/Yelp/detect-secrets
|
||||||
|
rev: v1.5.0
|
||||||
|
hooks:
|
||||||
|
- id: detect-secrets
|
||||||
|
args: ["--baseline", ".secrets.baseline", "--exclude-lines", "code_hash"]
|
||||||
|
|
||||||
180
.secrets.baseline
Normal file
180
.secrets.baseline
Normal file
|
|
@ -0,0 +1,180 @@
|
||||||
|
{
|
||||||
|
"version": "1.5.0",
|
||||||
|
"plugins_used": [
|
||||||
|
{
|
||||||
|
"name": "ArtifactoryDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "AWSKeyDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "AzureStorageKeyDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Base64HighEntropyString",
|
||||||
|
"limit": 4.5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "BasicAuthDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "CloudantDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "DiscordBotTokenDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "GitHubTokenDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "GitLabTokenDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "HexHighEntropyString",
|
||||||
|
"limit": 3.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "IbmCloudIamDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "IbmCosHmacDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "IPPublicDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "JwtTokenDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "KeywordDetector",
|
||||||
|
"keyword_exclude": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "MailchimpDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "NpmDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "OpenAIDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PrivateKeyDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "PypiTokenDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "SendGridDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "SlackDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "SoftlayerDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "SquareOAuthDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "StripeDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "TelegramBotTokenDetector"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "TwilioKeyDetector"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"filters_used": [
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.common.is_baseline_file",
|
||||||
|
"filename": ".secrets.baseline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
|
||||||
|
"min_level": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.heuristic.is_indirect_reference"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.heuristic.is_likely_id_string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.heuristic.is_lock_file"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.heuristic.is_potential_uuid"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.heuristic.is_sequential_string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.heuristic.is_swagger_file"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.heuristic.is_templated_secret"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.regex.should_exclude_file",
|
||||||
|
"pattern": [
|
||||||
|
"flows/.*\\.json$"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "detect_secrets.filters.regex.should_exclude_line",
|
||||||
|
"pattern": [
|
||||||
|
"code_hash"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"results": {
|
||||||
|
"docs/docs/_partial-integrate-chat.mdx": [
|
||||||
|
{
|
||||||
|
"type": "Secret Keyword",
|
||||||
|
"filename": "docs/docs/_partial-integrate-chat.mdx",
|
||||||
|
"hashed_secret": "e42fd8b9ad15d8fa5f4718cad7cf19b522807996",
|
||||||
|
"is_verified": false,
|
||||||
|
"line_number": 30
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"src/main.py": [
|
||||||
|
{
|
||||||
|
"type": "Base64 High Entropy String",
|
||||||
|
"filename": "src/main.py",
|
||||||
|
"hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
|
||||||
|
"is_verified": false,
|
||||||
|
"line_number": 404
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"src/models/processors.py": [
|
||||||
|
{
|
||||||
|
"type": "Base64 High Entropy String",
|
||||||
|
"filename": "src/models/processors.py",
|
||||||
|
"hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
|
||||||
|
"is_verified": false,
|
||||||
|
"line_number": 763
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"src/services/langflow_file_service.py": [
|
||||||
|
{
|
||||||
|
"type": "Base64 High Entropy String",
|
||||||
|
"filename": "src/services/langflow_file_service.py",
|
||||||
|
"hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
|
||||||
|
"is_verified": false,
|
||||||
|
"line_number": 97
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"generated_at": "2025-12-09T20:33:13Z"
|
||||||
|
}
|
||||||
|
|
@ -4787,7 +4787,7 @@
|
||||||
"is_component": false,
|
"is_component": false,
|
||||||
"locked": true,
|
"locked": true,
|
||||||
"last_tested_version": "1.7.0.dev21",
|
"last_tested_version": "1.7.0.dev21",
|
||||||
"name": "OpenRAG OpenSearch Agent",
|
"name": "OpenRAG OpenSearch Agent Flow",
|
||||||
"tags": [
|
"tags": [
|
||||||
"assistants",
|
"assistants",
|
||||||
"agents"
|
"agents"
|
||||||
|
|
|
||||||
|
|
@ -4114,7 +4114,7 @@
|
||||||
"is_component": false,
|
"is_component": false,
|
||||||
"locked": true,
|
"locked": true,
|
||||||
"last_tested_version": "1.7.0.dev21",
|
"last_tested_version": "1.7.0.dev21",
|
||||||
"name": "OpenRAG OpenSearch Nudges",
|
"name": "OpenRAG OpenSearch Nudges Flow",
|
||||||
"tags": [
|
"tags": [
|
||||||
"assistants",
|
"assistants",
|
||||||
"agents"
|
"agents"
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ import {
|
||||||
} from "@/components/provider-health-banner";
|
} from "@/components/provider-health-banner";
|
||||||
import { TaskNotificationMenu } from "@/components/task-notification-menu";
|
import { TaskNotificationMenu } from "@/components/task-notification-menu";
|
||||||
import { useAuth } from "@/contexts/auth-context";
|
import { useAuth } from "@/contexts/auth-context";
|
||||||
|
import { useChat } from "@/contexts/chat-context";
|
||||||
import { useKnowledgeFilter } from "@/contexts/knowledge-filter-context";
|
import { useKnowledgeFilter } from "@/contexts/knowledge-filter-context";
|
||||||
import { useTask } from "@/contexts/task-context";
|
import { useTask } from "@/contexts/task-context";
|
||||||
import { cn } from "@/lib/utils";
|
import { cn } from "@/lib/utils";
|
||||||
|
|
@ -27,6 +28,7 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) {
|
||||||
const { isMenuOpen } = useTask();
|
const { isMenuOpen } = useTask();
|
||||||
const { isPanelOpen } = useKnowledgeFilter();
|
const { isPanelOpen } = useKnowledgeFilter();
|
||||||
const { isLoading, isAuthenticated, isNoAuthMode } = useAuth();
|
const { isLoading, isAuthenticated, isNoAuthMode } = useAuth();
|
||||||
|
const { isOnboardingComplete } = useChat();
|
||||||
|
|
||||||
// List of paths that should not show navigation
|
// List of paths that should not show navigation
|
||||||
const authPaths = ["/login", "/auth/callback"];
|
const authPaths = ["/login", "/auth/callback"];
|
||||||
|
|
@ -91,17 +93,17 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) {
|
||||||
isOpen={isDoclingUnhealthy}
|
isOpen={isDoclingUnhealthy}
|
||||||
className="w-full"
|
className="w-full"
|
||||||
>
|
>
|
||||||
<DoclingHealthBanner />
|
<DoclingHealthBanner />
|
||||||
|
</AnimatedConditional>
|
||||||
|
{settings?.edited && isOnboardingComplete && (
|
||||||
|
<AnimatedConditional
|
||||||
|
vertical
|
||||||
|
isOpen={isProviderUnhealthy}
|
||||||
|
className="w-full"
|
||||||
|
>
|
||||||
|
<ProviderHealthBanner />
|
||||||
</AnimatedConditional>
|
</AnimatedConditional>
|
||||||
{settings?.edited && (
|
)}
|
||||||
<AnimatedConditional
|
|
||||||
vertical
|
|
||||||
isOpen={isProviderUnhealthy}
|
|
||||||
className="w-full"
|
|
||||||
>
|
|
||||||
<ProviderHealthBanner />
|
|
||||||
</AnimatedConditional>
|
|
||||||
)}
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<ChatRenderer settings={settings}>{children}</ChatRenderer>
|
<ChatRenderer settings={settings}>{children}</ChatRenderer>
|
||||||
|
|
|
||||||
|
|
@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
|
||||||
"DISABLE_INGEST_WITH_LANGFLOW", "false"
|
"DISABLE_INGEST_WITH_LANGFLOW", "false"
|
||||||
).lower() in ("true", "1", "yes")
|
).lower() in ("true", "1", "yes")
|
||||||
|
|
||||||
|
# Langflow HTTP timeout configuration (in seconds)
|
||||||
|
# For large documents (300+ pages), ingestion can take 30+ minutes
|
||||||
|
# Defaults: 40-minute read timeout (LANGFLOW_TIMEOUT); 30-second connect/write/pool timeout (LANGFLOW_CONNECT_TIMEOUT)
|
||||||
|
LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400")) # 40 minutes
|
||||||
|
LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30")) # 30 seconds
|
||||||
|
|
||||||
|
|
||||||
def is_no_auth_mode():
|
def is_no_auth_mode():
|
||||||
"""Check if we're running in no-auth mode (OAuth credentials missing)"""
|
"""Check if we're running in no-auth mode (OAuth credentials missing)"""
|
||||||
|
|
@ -317,9 +323,22 @@ class AppClients:
|
||||||
# Initialize document converter
|
# Initialize document converter
|
||||||
self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
|
self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)
|
||||||
|
|
||||||
# Initialize Langflow HTTP client
|
# Initialize Langflow HTTP client with extended timeouts for large documents
|
||||||
|
# Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
|
||||||
self.langflow_http_client = httpx.AsyncClient(
|
self.langflow_http_client = httpx.AsyncClient(
|
||||||
base_url=LANGFLOW_URL, timeout=1200.0
|
base_url=LANGFLOW_URL,
|
||||||
|
timeout=httpx.Timeout(
|
||||||
|
timeout=LANGFLOW_TIMEOUT, # Total timeout
|
||||||
|
connect=LANGFLOW_CONNECT_TIMEOUT, # Connection timeout
|
||||||
|
read=LANGFLOW_TIMEOUT, # Read timeout (most important for large PDFs)
|
||||||
|
write=LANGFLOW_CONNECT_TIMEOUT, # Write timeout
|
||||||
|
pool=LANGFLOW_CONNECT_TIMEOUT, # Pool timeout
|
||||||
|
)
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
"Initialized Langflow HTTP client with extended timeouts",
|
||||||
|
timeout_seconds=LANGFLOW_TIMEOUT,
|
||||||
|
connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
|
||||||
)
|
)
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ class OneDriveConnector(BaseConnector):
|
||||||
|
|
||||||
# Required BaseConnector class attributes
|
# Required BaseConnector class attributes
|
||||||
CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
|
CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
|
||||||
CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
|
CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET" # pragma: allowlist secret
|
||||||
|
|
||||||
# Connector metadata
|
# Connector metadata
|
||||||
CONNECTOR_NAME = "OneDrive"
|
CONNECTOR_NAME = "OneDrive"
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ class SharePointConnector(BaseConnector):
|
||||||
|
|
||||||
# Required BaseConnector class attributes
|
# Required BaseConnector class attributes
|
||||||
CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
|
CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
|
||||||
CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
|
CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET" # pragma: allowlist secret
|
||||||
|
|
||||||
# Connector metadata
|
# Connector metadata
|
||||||
CONNECTOR_NAME = "SharePoint"
|
CONNECTOR_NAME = "SharePoint"
|
||||||
|
|
|
||||||
|
|
@ -197,10 +197,27 @@ class TaskProcessor:
|
||||||
file_hash=file_hash,
|
file_hash=file_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Convert and extract
|
# Check if this is a .txt file - use simple processing instead of docling
|
||||||
result = clients.converter.convert(file_path)
|
import os
|
||||||
full_doc = result.document.export_to_dict()
|
file_ext = os.path.splitext(file_path)[1].lower()
|
||||||
slim_doc = extract_relevant(full_doc)
|
|
||||||
|
if file_ext == '.txt':
|
||||||
|
# Simple text file processing without docling
|
||||||
|
from utils.document_processing import process_text_file
|
||||||
|
logger.info(
|
||||||
|
"Processing as plain text file (bypassing docling)",
|
||||||
|
file_path=file_path,
|
||||||
|
file_hash=file_hash,
|
||||||
|
)
|
||||||
|
slim_doc = process_text_file(file_path)
|
||||||
|
# Override filename with original_filename if provided
|
||||||
|
if original_filename:
|
||||||
|
slim_doc["filename"] = original_filename
|
||||||
|
else:
|
||||||
|
# Convert and extract using docling for other file types
|
||||||
|
result = clients.converter.convert(file_path)
|
||||||
|
full_doc = result.document.export_to_dict()
|
||||||
|
slim_doc = extract_relevant(full_doc)
|
||||||
|
|
||||||
texts = [c["text"] for c in slim_doc["chunks"]]
|
texts = [c["text"] for c in slim_doc["chunks"]]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -181,6 +181,7 @@ class DocumentService:
|
||||||
async def process_upload_context(self, upload_file, filename: str = None):
|
async def process_upload_context(self, upload_file, filename: str = None):
|
||||||
"""Process uploaded file and return content for context"""
|
"""Process uploaded file and return content for context"""
|
||||||
import io
|
import io
|
||||||
|
import os
|
||||||
|
|
||||||
if not filename:
|
if not filename:
|
||||||
filename = upload_file.filename or "uploaded_document"
|
filename = upload_file.filename or "uploaded_document"
|
||||||
|
|
@ -194,22 +195,37 @@ class DocumentService:
|
||||||
content.write(chunk)
|
content.write(chunk)
|
||||||
content.seek(0) # Reset to beginning for reading
|
content.seek(0) # Reset to beginning for reading
|
||||||
|
|
||||||
# Create DocumentStream and process with docling
|
# Check if this is a .txt file - use simple processing
|
||||||
doc_stream = DocumentStream(name=filename, stream=content)
|
file_ext = os.path.splitext(filename)[1].lower()
|
||||||
result = clients.converter.convert(doc_stream)
|
|
||||||
full_doc = result.document.export_to_dict()
|
if file_ext == '.txt':
|
||||||
slim_doc = extract_relevant(full_doc)
|
# Simple text file processing for chat context
|
||||||
|
text_content = content.read().decode('utf-8', errors='replace')
|
||||||
|
|
||||||
|
# For context, we don't need to chunk - just return the full content
|
||||||
|
return {
|
||||||
|
"filename": filename,
|
||||||
|
"content": text_content,
|
||||||
|
"pages": 1, # Text files don't have pages
|
||||||
|
"content_length": len(text_content),
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
# Create DocumentStream and process with docling
|
||||||
|
doc_stream = DocumentStream(name=filename, stream=content)
|
||||||
|
result = clients.converter.convert(doc_stream)
|
||||||
|
full_doc = result.document.export_to_dict()
|
||||||
|
slim_doc = extract_relevant(full_doc)
|
||||||
|
|
||||||
# Extract all text content
|
# Extract all text content
|
||||||
all_text = []
|
all_text = []
|
||||||
for chunk in slim_doc["chunks"]:
|
for chunk in slim_doc["chunks"]:
|
||||||
all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
|
all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
|
||||||
|
|
||||||
full_content = "\n\n".join(all_text)
|
full_content = "\n\n".join(all_text)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"filename": filename,
|
"filename": filename,
|
||||||
"content": full_content,
|
"content": full_content,
|
||||||
"pages": len(slim_doc["chunks"]),
|
"pages": len(slim_doc["chunks"]),
|
||||||
"content_length": len(full_content),
|
"content_length": len(full_content),
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -123,28 +123,29 @@ class EnvManager:
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# Map env vars to config attributes
|
# Map env vars to config attributes
|
||||||
attr_map = {
|
# These are environment variable names, not actual secrets
|
||||||
"OPENAI_API_KEY": "openai_api_key",
|
attr_map = { # pragma: allowlist secret
|
||||||
"ANTHROPIC_API_KEY": "anthropic_api_key",
|
"OPENAI_API_KEY": "openai_api_key", # pragma: allowlist secret
|
||||||
|
"ANTHROPIC_API_KEY": "anthropic_api_key", # pragma: allowlist secret
|
||||||
"OLLAMA_ENDPOINT": "ollama_endpoint",
|
"OLLAMA_ENDPOINT": "ollama_endpoint",
|
||||||
"WATSONX_API_KEY": "watsonx_api_key",
|
"WATSONX_API_KEY": "watsonx_api_key", # pragma: allowlist secret
|
||||||
"WATSONX_ENDPOINT": "watsonx_endpoint",
|
"WATSONX_ENDPOINT": "watsonx_endpoint",
|
||||||
"WATSONX_PROJECT_ID": "watsonx_project_id",
|
"WATSONX_PROJECT_ID": "watsonx_project_id",
|
||||||
"OPENSEARCH_PASSWORD": "opensearch_password",
|
"OPENSEARCH_PASSWORD": "opensearch_password", # pragma: allowlist secret
|
||||||
"LANGFLOW_SECRET_KEY": "langflow_secret_key",
|
"LANGFLOW_SECRET_KEY": "langflow_secret_key", # pragma: allowlist secret
|
||||||
"LANGFLOW_SUPERUSER": "langflow_superuser",
|
"LANGFLOW_SUPERUSER": "langflow_superuser",
|
||||||
"LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",
|
"LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password", # pragma: allowlist secret
|
||||||
"LANGFLOW_CHAT_FLOW_ID": "langflow_chat_flow_id",
|
"LANGFLOW_CHAT_FLOW_ID": "langflow_chat_flow_id",
|
||||||
"LANGFLOW_INGEST_FLOW_ID": "langflow_ingest_flow_id",
|
"LANGFLOW_INGEST_FLOW_ID": "langflow_ingest_flow_id",
|
||||||
"LANGFLOW_URL_INGEST_FLOW_ID": "langflow_url_ingest_flow_id",
|
"LANGFLOW_URL_INGEST_FLOW_ID": "langflow_url_ingest_flow_id",
|
||||||
"NUDGES_FLOW_ID": "nudges_flow_id",
|
"NUDGES_FLOW_ID": "nudges_flow_id",
|
||||||
"GOOGLE_OAUTH_CLIENT_ID": "google_oauth_client_id",
|
"GOOGLE_OAUTH_CLIENT_ID": "google_oauth_client_id",
|
||||||
"GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",
|
"GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret", # pragma: allowlist secret
|
||||||
"MICROSOFT_GRAPH_OAUTH_CLIENT_ID": "microsoft_graph_oauth_client_id",
|
"MICROSOFT_GRAPH_OAUTH_CLIENT_ID": "microsoft_graph_oauth_client_id",
|
||||||
"MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",
|
"MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret", # pragma: allowlist secret
|
||||||
"WEBHOOK_BASE_URL": "webhook_base_url",
|
"WEBHOOK_BASE_URL": "webhook_base_url",
|
||||||
"AWS_ACCESS_KEY_ID": "aws_access_key_id",
|
"AWS_ACCESS_KEY_ID": "aws_access_key_id",
|
||||||
"AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
|
"AWS_SECRET_ACCESS_KEY": "aws_secret_access_key", # pragma: allowlist secret
|
||||||
"LANGFLOW_PUBLIC_URL": "langflow_public_url",
|
"LANGFLOW_PUBLIC_URL": "langflow_public_url",
|
||||||
"OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths",
|
"OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths",
|
||||||
"OPENSEARCH_DATA_PATH": "opensearch_data_path",
|
"OPENSEARCH_DATA_PATH": "opensearch_data_path",
|
||||||
|
|
|
||||||
|
|
@ -119,6 +119,82 @@ def get_worker_converter():
|
||||||
return _worker_converter
|
return _worker_converter
|
||||||
|
|
||||||
|
|
||||||
|
def _split_long_paragraph(text: str, limit: int) -> list:
    """Split one stripped paragraph longer than ``limit`` into pieces of at most ``limit`` characters."""
    pieces = []
    remaining = text
    while len(remaining) > limit:
        # Prefer the last whitespace at or before the limit so words stay
        # intact; fall back to a hard cut when the window has no whitespace.
        cut = next(
            (i for i in range(limit, 0, -1) if remaining[i].isspace()),
            limit,
        )
        pieces.append(remaining[:cut].rstrip())
        remaining = remaining[cut:].lstrip()
    if remaining:
        pieces.append(remaining)
    return pieces


def _pack_paragraphs(content: str, chunk_size: int) -> list:
    """Greedily pack blank-line separated paragraphs into texts of at most ``chunk_size`` characters."""
    texts = []
    current = ""

    for para in content.split('\n\n'):
        para = para.strip()
        if not para:
            continue

        if len(para) > chunk_size:
            # A single paragraph (or a whole file without blank lines) that
            # exceeds the limit must be split, otherwise it would be emitted
            # as one arbitrarily large chunk.
            if current:
                texts.append(current)
            # Only the trailing piece stays open for merging with following
            # paragraphs; earlier pieces are emitted on their own so no
            # artificial paragraph break is inserted inside the paragraph.
            *full_pieces, current = _split_long_paragraph(para, chunk_size)
            texts.extend(full_pieces)
            continue

        # "+ 2" accounts for the "\n\n" separator used when joining paragraphs.
        if current and len(current) + len(para) + 2 > chunk_size:
            texts.append(current)
            current = para
        elif current:
            current += "\n\n" + para
        else:
            current = para

    if current:
        texts.append(current)
    return texts


def process_text_file(file_path: str, chunk_size: int = 1000) -> dict:
    """
    Process a plain text file without using docling.
    Returns the same structure as extract_relevant() for consistency.

    The file is decoded as UTF-8 (undecodable bytes are replaced), split into
    paragraphs on blank lines, and the paragraphs are packed into chunks of at
    most ``chunk_size`` characters. Paragraphs longer than ``chunk_size`` are
    split at whitespace where possible (hard cut otherwise), so no chunk
    exceeds the limit.

    Args:
        file_path: Path to the .txt file
        chunk_size: Maximum number of characters per chunk (default 1000, to
            match typical docling chunk sizes and keep embeddings within
            reasonable token limits)

    Returns:
        dict with keys: id, filename, mimetype, chunks. Each chunk is a dict
        with keys: page (1-based chunk number), type ("text"), text. An empty
        file yields a single chunk with empty text.

    Raises:
        ValueError: If chunk_size is not a positive integer.
    """
    import os
    from utils.hash_utils import hash_id

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Read the file
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    # Compute hash (hash_id is a project helper; it is given the path, not the text)
    file_hash = hash_id(file_path)
    filename = os.path.basename(file_path)

    # If no chunks were created (empty file), fall back to a single empty chunk
    texts = _pack_paragraphs(content, chunk_size) or [""]

    # Text files have no pages, so the 1-based chunk number is used as "page"
    chunks = [
        {"page": page, "type": "text", "text": text}
        for page, text in enumerate(texts, start=1)
    ]

    return {
        "id": file_hash,
        "filename": filename,
        "mimetype": "text/plain",
        "chunks": chunks,
    }
|
||||||
|
|
||||||
|
|
||||||
def extract_relevant(doc_dict: dict) -> dict:
|
def extract_relevant(doc_dict: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
Given the full export_to_dict() result:
|
Given the full export_to_dict() result:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue