Merge branch 'main' into prune-image-tui
commit 823e5925ee

14 changed files with 383 additions and 46 deletions
@@ -2,6 +2,14 @@
# Set to true to disable Langflow ingestion and use traditional OpenRAG processor
# If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete)
DISABLE_INGEST_WITH_LANGFLOW=false

# Langflow HTTP timeout configuration (in seconds)
# For large documents (300+ pages), ingestion can take 30+ minutes
# Increase these values if you experience timeouts with very large PDFs
# Default: 2400 seconds (40 minutes) total timeout, 30 seconds connection timeout
# LANGFLOW_TIMEOUT=2400
# LANGFLOW_CONNECT_TIMEOUT=30

# make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key
LANGFLOW_SECRET_KEY=
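Note: a minimal sketch of how these optional overrides are read on the backend; it mirrors the src/main.py hunk later in this diff, so only the variable names and defaults shown there are assumed.

import os

LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400"))  # 40 minutes
LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30"))  # 30 seconds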
.github/dependabot.yml (vendored, new file, +11)
@@ -0,0 +1,11 @@
version: 2

updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "monthly"
    commit-message:
      prefix: "build(deps):"
      include: scope
.pre-commit-config.yaml (new file, +7)
@@ -0,0 +1,7 @@
repos:
  - repo: https://github.com/Yelp/detect-secrets
    rev: v1.5.0
    hooks:
      - id: detect-secrets
        args: ["--baseline", ".secrets.baseline", "--exclude-lines", "code_hash"]
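Note: the .secrets.baseline added below is the audit file this hook checks new commits against. It is presumably (re)generated with the detect-secrets CLI, e.g. detect-secrets scan --exclude-lines code_hash > .secrets.baseline; the exact command is an assumption inferred from the hook arguments above.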
.secrets.baseline (new file, +180)
@@ -0,0 +1,180 @@
{
  "version": "1.5.0",
  "plugins_used": [
    {
      "name": "ArtifactoryDetector"
    },
    {
      "name": "AWSKeyDetector"
    },
    {
      "name": "AzureStorageKeyDetector"
    },
    {
      "name": "Base64HighEntropyString",
      "limit": 4.5
    },
    {
      "name": "BasicAuthDetector"
    },
    {
      "name": "CloudantDetector"
    },
    {
      "name": "DiscordBotTokenDetector"
    },
    {
      "name": "GitHubTokenDetector"
    },
    {
      "name": "GitLabTokenDetector"
    },
    {
      "name": "HexHighEntropyString",
      "limit": 3.0
    },
    {
      "name": "IbmCloudIamDetector"
    },
    {
      "name": "IbmCosHmacDetector"
    },
    {
      "name": "IPPublicDetector"
    },
    {
      "name": "JwtTokenDetector"
    },
    {
      "name": "KeywordDetector",
      "keyword_exclude": ""
    },
    {
      "name": "MailchimpDetector"
    },
    {
      "name": "NpmDetector"
    },
    {
      "name": "OpenAIDetector"
    },
    {
      "name": "PrivateKeyDetector"
    },
    {
      "name": "PypiTokenDetector"
    },
    {
      "name": "SendGridDetector"
    },
    {
      "name": "SlackDetector"
    },
    {
      "name": "SoftlayerDetector"
    },
    {
      "name": "SquareOAuthDetector"
    },
    {
      "name": "StripeDetector"
    },
    {
      "name": "TelegramBotTokenDetector"
    },
    {
      "name": "TwilioKeyDetector"
    }
  ],
  "filters_used": [
    {
      "path": "detect_secrets.filters.allowlist.is_line_allowlisted"
    },
    {
      "path": "detect_secrets.filters.common.is_baseline_file",
      "filename": ".secrets.baseline"
    },
    {
      "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
      "min_level": 2
    },
    {
      "path": "detect_secrets.filters.heuristic.is_indirect_reference"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_likely_id_string"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_lock_file"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_potential_uuid"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_sequential_string"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_swagger_file"
    },
    {
      "path": "detect_secrets.filters.heuristic.is_templated_secret"
    },
    {
      "path": "detect_secrets.filters.regex.should_exclude_file",
      "pattern": [
        "flows/.*\\.json$"
      ]
    },
    {
      "path": "detect_secrets.filters.regex.should_exclude_line",
      "pattern": [
        "code_hash"
      ]
    }
  ],
  "results": {
    "docs/docs/_partial-integrate-chat.mdx": [
      {
        "type": "Secret Keyword",
        "filename": "docs/docs/_partial-integrate-chat.mdx",
        "hashed_secret": "e42fd8b9ad15d8fa5f4718cad7cf19b522807996",
        "is_verified": false,
        "line_number": 30
      }
    ],
    "src/main.py": [
      {
        "type": "Base64 High Entropy String",
        "filename": "src/main.py",
        "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
        "is_verified": false,
        "line_number": 404
      }
    ],
    "src/models/processors.py": [
      {
        "type": "Base64 High Entropy String",
        "filename": "src/models/processors.py",
        "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
        "is_verified": false,
        "line_number": 763
      }
    ],
    "src/services/langflow_file_service.py": [
      {
        "type": "Base64 High Entropy String",
        "filename": "src/services/langflow_file_service.py",
        "hashed_secret": "131a83e9ef8660d7dd0771da7ce5954d9ea801ee",
        "is_verified": false,
        "line_number": 97
      }
    ]
  },
  "generated_at": "2025-12-09T20:33:13Z"
}
@@ -4787,7 +4787,7 @@
"is_component": false,
"locked": true,
"last_tested_version": "1.7.0.dev21",
"name": "OpenRAG OpenSearch Agent",
"name": "OpenRAG OpenSearch Agent Flow",
"tags": [
"assistants",
"agents"
@@ -4114,7 +4114,7 @@
"is_component": false,
"locked": true,
"last_tested_version": "1.7.0.dev21",
"name": "OpenRAG OpenSearch Nudges",
"name": "OpenRAG OpenSearch Nudges Flow",
"tags": [
"assistants",
"agents"
@@ -15,6 +15,7 @@ import {
} from "@/components/provider-health-banner";
import { TaskNotificationMenu } from "@/components/task-notification-menu";
import { useAuth } from "@/contexts/auth-context";
import { useChat } from "@/contexts/chat-context";
import { useKnowledgeFilter } from "@/contexts/knowledge-filter-context";
import { useTask } from "@/contexts/task-context";
import { cn } from "@/lib/utils";
@@ -27,6 +28,7 @@ export function LayoutWrapper({ children }: { children: React.ReactNode }) {
  const { isMenuOpen } = useTask();
  const { isPanelOpen } = useKnowledgeFilter();
  const { isLoading, isAuthenticated, isNoAuthMode } = useAuth();
  const { isOnboardingComplete } = useChat();

  // List of paths that should not show navigation
  const authPaths = ["/login", "/auth/callback"];
@@ -91,17 +93,17 @@
isOpen={isDoclingUnhealthy}
className="w-full"
>
<DoclingHealthBanner />
<DoclingHealthBanner />
</AnimatedConditional>
{settings?.edited && isOnboardingComplete && (
<AnimatedConditional
vertical
isOpen={isProviderUnhealthy}
className="w-full"
>
<ProviderHealthBanner />
</AnimatedConditional>
{settings?.edited && (
<AnimatedConditional
vertical
isOpen={isProviderUnhealthy}
className="w-full"
>
<ProviderHealthBanner />
</AnimatedConditional>
)}
)}
</div>

<ChatRenderer settings={settings}>{children}</ChatRenderer>
@@ -59,6 +59,12 @@ DISABLE_INGEST_WITH_LANGFLOW = os.getenv(
    "DISABLE_INGEST_WITH_LANGFLOW", "false"
).lower() in ("true", "1", "yes")

# Langflow HTTP timeout configuration (in seconds)
# For large documents (300+ pages), ingestion can take 30+ minutes
# Default: 40 minutes total, 40 minutes read timeout
LANGFLOW_TIMEOUT = float(os.getenv("LANGFLOW_TIMEOUT", "2400"))  # 40 minutes
LANGFLOW_CONNECT_TIMEOUT = float(os.getenv("LANGFLOW_CONNECT_TIMEOUT", "30"))  # 30 seconds


def is_no_auth_mode():
    """Check if we're running in no-auth mode (OAuth credentials missing)"""
@@ -317,9 +323,22 @@ class AppClients:
        # Initialize document converter
        self.converter = create_document_converter(ocr_engine=DOCLING_OCR_ENGINE)

        # Initialize Langflow HTTP client
        # Initialize Langflow HTTP client with extended timeouts for large documents
        # Use explicit timeout configuration to handle large PDF ingestion (300+ pages)
        self.langflow_http_client = httpx.AsyncClient(
            base_url=LANGFLOW_URL, timeout=1200.0
            base_url=LANGFLOW_URL,
            timeout=httpx.Timeout(
                timeout=LANGFLOW_TIMEOUT,  # Total timeout
                connect=LANGFLOW_CONNECT_TIMEOUT,  # Connection timeout
                read=LANGFLOW_TIMEOUT,  # Read timeout (most important for large PDFs)
                write=LANGFLOW_CONNECT_TIMEOUT,  # Write timeout
                pool=LANGFLOW_CONNECT_TIMEOUT,  # Pool timeout
            )
        )
        logger.info(
            "Initialized Langflow HTTP client with extended timeouts",
            timeout_seconds=LANGFLOW_TIMEOUT,
            connect_timeout_seconds=LANGFLOW_CONNECT_TIMEOUT,
        )

        return self
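Note: a minimal usage sketch of a long-running call through a client configured as above. The endpoint path and payload are placeholders; only the httpx.Timeout settings come from this diff.

import httpx

timeout = httpx.Timeout(
    timeout=2400.0,  # total
    connect=30.0,
    read=2400.0,  # large PDF ingestion can take a long time to respond
    write=30.0,
    pool=30.0,
)

async def run_flow(client: httpx.AsyncClient, payload: dict) -> dict:
    # client is assumed to be created as httpx.AsyncClient(base_url=LANGFLOW_URL, timeout=timeout)
    resp = await client.post("/api/v1/run/<flow-id>", json=payload)  # placeholder endpoint
    resp.raise_for_status()
    return resp.json()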
@@ -15,7 +15,7 @@ class OneDriveConnector(BaseConnector):

    # Required BaseConnector class attributes
    CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret

    # Connector metadata
    CONNECTOR_NAME = "OneDrive"
@@ -16,7 +16,7 @@ class SharePointConnector(BaseConnector):

    # Required BaseConnector class attributes
    CLIENT_ID_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_ID"
    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"
    CLIENT_SECRET_ENV_VAR = "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET"  # pragma: allowlist secret

    # Connector metadata
    CONNECTOR_NAME = "SharePoint"
@@ -197,10 +197,27 @@ class TaskProcessor:
            file_hash=file_hash,
        )

        # Convert and extract
        result = clients.converter.convert(file_path)
        full_doc = result.document.export_to_dict()
        slim_doc = extract_relevant(full_doc)
        # Check if this is a .txt file - use simple processing instead of docling
        import os
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.txt':
            # Simple text file processing without docling
            from utils.document_processing import process_text_file
            logger.info(
                "Processing as plain text file (bypassing docling)",
                file_path=file_path,
                file_hash=file_hash,
            )
            slim_doc = process_text_file(file_path)
            # Override filename with original_filename if provided
            if original_filename:
                slim_doc["filename"] = original_filename
        else:
            # Convert and extract using docling for other file types
            result = clients.converter.convert(file_path)
            full_doc = result.document.export_to_dict()
            slim_doc = extract_relevant(full_doc)

        texts = [c["text"] for c in slim_doc["chunks"]]
@@ -181,6 +181,7 @@ class DocumentService:
    async def process_upload_context(self, upload_file, filename: str = None):
        """Process uploaded file and return content for context"""
        import io
        import os

        if not filename:
            filename = upload_file.filename or "uploaded_document"
@@ -194,22 +195,37 @@
            content.write(chunk)
        content.seek(0)  # Reset to beginning for reading

        # Create DocumentStream and process with docling
        doc_stream = DocumentStream(name=filename, stream=content)
        result = clients.converter.convert(doc_stream)
        full_doc = result.document.export_to_dict()
        slim_doc = extract_relevant(full_doc)
        # Check if this is a .txt file - use simple processing
        file_ext = os.path.splitext(filename)[1].lower()

        if file_ext == '.txt':
            # Simple text file processing for chat context
            text_content = content.read().decode('utf-8', errors='replace')

            # For context, we don't need to chunk - just return the full content
            return {
                "filename": filename,
                "content": text_content,
                "pages": 1,  # Text files don't have pages
                "content_length": len(text_content),
            }
        else:
            # Create DocumentStream and process with docling
            doc_stream = DocumentStream(name=filename, stream=content)
            result = clients.converter.convert(doc_stream)
            full_doc = result.document.export_to_dict()
            slim_doc = extract_relevant(full_doc)

        # Extract all text content
        all_text = []
        for chunk in slim_doc["chunks"]:
            all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
            # Extract all text content
            all_text = []
            for chunk in slim_doc["chunks"]:
                all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")

        full_content = "\n\n".join(all_text)
            full_content = "\n\n".join(all_text)

        return {
            "filename": filename,
            "content": full_content,
            "pages": len(slim_doc["chunks"]),
            "content_length": len(full_content),
        }
            return {
                "filename": filename,
                "content": full_content,
                "pages": len(slim_doc["chunks"]),
                "content_length": len(full_content),
            }
@@ -123,28 +123,29 @@ class EnvManager:
        import os

        # Map env vars to config attributes
        attr_map = {
            "OPENAI_API_KEY": "openai_api_key",
            "ANTHROPIC_API_KEY": "anthropic_api_key",
        # These are environment variable names, not actual secrets
        attr_map = {  # pragma: allowlist secret
            "OPENAI_API_KEY": "openai_api_key",  # pragma: allowlist secret
            "ANTHROPIC_API_KEY": "anthropic_api_key",  # pragma: allowlist secret
            "OLLAMA_ENDPOINT": "ollama_endpoint",
            "WATSONX_API_KEY": "watsonx_api_key",
            "WATSONX_API_KEY": "watsonx_api_key",  # pragma: allowlist secret
            "WATSONX_ENDPOINT": "watsonx_endpoint",
            "WATSONX_PROJECT_ID": "watsonx_project_id",
            "OPENSEARCH_PASSWORD": "opensearch_password",
            "LANGFLOW_SECRET_KEY": "langflow_secret_key",
            "OPENSEARCH_PASSWORD": "opensearch_password",  # pragma: allowlist secret
            "LANGFLOW_SECRET_KEY": "langflow_secret_key",  # pragma: allowlist secret
            "LANGFLOW_SUPERUSER": "langflow_superuser",
            "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",
            "LANGFLOW_SUPERUSER_PASSWORD": "langflow_superuser_password",  # pragma: allowlist secret
            "LANGFLOW_CHAT_FLOW_ID": "langflow_chat_flow_id",
            "LANGFLOW_INGEST_FLOW_ID": "langflow_ingest_flow_id",
            "LANGFLOW_URL_INGEST_FLOW_ID": "langflow_url_ingest_flow_id",
            "NUDGES_FLOW_ID": "nudges_flow_id",
            "GOOGLE_OAUTH_CLIENT_ID": "google_oauth_client_id",
            "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",
            "GOOGLE_OAUTH_CLIENT_SECRET": "google_oauth_client_secret",  # pragma: allowlist secret
            "MICROSOFT_GRAPH_OAUTH_CLIENT_ID": "microsoft_graph_oauth_client_id",
            "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",
            "MICROSOFT_GRAPH_OAUTH_CLIENT_SECRET": "microsoft_graph_oauth_client_secret",  # pragma: allowlist secret
            "WEBHOOK_BASE_URL": "webhook_base_url",
            "AWS_ACCESS_KEY_ID": "aws_access_key_id",
            "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
            "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",  # pragma: allowlist secret
            "LANGFLOW_PUBLIC_URL": "langflow_public_url",
            "OPENRAG_DOCUMENTS_PATHS": "openrag_documents_paths",
            "OPENSEARCH_DATA_PATH": "opensearch_data_path",
@@ -119,6 +119,82 @@ def get_worker_converter():
    return _worker_converter


def process_text_file(file_path: str) -> dict:
    """
    Process a plain text file without using docling.
    Returns the same structure as extract_relevant() for consistency.

    Args:
        file_path: Path to the .txt file

    Returns:
        dict with keys: id, filename, mimetype, chunks
    """
    import os
    from utils.hash_utils import hash_id

    # Read the file
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    # Compute hash
    file_hash = hash_id(file_path)
    filename = os.path.basename(file_path)

    # Split content into chunks of ~1000 characters to match typical docling chunk sizes
    # This ensures embeddings stay within reasonable token limits
    chunk_size = 1000
    chunks = []

    # Split by paragraphs first (double newline)
    paragraphs = content.split('\n\n')
    current_chunk = ""
    chunk_index = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        # If adding this paragraph would exceed chunk size, save current chunk
        if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
            chunks.append({
                "page": chunk_index + 1,  # Use chunk_index + 1 as "page" number
                "type": "text",
                "text": current_chunk.strip()
            })
            chunk_index += 1
            current_chunk = para
        else:
            if current_chunk:
                current_chunk += "\n\n" + para
            else:
                current_chunk = para

    # Add the last chunk if any
    if current_chunk.strip():
        chunks.append({
            "page": chunk_index + 1,
            "type": "text",
            "text": current_chunk.strip()
        })

    # If no chunks were created (empty file), create a single empty chunk
    if not chunks:
        chunks.append({
            "page": 1,
            "type": "text",
            "text": ""
        })

    return {
        "id": file_hash,
        "filename": filename,
        "mimetype": "text/plain",
        "chunks": chunks,
    }


def extract_relevant(doc_dict: dict) -> dict:
    """
    Given the full export_to_dict() result:
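Note: a small usage sketch of the process_text_file helper added above; the file path is hypothetical and the printed fields follow the dict structure described in its docstring.

slim_doc = process_text_file("/tmp/example.txt")  # hypothetical path
print(slim_doc["filename"], slim_doc["mimetype"])
for chunk in slim_doc["chunks"]:
    print(chunk["page"], len(chunk["text"]))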