ingest refactor
This commit is contained in:
parent
0866b5218e
commit
c6ba47d118
8 changed files with 197 additions and 483 deletions
|
|
@ -134,7 +134,7 @@ export function KnowledgeDropdown({ active, variant = 'navigation' }: KnowledgeD
|
||||||
formData.append('file', files[0])
|
formData.append('file', files[0])
|
||||||
|
|
||||||
// Use router upload and ingest endpoint (automatically routes based on configuration)
|
// Use router upload and ingest endpoint (automatically routes based on configuration)
|
||||||
const uploadIngestRes = await fetch('/api/router/upload_ingest', {
|
const uploadIngestRes = await fetch('/api/upload', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
body: formData,
|
body: formData,
|
||||||
})
|
})
|
||||||
|
|
@ -463,4 +463,4 @@ export function KnowledgeDropdown({ active, variant = 'navigation' }: KnowledgeD
|
||||||
|
|
||||||
</>
|
</>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,7 @@ function AdminPage() {
|
||||||
const formData = new FormData()
|
const formData = new FormData()
|
||||||
formData.append("file", selectedFile)
|
formData.append("file", selectedFile)
|
||||||
|
|
||||||
const response = await fetch("/api/router/upload_ingest", {
|
const response = await fetch("/api/upload", {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
body: formData,
|
body: formData,
|
||||||
})
|
})
|
||||||
|
|
@ -326,4 +326,4 @@ export default function ProtectedAdminPage() {
|
||||||
<AdminPage />
|
<AdminPage />
|
||||||
</ProtectedRoute>
|
</ProtectedRoute>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,249 +6,6 @@ from utils.logging_config import get_logger
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
async def upload_user_file(
|
|
||||||
request: Request, langflow_file_service: LangflowFileService, session_manager
|
|
||||||
):
|
|
||||||
try:
|
|
||||||
logger.debug("upload_user_file endpoint called")
|
|
||||||
form = await request.form()
|
|
||||||
upload_file = form.get("file")
|
|
||||||
if upload_file is None:
|
|
||||||
logger.error("No file provided in upload request")
|
|
||||||
return JSONResponse({"error": "Missing file"}, status_code=400)
|
|
||||||
|
|
||||||
logger.debug(
|
|
||||||
"Processing file", filename=upload_file.filename, size=upload_file.size
|
|
||||||
)
|
|
||||||
|
|
||||||
# starlette UploadFile provides file-like; httpx needs (filename, file, content_type)
|
|
||||||
content = await upload_file.read()
|
|
||||||
file_tuple = (
|
|
||||||
upload_file.filename,
|
|
||||||
content,
|
|
||||||
upload_file.content_type or "application/octet-stream",
|
|
||||||
)
|
|
||||||
|
|
||||||
jwt_token = getattr(request.state, "jwt_token", None)
|
|
||||||
logger.debug("JWT token status", jwt_present=jwt_token is not None)
|
|
||||||
|
|
||||||
logger.debug("Calling langflow_file_service.upload_user_file")
|
|
||||||
result = await langflow_file_service.upload_user_file(file_tuple, jwt_token)
|
|
||||||
logger.debug("Upload successful", result=result)
|
|
||||||
return JSONResponse(result, status_code=201)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(
|
|
||||||
"upload_user_file endpoint failed",
|
|
||||||
error_type=type(e).__name__,
|
|
||||||
error=str(e),
|
|
||||||
)
|
|
||||||
import traceback
|
|
||||||
|
|
||||||
logger.error("Full traceback", traceback=traceback.format_exc())
|
|
||||||
return JSONResponse({"error": str(e)}, status_code=500)
|
|
||||||
|
|
||||||
|
|
||||||
async def run_ingestion(
|
|
||||||
request: Request, langflow_file_service: LangflowFileService, session_manager
|
|
||||||
):
|
|
||||||
try:
|
|
||||||
payload = await request.json()
|
|
||||||
file_ids = payload.get("file_ids")
|
|
||||||
file_paths = payload.get("file_paths") or []
|
|
||||||
session_id = payload.get("session_id")
|
|
||||||
tweaks = payload.get("tweaks") or {}
|
|
||||||
settings = payload.get("settings", {})
|
|
||||||
|
|
||||||
# We assume file_paths is provided. If only file_ids are provided, client would need to resolve to paths via Files API (not implemented here).
|
|
||||||
if not file_paths and not file_ids:
|
|
||||||
return JSONResponse(
|
|
||||||
{"error": "Provide file_paths or file_ids"}, status_code=400
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert UI settings to component tweaks using exact component IDs
|
|
||||||
if settings:
|
|
||||||
logger.debug("Applying ingestion settings", settings=settings)
|
|
||||||
|
|
||||||
# Split Text component tweaks (SplitText-QIKhg)
|
|
||||||
if (
|
|
||||||
settings.get("chunkSize")
|
|
||||||
or settings.get("chunkOverlap")
|
|
||||||
or settings.get("separator")
|
|
||||||
):
|
|
||||||
if "SplitText-QIKhg" not in tweaks:
|
|
||||||
tweaks["SplitText-QIKhg"] = {}
|
|
||||||
if settings.get("chunkSize"):
|
|
||||||
tweaks["SplitText-QIKhg"]["chunk_size"] = settings["chunkSize"]
|
|
||||||
if settings.get("chunkOverlap"):
|
|
||||||
tweaks["SplitText-QIKhg"]["chunk_overlap"] = settings[
|
|
||||||
"chunkOverlap"
|
|
||||||
]
|
|
||||||
if settings.get("separator"):
|
|
||||||
tweaks["SplitText-QIKhg"]["separator"] = settings["separator"]
|
|
||||||
|
|
||||||
# OpenAI Embeddings component tweaks (OpenAIEmbeddings-joRJ6)
|
|
||||||
if settings.get("embeddingModel"):
|
|
||||||
if "OpenAIEmbeddings-joRJ6" not in tweaks:
|
|
||||||
tweaks["OpenAIEmbeddings-joRJ6"] = {}
|
|
||||||
tweaks["OpenAIEmbeddings-joRJ6"]["model"] = settings["embeddingModel"]
|
|
||||||
|
|
||||||
# Note: OpenSearch component tweaks not needed for ingestion
|
|
||||||
# (search parameters are for retrieval, not document processing)
|
|
||||||
|
|
||||||
logger.debug("Final tweaks with settings applied", tweaks=tweaks)
|
|
||||||
# Include user JWT if available
|
|
||||||
jwt_token = getattr(request.state, "jwt_token", None)
|
|
||||||
|
|
||||||
# Extract user info from User object
|
|
||||||
user = getattr(request.state, "user", None)
|
|
||||||
user_id = user.user_id if user else None
|
|
||||||
user_name = user.name if user else None
|
|
||||||
user_email = user.email if user else None
|
|
||||||
|
|
||||||
if jwt_token:
|
|
||||||
# Set auth context for downstream services
|
|
||||||
from auth_context import set_auth_context
|
|
||||||
|
|
||||||
set_auth_context(user_id, jwt_token)
|
|
||||||
|
|
||||||
result = await langflow_file_service.run_ingestion_flow(
|
|
||||||
file_paths=file_paths or [],
|
|
||||||
jwt_token=jwt_token,
|
|
||||||
session_id=session_id,
|
|
||||||
tweaks=tweaks,
|
|
||||||
owner=user_id,
|
|
||||||
owner_name=user_name,
|
|
||||||
owner_email=user_email,
|
|
||||||
connector_type="local",
|
|
||||||
)
|
|
||||||
return JSONResponse(result)
|
|
||||||
except Exception as e:
|
|
||||||
return JSONResponse({"error": str(e)}, status_code=500)
|
|
||||||
|
|
||||||
|
|
||||||
async def upload_and_ingest_user_file(
|
|
||||||
request: Request, langflow_file_service: LangflowFileService, session_manager, task_service
|
|
||||||
):
|
|
||||||
"""Combined upload and ingest endpoint - uses task service for tracking and cancellation"""
|
|
||||||
try:
|
|
||||||
logger.debug("upload_and_ingest_user_file endpoint called - using task service")
|
|
||||||
form = await request.form()
|
|
||||||
upload_file = form.get("file")
|
|
||||||
if upload_file is None:
|
|
||||||
logger.error("No file provided in upload_and_ingest request")
|
|
||||||
return JSONResponse({"error": "Missing file"}, status_code=400)
|
|
||||||
|
|
||||||
# Extract optional parameters
|
|
||||||
session_id = form.get("session_id")
|
|
||||||
settings_json = form.get("settings")
|
|
||||||
tweaks_json = form.get("tweaks")
|
|
||||||
delete_after_ingest = form.get("delete_after_ingest", "true").lower() == "true"
|
|
||||||
|
|
||||||
# Parse JSON fields if provided
|
|
||||||
settings = None
|
|
||||||
tweaks = None
|
|
||||||
|
|
||||||
if settings_json:
|
|
||||||
try:
|
|
||||||
import json
|
|
||||||
settings = json.loads(settings_json)
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
logger.error("Invalid settings JSON", error=str(e))
|
|
||||||
return JSONResponse({"error": "Invalid settings JSON"}, status_code=400)
|
|
||||||
|
|
||||||
if tweaks_json:
|
|
||||||
try:
|
|
||||||
import json
|
|
||||||
tweaks = json.loads(tweaks_json)
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
logger.error("Invalid tweaks JSON", error=str(e))
|
|
||||||
return JSONResponse({"error": "Invalid tweaks JSON"}, status_code=400)
|
|
||||||
|
|
||||||
# Get user info from request state
|
|
||||||
user = getattr(request.state, "user", None)
|
|
||||||
user_id = user.user_id if user else None
|
|
||||||
user_name = user.name if user else None
|
|
||||||
user_email = user.email if user else None
|
|
||||||
jwt_token = getattr(request.state, "jwt_token", None)
|
|
||||||
|
|
||||||
if not user_id:
|
|
||||||
return JSONResponse({"error": "User authentication required"}, status_code=401)
|
|
||||||
|
|
||||||
logger.debug(
|
|
||||||
"Processing file for task-based upload and ingest",
|
|
||||||
filename=upload_file.filename,
|
|
||||||
size=upload_file.size,
|
|
||||||
session_id=session_id,
|
|
||||||
has_settings=bool(settings),
|
|
||||||
has_tweaks=bool(tweaks),
|
|
||||||
delete_after_ingest=delete_after_ingest,
|
|
||||||
user_id=user_id
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create temporary file for task processing
|
|
||||||
import tempfile
|
|
||||||
import os
|
|
||||||
|
|
||||||
# Read file content
|
|
||||||
content = await upload_file.read()
|
|
||||||
|
|
||||||
# Create temporary file
|
|
||||||
safe_filename = upload_file.filename.replace(" ", "_").replace("/", "_")
|
|
||||||
temp_fd, temp_path = tempfile.mkstemp(
|
|
||||||
suffix=f"_{safe_filename}"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Write content to temp file
|
|
||||||
with os.fdopen(temp_fd, 'wb') as temp_file:
|
|
||||||
temp_file.write(content)
|
|
||||||
|
|
||||||
logger.debug("Created temporary file for task processing", temp_path=temp_path)
|
|
||||||
|
|
||||||
# Create langflow upload task for single file
|
|
||||||
task_id = await task_service.create_langflow_upload_task(
|
|
||||||
user_id=user_id,
|
|
||||||
file_paths=[temp_path],
|
|
||||||
langflow_file_service=langflow_file_service,
|
|
||||||
session_manager=session_manager,
|
|
||||||
jwt_token=jwt_token,
|
|
||||||
owner_name=user_name,
|
|
||||||
owner_email=user_email,
|
|
||||||
session_id=session_id,
|
|
||||||
tweaks=tweaks,
|
|
||||||
settings=settings,
|
|
||||||
delete_after_ingest=delete_after_ingest,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.debug("Langflow upload task created successfully", task_id=task_id)
|
|
||||||
|
|
||||||
return JSONResponse({
|
|
||||||
"task_id": task_id,
|
|
||||||
"message": f"Langflow upload task created for file '{upload_file.filename}'",
|
|
||||||
"filename": upload_file.filename
|
|
||||||
}, status_code=202) # 202 Accepted for async processing
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
# Clean up temp file on error
|
|
||||||
try:
|
|
||||||
if os.path.exists(temp_path):
|
|
||||||
os.unlink(temp_path)
|
|
||||||
except Exception:
|
|
||||||
pass # Ignore cleanup errors
|
|
||||||
raise
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(
|
|
||||||
"upload_and_ingest_user_file endpoint failed",
|
|
||||||
error_type=type(e).__name__,
|
|
||||||
error=str(e),
|
|
||||||
)
|
|
||||||
import traceback
|
|
||||||
logger.error("Full traceback", traceback=traceback.format_exc())
|
|
||||||
return JSONResponse({"error": str(e)}, status_code=500)
|
|
||||||
|
|
||||||
|
|
||||||
async def delete_user_files(
|
async def delete_user_files(
|
||||||
request: Request, langflow_file_service: LangflowFileService, session_manager
|
request: Request, langflow_file_service: LangflowFileService, session_manager
|
||||||
):
|
):
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,8 @@
|
||||||
from starlette.requests import Request
|
from starlette.requests import Request
|
||||||
from starlette.responses import JSONResponse
|
from starlette.responses import JSONResponse
|
||||||
|
|
||||||
from config.settings import DISABLE_INGEST_WITH_LANGFLOW
|
|
||||||
from utils.logging_config import get_logger
|
from utils.logging_config import get_logger
|
||||||
|
from .upload_utils import extract_user_context, create_temp_files_from_form_files
|
||||||
# Import the actual endpoint implementations
|
|
||||||
from .upload import upload as traditional_upload
|
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
@ -29,20 +26,57 @@ async def upload_ingest_router(
|
||||||
All langflow uploads are processed as background tasks for better scalability.
|
All langflow uploads are processed as background tasks for better scalability.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
logger.debug(
|
# Read setting at request time to avoid stale module-level values
|
||||||
"Router upload_ingest endpoint called",
|
from config import settings as cfg
|
||||||
disable_langflow_ingest=DISABLE_INGEST_WITH_LANGFLOW
|
disable_langflow_ingest = cfg.DISABLE_INGEST_WITH_LANGFLOW
|
||||||
)
|
logger.debug("Router upload_ingest endpoint called", disable_langflow_ingest=disable_langflow_ingest)
|
||||||
|
|
||||||
# Route based on configuration
|
# Route based on configuration
|
||||||
if DISABLE_INGEST_WITH_LANGFLOW:
|
if disable_langflow_ingest:
|
||||||
# Route to traditional OpenRAG upload
|
# Traditional OpenRAG path: create a background task via TaskService
|
||||||
logger.debug("Routing to traditional OpenRAG upload")
|
logger.debug("Routing to traditional OpenRAG upload via task service (async)")
|
||||||
return await traditional_upload(request, document_service, session_manager)
|
form = await request.form()
|
||||||
|
upload_files = form.getlist("file")
|
||||||
|
if not upload_files:
|
||||||
|
return JSONResponse({"error": "Missing file"}, status_code=400)
|
||||||
|
# Extract user context
|
||||||
|
ctx = await extract_user_context(request)
|
||||||
|
|
||||||
|
# Create temporary files
|
||||||
|
temp_file_paths = await create_temp_files_from_form_files(upload_files)
|
||||||
|
try:
|
||||||
|
# Create traditional upload task for all files
|
||||||
|
task_id = await task_service.create_upload_task(
|
||||||
|
ctx["owner_user_id"],
|
||||||
|
temp_file_paths,
|
||||||
|
jwt_token=ctx["jwt_token"],
|
||||||
|
owner_name=ctx["owner_name"],
|
||||||
|
owner_email=ctx["owner_email"],
|
||||||
|
)
|
||||||
|
return JSONResponse(
|
||||||
|
{
|
||||||
|
"task_id": task_id,
|
||||||
|
"message": f"Traditional upload task created for {len(upload_files)} file(s)",
|
||||||
|
"file_count": len(upload_files),
|
||||||
|
},
|
||||||
|
status_code=201,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Clean up temp files on error
|
||||||
|
import os
|
||||||
|
for p in temp_file_paths:
|
||||||
|
try:
|
||||||
|
if os.path.exists(p):
|
||||||
|
os.unlink(p)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
raise
|
||||||
else:
|
else:
|
||||||
# Route to Langflow upload and ingest using task service
|
# Route to Langflow upload-ingest via task service for async processing (responds with 201 + task_id)
|
||||||
logger.debug("Routing to Langflow upload-ingest pipeline via task service")
|
logger.debug("Routing to Langflow upload-ingest pipeline via task service (async)")
|
||||||
return await langflow_upload_ingest_task(request, langflow_file_service, session_manager, task_service)
|
return await langflow_upload_ingest_task(
|
||||||
|
request, langflow_file_service, session_manager, task_service
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Error in upload_ingest_router", error=str(e))
|
logger.error("Error in upload_ingest_router", error=str(e))
|
||||||
|
|
@ -98,37 +132,19 @@ async def langflow_upload_ingest_task(
|
||||||
logger.error("Invalid tweaks JSON", error=str(e))
|
logger.error("Invalid tweaks JSON", error=str(e))
|
||||||
return JSONResponse({"error": "Invalid tweaks JSON"}, status_code=400)
|
return JSONResponse({"error": "Invalid tweaks JSON"}, status_code=400)
|
||||||
|
|
||||||
# Get user info from request state
|
# Get user/auth context (allows no-auth mode)
|
||||||
user = getattr(request.state, "user", None)
|
ctx = await extract_user_context(request)
|
||||||
user_id = user.user_id if user else None
|
user_id = ctx["owner_user_id"]
|
||||||
user_name = user.name if user else None
|
user_name = ctx["owner_name"]
|
||||||
user_email = user.email if user else None
|
user_email = ctx["owner_email"]
|
||||||
jwt_token = getattr(request.state, "jwt_token", None)
|
jwt_token = ctx["jwt_token"]
|
||||||
|
|
||||||
if not user_id:
|
|
||||||
return JSONResponse({"error": "User authentication required"}, status_code=401)
|
|
||||||
|
|
||||||
# Create temporary files for task processing
|
# Create temporary files for task processing
|
||||||
import tempfile
|
|
||||||
import os
|
import os
|
||||||
temp_file_paths = []
|
temp_file_paths = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for upload_file in upload_files:
|
temp_file_paths = await create_temp_files_from_form_files(upload_files)
|
||||||
# Read file content
|
|
||||||
content = await upload_file.read()
|
|
||||||
|
|
||||||
# Create temporary file
|
|
||||||
safe_filename = upload_file.filename.replace(" ", "_").replace("/", "_")
|
|
||||||
temp_fd, temp_path = tempfile.mkstemp(
|
|
||||||
suffix=f"_{safe_filename}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Write content to temp file
|
|
||||||
with os.fdopen(temp_fd, 'wb') as temp_file:
|
|
||||||
temp_file.write(content)
|
|
||||||
|
|
||||||
temp_file_paths.append(temp_path)
|
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Created temporary files for task-based processing",
|
"Created temporary files for task-based processing",
|
||||||
|
|
@ -160,7 +176,7 @@ async def langflow_upload_ingest_task(
|
||||||
"task_id": task_id,
|
"task_id": task_id,
|
||||||
"message": f"Langflow upload task created for {len(upload_files)} file(s)",
|
"message": f"Langflow upload task created for {len(upload_files)} file(s)",
|
||||||
"file_count": len(upload_files)
|
"file_count": len(upload_files)
|
||||||
}, status_code=202) # 202 Accepted for async processing
|
}, status_code=201)
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# Clean up temp files on error
|
# Clean up temp files on error
|
||||||
|
|
|
||||||
|
|
@ -3,46 +3,7 @@ from urllib.parse import urlparse
|
||||||
import boto3
|
import boto3
|
||||||
from starlette.requests import Request
|
from starlette.requests import Request
|
||||||
from starlette.responses import JSONResponse
|
from starlette.responses import JSONResponse
|
||||||
|
from .upload_utils import extract_user_context
|
||||||
|
|
||||||
async def upload(request: Request, document_service, session_manager):
|
|
||||||
"""Upload a single file"""
|
|
||||||
try:
|
|
||||||
form = await request.form()
|
|
||||||
upload_file = form["file"]
|
|
||||||
user = request.state.user
|
|
||||||
jwt_token = request.state.jwt_token
|
|
||||||
|
|
||||||
from config.settings import is_no_auth_mode
|
|
||||||
|
|
||||||
# In no-auth mode, pass None for owner fields so documents have no owner
|
|
||||||
# This allows all users to see them when switching to auth mode
|
|
||||||
if is_no_auth_mode():
|
|
||||||
owner_user_id = None
|
|
||||||
owner_name = None
|
|
||||||
owner_email = None
|
|
||||||
else:
|
|
||||||
owner_user_id = user.user_id
|
|
||||||
owner_name = user.name
|
|
||||||
owner_email = user.email
|
|
||||||
|
|
||||||
result = await document_service.process_upload_file(
|
|
||||||
upload_file,
|
|
||||||
owner_user_id=owner_user_id,
|
|
||||||
jwt_token=jwt_token,
|
|
||||||
owner_name=owner_name,
|
|
||||||
owner_email=owner_email,
|
|
||||||
)
|
|
||||||
return JSONResponse(result, status_code=201) # Created
|
|
||||||
except Exception as e:
|
|
||||||
error_msg = str(e)
|
|
||||||
if (
|
|
||||||
"AuthenticationException" in error_msg
|
|
||||||
or "access denied" in error_msg.lower()
|
|
||||||
):
|
|
||||||
return JSONResponse({"error": error_msg}, status_code=403)
|
|
||||||
else:
|
|
||||||
return JSONResponse({"error": error_msg}, status_code=500)
|
|
||||||
|
|
||||||
|
|
||||||
async def upload_path(request: Request, task_service, session_manager, langflow_file_service):
|
async def upload_path(request: Request, task_service, session_manager, langflow_file_service):
|
||||||
|
|
@ -59,20 +20,11 @@ async def upload_path(request: Request, task_service, session_manager, langflow_
|
||||||
if not file_paths:
|
if not file_paths:
|
||||||
return JSONResponse({"error": "No files found in directory"}, status_code=400)
|
return JSONResponse({"error": "No files found in directory"}, status_code=400)
|
||||||
|
|
||||||
user = request.state.user
|
ctx = await extract_user_context(request)
|
||||||
jwt_token = request.state.jwt_token
|
owner_user_id = ctx["owner_user_id"]
|
||||||
|
owner_name = ctx["owner_name"]
|
||||||
from config.settings import is_no_auth_mode
|
owner_email = ctx["owner_email"]
|
||||||
|
jwt_token = ctx["jwt_token"]
|
||||||
# In no-auth mode, pass None for owner fields so documents have no owner
|
|
||||||
if is_no_auth_mode():
|
|
||||||
owner_user_id = None
|
|
||||||
owner_name = None
|
|
||||||
owner_email = None
|
|
||||||
else:
|
|
||||||
owner_user_id = user.user_id
|
|
||||||
owner_name = user.name
|
|
||||||
owner_email = user.email
|
|
||||||
|
|
||||||
from config.settings import DISABLE_INGEST_WITH_LANGFLOW
|
from config.settings import DISABLE_INGEST_WITH_LANGFLOW
|
||||||
|
|
||||||
|
|
@ -184,23 +136,15 @@ async def upload_bucket(request: Request, task_service, session_manager):
|
||||||
if not keys:
|
if not keys:
|
||||||
return JSONResponse({"error": "No files found in bucket"}, status_code=400)
|
return JSONResponse({"error": "No files found in bucket"}, status_code=400)
|
||||||
|
|
||||||
user = request.state.user
|
|
||||||
jwt_token = request.state.jwt_token
|
|
||||||
|
|
||||||
from models.processors import S3FileProcessor
|
from models.processors import S3FileProcessor
|
||||||
from config.settings import is_no_auth_mode
|
from .upload_utils import extract_user_context
|
||||||
|
|
||||||
# In no-auth mode, pass None for owner fields so documents have no owner
|
ctx = await extract_user_context(request)
|
||||||
if is_no_auth_mode():
|
owner_user_id = ctx["owner_user_id"]
|
||||||
owner_user_id = None
|
owner_name = ctx["owner_name"]
|
||||||
owner_name = None
|
owner_email = ctx["owner_email"]
|
||||||
owner_email = None
|
jwt_token = ctx["jwt_token"]
|
||||||
task_user_id = None
|
task_user_id = owner_user_id
|
||||||
else:
|
|
||||||
owner_user_id = user.user_id
|
|
||||||
owner_name = user.name
|
|
||||||
owner_email = user.email
|
|
||||||
task_user_id = user.user_id
|
|
||||||
|
|
||||||
processor = S3FileProcessor(
|
processor = S3FileProcessor(
|
||||||
task_service.document_service,
|
task_service.document_service,
|
||||||
|
|
|
||||||
47
src/api/upload_utils.py
Normal file
47
src/api/upload_utils.py
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from starlette.requests import Request
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_user_context(request: Request) -> dict:
|
||||||
|
"""Extract user/auth context from request.state. Honors no-auth mode."""
|
||||||
|
from config.settings import is_no_auth_mode
|
||||||
|
|
||||||
|
user = getattr(request.state, "user", None)
|
||||||
|
jwt_token = getattr(request.state, "jwt_token", None)
|
||||||
|
|
||||||
|
if is_no_auth_mode():
|
||||||
|
return {
|
||||||
|
"owner_user_id": None,
|
||||||
|
"owner_name": None,
|
||||||
|
"owner_email": None,
|
||||||
|
"jwt_token": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"owner_user_id": getattr(user, "user_id", None),
|
||||||
|
"owner_name": getattr(user, "name", None),
|
||||||
|
"owner_email": getattr(user, "email", None),
|
||||||
|
"jwt_token": jwt_token,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def create_temp_files_from_form_files(upload_files: List) -> list[str]:
|
||||||
|
"""Persist UploadFile items to temp files; return list of paths."""
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
|
||||||
|
temp_file_paths: list[str] = []
|
||||||
|
for upload_file in upload_files:
|
||||||
|
content = await upload_file.read()
|
||||||
|
safe_filename = (
|
||||||
|
upload_file.filename.replace(" ", "_").replace("/", "_")
|
||||||
|
if getattr(upload_file, "filename", None)
|
||||||
|
else "uploaded"
|
||||||
|
)
|
||||||
|
fd, temp_path = tempfile.mkstemp(suffix=f"_{safe_filename}")
|
||||||
|
with os.fdopen(fd, "wb") as temp_file:
|
||||||
|
temp_file.write(content)
|
||||||
|
temp_file_paths.append(temp_path)
|
||||||
|
return temp_file_paths
|
||||||
|
|
||||||
195
src/main.py
195
src/main.py
|
|
@ -263,96 +263,60 @@ async def ingest_default_documents_when_ready(services):
|
||||||
|
|
||||||
|
|
||||||
async def _ingest_default_documents_langflow(services, file_paths):
|
async def _ingest_default_documents_langflow(services, file_paths):
|
||||||
"""Ingest default documents using Langflow upload-ingest-delete pipeline."""
|
"""Ingest default documents using Langflow via a single background task (aligned with router semantics)."""
|
||||||
langflow_file_service = services["langflow_file_service"]
|
langflow_file_service = services["langflow_file_service"]
|
||||||
session_manager = services["session_manager"]
|
session_manager = services["session_manager"]
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"Using Langflow ingestion pipeline for default documents",
|
"Using Langflow ingestion pipeline for default documents (task-based)",
|
||||||
file_count=len(file_paths),
|
file_count=len(file_paths),
|
||||||
)
|
)
|
||||||
|
|
||||||
success_count = 0
|
# Use AnonymousUser for default documents
|
||||||
error_count = 0
|
from session_manager import AnonymousUser
|
||||||
|
|
||||||
for file_path in file_paths:
|
anonymous_user = AnonymousUser()
|
||||||
try:
|
|
||||||
logger.debug("Processing file with Langflow pipeline", file_path=file_path)
|
|
||||||
|
|
||||||
# Read file content
|
# Ensure an (anonymous) JWT is available for OpenSearch/flow auth
|
||||||
with open(file_path, "rb") as f:
|
effective_jwt = None
|
||||||
content = f.read()
|
try:
|
||||||
|
session_manager.get_user_opensearch_client(anonymous_user.user_id, None)
|
||||||
|
if hasattr(session_manager, "_anonymous_jwt"):
|
||||||
|
effective_jwt = session_manager._anonymous_jwt
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Create file tuple for upload
|
# Prepare tweaks with anonymous metadata for OpenSearch component
|
||||||
filename = os.path.basename(file_path)
|
default_tweaks = {
|
||||||
# Determine content type based on file extension
|
"OpenSearchHybrid-Ve6bS": {
|
||||||
content_type, _ = mimetypes.guess_type(filename)
|
"docs_metadata": [
|
||||||
if not content_type:
|
{"key": "owner", "value": None},
|
||||||
content_type = "application/octet-stream"
|
{"key": "owner_name", "value": anonymous_user.name},
|
||||||
|
{"key": "owner_email", "value": anonymous_user.email},
|
||||||
|
{"key": "connector_type", "value": "system_default"},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
file_tuple = (filename, content, content_type)
|
# Create a single task to process all default documents through Langflow
|
||||||
|
task_id = await services["task_service"].create_langflow_upload_task(
|
||||||
# Use AnonymousUser details for default documents
|
user_id=anonymous_user.user_id,
|
||||||
from session_manager import AnonymousUser
|
file_paths=file_paths,
|
||||||
|
langflow_file_service=langflow_file_service,
|
||||||
anonymous_user = AnonymousUser()
|
session_manager=session_manager,
|
||||||
|
jwt_token=effective_jwt,
|
||||||
# Get JWT token using same logic as DocumentFileProcessor
|
owner_name=anonymous_user.name,
|
||||||
# This will handle anonymous JWT creation if needed for anonymous user
|
owner_email=anonymous_user.email,
|
||||||
effective_jwt = None
|
session_id=None,
|
||||||
|
tweaks=default_tweaks,
|
||||||
# Let session manager handle anonymous JWT creation if needed
|
settings=None,
|
||||||
if session_manager:
|
delete_after_ingest=True,
|
||||||
# This call will create anonymous JWT if needed (same as DocumentFileProcessor)
|
)
|
||||||
session_manager.get_user_opensearch_client(
|
|
||||||
anonymous_user.user_id, effective_jwt
|
|
||||||
)
|
|
||||||
# Get the JWT that was created by session manager
|
|
||||||
if hasattr(session_manager, "_anonymous_jwt"):
|
|
||||||
effective_jwt = session_manager._anonymous_jwt
|
|
||||||
|
|
||||||
# Prepare tweaks for default documents with anonymous user metadata
|
|
||||||
default_tweaks = {
|
|
||||||
"OpenSearchHybrid-Ve6bS": {
|
|
||||||
"docs_metadata": [
|
|
||||||
{"key": "owner", "value": None},
|
|
||||||
{"key": "owner_name", "value": anonymous_user.name},
|
|
||||||
{"key": "owner_email", "value": anonymous_user.email},
|
|
||||||
{"key": "connector_type", "value": "system_default"},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# Use langflow upload_and_ingest_file method with JWT token
|
|
||||||
result = await langflow_file_service.upload_and_ingest_file(
|
|
||||||
file_tuple=file_tuple,
|
|
||||||
session_id=None, # No session for default documents
|
|
||||||
tweaks=default_tweaks, # Add anonymous user metadata
|
|
||||||
settings=None, # Use default ingestion settings
|
|
||||||
jwt_token=effective_jwt, # Use JWT token (anonymous if needed)
|
|
||||||
delete_after_ingest=True, # Clean up after ingestion
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"Successfully ingested file via Langflow",
|
|
||||||
file_path=file_path,
|
|
||||||
result_status=result.get("status"),
|
|
||||||
)
|
|
||||||
success_count += 1
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(
|
|
||||||
"Failed to ingest file via Langflow",
|
|
||||||
file_path=file_path,
|
|
||||||
error=str(e),
|
|
||||||
)
|
|
||||||
error_count += 1
|
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"Langflow ingestion completed",
|
"Started Langflow ingestion task for default documents",
|
||||||
success_count=success_count,
|
task_id=task_id,
|
||||||
error_count=error_count,
|
file_count=len(file_paths),
|
||||||
total_files=len(file_paths),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -486,41 +450,7 @@ async def create_app():
|
||||||
|
|
||||||
# Create route handlers with service dependencies injected
|
# Create route handlers with service dependencies injected
|
||||||
routes = [
|
routes = [
|
||||||
# Upload endpoints
|
# Langflow direct upload/ingest endpoints removed in favor of the unified router endpoint (/upload)
|
||||||
Route(
|
|
||||||
"/upload",
|
|
||||||
require_auth(services["session_manager"])(
|
|
||||||
partial(
|
|
||||||
upload.upload,
|
|
||||||
document_service=services["document_service"],
|
|
||||||
session_manager=services["session_manager"],
|
|
||||||
)
|
|
||||||
),
|
|
||||||
methods=["POST"],
|
|
||||||
),
|
|
||||||
# Langflow Files endpoints
|
|
||||||
Route(
|
|
||||||
"/langflow/files/upload",
|
|
||||||
optional_auth(services["session_manager"])(
|
|
||||||
partial(
|
|
||||||
langflow_files.upload_user_file,
|
|
||||||
langflow_file_service=services["langflow_file_service"],
|
|
||||||
session_manager=services["session_manager"],
|
|
||||||
)
|
|
||||||
),
|
|
||||||
methods=["POST"],
|
|
||||||
),
|
|
||||||
Route(
|
|
||||||
"/langflow/ingest",
|
|
||||||
require_auth(services["session_manager"])(
|
|
||||||
partial(
|
|
||||||
langflow_files.run_ingestion,
|
|
||||||
langflow_file_service=services["langflow_file_service"],
|
|
||||||
session_manager=services["session_manager"],
|
|
||||||
)
|
|
||||||
),
|
|
||||||
methods=["POST"],
|
|
||||||
),
|
|
||||||
Route(
|
Route(
|
||||||
"/langflow/files",
|
"/langflow/files",
|
||||||
require_auth(services["session_manager"])(
|
require_auth(services["session_manager"])(
|
||||||
|
|
@ -532,18 +462,6 @@ async def create_app():
|
||||||
),
|
),
|
||||||
methods=["DELETE"],
|
methods=["DELETE"],
|
||||||
),
|
),
|
||||||
Route(
|
|
||||||
"/langflow/upload_ingest",
|
|
||||||
require_auth(services["session_manager"])(
|
|
||||||
partial(
|
|
||||||
langflow_files.upload_and_ingest_user_file,
|
|
||||||
langflow_file_service=services["langflow_file_service"],
|
|
||||||
session_manager=services["session_manager"],
|
|
||||||
task_service=services["task_service"],
|
|
||||||
)
|
|
||||||
),
|
|
||||||
methods=["POST"],
|
|
||||||
),
|
|
||||||
Route(
|
Route(
|
||||||
"/upload_context",
|
"/upload_context",
|
||||||
require_auth(services["session_manager"])(
|
require_auth(services["session_manager"])(
|
||||||
|
|
@ -939,7 +857,7 @@ async def create_app():
|
||||||
methods=["POST"],
|
methods=["POST"],
|
||||||
),
|
),
|
||||||
Route(
|
Route(
|
||||||
"/router/upload_ingest",
|
"/upload",
|
||||||
require_auth(services["session_manager"])(
|
require_auth(services["session_manager"])(
|
||||||
partial(
|
partial(
|
||||||
router.upload_ingest_router,
|
router.upload_ingest_router,
|
||||||
|
|
@ -969,6 +887,33 @@ async def create_app():
|
||||||
@app.on_event("shutdown")
|
@app.on_event("shutdown")
|
||||||
async def shutdown_event():
|
async def shutdown_event():
|
||||||
await cleanup_subscriptions_proper(services)
|
await cleanup_subscriptions_proper(services)
|
||||||
|
# Close HTTP/OpenSearch clients cleanly
|
||||||
|
try:
|
||||||
|
from config.settings import clients as _clients
|
||||||
|
|
||||||
|
if getattr(_clients, "langflow_http_client", None):
|
||||||
|
try:
|
||||||
|
await _clients.langflow_http_client.aclose()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if getattr(_clients, "opensearch", None):
|
||||||
|
try:
|
||||||
|
await _clients.opensearch.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Close any per-user OpenSearch clients
|
||||||
|
try:
|
||||||
|
sm = services.get("session_manager")
|
||||||
|
if sm and getattr(sm, "user_opensearch_clients", None):
|
||||||
|
for oc in sm.user_opensearch_clients.values():
|
||||||
|
try:
|
||||||
|
await oc.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
return app
|
return app
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -215,7 +215,12 @@ class DocumentService:
|
||||||
):
|
):
|
||||||
"""Process an uploaded file from form data"""
|
"""Process an uploaded file from form data"""
|
||||||
sha256 = hashlib.sha256()
|
sha256 = hashlib.sha256()
|
||||||
tmp = tempfile.NamedTemporaryFile(delete=False)
|
# Preserve file extension so the converter can detect format
|
||||||
|
try:
|
||||||
|
_, ext = os.path.splitext(getattr(upload_file, "filename", "") or "")
|
||||||
|
except Exception:
|
||||||
|
ext = ""
|
||||||
|
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
|
||||||
file_size = 0
|
file_size = 0
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue