auto cleanup temp files

This commit is contained in:
phact 2025-09-29 14:08:36 -04:00
parent cb58b2165e
commit 6612388586
7 changed files with 246 additions and 207 deletions

View file

@@ -231,11 +231,8 @@ async def upload_and_ingest_user_file(
except Exception: except Exception:
# Clean up temp file on error # Clean up temp file on error
try: from utils.file_utils import safe_unlink
if os.path.exists(temp_path): safe_unlink(temp_path)
os.unlink(temp_path)
except Exception:
pass # Ignore cleanup errors
raise raise
except Exception as e: except Exception as e:

View file

@@ -164,12 +164,9 @@ async def langflow_upload_ingest_task(
except Exception: except Exception:
# Clean up temp files on error # Clean up temp files on error
from utils.file_utils import safe_unlink
for temp_path in temp_file_paths: for temp_path in temp_file_paths:
try: safe_unlink(temp_path)
if os.path.exists(temp_path):
os.unlink(temp_path)
except Exception:
pass # Ignore cleanup errors
raise raise
except Exception as e: except Exception as e:

View file

@@ -53,45 +53,46 @@ class LangflowConnectorService:
filename=document.filename, filename=document.filename,
) )
from utils.file_utils import auto_cleanup_tempfile
suffix = self._get_file_extension(document.mimetype) suffix = self._get_file_extension(document.mimetype)
# Create temporary file from document content # Create temporary file from document content
with tempfile.NamedTemporaryFile( with auto_cleanup_tempfile(suffix=suffix) as tmp_path:
delete=False, suffix=suffix # Write document content to temp file
) as tmp_file: with open(tmp_path, 'wb') as f:
tmp_file.write(document.content) f.write(document.content)
tmp_file.flush()
# Step 1: Upload file to Langflow
logger.debug("Uploading file to Langflow", filename=document.filename)
content = document.content
file_tuple = (
document.filename.replace(" ", "_").replace("/", "_")+suffix,
content,
document.mimetype or "application/octet-stream",
)
upload_result = await self.langflow_service.upload_user_file(
file_tuple, jwt_token
)
langflow_file_id = upload_result["id"]
langflow_file_path = upload_result["path"]
logger.debug(
"File uploaded to Langflow",
file_id=langflow_file_id,
path=langflow_file_path,
)
# Step 2: Run ingestion flow with the uploaded file
logger.debug(
"Running Langflow ingestion flow", file_path=langflow_file_path
)
# Use the same tweaks pattern as LangflowFileService
tweaks = {} # Let Langflow handle the ingestion with default settings
try: try:
# Step 1: Upload file to Langflow
logger.debug("Uploading file to Langflow", filename=document.filename)
content = document.content
file_tuple = (
document.filename.replace(" ", "_").replace("/", "_")+suffix,
content,
document.mimetype or "application/octet-stream",
)
upload_result = await self.langflow_service.upload_user_file(
file_tuple, jwt_token
)
langflow_file_id = upload_result["id"]
langflow_file_path = upload_result["path"]
logger.debug(
"File uploaded to Langflow",
file_id=langflow_file_id,
path=langflow_file_path,
)
# Step 2: Run ingestion flow with the uploaded file
logger.debug(
"Running Langflow ingestion flow", file_path=langflow_file_path
)
# Use the same tweaks pattern as LangflowFileService
tweaks = {} # Let Langflow handle the ingestion with default settings
ingestion_result = await self.langflow_service.run_ingestion_flow( ingestion_result = await self.langflow_service.run_ingestion_flow(
file_paths=[langflow_file_path], file_paths=[langflow_file_path],
jwt_token=jwt_token, jwt_token=jwt_token,
@@ -125,25 +126,20 @@ class LangflowConnectorService:
error=str(e), error=str(e),
) )
# Try to clean up Langflow file if upload succeeded but processing failed # Try to clean up Langflow file if upload succeeded but processing failed
if "langflow_file_id" in locals(): try:
try: await self.langflow_service.delete_user_file(langflow_file_id)
await self.langflow_service.delete_user_file(langflow_file_id) logger.debug(
logger.debug( "Cleaned up Langflow file after error",
"Cleaned up Langflow file after error", file_id=langflow_file_id,
file_id=langflow_file_id, )
) except Exception as cleanup_error:
except Exception as cleanup_error: logger.warning(
logger.warning( "Failed to cleanup Langflow file",
"Failed to cleanup Langflow file", file_id=langflow_file_id,
file_id=langflow_file_id, error=str(cleanup_error),
error=str(cleanup_error), )
)
raise raise
finally:
# Clean up temporary file
os.unlink(tmp_file.name)
def _get_file_extension(self, mimetype: str) -> str: def _get_file_extension(self, mimetype: str) -> str:
"""Get file extension based on MIME type""" """Get file extension based on MIME type"""
mime_to_ext = { mime_to_ext = {

View file

@@ -54,54 +54,50 @@ class ConnectorService:
"""Process a document from a connector using existing processing pipeline""" """Process a document from a connector using existing processing pipeline"""
# Create temporary file from document content # Create temporary file from document content
with tempfile.NamedTemporaryFile( from utils.file_utils import auto_cleanup_tempfile
delete=False, suffix=self._get_file_extension(document.mimetype)
) as tmp_file:
tmp_file.write(document.content)
tmp_file.flush()
try: with auto_cleanup_tempfile(suffix=self._get_file_extension(document.mimetype)) as tmp_path:
# Use existing process_file_common function with connector document metadata # Write document content to temp file
# We'll use the document service's process_file_common method with open(tmp_path, 'wb') as f:
from services.document_service import DocumentService f.write(document.content)
doc_service = DocumentService(session_manager=self.session_manager) # Use existing process_file_common function with connector document metadata
# We'll use the document service's process_file_common method
from services.document_service import DocumentService
logger.debug("Processing connector document", document_id=document.id) doc_service = DocumentService(session_manager=self.session_manager)
# Process using consolidated processing pipeline logger.debug("Processing connector document", document_id=document.id)
from models.processors import TaskProcessor
processor = TaskProcessor(document_service=doc_service) # Process using consolidated processing pipeline
result = await processor.process_document_standard( from models.processors import TaskProcessor
file_path=tmp_file.name, processor = TaskProcessor(document_service=doc_service)
file_hash=document.id, # Use connector document ID as hash result = await processor.process_document_standard(
owner_user_id=owner_user_id, file_path=tmp_path,
original_filename=document.filename, # Pass the original Google Doc title file_hash=document.id, # Use connector document ID as hash
jwt_token=jwt_token, owner_user_id=owner_user_id,
owner_name=owner_name, original_filename=document.filename, # Pass the original Google Doc title
owner_email=owner_email, jwt_token=jwt_token,
file_size=len(document.content) if document.content else 0, owner_name=owner_name,
connector_type=connector_type, owner_email=owner_email,
file_size=len(document.content) if document.content else 0,
connector_type=connector_type,
)
logger.debug("Document processing result", result=result)
# If successfully indexed or already exists, update the indexed documents with connector metadata
if result["status"] in ["indexed", "unchanged"]:
# Update all chunks with connector-specific metadata
await self._update_connector_metadata(
document, owner_user_id, connector_type, jwt_token
) )
logger.debug("Document processing result", result=result) return {
**result,
# If successfully indexed or already exists, update the indexed documents with connector metadata "filename": document.filename,
if result["status"] in ["indexed", "unchanged"]: "source_url": document.source_url,
# Update all chunks with connector-specific metadata }
await self._update_connector_metadata(
document, owner_user_id, connector_type, jwt_token
)
return {
**result,
"filename": document.filename,
"source_url": document.source_url,
}
finally:
# Clean up temporary file
os.unlink(tmp_file.name)
async def _update_connector_metadata( async def _update_connector_metadata(
self, self,

View file

@@ -277,37 +277,35 @@ class ConnectorFileProcessor(TaskProcessor):
raise ValueError("user_id not provided to ConnectorFileProcessor") raise ValueError("user_id not provided to ConnectorFileProcessor")
# Create temporary file from document content # Create temporary file from document content
from utils.file_utils import auto_cleanup_tempfile
suffix = self.connector_service._get_file_extension(document.mimetype) suffix = self.connector_service._get_file_extension(document.mimetype)
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file: with auto_cleanup_tempfile(suffix=suffix) as tmp_path:
tmp_file.write(document.content) # Write content to temp file
tmp_file.flush() with open(tmp_path, 'wb') as f:
f.write(document.content)
try: # Compute hash
# Compute hash file_hash = hash_id(tmp_path)
file_hash = hash_id(tmp_file.name)
# Use consolidated standard processing # Use consolidated standard processing
result = await self.process_document_standard( result = await self.process_document_standard(
file_path=tmp_file.name, file_path=tmp_path,
file_hash=file_hash, file_hash=file_hash,
owner_user_id=self.user_id, owner_user_id=self.user_id,
original_filename=document.filename, original_filename=document.filename,
jwt_token=self.jwt_token, jwt_token=self.jwt_token,
owner_name=self.owner_name, owner_name=self.owner_name,
owner_email=self.owner_email, owner_email=self.owner_email,
file_size=len(document.content), file_size=len(document.content),
connector_type=connection.connector_type, connector_type=connection.connector_type,
) )
# Add connector-specific metadata # Add connector-specific metadata
result.update({ result.update({
"source_url": document.source_url, "source_url": document.source_url,
"document_id": document.id, "document_id": document.id,
}) })
finally:
if os.path.exists(tmp_file.name):
os.unlink(tmp_file.name)
file_task.status = TaskStatus.COMPLETED file_task.status = TaskStatus.COMPLETED
file_task.result = result file_task.result = result
@@ -379,39 +377,37 @@ class LangflowConnectorFileProcessor(TaskProcessor):
raise ValueError("user_id not provided to LangflowConnectorFileProcessor") raise ValueError("user_id not provided to LangflowConnectorFileProcessor")
# Create temporary file and compute hash to check for duplicates # Create temporary file and compute hash to check for duplicates
from utils.file_utils import auto_cleanup_tempfile
suffix = self.langflow_connector_service._get_file_extension(document.mimetype) suffix = self.langflow_connector_service._get_file_extension(document.mimetype)
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file: with auto_cleanup_tempfile(suffix=suffix) as tmp_path:
tmp_file.write(document.content) # Write content to temp file
tmp_file.flush() with open(tmp_path, 'wb') as f:
f.write(document.content)
try: # Compute hash and check if already exists
# Compute hash and check if already exists file_hash = hash_id(tmp_path)
file_hash = hash_id(tmp_file.name)
# Check if document already exists # Check if document already exists
opensearch_client = self.langflow_connector_service.session_manager.get_user_opensearch_client( opensearch_client = self.langflow_connector_service.session_manager.get_user_opensearch_client(
self.user_id, self.jwt_token self.user_id, self.jwt_token
) )
if await self.check_document_exists(file_hash, opensearch_client): if await self.check_document_exists(file_hash, opensearch_client):
file_task.status = TaskStatus.COMPLETED file_task.status = TaskStatus.COMPLETED
file_task.result = {"status": "unchanged", "id": file_hash} file_task.result = {"status": "unchanged", "id": file_hash}
file_task.updated_at = time.time() file_task.updated_at = time.time()
upload_task.successful_files += 1 upload_task.successful_files += 1
return return
# Process using Langflow pipeline # Process using Langflow pipeline
result = await self.langflow_connector_service.process_connector_document( result = await self.langflow_connector_service.process_connector_document(
document, document,
self.user_id, self.user_id,
connection.connector_type, connection.connector_type,
jwt_token=self.jwt_token, jwt_token=self.jwt_token,
owner_name=self.owner_name, owner_name=self.owner_name,
owner_email=self.owner_email, owner_email=self.owner_email,
) )
finally:
if os.path.exists(tmp_file.name):
os.unlink(tmp_file.name)
file_task.status = TaskStatus.COMPLETED file_task.status = TaskStatus.COMPLETED
file_task.result = result file_task.result = result
@@ -466,48 +462,48 @@ class S3FileProcessor(TaskProcessor):
file_task.status = TaskStatus.RUNNING file_task.status = TaskStatus.RUNNING
file_task.updated_at = time.time() file_task.updated_at = time.time()
tmp = tempfile.NamedTemporaryFile(delete=False) from utils.file_utils import auto_cleanup_tempfile
from utils.hash_utils import hash_id
try: try:
# Download object to temporary file with auto_cleanup_tempfile() as tmp_path:
self.s3_client.download_fileobj(self.bucket, item, tmp) # Download object to temporary file
tmp.flush() with open(tmp_path, 'wb') as tmp_file:
self.s3_client.download_fileobj(self.bucket, item, tmp_file)
# Compute hash # Compute hash
from utils.hash_utils import hash_id file_hash = hash_id(tmp_path)
file_hash = hash_id(tmp.name)
# Get object size # Get object size
try: try:
obj_info = self.s3_client.head_object(Bucket=self.bucket, Key=item) obj_info = self.s3_client.head_object(Bucket=self.bucket, Key=item)
file_size = obj_info.get("ContentLength", 0) file_size = obj_info.get("ContentLength", 0)
except Exception: except Exception:
file_size = 0 file_size = 0
# Use consolidated standard processing # Use consolidated standard processing
result = await self.process_document_standard( result = await self.process_document_standard(
file_path=tmp.name, file_path=tmp_path,
file_hash=file_hash, file_hash=file_hash,
owner_user_id=self.owner_user_id, owner_user_id=self.owner_user_id,
original_filename=item, # Use S3 key as filename original_filename=item, # Use S3 key as filename
jwt_token=self.jwt_token, jwt_token=self.jwt_token,
owner_name=self.owner_name, owner_name=self.owner_name,
owner_email=self.owner_email, owner_email=self.owner_email,
file_size=file_size, file_size=file_size,
connector_type="s3", connector_type="s3",
) )
result["path"] = f"s3://{self.bucket}/{item}" result["path"] = f"s3://{self.bucket}/{item}"
file_task.status = TaskStatus.COMPLETED file_task.status = TaskStatus.COMPLETED
file_task.result = result file_task.result = result
upload_task.successful_files += 1 upload_task.successful_files += 1
except Exception as e: except Exception as e:
file_task.status = TaskStatus.FAILED file_task.status = TaskStatus.FAILED
file_task.error = str(e) file_task.error = str(e)
upload_task.failed_files += 1 upload_task.failed_files += 1
finally: finally:
if os.path.exists(tmp.name):
os.unlink(tmp.name)
file_task.updated_at = time.time() file_task.updated_at = time.time()

View file

@@ -123,19 +123,21 @@ class DocumentService:
): ):
"""Process an uploaded file from form data""" """Process an uploaded file from form data"""
from utils.hash_utils import hash_id from utils.hash_utils import hash_id
from utils.file_utils import auto_cleanup_tempfile
import os import os
tmp = tempfile.NamedTemporaryFile(delete=False)
file_size = 0
try:
while True:
chunk = await upload_file.read(1 << 20)
if not chunk:
break
tmp.write(chunk)
file_size += len(chunk)
tmp.flush()
file_hash = hash_id(tmp.name) with auto_cleanup_tempfile() as tmp_path:
# Stream upload file to temporary file
file_size = 0
with open(tmp_path, 'wb') as tmp_file:
while True:
chunk = await upload_file.read(1 << 20)
if not chunk:
break
tmp_file.write(chunk)
file_size += len(chunk)
file_hash = hash_id(tmp_path)
# Get user's OpenSearch client with JWT for OIDC auth # Get user's OpenSearch client with JWT for OIDC auth
opensearch_client = self.session_manager.get_user_opensearch_client( opensearch_client = self.session_manager.get_user_opensearch_client(
owner_user_id, jwt_token owner_user_id, jwt_token
@@ -149,14 +151,13 @@ class DocumentService:
) )
raise raise
if exists: if exists:
os.unlink(tmp.name) # Delete temp file since we don't need it
return {"status": "unchanged", "id": file_hash} return {"status": "unchanged", "id": file_hash}
# Use consolidated standard processing # Use consolidated standard processing
from models.processors import TaskProcessor from models.processors import TaskProcessor
processor = TaskProcessor(document_service=self) processor = TaskProcessor(document_service=self)
result = await processor.process_document_standard( result = await processor.process_document_standard(
file_path=tmp.name, file_path=tmp_path,
file_hash=file_hash, file_hash=file_hash,
owner_user_id=owner_user_id, owner_user_id=owner_user_id,
original_filename=upload_file.filename, original_filename=upload_file.filename,
@@ -168,10 +169,6 @@ class DocumentService:
) )
return result return result
finally:
tmp.close()
os.remove(tmp.name)
async def process_upload_context(self, upload_file, filename: str = None): async def process_upload_context(self, upload_file, filename: str = None):
"""Process uploaded file and return content for context""" """Process uploaded file and return content for context"""
import io import io

60
src/utils/file_utils.py Normal file
View file

@@ -0,0 +1,60 @@
"""File handling utilities for OpenRAG"""
import os
import tempfile
from contextlib import contextmanager
from typing import Optional
@contextmanager
def auto_cleanup_tempfile(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Optional[str] = None):
    """
    Context manager for temporary files that automatically cleans up.

    Unlike tempfile.NamedTemporaryFile with delete=True, this keeps the file
    on disk for the duration of the context, making it safe for async operations
    (the path can be re-opened freely while the context is active).

    Usage:
        with auto_cleanup_tempfile(suffix=".pdf") as tmp_path:
            # Write to the file
            with open(tmp_path, 'wb') as f:
                f.write(content)
            # Use tmp_path for processing
            result = await process_file(tmp_path)
        # File is automatically deleted here

    Args:
        suffix: Optional file suffix/extension (e.g., ".pdf")
        prefix: Optional file prefix
        dir: Optional directory for temp file

    Yields:
        str: Path to the temporary file
    """
    fd, path = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=dir)
    try:
        # Close the low-level descriptor immediately; callers re-open by path.
        os.close(fd)
        yield path
    finally:
        # Best-effort cleanup, even if the body raised. Attempt the unlink
        # directly (EAFP) instead of the racy exists()+unlink pattern, and
        # catch only OSError so cleanup never masks the in-flight exception
        # with an unrelated failure.
        try:
            os.unlink(path)
        except OSError:
            pass
def safe_unlink(path: str) -> None:
    """
    Safely delete a file, ignoring errors if it doesn't exist or cannot
    be removed.

    Args:
        path: Path to the file to delete; None or empty string is a no-op.
    """
    # Preserve the no-op behavior for falsy paths.
    if not path:
        return
    # EAFP: attempt the unlink directly rather than the exists()+unlink
    # pattern, which is racy if another process removes the file first.
    try:
        os.unlink(path)
    except OSError:
        # Best-effort: a missing file or permission problem is ignored.
        pass