auto cleanup temp files

This commit is contained in:
phact 2025-09-29 14:08:36 -04:00
parent cb58b2165e
commit 6612388586
7 changed files with 246 additions and 207 deletions

View file

@@ -231,11 +231,8 @@ async def upload_and_ingest_user_file(
except Exception: except Exception:
# Clean up temp file on error # Clean up temp file on error
try: from utils.file_utils import safe_unlink
if os.path.exists(temp_path): safe_unlink(temp_path)
os.unlink(temp_path)
except Exception:
pass # Ignore cleanup errors
raise raise
except Exception as e: except Exception as e:

View file

@@ -164,12 +164,9 @@ async def langflow_upload_ingest_task(
except Exception: except Exception:
# Clean up temp files on error # Clean up temp files on error
from utils.file_utils import safe_unlink
for temp_path in temp_file_paths: for temp_path in temp_file_paths:
try: safe_unlink(temp_path)
if os.path.exists(temp_path):
os.unlink(temp_path)
except Exception:
pass # Ignore cleanup errors
raise raise
except Exception as e: except Exception as e:

View file

@@ -53,45 +53,46 @@ class LangflowConnectorService:
filename=document.filename, filename=document.filename,
) )
from utils.file_utils import auto_cleanup_tempfile
suffix = self._get_file_extension(document.mimetype) suffix = self._get_file_extension(document.mimetype)
# Create temporary file from document content # Create temporary file from document content
with tempfile.NamedTemporaryFile( with auto_cleanup_tempfile(suffix=suffix) as tmp_path:
delete=False, suffix=suffix # Write document content to temp file
) as tmp_file: with open(tmp_path, 'wb') as f:
tmp_file.write(document.content) f.write(document.content)
tmp_file.flush()
# Step 1: Upload file to Langflow
logger.debug("Uploading file to Langflow", filename=document.filename)
content = document.content
file_tuple = (
document.filename.replace(" ", "_").replace("/", "_")+suffix,
content,
document.mimetype or "application/octet-stream",
)
upload_result = await self.langflow_service.upload_user_file(
file_tuple, jwt_token
)
langflow_file_id = upload_result["id"]
langflow_file_path = upload_result["path"]
logger.debug(
"File uploaded to Langflow",
file_id=langflow_file_id,
path=langflow_file_path,
)
# Step 2: Run ingestion flow with the uploaded file
logger.debug(
"Running Langflow ingestion flow", file_path=langflow_file_path
)
# Use the same tweaks pattern as LangflowFileService
tweaks = {} # Let Langflow handle the ingestion with default settings
try: try:
# Step 1: Upload file to Langflow
logger.debug("Uploading file to Langflow", filename=document.filename)
content = document.content
file_tuple = (
document.filename.replace(" ", "_").replace("/", "_")+suffix,
content,
document.mimetype or "application/octet-stream",
)
upload_result = await self.langflow_service.upload_user_file(
file_tuple, jwt_token
)
langflow_file_id = upload_result["id"]
langflow_file_path = upload_result["path"]
logger.debug(
"File uploaded to Langflow",
file_id=langflow_file_id,
path=langflow_file_path,
)
# Step 2: Run ingestion flow with the uploaded file
logger.debug(
"Running Langflow ingestion flow", file_path=langflow_file_path
)
# Use the same tweaks pattern as LangflowFileService
tweaks = {} # Let Langflow handle the ingestion with default settings
ingestion_result = await self.langflow_service.run_ingestion_flow( ingestion_result = await self.langflow_service.run_ingestion_flow(
file_paths=[langflow_file_path], file_paths=[langflow_file_path],
jwt_token=jwt_token, jwt_token=jwt_token,
@@ -125,25 +126,20 @@ class LangflowConnectorService:
error=str(e), error=str(e),
) )
# Try to clean up Langflow file if upload succeeded but processing failed # Try to clean up Langflow file if upload succeeded but processing failed
if "langflow_file_id" in locals(): try:
try: await self.langflow_service.delete_user_file(langflow_file_id)
await self.langflow_service.delete_user_file(langflow_file_id) logger.debug(
logger.debug( "Cleaned up Langflow file after error",
"Cleaned up Langflow file after error", file_id=langflow_file_id,
file_id=langflow_file_id, )
) except Exception as cleanup_error:
except Exception as cleanup_error: logger.warning(
logger.warning( "Failed to cleanup Langflow file",
"Failed to cleanup Langflow file", file_id=langflow_file_id,
file_id=langflow_file_id, error=str(cleanup_error),
error=str(cleanup_error), )
)
raise raise
finally:
# Clean up temporary file
os.unlink(tmp_file.name)
def _get_file_extension(self, mimetype: str) -> str: def _get_file_extension(self, mimetype: str) -> str:
"""Get file extension based on MIME type""" """Get file extension based on MIME type"""
mime_to_ext = { mime_to_ext = {

View file

@@ -54,54 +54,50 @@ class ConnectorService:
"""Process a document from a connector using existing processing pipeline""" """Process a document from a connector using existing processing pipeline"""
# Create temporary file from document content # Create temporary file from document content
with tempfile.NamedTemporaryFile( from utils.file_utils import auto_cleanup_tempfile
delete=False, suffix=self._get_file_extension(document.mimetype)
) as tmp_file:
tmp_file.write(document.content)
tmp_file.flush()
try: with auto_cleanup_tempfile(suffix=self._get_file_extension(document.mimetype)) as tmp_path:
# Use existing process_file_common function with connector document metadata # Write document content to temp file
# We'll use the document service's process_file_common method with open(tmp_path, 'wb') as f:
from services.document_service import DocumentService f.write(document.content)
doc_service = DocumentService(session_manager=self.session_manager) # Use existing process_file_common function with connector document metadata
# We'll use the document service's process_file_common method
from services.document_service import DocumentService
logger.debug("Processing connector document", document_id=document.id) doc_service = DocumentService(session_manager=self.session_manager)
# Process using consolidated processing pipeline logger.debug("Processing connector document", document_id=document.id)
from models.processors import TaskProcessor
processor = TaskProcessor(document_service=doc_service) # Process using consolidated processing pipeline
result = await processor.process_document_standard( from models.processors import TaskProcessor
file_path=tmp_file.name, processor = TaskProcessor(document_service=doc_service)
file_hash=document.id, # Use connector document ID as hash result = await processor.process_document_standard(
owner_user_id=owner_user_id, file_path=tmp_path,
original_filename=document.filename, # Pass the original Google Doc title file_hash=document.id, # Use connector document ID as hash
jwt_token=jwt_token, owner_user_id=owner_user_id,
owner_name=owner_name, original_filename=document.filename, # Pass the original Google Doc title
owner_email=owner_email, jwt_token=jwt_token,
file_size=len(document.content) if document.content else 0, owner_name=owner_name,
connector_type=connector_type, owner_email=owner_email,
file_size=len(document.content) if document.content else 0,
connector_type=connector_type,
)
logger.debug("Document processing result", result=result)
# If successfully indexed or already exists, update the indexed documents with connector metadata
if result["status"] in ["indexed", "unchanged"]:
# Update all chunks with connector-specific metadata
await self._update_connector_metadata(
document, owner_user_id, connector_type, jwt_token
) )
logger.debug("Document processing result", result=result) return {
**result,
# If successfully indexed or already exists, update the indexed documents with connector metadata "filename": document.filename,
if result["status"] in ["indexed", "unchanged"]: "source_url": document.source_url,
# Update all chunks with connector-specific metadata }
await self._update_connector_metadata(
document, owner_user_id, connector_type, jwt_token
)
return {
**result,
"filename": document.filename,
"source_url": document.source_url,
}
finally:
# Clean up temporary file
os.unlink(tmp_file.name)
async def _update_connector_metadata( async def _update_connector_metadata(
self, self,

View file

@@ -277,37 +277,35 @@ class ConnectorFileProcessor(TaskProcessor):
raise ValueError("user_id not provided to ConnectorFileProcessor") raise ValueError("user_id not provided to ConnectorFileProcessor")
# Create temporary file from document content # Create temporary file from document content
from utils.file_utils import auto_cleanup_tempfile
suffix = self.connector_service._get_file_extension(document.mimetype) suffix = self.connector_service._get_file_extension(document.mimetype)
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file: with auto_cleanup_tempfile(suffix=suffix) as tmp_path:
tmp_file.write(document.content) # Write content to temp file
tmp_file.flush() with open(tmp_path, 'wb') as f:
f.write(document.content)
try: # Compute hash
# Compute hash file_hash = hash_id(tmp_path)
file_hash = hash_id(tmp_file.name)
# Use consolidated standard processing # Use consolidated standard processing
result = await self.process_document_standard( result = await self.process_document_standard(
file_path=tmp_file.name, file_path=tmp_path,
file_hash=file_hash, file_hash=file_hash,
owner_user_id=self.user_id, owner_user_id=self.user_id,
original_filename=document.filename, original_filename=document.filename,
jwt_token=self.jwt_token, jwt_token=self.jwt_token,
owner_name=self.owner_name, owner_name=self.owner_name,
owner_email=self.owner_email, owner_email=self.owner_email,
file_size=len(document.content), file_size=len(document.content),
connector_type=connection.connector_type, connector_type=connection.connector_type,
) )
# Add connector-specific metadata # Add connector-specific metadata
result.update({ result.update({
"source_url": document.source_url, "source_url": document.source_url,
"document_id": document.id, "document_id": document.id,
}) })
finally:
if os.path.exists(tmp_file.name):
os.unlink(tmp_file.name)
file_task.status = TaskStatus.COMPLETED file_task.status = TaskStatus.COMPLETED
file_task.result = result file_task.result = result
@@ -379,39 +377,37 @@ class LangflowConnectorFileProcessor(TaskProcessor):
raise ValueError("user_id not provided to LangflowConnectorFileProcessor") raise ValueError("user_id not provided to LangflowConnectorFileProcessor")
# Create temporary file and compute hash to check for duplicates # Create temporary file and compute hash to check for duplicates
from utils.file_utils import auto_cleanup_tempfile
suffix = self.langflow_connector_service._get_file_extension(document.mimetype) suffix = self.langflow_connector_service._get_file_extension(document.mimetype)
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file: with auto_cleanup_tempfile(suffix=suffix) as tmp_path:
tmp_file.write(document.content) # Write content to temp file
tmp_file.flush() with open(tmp_path, 'wb') as f:
f.write(document.content)
try: # Compute hash and check if already exists
# Compute hash and check if already exists file_hash = hash_id(tmp_path)
file_hash = hash_id(tmp_file.name)
# Check if document already exists # Check if document already exists
opensearch_client = self.langflow_connector_service.session_manager.get_user_opensearch_client( opensearch_client = self.langflow_connector_service.session_manager.get_user_opensearch_client(
self.user_id, self.jwt_token self.user_id, self.jwt_token
) )
if await self.check_document_exists(file_hash, opensearch_client): if await self.check_document_exists(file_hash, opensearch_client):
file_task.status = TaskStatus.COMPLETED file_task.status = TaskStatus.COMPLETED
file_task.result = {"status": "unchanged", "id": file_hash} file_task.result = {"status": "unchanged", "id": file_hash}
file_task.updated_at = time.time() file_task.updated_at = time.time()
upload_task.successful_files += 1 upload_task.successful_files += 1
return return
# Process using Langflow pipeline # Process using Langflow pipeline
result = await self.langflow_connector_service.process_connector_document( result = await self.langflow_connector_service.process_connector_document(
document, document,
self.user_id, self.user_id,
connection.connector_type, connection.connector_type,
jwt_token=self.jwt_token, jwt_token=self.jwt_token,
owner_name=self.owner_name, owner_name=self.owner_name,
owner_email=self.owner_email, owner_email=self.owner_email,
) )
finally:
if os.path.exists(tmp_file.name):
os.unlink(tmp_file.name)
file_task.status = TaskStatus.COMPLETED file_task.status = TaskStatus.COMPLETED
file_task.result = result file_task.result = result
@@ -466,48 +462,48 @@ class S3FileProcessor(TaskProcessor):
file_task.status = TaskStatus.RUNNING file_task.status = TaskStatus.RUNNING
file_task.updated_at = time.time() file_task.updated_at = time.time()
tmp = tempfile.NamedTemporaryFile(delete=False) from utils.file_utils import auto_cleanup_tempfile
from utils.hash_utils import hash_id
try: try:
# Download object to temporary file with auto_cleanup_tempfile() as tmp_path:
self.s3_client.download_fileobj(self.bucket, item, tmp) # Download object to temporary file
tmp.flush() with open(tmp_path, 'wb') as tmp_file:
self.s3_client.download_fileobj(self.bucket, item, tmp_file)
# Compute hash # Compute hash
from utils.hash_utils import hash_id file_hash = hash_id(tmp_path)
file_hash = hash_id(tmp.name)
# Get object size # Get object size
try: try:
obj_info = self.s3_client.head_object(Bucket=self.bucket, Key=item) obj_info = self.s3_client.head_object(Bucket=self.bucket, Key=item)
file_size = obj_info.get("ContentLength", 0) file_size = obj_info.get("ContentLength", 0)
except Exception: except Exception:
file_size = 0 file_size = 0
# Use consolidated standard processing # Use consolidated standard processing
result = await self.process_document_standard( result = await self.process_document_standard(
file_path=tmp.name, file_path=tmp_path,
file_hash=file_hash, file_hash=file_hash,
owner_user_id=self.owner_user_id, owner_user_id=self.owner_user_id,
original_filename=item, # Use S3 key as filename original_filename=item, # Use S3 key as filename
jwt_token=self.jwt_token, jwt_token=self.jwt_token,
owner_name=self.owner_name, owner_name=self.owner_name,
owner_email=self.owner_email, owner_email=self.owner_email,
file_size=file_size, file_size=file_size,
connector_type="s3", connector_type="s3",
) )
result["path"] = f"s3://{self.bucket}/{item}" result["path"] = f"s3://{self.bucket}/{item}"
file_task.status = TaskStatus.COMPLETED file_task.status = TaskStatus.COMPLETED
file_task.result = result file_task.result = result
upload_task.successful_files += 1 upload_task.successful_files += 1
except Exception as e: except Exception as e:
file_task.status = TaskStatus.FAILED file_task.status = TaskStatus.FAILED
file_task.error = str(e) file_task.error = str(e)
upload_task.failed_files += 1 upload_task.failed_files += 1
finally: finally:
if os.path.exists(tmp.name):
os.unlink(tmp.name)
file_task.updated_at = time.time() file_task.updated_at = time.time()

View file

@@ -123,19 +123,21 @@ class DocumentService:
): ):
"""Process an uploaded file from form data""" """Process an uploaded file from form data"""
from utils.hash_utils import hash_id from utils.hash_utils import hash_id
from utils.file_utils import auto_cleanup_tempfile
import os import os
tmp = tempfile.NamedTemporaryFile(delete=False)
file_size = 0
try:
while True:
chunk = await upload_file.read(1 << 20)
if not chunk:
break
tmp.write(chunk)
file_size += len(chunk)
tmp.flush()
file_hash = hash_id(tmp.name) with auto_cleanup_tempfile() as tmp_path:
# Stream upload file to temporary file
file_size = 0
with open(tmp_path, 'wb') as tmp_file:
while True:
chunk = await upload_file.read(1 << 20)
if not chunk:
break
tmp_file.write(chunk)
file_size += len(chunk)
file_hash = hash_id(tmp_path)
# Get user's OpenSearch client with JWT for OIDC auth # Get user's OpenSearch client with JWT for OIDC auth
opensearch_client = self.session_manager.get_user_opensearch_client( opensearch_client = self.session_manager.get_user_opensearch_client(
owner_user_id, jwt_token owner_user_id, jwt_token
@@ -149,14 +151,13 @@ class DocumentService:
) )
raise raise
if exists: if exists:
os.unlink(tmp.name) # Delete temp file since we don't need it
return {"status": "unchanged", "id": file_hash} return {"status": "unchanged", "id": file_hash}
# Use consolidated standard processing # Use consolidated standard processing
from models.processors import TaskProcessor from models.processors import TaskProcessor
processor = TaskProcessor(document_service=self) processor = TaskProcessor(document_service=self)
result = await processor.process_document_standard( result = await processor.process_document_standard(
file_path=tmp.name, file_path=tmp_path,
file_hash=file_hash, file_hash=file_hash,
owner_user_id=owner_user_id, owner_user_id=owner_user_id,
original_filename=upload_file.filename, original_filename=upload_file.filename,
@@ -168,10 +169,6 @@ class DocumentService:
) )
return result return result
finally:
tmp.close()
os.remove(tmp.name)
async def process_upload_context(self, upload_file, filename: str = None): async def process_upload_context(self, upload_file, filename: str = None):
"""Process uploaded file and return content for context""" """Process uploaded file and return content for context"""
import io import io

60
src/utils/file_utils.py Normal file
View file

@@ -0,0 +1,60 @@
"""File handling utilities for OpenRAG"""
import os
import tempfile
from contextlib import contextmanager
from typing import Optional
@contextmanager
def auto_cleanup_tempfile(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Optional[str] = None):
    """
    Context manager for temporary files that automatically cleans up.

    Unlike tempfile.NamedTemporaryFile with delete=True, this keeps the file
    on disk for the duration of the context, making it safe for async operations
    (the path can be re-opened freely while the context is active).

    Usage:
        with auto_cleanup_tempfile(suffix=".pdf") as tmp_path:
            # Write to the file
            with open(tmp_path, 'wb') as f:
                f.write(content)
            # Use tmp_path for processing
            result = await process_file(tmp_path)
        # File is automatically deleted here

    Args:
        suffix: Optional file suffix/extension (e.g., ".pdf")
        prefix: Optional file prefix
        dir: Optional directory for temp file

    Yields:
        str: Path to the temporary file
    """
    fd, path = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=dir)
    try:
        # Close the low-level descriptor immediately; callers re-open by path.
        os.close(fd)
        yield path
    finally:
        # Best-effort cleanup, even if the body raised. Attempt the unlink
        # directly (EAFP) instead of the racy exists()+unlink pattern, and
        # catch only OSError so cleanup never masks the in-flight exception
        # with an unrelated failure.
        try:
            os.unlink(path)
        except OSError:
            pass
def safe_unlink(path: str) -> None:
    """
    Safely delete a file, ignoring errors if it doesn't exist or cannot
    be removed.

    Args:
        path: Path to the file to delete; None or empty string is a no-op.
    """
    # Preserve the no-op behavior for falsy paths.
    if not path:
        return
    # EAFP: attempt the unlink directly rather than the exists()+unlink
    # pattern, which is racy if another process removes the file first.
    try:
        os.unlink(path)
    except OSError:
        # Best-effort: a missing file or permission problem is ignored.
        pass