auto cleanup temp files
This commit is contained in:
parent
cb58b2165e
commit
6612388586
7 changed files with 246 additions and 207 deletions
|
|
@ -231,11 +231,8 @@ async def upload_and_ingest_user_file(
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# Clean up temp file on error
|
# Clean up temp file on error
|
||||||
try:
|
from utils.file_utils import safe_unlink
|
||||||
if os.path.exists(temp_path):
|
safe_unlink(temp_path)
|
||||||
os.unlink(temp_path)
|
|
||||||
except Exception:
|
|
||||||
pass # Ignore cleanup errors
|
|
||||||
raise
|
raise
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -164,12 +164,9 @@ async def langflow_upload_ingest_task(
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# Clean up temp files on error
|
# Clean up temp files on error
|
||||||
|
from utils.file_utils import safe_unlink
|
||||||
for temp_path in temp_file_paths:
|
for temp_path in temp_file_paths:
|
||||||
try:
|
safe_unlink(temp_path)
|
||||||
if os.path.exists(temp_path):
|
|
||||||
os.unlink(temp_path)
|
|
||||||
except Exception:
|
|
||||||
pass # Ignore cleanup errors
|
|
||||||
raise
|
raise
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -53,45 +53,46 @@ class LangflowConnectorService:
|
||||||
filename=document.filename,
|
filename=document.filename,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from utils.file_utils import auto_cleanup_tempfile
|
||||||
|
|
||||||
suffix = self._get_file_extension(document.mimetype)
|
suffix = self._get_file_extension(document.mimetype)
|
||||||
|
|
||||||
# Create temporary file from document content
|
# Create temporary file from document content
|
||||||
with tempfile.NamedTemporaryFile(
|
with auto_cleanup_tempfile(suffix=suffix) as tmp_path:
|
||||||
delete=False, suffix=suffix
|
# Write document content to temp file
|
||||||
) as tmp_file:
|
with open(tmp_path, 'wb') as f:
|
||||||
tmp_file.write(document.content)
|
f.write(document.content)
|
||||||
tmp_file.flush()
|
|
||||||
|
# Step 1: Upload file to Langflow
|
||||||
|
logger.debug("Uploading file to Langflow", filename=document.filename)
|
||||||
|
content = document.content
|
||||||
|
file_tuple = (
|
||||||
|
document.filename.replace(" ", "_").replace("/", "_")+suffix,
|
||||||
|
content,
|
||||||
|
document.mimetype or "application/octet-stream",
|
||||||
|
)
|
||||||
|
|
||||||
|
upload_result = await self.langflow_service.upload_user_file(
|
||||||
|
file_tuple, jwt_token
|
||||||
|
)
|
||||||
|
langflow_file_id = upload_result["id"]
|
||||||
|
langflow_file_path = upload_result["path"]
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"File uploaded to Langflow",
|
||||||
|
file_id=langflow_file_id,
|
||||||
|
path=langflow_file_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 2: Run ingestion flow with the uploaded file
|
||||||
|
logger.debug(
|
||||||
|
"Running Langflow ingestion flow", file_path=langflow_file_path
|
||||||
|
)
|
||||||
|
|
||||||
|
# Use the same tweaks pattern as LangflowFileService
|
||||||
|
tweaks = {} # Let Langflow handle the ingestion with default settings
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Step 1: Upload file to Langflow
|
|
||||||
logger.debug("Uploading file to Langflow", filename=document.filename)
|
|
||||||
content = document.content
|
|
||||||
file_tuple = (
|
|
||||||
document.filename.replace(" ", "_").replace("/", "_")+suffix,
|
|
||||||
content,
|
|
||||||
document.mimetype or "application/octet-stream",
|
|
||||||
)
|
|
||||||
|
|
||||||
upload_result = await self.langflow_service.upload_user_file(
|
|
||||||
file_tuple, jwt_token
|
|
||||||
)
|
|
||||||
langflow_file_id = upload_result["id"]
|
|
||||||
langflow_file_path = upload_result["path"]
|
|
||||||
|
|
||||||
logger.debug(
|
|
||||||
"File uploaded to Langflow",
|
|
||||||
file_id=langflow_file_id,
|
|
||||||
path=langflow_file_path,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Step 2: Run ingestion flow with the uploaded file
|
|
||||||
logger.debug(
|
|
||||||
"Running Langflow ingestion flow", file_path=langflow_file_path
|
|
||||||
)
|
|
||||||
|
|
||||||
# Use the same tweaks pattern as LangflowFileService
|
|
||||||
tweaks = {} # Let Langflow handle the ingestion with default settings
|
|
||||||
|
|
||||||
ingestion_result = await self.langflow_service.run_ingestion_flow(
|
ingestion_result = await self.langflow_service.run_ingestion_flow(
|
||||||
file_paths=[langflow_file_path],
|
file_paths=[langflow_file_path],
|
||||||
jwt_token=jwt_token,
|
jwt_token=jwt_token,
|
||||||
|
|
@ -125,25 +126,20 @@ class LangflowConnectorService:
|
||||||
error=str(e),
|
error=str(e),
|
||||||
)
|
)
|
||||||
# Try to clean up Langflow file if upload succeeded but processing failed
|
# Try to clean up Langflow file if upload succeeded but processing failed
|
||||||
if "langflow_file_id" in locals():
|
try:
|
||||||
try:
|
await self.langflow_service.delete_user_file(langflow_file_id)
|
||||||
await self.langflow_service.delete_user_file(langflow_file_id)
|
logger.debug(
|
||||||
logger.debug(
|
"Cleaned up Langflow file after error",
|
||||||
"Cleaned up Langflow file after error",
|
file_id=langflow_file_id,
|
||||||
file_id=langflow_file_id,
|
)
|
||||||
)
|
except Exception as cleanup_error:
|
||||||
except Exception as cleanup_error:
|
logger.warning(
|
||||||
logger.warning(
|
"Failed to cleanup Langflow file",
|
||||||
"Failed to cleanup Langflow file",
|
file_id=langflow_file_id,
|
||||||
file_id=langflow_file_id,
|
error=str(cleanup_error),
|
||||||
error=str(cleanup_error),
|
)
|
||||||
)
|
|
||||||
raise
|
raise
|
||||||
|
|
||||||
finally:
|
|
||||||
# Clean up temporary file
|
|
||||||
os.unlink(tmp_file.name)
|
|
||||||
|
|
||||||
def _get_file_extension(self, mimetype: str) -> str:
|
def _get_file_extension(self, mimetype: str) -> str:
|
||||||
"""Get file extension based on MIME type"""
|
"""Get file extension based on MIME type"""
|
||||||
mime_to_ext = {
|
mime_to_ext = {
|
||||||
|
|
|
||||||
|
|
@ -54,54 +54,50 @@ class ConnectorService:
|
||||||
"""Process a document from a connector using existing processing pipeline"""
|
"""Process a document from a connector using existing processing pipeline"""
|
||||||
|
|
||||||
# Create temporary file from document content
|
# Create temporary file from document content
|
||||||
with tempfile.NamedTemporaryFile(
|
from utils.file_utils import auto_cleanup_tempfile
|
||||||
delete=False, suffix=self._get_file_extension(document.mimetype)
|
|
||||||
) as tmp_file:
|
|
||||||
tmp_file.write(document.content)
|
|
||||||
tmp_file.flush()
|
|
||||||
|
|
||||||
try:
|
with auto_cleanup_tempfile(suffix=self._get_file_extension(document.mimetype)) as tmp_path:
|
||||||
# Use existing process_file_common function with connector document metadata
|
# Write document content to temp file
|
||||||
# We'll use the document service's process_file_common method
|
with open(tmp_path, 'wb') as f:
|
||||||
from services.document_service import DocumentService
|
f.write(document.content)
|
||||||
|
|
||||||
doc_service = DocumentService(session_manager=self.session_manager)
|
# Use existing process_file_common function with connector document metadata
|
||||||
|
# We'll use the document service's process_file_common method
|
||||||
|
from services.document_service import DocumentService
|
||||||
|
|
||||||
logger.debug("Processing connector document", document_id=document.id)
|
doc_service = DocumentService(session_manager=self.session_manager)
|
||||||
|
|
||||||
# Process using consolidated processing pipeline
|
logger.debug("Processing connector document", document_id=document.id)
|
||||||
from models.processors import TaskProcessor
|
|
||||||
processor = TaskProcessor(document_service=doc_service)
|
# Process using consolidated processing pipeline
|
||||||
result = await processor.process_document_standard(
|
from models.processors import TaskProcessor
|
||||||
file_path=tmp_file.name,
|
processor = TaskProcessor(document_service=doc_service)
|
||||||
file_hash=document.id, # Use connector document ID as hash
|
result = await processor.process_document_standard(
|
||||||
owner_user_id=owner_user_id,
|
file_path=tmp_path,
|
||||||
original_filename=document.filename, # Pass the original Google Doc title
|
file_hash=document.id, # Use connector document ID as hash
|
||||||
jwt_token=jwt_token,
|
owner_user_id=owner_user_id,
|
||||||
owner_name=owner_name,
|
original_filename=document.filename, # Pass the original Google Doc title
|
||||||
owner_email=owner_email,
|
jwt_token=jwt_token,
|
||||||
file_size=len(document.content) if document.content else 0,
|
owner_name=owner_name,
|
||||||
connector_type=connector_type,
|
owner_email=owner_email,
|
||||||
|
file_size=len(document.content) if document.content else 0,
|
||||||
|
connector_type=connector_type,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug("Document processing result", result=result)
|
||||||
|
|
||||||
|
# If successfully indexed or already exists, update the indexed documents with connector metadata
|
||||||
|
if result["status"] in ["indexed", "unchanged"]:
|
||||||
|
# Update all chunks with connector-specific metadata
|
||||||
|
await self._update_connector_metadata(
|
||||||
|
document, owner_user_id, connector_type, jwt_token
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug("Document processing result", result=result)
|
return {
|
||||||
|
**result,
|
||||||
# If successfully indexed or already exists, update the indexed documents with connector metadata
|
"filename": document.filename,
|
||||||
if result["status"] in ["indexed", "unchanged"]:
|
"source_url": document.source_url,
|
||||||
# Update all chunks with connector-specific metadata
|
}
|
||||||
await self._update_connector_metadata(
|
|
||||||
document, owner_user_id, connector_type, jwt_token
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
|
||||||
**result,
|
|
||||||
"filename": document.filename,
|
|
||||||
"source_url": document.source_url,
|
|
||||||
}
|
|
||||||
|
|
||||||
finally:
|
|
||||||
# Clean up temporary file
|
|
||||||
os.unlink(tmp_file.name)
|
|
||||||
|
|
||||||
async def _update_connector_metadata(
|
async def _update_connector_metadata(
|
||||||
self,
|
self,
|
||||||
|
|
|
||||||
|
|
@ -277,37 +277,35 @@ class ConnectorFileProcessor(TaskProcessor):
|
||||||
raise ValueError("user_id not provided to ConnectorFileProcessor")
|
raise ValueError("user_id not provided to ConnectorFileProcessor")
|
||||||
|
|
||||||
# Create temporary file from document content
|
# Create temporary file from document content
|
||||||
|
from utils.file_utils import auto_cleanup_tempfile
|
||||||
|
|
||||||
suffix = self.connector_service._get_file_extension(document.mimetype)
|
suffix = self.connector_service._get_file_extension(document.mimetype)
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
|
with auto_cleanup_tempfile(suffix=suffix) as tmp_path:
|
||||||
tmp_file.write(document.content)
|
# Write content to temp file
|
||||||
tmp_file.flush()
|
with open(tmp_path, 'wb') as f:
|
||||||
|
f.write(document.content)
|
||||||
|
|
||||||
try:
|
# Compute hash
|
||||||
# Compute hash
|
file_hash = hash_id(tmp_path)
|
||||||
file_hash = hash_id(tmp_file.name)
|
|
||||||
|
|
||||||
# Use consolidated standard processing
|
# Use consolidated standard processing
|
||||||
result = await self.process_document_standard(
|
result = await self.process_document_standard(
|
||||||
file_path=tmp_file.name,
|
file_path=tmp_path,
|
||||||
file_hash=file_hash,
|
file_hash=file_hash,
|
||||||
owner_user_id=self.user_id,
|
owner_user_id=self.user_id,
|
||||||
original_filename=document.filename,
|
original_filename=document.filename,
|
||||||
jwt_token=self.jwt_token,
|
jwt_token=self.jwt_token,
|
||||||
owner_name=self.owner_name,
|
owner_name=self.owner_name,
|
||||||
owner_email=self.owner_email,
|
owner_email=self.owner_email,
|
||||||
file_size=len(document.content),
|
file_size=len(document.content),
|
||||||
connector_type=connection.connector_type,
|
connector_type=connection.connector_type,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add connector-specific metadata
|
# Add connector-specific metadata
|
||||||
result.update({
|
result.update({
|
||||||
"source_url": document.source_url,
|
"source_url": document.source_url,
|
||||||
"document_id": document.id,
|
"document_id": document.id,
|
||||||
})
|
})
|
||||||
|
|
||||||
finally:
|
|
||||||
if os.path.exists(tmp_file.name):
|
|
||||||
os.unlink(tmp_file.name)
|
|
||||||
|
|
||||||
file_task.status = TaskStatus.COMPLETED
|
file_task.status = TaskStatus.COMPLETED
|
||||||
file_task.result = result
|
file_task.result = result
|
||||||
|
|
@ -379,39 +377,37 @@ class LangflowConnectorFileProcessor(TaskProcessor):
|
||||||
raise ValueError("user_id not provided to LangflowConnectorFileProcessor")
|
raise ValueError("user_id not provided to LangflowConnectorFileProcessor")
|
||||||
|
|
||||||
# Create temporary file and compute hash to check for duplicates
|
# Create temporary file and compute hash to check for duplicates
|
||||||
|
from utils.file_utils import auto_cleanup_tempfile
|
||||||
|
|
||||||
suffix = self.langflow_connector_service._get_file_extension(document.mimetype)
|
suffix = self.langflow_connector_service._get_file_extension(document.mimetype)
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
|
with auto_cleanup_tempfile(suffix=suffix) as tmp_path:
|
||||||
tmp_file.write(document.content)
|
# Write content to temp file
|
||||||
tmp_file.flush()
|
with open(tmp_path, 'wb') as f:
|
||||||
|
f.write(document.content)
|
||||||
|
|
||||||
try:
|
# Compute hash and check if already exists
|
||||||
# Compute hash and check if already exists
|
file_hash = hash_id(tmp_path)
|
||||||
file_hash = hash_id(tmp_file.name)
|
|
||||||
|
|
||||||
# Check if document already exists
|
# Check if document already exists
|
||||||
opensearch_client = self.langflow_connector_service.session_manager.get_user_opensearch_client(
|
opensearch_client = self.langflow_connector_service.session_manager.get_user_opensearch_client(
|
||||||
self.user_id, self.jwt_token
|
self.user_id, self.jwt_token
|
||||||
)
|
)
|
||||||
if await self.check_document_exists(file_hash, opensearch_client):
|
if await self.check_document_exists(file_hash, opensearch_client):
|
||||||
file_task.status = TaskStatus.COMPLETED
|
file_task.status = TaskStatus.COMPLETED
|
||||||
file_task.result = {"status": "unchanged", "id": file_hash}
|
file_task.result = {"status": "unchanged", "id": file_hash}
|
||||||
file_task.updated_at = time.time()
|
file_task.updated_at = time.time()
|
||||||
upload_task.successful_files += 1
|
upload_task.successful_files += 1
|
||||||
return
|
return
|
||||||
|
|
||||||
# Process using Langflow pipeline
|
# Process using Langflow pipeline
|
||||||
result = await self.langflow_connector_service.process_connector_document(
|
result = await self.langflow_connector_service.process_connector_document(
|
||||||
document,
|
document,
|
||||||
self.user_id,
|
self.user_id,
|
||||||
connection.connector_type,
|
connection.connector_type,
|
||||||
jwt_token=self.jwt_token,
|
jwt_token=self.jwt_token,
|
||||||
owner_name=self.owner_name,
|
owner_name=self.owner_name,
|
||||||
owner_email=self.owner_email,
|
owner_email=self.owner_email,
|
||||||
)
|
)
|
||||||
|
|
||||||
finally:
|
|
||||||
if os.path.exists(tmp_file.name):
|
|
||||||
os.unlink(tmp_file.name)
|
|
||||||
|
|
||||||
file_task.status = TaskStatus.COMPLETED
|
file_task.status = TaskStatus.COMPLETED
|
||||||
file_task.result = result
|
file_task.result = result
|
||||||
|
|
@ -466,48 +462,48 @@ class S3FileProcessor(TaskProcessor):
|
||||||
file_task.status = TaskStatus.RUNNING
|
file_task.status = TaskStatus.RUNNING
|
||||||
file_task.updated_at = time.time()
|
file_task.updated_at = time.time()
|
||||||
|
|
||||||
tmp = tempfile.NamedTemporaryFile(delete=False)
|
from utils.file_utils import auto_cleanup_tempfile
|
||||||
|
from utils.hash_utils import hash_id
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Download object to temporary file
|
with auto_cleanup_tempfile() as tmp_path:
|
||||||
self.s3_client.download_fileobj(self.bucket, item, tmp)
|
# Download object to temporary file
|
||||||
tmp.flush()
|
with open(tmp_path, 'wb') as tmp_file:
|
||||||
|
self.s3_client.download_fileobj(self.bucket, item, tmp_file)
|
||||||
|
|
||||||
# Compute hash
|
# Compute hash
|
||||||
from utils.hash_utils import hash_id
|
file_hash = hash_id(tmp_path)
|
||||||
file_hash = hash_id(tmp.name)
|
|
||||||
|
|
||||||
# Get object size
|
# Get object size
|
||||||
try:
|
try:
|
||||||
obj_info = self.s3_client.head_object(Bucket=self.bucket, Key=item)
|
obj_info = self.s3_client.head_object(Bucket=self.bucket, Key=item)
|
||||||
file_size = obj_info.get("ContentLength", 0)
|
file_size = obj_info.get("ContentLength", 0)
|
||||||
except Exception:
|
except Exception:
|
||||||
file_size = 0
|
file_size = 0
|
||||||
|
|
||||||
# Use consolidated standard processing
|
# Use consolidated standard processing
|
||||||
result = await self.process_document_standard(
|
result = await self.process_document_standard(
|
||||||
file_path=tmp.name,
|
file_path=tmp_path,
|
||||||
file_hash=file_hash,
|
file_hash=file_hash,
|
||||||
owner_user_id=self.owner_user_id,
|
owner_user_id=self.owner_user_id,
|
||||||
original_filename=item, # Use S3 key as filename
|
original_filename=item, # Use S3 key as filename
|
||||||
jwt_token=self.jwt_token,
|
jwt_token=self.jwt_token,
|
||||||
owner_name=self.owner_name,
|
owner_name=self.owner_name,
|
||||||
owner_email=self.owner_email,
|
owner_email=self.owner_email,
|
||||||
file_size=file_size,
|
file_size=file_size,
|
||||||
connector_type="s3",
|
connector_type="s3",
|
||||||
)
|
)
|
||||||
|
|
||||||
result["path"] = f"s3://{self.bucket}/{item}"
|
result["path"] = f"s3://{self.bucket}/{item}"
|
||||||
file_task.status = TaskStatus.COMPLETED
|
file_task.status = TaskStatus.COMPLETED
|
||||||
file_task.result = result
|
file_task.result = result
|
||||||
upload_task.successful_files += 1
|
upload_task.successful_files += 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
file_task.status = TaskStatus.FAILED
|
file_task.status = TaskStatus.FAILED
|
||||||
file_task.error = str(e)
|
file_task.error = str(e)
|
||||||
upload_task.failed_files += 1
|
upload_task.failed_files += 1
|
||||||
finally:
|
finally:
|
||||||
if os.path.exists(tmp.name):
|
|
||||||
os.unlink(tmp.name)
|
|
||||||
file_task.updated_at = time.time()
|
file_task.updated_at = time.time()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -123,19 +123,21 @@ class DocumentService:
|
||||||
):
|
):
|
||||||
"""Process an uploaded file from form data"""
|
"""Process an uploaded file from form data"""
|
||||||
from utils.hash_utils import hash_id
|
from utils.hash_utils import hash_id
|
||||||
|
from utils.file_utils import auto_cleanup_tempfile
|
||||||
import os
|
import os
|
||||||
tmp = tempfile.NamedTemporaryFile(delete=False)
|
|
||||||
file_size = 0
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
chunk = await upload_file.read(1 << 20)
|
|
||||||
if not chunk:
|
|
||||||
break
|
|
||||||
tmp.write(chunk)
|
|
||||||
file_size += len(chunk)
|
|
||||||
tmp.flush()
|
|
||||||
|
|
||||||
file_hash = hash_id(tmp.name)
|
with auto_cleanup_tempfile() as tmp_path:
|
||||||
|
# Stream upload file to temporary file
|
||||||
|
file_size = 0
|
||||||
|
with open(tmp_path, 'wb') as tmp_file:
|
||||||
|
while True:
|
||||||
|
chunk = await upload_file.read(1 << 20)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
tmp_file.write(chunk)
|
||||||
|
file_size += len(chunk)
|
||||||
|
|
||||||
|
file_hash = hash_id(tmp_path)
|
||||||
# Get user's OpenSearch client with JWT for OIDC auth
|
# Get user's OpenSearch client with JWT for OIDC auth
|
||||||
opensearch_client = self.session_manager.get_user_opensearch_client(
|
opensearch_client = self.session_manager.get_user_opensearch_client(
|
||||||
owner_user_id, jwt_token
|
owner_user_id, jwt_token
|
||||||
|
|
@ -149,14 +151,13 @@ class DocumentService:
|
||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
if exists:
|
if exists:
|
||||||
os.unlink(tmp.name) # Delete temp file since we don't need it
|
|
||||||
return {"status": "unchanged", "id": file_hash}
|
return {"status": "unchanged", "id": file_hash}
|
||||||
|
|
||||||
# Use consolidated standard processing
|
# Use consolidated standard processing
|
||||||
from models.processors import TaskProcessor
|
from models.processors import TaskProcessor
|
||||||
processor = TaskProcessor(document_service=self)
|
processor = TaskProcessor(document_service=self)
|
||||||
result = await processor.process_document_standard(
|
result = await processor.process_document_standard(
|
||||||
file_path=tmp.name,
|
file_path=tmp_path,
|
||||||
file_hash=file_hash,
|
file_hash=file_hash,
|
||||||
owner_user_id=owner_user_id,
|
owner_user_id=owner_user_id,
|
||||||
original_filename=upload_file.filename,
|
original_filename=upload_file.filename,
|
||||||
|
|
@ -168,10 +169,6 @@ class DocumentService:
|
||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
finally:
|
|
||||||
tmp.close()
|
|
||||||
os.remove(tmp.name)
|
|
||||||
|
|
||||||
async def process_upload_context(self, upload_file, filename: str = None):
|
async def process_upload_context(self, upload_file, filename: str = None):
|
||||||
"""Process uploaded file and return content for context"""
|
"""Process uploaded file and return content for context"""
|
||||||
import io
|
import io
|
||||||
|
|
|
||||||
60
src/utils/file_utils.py
Normal file
60
src/utils/file_utils.py
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
"""File handling utilities for OpenRAG"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def auto_cleanup_tempfile(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Optional[str] = None):
|
||||||
|
"""
|
||||||
|
Context manager for temporary files that automatically cleans up.
|
||||||
|
|
||||||
|
Unlike tempfile.NamedTemporaryFile with delete=True, this keeps the file
|
||||||
|
on disk for the duration of the context, making it safe for async operations.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
with auto_cleanup_tempfile(suffix=".pdf") as tmp_path:
|
||||||
|
# Write to the file
|
||||||
|
with open(tmp_path, 'wb') as f:
|
||||||
|
f.write(content)
|
||||||
|
# Use tmp_path for processing
|
||||||
|
result = await process_file(tmp_path)
|
||||||
|
# File is automatically deleted here
|
||||||
|
|
||||||
|
Args:
|
||||||
|
suffix: Optional file suffix/extension (e.g., ".pdf")
|
||||||
|
prefix: Optional file prefix
|
||||||
|
dir: Optional directory for temp file
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
str: Path to the temporary file
|
||||||
|
"""
|
||||||
|
fd, path = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=dir)
|
||||||
|
try:
|
||||||
|
os.close(fd) # Close the file descriptor immediately
|
||||||
|
yield path
|
||||||
|
finally:
|
||||||
|
# Always clean up, even if an exception occurred
|
||||||
|
try:
|
||||||
|
if os.path.exists(path):
|
||||||
|
os.unlink(path)
|
||||||
|
except Exception:
|
||||||
|
# Silently ignore cleanup errors
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def safe_unlink(path: str) -> None:
|
||||||
|
"""
|
||||||
|
Safely delete a file, ignoring errors if it doesn't exist.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path: Path to the file to delete
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if path and os.path.exists(path):
|
||||||
|
os.unlink(path)
|
||||||
|
except Exception:
|
||||||
|
# Silently ignore errors
|
||||||
|
pass
|
||||||
Loading…
Add table
Reference in a new issue