Fix the regular sync functionality

This commit is contained in:
Eric Hare 2025-09-05 09:21:16 -07:00
parent 39e52273d7
commit d8ec7584da
No known key found for this signature in database
GPG key ID: A73DF73724270AB7
4 changed files with 49 additions and 18 deletions

View file

@ -23,9 +23,6 @@ async def connector_sync(request: Request, connector_service, session_manager):
data = await request.json()
max_files = data.get("max_files")
if not data.get("selected_files"):
return JSONResponse({"error": "selected_files is required"}, status_code=400)
try:
user = request.state.user
jwt_token = request.state.jwt_token
@ -50,8 +47,6 @@ async def connector_sync(request: Request, connector_service, session_manager):
user.user_id,
max_files,
jwt_token=jwt_token,
selected_files=data.get("selected_files"),
selected_folders=data.get("selected_folders"),
)
task_ids.append(task_id)

View file

@ -108,7 +108,7 @@ class BaseConnector(ABC):
pass
@abstractmethod
async def list_files(self, page_token: Optional[str] = None) -> Dict[str, Any]:
async def list_files(self, page_token: Optional[str] = None, max_files: Optional[int] = None) -> Dict[str, Any]:
"""List all files. Returns files and next_page_token if any."""
pass

View file

@ -452,6 +452,7 @@ class GoogleDriveConnector(BaseConnector):
async def list_files(
self,
page_token: Optional[str] = None,
max_files: Optional[int] = None,
**kwargs
) -> Dict[str, Any]:
"""
@ -466,7 +467,6 @@ class GoogleDriveConnector(BaseConnector):
items = self._iter_selected_items()
# Optionally honor a request-scoped max_files (e.g., from your API payload)
max_files = kwargs.get("max_files")
if isinstance(max_files, int) and max_files > 0:
items = items[:max_files]

View file

@ -1,14 +1,13 @@
import asyncio
import tempfile
import os
from typing import Dict, Any, List, Optional
from .base import BaseConnector, ConnectorDocument
from .google_drive import GoogleDriveConnector
from .sharepoint import SharePointConnector
from .onedrive import OneDriveConnector
from utils.logging_config import get_logger
from .connection_manager import ConnectionManager
logger = get_logger(__name__)
class ConnectorService:
"""Service to manage document connectors and process files"""
@ -194,8 +193,6 @@ class ConnectorService:
user_id: str,
max_files: int = None,
jwt_token: str = None,
selected_files: List[str] = None,
selected_folders: List[str] = None,
) -> str:
"""Sync files from a connector connection using existing task tracking system"""
if not self.task_service:
@ -203,8 +200,10 @@ class ConnectorService:
"TaskService not available - connector sync requires task service dependency"
)
print(
f"[DEBUG] Starting sync for connection {connection_id}, max_files={max_files}"
logger.debug(
"Starting sync for connection",
connection_id=connection_id,
max_files=max_files,
)
connector = await self.get_connector(connection_id)
@ -213,11 +212,45 @@ class ConnectorService:
f"Connection '{connection_id}' not found or not authenticated"
)
print(f"[DEBUG] Got connector, authenticated: {connector.is_authenticated}")
logger.debug("Got connector", authenticated=connector.is_authenticated)
if not connector.is_authenticated:
raise ValueError(f"Connection '{connection_id}' not authenticated")
# Collect files to process (limited by max_files)
files_to_process = []
page_token = None
# Calculate page size to minimize API calls
page_size = min(max_files or 100, 1000) if max_files else 100
while True:
# List files from connector with limit
logger.info(
"Calling list_files", page_size=page_size, page_token=page_token
)
file_list = await connector.list_files(page_token, max_files=page_size)
logger.info(
"Got files from connector", file_count=len(file_list.get("files", []))
)
files = file_list["files"]
if not files:
break
for file_info in files:
if max_files and len(files_to_process) >= max_files:
break
files_to_process.append(file_info)
# Stop if we have enough files or no more pages
if (max_files and len(files_to_process) >= max_files) or not file_list.get(
"nextPageToken"
):
break
page_token = file_list.get("nextPageToken")
# Get user information
user = self.session_manager.get_user(user_id) if self.session_manager else None
owner_name = user.name if user else None
@ -229,16 +262,19 @@ class ConnectorService:
processor = ConnectorFileProcessor(
self,
connection_id,
selected_files or [],
files_to_process,
user_id,
jwt_token=jwt_token,
owner_name=owner_name,
owner_email=owner_email,
)
# Use file IDs as items (no more fake file paths!)
file_ids = [file_info["id"] for file_info in files_to_process]
# Create custom task using TaskService
task_id = await self.task_service.create_custom_task(
user_id, selected_files, processor
user_id, file_ids, processor
)
return task_id