# openrag/src/connectors/service.py

from typing import Any, Dict, List, Optional

from utils.logging_config import get_logger

from .base import BaseConnector, ConnectorDocument
from .connection_manager import ConnectionManager

logger = get_logger(__name__)


class ConnectorService:
    """Service to manage document connectors and process files"""

    def __init__(
        self,
        patched_async_client,
        process_pool,
        embed_model: str,
        index_name: str,
        task_service=None,
        session_manager=None,
    ):
        self.clients = patched_async_client  # Store the clients object to access the property
        self.process_pool = process_pool
        self.embed_model = embed_model
        self.index_name = index_name
        self.task_service = task_service
        self.session_manager = session_manager
        self.connection_manager = ConnectionManager()

    async def initialize(self):
        """Initialize the service by loading existing connections"""
        await self.connection_manager.load_connections()

    async def get_connector(self, connection_id: str) -> Optional[BaseConnector]:
        """Get a connector by connection ID"""
        return await self.connection_manager.get_connector(connection_id)
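
    # A minimal usage sketch (not part of this module): how the service might be
    # wired up at application startup. The client, pool, and index values below
    # are hypothetical placeholders, not names defined elsewhere in this file.
    #
    #   service = ConnectorService(
    #       patched_async_client=opensearch_clients,   # assumed client wrapper
    #       process_pool=process_pool,                 # e.g. a ProcessPoolExecutor
    #       embed_model="example-embed-model",         # assumed model name
    #       index_name="documents",                    # assumed index name
    #       task_service=task_service,
    #       session_manager=session_manager,
    #   )
    #   await service.initialize()
    #   connector = await service.get_connector("conn-123")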

    async def process_connector_document(
        self,
        document: ConnectorDocument,
        owner_user_id: str,
        connector_type: str,
        jwt_token: str = None,
        owner_name: str = None,
        owner_email: str = None,
    ) -> Dict[str, Any]:
        """Process a document from a connector using existing processing pipeline"""
        # Create temporary file from document content
        from utils.file_utils import auto_cleanup_tempfile

        with auto_cleanup_tempfile(
            suffix=self._get_file_extension(document.mimetype)
        ) as tmp_path:
            # Write document content to temp file
            with open(tmp_path, "wb") as f:
                f.write(document.content)

            # Use existing process_file_common function with connector document metadata
            # We'll use the document service's process_file_common method
            from services.document_service import DocumentService

            doc_service = DocumentService(session_manager=self.session_manager)

            logger.debug("Processing connector document", document_id=document.id)

            # Process using consolidated processing pipeline
            from models.processors import TaskProcessor

            processor = TaskProcessor(document_service=doc_service)
            result = await processor.process_document_standard(
                file_path=tmp_path,
                file_hash=document.id,  # Use connector document ID as hash
                owner_user_id=owner_user_id,
                original_filename=document.filename,  # Pass the original Google Doc title
                jwt_token=jwt_token,
                owner_name=owner_name,
                owner_email=owner_email,
                file_size=len(document.content) if document.content else 0,
                connector_type=connector_type,
            )

            logger.debug("Document processing result", result=result)

            # If successfully indexed or already exists, update the indexed documents with connector metadata
            if result["status"] in ["indexed", "unchanged"]:
                # Update all chunks with connector-specific metadata
                await self._update_connector_metadata(
                    document, owner_user_id, connector_type, jwt_token
                )

            return {
                **result,
                "filename": document.filename,
                "source_url": document.source_url,
            }
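
    # A hedged example of how a caller might feed a fetched file through this
    # method. The ConnectorDocument fields shown are inferred from how they are
    # used above; the literal values and keyword names are illustrative only.
    #
    #   doc = ConnectorDocument(
    #       id="gdrive-file-id",
    #       filename="Quarterly Report",
    #       mimetype="application/pdf",
    #       content=pdf_bytes,
    #       source_url="https://example.invalid/file",
    #       acl=acl, created_time=created, modified_time=modified, metadata={},
    #   )
    #   result = await service.process_connector_document(
    #       doc, owner_user_id="user-1", connector_type="google_drive",
    #   )
    #   # result carries the pipeline status plus "filename" and "source_url"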

    async def _update_connector_metadata(
        self,
        document: ConnectorDocument,
        owner_user_id: str,
        connector_type: str,
        jwt_token: str = None,
    ):
        """Update indexed chunks with connector-specific metadata"""
        logger.debug("Looking for chunks", document_id=document.id)

        # Find all chunks for this document
        query = {"query": {"term": {"document_id": document.id}}}

        # Get user's OpenSearch client
        opensearch_client = self.session_manager.get_user_opensearch_client(
            owner_user_id, jwt_token
        )

        try:
            response = await opensearch_client.search(index=self.index_name, body=query)
        except Exception as e:
            logger.error(
                "OpenSearch search failed for connector metadata update",
                error=str(e),
                query=query,
            )
            raise

        logger.debug(
            "Search query executed",
            query=query,
            chunks_found=len(response["hits"]["hits"]),
            document_id=document.id,
        )

        # Update each chunk with connector metadata
        logger.debug(
            "Updating chunks with connector_type",
            chunk_count=len(response["hits"]["hits"]),
            connector_type=connector_type,
        )
        for hit in response["hits"]["hits"]:
            chunk_id = hit["_id"]
            current_connector_type = hit["_source"].get("connector_type", "unknown")
            logger.debug(
                "Updating chunk connector metadata",
                chunk_id=chunk_id,
                current_connector_type=current_connector_type,
                new_connector_type=connector_type,
            )

            update_body = {
                "doc": {
                    "source_url": document.source_url,
                    "connector_type": connector_type,  # Override the "local" set by process_file_common
                    # Additional ACL info beyond owner (already set by process_file_common)
                    "allowed_users": document.acl.allowed_users,
                    "allowed_groups": document.acl.allowed_groups,
                    "user_permissions": document.acl.user_permissions,
                    "group_permissions": document.acl.group_permissions,
                    # Timestamps
                    "created_time": document.created_time.isoformat()
                    if document.created_time
                    else None,
                    "modified_time": document.modified_time.isoformat()
                    if document.modified_time
                    else None,
                    # Additional metadata
                    "metadata": document.metadata,
                }
            }

            try:
                await opensearch_client.update(
                    index=self.index_name, id=chunk_id, body=update_body
                )
                logger.debug("Updated chunk with connector metadata", chunk_id=chunk_id)
            except Exception as e:
                logger.error(
                    "OpenSearch update failed for chunk",
                    chunk_id=chunk_id,
                    error=str(e),
                    update_body=update_body,
                )
                raise

    def _get_file_extension(self, mimetype: str) -> str:
        """Get file extension based on MIME type"""
        mime_to_ext = {
            "application/pdf": ".pdf",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
            "application/msword": ".doc",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
            "application/vnd.ms-powerpoint": ".ppt",
            "text/plain": ".txt",
            "text/html": ".html",
            "application/rtf": ".rtf",
            "application/vnd.google-apps.document": ".pdf",  # Exported as PDF
            "application/vnd.google-apps.presentation": ".pdf",
            "application/vnd.google-apps.spreadsheet": ".pdf",
        }
        return mime_to_ext.get(mimetype, ".bin")
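
    # Illustrative lookups (not executed here): Google Workspace types map to the
    # ".pdf" they are exported as, and unknown MIME types fall back to ".bin".
    #
    #   self._get_file_extension("application/vnd.google-apps.document")  # -> ".pdf"
    #   self._get_file_extension("image/png")                             # -> ".bin"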

    async def sync_connector_files(
        self,
        connection_id: str,
        user_id: str,
        max_files: int = None,
        jwt_token: str = None,
    ) -> str:
        """Sync files from a connector connection using existing task tracking system"""
        if not self.task_service:
            raise ValueError(
                "TaskService not available - connector sync requires task service dependency"
            )

        logger.debug(
            "Starting sync for connection",
            connection_id=connection_id,
            max_files=max_files,
        )

        connector = await self.get_connector(connection_id)
        if not connector:
            raise ValueError(
                f"Connection '{connection_id}' not found or not authenticated"
            )

        logger.debug("Got connector", authenticated=connector.is_authenticated)
        if not connector.is_authenticated:
            raise ValueError(f"Connection '{connection_id}' not authenticated")

        # Collect files to process (limited by max_files)
        files_to_process = []
        page_token = None

        # Calculate page size to minimize API calls
        page_size = min(max_files or 100, 1000) if max_files else 100

        while True:
            # List files from connector with limit
            logger.debug(
                "Calling list_files", page_size=page_size, page_token=page_token
            )
            file_list = await connector.list_files(page_token, limit=page_size)
            logger.debug(
                "Got files from connector", file_count=len(file_list.get("files", []))
            )

            files = file_list["files"]
            if not files:
                break

            for file_info in files:
                if max_files and len(files_to_process) >= max_files:
                    break
                files_to_process.append(file_info)

            # Stop if we have enough files or no more pages
            if (max_files and len(files_to_process) >= max_files) or not file_list.get(
                "nextPageToken"
            ):
                break

            page_token = file_list.get("nextPageToken")

        # Get user information
        user = self.session_manager.get_user(user_id) if self.session_manager else None
        owner_name = user.name if user else None
        owner_email = user.email if user else None

        # Create custom processor for connector files
        from models.processors import ConnectorFileProcessor
        from services.document_service import DocumentService

        processor = ConnectorFileProcessor(
            self,
            connection_id,
            files_to_process,
            user_id,
            jwt_token=jwt_token,
            owner_name=owner_name,
            owner_email=owner_email,
            document_service=(
                self.task_service.document_service
                if self.task_service and self.task_service.document_service
                else DocumentService(session_manager=self.session_manager)
            ),
        )

        # Use file IDs as items (no more fake file paths!)
        file_ids = [file_info["id"] for file_info in files_to_process]

        # Create custom task using TaskService
        task_id = await self.task_service.create_custom_task(
            user_id, file_ids, processor
        )

        return task_id
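
    # A hedged sketch of how a caller (e.g. a route handler) might start a bounded
    # sync. The connection ID, user object, and JWT variable are hypothetical; only
    # sync_connector_files and its parameters come from this module.
    #
    #   task_id = await connector_service.sync_connector_files(
    #       connection_id="conn-123",
    #       user_id=current_user.id,
    #       max_files=50,
    #       jwt_token=request_jwt,
    #   )
    #   # Progress can then be tracked through the existing task system via task_id.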

    async def sync_specific_files(
        self,
        connection_id: str,
        user_id: str,
        file_ids: List[str],
        jwt_token: str = None,
    ) -> str:
        """
        Sync specific files by their IDs (used for webhook-triggered syncs or manual selection).
        Automatically expands folders to their contents.
        """
        if not self.task_service:
            raise ValueError(
                "TaskService not available - connector sync requires task service dependency"
            )

        connector = await self.get_connector(connection_id)
        if not connector:
            raise ValueError(
                f"Connection '{connection_id}' not found or not authenticated"
            )

        if not connector.is_authenticated:
            raise ValueError(f"Connection '{connection_id}' not authenticated")

        if not file_ids:
            raise ValueError("No file IDs provided")

        # Get user information
        user = self.session_manager.get_user(user_id) if self.session_manager else None
        owner_name = user.name if user else None
        owner_email = user.email if user else None

        # Temporarily set file_ids in the connector's config so list_files() can use them
        # Store the original values to restore later
        original_file_ids = None
        original_folder_ids = None
        if hasattr(connector, "cfg"):
            original_file_ids = getattr(connector.cfg, "file_ids", None)
            original_folder_ids = getattr(connector.cfg, "folder_ids", None)

        try:
            # Set the file_ids we want to sync in the connector's config
            if hasattr(connector, "cfg"):
                connector.cfg.file_ids = file_ids  # type: ignore
                connector.cfg.folder_ids = None  # type: ignore

            # Get the expanded list of file IDs (folders will be expanded to their contents)
            # This uses the connector's list_files() which calls _iter_selected_items()
            result = await connector.list_files()
            expanded_file_ids = [f["id"] for f in result.get("files", [])]

            if not expanded_file_ids:
                logger.warning(
                    f"No files found after expanding file_ids. "
                    f"Original IDs: {file_ids}. This may indicate all IDs were folders "
                    f"with no contents, or files that were filtered out."
                )
                # Return empty task rather than failing
                raise ValueError("No files to sync after expanding folders")
        except Exception as e:
            logger.error(f"Failed to expand file_ids via list_files(): {e}")
            # Fallback to original file_ids if expansion fails
            expanded_file_ids = file_ids
        finally:
            # Restore original config values
            if hasattr(connector, "cfg"):
                connector.cfg.file_ids = original_file_ids  # type: ignore
                connector.cfg.folder_ids = original_folder_ids  # type: ignore

        # Create custom processor for specific connector files
        from models.processors import ConnectorFileProcessor
        from services.document_service import DocumentService

        # Use expanded_file_ids which has folders already expanded
        processor = ConnectorFileProcessor(
            self,
            connection_id,
            expanded_file_ids,
            user_id,
            jwt_token=jwt_token,
            owner_name=owner_name,
            owner_email=owner_email,
            document_service=(
                self.task_service.document_service
                if self.task_service and self.task_service.document_service
                else DocumentService(session_manager=self.session_manager)
            ),
        )

        # Create custom task using TaskService
        task_id = await self.task_service.create_custom_task(
            user_id, expanded_file_ids, processor
        )

        return task_id
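
    # A hedged sketch of a webhook-style call into this method. The IDs shown are
    # placeholders; folder IDs are acceptable as well, since list_files() expands
    # them to their contents before the task is created.
    #
    #   task_id = await connector_service.sync_specific_files(
    #       connection_id="conn-123",
    #       user_id=current_user.id,
    #       file_ids=["file-abc", "folder-xyz"],
    #       jwt_token=request_jwt,
    #   )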

    async def _get_connector(self, connection_id: str) -> Optional[BaseConnector]:
        """Get a connector by connection ID (alias for get_connector)"""
        return await self.get_connector(connection_id)