diff --git a/src/models/processors.py b/src/models/processors.py
index 9731adb7..8f84c3dc 100644
--- a/src/models/processors.py
+++ b/src/models/processors.py
@@ -197,10 +197,27 @@ class TaskProcessor:
             file_hash=file_hash,
         )
 
-        # Convert and extract
-        result = clients.converter.convert(file_path)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing instead of docling
+        import os
+        file_ext = os.path.splitext(file_path)[1].lower()
+
+        if file_ext == '.txt':
+            # Simple text file processing without docling
+            from utils.document_processing import process_text_file
+            logger.info(
+                "Processing as plain text file (bypassing docling)",
+                file_path=file_path,
+                file_hash=file_hash,
+            )
+            slim_doc = process_text_file(file_path)
+            # Override filename with original_filename if provided
+            if original_filename:
+                slim_doc["filename"] = original_filename
+        else:
+            # Convert and extract using docling for other file types
+            result = clients.converter.convert(file_path)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)
 
         texts = [c["text"] for c in slim_doc["chunks"]]
 
diff --git a/src/services/document_service.py b/src/services/document_service.py
index de1b3cf6..f40c3d82 100644
--- a/src/services/document_service.py
+++ b/src/services/document_service.py
@@ -181,6 +181,7 @@ class DocumentService:
     async def process_upload_context(self, upload_file, filename: str = None):
         """Process uploaded file and return content for context"""
         import io
+        import os
 
         if not filename:
             filename = upload_file.filename or "uploaded_document"
@@ -194,22 +195,37 @@ class DocumentService:
                 content.write(chunk)
         content.seek(0)  # Reset to beginning for reading
 
-        # Create DocumentStream and process with docling
-        doc_stream = DocumentStream(name=filename, stream=content)
-        result = clients.converter.convert(doc_stream)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing
+        file_ext = os.path.splitext(filename)[1].lower()
+
+        if file_ext == '.txt':
+            # Simple text file processing for chat context
+            text_content = content.read().decode('utf-8', errors='replace')
+
+            # For context, we don't need to chunk - just return the full content
+            return {
+                "filename": filename,
+                "content": text_content,
+                "pages": 1,  # Text files don't have pages
+                "content_length": len(text_content),
+            }
+        else:
+            # Create DocumentStream and process with docling
+            doc_stream = DocumentStream(name=filename, stream=content)
+            result = clients.converter.convert(doc_stream)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)
 
-        # Extract all text content
-        all_text = []
-        for chunk in slim_doc["chunks"]:
-            all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
+            # Extract all text content
+            all_text = []
+            for chunk in slim_doc["chunks"]:
+                all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
 
-        full_content = "\n\n".join(all_text)
+            full_content = "\n\n".join(all_text)
 
-        return {
-            "filename": filename,
-            "content": full_content,
-            "pages": len(slim_doc["chunks"]),
-            "content_length": len(full_content),
-        }
+            return {
+                "filename": filename,
+                "content": full_content,
+                "pages": len(slim_doc["chunks"]),
+                "content_length": len(full_content),
+            }
diff --git a/src/utils/document_processing.py b/src/utils/document_processing.py
index fcb458fb..9619cf74 100644
--- a/src/utils/document_processing.py
+++ b/src/utils/document_processing.py
@@ -119,6 +119,82 @@ def get_worker_converter():
     return _worker_converter
 
 
+def process_text_file(file_path: str) -> dict:
+    """
+    Process a plain text file without using docling.
+    Returns the same structure as extract_relevant() for consistency.
+
+    Args:
+        file_path: Path to the .txt file
+
+    Returns:
+        dict with keys: id, filename, mimetype, chunks
+    """
+    import os
+    from utils.hash_utils import hash_id
+
+    # Read the file
+    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+        content = f.read()
+
+    # Compute hash
+    file_hash = hash_id(file_path)
+    filename = os.path.basename(file_path)
+
+    # Split content into chunks of ~1000 characters to match typical docling chunk sizes
+    # This ensures embeddings stay within reasonable token limits
+    chunk_size = 1000
+    chunks = []
+
+    # Split by paragraphs first (double newline)
+    paragraphs = content.split('\n\n')
+    current_chunk = ""
+    chunk_index = 0
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # If adding this paragraph would exceed chunk size, save current chunk
+        if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
+            chunks.append({
+                "page": chunk_index + 1,  # Use chunk_index + 1 as "page" number
+                "type": "text",
+                "text": current_chunk.strip()
+            })
+            chunk_index += 1
+            current_chunk = para
+        else:
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+
+    # Add the last chunk if any
+    if current_chunk.strip():
+        chunks.append({
+            "page": chunk_index + 1,
+            "type": "text",
+            "text": current_chunk.strip()
+        })
+
+    # If no chunks were created (empty file), create a single empty chunk
+    if not chunks:
+        chunks.append({
+            "page": 1,
+            "type": "text",
+            "text": ""
+        })
+
+    return {
+        "id": file_hash,
+        "filename": filename,
+        "mimetype": "text/plain",
+        "chunks": chunks,
+    }
+
+
 def extract_relevant(doc_dict: dict) -> dict:
     """
     Given the full export_to_dict() result:
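
For reference, a minimal usage sketch of the new helper (not part of the diff; it assumes the module layout shown above, and "notes.txt" is a hypothetical input path):

    from utils.document_processing import process_text_file

    slim_doc = process_text_file("notes.txt")  # hypothetical .txt file on disk
    # The result mirrors extract_relevant(): id, filename, mimetype, and a list of chunks
    # whose "page" field is simply the 1-based chunk index for plain text.
    print(slim_doc["mimetype"])  # "text/plain"
    for chunk in slim_doc["chunks"]:
        print(chunk["page"], len(chunk["text"]))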