fix: Support for txt file processing outside of Docling

Eric Hare 2025-12-08 09:09:05 -08:00
parent f3ab58853f
commit a467e8a9b6
3 changed files with 129 additions and 20 deletions


@@ -197,10 +197,27 @@ class TaskProcessor:
             file_hash=file_hash,
         )

-        # Convert and extract
-        result = clients.converter.convert(file_path)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing instead of docling
+        import os
+
+        file_ext = os.path.splitext(file_path)[1].lower()
+        if file_ext == '.txt':
+            # Simple text file processing without docling
+            from utils.document_processing import process_text_file
+
+            logger.info(
+                "Processing as plain text file (bypassing docling)",
+                file_path=file_path,
+                file_hash=file_hash,
+            )
+            slim_doc = process_text_file(file_path)
+            # Override filename with original_filename if provided
+            if original_filename:
+                slim_doc["filename"] = original_filename
+        else:
+            # Convert and extract using docling for other file types
+            result = clients.converter.convert(file_path)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)

         texts = [c["text"] for c in slim_doc["chunks"]]
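
For reference, a minimal standalone sketch of the extension dispatch used above; the filenames are invented for illustration:

import os

for name in ["notes.txt", "NOTES.TXT", "report.pdf", "README"]:
    ext = os.path.splitext(name)[1].lower()
    route = "plain text" if ext == ".txt" else "docling"
    print(f"{name!r} -> {route}")
# 'notes.txt' -> plain text
# 'NOTES.TXT' -> plain text  (the .lower() makes the check case-insensitive)
# 'report.pdf' -> docling
# 'README' -> docling        (os.path.splitext() returns '' for no extension)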


@@ -181,6 +181,7 @@ class DocumentService:
     async def process_upload_context(self, upload_file, filename: str = None):
         """Process uploaded file and return content for context"""
         import io
+        import os

         if not filename:
             filename = upload_file.filename or "uploaded_document"
@@ -194,22 +195,37 @@ class DocumentService:
             content.write(chunk)
         content.seek(0)  # Reset to beginning for reading

-        # Create DocumentStream and process with docling
-        doc_stream = DocumentStream(name=filename, stream=content)
-        result = clients.converter.convert(doc_stream)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing
+        file_ext = os.path.splitext(filename)[1].lower()
+        if file_ext == '.txt':
+            # Simple text file processing for chat context
+            text_content = content.read().decode('utf-8', errors='replace')
+            # For context, we don't need to chunk - just return the full content
+            return {
+                "filename": filename,
+                "content": text_content,
+                "pages": 1,  # Text files don't have pages
+                "content_length": len(text_content),
+            }
+        else:
+            # Create DocumentStream and process with docling
+            doc_stream = DocumentStream(name=filename, stream=content)
+            result = clients.converter.convert(doc_stream)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)

-        # Extract all text content
-        all_text = []
-        for chunk in slim_doc["chunks"]:
-            all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
+            # Extract all text content
+            all_text = []
+            for chunk in slim_doc["chunks"]:
+                all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")

-        full_content = "\n\n".join(all_text)
+            full_content = "\n\n".join(all_text)

-        return {
-            "filename": filename,
-            "content": full_content,
-            "pages": len(slim_doc["chunks"]),
-            "content_length": len(full_content),
-        }
+            return {
+                "filename": filename,
+                "content": full_content,
+                "pages": len(slim_doc["chunks"]),
+                "content_length": len(full_content),
+            }
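
One behavior worth noting in the '.txt' branch: decoding with errors='replace' turns malformed bytes into U+FFFD replacement characters instead of raising UnicodeDecodeError and failing the upload. A small standalone sketch (the byte string is invented):

raw = b"caf\xe9 latte"  # 0xE9 is Latin-1 e-acute, not valid UTF-8 here
text = raw.decode("utf-8", errors="replace")
print(text)             # caf� latte - the stray byte becomes U+FFFD
print(len(text))        # 10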


@@ -119,6 +119,82 @@ def get_worker_converter():
     return _worker_converter


+def process_text_file(file_path: str) -> dict:
+    """
+    Process a plain text file without using docling.
+
+    Returns the same structure as extract_relevant() for consistency.
+
+    Args:
+        file_path: Path to the .txt file
+
+    Returns:
+        dict with keys: id, filename, mimetype, chunks
+    """
+    import os
+    from utils.hash_utils import hash_id
+
+    # Read the file
+    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+        content = f.read()
+
+    # Compute hash
+    file_hash = hash_id(file_path)
+    filename = os.path.basename(file_path)
+
+    # Split content into chunks of ~1000 characters to match typical docling
+    # chunk sizes. This ensures embeddings stay within reasonable token limits.
+    chunk_size = 1000
+    chunks = []
+
+    # Split by paragraphs first (double newline)
+    paragraphs = content.split('\n\n')
+
+    current_chunk = ""
+    chunk_index = 0
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # If adding this paragraph would exceed chunk size, save current chunk
+        if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
+            chunks.append({
+                "page": chunk_index + 1,  # Use chunk_index + 1 as "page" number
+                "type": "text",
+                "text": current_chunk.strip()
+            })
+            chunk_index += 1
+            current_chunk = para
+        else:
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+
+    # Add the last chunk if any
+    if current_chunk.strip():
+        chunks.append({
+            "page": chunk_index + 1,
+            "type": "text",
+            "text": current_chunk.strip()
+        })
+
+    # If no chunks were created (empty file), create a single empty chunk
+    if not chunks:
+        chunks.append({
+            "page": 1,
+            "type": "text",
+            "text": ""
+        })
+
+    return {
+        "id": file_hash,
+        "filename": filename,
+        "mimetype": "text/plain",
+        "chunks": chunks,
+    }
+
+
 def extract_relevant(doc_dict: dict) -> dict:
     """
     Given the full export_to_dict() result:
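
A usage sketch for the new helper; the temp file and its contents are invented for illustration, and the import assumes the module path shown in the diff. Note that a paragraph longer than chunk_size is kept whole as one oversized chunk rather than being split mid-paragraph:

import os
import tempfile

from utils.document_processing import process_text_file

# Three paragraphs separated by blank lines; the middle one exceeds chunk_size.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("First paragraph.\n\n" + "x" * 1200 + "\n\nLast paragraph.")
    path = f.name

doc = process_text_file(path)
print(doc["mimetype"])     # text/plain
print(len(doc["chunks"]))  # 3 - each paragraph becomes its own chunk here
for chunk in doc["chunks"]:
    print(chunk["page"], chunk["type"], len(chunk["text"]))
# 1 text 16
# 2 text 1200  (the oversized paragraph is not split further)
# 3 text 15

os.unlink(path)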