fix: Support for txt file processing outside of Docling

Eric Hare 2025-12-08 09:09:05 -08:00
parent f3ab58853f
commit a467e8a9b6
3 changed files with 129 additions and 20 deletions


@@ -197,10 +197,27 @@ class TaskProcessor:
             file_hash=file_hash,
         )

-        # Convert and extract
-        result = clients.converter.convert(file_path)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing instead of docling
+        import os
+
+        file_ext = os.path.splitext(file_path)[1].lower()
+        if file_ext == '.txt':
+            # Simple text file processing without docling
+            from utils.document_processing import process_text_file
+
+            logger.info(
+                "Processing as plain text file (bypassing docling)",
+                file_path=file_path,
+                file_hash=file_hash,
+            )
+            slim_doc = process_text_file(file_path)
+            # Override filename with original_filename if provided
+            if original_filename:
+                slim_doc["filename"] = original_filename
+        else:
+            # Convert and extract using docling for other file types
+            result = clients.converter.convert(file_path)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)

         texts = [c["text"] for c in slim_doc["chunks"]]
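
For reference, a minimal standalone sketch of the extension dispatch used above; the filenames are invented for illustration:

import os

for name in ["notes.txt", "NOTES.TXT", "report.pdf", "README"]:
    ext = os.path.splitext(name)[1].lower()
    route = "plain text" if ext == ".txt" else "docling"
    print(f"{name!r} -> {route}")
# 'notes.txt' -> plain text
# 'NOTES.TXT' -> plain text  (the .lower() makes the check case-insensitive)
# 'report.pdf' -> docling
# 'README' -> docling        (os.path.splitext() returns '' for no extension)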


@@ -181,6 +181,7 @@ class DocumentService:
     async def process_upload_context(self, upload_file, filename: str = None):
         """Process uploaded file and return content for context"""
         import io
+        import os

         if not filename:
             filename = upload_file.filename or "uploaded_document"
@@ -194,22 +195,37 @@ class DocumentService:
             content.write(chunk)
         content.seek(0)  # Reset to beginning for reading

-        # Create DocumentStream and process with docling
-        doc_stream = DocumentStream(name=filename, stream=content)
-        result = clients.converter.convert(doc_stream)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing
+        file_ext = os.path.splitext(filename)[1].lower()
+        if file_ext == '.txt':
+            # Simple text file processing for chat context
+            text_content = content.read().decode('utf-8', errors='replace')
+            # For context, we don't need to chunk - just return the full content
+            return {
+                "filename": filename,
+                "content": text_content,
+                "pages": 1,  # Text files don't have pages
+                "content_length": len(text_content),
+            }
+        else:
+            # Create DocumentStream and process with docling
+            doc_stream = DocumentStream(name=filename, stream=content)
+            result = clients.converter.convert(doc_stream)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)

-        # Extract all text content
-        all_text = []
-        for chunk in slim_doc["chunks"]:
-            all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
+            # Extract all text content
+            all_text = []
+            for chunk in slim_doc["chunks"]:
+                all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")

-        full_content = "\n\n".join(all_text)
+            full_content = "\n\n".join(all_text)

-        return {
-            "filename": filename,
-            "content": full_content,
-            "pages": len(slim_doc["chunks"]),
-            "content_length": len(full_content),
-        }
+            return {
+                "filename": filename,
+                "content": full_content,
+                "pages": len(slim_doc["chunks"]),
+                "content_length": len(full_content),
+            }
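
One behavior worth noting in the '.txt' branch: decoding with errors='replace' turns malformed bytes into U+FFFD replacement characters instead of raising UnicodeDecodeError and failing the upload. A small standalone sketch (the byte string is invented):

raw = b"caf\xe9 latte"  # 0xE9 is Latin-1 e-acute, not valid UTF-8 here
text = raw.decode("utf-8", errors="replace")
print(text)             # caf� latte - the stray byte becomes U+FFFD
print(len(text))        # 10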


@@ -119,6 +119,82 @@ def get_worker_converter():
     return _worker_converter


+def process_text_file(file_path: str) -> dict:
+    """
+    Process a plain text file without using docling.
+
+    Returns the same structure as extract_relevant() for consistency.
+
+    Args:
+        file_path: Path to the .txt file
+
+    Returns:
+        dict with keys: id, filename, mimetype, chunks
+    """
+    import os
+    from utils.hash_utils import hash_id
+
+    # Read the file
+    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+        content = f.read()
+
+    # Compute hash
+    file_hash = hash_id(file_path)
+    filename = os.path.basename(file_path)
+
+    # Split content into chunks of ~1000 characters to match typical docling
+    # chunk sizes. This ensures embeddings stay within reasonable token limits.
+    chunk_size = 1000
+    chunks = []
+
+    # Split by paragraphs first (double newline)
+    paragraphs = content.split('\n\n')
+
+    current_chunk = ""
+    chunk_index = 0
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # If adding this paragraph would exceed chunk size, save current chunk
+        if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
+            chunks.append({
+                "page": chunk_index + 1,  # Use chunk_index + 1 as "page" number
+                "type": "text",
+                "text": current_chunk.strip()
+            })
+            chunk_index += 1
+            current_chunk = para
+        else:
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+
+    # Add the last chunk if any
+    if current_chunk.strip():
+        chunks.append({
+            "page": chunk_index + 1,
+            "type": "text",
+            "text": current_chunk.strip()
+        })
+
+    # If no chunks were created (empty file), create a single empty chunk
+    if not chunks:
+        chunks.append({
+            "page": 1,
+            "type": "text",
+            "text": ""
+        })
+
+    return {
+        "id": file_hash,
+        "filename": filename,
+        "mimetype": "text/plain",
+        "chunks": chunks,
+    }
+
+
 def extract_relevant(doc_dict: dict) -> dict:
     """
     Given the full export_to_dict() result:
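
A usage sketch for the new helper; the temp file and its contents are invented for illustration, and the import assumes the module path shown in the diff. Note that a paragraph longer than chunk_size is kept whole as one oversized chunk rather than being split mid-paragraph:

import os
import tempfile

from utils.document_processing import process_text_file

# Three paragraphs separated by blank lines; the middle one exceeds chunk_size.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("First paragraph.\n\n" + "x" * 1200 + "\n\nLast paragraph.")
    path = f.name

doc = process_text_file(path)
print(doc["mimetype"])     # text/plain
print(len(doc["chunks"]))  # 3 - each paragraph becomes its own chunk here
for chunk in doc["chunks"]:
    print(chunk["page"], chunk["type"], len(chunk["text"]))
# 1 text 16
# 2 text 1200  (the oversized paragraph is not split further)
# 3 text 15

os.unlink(path)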