Merge branch 'main' into fix-markdown-upload
commit 88e57e944f
3 changed files with 129 additions and 20 deletions
@@ -197,10 +197,27 @@ class TaskProcessor:
             file_hash=file_hash,
         )

-        # Convert and extract
-        result = clients.converter.convert(file_path)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing instead of docling
+        import os
+        file_ext = os.path.splitext(file_path)[1].lower()
+
+        if file_ext == '.txt':
+            # Simple text file processing without docling
+            from utils.document_processing import process_text_file
+            logger.info(
+                "Processing as plain text file (bypassing docling)",
+                file_path=file_path,
+                file_hash=file_hash,
+            )
+            slim_doc = process_text_file(file_path)
+            # Override filename with original_filename if provided
+            if original_filename:
+                slim_doc["filename"] = original_filename
+        else:
+            # Convert and extract using docling for other file types
+            result = clients.converter.convert(file_path)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)

         texts = [c["text"] for c in slim_doc["chunks"]]

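The dispatch added above hinges entirely on the lowercased filename extension, not on MIME type or file contents. A minimal standalone sketch of that check; the helper name and sample paths are illustrative only, not part of the commit:

import os

def routes_to_plain_text(path: str) -> bool:
    # Mirrors the branch condition in the hunk above: only the
    # lowercased extension decides whether docling is bypassed.
    return os.path.splitext(path)[1].lower() == '.txt'

print(routes_to_plain_text("notes.TXT"))   # True  - check is case-insensitive
print(routes_to_plain_text("report.pdf"))  # False - still handled by docling
print(routes_to_plain_text("README"))      # False - no extension yields ''

Note that only '.txt' is special-cased here; other extensions, including '.md', still fall through to the docling branch.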
@@ -181,6 +181,7 @@ class DocumentService:
     async def process_upload_context(self, upload_file, filename: str = None):
         """Process uploaded file and return content for context"""
         import io
+        import os

         if not filename:
             filename = upload_file.filename or "uploaded_document"
@@ -194,22 +195,37 @@ class DocumentService:
             content.write(chunk)
         content.seek(0)  # Reset to beginning for reading

-        # Create DocumentStream and process with docling
-        doc_stream = DocumentStream(name=filename, stream=content)
-        result = clients.converter.convert(doc_stream)
-        full_doc = result.document.export_to_dict()
-        slim_doc = extract_relevant(full_doc)
+        # Check if this is a .txt file - use simple processing
+        file_ext = os.path.splitext(filename)[1].lower()
+
+        if file_ext == '.txt':
+            # Simple text file processing for chat context
+            text_content = content.read().decode('utf-8', errors='replace')
+
+            # For context, we don't need to chunk - just return the full content
+            return {
+                "filename": filename,
+                "content": text_content,
+                "pages": 1,  # Text files don't have pages
+                "content_length": len(text_content),
+            }
+        else:
+            # Create DocumentStream and process with docling
+            doc_stream = DocumentStream(name=filename, stream=content)
+            result = clients.converter.convert(doc_stream)
+            full_doc = result.document.export_to_dict()
+            slim_doc = extract_relevant(full_doc)

-        # Extract all text content
-        all_text = []
-        for chunk in slim_doc["chunks"]:
-            all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")
+            # Extract all text content
+            all_text = []
+            for chunk in slim_doc["chunks"]:
+                all_text.append(f"Page {chunk['page']}:\n{chunk['text']}")

-        full_content = "\n\n".join(all_text)
+            full_content = "\n\n".join(all_text)

-        return {
-            "filename": filename,
-            "content": full_content,
-            "pages": len(slim_doc["chunks"]),
-            "content_length": len(full_content),
-        }
+            return {
+                "filename": filename,
+                "content": full_content,
+                "pages": len(slim_doc["chunks"]),
+                "content_length": len(full_content),
+            }
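For upload context, the new '.txt' branch skips docling entirely: it decodes the buffered bytes (replacing invalid UTF-8 sequences) and reports a single page. A hedged standalone sketch of just that branch; build_txt_context and the sample buffer are illustrative names rather than code from the commit, and it assumes the stream has already been rewound with seek(0), as the surrounding method does:

import io

def build_txt_context(filename: str, content: io.BytesIO) -> dict:
    # Decode the raw upload bytes; invalid sequences become U+FFFD
    # instead of raising, matching errors='replace' in the diff.
    text_content = content.read().decode('utf-8', errors='replace')
    return {
        "filename": filename,
        "content": text_content,
        "pages": 1,  # plain text has no page structure
        "content_length": len(text_content),
    }

buf = io.BytesIO(b"hello\nworld \xff")  # \xff is not valid UTF-8
print(build_txt_context("notes.txt", buf))
# -> pages 1, content_length 13, content ends with the U+FFFD replacement character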
@@ -119,6 +119,82 @@ def get_worker_converter():
     return _worker_converter


+def process_text_file(file_path: str) -> dict:
+    """
+    Process a plain text file without using docling.
+    Returns the same structure as extract_relevant() for consistency.
+
+    Args:
+        file_path: Path to the .txt file
+
+    Returns:
+        dict with keys: id, filename, mimetype, chunks
+    """
+    import os
+    from utils.hash_utils import hash_id
+
+    # Read the file
+    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+        content = f.read()
+
+    # Compute hash
+    file_hash = hash_id(file_path)
+    filename = os.path.basename(file_path)
+
+    # Split content into chunks of ~1000 characters to match typical docling chunk sizes
+    # This ensures embeddings stay within reasonable token limits
+    chunk_size = 1000
+    chunks = []
+
+    # Split by paragraphs first (double newline)
+    paragraphs = content.split('\n\n')
+    current_chunk = ""
+    chunk_index = 0
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # If adding this paragraph would exceed chunk size, save current chunk
+        if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
+            chunks.append({
+                "page": chunk_index + 1,  # Use chunk_index + 1 as "page" number
+                "type": "text",
+                "text": current_chunk.strip()
+            })
+            chunk_index += 1
+            current_chunk = para
+        else:
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+
+    # Add the last chunk if any
+    if current_chunk.strip():
+        chunks.append({
+            "page": chunk_index + 1,
+            "type": "text",
+            "text": current_chunk.strip()
+        })
+
+    # If no chunks were created (empty file), create a single empty chunk
+    if not chunks:
+        chunks.append({
+            "page": 1,
+            "type": "text",
+            "text": ""
+        })
+
+    return {
+        "id": file_hash,
+        "filename": filename,
+        "mimetype": "text/plain",
+        "chunks": chunks,
+    }
+
+
 def extract_relevant(doc_dict: dict) -> dict:
     """
     Given the full export_to_dict() result:
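To experiment with the chunking behaviour of process_text_file outside the repo (the real function imports hash_id from utils.hash_utils), here is a simplified standalone sketch of just the paragraph-packing loop. pack_paragraphs and the sample text are illustrative, not part of the commit; like the original, a single paragraph longer than chunk_size still ends up as one oversized chunk.

def pack_paragraphs(content: str, chunk_size: int = 1000) -> list[dict]:
    # Greedy packing: append paragraphs to the current chunk until the next
    # one would push it past chunk_size, then start a new chunk.
    chunks = []
    current = ""
    index = 0
    for para in content.split('\n\n'):
        para = para.strip()
        if not para:
            continue
        if len(current) + len(para) + 2 > chunk_size and current:
            chunks.append({"page": index + 1, "type": "text", "text": current.strip()})
            index += 1
            current = para
        else:
            current = f"{current}\n\n{para}" if current else para
    if current.strip():
        chunks.append({"page": index + 1, "type": "text", "text": current.strip()})
    return chunks or [{"page": 1, "type": "text", "text": ""}]


sample = ("alpha " * 80 + "\n\n") * 5  # five paragraphs of ~480 characters each
for chunk in pack_paragraphs(sample, chunk_size=1000):
    print(chunk["page"], len(chunk["text"]))
# 1 960
# 2 960
# 3 479

Each entry mirrors the {page, type, text} shape the worker later embeds, so the printout is a quick way to confirm that chunks stay near the ~1000-character target.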