Add comprehensive documentation covering 6 modules: - 01-API-LAYER: Authentication, routing, SSE streaming - 02-SERVICE-LAYER: Dialog, Task, LLM service analysis - 03-RAG-ENGINE: Hybrid search, embedding, reranking - 04-AGENT-SYSTEM: Canvas engine, components, tools - 05-DOCUMENT-PROCESSING: Task executor, PDF parsing - 06-ALGORITHMS: BM25, fusion, RAPTOR Total 28 documentation files with code analysis, diagrams, and formulas.
14 KiB
14 KiB
Task Service Analysis - Background Job Queue
Tổng Quan
task_service.py (20KB) quản lý background task queue cho document processing, sử dụng Redis làm message broker.
File Location
/api/db/services/task_service.py
Task Model
class Task(DataBaseModel):
id = CharField(primary_key=True, max_length=32)
doc_id = CharField(max_length=32, index=True)
# Page range for chunked processing
from_page = IntegerField(default=0)
to_page = IntegerField(default=-1) # -1 = all pages
# Task type
task_type = CharField(max_length=32, default="")
# Types: "", "graphrag", "raptor", "mindmap", "dataflow"
# Priority (lower = higher priority)
priority = IntegerField(default=0)
# Timing
begin_at = DateTimeField(null=True)
process_duration = FloatField(default=0)
# Progress tracking
progress = FloatField(default=0) # 0-1, -1 = failed
progress_msg = TextField(default="")
# Retry handling
retry_count = IntegerField(default=0)
# Config digest for chunk reuse
digest = TextField(default="")
# Result: space-separated chunk IDs
chunk_ids = LongTextField(default="")
Task Lifecycle
┌─────────────────────────────────────────────────────────────────────────┐
│ TASK LIFECYCLE │
└─────────────────────────────────────────────────────────────────────────┘
[1] TASK CREATION (queue_tasks)
│
├─── Calculate page ranges
├─── Create Task records in MySQL
├─── Calculate config digest (xxhash)
├─── Check for chunk reuse from previous tasks
└─── Push to Redis queue
│
▼
[2] TASK PENDING
│
│ Status: progress=0, retry_count=0
│ Location: Redis queue
│
│
▼
[3] TASK PICKED UP (get_task)
│
├─── Consumer pulls from Redis
├─── Increment retry_count
├─── Load full task context (doc, kb, tenant)
└─── Return task dict or None if max retries exceeded
│
▼
[4] TASK PROCESSING
│
│ Status: progress=0.x, begin_at=now
│ Worker: task_executor.py
│
├─── Download file from MinIO
├─── Parse document
├─── Generate chunks
├─── Create embeddings
├─── Store in Elasticsearch
└─── Update progress periodically
│
▼
[5] TASK COMPLETED
│
│ Status: progress=1.0 (success) or -1 (failed)
│ Stored: chunk_ids in Task record
│
└─── Document.progress updated via _sync_progress()
Core Methods
queue_tasks() - Task Creation
def queue_tasks(doc: dict, bucket: str, name: str, priority: int = 0):
"""
Create and queue processing tasks for a document.
Args:
doc: Document dict with id, kb_id, parser_config
bucket: Storage bucket name
name: File name
priority: Task priority (0 = normal)
Process:
1. Calculate page/row ranges based on file type
2. Create Task records
3. Check for chunk reuse
4. Push to Redis queue
"""
# Get file from storage
blob = STORAGE_IMPL.get(bucket, name)
# Determine splitting strategy based on file type
if is_pdf(name):
# PDF: Split by pages
pages = get_pdf_page_count(blob)
page_size = doc["parser_config"].get("task_page_size", 12)
tasks = []
for start in range(0, pages, page_size):
end = min(start + page_size, pages)
tasks.append({
"id": get_uuid(),
"doc_id": doc["id"],
"from_page": start,
"to_page": end,
"digest": calculate_digest(doc, start, end)
})
elif is_excel(name) and doc["parser_id"] == "table":
# Excel: Split by rows
row_count = get_excel_row_count(blob)
row_size = 3000
tasks = []
for start in range(0, row_count, row_size):
end = min(start + row_size, row_count)
tasks.append({
"id": get_uuid(),
"doc_id": doc["id"],
"from_page": start,
"to_page": end,
"digest": calculate_digest(doc, start, end)
})
else:
# Other files: Single task
tasks = [{
"id": get_uuid(),
"doc_id": doc["id"],
"from_page": 0,
"to_page": -1,
"digest": calculate_digest(doc, 0, -1)
}]
# Check for chunk reuse from previous tasks
prev_tasks = Task.select().where(Task.doc_id == doc["id"])
for task in tasks:
reused = reuse_prev_task_chunks(task, prev_tasks, doc["parser_config"])
if reused:
task["progress"] = 1.0 # Mark as done
# Bulk insert tasks
bulk_insert_into_db(Task, tasks, ignore_conflicts=True)
# Queue unfinished tasks to Redis
for task in tasks:
if task["progress"] < 1.0:
REDIS_CONN.queue_product(
get_queue_name(doc["tenant_id"]),
task
)
get_task() - Task Retrieval
@classmethod
@DB.connection_context()
def get_task(cls, task_id: str, doc_ids: list = []):
"""
Retrieve task with full context for processing.
Args:
task_id: Task ID
doc_ids: Optional filter by document IDs
Returns:
Task dict with document, KB, and tenant info
None if task should be abandoned (max retries)
"""
# Increment retry count
Task.update(
retry_count=Task.retry_count + 1
).where(Task.id == task_id).execute()
# Check max retries
task = Task.get_by_id(task_id)
if task.retry_count >= 3:
# Abandon task
Task.update(
progress=-1,
progress_msg="Task abandoned after 3 retries"
).where(Task.id == task_id).execute()
return None
# Build full context query
query = (Task
.select(Task, Document, Knowledgebase, Tenant)
.join(Document, on=(Task.doc_id == Document.id))
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id))
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(Task.id == task_id))
if doc_ids:
query = query.where(Task.doc_id.in_(doc_ids))
result = list(query.dicts())
return result[0] if result else None
update_progress() - Safe Progress Update
@classmethod
@DB.connection_context()
def update_progress(cls, id: str, info: dict):
"""
Safely update task progress.
Safety rules:
- progress_msg: Always append (not replace)
- progress: Only update if new > current (or new == -1 for failure)
Args:
id: Task ID
info: Dict with progress and/or progress_msg
"""
# Get current state
task = Task.get_by_id(id)
# Handle progress_msg (append)
if "progress_msg" in info:
# Append new message
current_msg = task.progress_msg or ""
new_msg = info["progress_msg"]
combined = current_msg + new_msg
# Trim to 3000 lines if needed
lines = combined.split("\n")
if len(lines) > 3000:
combined = "\n".join(lines[-3000:])
info["progress_msg"] = combined
# Handle progress (only increase)
if "progress" in info:
new_progress = info["progress"]
current_progress = task.progress
# Only update if:
# - Current is not failed (-1) AND
# - New is failure (-1) OR new > current
if current_progress == -1:
del info["progress"] # Don't update failed tasks
elif new_progress != -1 and new_progress <= current_progress:
del info["progress"] # Don't go backwards
# Use database lock for thread safety (non-macOS)
if platform.system() != "Darwin":
with DB.lock(f"task_progress_{id}", timeout=30):
Task.update(info).where(Task.id == id).execute()
else:
Task.update(info).where(Task.id == id).execute()
reuse_prev_task_chunks() - Optimization
def reuse_prev_task_chunks(
task: dict,
prev_tasks: list,
chunking_config: dict
) -> int:
"""
Attempt to reuse chunks from previous task runs.
Reuse conditions:
- Same from_page
- Same config digest
- Previous task completed (progress=1.0)
- Previous task has chunk_ids
Returns:
Number of chunks reused (0 if not reused)
"""
for prev in prev_tasks:
if (prev.from_page == task["from_page"] and
prev.digest == task["digest"] and
prev.progress == 1.0 and
prev.chunk_ids):
# Reuse chunks
task["chunk_ids"] = prev.chunk_ids
task["progress"] = 1.0
# Update document chunk count
chunk_count = len(prev.chunk_ids.split())
DocumentService.increment_chunk_num(
task["doc_id"],
chunking_config["kb_id"],
0, # No new tokens
chunk_count,
0 # No duration
)
return chunk_count
return 0
Config Digest Calculation
def calculate_digest(doc: dict, from_page: int, to_page: int) -> str:
"""
Calculate configuration digest for chunk reuse detection.
Digest includes:
- Parser ID
- Parser config (chunk_token_num, delimiter, etc.)
- Page range
- Embedding model ID
Returns:
xxhash digest string
"""
config = {
"parser_id": doc["parser_id"],
"parser_config": doc["parser_config"],
"from_page": from_page,
"to_page": to_page,
"embd_id": doc.get("embd_id")
}
config_str = json.dumps(config, sort_keys=True)
return xxhash.xxh64(config_str).hexdigest()
Queue Management
Redis Queue Structure
# Queue naming
def get_queue_name(tenant_id: str) -> str:
return f"ragflow:task:{tenant_id}"
# Queue operations
class RedisQueueConnection:
def queue_product(self, queue_name: str, task: dict):
"""Add task to queue."""
self.redis.xadd(
queue_name,
{"task": json.dumps(task)},
maxlen=10000
)
def queue_consume(self, queue_name: str, consumer_group: str):
"""Consume task from queue."""
result = self.redis.xreadgroup(
consumer_group,
consumer_id,
{queue_name: ">"},
count=1,
block=5000
)
return result
def queue_ack(self, queue_name: str, message_id: str):
"""Acknowledge processed message."""
self.redis.xack(queue_name, consumer_group, message_id)
Consumer Group Setup
# Create consumer group
REDIS_CONN.redis.xgroup_create(
queue_name,
SVR_CONSUMER_GROUP_NAME,
id="0",
mkstream=True
)
# Read pending (unacked) messages first
pending = REDIS_CONN.redis.xpending_range(
queue_name,
SVR_CONSUMER_GROUP_NAME,
min="-",
max="+",
count=10
)
Progress Synchronization
# In DocumentService._sync_progress()
def _sync_progress(docs: list):
"""
Synchronize task progress to document level.
Called periodically to update document status.
"""
for doc in docs:
tasks = Task.select().where(Task.doc_id == doc["id"])
if not tasks:
continue
# Calculate aggregate progress
total_progress = sum(t.progress for t in tasks if t.progress >= 0)
failed_count = sum(1 for t in tasks if t.progress == -1)
if failed_count > 0:
# Any failure = document failed
doc_progress = -1
status = "FAIL"
elif all(t.progress == 1.0 for t in tasks):
# All complete
doc_progress = 1.0
status = "DONE"
else:
# In progress
doc_progress = total_progress / len(tasks)
status = "RUNNING"
# Special handling for async tasks
special_tasks = ["graphrag", "raptor", "mindmap"]
has_special = any(t.task_type in special_tasks for t in tasks)
if has_special:
# Freeze progress while special tasks run
freeze_progress = True
# Update document
DocumentService.update_by_id(doc["id"], {
"progress": doc_progress,
"status": status
})
Task Types
| Type | Purpose | Execution |
|---|---|---|
"" (default) |
Standard document parsing | Synchronous |
graphrag |
Knowledge graph extraction | Async |
raptor |
RAPTOR tree building | Async |
mindmap |
Mind map generation | Async |
dataflow |
Custom pipeline execution | Async |
Performance Optimizations
1. Batch Insert
bulk_insert_into_db(Task, tasks, ignore_conflicts=True)
2. Page-based Splitting
# PDF split into 12-page chunks
task_page_size = doc["parser_config"].get("task_page_size", 12)
3. Chunk Reuse
# Skip processing if config unchanged
if prev.digest == task["digest"] and prev.progress == 1.0:
reuse_chunks(prev, task)
4. Redis Streams
# Use Redis Streams for reliable queue
# - Message persistence
# - Consumer groups
# - Acknowledgment tracking
Related Files
/rag/svr/task_executor.py- Task execution/api/db/services/document_service.py- Progress sync/common/connection_utils.py- Redis connection