Task Executor Analysis
Overview
The task executor is the main orchestration engine that processes documents asynchronously using queue-based processing.
File Location
/rag/svr/task_executor.py
Architecture
┌────────────────────────────────────────────────────────────┐
│                 TASK EXECUTOR ARCHITECTURE                  │
└────────────────────────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────┐
│                   Main Event Loop (trio)                    │
│                                                            │
│  ┌──────────────────────────────────────────────────────┐  │
│  │ report_status() - Heartbeat (30s interval)           │  │
│  │   - Update server status                             │  │
│  │   - Cleanup stale tasks                              │  │
│  └──────────────────────────────────────────────────────┘  │
│                                                            │
│  ┌──────────────────────────────────────────────────────┐  │
│  │ Task Manager Loop                                     │  │
│  │  ├── collect()        - Get task from Redis queue    │  │
│  │  ├── do_handle_task() - Process with semaphore       │  │
│  │  │    ├── build_chunks()                             │  │
│  │  │    ├── embedding()                                │  │
│  │  │    └── insert_es()                                │  │
│  │  └── handle_task()    - ACK and error handling       │  │
│  └──────────────────────────────────────────────────────┘  │
└────────────────────────────────────────────────────────────┘
Main Entry Point
async def main():
    """Main entry point for task executor."""
    # Initialize connections
    init_db_connection()
    init_es_connection()
    init_minio_connection()

    # Start concurrent tasks
    async with trio.open_nursery() as nursery:
        # Heartbeat reporter
        nursery.start_soon(report_status)
        # Task processing loop
        nursery.start_soon(task_loop)

async def task_loop():
    """Main task processing loop."""
    while True:
        try:
            # Get task from queue
            task = await collect()
            if task:
                # Process with concurrency limit
                async with semaphore:
                    await do_handle_task(task)
        except Exception as e:
            logging.exception(e)
            await trio.sleep(1)
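report_status() is started alongside the task loop but is not shown in this document. The following is a minimal sketch of the heartbeat behavior described in the diagram (30-second cadence, liveness update, stale-task cleanup); REDIS_CONN.set(), CONSUMER_NAME, and cleanup_stale_tasks() are illustrative names, not the actual implementation.

# Heartbeat sketch (assumed 30s cadence; helper names below are illustrative)
import json
import time
import logging
import trio

async def report_status():
    while True:
        try:
            # Publish liveness so the API layer can list healthy executors
            REDIS_CONN.set(f"task_executor:{CONSUMER_NAME}",
                           json.dumps({"ts": time.time()}))
            # Drop or requeue tasks whose executor died mid-processing
            cleanup_stale_tasks()
        except Exception as e:
            logging.exception(e)
        await trio.sleep(30)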
Task Collection
async def collect():
    """
    Collect task from Redis queue.

    Returns:
        Task dict or None if no tasks available
    """
    # Try to get from queue
    result = REDIS_CONN.queue_consume(
        queue_name=get_queue_name(),
        consumer_group=SVR_CONSUMER_GROUP_NAME,
        block=5000  # 5 second timeout
    )
    if not result:
        return None

    # Parse task
    message_id, task_data = result
    task = json.loads(task_data["task"])

    # Get full task context
    task_info = TaskService.get_task(task["id"])
    if not task_info:
        # Task canceled or max retries exceeded: ACK so it is not redelivered
        REDIS_CONN.queue_ack(get_queue_name(), message_id)
        return None

    task_info["message_id"] = message_id
    return task_info
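For orientation, the tuple returned by queue_consume() and the payload parsed above have roughly the shape shown below; any field beyond "id" and "doc_id" (which collect() and later steps actually read) is an assumption.

# Illustrative message shape as consumed by collect() (exact fields are an assumption)
import json

message_id = "1717400000000-0"                                   # Redis stream entry id
task_data = {"task": '{"id": "task-001", "doc_id": "doc-001"}'}  # JSON-encoded task reference
task = json.loads(task_data["task"])                             # -> {"id": "task-001", "doc_id": "doc-001"}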
Task Handling
async def do_handle_task(task):
    """
    Main task processing logic.

    Steps:
    1. Download file from MinIO
    2. Build chunks (parse + chunk + enrich)
    3. Generate embeddings
    4. Index in Elasticsearch
    """
    doc_id = task["doc_id"]
    task_id = task["id"]

    try:
        # Update progress: Starting
        TaskService.update_progress(task_id, {
            "progress": 0.1,
            "progress_msg": "Starting document processing..."
        })

        # 1. Download file
        file_blob = await download_from_minio(task)

        # 2. Build chunks
        chunks = await build_chunks(task, file_blob)
        if not chunks:
            TaskService.update_progress(task_id, {
                "progress": -1,
                "progress_msg": "No content extracted"
            })
            return

        # 3. Generate embeddings
        chunks = await embedding(chunks, task)

        # 4. Index in Elasticsearch
        await insert_es(chunks, task)

        # 5. Update success
        TaskService.update_progress(task_id, {
            "progress": 1.0,
            "progress_msg": f"Completed. {len(chunks)} chunks created.",
            "chunk_ids": " ".join([c["id"] for c in chunks])
        })
    except Exception as e:
        logging.exception(e)
        TaskService.update_progress(task_id, {
            "progress": -1,
            "progress_msg": str(e)
        })
async def handle_task(task, result):
    """
    Post-processing: ACK queue and cleanup.
    """
    REDIS_CONN.queue_ack(
        get_queue_name(),
        task["message_id"]
    )
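The diagram attributes both ACK and error handling to handle_task(). A plausible composition, shown as a simplified sketch (the real function also takes a result argument), is that processing errors only surface in the task's progress record while the queue message is always acknowledged, so a poisoned document cannot block the stream.

# Sketch of how ACK and error handling could compose (simplified signature)
async def handle_task(task):
    try:
        await do_handle_task(task)
    except Exception as e:
        # Failure is recorded on the task (progress = -1); the message is still ACKed
        logging.exception(e)
        TaskService.update_progress(task["id"], {"progress": -1, "progress_msg": str(e)})
    finally:
        REDIS_CONN.queue_ack(get_queue_name(), task["message_id"])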
Chunk Building
async def build_chunks(task, file_blob):
    """
    Build chunks from document.

    Process:
    1. Select parser based on file type
    2. Parse document
    3. Chunk content
    4. Enrich chunks (keywords, questions)
    """
    file_name = task["name"]
    parser_id = task["parser_id"]
    parser_config = task["parser_config"]

    # Select parser
    if file_name.endswith(".pdf"):
        if parser_config.get("layout_recognize") == "DeepDOC":
            parser = RAGFlowPdfParser()
        elif parser_config.get("layout_recognize") == "Plain":
            parser = PlainParser()
        else:
            parser = VisionParser()
    elif file_name.endswith(".docx"):
        parser = DocxParser()
    elif file_name.endswith(".xlsx"):
        parser = ExcelParser()
    else:
        parser = TextParser()

    # Parse document
    sections = parser.parse(
        file_blob,
        from_page=task.get("from_page", 0),
        to_page=task.get("to_page", -1),
        callback=lambda p, m: TaskService.update_progress(task["id"], {
            "progress": p,
            "progress_msg": m
        })
    )

    # Chunk content
    chunks = naive_merge(
        sections,
        chunk_token_num=parser_config.get("chunk_token_num", 512),
        delimiter=parser_config.get("delimiter", "\n。;!?"),
        overlapped_percent=parser_config.get("overlapped_percent", 0)
    )

    # Build chunk records
    chunk_records = []
    for i, (content, positions) in enumerate(chunks):
        chunk_id = xxhash.xxh64(content + task["doc_id"]).hexdigest()
        chunk_records.append({
            "id": chunk_id,
            "doc_id": task["doc_id"],
            "kb_id": task["kb_id"],
            "content_with_weight": content,
            "docnm_kwd": task["name"],
            "page_num_int": extract_page_nums(positions),
            "position_int": encode_positions(positions),
            "create_time": datetime.now().isoformat(),
        })

    # Enrich chunks
    if parser_config.get("auto_keywords"):
        await add_keywords(chunk_records, task)
    if parser_config.get("auto_questions"):
        await add_questions(chunk_records, task)

    return chunk_records
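naive_merge() comes from /rag/nlp/__init__.py. As a rough mental model (a simplified sketch, not the library code), it splits section text on the configured delimiters and greedily packs pieces into chunks until the chunk_token_num budget is reached; this sketch counts tokens by whitespace split and drops the position/overlap handling the real function performs.

# Simplified sketch of delimiter-split + token-budget merging (illustrative only)
import re

def simple_merge(sections, chunk_token_num=512, delimiter="\n。;!?"):
    """Split each section at the delimiters, then greedily pack pieces into
    chunks of at most ~chunk_token_num tokens (crude whitespace token count)."""
    split_re = "([" + re.escape(delimiter) + "])"
    chunks, buf, buf_tokens = [], "", 0
    for text, _positions in sections:                # sections: [(text, positions), ...]
        for piece in re.split(split_re, text):
            n = len(piece.split())                   # crude token estimate
            if buf and buf_tokens + n > chunk_token_num:
                chunks.append(buf)
                buf, buf_tokens = "", 0
            buf += piece
            buf_tokens += n
    if buf:
        chunks.append(buf)
    return chunks

The real implementation additionally carries page/position metadata per chunk and supports overlapped_percent, which this sketch omits.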
Embedding Generation
async def embedding(chunks, task):
    """
    Generate embeddings for chunks.
    """
    embd_mdl = LLMBundle(
        task["tenant_id"],
        LLMType.EMBEDDING,
        task.get("embd_id")
    )

    batch_size = 16
    total_tokens = 0

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]

        # Prepare texts
        texts = [c["content_with_weight"] for c in batch]

        # Generate embeddings
        embeddings, tokens = embd_mdl.encode(texts)
        total_tokens += tokens

        # Store vectors
        for j, emb in enumerate(embeddings):
            chunk_idx = i + j
            vec_field = f"q_{len(emb)}_vec"
            chunks[chunk_idx][vec_field] = emb.tolist()

        # Update progress
        progress = 0.7 + 0.2 * (i / len(chunks))
        TaskService.update_progress(task["id"], {
            "progress": progress,
            "progress_msg": f"Embedding {i+len(batch)}/{len(chunks)} chunks"
        })

    return chunks
Elasticsearch Indexing
async def insert_es(chunks, task):
    """
    Bulk insert chunks to Elasticsearch.
    """
    es = get_es_connection()
    index_name = f"ragflow_{task['kb_id']}"

    # Ensure index exists
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name, body=ES_MAPPING)

    # Bulk insert
    bulk_size = 64
    for i in range(0, len(chunks), bulk_size):
        batch = chunks[i:i+bulk_size]
        actions = []
        for chunk in batch:
            actions.append({
                "_index": index_name,
                "_id": chunk["id"],
                "_source": chunk
            })
        helpers.bulk(es, actions)

        # Update progress
        progress = 0.9 + 0.1 * (i / len(chunks))
        TaskService.update_progress(task["id"], {
            "progress": progress,
            "progress_msg": f"Indexing {i+len(batch)}/{len(chunks)} chunks"
        })
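ES_MAPPING is referenced above but not shown. A minimal mapping consistent with the chunk fields built earlier could look roughly like the following; this is a sketch, not the actual RAGFlow mapping (which defines many more fields and uses dynamic templates), and the vector dimension is an assumption that must match the embedding model (here 1024, giving the field name q_1024_vec used by the q_{dim}_vec convention).

# Minimal illustrative mapping (assumption: 1024-dimensional embeddings)
ES_MAPPING = {
    "mappings": {
        "properties": {
            "doc_id":              {"type": "keyword"},
            "kb_id":               {"type": "keyword"},
            "docnm_kwd":           {"type": "keyword"},
            "content_with_weight": {"type": "text"},
            "page_num_int":        {"type": "integer"},
            "create_time":         {"type": "date"},
            "q_1024_vec":          {"type": "dense_vector", "dims": 1024},
        }
    }
}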
Concurrency Control
# Global semaphores
task_semaphore = trio.Semaphore(MAX_CONCURRENT_TASKS)            # 5
chunk_semaphore = trio.Semaphore(MAX_CONCURRENT_CHUNK_BUILDERS)  # 1
minio_semaphore = trio.Semaphore(MAX_CONCURRENT_MINIO)           # 10

async def do_handle_task(task):
    async with task_semaphore:
        ...  # processing

async def build_chunks(task, blob):
    async with chunk_semaphore:
        ...  # chunk building

async def download_from_minio(task):
    async with minio_semaphore:
        ...  # download
Progress Tracking
# Progress stages:
#   0.0 - 0.1: Starting
#   0.1 - 0.4: Image extraction (PDF)
#   0.4 - 0.6: OCR
#   0.6 - 0.7: Layout + text merge
#   0.7 - 0.9: Embedding
#   0.9 - 1.0: Indexing

def update_progress(task_id, info):
    """
    Thread-safe progress update.

    Rules:
    - progress_msg: Always append
    - progress: Only update if new > current (or -1 for failure)
    """
    # ... implementation
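The body is elided above; a minimal sketch that follows the two stated rules (messages always append, progress is monotonic except for the -1 failure sentinel) could look like this. The lock and TaskService.update_by_id() are assumptions made for illustration, not the verbatim implementation.

# Sketch of update_progress following the rules above (lock and update_by_id are assumed)
import threading

_progress_lock = threading.Lock()

def update_progress(task_id, info):
    with _progress_lock:
        task = TaskService.get_task(task_id)
        update = {}
        if info.get("progress_msg"):
            # Rule 1: progress messages are always appended
            update["progress_msg"] = (task.get("progress_msg", "") + "\n" + info["progress_msg"]).strip()
        new_progress = info.get("progress")
        if new_progress is not None:
            # Rule 2: only move forward, unless -1 signals failure
            if new_progress == -1 or new_progress > task.get("progress", 0):
                update["progress"] = new_progress
        if update:
            TaskService.update_by_id(task_id, update)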
Task Types
TASK_TYPES = {
    "": "standard",           # Standard document parsing
    "graphrag": "graphrag",   # Knowledge graph extraction
    "raptor": "raptor",       # RAPTOR tree building
    "mindmap": "mindmap",     # Mind map generation
    "dataflow": "dataflow",   # Custom pipeline
}

async def do_handle_task(task):
    task_type = task.get("task_type", "")

    if task_type == "graphrag":
        await handle_graphrag_task(task)
    elif task_type == "raptor":
        await handle_raptor_task(task)
    else:
        await handle_standard_task(task)
Configuration
# Environment variables
MAX_CONCURRENT_TASKS = int(os.environ.get("MAX_CONCURRENT_TASKS", 5))
MAX_CONCURRENT_CHUNK_BUILDERS = int(os.environ.get("MAX_CONCURRENT_CHUNK_BUILDERS", 1))
MAX_CONCURRENT_MINIO = int(os.environ.get("MAX_CONCURRENT_MINIO", 10))
DOC_MAXIMUM_SIZE = 100 * 1024 * 1024 # 100MB
DOC_BULK_SIZE = 64
EMBEDDING_BATCH_SIZE = 16
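Because the limits are read from the environment at import time, they can be tuned per deployment without code changes. A small illustrative example (values chosen arbitrarily):

# Example: raising the limits for a larger deployment (values are illustrative)
import os

os.environ["MAX_CONCURRENT_TASKS"] = "8"    # process more documents in parallel
os.environ["MAX_CONCURRENT_MINIO"] = "20"   # allow more parallel MinIO downloads
# These must be set before the executor process starts, since the module reads
# them once when it is imported.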
Related Files
/rag/svr/task_executor.py - Main executor
/api/db/services/task_service.py - Task management
/rag/app/naive.py - Document parsing
/rag/nlp/__init__.py - Chunking