# Task Executor Analysis

## Overview

The task executor is the main orchestration engine: it processes documents asynchronously through queue-based processing.

## File Location

```
/rag/svr/task_executor.py
```

## Architecture

```
┌────────────────────────────────────────────────────────┐
│               TASK EXECUTOR ARCHITECTURE               │
└────────────────────────────────────────────────────────┘

┌────────────────────────────────────────────────────────┐
│                 Main Event Loop (trio)                 │
│  ┌──────────────────────────────────────────────────┐  │
│  │ report_status() - Heartbeat (30s interval)       │  │
│  │   - Update server status                         │  │
│  │   - Cleanup stale tasks                          │  │
│  └──────────────────────────────────────────────────┘  │
│                                                        │
│  ┌──────────────────────────────────────────────────┐  │
│  │ Task Manager Loop                                │  │
│  │ ├── collect() - Get task from Redis queue        │  │
│  │ ├── do_handle_task() - Process with semaphore    │  │
│  │ │    ├── build_chunks()                          │  │
│  │ │    ├── embedding()                             │  │
│  │ │    └── insert_es()                             │  │
│  │ └── handle_task() - ACK and error handling       │  │
│  └──────────────────────────────────────────────────┘  │
└────────────────────────────────────────────────────────┘
```

## Main Entry Point

```python
async def main():
    """Main entry point for the task executor."""

    # Initialize connections
    init_db_connection()
    init_es_connection()
    init_minio_connection()

    # Start concurrent tasks
    async with trio.open_nursery() as nursery:
        # Heartbeat reporter
        nursery.start_soon(report_status)

        # Task processing loop
        nursery.start_soon(task_loop)


async def task_loop():
    """Main task processing loop."""
    while True:
        try:
            # Get task from queue
            task = await collect()

            if task:
                # Process with concurrency limit (see Concurrency Control below)
                async with task_semaphore:
                    await do_handle_task(task)
        except Exception as e:
            logging.exception(e)
            await trio.sleep(1)
```

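`report_status()` is started in the nursery above but its body is not shown in this excerpt. A minimal sketch of the heartbeat loop described in the architecture diagram, assuming a hypothetical `REDIS_CONN.set_obj(key, value, ttl)` helper and key format (the real wrapper may differ):

```python
import json
import time

import trio


async def report_status():
    """Heartbeat loop sketch: publish liveness every 30s, then clean up stale tasks.

    Illustration only; `REDIS_CONN.set_obj` and the key format are assumptions.
    """
    consumer_name = "task_executor_0"  # hypothetical executor instance id
    while True:
        heartbeat = json.dumps({"name": consumer_name, "now": time.time()})
        # Publish liveness with a short TTL so dead executors disappear quickly.
        REDIS_CONN.set_obj(f"TASKEXE:{consumer_name}", heartbeat, 60)

        # Cleanup of stale tasks (re-queue or mark failed) would go here.
        await trio.sleep(30)
```
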
## Task Collection

```python
async def collect():
    """
    Collect a task from the Redis queue.

    Returns:
        Task dict or None if no tasks available
    """
    # Try to get a message from the queue
    result = REDIS_CONN.queue_consume(
        queue_name=get_queue_name(),
        consumer_group=SVR_CONSUMER_GROUP_NAME,
        block=5000  # 5 second timeout
    )

    if not result:
        return None

    # Parse task
    message_id, task_data = result
    task = json.loads(task_data["task"])

    # Get full task context
    task_info = TaskService.get_task(task["id"])

    if not task_info:
        # Task canceled or max retries exceeded
        REDIS_CONN.queue_ack(get_queue_name(), message_id)
        return None

    task_info["message_id"] = message_id
    return task_info
```

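For reference, the task dict returned by `collect()` (the stored task record plus `message_id`) appears to carry roughly the following fields, judging by how they are consumed later on this page. Illustrative only; the exact schema is an assumption:

```python
example_task = {
    "id": "task_7f3c9a2e",            # task id used for progress updates
    "doc_id": "doc_d41d8cd9",         # document being processed
    "kb_id": "kb_finance",            # knowledge base, also the ES index suffix
    "name": "annual_report.pdf",
    "parser_id": "naive",
    "parser_config": {
        "layout_recognize": "DeepDOC",
        "chunk_token_num": 512,
        "delimiter": "\n。;!?",
        "overlapped_percent": 0,
        "auto_keywords": False,
        "auto_questions": False,
    },
    "tenant_id": "tenant_001",
    "embd_id": "example-embedding-model",
    "from_page": 0,
    "to_page": -1,
    "task_type": "",                  # "" = standard parsing (see Task Types)
    "message_id": "1718000000000-0",  # Redis stream entry id, added by collect()
}
```
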
## Task Handling

```python
async def do_handle_task(task):
    """
    Main task processing logic.

    Steps:
    1. Download file from MinIO
    2. Build chunks (parse + chunk + enrich)
    3. Generate embeddings
    4. Index in Elasticsearch
    """
    doc_id = task["doc_id"]
    task_id = task["id"]

    try:
        # Update progress: Starting
        TaskService.update_progress(task_id, {
            "progress": 0.1,
            "progress_msg": "Starting document processing..."
        })

        # 1. Download file
        file_blob = await download_from_minio(task)

        # 2. Build chunks
        chunks = await build_chunks(task, file_blob)

        if not chunks:
            TaskService.update_progress(task_id, {
                "progress": -1,
                "progress_msg": "No content extracted"
            })
            return

        # 3. Generate embeddings
        chunks = await embedding(chunks, task)

        # 4. Index in Elasticsearch
        await insert_es(chunks, task)

        # 5. Update success
        TaskService.update_progress(task_id, {
            "progress": 1.0,
            "progress_msg": f"Completed. {len(chunks)} chunks created.",
            "chunk_ids": " ".join([c["id"] for c in chunks])
        })

    except Exception as e:
        logging.exception(e)
        TaskService.update_progress(task_id, {
            "progress": -1,
            "progress_msg": str(e)
        })


async def handle_task(task, result):
    """
    Post-processing: ACK queue and cleanup.
    """
    REDIS_CONN.queue_ack(
        get_queue_name(),
        task["message_id"]
    )
```

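`download_from_minio()` is referenced here and in the concurrency section but not shown. A minimal sketch, assuming the standard `minio` Python SDK and a hypothetical object layout (`kb_id` as bucket, a `location` field as object key); credentials are placeholders:

```python
import trio
from minio import Minio  # assumes the standard MinIO Python SDK


async def download_from_minio(task):
    """Fetch raw document bytes (sketch only; naming and credentials are assumptions)."""
    client = Minio("minio:9000", access_key="...", secret_key="...", secure=False)

    def _get() -> bytes:
        # Hypothetical layout: one bucket per knowledge base, object keyed by location.
        resp = client.get_object(task["kb_id"], task["location"])
        try:
            return resp.read()
        finally:
            resp.close()
            resp.release_conn()

    async with minio_semaphore:
        # The MinIO SDK is blocking, so run it in a worker thread to keep the trio loop responsive.
        return await trio.to_thread.run_sync(_get)
```
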
## Chunk Building

```python
async def build_chunks(task, file_blob):
    """
    Build chunks from a document.

    Process:
    1. Select parser based on file type
    2. Parse document
    3. Chunk content
    4. Enrich chunks (keywords, questions)
    """
    file_name = task["name"]
    parser_id = task["parser_id"]
    parser_config = task["parser_config"]

    # Select parser
    if file_name.endswith(".pdf"):
        if parser_config.get("layout_recognize") == "DeepDOC":
            parser = RAGFlowPdfParser()
        elif parser_config.get("layout_recognize") == "Plain":
            parser = PlainParser()
        else:
            parser = VisionParser()

    elif file_name.endswith(".docx"):
        parser = DocxParser()

    elif file_name.endswith(".xlsx"):
        parser = ExcelParser()

    else:
        parser = TextParser()

    # Parse document
    sections = parser.parse(
        file_blob,
        from_page=task.get("from_page", 0),
        to_page=task.get("to_page", -1),
        callback=lambda p, m: TaskService.update_progress(task["id"], {
            "progress": p,
            "progress_msg": m
        })
    )

    # Chunk content
    chunks = naive_merge(
        sections,
        chunk_token_num=parser_config.get("chunk_token_num", 512),
        delimiter=parser_config.get("delimiter", "\n。;!?"),
        overlapped_percent=parser_config.get("overlapped_percent", 0)
    )

    # Build chunk records
    chunk_records = []
    for i, (content, positions) in enumerate(chunks):
        chunk_id = xxhash.xxh64(content + task["doc_id"]).hexdigest()

        chunk_records.append({
            "id": chunk_id,
            "doc_id": task["doc_id"],
            "kb_id": task["kb_id"],
            "content_with_weight": content,
            "docnm_kwd": task["name"],
            "page_num_int": extract_page_nums(positions),
            "position_int": encode_positions(positions),
            "create_time": datetime.now().isoformat(),
        })

    # Enrich chunks
    if parser_config.get("auto_keywords"):
        await add_keywords(chunk_records, task)

    if parser_config.get("auto_questions"):
        await add_questions(chunk_records, task)

    return chunk_records
```

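Because the chunk ID is a hash of the content plus `doc_id`, re-processing the same document yields the same IDs, so indexing with `_id=chunk["id"]` overwrites existing chunks rather than duplicating them. A small illustration (the explicit `.encode("utf-8")` is added here for clarity; the excerpt above hashes the raw string):

```python
import xxhash

content = "Revenue grew 12% year over year."
doc_id = "doc_d41d8cd9"

chunk_id = xxhash.xxh64((content + doc_id).encode("utf-8")).hexdigest()
# Re-running the task produces the same chunk_id, so the Elasticsearch bulk
# insert below replaces the existing document instead of creating a duplicate.
```
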
## Embedding Generation

```python
async def embedding(chunks, task):
    """
    Generate embeddings for chunks.
    """
    embd_mdl = LLMBundle(
        task["tenant_id"],
        LLMType.EMBEDDING,
        task.get("embd_id")
    )

    batch_size = 16
    total_tokens = 0

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]

        # Prepare texts
        texts = [c["content_with_weight"] for c in batch]

        # Generate embeddings
        embeddings, tokens = embd_mdl.encode(texts)
        total_tokens += tokens

        # Store vectors
        for j, emb in enumerate(embeddings):
            chunk_idx = i + j
            vec_field = f"q_{len(emb)}_vec"
            chunks[chunk_idx][vec_field] = emb.tolist()

        # Update progress
        progress = 0.7 + 0.2 * (i / len(chunks))
        TaskService.update_progress(task["id"], {
            "progress": progress,
            "progress_msg": f"Embedding {i + len(batch)}/{len(chunks)} chunks"
        })

    return chunks
```

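The vector field name is derived from the embedding dimension at runtime, so different embedding models write to different fields. A quick illustration of the naming and of the 0.7-0.9 progress band used above (values are illustrative):

```python
import numpy as np

emb = np.zeros(1024, dtype=np.float32)  # e.g. a 1024-dimension model
vec_field = f"q_{len(emb)}_vec"
assert vec_field == "q_1024_vec"        # must match the index mapping

# The embedding stage maps batch offset i into the 0.7-0.9 progress band:
total, i = 200, 80
progress = 0.7 + 0.2 * (i / total)      # -> 0.78
```
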
## Elasticsearch Indexing

```python
async def insert_es(chunks, task):
    """
    Bulk insert chunks into Elasticsearch.
    """
    es = get_es_connection()
    index_name = f"ragflow_{task['kb_id']}"

    # Ensure index exists
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name, body=ES_MAPPING)

    # Bulk insert
    bulk_size = 64
    for i in range(0, len(chunks), bulk_size):
        batch = chunks[i:i + bulk_size]

        actions = []
        for chunk in batch:
            actions.append({
                "_index": index_name,
                "_id": chunk["id"],
                "_source": chunk
            })

        helpers.bulk(es, actions)

        # Update progress
        progress = 0.9 + 0.1 * (i / len(chunks))
        TaskService.update_progress(task["id"], {
            "progress": progress,
            "progress_msg": f"Indexing {i + len(batch)}/{len(chunks)} chunks"
        })
```

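`ES_MAPPING` is referenced above but not reproduced on this page. A minimal sketch of what it might contain, assuming a fixed 1024-dimension embedding model; the dense-vector field name must match the `q_<dim>_vec` fields produced during embedding:

```python
# Illustration only; the real mapping lives with the project's Elasticsearch setup code.
ES_MAPPING = {
    "mappings": {
        "properties": {
            "doc_id":              {"type": "keyword"},
            "kb_id":               {"type": "keyword"},
            "docnm_kwd":           {"type": "keyword"},
            "content_with_weight": {"type": "text"},
            "page_num_int":        {"type": "integer"},
            "create_time":         {"type": "date"},
            "q_1024_vec": {
                "type": "dense_vector",
                "dims": 1024,
                "index": True,
                "similarity": "cosine",
            },
        }
    }
}
```
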
## Concurrency Control

```python
# Global semaphores
task_semaphore = trio.Semaphore(MAX_CONCURRENT_TASKS)            # 5
chunk_semaphore = trio.Semaphore(MAX_CONCURRENT_CHUNK_BUILDERS)  # 1
minio_semaphore = trio.Semaphore(MAX_CONCURRENT_MINIO)           # 10


async def do_handle_task(task):
    async with task_semaphore:
        ...  # processing


async def build_chunks(task, blob):
    async with chunk_semaphore:
        ...  # chunk building


async def download_from_minio(task):
    async with minio_semaphore:
        ...  # download
```

## Progress Tracking

```python
# Progress stages:
# 0.0 - 0.1: Starting
# 0.1 - 0.4: Image extraction (PDF)
# 0.4 - 0.6: OCR
# 0.6 - 0.7: Layout + text merge
# 0.7 - 0.9: Embedding
# 0.9 - 1.0: Indexing


def update_progress(task_id, info):
    """
    Thread-safe progress update.

    Rules:
    - progress_msg: always append
    - progress: only update if new > current (or -1 for failure)
    """
    # ... implementation
```

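A minimal sketch of those two rules, assuming hypothetical `TaskService.get_task()` / `TaskService.update_by_id()` accessors; only the append and monotonic-progress logic is the point here:

```python
def update_progress(task_id, info):
    """Illustration of the update rules; persistence calls are placeholders."""
    task = TaskService.get_task(task_id)
    if not task:
        return

    updates = {}

    # Rule 1: progress_msg is always appended to the running log.
    msg = info.get("progress_msg")
    if msg:
        updates["progress_msg"] = (task.get("progress_msg", "") + "\n" + msg).strip()

    # Rule 2: progress only moves forward, except -1 which marks failure.
    new_progress = info.get("progress")
    if new_progress is not None:
        current = task.get("progress") or 0.0
        if new_progress == -1 or new_progress > current:
            updates["progress"] = new_progress

    if updates:
        TaskService.update_by_id(task_id, updates)  # hypothetical persistence call
```
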
## Task Types

```python
TASK_TYPES = {
    "": "standard",          # Standard document parsing
    "graphrag": "graphrag",  # Knowledge graph extraction
    "raptor": "raptor",      # RAPTOR tree building
    "mindmap": "mindmap",    # Mind map generation
    "dataflow": "dataflow",  # Custom pipeline
}


async def do_handle_task(task):
    task_type = task.get("task_type", "")

    if task_type == "graphrag":
        await handle_graphrag_task(task)
    elif task_type == "raptor":
        await handle_raptor_task(task)
    else:
        await handle_standard_task(task)
```

## Configuration

```python
# Environment variables
MAX_CONCURRENT_TASKS = int(os.environ.get("MAX_CONCURRENT_TASKS", 5))
MAX_CONCURRENT_CHUNK_BUILDERS = int(os.environ.get("MAX_CONCURRENT_CHUNK_BUILDERS", 1))
MAX_CONCURRENT_MINIO = int(os.environ.get("MAX_CONCURRENT_MINIO", 10))

DOC_MAXIMUM_SIZE = 100 * 1024 * 1024  # 100 MB
DOC_BULK_SIZE = 64
EMBEDDING_BATCH_SIZE = 16
```

## Related Files

- `/rag/svr/task_executor.py` - Main executor
- `/api/db/services/task_service.py` - Task management
- `/rag/app/naive.py` - Document parsing
- `/rag/nlp/__init__.py` - Chunking