Add detailed analysis documentation for RAGFlow's document processing pipeline: - README.md: Overview and architecture diagram - task_executor_analysis.md: Task execution pipeline details - pdf_parsing.md: PDF parsing with layout analysis - ocr_pipeline.md: PaddleOCR integration and text detection - layout_detection.md: Detectron2 layout recognition - table_extraction.md: Table structure recognition (TSR) - file_type_handlers.md: Handlers for all supported file types These documents explain the document processing flow for newcomers to understand how RAGFlow handles various file formats.
16 KiB
16 KiB
OCR Pipeline - PaddleOCR Integration
Tổng Quan
OCR (Optical Character Recognition) pipeline trong RAGFlow sử dụng PaddleOCR để extract text từ images. Hệ thống được tối ưu hóa để hỗ trợ cả CPU và GPU, với khả năng xử lý batch và multi-GPU parallel processing.
File Location
/deepdoc/vision/ocr.py
Architecture
OCR PIPELINE ARCHITECTURE
Input Image
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ TEXT DETECTOR │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Model: det.onnx (DBNet) │ │
│ │ - Resize image (max 960px) │ │
│ │ - Normalize: mean=[0.485,0.456,0.406] │ │
│ │ - Detect text regions → Bounding boxes │ │
│ └─────────────────────────────────────────────────────────┘ │
└──────────────────────────┬──────────────────────────────────────┘
│
▼
┌────────────────────────┐
│ Crop Text Regions │
│ Sort: top→bottom │
│ left→right │
└────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ TEXT RECOGNIZER │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Model: rec.onnx (CRNN + CTC) │ │
│ │ - Resize to 48x320 │ │
│ │ - Batch processing (16 images/batch) │ │
│ │ - CTC decode with character dictionary │ │
│ └─────────────────────────────────────────────────────────┘ │
└──────────────────────────┬──────────────────────────────────────┘
│
▼
┌────────────────────────┐
│ Filter by confidence │
│ (threshold: 0.5) │
└────────────────────────┘
│
▼
Text + Bounding Boxes
Core Components
1. OCR Class (Main Entry Point)
class OCR:
    def __init__(self, model_dir=None):
        """
        Build detector/recognizer pairs, one per configured device.

        Models are resolved from *model_dir* (auto-downloaded when
        absent, per the project's model loader).  When
        settings.PARALLEL_DEVICES > 0 a TextDetector/TextRecognizer
        pair is created for every device id; otherwise a single
        default-device pair is used.
        """
        n_devices = settings.PARALLEL_DEVICES
        if n_devices > 0:
            # One detector + recognizer per GPU for parallel processing.
            self.text_detector = [TextDetector(model_dir, dev) for dev in range(n_devices)]
            self.text_recognizer = [TextRecognizer(model_dir, dev) for dev in range(n_devices)]
        else:
            # Single device (CPU or GPU 0).
            self.text_detector = [TextDetector(model_dir)]
            self.text_recognizer = [TextRecognizer(model_dir)]
        # Recognition results scoring below this threshold are dropped.
        self.drop_score = 0.5

    def __call__(self, img, device_id=0):
        """
        Full OCR pipeline on one image: detect, crop, recognize, filter.

        Returns:
            List of (bounding_box, (text, confidence)) tuples, keeping
            only entries with confidence >= self.drop_score.
        """
        # 1. Locate text regions, then order them for reading
        #    (top-to-bottom, left-to-right).
        dt_boxes, _det_elapsed = self.text_detector[device_id](img)
        dt_boxes = self.sorted_boxes(dt_boxes)
        # 2. Crop each region with perspective correction.
        crops = [self.get_rotate_crop_image(img, region) for region in dt_boxes]
        # 3. Recognize all crops in one batched call.
        rec_res, _rec_elapsed = self.text_recognizer[device_id](crops)
        # 4. Pair each box with its text and keep confident results only.
        return [
            (region.tolist(), (text, score))
            for region, (text, score) in zip(dt_boxes, rec_res)
            if score >= self.drop_score
        ]
2. TextDetector Class
class TextDetector:
    """
    Detect text regions using the DBNet detection model (det.onnx).

    Input:  image as a numpy array (H, W, C).
    Output: array of 4-point polygons (one per detected text region).
    """

    def __init__(self, model_dir, device_id=None):
        # Preprocessing: cap the longest side at 960 px, ImageNet
        # mean/std normalization, then HWC -> CHW layout.
        self.preprocess_op = [
            DetResizeForTest(limit_side_len=960, limit_type="max"),
            NormalizeImage(
                std=[0.229, 0.224, 0.225],
                mean=[0.485, 0.456, 0.406],
                scale='1./255.'
            ),
            ToCHWImage(),
        ]
        # Postprocessing: decode the DBNet probability map into polygons.
        self.postprocess_op = DBPostProcess(
            thresh=0.3,
            box_thresh=0.5,
            max_candidates=1000,
            unclip_ratio=1.5
        )
        # Load ONNX model (sessions are cached by load_model).
        self.predictor, self.run_options = load_model(model_dir, 'det', device_id)
        # BUG FIX: __call__ reads self.input_tensor, but it was never
        # assigned anywhere in the class; bind the session's first input.
        self.input_tensor = self.predictor.get_inputs()[0]

    def __call__(self, img):
        """
        Detect text regions in *img*.

        Process:
        1. Preprocess (resize, normalize, CHW).
        2. Run ONNX inference.
        3. Postprocess (decode probability map to polygons).
        4. Filter degenerate boxes (width or height <= 3).
        """
        ori_im = img.copy()
        # Preprocess
        data = transform({'image': img}, self.preprocess_op)
        img_tensor, shape_list = data
        # FIX: add the batch dimension expected by the NCHW ONNX model;
        # DBPostProcess likewise expects a batched shape_list.
        img_tensor = np.expand_dims(img_tensor, axis=0)
        shape_list = np.expand_dims(shape_list, axis=0)
        # Inference
        outputs = self.predictor.run(None, {self.input_tensor.name: img_tensor})
        # Postprocess: probability map -> candidate polygons.
        post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
        dt_boxes = post_result[0]['points']
        # Drop boxes that are too small to contain legible text.
        dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
        return dt_boxes
3. TextRecognizer Class
class TextRecognizer:
    """
    Recognize text from cropped images using the CRNN model (rec.onnx).

    Input:  list of cropped text-region images (numpy arrays).
    Output: list of (text, confidence) tuples in the same order as the
            input list.
    """

    def __init__(self, model_dir, device_id=None):
        self.rec_image_shape = [3, 48, 320]  # C, H, W
        self.rec_batch_num = 16              # crops per inference batch
        # CTC decoder backed by the character dictionary file.
        self.postprocess_op = CTCLabelDecode(
            character_dict_path=os.path.join(model_dir, "ocr.res"),
            use_space_char=True
        )
        # Load ONNX model (sessions are cached by load_model).
        self.predictor, self.run_options = load_model(model_dir, 'rec', device_id)
        # BUG FIX: __call__ reads self.input_tensor, but it was never
        # assigned anywhere in the class; bind the session's first input.
        self.input_tensor = self.predictor.get_inputs()[0]

    def __call__(self, img_list):
        """
        Recognize text from a list of images.

        Images are processed in aspect-ratio order so each batch shares a
        similar width (less padding waste); results are written back into
        the caller's original order.
        """
        img_num = len(img_list)
        # Sort indices by aspect ratio (width/height) for efficient batching.
        width_list = [img.shape[1] / float(img.shape[0]) for img in img_list]
        indices = np.argsort(np.array(width_list))
        # FIX: pre-size with independent placeholders; [['', 0.0]] * n
        # would alias one mutable list across every slot.
        rec_res = [['', 0.0] for _ in range(img_num)]
        # Process in batches
        for beg_idx in range(0, img_num, self.rec_batch_num):
            end_idx = min(img_num, beg_idx + self.rec_batch_num)
            # Widest ratio in the batch decides the resize target width.
            norm_img_batch = []
            max_wh_ratio = self.rec_image_shape[2] / self.rec_image_shape[1]
            for idx in range(beg_idx, end_idx):
                h, w = img_list[indices[idx]].shape[0:2]
                max_wh_ratio = max(max_wh_ratio, w / h)
            for idx in range(beg_idx, end_idx):
                norm_img = self.resize_norm_img(
                    img_list[indices[idx]],
                    max_wh_ratio
                )
                norm_img_batch.append(norm_img[np.newaxis, :])
            norm_img_batch = np.concatenate(norm_img_batch)
            # Inference
            outputs = self.predictor.run(None, {
                self.input_tensor.name: norm_img_batch
            })
            # CTC decode
            preds = outputs[0]
            rec_result = self.postprocess_op(preds)
            # Store results back in the original (pre-sort) order.
            for i, result in enumerate(rec_result):
                rec_res[indices[beg_idx + i]] = result
        return rec_res
Model Loading
def load_model(model_dir, nm, device_id=None):
    """
    Load an ONNX model with GPU/CPU support.

    Args:
        model_dir: directory containing "<nm>.onnx".
        nm: model base name, e.g. 'det' or 'rec'.
        device_id: CUDA device index; None falls back to device 0 (or CPU).

    Returns:
        (InferenceSession, RunOptions) tuple.  Sessions are cached per
        (model path, device id) so repeated calls reuse the same session.
    """
    model_file_path = os.path.join(model_dir, nm + ".onnx")
    # Serve from cache when this model/device pair was already loaded.
    global loaded_models
    cache_key = model_file_path + str(device_id)
    if cache_key in loaded_models:
        return loaded_models[cache_key]
    # Session tuning: sequential execution with modest thread counts;
    # the CPU arena is disabled to keep memory usage predictable.
    options = ort.SessionOptions()
    options.enable_cpu_mem_arena = False
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.intra_op_num_threads = 2
    options.inter_op_num_threads = 2
    # GPU configuration
    if cuda_is_available():
        gpu_mem_limit_mb = int(os.environ.get("OCR_GPU_MEM_LIMIT_MB", "2048"))
        cuda_provider_options = {
            "device_id": device_id or 0,
            "gpu_mem_limit": gpu_mem_limit_mb * 1024 * 1024,
            # FIX: honor the documented OCR_ARENA_EXTEND_STRATEGY env var
            # instead of hard-coding the allocation strategy.
            "arena_extend_strategy": os.environ.get(
                "OCR_ARENA_EXTEND_STRATEGY", "kNextPowerOfTwo"),
        }
        sess = ort.InferenceSession(
            model_file_path,
            options=options,
            providers=['CUDAExecutionProvider'],
            provider_options=[cuda_provider_options]
        )
    else:
        sess = ort.InferenceSession(
            model_file_path,
            options=options,
            providers=['CPUExecutionProvider']
        )
    # Cache and return
    run_options = ort.RunOptions()
    loaded_models[cache_key] = (sess, run_options)
    return loaded_models[cache_key]
Image Processing Utilities
Rotate Crop Image
def get_rotate_crop_image(self, img, points):
    """
    Crop one quadrilateral text region out of *img* with a perspective
    transform, handling rotated/skewed text.

    points: 4x2 corner coordinates (top-left, top-right, bottom-right,
            bottom-left) — presumably float32, as cv2.getPerspectiveTransform
            requires; verify against the detector's output.
    """
    assert len(points) == 4, "shape of points must be 4*2"
    # Target size: the longer of each pair of opposing edges.
    crop_w = int(max(np.linalg.norm(points[0] - points[1]),
                     np.linalg.norm(points[2] - points[3])))
    crop_h = int(max(np.linalg.norm(points[0] - points[3]),
                     np.linalg.norm(points[1] - points[2])))
    # Axis-aligned destination rectangle.
    target = np.float32([[0, 0],
                         [crop_w, 0],
                         [crop_w, crop_h],
                         [0, crop_h]])
    # Warp the source quad onto the upright rectangle.
    warp = cv2.getPerspectiveTransform(points, target)
    cropped = cv2.warpPerspective(img, warp, (crop_w, crop_h),
                                  borderMode=cv2.BORDER_REPLICATE,
                                  flags=cv2.INTER_CUBIC)
    # Tall crops (h/w >= 1.5) are likely rotated text: probe rotations
    # and keep the one that recognizes best.
    if cropped.shape[0] / cropped.shape[1] >= 1.5:
        return self._find_best_rotation(cropped)
    return cropped
Box Sorting
def sorted_boxes(self, dt_boxes):
    """
    Order detection boxes for reading: top-to-bottom, then
    left-to-right within a line.

    A box's position is taken from its first (top-left) corner.  After
    a coarse (y, x) sort, adjacent boxes whose top edges differ by less
    than 10 px are treated as the same text line and bubbled into
    left-to-right order.
    """
    count = dt_boxes.shape[0]
    # Coarse pass: sort by the top-left corner, y first then x.
    ordered = list(sorted(dt_boxes, key=lambda b: (b[0][1], b[0][0])))
    # Fine pass: bubble same-line neighbours into left-to-right order.
    for i in range(count - 1):
        j = i
        while j >= 0:
            same_line = abs(ordered[j + 1][0][1] - ordered[j][0][1]) < 10
            out_of_order = ordered[j + 1][0][0] < ordered[j][0][0]
            if same_line and out_of_order:
                ordered[j], ordered[j + 1] = ordered[j + 1], ordered[j]
                j -= 1
            else:
                break
    return ordered
Configuration
# Reference of OCR tunables (default values shown).

# Environment variables
OCR_GPU_MEM_LIMIT_MB = 2048  # GPU memory limit per model (megabytes)
OCR_ARENA_EXTEND_STRATEGY = "kNextPowerOfTwo"  # ONNX Runtime memory allocation strategy
PARALLEL_DEVICES = 0  # Number of GPUs (0 = single device)

# Model parameters
DETECTION_PARAMS = {
    "limit_side_len": 960,   # Max image dimension (px) before resize
    "thresh": 0.3,           # Binary threshold on the probability map
    "box_thresh": 0.5,       # Box confidence threshold
    "max_candidates": 1000,  # Max detected boxes per image
    "unclip_ratio": 1.5      # Box expansion ratio
}
RECOGNITION_PARAMS = {
    "image_shape": [3, 48, 320],  # Input shape (C, H, W)
    "batch_num": 16,              # Batch size for recognition
    "drop_score": 0.5             # Confidence threshold below which text is dropped
}
Models Used
| Model | File | Purpose | Architecture |
|---|---|---|---|
| Text Detection | det.onnx | Find text regions | DBNet (Differentiable Binarization) |
| Text Recognition | rec.onnx | Read text content | CRNN + CTC |
| Character Dict | ocr.res | Character mapping | CTC vocabulary |
Integration with PDF Parser
# In pdf_parser.py
def __ocr(self, callback, start_progress, end_progress):
"""
Run OCR on PDF page images.
For each page:
1. Call OCR to get text boxes with positions
2. Convert coordinates to page coordinate system
3. Store boxes with page number for later processing
"""
self.boxes = []
for page_idx, img in enumerate(self.page_images):
# Get OCR results
results = self.ocr(img)
if not results:
continue
# Convert to internal format
for box, (text, score) in results:
x0 = min(p[0] for p in box)
x1 = max(p[0] for p in box)
y0 = min(p[1] for p in box)
y1 = max(p[1] for p in box)
self.boxes.append({
"x0": x0 / self.ZM,
"x1": x1 / self.ZM,
"top": y0 / self.ZM + self.page_cum_height[page_idx],
"bottom": y1 / self.ZM + self.page_cum_height[page_idx],
"text": text,
"page_number": page_idx,
"score": score
})
# Update progress
if callback:
progress = start_progress + (end_progress - start_progress) * \
(page_idx / len(self.page_images))
callback(progress, f"OCR page {page_idx + 1}")
Related Files
- /deepdoc/vision/ocr.py - Main OCR implementation
- /deepdoc/vision/operators.py - Image preprocessing operators
- /deepdoc/vision/postprocess.py - DBNet and CTC postprocessing
- /rag/res/deepdoc/ - Model files (det.onnx, rec.onnx, ocr.res)