Add detailed analysis documentation for RAGFlow's document processing pipeline: - README.md: Overview and architecture diagram - task_executor_analysis.md: Task execution pipeline details - pdf_parsing.md: PDF parsing with layout analysis - ocr_pipeline.md: PaddleOCR integration and text detection - layout_detection.md: Detectron2 layout recognition - table_extraction.md: Table structure recognition (TSR) - file_type_handlers.md: Handlers for all supported file types These documents explain the document processing flow for newcomers to understand how RAGFlow handles various file formats.
480 lines
16 KiB
Markdown
480 lines
16 KiB
Markdown
# OCR Pipeline - PaddleOCR Integration
|
|
|
|
## Tổng Quan
|
|
|
|
OCR (Optical Character Recognition) pipeline trong RAGFlow sử dụng PaddleOCR để extract text từ images. Hệ thống được tối ưu hóa để hỗ trợ cả CPU và GPU, với khả năng xử lý batch và multi-GPU parallel processing.
|
|
|
|
## File Location
|
|
```
|
|
/deepdoc/vision/ocr.py
|
|
```
|
|
|
|
## Architecture
|
|
|
|
```
|
|
OCR PIPELINE ARCHITECTURE
|
|
|
|
Input Image
|
|
│
|
|
▼
|
|
┌─────────────────────────────────────────────────────────────────┐
|
|
│ TEXT DETECTOR │
|
|
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
│ │ Model: det.onnx (DBNet) │ │
|
|
│ │ - Resize image (max 960px) │ │
|
|
│ │ - Normalize: mean=[0.485,0.456,0.406] │ │
|
|
│ │ - Detect text regions → Bounding boxes │ │
|
|
│ └─────────────────────────────────────────────────────────┘ │
|
|
└──────────────────────────┬──────────────────────────────────────┘
|
|
│
|
|
▼
|
|
┌────────────────────────┐
|
|
│ Crop Text Regions │
|
|
│ Sort: top→bottom │
|
|
│ left→right │
|
|
└────────────────────────┘
|
|
│
|
|
▼
|
|
┌─────────────────────────────────────────────────────────────────┐
|
|
│ TEXT RECOGNIZER │
|
|
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
│ │ Model: rec.onnx (CRNN + CTC) │ │
|
|
│ │ - Resize to 48x320 │ │
|
|
│ │ - Batch processing (16 images/batch) │ │
|
|
│ │ - CTC decode với character dictionary │ │
|
|
│ └─────────────────────────────────────────────────────────┘ │
|
|
└──────────────────────────┬──────────────────────────────────────┘
|
|
│
|
|
▼
|
|
┌────────────────────────┐
|
|
│ Filter by confidence │
|
|
│ (threshold: 0.5) │
|
|
└────────────────────────┘
|
|
│
|
|
▼
|
|
Text + Bounding Boxes
|
|
```
|
|
|
|
## Core Components
|
|
|
|
### 1. OCR Class (Main Entry Point)
|
|
|
|
```python
|
|
class OCR:
|
|
def __init__(self, model_dir=None):
|
|
"""
|
|
Initialize OCR with optional model directory.
|
|
|
|
Features:
|
|
- Auto-download models from HuggingFace if not found
|
|
- Multi-GPU support via PARALLEL_DEVICES setting
|
|
- Model caching for performance
|
|
"""
|
|
if settings.PARALLEL_DEVICES > 0:
|
|
# Create detector/recognizer for each GPU
|
|
self.text_detector = []
|
|
self.text_recognizer = []
|
|
for device_id in range(settings.PARALLEL_DEVICES):
|
|
self.text_detector.append(TextDetector(model_dir, device_id))
|
|
self.text_recognizer.append(TextRecognizer(model_dir, device_id))
|
|
else:
|
|
# Single device (CPU or GPU 0)
|
|
self.text_detector = [TextDetector(model_dir)]
|
|
self.text_recognizer = [TextRecognizer(model_dir)]
|
|
|
|
self.drop_score = 0.5 # Confidence threshold
|
|
|
|
def __call__(self, img, device_id=0):
|
|
"""
|
|
Full OCR pipeline: detect + recognize.
|
|
|
|
Returns:
|
|
List of (bounding_box, (text, confidence))
|
|
"""
|
|
# 1. Detect text regions
|
|
dt_boxes, det_time = self.text_detector[device_id](img)
|
|
|
|
# 2. Sort boxes (top-to-bottom, left-to-right)
|
|
dt_boxes = self.sorted_boxes(dt_boxes)
|
|
|
|
# 3. Crop and recognize each region
|
|
img_crop_list = []
|
|
for box in dt_boxes:
|
|
img_crop = self.get_rotate_crop_image(img, box)
|
|
img_crop_list.append(img_crop)
|
|
|
|
# 4. Batch recognize
|
|
rec_res, rec_time = self.text_recognizer[device_id](img_crop_list)
|
|
|
|
# 5. Filter by confidence
|
|
results = []
|
|
for box, (text, score) in zip(dt_boxes, rec_res):
|
|
if score >= self.drop_score:
|
|
results.append((box.tolist(), (text, score)))
|
|
|
|
return results
|
|
```
|
|
|
|
### 2. TextDetector Class
|
|
|
|
```python
|
|
class TextDetector:
|
|
"""
|
|
Detect text regions using DBNet model.
|
|
|
|
Input: Image (numpy array)
|
|
Output: List of 4-point polygons (bounding boxes)
|
|
"""
|
|
|
|
def __init__(self, model_dir, device_id=None):
|
|
# Preprocessing pipeline
|
|
self.preprocess_op = [
|
|
DetResizeForTest(limit_side_len=960, limit_type="max"),
|
|
NormalizeImage(
|
|
std=[0.229, 0.224, 0.225],
|
|
mean=[0.485, 0.456, 0.406],
|
|
scale='1./255.'
|
|
),
|
|
ToCHWImage(),
|
|
]
|
|
|
|
# Postprocessing: DBNet decode
|
|
self.postprocess_op = DBPostProcess(
|
|
thresh=0.3,
|
|
box_thresh=0.5,
|
|
max_candidates=1000,
|
|
unclip_ratio=1.5
|
|
)
|
|
|
|
# Load ONNX model
|
|
self.predictor, self.run_options = load_model(model_dir, 'det', device_id)
|
|
|
|
def __call__(self, img):
|
|
"""
|
|
Detect text regions in image.
|
|
|
|
Process:
|
|
1. Preprocess (resize, normalize)
|
|
2. Run inference
|
|
3. Postprocess (decode probability map to polygons)
|
|
4. Filter small boxes
|
|
"""
|
|
ori_im = img.copy()
|
|
|
|
# Preprocess
|
|
data = transform({'image': img}, self.preprocess_op)
|
|
img_tensor, shape_list = data
|
|
|
|
# Inference
|
|
outputs = self.predictor.run(None, {self.input_tensor.name: img_tensor})
|
|
|
|
# Postprocess
|
|
post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
|
|
dt_boxes = post_result[0]['points']
|
|
|
|
# Filter small boxes (width or height <= 3)
|
|
dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
|
|
|
|
return dt_boxes
|
|
```
|
|
|
|
### 3. TextRecognizer Class
|
|
|
|
```python
|
|
class TextRecognizer:
|
|
"""
|
|
Recognize text from cropped images using CRNN model.
|
|
|
|
Input: List of cropped text region images
|
|
Output: List of (text, confidence) tuples
|
|
"""
|
|
|
|
def __init__(self, model_dir, device_id=None):
|
|
self.rec_image_shape = [3, 48, 320] # C, H, W
|
|
self.rec_batch_num = 16
|
|
|
|
# CTC decoder with character dictionary
|
|
self.postprocess_op = CTCLabelDecode(
|
|
character_dict_path=os.path.join(model_dir, "ocr.res"),
|
|
use_space_char=True
|
|
)
|
|
|
|
# Load ONNX model
|
|
self.predictor, self.run_options = load_model(model_dir, 'rec', device_id)
|
|
|
|
def __call__(self, img_list):
|
|
"""
|
|
Recognize text from list of images.
|
|
|
|
Process:
|
|
1. Sort by aspect ratio (width/height) for efficient batching
|
|
2. Resize and normalize each image
|
|
3. Batch inference
|
|
4. CTC decode
|
|
"""
|
|
img_num = len(img_list)
|
|
|
|
# Sort by aspect ratio (width/height)
|
|
width_list = [img.shape[1] / float(img.shape[0]) for img in img_list]
|
|
indices = np.argsort(np.array(width_list))
|
|
|
|
rec_res = [['', 0.0]] * img_num
|
|
|
|
# Process in batches
|
|
for beg_idx in range(0, img_num, self.rec_batch_num):
|
|
end_idx = min(img_num, beg_idx + self.rec_batch_num)
|
|
|
|
# Prepare batch
|
|
norm_img_batch = []
|
|
max_wh_ratio = self.rec_image_shape[2] / self.rec_image_shape[1]
|
|
|
|
for idx in range(beg_idx, end_idx):
|
|
h, w = img_list[indices[idx]].shape[0:2]
|
|
max_wh_ratio = max(max_wh_ratio, w / h)
|
|
|
|
for idx in range(beg_idx, end_idx):
|
|
norm_img = self.resize_norm_img(
|
|
img_list[indices[idx]],
|
|
max_wh_ratio
|
|
)
|
|
norm_img_batch.append(norm_img[np.newaxis, :])
|
|
|
|
norm_img_batch = np.concatenate(norm_img_batch)
|
|
|
|
# Inference
|
|
outputs = self.predictor.run(None, {
|
|
self.input_tensor.name: norm_img_batch
|
|
})
|
|
|
|
# CTC decode
|
|
preds = outputs[0]
|
|
rec_result = self.postprocess_op(preds)
|
|
|
|
# Store results in original order
|
|
for i, result in enumerate(rec_result):
|
|
rec_res[indices[beg_idx + i]] = result
|
|
|
|
return rec_res
|
|
```
|
|
|
|
## Model Loading
|
|
|
|
```python
|
|
def load_model(model_dir, nm, device_id=None):
|
|
"""
|
|
Load ONNX model with GPU/CPU support.
|
|
|
|
Features:
|
|
- Model caching (avoid reloading)
|
|
- Auto GPU detection
|
|
- Configurable GPU memory limit
|
|
"""
|
|
model_file_path = os.path.join(model_dir, nm + ".onnx")
|
|
|
|
# Check cache
|
|
global loaded_models
|
|
cache_key = model_file_path + str(device_id)
|
|
if cache_key in loaded_models:
|
|
return loaded_models[cache_key]
|
|
|
|
# Configure session
|
|
options = ort.SessionOptions()
|
|
options.enable_cpu_mem_arena = False
|
|
options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
|
|
options.intra_op_num_threads = 2
|
|
options.inter_op_num_threads = 2
|
|
|
|
# GPU configuration
|
|
if cuda_is_available():
|
|
gpu_mem_limit_mb = int(os.environ.get("OCR_GPU_MEM_LIMIT_MB", "2048"))
|
|
cuda_provider_options = {
|
|
"device_id": device_id or 0,
|
|
"gpu_mem_limit": gpu_mem_limit_mb * 1024 * 1024,
|
|
"arena_extend_strategy": "kNextPowerOfTwo"
|
|
}
|
|
sess = ort.InferenceSession(
|
|
model_file_path,
|
|
options=options,
|
|
providers=['CUDAExecutionProvider'],
|
|
provider_options=[cuda_provider_options]
|
|
)
|
|
else:
|
|
sess = ort.InferenceSession(
|
|
model_file_path,
|
|
options=options,
|
|
providers=['CPUExecutionProvider']
|
|
)
|
|
|
|
# Cache and return
|
|
run_options = ort.RunOptions()
|
|
loaded_models[cache_key] = (sess, run_options)
|
|
return loaded_models[cache_key]
|
|
```
|
|
|
|
## Image Processing Utilities
|
|
|
|
### Rotate Crop Image
|
|
|
|
```python
|
|
def get_rotate_crop_image(self, img, points):
|
|
"""
|
|
Crop text region with perspective transform.
|
|
|
|
Handles rotated/skewed text by:
|
|
1. Calculate crop dimensions
|
|
2. Apply perspective transform
|
|
3. Auto-rotate if height/width >= 1.5
|
|
"""
|
|
assert len(points) == 4, "shape of points must be 4*2"
|
|
|
|
# Calculate target dimensions
|
|
img_crop_width = int(max(
|
|
np.linalg.norm(points[0] - points[1]),
|
|
np.linalg.norm(points[2] - points[3])
|
|
))
|
|
img_crop_height = int(max(
|
|
np.linalg.norm(points[0] - points[3]),
|
|
np.linalg.norm(points[1] - points[2])
|
|
))
|
|
|
|
# Standard rectangle coordinates
|
|
pts_std = np.float32([
|
|
[0, 0],
|
|
[img_crop_width, 0],
|
|
[img_crop_width, img_crop_height],
|
|
[0, img_crop_height]
|
|
])
|
|
|
|
# Perspective transform
|
|
M = cv2.getPerspectiveTransform(points, pts_std)
|
|
dst_img = cv2.warpPerspective(
|
|
img, M, (img_crop_width, img_crop_height),
|
|
borderMode=cv2.BORDER_REPLICATE,
|
|
flags=cv2.INTER_CUBIC
|
|
)
|
|
|
|
# Auto-rotate if needed (height/width >= 1.5)
|
|
if dst_img.shape[0] / dst_img.shape[1] >= 1.5:
|
|
# Try different rotations, pick best recognition score
|
|
best_img = self._find_best_rotation(dst_img)
|
|
return best_img
|
|
|
|
return dst_img
|
|
```
|
|
|
|
### Box Sorting
|
|
|
|
```python
|
|
def sorted_boxes(self, dt_boxes):
|
|
"""
|
|
Sort text boxes: top-to-bottom, left-to-right.
|
|
|
|
Algorithm:
|
|
1. Initial sort by (y, x) coordinates
|
|
2. Fine-tune: swap adjacent boxes if on same line
|
|
and right box is to the left
|
|
"""
|
|
num_boxes = dt_boxes.shape[0]
|
|
|
|
# Sort by top-left corner (y first, then x)
|
|
sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
|
|
_boxes = list(sorted_boxes)
|
|
|
|
# Fine-tune for same-line boxes
|
|
for i in range(num_boxes - 1):
|
|
for j in range(i, -1, -1):
|
|
# If boxes on same line (y diff < 10) and wrong order
|
|
if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
|
|
_boxes[j + 1][0][0] < _boxes[j][0][0]:
|
|
# Swap
|
|
_boxes[j], _boxes[j + 1] = _boxes[j + 1], _boxes[j]
|
|
else:
|
|
break
|
|
|
|
return _boxes
|
|
```
|
|
|
|
## Configuration
|
|
|
|
```python
|
|
# Environment variables
|
|
OCR_GPU_MEM_LIMIT_MB = 2048 # GPU memory limit per model
|
|
OCR_ARENA_EXTEND_STRATEGY = "kNextPowerOfTwo" # Memory allocation strategy
|
|
PARALLEL_DEVICES = 0 # Number of GPUs (0 = single device)
|
|
|
|
# Model parameters
|
|
DETECTION_PARAMS = {
|
|
"limit_side_len": 960, # Max image dimension
|
|
"thresh": 0.3, # Binary threshold
|
|
"box_thresh": 0.5, # Box confidence threshold
|
|
"max_candidates": 1000, # Max detected boxes
|
|
"unclip_ratio": 1.5 # Box expansion ratio
|
|
}
|
|
|
|
RECOGNITION_PARAMS = {
|
|
"image_shape": [3, 48, 320], # Input shape (C, H, W)
|
|
"batch_num": 16, # Batch size
|
|
"drop_score": 0.5 # Confidence threshold
|
|
}
|
|
```
|
|
|
|
## Models Used
|
|
|
|
| Model | File | Purpose | Architecture |
|
|
|-------|------|---------|--------------|
|
|
| Text Detection | det.onnx | Find text regions | DBNet (Differentiable Binarization) |
|
|
| Text Recognition | rec.onnx | Read text content | CRNN + CTC |
|
|
| Character Dict | ocr.res | Character mapping | CTC vocabulary |
|
|
|
|
## Integration with PDF Parser
|
|
|
|
```python
|
|
# In pdf_parser.py
|
|
def __ocr(self, callback, start_progress, end_progress):
|
|
"""
|
|
Run OCR on PDF page images.
|
|
|
|
For each page:
|
|
1. Call OCR to get text boxes with positions
|
|
2. Convert coordinates to page coordinate system
|
|
3. Store boxes with page number for later processing
|
|
"""
|
|
self.boxes = []
|
|
|
|
for page_idx, img in enumerate(self.page_images):
|
|
# Get OCR results
|
|
results = self.ocr(img)
|
|
|
|
if not results:
|
|
continue
|
|
|
|
# Convert to internal format
|
|
for box, (text, score) in results:
|
|
x0 = min(p[0] for p in box)
|
|
x1 = max(p[0] for p in box)
|
|
y0 = min(p[1] for p in box)
|
|
y1 = max(p[1] for p in box)
|
|
|
|
self.boxes.append({
|
|
"x0": x0 / self.ZM,
|
|
"x1": x1 / self.ZM,
|
|
"top": y0 / self.ZM + self.page_cum_height[page_idx],
|
|
"bottom": y1 / self.ZM + self.page_cum_height[page_idx],
|
|
"text": text,
|
|
"page_number": page_idx,
|
|
"score": score
|
|
})
|
|
|
|
# Update progress
|
|
if callback:
|
|
progress = start_progress + (end_progress - start_progress) * \
|
|
(page_idx / len(self.page_images))
|
|
callback(progress, f"OCR page {page_idx + 1}")
|
|
```
|
|
|
|
## Related Files
|
|
|
|
- `/deepdoc/vision/ocr.py` - Main OCR implementation
|
|
- `/deepdoc/vision/operators.py` - Image preprocessing operators
|
|
- `/deepdoc/vision/postprocess.py` - DBNet and CTC postprocessing
|
|
- `/rag/res/deepdoc/` - Model files (det.onnx, rec.onnx, ocr.res)
|