# Layout Detection - Detectron2 Layout Recognition
## Overview

Layout detection is a key step in the document processing pipeline: it classifies the content regions of a document (text, title, table, figure, etc.). RAGFlow uses Detectron2-based models and supports several backends (ONNX, YOLOv10, Ascend NPU).
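As a rough usage sketch (the variables `page_images` and `ocr_results` are placeholders, not RAGFlow fixtures), the recognizer takes page images plus per-page OCR boxes and returns tagged boxes and per-page layout regions:

```python
# Hedged usage sketch: assumes the deepdoc package is importable and the
# model files exist under rag/res/deepdoc. `page_images` (PIL images) and
# `ocr_results` (per-page OCR box lists) are placeholder inputs.
from deepdoc.vision.layout_recognizer import LayoutRecognizer

recognizer = LayoutRecognizer("layout")
tagged_boxes, page_layouts = recognizer(
    page_images,        # list of page images
    ocr_results,        # OCR boxes, one list per page
    scale_factor=3,     # zoom factor used when rendering pages
    thr=0.2,            # confidence threshold
    drop=True,          # filter headers/footers/references
)
```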
## File Location

`/deepdoc/vision/layout_recognizer.py`
## Architecture
```text
                     LAYOUT DETECTION PIPELINE

                            Page Image
                                 │
                                 ▼
┌─────────────────────────────────────────────────────────────────┐
│                        LAYOUT RECOGNIZER                        │
│  ┌───────────────────────────────────────────────────────────┐  │
│  │ Model Options:                                            │  │
│  │   - ONNX (default): layout.onnx                           │  │
│  │   - YOLOv10:        layout_yolov10.onnx                   │  │
│  │   - Ascend NPU:     layout.om                             │  │
│  │   - TensorRT DLA:   External service                      │  │
│  └───────────────────────────────────────────────────────────┘  │
└────────────────────────────────┬────────────────────────────────┘
                                 │
                                 ▼
┌─────────────────────────────────────────────────────────────────┐
│                        DETECTED LAYOUTS                         │
│  ┌───────────────────────────────────────────────────────────┐  │
│  │ Layout Types:                                             │  │
│  │   • Text      • Table           • Header                  │  │
│  │   • Title     • Table caption   • Footer                  │  │
│  │   • Figure    • Figure caption  • Reference               │  │
│  │   • Equation                                              │  │
│  └───────────────────────────────────────────────────────────┘  │
└────────────────────────────────┬────────────────────────────────┘
                                 │
                                 ▼
┌─────────────────────────────────────────────────────────────────┐
│                          TAG OCR BOXES                          │
│  ┌───────────────────────────────────────────────────────────┐  │
│  │ For each OCR box:                                         │  │
│  │   1. Find overlapping layout region                       │  │
│  │   2. Assign layout_type and layoutno                      │  │
│  │   3. Filter garbage (headers, footers, page numbers)      │  │
│  └───────────────────────────────────────────────────────────┘  │
└─────────────────────────────────────────────────────────────────┘
```
## Layout Types

| Type | Description | Handling |
|---|---|---|
| Text | Regular body text | Keep as content |
| Title | Section/document titles | Mark as heading |
| Figure | Images, diagrams, charts | Extract image + caption |
| Figure caption | Descriptions below figures | Associate with figure |
| Table | Data tables | Extract structure (TSR) |
| Table caption | Descriptions for tables | Associate with table |
| Header | Page headers | Filter (garbage) |
| Footer | Page footers | Filter (garbage) |
| Reference | Bibliography section | Filter (optional) |
| Equation | Mathematical formulas | Keep as figure |
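To make the table concrete, here is a hypothetical dispatch function (not RAGFlow code) that summarizes the handling policy:

```python
# Hypothetical sketch of the handling policy above; `handle` and its
# return labels are illustrative, not part of RAGFlow.
GARBAGE = {"header", "footer"}   # always filtered when drop=True
OPTIONAL = {"reference"}         # filtering is optional

def handle(layout_type: str, drop: bool = True) -> str:
    lt = layout_type.lower()
    if drop and lt in GARBAGE | OPTIONAL:
        return "filter"
    if lt in {"figure", "equation"}:
        return "extract image (+ caption)"
    if lt == "table":
        return "run table structure recognition (TSR)"
    if lt in {"figure caption", "table caption"}:
        return "associate with parent figure/table"
    if lt == "title":
        return "mark as heading"
    return "keep as content"    # plain text

print(handle("Table"))   # run table structure recognition (TSR)
print(handle("Header"))  # filter
```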
## Core Implementation

### LayoutRecognizer Class
```python
# (Imports elided; see /deepdoc/vision/layout_recognizer.py.)

class LayoutRecognizer(Recognizer):
    """
    Base layout recognizer using an ONNX model.

    Inherits from the Recognizer base class for model loading
    and inference.
    """

    labels = [
        "_background_",
        "Text",
        "Title",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Header",
        "Footer",
        "Reference",
        "Equation",
    ]

    def __init__(self, domain):
        """
        Initialize with a model from HuggingFace or a local directory.

        Args:
            domain: Model domain name (e.g., "layout")
        """
        model_dir = os.path.join(
            get_project_base_directory(),
            "rag/res/deepdoc"
        )
        super().__init__(self.labels, domain, model_dir)

        # Layouts to filter out
        self.garbage_layouts = ["footer", "header", "reference"]

        # Optional TensorRT DLA client; default to None so the
        # `if self.client:` check in __call__ is always safe
        self.client = None
        if os.environ.get("TENSORRT_DLA_SVR"):
            self.client = DLAClient(os.environ["TENSORRT_DLA_SVR"])

    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2,
                 batch_size=16, drop=True):
        """
        Detect layouts and tag OCR boxes.

        Args:
            image_list: List of page images
            ocr_res: OCR results per page
            scale_factor: Image zoom factor (default 3)
            thr: Confidence threshold
            batch_size: Inference batch size
            drop: Whether to drop garbage layouts

        Returns:
            - ocr_res: OCR boxes with layout tags
            - page_layout: Layout regions per page
        """
        ...  # full body shown in the next section
```
### Layout Detection Process
```python
def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2,
             batch_size=16, drop=True):
    """
    Main layout detection and OCR tagging pipeline.
    """
    # 1. Run layout detection
    if self.client:
        # Use the TensorRT DLA service
        layouts = self.client.predict(image_list)
    else:
        # Use the local ONNX model
        layouts = super().__call__(image_list, thr, batch_size)

    boxes = []
    garbages = {}
    page_layout = []

    # 2. Process each page
    for pn, lts in enumerate(layouts):
        bxs = ocr_res[pn]

        # Convert layout format
        lts = [{
            "type": b["type"],
            "score": float(b["score"]),
            "x0": b["bbox"][0] / scale_factor,
            "x1": b["bbox"][2] / scale_factor,
            "top": b["bbox"][1] / scale_factor,
            "bottom": b["bbox"][-1] / scale_factor,
            "page_number": pn,
        } for b in lts if float(b["score"]) >= 0.4
            or b["type"] not in self.garbage_layouts]

        # Sort layouts top-to-bottom by Y position
        lts = self.sort_Y_firstly(lts, np.mean([
            lt["bottom"] - lt["top"] for lt in lts
        ]) / 2)

        # Clean up overlapping layouts
        lts = self.layouts_cleanup(bxs, lts)
        page_layout.append(lts)

        # 3. Tag OCR boxes with layout types
        for lt_type in ["footer", "header", "reference",
                        "figure caption", "table caption",
                        "title", "table", "text", "figure", "equation"]:
            self._findLayout(lt_type, bxs, lts, pn, image_list,
                             scale_factor, garbages, drop)

        # 4. Add figures/equations no OCR box was matched to
        for i, lt in enumerate([lt for lt in lts
                                if lt["type"] in ["figure", "equation"]]):
            if lt.get("visited"):
                continue
            lt = deepcopy(lt)
            del lt["type"]
            lt["text"] = ""
            lt["layout_type"] = "figure"
            lt["layoutno"] = f"figure-{i}"
            bxs.append(lt)

        boxes.extend(bxs)

    # 5. Remove garbage text that repeats across pages
    garbage_set = set()
    for k in garbages.keys():
        garbages[k] = Counter(garbages[k])
        for g, c in garbages[k].items():
            if c > 1:  # Appears on multiple pages
                garbage_set.add(g)

    ocr_res = [b for b in boxes if b["text"].strip() not in garbage_set]
    return ocr_res, page_layout
```
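Step 5 is a simple frequency heuristic: header/footer text that repeats verbatim on multiple pages is treated as boilerplate, while one-off strings survive. A standalone sketch with made-up data:

```python
from collections import Counter

# Made-up garbage collection: "Confidential" was seen on two pages,
# "Figure note" only on one.
garbages = {"footer": ["Confidential", "Confidential", "Figure note"]}

garbage_set = set()
for texts in garbages.values():
    for text, count in Counter(texts).items():
        if count > 1:  # appears on multiple pages -> boilerplate
            garbage_set.add(text)

print(garbage_set)  # {'Confidential'}  ("Figure note" is kept)
```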
### Layout-OCR Box Matching
```python
def _findLayout(self, ty, bxs, lts, pn, image_list, scale_factor,
                garbages, drop):
    """
    Find the matching layout region for each OCR box.

    Process:
    1. Get all layouts of the specified type
    2. For each untagged OCR box:
       - Check if it's garbage (page numbers, etc.)
       - Find the overlapping layout region
       - Tag it with the layout type
       - Filter garbage layouts if drop=True
    """
    lts_of_type = [lt for lt in lts if lt["type"] == ty]

    i = 0
    while i < len(bxs):
        # Skip already tagged boxes
        if bxs[i].get("layout_type"):
            i += 1
            continue

        # Check for garbage patterns
        if self._is_garbage(bxs[i]):
            bxs.pop(i)
            continue

        # Find overlapping layout
        ii = self.find_overlapped_with_threshold(bxs[i], lts_of_type, thr=0.4)
        if ii is None:
            # No matching layout
            bxs[i]["layout_type"] = ""
            i += 1
            continue

        lts_of_type[ii]["visited"] = True

        # Keep a "footer" that sits well above the bottom 10% of the page,
        # or a "header" well below the top 10%, despite its garbage type
        keep_feats = [
            lts_of_type[ii]["type"] == "footer" and
            bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
            lts_of_type[ii]["type"] == "header" and
            bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
        ]

        if drop and lts_of_type[ii]["type"] in self.garbage_layouts \
                and not any(keep_feats):
            # Collect garbage text for cross-page deduplication
            garbages.setdefault(lts_of_type[ii]["type"], []).append(
                bxs[i]["text"]
            )
            bxs.pop(i)
            continue

        # Tag the box with layout info
        bxs[i]["layoutno"] = f"{ty}-{ii}"
        bxs[i]["layout_type"] = lts_of_type[ii]["type"] \
            if lts_of_type[ii]["type"] != "equation" else "figure"
        i += 1
```
### Garbage Pattern Detection
```python
def _is_garbage(self, b):
    """
    Detect garbage text patterns.

    Patterns:
    - Bullet points only: "•••"
    - Page numbers: "1 / 10", "3 of 15"
    - URLs: "http://..."
    - Font encoding issues: "(cid:123)"
    """
    patt = [
        r"^•+$",                        # Bullet points
        r"^[0-9]{1,2} / ?[0-9]{1,2}$",  # Page X / Y
        r"^[0-9]{1,2} of [0-9]{1,2}$",  # Page X of Y
        r"^http://[^ ]{12,}",           # URLs
        r"\(cid *: *[0-9]+ *\)",        # Font-encoding artifacts
    ]
    return any(re.search(p, b["text"]) for p in patt)
```
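A quick self-contained check of these patterns against sample strings:

```python
import re

patt = [
    r"^•+$",
    r"^[0-9]{1,2} / ?[0-9]{1,2}$",
    r"^[0-9]{1,2} of [0-9]{1,2}$",
    r"^http://[^ ]{12,}",
    r"\(cid *: *[0-9]+ *\)",
]

samples = ["•••", "3 / 10", "3 of 15",
           "http://example.com/some/path", "(cid: 123)", "Chapter 3"]
for s in samples:
    print(f"{s!r}: garbage={any(re.search(p, s) for p in patt)}")
# Everything except 'Chapter 3' is flagged as garbage.
```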
## YOLOv10 Variant
```python
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
    """
    YOLOv10-based layout recognizer.

    Differences from the base class:
    - Different label set
    - Custom preprocessing (letterbox resize)
    - YOLO-specific postprocessing
    """

    # Note: the duplicated caption entries mirror the model's
    # class-id ordering.
    labels = [
        "title", "Text", "Reference", "Figure",
        "Figure caption", "Table", "Table caption",
        "Table caption", "Equation", "Figure caption",
    ]

    def preprocess(self, image_list):
        """
        YOLOv10 preprocessing with letterbox resize: scale the image to
        fit the model input, then pad the remainder with gray (114).
        """
        inputs = []
        new_shape = self.input_shape
        for img in image_list:
            shape = img.shape[:2]  # H, W

            # Scale ratio (fit inside the model input)
            r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])

            # Compute padding
            new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
            dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]
            dw /= 2
            dh /= 2

            # Resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

            # Pad
            top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
            left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
            img = cv2.copyMakeBorder(
                img, top, bottom, left, right,
                cv2.BORDER_CONSTANT, value=(114, 114, 114)
            )

            # Normalize to [0, 1], HWC -> NCHW
            img = img / 255.0
            img = img.transpose(2, 0, 1)[np.newaxis, :].astype(np.float32)

            inputs.append({
                self.input_names[0]: img,
                "scale_factor": [shape[1] / new_unpad[0],
                                 shape[0] / new_unpad[1], dw, dh]
            })
        return inputs
```
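Worked numbers for the letterbox math, assuming a hypothetical 800×1000 (H×W) page and a 640×640 model input:

```python
# Hypothetical sizes to trace preprocess(): 800x1000 page, 640x640 input.
H, W = 800, 1000
new_h, new_w = 640, 640

r = min(new_h / H, new_w / W)                   # min(0.8, 0.64) = 0.64
unpad_w, unpad_h = round(W * r), round(H * r)   # 640, 512
dw = (new_w - unpad_w) / 2                      # 0.0  -> no side padding
dh = (new_h - unpad_h) / 2                      # 64.0 -> 64 px top & bottom

# scale_factor recorded for postprocess(): [W/unpad_w, H/unpad_h, dw, dh]
print([W / unpad_w, H / unpad_h, dw, dh])       # [1.5625, 1.5625, 0.0, 64.0]
```

Detections are later mapped back by subtracting `(dw, dh)` and multiplying by these ratios, which is exactly what `postprocess()` below does.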
```python
def postprocess(self, boxes, inputs, thr):
    """
    YOLO-specific postprocessing with per-class NMS.
    """
    # Note: the threshold argument is overridden with a fixed 0.08 here
    thr = 0.08
    boxes = np.squeeze(boxes)

    # Filter by score
    scores = boxes[:, 4]
    boxes = boxes[scores > thr, :]
    scores = scores[scores > thr]
    if len(boxes) == 0:
        return []
    class_ids = boxes[:, -1].astype(int)
    boxes = boxes[:, :4]

    # Remove the letterbox padding offset
    boxes[:, 0] -= inputs["scale_factor"][2]
    boxes[:, 2] -= inputs["scale_factor"][2]
    boxes[:, 1] -= inputs["scale_factor"][3]
    boxes[:, 3] -= inputs["scale_factor"][3]

    # Scale back to original image coordinates
    input_shape = np.array([
        inputs["scale_factor"][0], inputs["scale_factor"][1],
        inputs["scale_factor"][0], inputs["scale_factor"][1]
    ])
    boxes = np.multiply(boxes, input_shape, dtype=np.float32)

    # NMS per class
    indices = []
    for class_id in np.unique(class_ids):
        class_mask = class_ids == class_id
        class_boxes = boxes[class_mask]
        class_scores = scores[class_mask]
        class_keep = nms(class_boxes, class_scores, 0.45)
        indices.extend(np.where(class_mask)[0][class_keep])

    return [{
        "type": self.label_list[class_ids[i]].lower(),
        "bbox": boxes[i].tolist(),
        "score": float(scores[i])
    } for i in indices]
```
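The `nms` helper comes from `/deepdoc/vision/operators.py`. A minimal sketch of the standard greedy IoU-based NMS it corresponds to (not the exact upstream implementation):

```python
import numpy as np

def nms(boxes: np.ndarray, scores: np.ndarray, iou_thr: float):
    """Greedy NMS sketch: keep the best box, drop boxes overlapping it."""
    x0, y0, x1, y1 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x1 - x0) * (y1 - y0)
    order = scores.argsort()[::-1]          # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        # IoU of the kept box against all remaining boxes
        xx0 = np.maximum(x0[i], x0[order[1:]])
        yy0 = np.maximum(y0[i], y0[order[1:]])
        xx1 = np.minimum(x1[i], x1[order[1:]])
        yy1 = np.minimum(y1[i], y1[order[1:]])
        inter = np.maximum(0, xx1 - xx0) * np.maximum(0, yy1 - yy0)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= iou_thr]   # drop overlapping boxes
    return keep
```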
## Ascend NPU Support
```python
class AscendLayoutRecognizer(Recognizer):
    """
    Layout recognizer for Huawei Ascend NPU.

    Uses the .om (Offline Model) format and ais_bench for inference.
    """

    def __init__(self, domain):
        from ais_bench.infer.interface import InferSession

        model_dir = os.path.join(
            get_project_base_directory(),
            "rag/res/deepdoc"
        )
        model_file_path = os.path.join(model_dir, domain + ".om")
        device_id = int(os.getenv("ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID", 0))
        self.session = InferSession(
            device_id=device_id,
            model_path=model_file_path
        )
```
## Layout Cleanup
```python
def layouts_cleanup(self, bxs, lts):
    """
    Clean up overlapping layout regions.

    Process:
    1. Remove layouts that don't overlap with any OCR boxes
    2. Merge overlapping layouts of the same type
    3. Adjust boundaries based on OCR boxes
    """
    # Implementation in the base Recognizer class
    ...

def find_overlapped_with_threshold(self, box, layouts, thr=0.4):
    """
    Find the layout region that overlaps with a box.

    Args:
        box: OCR box with x0, x1, top, bottom
        layouts: List of layout regions
        thr: Minimum overlap ratio (IoU)

    Returns:
        Index of the best matching layout, or None
    """
    best_idx = None
    best_overlap = 0
    for idx, lt in enumerate(layouts):
        # Intersection
        x_overlap = max(0, min(box["x1"], lt["x1"]) - max(box["x0"], lt["x0"]))
        y_overlap = max(0, min(box["bottom"], lt["bottom"]) -
                        max(box["top"], lt["top"]))
        intersection = x_overlap * y_overlap

        # Union
        box_area = (box["x1"] - box["x0"]) * (box["bottom"] - box["top"])
        lt_area = (lt["x1"] - lt["x0"]) * (lt["bottom"] - lt["top"])
        union = box_area + lt_area - intersection

        # IoU
        iou = intersection / union if union > 0 else 0
        if iou > thr and iou > best_overlap:
            best_overlap = iou
            best_idx = idx
    return best_idx
```
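A quick numeric check with made-up coordinates: an OCR box sitting fully inside a slightly larger layout region clears the 0.4 threshold.

```python
# Made-up coordinates: OCR box fully inside a layout region.
box = {"x0": 10, "x1": 110, "top": 20, "bottom": 40}   # area 2000
lt  = {"x0": 0,  "x1": 120, "top": 15, "bottom": 45}   # area 3600

x_ov = max(0, min(box["x1"], lt["x1"]) - max(box["x0"], lt["x0"]))            # 100
y_ov = max(0, min(box["bottom"], lt["bottom"]) - max(box["top"], lt["top"]))  # 20
inter = x_ov * y_ov                      # 2000
union = 2000 + 3600 - inter              # 3600
print(inter / union)                     # ~0.556 > 0.4 -> match
```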
## Configuration
```python
# Model selection
LAYOUT_RECOGNIZER_TYPE = "onnx"  # onnx, yolov10, ascend

# Detection parameters
LAYOUT_DETECTION_PARAMS = {
    "threshold": 0.2,      # Confidence threshold
    "batch_size": 16,      # Inference batch size
    "scale_factor": 3,     # Image zoom factor
    "drop_garbage": True,  # Filter headers/footers
}

# TensorRT DLA (optional)
TENSORRT_DLA_SVR = None  # e.g. "http://localhost:8080"

# Ascend NPU (optional)
ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID = 0
```
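A sketch of how backend selection could be wired from these settings (the dispatch function below is illustrative; only the class names come from this document):

```python
import os

# Illustrative backend dispatch; make_layout_recognizer is hypothetical.
def make_layout_recognizer(domain: str = "layout"):
    kind = os.getenv("LAYOUT_RECOGNIZER_TYPE", "onnx").lower()
    if kind == "yolov10":
        return LayoutRecognizer4YOLOv10(domain)
    if kind == "ascend":
        return AscendLayoutRecognizer(domain)
    # Default ONNX path; TENSORRT_DLA_SVR, if set, is picked up
    # inside LayoutRecognizer.__init__
    return LayoutRecognizer(domain)
```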
## Integration with PDF Parser
```python
# In pdf_parser.py
def _layouts_rec(self, zoomin):
    """
    Run layout recognition on all pages.

    Process:
    1. Initialize LayoutRecognizer
    2. Run detection on the page images
    3. Tag OCR boxes with layout types
    4. Store layout information for later processing
    """
    # Initialize the recognizer
    self.layout_recognizer = LayoutRecognizer("layout")

    # Convert PIL images to numpy
    images = [np.array(img) for img in self.page_images]

    # Run layout detection and tagging
    self.boxes, self.page_layout = self.layout_recognizer(
        images,
        self.boxes,  # per-page OCR results
        scale_factor=zoomin,
        thr=0.2,
        batch_size=16,
        drop=True
    )
```
## Related Files

- `/deepdoc/vision/layout_recognizer.py` - Layout detection
- `/deepdoc/vision/recognizer.py` - Base recognizer class
- `/deepdoc/vision/operators.py` - NMS and preprocessing
- `/rag/res/deepdoc/layout.onnx` - ONNX model