Add detailed analysis documentation for RAGFlow's document processing pipeline: - README.md: Overview and architecture diagram - task_executor_analysis.md: Task execution pipeline details - pdf_parsing.md: PDF parsing with layout analysis - ocr_pipeline.md: PaddleOCR integration and text detection - layout_detection.md: Detectron2 layout recognition - table_extraction.md: Table structure recognition (TSR) - file_type_handlers.md: Handlers for all supported file types These documents explain the document processing flow for newcomers to understand how RAGFlow handles various file formats.
547 lines
18 KiB
Markdown
547 lines
18 KiB
Markdown
# Layout Detection - Detectron2 Layout Recognition
|
|
|
|
## Tổng Quan
|
|
|
|
Layout detection là bước quan trọng trong document processing pipeline, giúp phân loại các vùng nội dung trong document (text, title, table, figure, etc.). RAGFlow sử dụng Detectron2-based models và hỗ trợ nhiều backend khác nhau (ONNX, YOLOv10, Ascend NPU).
|
|
|
|
## File Location
|
|
```
|
|
/deepdoc/vision/layout_recognizer.py
|
|
```
|
|
|
|
## Architecture
|
|
|
|
```
|
|
LAYOUT DETECTION PIPELINE
|
|
|
|
Page Image
|
|
│
|
|
▼
|
|
┌─────────────────────────────────────────────────────────────────┐
|
|
│ LAYOUT RECOGNIZER │
|
|
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
│ │ Model Options: │ │
|
|
│ │ - ONNX (default): layout.onnx │ │
|
|
│ │ - YOLOv10: layout_yolov10.onnx │ │
|
|
│ │ - Ascend NPU: layout.om │ │
|
|
│ │ - TensorRT DLA: External service │ │
|
|
│ └─────────────────────────────────────────────────────────┘ │
|
|
└──────────────────────────┬──────────────────────────────────────┘
|
|
│
|
|
▼
|
|
┌─────────────────────────────────────────────────────────────────┐
|
|
│ DETECTED LAYOUTS │
|
|
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
│ │ Layout Types: │ │
|
|
│ │ • Text • Table • Header │ │
|
|
│ │ • Title • Table caption • Footer │ │
|
|
│ │ • Figure • Figure caption • Reference │ │
|
|
│ │ • Equation │ │
|
|
│ └─────────────────────────────────────────────────────────┘ │
|
|
└──────────────────────────┬──────────────────────────────────────┘
|
|
│
|
|
▼
|
|
┌─────────────────────────────────────────────────────────────────┐
|
|
│ TAG OCR BOXES │
|
|
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
│ │ For each OCR box: │ │
|
|
│ │ 1. Find overlapping layout region │ │
|
|
│ │ 2. Assign layout_type and layoutno │ │
|
|
│ │ 3. Filter garbage (headers, footers, page numbers) │ │
|
|
│ └─────────────────────────────────────────────────────────┘ │
|
|
└─────────────────────────────────────────────────────────────────┘
|
|
```
|
|
|
|
## Layout Types
|
|
|
|
| Type | Description | Xử Lý |
|
|
|------|-------------|-------|
|
|
| Text | Regular body text | Keep as content |
|
|
| Title | Section/document titles | Mark as heading |
|
|
| Figure | Images, diagrams, charts | Extract image + caption |
|
|
| Figure caption | Descriptions below figures | Associate with figure |
|
|
| Table | Data tables | Extract structure (TSR) |
|
|
| Table caption | Descriptions for tables | Associate with table |
|
|
| Header | Page headers | Filter (garbage) |
|
|
| Footer | Page footers | Filter (garbage) |
|
|
| Reference | Bibliography section | Filter (optional) |
|
|
| Equation | Mathematical formulas | Keep as figure |
|
|
|
|
## Core Implementation
|
|
|
|
### LayoutRecognizer Class
|
|
|
|
```python
|
|
class LayoutRecognizer(Recognizer):
|
|
"""
|
|
Base layout recognizer using ONNX model.
|
|
|
|
Inherits from Recognizer base class for model loading
|
|
and inference.
|
|
"""
|
|
|
|
labels = [
|
|
"_background_",
|
|
"Text",
|
|
"Title",
|
|
"Figure",
|
|
"Figure caption",
|
|
"Table",
|
|
"Table caption",
|
|
"Header",
|
|
"Footer",
|
|
"Reference",
|
|
"Equation",
|
|
]
|
|
|
|
def __init__(self, domain):
|
|
"""
|
|
Initialize with model from HuggingFace or local.
|
|
|
|
Args:
|
|
domain: Model domain name (e.g., "layout")
|
|
"""
|
|
model_dir = os.path.join(
|
|
get_project_base_directory(),
|
|
"rag/res/deepdoc"
|
|
)
|
|
super().__init__(self.labels, domain, model_dir)
|
|
|
|
# Layouts to filter out
|
|
self.garbage_layouts = ["footer", "header", "reference"]
|
|
|
|
# Optional TensorRT DLA client
|
|
if os.environ.get("TENSORRT_DLA_SVR"):
|
|
self.client = DLAClient(os.environ["TENSORRT_DLA_SVR"])
|
|
|
|
def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2,
|
|
batch_size=16, drop=True):
|
|
"""
|
|
Detect layouts and tag OCR boxes.
|
|
|
|
Args:
|
|
image_list: List of page images
|
|
ocr_res: OCR results per page
|
|
scale_factor: Image zoom factor (default 3)
|
|
thr: Confidence threshold
|
|
batch_size: Inference batch size
|
|
drop: Whether to drop garbage layouts
|
|
|
|
Returns:
|
|
- ocr_res: OCR boxes with layout tags
|
|
- page_layout: Layout regions per page
|
|
"""
|
|
```
|
|
|
|
### Layout Detection Process
|
|
|
|
```python
|
|
def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2,
|
|
batch_size=16, drop=True):
|
|
"""
|
|
Main layout detection and OCR tagging pipeline.
|
|
"""
|
|
# 1. Run layout detection
|
|
if self.client:
|
|
# Use TensorRT DLA service
|
|
layouts = self.client.predict(image_list)
|
|
else:
|
|
# Use local ONNX model
|
|
layouts = super().__call__(image_list, thr, batch_size)
|
|
|
|
boxes = []
|
|
garbages = {}
|
|
page_layout = []
|
|
|
|
# 2. Process each page
|
|
for pn, lts in enumerate(layouts):
|
|
bxs = ocr_res[pn]
|
|
|
|
# Convert layout format
|
|
lts = [{
|
|
"type": b["type"],
|
|
"score": float(b["score"]),
|
|
"x0": b["bbox"][0] / scale_factor,
|
|
"x1": b["bbox"][2] / scale_factor,
|
|
"top": b["bbox"][1] / scale_factor,
|
|
"bottom": b["bbox"][-1] / scale_factor,
|
|
"page_number": pn,
|
|
} for b in lts if float(b["score"]) >= 0.4 or
|
|
b["type"] not in self.garbage_layouts]
|
|
|
|
# Sort layouts by Y position
|
|
lts = self.sort_Y_firstly(lts, np.mean([
|
|
lt["bottom"] - lt["top"] for lt in lts
|
|
]) / 2)
|
|
|
|
# Cleanup overlapping layouts
|
|
lts = self.layouts_cleanup(bxs, lts)
|
|
page_layout.append(lts)
|
|
|
|
# 3. Tag OCR boxes with layout types
|
|
for lt_type in ["footer", "header", "reference",
|
|
"figure caption", "table caption",
|
|
"title", "table", "text", "figure", "equation"]:
|
|
self._findLayout(lt_type, bxs, lts, pn, image_list,
|
|
scale_factor, garbages, drop)
|
|
|
|
# 4. Add unvisited figures
|
|
for i, lt in enumerate([lt for lt in lts
|
|
if lt["type"] in ["figure", "equation"]]):
|
|
if lt.get("visited"):
|
|
continue
|
|
lt = deepcopy(lt)
|
|
del lt["type"]
|
|
lt["text"] = ""
|
|
lt["layout_type"] = "figure"
|
|
lt["layoutno"] = f"figure-{i}"
|
|
bxs.append(lt)
|
|
|
|
boxes.extend(bxs)
|
|
|
|
# 5. Remove duplicate garbage text
|
|
garbag_set = set()
|
|
for k in garbages.keys():
|
|
garbages[k] = Counter(garbages[k])
|
|
for g, c in garbages[k].items():
|
|
if c > 1: # Appears on multiple pages
|
|
garbag_set.add(g)
|
|
|
|
ocr_res = [b for b in boxes if b["text"].strip() not in garbag_set]
|
|
|
|
return ocr_res, page_layout
|
|
```
|
|
|
|
### Layout-OCR Box Matching
|
|
|
|
```python
|
|
def _findLayout(self, ty, bxs, lts, pn, image_list, scale_factor,
|
|
garbages, drop):
|
|
"""
|
|
Find matching layout for each OCR box.
|
|
|
|
Process:
|
|
1. Get all layouts of specified type
|
|
2. For each untagged OCR box:
|
|
- Check if it's garbage (page numbers, etc.)
|
|
- Find overlapping layout region
|
|
- Tag with layout type
|
|
- Filter garbage layouts if drop=True
|
|
"""
|
|
lts_of_type = [lt for lt in lts if lt["type"] == ty]
|
|
|
|
i = 0
|
|
while i < len(bxs):
|
|
# Skip already tagged boxes
|
|
if bxs[i].get("layout_type"):
|
|
i += 1
|
|
continue
|
|
|
|
# Check for garbage patterns
|
|
if self._is_garbage(bxs[i]):
|
|
bxs.pop(i)
|
|
continue
|
|
|
|
# Find overlapping layout
|
|
ii = self.find_overlapped_with_threshold(bxs[i], lts_of_type, thr=0.4)
|
|
|
|
if ii is None:
|
|
# No matching layout
|
|
bxs[i]["layout_type"] = ""
|
|
i += 1
|
|
continue
|
|
|
|
lts_of_type[ii]["visited"] = True
|
|
|
|
# Check if should keep garbage layout
|
|
keep_feats = [
|
|
lts_of_type[ii]["type"] == "footer" and
|
|
bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
|
|
lts_of_type[ii]["type"] == "header" and
|
|
bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
|
|
]
|
|
|
|
if drop and lts_of_type[ii]["type"] in self.garbage_layouts \
|
|
and not any(keep_feats):
|
|
# Collect garbage for deduplication
|
|
garbages.setdefault(lts_of_type[ii]["type"], []).append(
|
|
bxs[i]["text"]
|
|
)
|
|
bxs.pop(i)
|
|
continue
|
|
|
|
# Tag box with layout info
|
|
bxs[i]["layoutno"] = f"{ty}-{ii}"
|
|
bxs[i]["layout_type"] = lts_of_type[ii]["type"] \
|
|
if lts_of_type[ii]["type"] != "equation" else "figure"
|
|
i += 1
|
|
```
|
|
|
|
### Garbage Pattern Detection
|
|
|
|
```python
|
|
def _is_garbage(self, b):
|
|
"""
|
|
Detect garbage text patterns.
|
|
|
|
Patterns:
|
|
- Bullet points only: "•••"
|
|
- Page numbers: "1 / 10", "3 of 15"
|
|
- URLs: "http://..."
|
|
- Font encoding issues: "(cid:123)"
|
|
"""
|
|
patt = [
|
|
r"^•+$", # Bullet points
|
|
"^[0-9]{1,2} / ?[0-9]{1,2}$", # Page X / Y
|
|
r"^[0-9]{1,2} of [0-9]{1,2}$", # Page X of Y
|
|
"^http://[^ ]{12,}", # URLs
|
|
r"\(cid *: *[0-9]+ *\)", # Font encoding
|
|
]
|
|
return any([re.search(p, b["text"]) for p in patt])
|
|
```
|
|
|
|
## YOLOv10 Variant
|
|
|
|
```python
|
|
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
|
|
"""
|
|
YOLOv10-based layout recognizer.
|
|
|
|
Differences from base:
|
|
- Different label set
|
|
- Custom preprocessing (LetterBox resize)
|
|
- YOLO-specific postprocessing
|
|
"""
|
|
|
|
labels = [
|
|
"title", "Text", "Reference", "Figure",
|
|
"Figure caption", "Table", "Table caption",
|
|
"Table caption", "Equation", "Figure caption",
|
|
]
|
|
|
|
def preprocess(self, image_list):
|
|
"""
|
|
YOLOv10 preprocessing with letterbox resize.
|
|
"""
|
|
inputs = []
|
|
new_shape = self.input_shape
|
|
|
|
for img in image_list:
|
|
shape = img.shape[:2] # H, W
|
|
|
|
# Scale ratio
|
|
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
|
|
|
|
# Compute padding
|
|
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
|
|
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]
|
|
dw /= 2
|
|
dh /= 2
|
|
|
|
# Resize
|
|
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
|
|
|
|
# Pad
|
|
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
|
|
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
|
|
img = cv2.copyMakeBorder(
|
|
img, top, bottom, left, right,
|
|
cv2.BORDER_CONSTANT, value=(114, 114, 114)
|
|
)
|
|
|
|
# Normalize
|
|
img = img / 255.0
|
|
img = img.transpose(2, 0, 1)[np.newaxis, :].astype(np.float32)
|
|
|
|
inputs.append({
|
|
self.input_names[0]: img,
|
|
"scale_factor": [shape[1] / new_unpad[0],
|
|
shape[0] / new_unpad[1], dw, dh]
|
|
})
|
|
|
|
return inputs
|
|
|
|
def postprocess(self, boxes, inputs, thr):
|
|
"""
|
|
YOLO-specific postprocessing with NMS.
|
|
"""
|
|
thr = 0.08
|
|
boxes = np.squeeze(boxes)
|
|
|
|
# Filter by score
|
|
scores = boxes[:, 4]
|
|
boxes = boxes[scores > thr, :]
|
|
scores = scores[scores > thr]
|
|
|
|
if len(boxes) == 0:
|
|
return []
|
|
|
|
class_ids = boxes[:, -1].astype(int)
|
|
boxes = boxes[:, :4]
|
|
|
|
# Remove padding offset
|
|
boxes[:, 0] -= inputs["scale_factor"][2]
|
|
boxes[:, 2] -= inputs["scale_factor"][2]
|
|
boxes[:, 1] -= inputs["scale_factor"][3]
|
|
boxes[:, 3] -= inputs["scale_factor"][3]
|
|
|
|
# Scale to original image
|
|
input_shape = np.array([
|
|
inputs["scale_factor"][0], inputs["scale_factor"][1],
|
|
inputs["scale_factor"][0], inputs["scale_factor"][1]
|
|
])
|
|
boxes = np.multiply(boxes, input_shape, dtype=np.float32)
|
|
|
|
# NMS per class
|
|
indices = []
|
|
for class_id in np.unique(class_ids):
|
|
class_mask = class_ids == class_id
|
|
class_boxes = boxes[class_mask]
|
|
class_scores = scores[class_mask]
|
|
class_keep = nms(class_boxes, class_scores, 0.45)
|
|
indices.extend(np.where(class_mask)[0][class_keep])
|
|
|
|
return [{
|
|
"type": self.label_list[class_ids[i]].lower(),
|
|
"bbox": boxes[i].tolist(),
|
|
"score": float(scores[i])
|
|
} for i in indices]
|
|
```
|
|
|
|
## Ascend NPU Support
|
|
|
|
```python
|
|
class AscendLayoutRecognizer(Recognizer):
|
|
"""
|
|
Layout recognizer for Huawei Ascend NPU.
|
|
|
|
Uses .om (Offline Model) format and ais_bench
|
|
for inference.
|
|
"""
|
|
|
|
def __init__(self, domain):
|
|
from ais_bench.infer.interface import InferSession
|
|
|
|
model_dir = os.path.join(
|
|
get_project_base_directory(),
|
|
"rag/res/deepdoc"
|
|
)
|
|
model_file_path = os.path.join(model_dir, domain + ".om")
|
|
|
|
device_id = int(os.getenv("ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID", 0))
|
|
self.session = InferSession(
|
|
device_id=device_id,
|
|
model_path=model_file_path
|
|
)
|
|
```
|
|
|
|
## Layout Cleanup
|
|
|
|
```python
|
|
def layouts_cleanup(self, bxs, lts):
|
|
"""
|
|
Clean up overlapping layout regions.
|
|
|
|
Process:
|
|
1. Remove layouts that don't overlap with any OCR boxes
|
|
2. Merge overlapping layouts of same type
|
|
3. Adjust boundaries based on OCR boxes
|
|
"""
|
|
# Implementation in base Recognizer class
|
|
pass
|
|
|
|
def find_overlapped_with_threshold(self, box, layouts, thr=0.4):
|
|
"""
|
|
Find layout region that overlaps with box.
|
|
|
|
Args:
|
|
box: OCR box with x0, x1, top, bottom
|
|
layouts: List of layout regions
|
|
thr: Minimum overlap ratio (IoU)
|
|
|
|
Returns:
|
|
Index of best matching layout or None
|
|
"""
|
|
best_idx = None
|
|
best_overlap = 0
|
|
|
|
for idx, lt in enumerate(layouts):
|
|
# Calculate intersection
|
|
x_overlap = max(0, min(box["x1"], lt["x1"]) - max(box["x0"], lt["x0"]))
|
|
y_overlap = max(0, min(box["bottom"], lt["bottom"]) -
|
|
max(box["top"], lt["top"]))
|
|
intersection = x_overlap * y_overlap
|
|
|
|
# Calculate union
|
|
box_area = (box["x1"] - box["x0"]) * (box["bottom"] - box["top"])
|
|
lt_area = (lt["x1"] - lt["x0"]) * (lt["bottom"] - lt["top"])
|
|
union = box_area + lt_area - intersection
|
|
|
|
# IoU
|
|
iou = intersection / union if union > 0 else 0
|
|
|
|
if iou > thr and iou > best_overlap:
|
|
best_overlap = iou
|
|
best_idx = idx
|
|
|
|
return best_idx
|
|
```
|
|
|
|
## Configuration
|
|
|
|
```python
|
|
# Model selection
|
|
LAYOUT_RECOGNIZER_TYPE = "onnx" # onnx, yolov10, ascend
|
|
|
|
# Detection parameters
|
|
LAYOUT_DETECTION_PARAMS = {
|
|
"threshold": 0.2, # Confidence threshold
|
|
"batch_size": 16, # Inference batch size
|
|
"scale_factor": 3, # Image zoom factor
|
|
"drop_garbage": True, # Filter headers/footers
|
|
}
|
|
|
|
# TensorRT DLA (optional)
|
|
TENSORRT_DLA_SVR = None # "http://localhost:8080"
|
|
|
|
# Ascend NPU (optional)
|
|
ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID = 0
|
|
```
|
|
|
|
## Integration with PDF Parser
|
|
|
|
```python
|
|
# In pdf_parser.py
|
|
def _layouts_rec(self, zoomin):
|
|
"""
|
|
Run layout recognition on all pages.
|
|
|
|
Process:
|
|
1. Initialize LayoutRecognizer
|
|
2. Run detection on page images
|
|
3. Tag OCR boxes with layout types
|
|
4. Store layout information for later processing
|
|
"""
|
|
# Initialize recognizer
|
|
self.layout_recognizer = LayoutRecognizer("layout")
|
|
|
|
# Convert PIL images to numpy
|
|
images = [np.array(img) for img in self.page_images]
|
|
|
|
# Run layout detection and tagging
|
|
self.boxes, self.page_layout = self.layout_recognizer(
|
|
images,
|
|
[self.boxes], # OCR results
|
|
scale_factor=zoomin,
|
|
thr=0.2,
|
|
batch_size=16,
|
|
drop=True
|
|
)
|
|
```
|
|
|
|
## Related Files
|
|
|
|
- `/deepdoc/vision/layout_recognizer.py` - Layout detection
|
|
- `/deepdoc/vision/recognizer.py` - Base recognizer class
|
|
- `/deepdoc/vision/operators.py` - NMS and preprocessing
|
|
- `/rag/res/deepdoc/layout.onnx` - ONNX model
|