# Table Structure Recognition (TSR)

## Tong Quan

Table Structure Recognition (TSR) la component xu ly cau truc bang trong documents. No phan tich cac vung table da duoc detect boi Layout Recognizer de xac dinh rows, columns, cells va cau truc header. Ket qua duoc su dung de chuyen bang thanh HTML hoac natural language format.

## File Location

```
/deepdoc/vision/table_structure_recognizer.py
```

## Architecture

```
               TABLE STRUCTURE RECOGNITION PIPELINE

                  Table Image Region
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────┐
│                        TABLE TRANSFORMER                        │
│  ┌─────────────────────────────────────────────────────────┐    │
│  │  Model: tsr.onnx (TableTransformer)                     │    │
│  │  Detected Elements:                                     │    │
│  │  • table             • table column header              │    │
│  │  • table column      • table projected row header       │    │
│  │  • table row         • table spanning cell              │    │
│  └─────────────────────────────────────────────────────────┘    │
└──────────────────────────┬──────────────────────────────────────┘
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────┐
│                       STRUCTURE ALIGNMENT                       │
│  ┌─────────────────────────────────────────────────────────┐    │
│  │  • Align rows: left & right edges                       │    │
│  │  • Align columns: top & bottom edges                    │    │
│  │  • Handle spanning cells                                │    │
│  └─────────────────────────────────────────────────────────┘    │
└──────────────────────────┬──────────────────────────────────────┘
                           │
                           ▼
┌─────────────────────────────────────────────────────────────────┐
│                       TABLE CONSTRUCTION                        │
│  ┌─────────────────────────────────────────────────────────┐    │
│  │  • Map OCR boxes to cells                               │    │
│  │  • Identify header rows                                 │    │
│  │  • Calculate colspan/rowspan                            │    │
│  │  • Output: HTML table or Natural language               │    │
│  └─────────────────────────────────────────────────────────┘    │
└─────────────────────────────────────────────────────────────────┘
```

## TSR Labels

| Label | Description |
|-------|-------------|
| table | Overall table boundary |
| table column | Vertical column dividers |
| table row | Horizontal row dividers |
| table column header | Header row(s) at top |
| table projected row header | Row headers on left side |
| table spanning cell | Merged cells (colspan/rowspan) |

## Core Implementation

### TableStructureRecognizer Class

```python
class TableStructureRecognizer(Recognizer):
    """
    Recognize table structure (rows, columns, cells).

    Uses TableTransformer model to detect:
    - Row and column boundaries
    - Header regions
    - Spanning (merged) cells
    """

    labels = [
        "table",
        "table column",
        "table row",
        "table column header",
        "table projected row header",
        "table spanning cell",
    ]

    def __init__(self):
        model_dir = os.path.join(
            get_project_base_directory(),
            "rag/res/deepdoc"
        )
        super().__init__(self.labels, "tsr", model_dir)

    def __call__(self, images, thr=0.2):
        """
        Detect table structure in images.

        Args:
            images: List of cropped table images
            thr: Confidence threshold

        Returns:
            List of table structures with aligned rows/columns
        """
        # Run inference
        tbls = super().__call__(images, thr)

        res = []
        for tbl in tbls:
            # Convert to internal format
            lts = [{
                "label": b["type"],
                "score": b["score"],
                "x0": b["bbox"][0],
                "x1": b["bbox"][2],
                "top": b["bbox"][1],
                "bottom": b["bbox"][-1],
            } for b in tbl]

            if not lts:
                continue

            # Align row boundaries (left & right)
            lts = self._align_rows(lts)

            # Align column boundaries (top & bottom)
            lts = self._align_columns(lts)

            res.append(lts)

        return res
```

### Row/Column Alignment

```python
def _align_rows(self, lts):
    """
    Align row boundaries to consistent left/right edges.

    Process:
    1. Find all row and header elements
    2. Calculate mean left/right position
    3.
       Adjust elements to align
    """
    # Get row elements
    row_elements = [b for b in lts
                    if b["label"].find("row") > 0 or b["label"].find("header") > 0]

    if not row_elements:
        return lts

    # Calculate alignment positions
    left_positions = [b["x0"] for b in row_elements]
    right_positions = [b["x1"] for b in row_elements]

    left = np.mean(left_positions) if len(left_positions) > 4 \
        else np.min(left_positions)
    right = np.mean(right_positions) if len(right_positions) > 4 \
        else np.max(right_positions)

    # Align rows
    for b in lts:
        if b["label"].find("row") > 0 or b["label"].find("header") > 0:
            if b["x0"] > left:
                b["x0"] = left
            if b["x1"] < right:
                b["x1"] = right

    return lts


def _align_columns(self, lts):
    """
    Align column boundaries to consistent top/bottom edges.
    """
    # Get column elements
    col_elements = [b for b in lts if b["label"] == "table column"]

    if not col_elements:
        return lts

    # Calculate alignment positions
    top_positions = [b["top"] for b in col_elements]
    bottom_positions = [b["bottom"] for b in col_elements]

    top = np.median(top_positions) if len(top_positions) > 4 \
        else np.min(top_positions)
    bottom = np.median(bottom_positions) if len(bottom_positions) > 4 \
        else np.max(bottom_positions)

    # Align columns
    for b in lts:
        if b["label"] == "table column":
            if b["top"] > top:
                b["top"] = top
            if b["bottom"] < bottom:
                b["bottom"] = bottom

    return lts
```

### Table Construction

```python
@staticmethod
def construct_table(boxes, is_english=False, html=True, **kwargs):
    """
    Construct table from OCR boxes with structure info.

    Args:
        boxes: OCR boxes with row/column assignments
        is_english: Language setting
        html: Output HTML (True) or natural language (False)

    Returns:
        HTML string or list of natural language descriptions
    """
    # 1. Extract and remove caption
    cap = ""
    i = 0
    while i < len(boxes):
        if TableStructureRecognizer.is_caption(boxes[i]):
            cap += boxes[i]["text"]
            boxes.pop(i)
        else:
            i += 1

    if not boxes:
        return []

    # 2. Classify block types
    for b in boxes:
        b["btype"] = TableStructureRecognizer.blockType(b)

    max_type = Counter([b["btype"] for b in boxes]).most_common(1)[0][0]

    # 3. Sort and assign row numbers
    rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
    rowh = np.min(rowh) if rowh else 0

    boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
    boxes[0]["rn"] = 0
    rows = [[boxes[0]]]
    btm = boxes[0]["bottom"]

    for b in boxes[1:]:
        b["rn"] = len(rows) - 1
        lst_r = rows[-1]

        # Check if new row
        if lst_r[-1].get("R", "") != b.get("R", "") or \
                (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")):
            btm = b["bottom"]
            b["rn"] += 1
            rows.append([b])
            continue

        btm = (btm + b["bottom"]) / 2.0
        rows[-1].append(b)

    # 4. Sort and assign column numbers
    colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
    colwm = np.min(colwm) if colwm else 0

    boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
    boxes[0]["cn"] = 0
    cols = [[boxes[0]]]
    right = boxes[0]["x1"]

    for b in boxes[1:]:
        b["cn"] = len(cols) - 1
        lst_c = cols[-1]

        # Check if new column
        if b["x0"] >= right and \
                lst_c[-1].get("C", "-1") != b.get("C", "-2"):
            right = b["x1"]
            b["cn"] += 1
            cols.append([b])
            continue

        right = (right + b["x1"]) / 2.0
        cols[-1].append(b)

    # 5. Build table matrix
    tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
    for b in boxes:
        tbl[b["rn"]][b["cn"]].append(b)

    # 6. Identify header rows
    hdset = set()
    for i in range(len(tbl)):
        cnt, h = 0, 0
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            cnt += 1
            if any([a.get("H") for a in arr]) or \
                    (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                h += 1
        if h / cnt > 0.5:
            hdset.add(i)

    # 7. Calculate spans
    tbl = TableStructureRecognizer._cal_spans(boxes, rows, cols, tbl, html)

    # 8. Output
    if html:
        return TableStructureRecognizer._html_table(cap, hdset, tbl)
    else:
        return TableStructureRecognizer._desc_table(cap, hdset, tbl, is_english)
```

### Block Type Classification

```python
@staticmethod
def blockType(b):
    """
    Classify cell content type.
    Types:
    - Dt: Date (2024-01-01, 2024年1月)
    - Nu: Number (123, 45.6, -78%)
    - Ca: Code/ID (ABC-123, XYZ_456)
    - En: English text
    - NE: Number + English mix
    - Sg: Single character
    - Nr: Person name
    - Tx: Short text (4-11 tokens)
    - Lx: Long text (12 or more tokens)
    - Ot: Other
    """
    patt = [
        # Date patterns
        ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
        (r"^(20|19)[0-9]{2}年$", "Dt"),
        (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
        ("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
        (r"^第*[一二三四1-4]季度$", "Dt"),
        (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
        (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
        # Number patterns
        ("^[0-9.,+%/ -]+$", "Nu"),
        # Code patterns
        (r"^[0-9A-Z/\._~-]+$", "Ca"),
        # English text
        (r"^[A-Z]*[a-z' -]+$", "En"),
        # Number + English mix
        (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
        # Single character
        (r"^.{1}$", "Sg"),
    ]

    for p, n in patt:
        if re.search(p, b["text"].strip()):
            return n

    # Tokenize and classify
    tks = [t for t in rag_tokenizer.tokenize(b["text"]).split() if len(t) > 1]

    if len(tks) > 3:
        return "Tx" if len(tks) < 12 else "Lx"

    if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
        return "Nr"

    return "Ot"
```

### HTML Output

```python
@staticmethod
def _html_table(cap, hdset, tbl):
    """
    Convert table to HTML format.

    Features:
    - Caption support
    - Header rows (
| " if i not in hdset else " | " continue # Get cell text h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2, 10) txt = " ".join([c["text"] for c in Recognizer.sort_Y_firstly(arr, h)]) txts.append(txt) # Build span attributes sp = "" if arr[0].get("colspan"): sp = f"colspan={arr[0]['colspan']}" if arr[0].get("rowspan"): sp += f" rowspan={arr[0]['rowspan']}" # Add cell if i in hdset: row += f" | {txt} | " else: row += f"{txt} | " if row != "
|---|---|---|---|