This commit is contained in:
buua436 2025-11-20 17:46:19 +08:00
parent dc9001b832
commit ff40d37251

View file

@@ -353,74 +353,6 @@ class RAGFlowPdfParser:
self.boxes[i]["bottom"] += self.page_cum_height[self.boxes[i]["page_number"] - 1]
def _assign_column(self, boxes, zoomin=3):
# if not boxes:
# return boxes
# if all("col_id" in b for b in boxes):
# return boxes
# by_page = defaultdict(list)
# for b in boxes:
# by_page[b["page_number"]].append(b)
# page_info = {} # pg -> dict(page_w, left_edge, cand_cols)
# counter = Counter()
# for pg, bxs in by_page.items():
# if not bxs:
# page_info[pg] = {"page_w": 1.0, "left_edge": 0.0, "cand": 1}
# counter[1] += 1
# continue
# if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= pg:
# page_w = self.page_images[pg - 1].size[0] / max(1, zoomin)
# left_edge = 0.0
# else:
# xs0 = [box["x0"] for box in bxs]
# xs1 = [box["x1"] for box in bxs]
# left_edge = float(min(xs0))
# page_w = max(1.0, float(max(xs1) - left_edge))
# widths = [max(1.0, (box["x1"] - box["x0"])) for box in bxs]
# median_w = float(np.median(widths)) if widths else 1.0
# raw_cols = int(page_w / max(1.0, median_w))
# # cand = raw_cols if (raw_cols >= 2 and median_w < page_w / raw_cols * 0.8) else 1
# cand = raw_cols
# page_info[pg] = {"page_w": page_w, "left_edge": left_edge, "cand": cand}
# counter[cand] += 1
# logging.info(f"[Page {pg}] median_w={median_w:.2f}, page_w={page_w:.2f}, raw_cols={raw_cols}, cand={cand}")
# global_cols = counter.most_common(1)[0][0]
# logging.info(f"Global column_num decided by majority: {global_cols}")
# for pg, bxs in by_page.items():
# if not bxs:
# continue
# page_w = page_info[pg]["page_w"]
# left_edge = page_info[pg]["left_edge"]
# if global_cols == 1:
# for box in bxs:
# box["col_id"] = 0
# continue
# for box in bxs:
# w = box["x1"] - box["x0"]
# if w >= 0.8 * page_w:
# box["col_id"] = 0
# continue
# cx = 0.5 * (box["x0"] + box["x1"])
# norm_cx = (cx - left_edge) / page_w
# norm_cx = max(0.0, min(norm_cx, 0.999999))
# box["col_id"] = int(min(global_cols - 1, norm_cx * global_cols))
# return boxes
if not boxes:
return boxes
if all("col_id" in b for b in boxes):