update
This commit is contained in:
parent
dc9001b832
commit
ff40d37251
1 changed files with 0 additions and 68 deletions
|
|
@ -353,74 +353,6 @@ class RAGFlowPdfParser:
|
|||
self.boxes[i]["bottom"] += self.page_cum_height[self.boxes[i]["page_number"] - 1]
|
||||
|
||||
def _assign_column(self, boxes, zoomin=3):
|
||||
# if not boxes:
|
||||
# return boxes
|
||||
|
||||
# if all("col_id" in b for b in boxes):
|
||||
# return boxes
|
||||
|
||||
# by_page = defaultdict(list)
|
||||
# for b in boxes:
|
||||
# by_page[b["page_number"]].append(b)
|
||||
|
||||
# page_info = {} # pg -> dict(page_w, left_edge, cand_cols)
|
||||
# counter = Counter()
|
||||
|
||||
# for pg, bxs in by_page.items():
|
||||
# if not bxs:
|
||||
# page_info[pg] = {"page_w": 1.0, "left_edge": 0.0, "cand": 1}
|
||||
# counter[1] += 1
|
||||
# continue
|
||||
|
||||
# if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= pg:
|
||||
# page_w = self.page_images[pg - 1].size[0] / max(1, zoomin)
|
||||
# left_edge = 0.0
|
||||
# else:
|
||||
# xs0 = [box["x0"] for box in bxs]
|
||||
# xs1 = [box["x1"] for box in bxs]
|
||||
# left_edge = float(min(xs0))
|
||||
# page_w = max(1.0, float(max(xs1) - left_edge))
|
||||
|
||||
# widths = [max(1.0, (box["x1"] - box["x0"])) for box in bxs]
|
||||
# median_w = float(np.median(widths)) if widths else 1.0
|
||||
|
||||
# raw_cols = int(page_w / max(1.0, median_w))
|
||||
|
||||
# # cand = raw_cols if (raw_cols >= 2 and median_w < page_w / raw_cols * 0.8) else 1
|
||||
# cand = raw_cols
|
||||
|
||||
# page_info[pg] = {"page_w": page_w, "left_edge": left_edge, "cand": cand}
|
||||
# counter[cand] += 1
|
||||
|
||||
# logging.info(f"[Page {pg}] median_w={median_w:.2f}, page_w={page_w:.2f}, raw_cols={raw_cols}, cand={cand}")
|
||||
|
||||
# global_cols = counter.most_common(1)[0][0]
|
||||
# logging.info(f"Global column_num decided by majority: {global_cols}")
|
||||
|
||||
# for pg, bxs in by_page.items():
|
||||
# if not bxs:
|
||||
# continue
|
||||
|
||||
# page_w = page_info[pg]["page_w"]
|
||||
# left_edge = page_info[pg]["left_edge"]
|
||||
|
||||
# if global_cols == 1:
|
||||
# for box in bxs:
|
||||
# box["col_id"] = 0
|
||||
# continue
|
||||
|
||||
# for box in bxs:
|
||||
# w = box["x1"] - box["x0"]
|
||||
# if w >= 0.8 * page_w:
|
||||
# box["col_id"] = 0
|
||||
# continue
|
||||
# cx = 0.5 * (box["x0"] + box["x1"])
|
||||
# norm_cx = (cx - left_edge) / page_w
|
||||
# norm_cx = max(0.0, min(norm_cx, 0.999999))
|
||||
# box["col_id"] = int(min(global_cols - 1, norm_cx * global_cols))
|
||||
|
||||
# return boxes
|
||||
|
||||
if not boxes:
|
||||
return boxes
|
||||
if all("col_id" in b for b in boxes):
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue