diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 9f22cc18c..95ab40f97 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -353,74 +353,6 @@ class RAGFlowPdfParser: self.boxes[i]["bottom"] += self.page_cum_height[self.boxes[i]["page_number"] - 1] def _assign_column(self, boxes, zoomin=3): - # if not boxes: - # return boxes - - # if all("col_id" in b for b in boxes): - # return boxes - - # by_page = defaultdict(list) - # for b in boxes: - # by_page[b["page_number"]].append(b) - - # page_info = {} # pg -> dict(page_w, left_edge, cand_cols) - # counter = Counter() - - # for pg, bxs in by_page.items(): - # if not bxs: - # page_info[pg] = {"page_w": 1.0, "left_edge": 0.0, "cand": 1} - # counter[1] += 1 - # continue - - # if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= pg: - # page_w = self.page_images[pg - 1].size[0] / max(1, zoomin) - # left_edge = 0.0 - # else: - # xs0 = [box["x0"] for box in bxs] - # xs1 = [box["x1"] for box in bxs] - # left_edge = float(min(xs0)) - # page_w = max(1.0, float(max(xs1) - left_edge)) - - # widths = [max(1.0, (box["x1"] - box["x0"])) for box in bxs] - # median_w = float(np.median(widths)) if widths else 1.0 - - # raw_cols = int(page_w / max(1.0, median_w)) - - # # cand = raw_cols if (raw_cols >= 2 and median_w < page_w / raw_cols * 0.8) else 1 - # cand = raw_cols - - # page_info[pg] = {"page_w": page_w, "left_edge": left_edge, "cand": cand} - # counter[cand] += 1 - - # logging.info(f"[Page {pg}] median_w={median_w:.2f}, page_w={page_w:.2f}, raw_cols={raw_cols}, cand={cand}") - - # global_cols = counter.most_common(1)[0][0] - # logging.info(f"Global column_num decided by majority: {global_cols}") - - # for pg, bxs in by_page.items(): - # if not bxs: - # continue - - # page_w = page_info[pg]["page_w"] - # left_edge = page_info[pg]["left_edge"] - - # if global_cols == 1: - # for box in bxs: - # box["col_id"] = 0 - # continue - - # for box in bxs: - # w = box["x1"] - box["x0"] - # if w >= 0.8 * page_w: - # box["col_id"] = 0 - # continue - # cx = 0.5 * (box["x0"] + box["x1"]) - # norm_cx = (cx - left_edge) / page_w - # norm_cx = max(0.0, min(norm_cx, 0.999999)) - # box["col_id"] = int(min(global_cols - 1, norm_cx * global_cols)) - - # return boxes - if not boxes: return boxes if all("col_id" in b for b in boxes):