Handle 2-tuple sections in manual chunk

This commit is contained in:
少卿 2025-12-03 16:28:18 +08:00
parent 15279fc8d9
commit 4ac7211bee

View file

@@ -219,23 +219,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
)
def _normalize_section(section):
# pad section to length 3: (txt, sec_id, poss)
if len(section) == 1:
# Pad/normalize to (txt, layout, positions)
if not isinstance(section, (list, tuple)):
section = (section, "", [])
elif len(section) == 1:
section = (section[0], "", [])
elif len(section) == 2:
section = (section[0], "", section[1])
elif len(section) != 3:
raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
else:
section = (section[0], section[1], section[2])
txt, layoutno, poss = section
if isinstance(poss, str):
poss = pdf_parser.extract_positions(poss)
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
pn = first[0]
if isinstance(pn, list):
pn = pn[0] # [pn] -> pn
if poss:
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
pn = first[0]
if isinstance(pn, list) and pn:
pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:])
if not poss:
poss = []
return (txt, layoutno, poss)