Fix: Solve the OOM issue when passing large PDF files while using QA chunking method. (#8464)
### What problem does this PR solve? Using the QA chunking method with a large PDF (e.g., 300+ pages) may lead to OOM in the ragflow-worker module. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
f21827bc28
commit
5256980ffb
1 changed files with 3 additions and 3 deletions
|
|
@ -310,7 +310,7 @@ def mdQuestionLevel(s):
|
|||
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
|
||||
|
||||
|
||||
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Excel and csv(txt) format files are supported.
|
||||
If the file is in excel format, there should be 2 column question and answer without header.
|
||||
|
|
@ -410,7 +410,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|||
callback(0.1, "Start to parse.")
|
||||
pdf_parser = Pdf()
|
||||
qai_list, tbls = pdf_parser(filename if not binary else binary,
|
||||
from_page=0, to_page=10000, callback=callback)
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
for q, a, image, poss in qai_list:
|
||||
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
|
||||
return res
|
||||
|
|
@ -468,4 +468,4 @@ if __name__ == "__main__":
|
|||
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
||||
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue