From fea157ba08379510b6463984120e42bedc6cca8c Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Tue, 18 Nov 2025 15:22:52 +0800 Subject: [PATCH 1/3] Fix: manual parser with mineru (#11336) ### What problem does this PR solve? Fix: manual parser with mineru #11320 Fix: missing parameter in mineru #11334 Fix: add outlines parameter for pdf parsers ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/docling_parser.py | 4 +++- deepdoc/parser/mineru_parser.py | 1 + deepdoc/parser/tcadp_parser.py | 1 + rag/app/manual.py | 24 ++++++++++++++++++++++++ rag/app/naive.py | 1 + 5 files changed, 30 insertions(+), 1 deletion(-) diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py index 9d67478c8..59fec9250 100644 --- a/deepdoc/parser/docling_parser.py +++ b/deepdoc/parser/docling_parser.py @@ -61,7 +61,9 @@ class DoclingParser(RAGFlowPdfParser): self.page_images: list[Image.Image] = [] self.page_from = 0 self.page_to = 10_000 - + self.outlines = [] + + def check_installation(self) -> bool: if DocumentConverter is None: self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling") diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 6d3b292d0..99b56e83a 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -59,6 +59,7 @@ class MinerUParser(RAGFlowPdfParser): self.mineru_api = mineru_api.rstrip("/") self.mineru_server_url = mineru_server_url.rstrip("/") self.using_api = False + self.outlines = [] self.logger = logging.getLogger(self.__class__.__name__) def _extract_zip_no_root(self, zip_path, extract_to, root_dir): diff --git a/deepdoc/parser/tcadp_parser.py b/deepdoc/parser/tcadp_parser.py index 1b7a3e362..920b6f1a1 100644 --- a/deepdoc/parser/tcadp_parser.py +++ b/deepdoc/parser/tcadp_parser.py @@ -47,6 +47,7 @@ class TencentCloudAPIClient: self.secret_id = secret_id self.secret_key = secret_key self.region = region + self.outlines = [] # Create credentials self.cred = credential.Credential(secret_id, secret_key) diff --git a/rag/app/manual.py b/rag/app/manual.py index 81402d1bd..5808e2498 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -216,6 +216,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, **kwargs ) + def _normalize_section(section): + # pad section to length 3: (txt, sec_id, poss) + if len(section) == 1: + section = (section[0], "", []) + elif len(section) == 2: + section = (section[0], "", section[1]) + elif len(section) != 3: + raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})") + + txt, sec_id, poss = section + if isinstance(poss, str): + poss = pdf_parser.extract_positions(poss) + first = poss[0] # tuple: ([pn], x1, x2, y1, y2) + pn = first[0] + + if isinstance(pn, list): + pn = pn[0] # [pn] -> pn + poss[0] = (pn, *first[1:]) + + return (txt, sec_id, poss) + + + sections = [_normalize_section(sec) for sec in sections] + if not sections and not tbls: return [] diff --git a/rag/app/naive.py b/rag/app/naive.py index d88c2bea4..293e4a8b9 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -70,6 +70,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" callback=callback, output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), backend=os.environ.get("MINERU_BACKEND", "pipeline"), + server_url=os.environ.get("MINERU_SERVER_URL", ""), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), ) return sections, tables, pdf_parser From ded9bf80c5f7e6aa0db2a0a56044eaca147454e0 Mon Sep 17 00:00:00 2001 From: buua436 <66937541+buua436@users.noreply.github.com> Date: Tue, 18 Nov 2025 15:24:27 +0800 Subject: [PATCH 2/3] Fix:limit random sampling range in check_embedding (#11337) ### What problem does this PR solve? issue: [#11319](https://github.com/infiniflow/ragflow/issues/11319) change: limit random sampling range in check_embedding ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/kb_app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py index 53e42303f..d0c4469dd 100644 --- a/api/apps/kb_app.py +++ b/api/apps/kb_app.py @@ -819,7 +819,7 @@ def check_embedding(): return [] n = min(n, total) - offsets = sorted(random.sample(range(total), n)) + offsets = sorted(random.sample(range(min(total,1000)), n)) out = [] for off in offsets: From 341e5904c847948d13e339f7df6aacd67ab8b652 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Tue, 18 Nov 2025 15:42:31 +0800 Subject: [PATCH 3/3] Fix: No results can be found through the API /api/v1/dify/retrieval (#11338) ### What problem does this PR solve? No results can be found through the API /api/v1/dify/retrieval. #11307 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/sdk/dify_retrieval.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py index d2c3485a9..e02875820 100644 --- a/api/apps/sdk/dify_retrieval.py +++ b/api/apps/sdk/dify_retrieval.py @@ -131,12 +131,10 @@ def retrieval(tenant_id): return build_error_result(message="Knowledgebase not found!", code=RetCode.NOT_FOUND) embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id) - print(metadata_condition) - # print("after", convert_conditions(metadata_condition)) - doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition))) - # print("doc_ids", doc_ids) - if not doc_ids and metadata_condition is not None: - doc_ids = ['-999'] + if metadata_condition: + doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition))) + if not doc_ids and metadata_condition: + doc_ids = ["-999"] ranks = settings.retriever.retrieval( question, embd_mdl,