From fea157ba08379510b6463984120e42bedc6cca8c Mon Sep 17 00:00:00 2001
From: Billy Bao <newyorkupperbay@gmail.com>
Date: Tue, 18 Nov 2025 15:22:52 +0800
Subject: [PATCH 1/3] Fix: manual parser with mineru (#11336)

### What problem does this PR solve?

Fix: manual parser with mineru #11320
Fix: missing `server_url` parameter in mineru #11334
Fix: add `outlines` attribute for pdf parsers

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 deepdoc/parser/docling_parser.py |  4 +++-
 deepdoc/parser/mineru_parser.py  |  1 +
 deepdoc/parser/tcadp_parser.py   |  1 +
 rag/app/manual.py                | 24 ++++++++++++++++++++++++
 rag/app/naive.py                 |  1 +
 5 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py
index 9d67478c8..59fec9250 100644
--- a/deepdoc/parser/docling_parser.py
+++ b/deepdoc/parser/docling_parser.py
@@ -61,7 +61,9 @@ class DoclingParser(RAGFlowPdfParser):
         self.page_images: list[Image.Image] = []
         self.page_from = 0
         self.page_to = 10_000
-
+        self.outlines = []
+   
+        
     def check_installation(self) -> bool:
         if DocumentConverter is None:
             self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index 6d3b292d0..99b56e83a 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -59,6 +59,7 @@ class MinerUParser(RAGFlowPdfParser):
         self.mineru_api = mineru_api.rstrip("/")
         self.mineru_server_url = mineru_server_url.rstrip("/")
         self.using_api = False
+        self.outlines = []
         self.logger = logging.getLogger(self.__class__.__name__)
 
     def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
diff --git a/deepdoc/parser/tcadp_parser.py b/deepdoc/parser/tcadp_parser.py
index 1b7a3e362..920b6f1a1 100644
--- a/deepdoc/parser/tcadp_parser.py
+++ b/deepdoc/parser/tcadp_parser.py
@@ -47,6 +47,7 @@ class TencentCloudAPIClient:
         self.secret_id = secret_id
         self.secret_key = secret_key
         self.region = region
+        self.outlines = []
         
         # Create credentials
         self.cred = credential.Credential(secret_id, secret_key)
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 81402d1bd..5808e2498 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -216,6 +216,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             **kwargs
         )
 
+        def _normalize_section(section):
+            """Return ``section`` as a ``(txt, sec_id, poss)`` 3-tuple.
+
+            A 1-item input gets an empty ``sec_id`` and ``poss``; a 2-item input is
+            read as ``(txt, poss)``; other lengths raise ValueError. A string ``poss``
+            is decoded via ``pdf_parser.extract_positions`` with ``[pn]`` unwrapped."""
+            if len(section) == 1:
+                section = (section[0], "", [])
+            elif len(section) == 2:
+                section = (section[0], "", section[1])
+            elif len(section) != 3:
+                raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
+
+            txt, sec_id, poss = section
+            if isinstance(poss, str):
+                # Unwrap [pn] -> pn on every position, not only the first, so the
+                # list is uniform; an empty result no longer raises IndexError.
+                poss = [(p[0][0] if isinstance(p[0], list) else p[0], *p[1:])
+                        for p in pdf_parser.extract_positions(poss)]
+            return (txt, sec_id, poss)
+        
+
+        sections = [_normalize_section(sec) for sec in sections]
+
         if not sections and not tbls:
             return []
 
diff --git a/rag/app/naive.py b/rag/app/naive.py
index d88c2bea4..293e4a8b9 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -70,6 +70,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
         callback=callback,
         output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
         backend=os.environ.get("MINERU_BACKEND", "pipeline"),
+        server_url=os.environ.get("MINERU_SERVER_URL", ""),
         delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
     )
     return sections, tables, pdf_parser

From ded9bf80c5f7e6aa0db2a0a56044eaca147454e0 Mon Sep 17 00:00:00 2001
From: buua436 <66937541+buua436@users.noreply.github.com>
Date: Tue, 18 Nov 2025 15:24:27 +0800
Subject: [PATCH 2/3] Fix: limit random sampling range in check_embedding
 (#11337)

### What problem does this PR solve?

Issue: [#11319](https://github.com/infiniflow/ragflow/issues/11319)

Change: limit the random sampling range in `check_embedding` to at most the
first 1000 offsets.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 api/apps/kb_app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
index 53e42303f..d0c4469dd 100644
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@@ -819,7 +819,7 @@ def check_embedding():
             return []
 
         n = min(n, total)
-        offsets = sorted(random.sample(range(total), n))
+        offsets = sorted(random.sample(range(min(total,1000)), n))
         out = []
 
         for off in offsets:

From 341e5904c847948d13e339f7df6aacd67ab8b652 Mon Sep 17 00:00:00 2001
From: Yongteng Lei <yongtengrey@outlook.com>
Date: Tue, 18 Nov 2025 15:42:31 +0800
Subject: [PATCH 3/3] Fix: No results can be found through the API
 /api/v1/dify/retrieval (#11338)

### What problem does this PR solve?

No results can be found through the API /api/v1/dify/retrieval. #11307

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 api/apps/sdk/dify_retrieval.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py
index d2c3485a9..e02875820 100644
--- a/api/apps/sdk/dify_retrieval.py
+++ b/api/apps/sdk/dify_retrieval.py
@@ -131,12 +131,10 @@ def retrieval(tenant_id):
             return build_error_result(message="Knowledgebase not found!", code=RetCode.NOT_FOUND)
 
         embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
-        print(metadata_condition)
-        # print("after", convert_conditions(metadata_condition))
-        doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
-        # print("doc_ids", doc_ids)
-        if not doc_ids and metadata_condition is not None:
-            doc_ids = ['-999']
+        if metadata_condition:
+            doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
+        if not doc_ids and metadata_condition:
+            doc_ids = ["-999"]
         ranks = settings.retriever.retrieval(
             question,
             embd_mdl,