Merge branch 'main' of https://github.com/infiniflow/ragflow

2025-11-18 15:43:39 +08:00 · 2025-11-18 15:43:39 +08:00 · 2dbd1fad46
commit 2dbd1fad46
parent fcaf65c6ae 341e5904c8
7 changed files with 35 additions and 8 deletions
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@ -819,7 +819,7 @@ def check_embedding():
            return []

        n = min(n, total)
-        offsets = sorted(random.sample(range(total), n))
+        offsets = sorted(random.sample(range(min(total,1000)), n))
        out = []

        for off in offsets:
--- a/api/apps/sdk/dify_retrieval.py
+++ b/api/apps/sdk/dify_retrieval.py
@ -131,12 +131,10 @@ def retrieval(tenant_id):
            return build_error_result(message="Knowledgebase not found!", code=RetCode.NOT_FOUND)

        embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
-        print(metadata_condition)
-        # print("after", convert_conditions(metadata_condition))
-        doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
-        # print("doc_ids", doc_ids)
-        if not doc_ids and metadata_condition is not None:
-            doc_ids = ['-999']
+        if metadata_condition:
+            doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
+        if not doc_ids and metadata_condition:
+            doc_ids = ["-999"]
        ranks = settings.retriever.retrieval(
            question,
            embd_mdl,
--- a/deepdoc/parser/docling_parser.py
+++ b/deepdoc/parser/docling_parser.py
@ -61,7 +61,9 @@ class DoclingParser(RAGFlowPdfParser):
        self.page_images: list[Image.Image] = []
        self.page_from = 0
        self.page_to = 10_000
-
+        self.outlines = []
+   
+        
    def check_installation(self) -> bool:
        if DocumentConverter is None:
            self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@ -59,6 +59,7 @@ class MinerUParser(RAGFlowPdfParser):
        self.mineru_api = mineru_api.rstrip("/")
        self.mineru_server_url = mineru_server_url.rstrip("/")
        self.using_api = False
+        self.outlines = []
        self.logger = logging.getLogger(self.__class__.__name__)

    def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
--- a/deepdoc/parser/tcadp_parser.py
+++ b/deepdoc/parser/tcadp_parser.py
@ -47,6 +47,7 @@ class TencentCloudAPIClient:
        self.secret_id = secret_id
        self.secret_key = secret_key
        self.region = region
+        self.outlines = []
        
        # Create credentials
        self.cred = credential.Credential(secret_id, secret_key)
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -216,6 +216,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            **kwargs
        )

+        def _normalize_section(section):
+            # Coerce one parsed section into a 3-tuple (txt, sec_id, poss); a missing sec_id becomes "" and a missing poss becomes [].
+            if len(section) == 1:
+                section = (section[0], "", [])  # text only
+            elif len(section) == 2:
+                section = (section[0], "", section[1])  # the second item is taken as poss, not sec_id
+            elif len(section) != 3:
+                raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")  # also hit by an empty section
+
+            txt, sec_id, poss = section
+            if isinstance(poss, str):  # string-form positions are converted with pdf_parser.extract_positions
+                poss = pdf_parser.extract_positions(poss)
+                first = poss[0]          # expected shape: ([pn], x1, x2, y1, y2); NOTE(review): IndexError if the result is empty - confirm it cannot be
+                pn = first[0]  # presumably the page number(s) of the first position
+
+                if isinstance(pn, list):
+                    pn = pn[0]           # unwrap [pn] -> pn
+                    poss[0] = (pn, *first[1:])  # only the first entry is rewritten; any later entries keep their original form
+
+            return (txt, sec_id, poss)
+        
+
+        sections = [_normalize_section(sec) for sec in sections]
+
        if not sections and not tbls:
            return []

--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -70,6 +70,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
        callback=callback,
        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
        backend=os.environ.get("MINERU_BACKEND", "pipeline"),
+        server_url=os.environ.get("MINERU_SERVER_URL", ""),
        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
    )
    return sections, tables, pdf_parser