diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
index 53e42303f..d0c4469dd 100644
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@@ -819,7 +819,10 @@ def check_embedding():
             return []
 
         n = min(n, total)
-        offsets = sorted(random.sample(range(total), n))
+        # Offsets are drawn from at most the first 1000 positions; clamp the sample
+        # size as well, because random.sample raises ValueError when it exceeds the population.
+        pool = min(total, 1000)
+        offsets = sorted(random.sample(range(pool), min(n, pool)))
         out = []
 
         for off in offsets:
diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py
index d2c3485a9..e02875820 100644
--- a/api/apps/sdk/dify_retrieval.py
+++ b/api/apps/sdk/dify_retrieval.py
@@ -131,12 +131,10 @@ def retrieval(tenant_id):
                 return build_error_result(message="Knowledgebase not found!", code=RetCode.NOT_FOUND)
 
         embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
-        print(metadata_condition)
-        # print("after", convert_conditions(metadata_condition))
-        doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
-        # print("doc_ids", doc_ids)
-        if not doc_ids and metadata_condition is not None:
-            doc_ids = ['-999']
+        if metadata_condition:
+            doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
+        if not doc_ids and metadata_condition:
+            doc_ids = ["-999"]
         ranks = settings.retriever.retrieval(
             question,
             embd_mdl,
diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py
index 9d67478c8..59fec9250 100644
--- a/deepdoc/parser/docling_parser.py
+++ b/deepdoc/parser/docling_parser.py
@@ -61,7 +61,9 @@ class DoclingParser(RAGFlowPdfParser):
         self.page_images: list[Image.Image] = []
         self.page_from = 0
         self.page_to = 10_000
-
+        self.outlines = []
+
+
     def check_installation(self) -> bool:
         if DocumentConverter is None:
             self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index b50bbc0e5..e3cb62cc7 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -59,6 +59,7 @@ class MinerUParser(RAGFlowPdfParser):
         self.mineru_api = mineru_api.rstrip("/")
         self.mineru_server_url = mineru_server_url.rstrip("/")
         self.using_api = False
+        self.outlines = []
         self.logger = logging.getLogger(self.__class__.__name__)
 
     def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
diff --git a/deepdoc/parser/tcadp_parser.py b/deepdoc/parser/tcadp_parser.py
index 1b7a3e362..920b6f1a1 100644
--- a/deepdoc/parser/tcadp_parser.py
+++ b/deepdoc/parser/tcadp_parser.py
@@ -47,6 +47,7 @@ class TencentCloudAPIClient:
         self.secret_id = secret_id
         self.secret_key = secret_key
         self.region = region
+        self.outlines = []
 
         # Create credentials
         self.cred = credential.Credential(secret_id, secret_key)
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 81402d1bd..5808e2498 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -216,6 +216,36 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             **kwargs
         )
 
+        def _normalize_section(section):
+            """Return ``section`` as a ``(txt, sec_id, poss)`` triple.
+
+            Sections of length 1 or 2 are padded with an empty ``sec_id``
+            (and an empty position list for length 1); any length other
+            than 1, 2 or 3 raises ``ValueError``. String positions are
+            decoded with ``pdf_parser.extract_positions`` and a
+            list-wrapped page number in the first position is unwrapped.
+            """
+            if len(section) == 1:
+                section = (section[0], "", [])
+            elif len(section) == 2:
+                section = (section[0], "", section[1])
+            elif len(section) != 3:
+                raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
+
+            txt, sec_id, poss = section
+            if isinstance(poss, str):
+                poss = pdf_parser.extract_positions(poss)
+                # Guard: an empty result has no first position to unwrap.
+                if poss:
+                    first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
+                    pn = first[0]
+                    if isinstance(pn, list) and pn:
+                        poss[0] = (pn[0], *first[1:])  # [pn] -> pn
+
+            return (txt, sec_id, poss)
+
+        sections = [_normalize_section(sec) for sec in sections]
+
         if not sections and not tbls:
             return []
 
diff --git a/rag/app/naive.py b/rag/app/naive.py
index d88c2bea4..293e4a8b9 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -70,6 +70,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
         callback=callback,
         output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
         backend=os.environ.get("MINERU_BACKEND", "pipeline"),
+        server_url=os.environ.get("MINERU_SERVER_URL", ""),
         delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
     )
     return sections, tables, pdf_parser