This commit is contained in:
yongtenglei 2025-11-18 15:43:39 +08:00
commit 2dbd1fad46
7 changed files with 35 additions and 8 deletions

View file

@@ -819,7 +819,7 @@ def check_embedding():
return []
n = min(n, total)
offsets = sorted(random.sample(range(total), n))
offsets = sorted(random.sample(range(min(total,1000)), n))
out = []
for off in offsets:

View file

@@ -131,12 +131,10 @@ def retrieval(tenant_id):
return build_error_result(message="Knowledgebase not found!", code=RetCode.NOT_FOUND)
embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
print(metadata_condition)
# print("after", convert_conditions(metadata_condition))
doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
# print("doc_ids", doc_ids)
if not doc_ids and metadata_condition is not None:
doc_ids = ['-999']
if metadata_condition:
doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
if not doc_ids and metadata_condition:
doc_ids = ["-999"]
ranks = settings.retriever.retrieval(
question,
embd_mdl,

View file

@@ -61,7 +61,9 @@ class DoclingParser(RAGFlowPdfParser):
self.page_images: list[Image.Image] = []
self.page_from = 0
self.page_to = 10_000
self.outlines = []
def check_installation(self) -> bool:
if DocumentConverter is None:
self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")

View file

@@ -59,6 +59,7 @@ class MinerUParser(RAGFlowPdfParser):
self.mineru_api = mineru_api.rstrip("/")
self.mineru_server_url = mineru_server_url.rstrip("/")
self.using_api = False
self.outlines = []
self.logger = logging.getLogger(self.__class__.__name__)
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):

View file

@@ -47,6 +47,7 @@ class TencentCloudAPIClient:
self.secret_id = secret_id
self.secret_key = secret_key
self.region = region
self.outlines = []
# Create credentials
self.cred = credential.Credential(secret_id, secret_key)

View file

@@ -216,6 +216,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
**kwargs
)
def _normalize_section(section):
# pad section to length 3: (txt, sec_id, poss)
if len(section) == 1:
section = (section[0], "", [])
elif len(section) == 2:
section = (section[0], "", section[1])
elif len(section) != 3:
raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
txt, sec_id, poss = section
if isinstance(poss, str):
poss = pdf_parser.extract_positions(poss)
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
pn = first[0]
if isinstance(pn, list):
pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:])
return (txt, sec_id, poss)
sections = [_normalize_section(sec) for sec in sections]
if not sections and not tbls:
return []

View file

@@ -70,6 +70,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
server_url=os.environ.get("MINERU_SERVER_URL", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
return sections, tables, pdf_parser