From 06cef71ba6406b453a29d1e1832d4f491bd31a81 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Thu, 20 Nov 2025 14:31:12 +0800 Subject: [PATCH 1/6] Feat: add or logic operations for meta data filters. (#11404) ### What problem does this PR solve? #11376 #11387 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- agent/tools/retrieval.py | 6 +- api/apps/chunk_app.py | 6 +- api/apps/sdk/agents.py | 4 +- api/apps/sdk/dify_retrieval.py | 4 +- api/apps/sdk/doc.py | 4 +- api/apps/sdk/session.py | 14 ++-- api/db/services/canvas_service.py | 10 +-- api/db/services/dialog_service.py | 25 +++---- docs/references/http_api_reference.md | 1 + rag/prompts/generator.py | 8 ++- rag/prompts/meta_filter.md | 95 ++++++++++++++++++++++++--- 11 files changed, 129 insertions(+), 48 deletions(-) diff --git a/agent/tools/retrieval.py b/agent/tools/retrieval.py index ab388a08e..c3a01e517 100644 --- a/agent/tools/retrieval.py +++ b/agent/tools/retrieval.py @@ -132,8 +132,8 @@ class Retrieval(ToolBase, ABC): metas = DocumentService.get_meta_by_kbs(kb_ids) if self._param.meta_data_filter.get("method") == "auto": chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT) - filters = gen_meta_filter(chat_mdl, metas, query) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, query) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif self._param.meta_data_filter.get("method") == "manual": @@ -165,7 +165,7 @@ class Retrieval(ToolBase, ABC): out_parts.append(s[last:]) flt["value"] = "".join(out_parts) - doc_ids.extend(meta_filter(metas, filters)) + doc_ids.extend(meta_filter(metas, filters, self._param.meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index 7341e336a..e121bcba7 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -305,12 +305,12 @@ async def retrieval_test(): metas = DocumentService.get_meta_by_kbs(kb_ids) if meta_data_filter.get("method") == "auto": chat_mdl = LLMBundle(current_user.id, LLMType.CHAT, llm_name=search_config.get("chat_id", "")) - filters = gen_meta_filter(chat_mdl, metas, question) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif meta_data_filter.get("method") == "manual": - doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) + doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None diff --git a/api/apps/sdk/agents.py b/api/apps/sdk/agents.py index dda696bf4..b20a22ad8 100644 --- a/api/apps/sdk/agents.py +++ b/api/apps/sdk/agents.py @@ -159,10 +159,10 @@ async def webhook(tenant_id: str, agent_id: str): data=False, message=str(e), code=RetCode.EXCEPTION_ERROR) - def sse(): + async def sse(): nonlocal canvas try: - for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req): + async for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req): yield "data:" + json.dumps(ans, ensure_ascii=False) + "\n\n" cvs.dsl = json.loads(str(canvas)) diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py index b86fb5c1e..55ea54faf 100644 --- a/api/apps/sdk/dify_retrieval.py +++ b/api/apps/sdk/dify_retrieval.py @@ -120,7 +120,7 @@ async def retrieval(tenant_id): retrieval_setting = req.get("retrieval_setting", {}) similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0)) top = int(retrieval_setting.get("top_k", 1024)) - metadata_condition = req.get("metadata_condition", {}) + metadata_condition = req.get("metadata_condition", {}) or {} metas = DocumentService.get_meta_by_kbs([kb_id]) doc_ids = [] @@ -132,7 +132,7 @@ async def retrieval(tenant_id): embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id) if metadata_condition: - doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition))) + doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))) if not doc_ids and metadata_condition: doc_ids = ["-999"] ranks = settings.retriever.retrieval( diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 8593667d7..84300ac3c 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -1442,9 +1442,9 @@ async def retrieval_test(tenant_id): if doc_id not in doc_ids_list: return get_error_data_result(f"The datasets don't own the document {doc_id}") if not doc_ids: - metadata_condition = req.get("metadata_condition", {}) + metadata_condition = req.get("metadata_condition", {}) or {} metas = DocumentService.get_meta_by_kbs(kb_ids) - doc_ids = meta_filter(metas, convert_conditions(metadata_condition)) + doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) similarity_threshold = float(req.get("similarity_threshold", 0.2)) vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3)) top = int(req.get("top_k", 1024)) diff --git a/api/apps/sdk/session.py b/api/apps/sdk/session.py index 98151a5fe..533375622 100644 --- a/api/apps/sdk/session.py +++ b/api/apps/sdk/session.py @@ -428,17 +428,15 @@ async def agents_completion_openai_compatibility(tenant_id, agent_id): return resp else: # For non-streaming, just return the response directly - response = next( - completion_openai( + async for response in completion_openai( tenant_id, agent_id, question, session_id=req.pop("session_id", req.get("id", "")) or req.get("metadata", {}).get("id", ""), stream=False, **req, - ) - ) - return jsonify(response) + ): + return jsonify(response) @manager.route("/agents//completions", methods=["POST"]) # noqa: F821 @@ -977,12 +975,12 @@ async def retrieval_test_embedded(): metas = DocumentService.get_meta_by_kbs(kb_ids) if meta_data_filter.get("method") == "auto": chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", "")) - filters = gen_meta_filter(chat_mdl, metas, question) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif meta_data_filter.get("method") == "manual": - doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) + doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None diff --git a/api/db/services/canvas_service.py b/api/db/services/canvas_service.py index db8e16068..57b4b5c2a 100644 --- a/api/db/services/canvas_service.py +++ b/api/db/services/canvas_service.py @@ -177,7 +177,7 @@ class UserCanvasService(CommonService): return True -def completion(tenant_id, agent_id, session_id=None, **kwargs): +async def completion(tenant_id, agent_id, session_id=None, **kwargs): query = kwargs.get("query", "") or kwargs.get("question", "") files = kwargs.get("files", []) inputs = kwargs.get("inputs", {}) @@ -219,7 +219,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs): "id": message_id }) txt = "" - for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs): + async for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs): ans["session_id"] = session_id if ans["event"] == "message": txt += ans["data"]["content"] @@ -237,7 +237,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs): API4ConversationService.append_message(conv["id"], conv) -def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs): +async def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs): tiktoken_encoder = tiktoken.get_encoding("cl100k_base") prompt_tokens = len(tiktoken_encoder.encode(str(question))) user_id = kwargs.get("user_id", "") @@ -245,7 +245,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru if stream: completion_tokens = 0 try: - for ans in completion( + async for ans in completion( tenant_id=tenant_id, agent_id=agent_id, session_id=session_id, @@ -304,7 +304,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru try: all_content = "" reference = {} - for ans in completion( + async for ans in completion( tenant_id=tenant_id, agent_id=agent_id, session_id=session_id, diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py index d2f3b9bc1..db878574d 100644 --- a/api/db/services/dialog_service.py +++ b/api/db/services/dialog_service.py @@ -287,7 +287,7 @@ def convert_conditions(metadata_condition): ] -def meta_filter(metas: dict, filters: list[dict]): +def meta_filter(metas: dict, filters: list[dict], logic: str = "and"): doc_ids = set([]) def filter_out(v2docs, operator, value): @@ -331,7 +331,10 @@ def meta_filter(metas: dict, filters: list[dict]): if not doc_ids: doc_ids = set(ids) else: - doc_ids = doc_ids & set(ids) + if logic == "and": + doc_ids = doc_ids & set(ids) + else: + doc_ids = doc_ids | set(ids) if not doc_ids: return [] return list(doc_ids) @@ -407,12 +410,12 @@ def chat(dialog, messages, stream=True, **kwargs): if dialog.meta_data_filter: metas = DocumentService.get_meta_by_kbs(dialog.kb_ids) if dialog.meta_data_filter.get("method") == "auto": - filters = gen_meta_filter(chat_mdl, metas, questions[-1]) - attachments.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, questions[-1]) + attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not attachments: attachments = None elif dialog.meta_data_filter.get("method") == "manual": - attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"])) + attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"], dialog.meta_data_filter.get("logic", "and"))) if not attachments: attachments = None @@ -778,12 +781,12 @@ def ask(question, kb_ids, tenant_id, chat_llm_name=None, search_config={}): if meta_data_filter: metas = DocumentService.get_meta_by_kbs(kb_ids) if meta_data_filter.get("method") == "auto": - filters = gen_meta_filter(chat_mdl, metas, question) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif meta_data_filter.get("method") == "manual": - doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) + doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None @@ -853,12 +856,12 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}): if meta_data_filter: metas = DocumentService.get_meta_by_kbs(kb_ids) if meta_data_filter.get("method") == "auto": - filters = gen_meta_filter(chat_mdl, metas, question) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif meta_data_filter.get("method") == "manual": - doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) + doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 481614d13..bc1b15670 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -2085,6 +2085,7 @@ curl --request POST \ "dataset_ids": ["b2a62730759d11ef987d0242ac120004"], "document_ids": ["77df9ef4759a11ef8bdd0242ac120004"], "metadata_condition": { + "logic": "and", "conditions": [ { "name": "author", diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py index 3d8438f4a..82c6466a2 100644 --- a/rag/prompts/generator.py +++ b/rag/prompts/generator.py @@ -429,7 +429,7 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st return re.sub(r"^.*", "", ans, flags=re.DOTALL) -def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list: +def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> dict: sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render( current_date=datetime.datetime.today().strftime('%Y-%m-%d'), metadata_keys=json.dumps(meta_data), @@ -440,11 +440,13 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list: ans = re.sub(r"(^.*|```json\n|```\n*$)", "", ans, flags=re.DOTALL) try: ans = json_repair.loads(ans) - assert isinstance(ans, list), ans + assert isinstance(ans, dict), ans + assert "conditions" in ans and isinstance(ans["conditions"], list), ans return ans except Exception: logging.exception(f"Loading json failure: {ans}") - return [] + + return {"conditions": []} def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None): diff --git a/rag/prompts/meta_filter.md b/rag/prompts/meta_filter.md index 89e322fe5..7df3c4885 100644 --- a/rag/prompts/meta_filter.md +++ b/rag/prompts/meta_filter.md @@ -9,11 +9,13 @@ You are a metadata filtering condition generator. Analyze the user's question an } 2. **Output Requirements**: - - Always output a JSON array of filter objects - - Each object must have: + - Always output a JSON dictionary with only 2 keys: 'conditions'(filter objects) and 'logic' between the conditions ('and' or 'or'). + - Each filter object in conditions must have: "key": (metadata attribute name), "value": (string value to compare), "op": (operator from allowed list) + - Logic between all the conditions: 'and'(Intersection of results for each condition) / 'or' (union of results for all conditions) + 3. **Operator Guide**: - Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"] @@ -32,22 +34,97 @@ You are a metadata filtering condition generator. Analyze the user's question an - Attribute doesn't exist in metadata - Value has no match in metadata -5. **Example**: +5. **Example A**: - User query: "上市日期七月份的有哪些商品,不要蓝色的" - Metadata: { "color": {...}, "listing_date": {...} } - Output: - [ + { + "logic": "and", + "conditions": [ {"key": "listing_date", "value": "2025-07-01", "op": "≥"}, {"key": "listing_date", "value": "2025-08-01", "op": "<"}, {"key": "color", "value": "blue", "op": "≠"} ] + } -6. **Final Output**: - - ONLY output valid JSON array +6. **Example B**: + - User query: "Both blue and red are acceptable." + - Metadata: { "color": {...}, "listing_date": {...} } + - Output: + { + "logic": "or", + "conditions": [ + {"key": "color", "value": "blue", "op": "="}, + {"key": "color", "value": "red", "op": "="} + ] + } + +7. **Final Output**: + - ONLY output valid JSON dictionary - NO additional text/explanations + - Json schema is as following: +```json +{ + "type": "object", + "properties": { + "logic": { + "type": "string", + "description": "Logic relationship between all the conditions, the default is 'and'.", + "enum": [ + "and", + "or" + ] + }, + "conditions": { + "type": "array", + "items": { + "type": "object", + "properties": { + "key": { + "type": "string", + "description": "Metadata attribute name." + }, + "value": { + "type": "string", + "description": "Value to compare." + }, + "op": { + "type": "string", + "description": "Operator from allowed list.", + "enum": [ + "contains", + "not contains", + "start with", + "end with", + "empty", + "not empty", + "=", + "≠", + ">", + "<", + "≥", + "≤" + ] + } + }, + "required": [ + "key", + "value", + "op" + ], + "additionalProperties": false + } + } + }, + "required": [ + "conditions" + ], + "additionalProperties": false +} +``` **Current Task**: -- Today's date: {{current_date}} -- Available metadata keys: {{metadata_keys}} -- User query: "{{user_question}}" +- Today's date: {{ current_date }} +- Available metadata keys: {{ metadata_keys }} +- User query: "{{ user_question }}" From 69578ebfceec893f4622403f38e0fe509dbd7e25 Mon Sep 17 00:00:00 2001 From: chanx <1243304602@qq.com> Date: Thu, 20 Nov 2025 15:32:41 +0800 Subject: [PATCH 2/6] Fix: Change package-lock.json (#11407) ### What problem does this PR solve? Fix: Change package-lock.json ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- web/package-lock.json | 96 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/web/package-lock.json b/web/package-lock.json index e8d87301b..7bae0ed0f 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -66,6 +66,7 @@ "input-otp": "^1.4.1", "js-base64": "^3.7.5", "jsencrypt": "^3.3.2", + "jsoneditor": "^10.4.2", "lexical": "^0.23.1", "lodash": "^4.17.21", "lucide-react": "^0.546.0", @@ -8998,6 +8999,12 @@ "@sinonjs/commons": "^3.0.0" } }, + "node_modules/@sphinxxxx/color-conversion": { + "version": "2.2.2", + "resolved": "https://registry.npmmirror.com/@sphinxxxx/color-conversion/-/color-conversion-2.2.2.tgz", + "integrity": "sha512-XExJS3cLqgrmNBIP3bBw6+1oQ1ksGjFh0+oClDKFYpCCqx/hlqwWO5KO/S63fzUo67SxI9dMrF0y5T/Ey7h8Zw==", + "license": "ISC" + }, "node_modules/@storybook/addon-docs": { "version": "9.1.4", "resolved": "https://registry.npmmirror.com/@storybook/addon-docs/-/addon-docs-9.1.4.tgz", @@ -12962,6 +12969,12 @@ "node": ">= 0.6" } }, + "node_modules/ace-builds": { + "version": "1.43.4", + "resolved": "https://registry.npmmirror.com/ace-builds/-/ace-builds-1.43.4.tgz", + "integrity": "sha512-8hAxVfo2ImICd69BWlZwZlxe9rxDGDjuUhh+WeWgGDvfBCE+r3lkynkQvIovDz4jcMi8O7bsEaFygaDT+h9sBA==", + "license": "BSD-3-Clause" + }, "node_modules/acorn": { "version": "8.15.0", "resolved": "https://registry.npmmirror.com/acorn/-/acorn-8.15.0.tgz", @@ -21894,6 +21907,12 @@ "@pkgjs/parseargs": "^0.11.0" } }, + "node_modules/javascript-natural-sort": { + "version": "0.7.1", + "resolved": "https://registry.npmmirror.com/javascript-natural-sort/-/javascript-natural-sort-0.7.1.tgz", + "integrity": "sha512-nO6jcEfZWQXDhOiBtG2KvKyEptz7RVbpGP4vTD2hLBdmNQSsCiicO2Ioinv6UI4y9ukqnBpy+XZ9H6uLNgJTlw==", + "license": "MIT" + }, "node_modules/javascript-stringify": { "version": "2.1.0", "resolved": "https://registry.npmmirror.com/javascript-stringify/-/javascript-stringify-2.1.0.tgz", @@ -24253,6 +24272,15 @@ "jiti": "bin/jiti.js" } }, + "node_modules/jmespath": { + "version": "0.16.0", + "resolved": "https://registry.npmmirror.com/jmespath/-/jmespath-0.16.0.tgz", + "integrity": "sha512-9FzQjJ7MATs1tSpnco1K6ayiYE3figslrXA72G2HQ/n76RzvYlofyi5QM+iX4YRs/pu3yzxlVQSST23+dMDknw==", + "license": "Apache-2.0", + "engines": { + "node": ">= 0.6.0" + } + }, "node_modules/js-base64": { "version": "3.7.5", "resolved": "https://registry.npmmirror.com/js-base64/-/js-base64-3.7.5.tgz", @@ -24357,6 +24385,12 @@ "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", "license": "MIT" }, + "node_modules/json-source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmmirror.com/json-source-map/-/json-source-map-0.6.1.tgz", + "integrity": "sha512-1QoztHPsMQqhDq0hlXY5ZqcEdUzxQEIxgFkKl4WUp2pgShObl+9ovi4kRh2TfvAfxAoHOJ9vIMEqk3k4iex7tg==", + "license": "MIT" + }, "node_modules/json-stable-stringify-without-jsonify": { "version": "1.0.1", "resolved": "https://registry.npmmirror.com/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", @@ -24393,6 +24427,44 @@ "node": ">=6" } }, + "node_modules/jsoneditor": { + "version": "10.4.2", + "resolved": "https://registry.npmmirror.com/jsoneditor/-/jsoneditor-10.4.2.tgz", + "integrity": "sha512-SQPCXlanU4PqdVsYuj2X7yfbLiiJYjklbksGfMKPsuwLhAIPxDlG43jYfXieGXvxpuq1fkw08YoRbkKXKabcLA==", + "license": "Apache-2.0", + "dependencies": { + "ace-builds": "^1.36.2", + "ajv": "^6.12.6", + "javascript-natural-sort": "^0.7.1", + "jmespath": "^0.16.0", + "json-source-map": "^0.6.1", + "jsonrepair": "^3.8.1", + "picomodal": "^3.0.0", + "vanilla-picker": "^2.12.3" + } + }, + "node_modules/jsoneditor/node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmmirror.com/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "license": "MIT", + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/jsoneditor/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmmirror.com/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", + "license": "MIT" + }, "node_modules/jsonfile": { "version": "6.1.0", "resolved": "https://registry.npmmirror.com/jsonfile/-/jsonfile-6.1.0.tgz", @@ -24404,6 +24476,15 @@ "graceful-fs": "^4.1.6" } }, + "node_modules/jsonrepair": { + "version": "3.13.1", + "resolved": "https://registry.npmmirror.com/jsonrepair/-/jsonrepair-3.13.1.tgz", + "integrity": "sha512-WJeiE0jGfxYmtLwBTEk8+y/mYcaleyLXWaqp5bJu0/ZTSeG0KQq/wWQ8pmnkKenEdN6pdnn6QtcoSUkbqDHWNw==", + "license": "ISC", + "bin": { + "jsonrepair": "bin/cli.js" + } + }, "node_modules/jsx-ast-utils": { "version": "3.3.5", "resolved": "https://registry.npmmirror.com/jsx-ast-utils/-/jsx-ast-utils-3.3.5.tgz", @@ -27499,6 +27580,12 @@ "node": ">=8.6" } }, + "node_modules/picomodal": { + "version": "3.0.0", + "resolved": "https://registry.npmmirror.com/picomodal/-/picomodal-3.0.0.tgz", + "integrity": "sha512-FoR3TDfuLlqUvcEeK5ifpKSVVns6B4BQvc8SDF6THVMuadya6LLtji0QgUDSStw0ZR2J7I6UGi5V2V23rnPWTw==", + "license": "MIT" + }, "node_modules/pidtree": { "version": "0.6.0", "resolved": "https://registry.npmmirror.com/pidtree/-/pidtree-0.6.0.tgz", @@ -36235,6 +36322,15 @@ "dev": true, "peer": true }, + "node_modules/vanilla-picker": { + "version": "2.12.3", + "resolved": "https://registry.npmmirror.com/vanilla-picker/-/vanilla-picker-2.12.3.tgz", + "integrity": "sha512-qVkT1E7yMbUsB2mmJNFmaXMWE2hF8ffqzMMwe9zdAikd8u2VfnsVY2HQcOUi2F38bgbxzlJBEdS1UUhOXdF9GQ==", + "license": "ISC", + "dependencies": { + "@sphinxxxx/color-conversion": "^2.2.2" + } + }, "node_modules/vary": { "version": "1.1.2", "resolved": "https://registry.npmmirror.com/vary/-/vary-1.1.2.tgz", From b846a0f547252689b7b8fc2a334c01141bd4d905 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Thu, 20 Nov 2025 15:35:09 +0800 Subject: [PATCH 3/6] Fix: incorrect retrieval total count with pagination enabled (#11400) ### What problem does this PR solve? Incorrect retrieval total count with pagination enabled. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/nlp/search.py | 150 +++++++++++++++++++++++++++++----------------- 1 file changed, 96 insertions(+), 54 deletions(-) diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 4dbd9945c..a479e5d3f 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -355,75 +355,102 @@ class Dealer: rag_tokenizer.tokenize(ans).split(), rag_tokenizer.tokenize(inst).split()) - def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2, - vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, - rerank_mdl=None, highlight=False, - rank_feature: dict | None = {PAGERANK_FLD: 10}): + def retrieval( + self, + question, + embd_mdl, + tenant_ids, + kb_ids, + page, + page_size, + similarity_threshold=0.2, + vector_similarity_weight=0.3, + top=1024, + doc_ids=None, + aggs=True, + rerank_mdl=None, + highlight=False, + rank_feature: dict | None = {PAGERANK_FLD: 10}, + ): ranks = {"total": 0, "chunks": [], "doc_aggs": {}} if not question: return ranks # Ensure RERANK_LIMIT is multiple of page_size - RERANK_LIMIT = math.ceil(64/page_size) * page_size if page_size>1 else 1 - req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "page": math.ceil(page_size*page/RERANK_LIMIT), "size": RERANK_LIMIT, - "question": question, "vector": True, "topk": top, - "similarity": similarity_threshold, - "available_int": 1} - + RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1 + req = { + "kb_ids": kb_ids, + "doc_ids": doc_ids, + "page": math.ceil(page_size * page / RERANK_LIMIT), + "size": RERANK_LIMIT, + "question": question, + "vector": True, + "topk": top, + "similarity": similarity_threshold, + "available_int": 1, + } if isinstance(tenant_ids, str): tenant_ids = tenant_ids.split(",") - sres = self.search(req, [index_name(tid) for tid in tenant_ids], - kb_ids, embd_mdl, highlight, rank_feature=rank_feature) + sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight, rank_feature=rank_feature) if rerank_mdl and sres.total > 0: - sim, tsim, vsim = self.rerank_by_model(rerank_mdl, - sres, question, 1 - vector_similarity_weight, - vector_similarity_weight, - rank_feature=rank_feature) + sim, tsim, vsim = self.rerank_by_model( + rerank_mdl, + sres, + question, + 1 - vector_similarity_weight, + vector_similarity_weight, + rank_feature=rank_feature, + ) else: - lower_case_doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch') - if lower_case_doc_engine in ["elasticsearch","opensearch"]: + lower_case_doc_engine = os.getenv("DOC_ENGINE", "elasticsearch") + if lower_case_doc_engine in ["elasticsearch", "opensearch"]: # ElasticSearch doesn't normalize each way score before fusion. sim, tsim, vsim = self.rerank( - sres, question, 1 - vector_similarity_weight, vector_similarity_weight, - rank_feature=rank_feature) + sres, + question, + 1 - vector_similarity_weight, + vector_similarity_weight, + rank_feature=rank_feature, + ) else: # Don't need rerank here since Infinity normalizes each way score before fusion. sim = [sres.field[id].get("_score", 0.0) for id in sres.ids] - sim = [s if s is not None else 0. for s in sim] + sim = [s if s is not None else 0.0 for s in sim] tsim = sim vsim = sim - # Already paginated in search function - max_pages = RERANK_LIMIT // page_size - page_index = (page % max_pages) - 1 - begin = max(page_index * page_size, 0) - sim = sim[begin : begin + page_size] + sim_np = np.array(sim, dtype=np.float64) - idx = np.argsort(sim_np * -1) + if sim_np.size == 0: + return ranks + + sorted_idx = np.argsort(sim_np * -1) + + valid_idx = [int(i) for i in sorted_idx if sim_np[i] >= similarity_threshold] + filtered_count = len(valid_idx) + ranks["total"] = int(filtered_count) + + if filtered_count == 0: + return ranks + + max_pages = max(RERANK_LIMIT // max(page_size, 1), 1) + page_index = (page - 1) % max_pages + begin = page_index * page_size + end = begin + page_size + page_idx = valid_idx[begin:end] + dim = len(sres.query_vector) vector_column = f"q_{dim}_vec" zero_vector = [0.0] * dim - filtered_count = (sim_np >= similarity_threshold).sum() - ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error - for i in idx: - if np.float64(sim[i]) < similarity_threshold: - break + for i in page_idx: id = sres.ids[i] chunk = sres.field[id] dnm = chunk.get("docnm_kwd", "") did = chunk.get("doc_id", "") - if len(ranks["chunks"]) >= page_size: - if aggs: - if dnm not in ranks["doc_aggs"]: - ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0} - ranks["doc_aggs"][dnm]["count"] += 1 - continue - break - position_int = chunk.get("position_int", []) d = { "chunk_id": id, @@ -434,12 +461,12 @@ class Dealer: "kb_id": chunk["kb_id"], "important_kwd": chunk.get("important_kwd", []), "image_id": chunk.get("img_id", ""), - "similarity": sim[i], - "vector_similarity": vsim[i], - "term_similarity": tsim[i], + "similarity": float(sim_np[i]), + "vector_similarity": float(vsim[i]), + "term_similarity": float(tsim[i]), "vector": chunk.get(vector_column, zero_vector), "positions": position_int, - "doc_type_kwd": chunk.get("doc_type_kwd", "") + "doc_type_kwd": chunk.get("doc_type_kwd", ""), } if highlight and sres.highlight: if id in sres.highlight: @@ -447,15 +474,30 @@ class Dealer: else: d["highlight"] = d["content_with_weight"] ranks["chunks"].append(d) - if dnm not in ranks["doc_aggs"]: - ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0} - ranks["doc_aggs"][dnm]["count"] += 1 - ranks["doc_aggs"] = [{"doc_name": k, - "doc_id": v["doc_id"], - "count": v["count"]} for k, - v in sorted(ranks["doc_aggs"].items(), - key=lambda x: x[1]["count"] * -1)] - ranks["chunks"] = ranks["chunks"][:page_size] + + if aggs: + for i in valid_idx: + id = sres.ids[i] + chunk = sres.field[id] + dnm = chunk.get("docnm_kwd", "") + did = chunk.get("doc_id", "") + if dnm not in ranks["doc_aggs"]: + ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0} + ranks["doc_aggs"][dnm]["count"] += 1 + + ranks["doc_aggs"] = [ + { + "doc_name": k, + "doc_id": v["doc_id"], + "count": v["count"], + } + for k, v in sorted( + ranks["doc_aggs"].items(), + key=lambda x: x[1]["count"] * -1, + ) + ] + else: + ranks["doc_aggs"] = [] return ranks @@ -564,7 +606,7 @@ class Dealer: ids = relevant_chunks_with_toc(query, toc, chat_mdl, topn*2) if not ids: return chunks - + vector_size = 1024 id2idx = {ck["chunk_id"]: i for i, ck in enumerate(chunks)} for cid, sim in ids: From 0d5589bfda95a5f649722a12741d7195a5440d89 Mon Sep 17 00:00:00 2001 From: balibabu Date: Thu, 20 Nov 2025 15:35:28 +0800 Subject: [PATCH 4/6] Feat: Outputs data is directly synchronized to the canvas without going through the form. #10427 (#11406) ### What problem does this PR solve? Feat: Outputs data is directly synchronized to the canvas without going through the form. #10427 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- web/src/pages/agent/form/agent-form/index.tsx | 27 +++++++++-------- .../use-show-structured-output-dialog.ts | 30 +++++++++++++++---- .../pages/agent/form/agent-form/use-values.ts | 4 ++- .../agent/form/agent-form/use-watch-change.ts | 11 +------ 4 files changed, 43 insertions(+), 29 deletions(-) diff --git a/web/src/pages/agent/form/agent-form/index.tsx b/web/src/pages/agent/form/agent-form/index.tsx index 2b23010cd..38c49b666 100644 --- a/web/src/pages/agent/form/agent-form/index.tsx +++ b/web/src/pages/agent/form/agent-form/index.tsx @@ -22,7 +22,8 @@ import { Switch } from '@/components/ui/switch'; import { LlmModelType } from '@/constants/knowledge'; import { useFindLlmByUuid } from '@/hooks/use-llm-request'; import { zodResolver } from '@hookform/resolvers/zod'; -import { memo, useCallback, useEffect, useMemo } from 'react'; +import { get } from 'lodash'; +import { memo, useEffect, useMemo } from 'react'; import { useForm, useWatch } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; import { z } from 'zod'; @@ -45,7 +46,10 @@ import { AgentTools, Agents } from './agent-tools'; import { StructuredOutputDialog } from './structured-output-dialog'; import { StructuredOutputPanel } from './structured-output-panel'; import { useBuildPromptExtraPromptOptions } from './use-build-prompt-options'; -import { useShowStructuredOutputDialog } from './use-show-structured-output-dialog'; +import { + useHandleShowStructuredOutput, + useShowStructuredOutputDialog, +} from './use-show-structured-output-dialog'; import { useValues } from './use-values'; import { useWatchFormChange } from './use-watch-change'; @@ -121,22 +125,19 @@ function AgentForm({ node }: INextOperatorForm) { }); const { - initialStructuredOutput, showStructuredOutputDialog, structuredOutputDialogVisible, hideStructuredOutputDialog, handleStructuredOutputDialogOk, } = useShowStructuredOutputDialog(node?.id); - const updateNodeForm = useGraphStore((state) => state.updateNodeForm); + const structuredOutput = get( + node, + `data.form.outputs.${AgentStructuredOutputField}`, + ); - const handleShowStructuredOutput = useCallback( - (val: boolean) => { - if (node?.id && val) { - updateNodeForm(node?.id, {}, ['outputs', AgentStructuredOutputField]); - } - }, - [node?.id, updateNodeForm], + const { handleShowStructuredOutput } = useHandleShowStructuredOutput( + node?.id, ); useEffect(() => { @@ -327,7 +328,7 @@ function AgentForm({ node }: INextOperatorForm) { )} @@ -337,7 +338,7 @@ function AgentForm({ node }: INextOperatorForm) { )} diff --git a/web/src/pages/agent/form/agent-form/use-show-structured-output-dialog.ts b/web/src/pages/agent/form/agent-form/use-show-structured-output-dialog.ts index 19e38cefe..d66fcfb45 100644 --- a/web/src/pages/agent/form/agent-form/use-show-structured-output-dialog.ts +++ b/web/src/pages/agent/form/agent-form/use-show-structured-output-dialog.ts @@ -1,6 +1,8 @@ import { JSONSchema } from '@/components/jsonjoy-builder'; +import { AgentStructuredOutputField } from '@/constants/agent'; import { useSetModalState } from '@/hooks/common-hooks'; import { useCallback } from 'react'; +import { initialAgentValues } from '../../constant'; import useGraphStore from '../../store'; export function useShowStructuredOutputDialog(nodeId?: string) { @@ -9,15 +11,13 @@ export function useShowStructuredOutputDialog(nodeId?: string) { showModal: showStructuredOutputDialog, hideModal: hideStructuredOutputDialog, } = useSetModalState(); - const { updateNodeForm, getNode } = useGraphStore((state) => state); - - const initialStructuredOutput = getNode(nodeId)?.data.form.outputs.structured; + const { updateNodeForm } = useGraphStore((state) => state); const handleStructuredOutputDialogOk = useCallback( (values: JSONSchema) => { // Sync data to canvas if (nodeId) { - updateNodeForm(nodeId, values, ['outputs', 'structured']); + updateNodeForm(nodeId, values, ['outputs', AgentStructuredOutputField]); } hideStructuredOutputDialog(); }, @@ -25,10 +25,30 @@ export function useShowStructuredOutputDialog(nodeId?: string) { ); return { - initialStructuredOutput, structuredOutputDialogVisible, showStructuredOutputDialog, hideStructuredOutputDialog, handleStructuredOutputDialogOk, }; } + +export function useHandleShowStructuredOutput(nodeId?: string) { + const updateNodeForm = useGraphStore((state) => state.updateNodeForm); + + const handleShowStructuredOutput = useCallback( + (val: boolean) => { + if (nodeId) { + if (val) { + updateNodeForm(nodeId, {}, ['outputs', AgentStructuredOutputField]); + } else { + updateNodeForm(nodeId, initialAgentValues.outputs, ['outputs']); + } + } + }, + [nodeId, updateNodeForm], + ); + + return { + handleShowStructuredOutput, + }; +} diff --git a/web/src/pages/agent/form/agent-form/use-values.ts b/web/src/pages/agent/form/agent-form/use-values.ts index f8747e4b4..fb7f94861 100644 --- a/web/src/pages/agent/form/agent-form/use-values.ts +++ b/web/src/pages/agent/form/agent-form/use-values.ts @@ -6,8 +6,10 @@ import { initialAgentValues } from '../../constant'; // You need to exclude the mcp and tools fields that are not in the form, // otherwise the form data update will reset the tools or mcp data to an array +// Exclude data that is not in the form to avoid writing this data to the canvas when using useWatch. +// Outputs, tools, and MCP data are directly synchronized to the canvas without going through the form. function omitToolsAndMcp(values: Record) { - return omit(values, ['mcp', 'tools']); + return omit(values, ['mcp', 'tools', 'outputs']); } export function useValues(node?: RAGFlowNodeType) { diff --git a/web/src/pages/agent/form/agent-form/use-watch-change.ts b/web/src/pages/agent/form/agent-form/use-watch-change.ts index 7c53a8d40..98b0ecf31 100644 --- a/web/src/pages/agent/form/agent-form/use-watch-change.ts +++ b/web/src/pages/agent/form/agent-form/use-watch-change.ts @@ -1,7 +1,6 @@ -import { omit } from 'lodash'; import { useEffect } from 'react'; import { UseFormReturn, useWatch } from 'react-hook-form'; -import { AgentStructuredOutputField, PromptRole } from '../../constant'; +import { PromptRole } from '../../constant'; import useGraphStore from '../../store'; export function useWatchFormChange(id?: string, form?: UseFormReturn) { @@ -17,14 +16,6 @@ export function useWatchFormChange(id?: string, form?: UseFormReturn) { prompts: [{ role: PromptRole.User, content: values.prompts }], }; - if (!values.showStructuredOutput) { - nextValues = { - ...nextValues, - outputs: omit(values.outputs, [AgentStructuredOutputField]), - }; - } else { - nextValues = omit(nextValues, 'outputs'); - } updateNodeForm(id, nextValues); } }, [form?.formState.isDirty, id, updateNodeForm, values]); From c8ab9079b3debc7595d456de809755ba5fcf8515 Mon Sep 17 00:00:00 2001 From: buua436 <66937541+buua436@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:00:38 +0800 Subject: [PATCH 5/6] Fix:improve multi-column document detection (#11415) ### What problem does this PR solve? change: improve multi-column document detection ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/pdf_parser.py | 99 ++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 38 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 5bc877a6a..6d8431c82 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -33,6 +33,8 @@ import xgboost as xgb from huggingface_hub import snapshot_download from PIL import Image from pypdf import PdfReader as pdf2_read +from sklearn.cluster import KMeans +from sklearn.metrics import silhouette_score from common.file_utils import get_project_base_directory from common.misc_utils import pip_install_torch @@ -353,7 +355,6 @@ class RAGFlowPdfParser: def _assign_column(self, boxes, zoomin=3): if not boxes: return boxes - if all("col_id" in b for b in boxes): return boxes @@ -361,61 +362,80 @@ class RAGFlowPdfParser: for b in boxes: by_page[b["page_number"]].append(b) - page_info = {} # pg -> dict(page_w, left_edge, cand_cols) - counter = Counter() + page_cols = {} for pg, bxs in by_page.items(): if not bxs: - page_info[pg] = {"page_w": 1.0, "left_edge": 0.0, "cand": 1} - counter[1] += 1 + page_cols[pg] = 1 continue - if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= pg: - page_w = self.page_images[pg - 1].size[0] / max(1, zoomin) - left_edge = 0.0 - else: - xs0 = [box["x0"] for box in bxs] - xs1 = [box["x1"] for box in bxs] - left_edge = float(min(xs0)) - page_w = max(1.0, float(max(xs1) - left_edge)) + x0s_raw = np.array([b["x0"] for b in bxs], dtype=float) - widths = [max(1.0, (box["x1"] - box["x0"])) for box in bxs] - median_w = float(np.median(widths)) if widths else 1.0 + min_x0 = np.min(x0s_raw) + max_x1 = np.max([b["x1"] for b in bxs]) + width = max_x1 - min_x0 - raw_cols = int(page_w / max(1.0, median_w)) + INDENT_TOL = width * 0.12 + x0s = [] + for x in x0s_raw: + if abs(x - min_x0) < INDENT_TOL: + x0s.append([min_x0]) + else: + x0s.append([x]) + x0s = np.array(x0s, dtype=float) + + max_try = min(4, len(bxs)) + if max_try < 2: + max_try = 1 + best_k = 1 + best_score = -1 - # cand = raw_cols if (raw_cols >= 2 and median_w < page_w / raw_cols * 0.8) else 1 - cand = raw_cols + for k in range(1, max_try + 1): + km = KMeans(n_clusters=k, n_init="auto") + labels = km.fit_predict(x0s) - page_info[pg] = {"page_w": page_w, "left_edge": left_edge, "cand": cand} - counter[cand] += 1 + centers = np.sort(km.cluster_centers_.flatten()) + if len(centers) > 1: + try: + score = silhouette_score(x0s, labels) + except ValueError: + continue + else: + score = 0 + print(f"{k=},{score=}",flush=True) + if score > best_score: + best_score = score + best_k = k - logging.info(f"[Page {pg}] median_w={median_w:.2f}, page_w={page_w:.2f}, raw_cols={raw_cols}, cand={cand}") + page_cols[pg] = best_k + logging.info(f"[Page {pg}] best_score={best_score:.2f}, best_k={best_k}") - global_cols = counter.most_common(1)[0][0] + + global_cols = Counter(page_cols.values()).most_common(1)[0][0] logging.info(f"Global column_num decided by majority: {global_cols}") + for pg, bxs in by_page.items(): if not bxs: continue + k = page_cols[pg] + if len(bxs) < k: + k = 1 + x0s = np.array([[b["x0"]] for b in bxs], dtype=float) + km = KMeans(n_clusters=k, n_init="auto") + labels = km.fit_predict(x0s) - page_w = page_info[pg]["page_w"] - left_edge = page_info[pg]["left_edge"] + centers = km.cluster_centers_.flatten() + order = np.argsort(centers) - if global_cols == 1: - for box in bxs: - box["col_id"] = 0 - continue + remap = {orig: new for new, orig in enumerate(order)} - for box in bxs: - w = box["x1"] - box["x0"] - if w >= 0.8 * page_w: - box["col_id"] = 0 - continue - cx = 0.5 * (box["x0"] + box["x1"]) - norm_cx = (cx - left_edge) / page_w - norm_cx = max(0.0, min(norm_cx, 0.999999)) - box["col_id"] = int(min(global_cols - 1, norm_cx * global_cols)) + for b, lb in zip(bxs, labels): + b["col_id"] = remap[lb] + + grouped = defaultdict(list) + for b in bxs: + grouped[b["col_id"]].append(b) return boxes @@ -1303,7 +1323,10 @@ class RAGFlowPdfParser: positions = [] for ii, (pns, left, right, top, bottom) in enumerate(poss): - right = left + max_width + if 0 < ii < len(poss) - 1: + right = max(left + 10, right) + else: + right = left + max_width bottom *= ZM for pn in pns[1:]: if 0 <= pn - 1 < page_count: From d3d2ccc76c1078dd401707eb6b3bf6db51200d1d Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Thu, 20 Nov 2025 19:07:17 +0800 Subject: [PATCH 6/6] Feat: add more chunking method (#11413) ### What problem does this PR solve? Feat: add more chunking method #11311 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- deepdoc/parser/docling_parser.py | 14 +++++++---- deepdoc/parser/mineru_parser.py | 12 +++++++--- docker/.env | 9 ++++++- rag/app/manual.py | 5 ++-- rag/app/naive.py | 4 ++++ rag/app/paper.py | 41 +++++++++++++++++++++++++------- 6 files changed, 66 insertions(+), 19 deletions(-) diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py index 59fec9250..965f82265 100644 --- a/deepdoc/parser/docling_parser.py +++ b/deepdoc/parser/docling_parser.py @@ -187,7 +187,7 @@ class DoclingParser(RAGFlowPdfParser): bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3]) yield (DoclingContentType.EQUATION.value, text, bbox) - def _transfer_to_sections(self, doc) -> list[tuple[str, str]]: + def _transfer_to_sections(self, doc, parse_method: str) -> list[tuple[str, str]]: sections: list[tuple[str, str]] = [] for typ, payload, bbox in self._iter_doc_items(doc): if typ == DoclingContentType.TEXT.value: @@ -200,7 +200,12 @@ class DoclingParser(RAGFlowPdfParser): continue tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else "" - sections.append((section, tag)) + if parse_method == "manual": + sections.append((section, typ, tag)) + elif parse_method == "paper": + sections.append((section + tag, typ)) + else: + sections.append((section, tag)) return sections def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1): @@ -282,7 +287,8 @@ class DoclingParser(RAGFlowPdfParser): output_dir: Optional[str] = None, lang: Optional[str] = None, method: str = "auto", - delete_output: bool = True, + delete_output: bool = True, + parse_method: str = "raw" ): if not self.check_installation(): @@ -318,7 +324,7 @@ class DoclingParser(RAGFlowPdfParser): if callback: callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages") - sections = self._transfer_to_sections(doc) + sections = self._transfer_to_sections(doc, parse_method=parse_method) tables = self._transfer_to_tables(doc) if callback: diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index d2b694188..d4834de39 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -476,7 +476,7 @@ class MinerUParser(RAGFlowPdfParser): item[key] = str((subdir / item[key]).resolve()) return data - def _transfer_to_sections(self, outputs: list[dict[str, Any]]): + def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): sections = [] for output in outputs: match output["type"]: @@ -497,7 +497,11 @@ class MinerUParser(RAGFlowPdfParser): case MinerUContentType.DISCARDED: pass - if section: + if section and parse_method == "manual": + sections.append((section, output["type"], self._line_tag(output))) + elif section and parse_method == "paper": + sections.append((section + self._line_tag(output), output["type"])) + else: sections.append((section, self._line_tag(output))) return sections @@ -516,6 +520,7 @@ class MinerUParser(RAGFlowPdfParser): method: str = "auto", server_url: Optional[str] = None, delete_output: bool = True, + parse_method: str = "raw" ) -> tuple: import shutil @@ -565,7 +570,8 @@ class MinerUParser(RAGFlowPdfParser): self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") if callback: callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") - return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs) + + return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs) finally: if temp_pdf and temp_pdf.exists(): try: diff --git a/docker/.env b/docker/.env index d7e4b025f..6423b7824 100644 --- a/docker/.env +++ b/docker/.env @@ -230,9 +230,16 @@ REGISTER_ENABLED=1 # SANDBOX_MAX_MEMORY=256m # b, k, m, g # SANDBOX_TIMEOUT=10s # s, m, 1m30s -# Enable DocLing and Mineru +# Enable DocLing USE_DOCLING=false + +# Enable Mineru USE_MINERU=false +MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru" +MINERU_DELETE_OUTPUT=0 # keep output directory +MINERU_BACKEND=pipeline # or another backend you prefer + + # pptx support DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 \ No newline at end of file diff --git a/rag/app/manual.py b/rag/app/manual.py index 5808e2498..124864041 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -213,6 +213,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang = lang, callback = callback, pdf_cls = Pdf, + parse_method = "manual", **kwargs ) @@ -225,7 +226,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif len(section) != 3: raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})") - txt, sec_id, poss = section + txt, layoutno, poss = section if isinstance(poss, str): poss = pdf_parser.extract_positions(poss) first = poss[0] # tuple: ([pn], x1, x2, y1, y2) @@ -235,7 +236,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pn = pn[0] # [pn] -> pn poss[0] = (pn, *first[1:]) - return (txt, sec_id, poss) + return (txt, layoutno, poss) sections = [_normalize_section(sec) for sec in sections] diff --git a/rag/app/naive.py b/rag/app/naive.py index 49dca17af..562336d7f 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -59,6 +59,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987") pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api) + parse_method = kwargs.get("parse_method", "raw") if not pdf_parser.check_installation(): callback(-1, "MinerU not found.") @@ -72,12 +73,14 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" backend=os.environ.get("MINERU_BACKEND", "pipeline"), server_url=os.environ.get("MINERU_SERVER_URL", ""), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), + parse_method=parse_method ) return sections, tables, pdf_parser def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): pdf_parser = DoclingParser() + parse_method = kwargs.get("parse_method", "raw") if not pdf_parser.check_installation(): callback(-1, "Docling not found.") @@ -89,6 +92,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese callback=callback, output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), + parse_method=parse_method ) return sections, tables, pdf_parser diff --git a/rag/app/paper.py b/rag/app/paper.py index d95976c9f..222be0762 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -21,8 +21,10 @@ import re from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper from common.constants import ParserType from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks -from deepdoc.parser import PdfParser, PlainParser +from deepdoc.parser import PdfParser import numpy as np +from rag.app.naive import by_plaintext, PARSERS + class Pdf(PdfParser): def __init__(self): @@ -147,19 +149,40 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, "parser_config", { "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) if re.search(r"\.pdf$", filename, re.IGNORECASE): - if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text": - pdf_parser = PlainParser() + layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + + if isinstance(layout_recognizer, bool): + layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" + + name = layout_recognizer.strip().lower() + pdf_parser = PARSERS.get(name, by_plaintext) + callback(0.1, "Start to parse.") + + if name == "deepdoc": + pdf_parser = Pdf() + paper = pdf_parser(filename if not binary else binary, + from_page=from_page, to_page=to_page, callback=callback) + else: + sections, tables, pdf_parser = pdf_parser( + filename=filename, + binary=binary, + from_page=from_page, + to_page=to_page, + lang=lang, + callback=callback, + pdf_cls=Pdf, + parse_method="paper", + **kwargs + ) + paper = { "title": filename, "authors": " ", "abstract": "", - "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0], - "tables": [] + "sections": sections, + "tables": tables } - else: - pdf_parser = Pdf() - paper = pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback) + tbls=paper["tables"] tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) paper["tables"] = tbls