From 06cef71ba6406b453a29d1e1832d4f491bd31a81 Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Thu, 20 Nov 2025 14:31:12 +0800
Subject: [PATCH 1/6] Feat: add or logic operations for meta data filters.
 (#11404)

### What problem does this PR solve?

#11376 #11387

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 agent/tools/retrieval.py              |  6 +-
 api/apps/chunk_app.py                 |  6 +-
 api/apps/sdk/agents.py                |  4 +-
 api/apps/sdk/dify_retrieval.py        |  4 +-
 api/apps/sdk/doc.py                   |  4 +-
 api/apps/sdk/session.py               | 14 ++--
 api/db/services/canvas_service.py     | 10 +--
 api/db/services/dialog_service.py     | 25 +++----
 docs/references/http_api_reference.md |  1 +
 rag/prompts/generator.py              |  8 ++-
 rag/prompts/meta_filter.md            | 95 ++++++++++++++++++++++++---
 11 files changed, 129 insertions(+), 48 deletions(-)

diff --git a/agent/tools/retrieval.py b/agent/tools/retrieval.py
index ab388a08e..c3a01e517 100644
--- a/agent/tools/retrieval.py
+++ b/agent/tools/retrieval.py
@@ -132,8 +132,8 @@ class Retrieval(ToolBase, ABC):
             metas = DocumentService.get_meta_by_kbs(kb_ids)
             if self._param.meta_data_filter.get("method") == "auto":
                 chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT)
-                filters = gen_meta_filter(chat_mdl, metas, query)
-                doc_ids.extend(meta_filter(metas, filters))
+                filters: dict = gen_meta_filter(chat_mdl, metas, query)
+                doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
                 if not doc_ids:
                     doc_ids = None
             elif self._param.meta_data_filter.get("method") == "manual":
@@ -165,7 +165,7 @@ class Retrieval(ToolBase, ABC):
 
                     out_parts.append(s[last:])
                     flt["value"] = "".join(out_parts)
-                doc_ids.extend(meta_filter(metas, filters))
+                doc_ids.extend(meta_filter(metas, filters, self._param.meta_data_filter.get("logic", "and")))
                 if not doc_ids:
                     doc_ids = None
 
diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index 7341e336a..e121bcba7 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -305,12 +305,12 @@ async def retrieval_test():
         metas = DocumentService.get_meta_by_kbs(kb_ids)
         if meta_data_filter.get("method") == "auto":
             chat_mdl = LLMBundle(current_user.id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
         elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
 
diff --git a/api/apps/sdk/agents.py b/api/apps/sdk/agents.py
index dda696bf4..b20a22ad8 100644
--- a/api/apps/sdk/agents.py
+++ b/api/apps/sdk/agents.py
@@ -159,10 +159,10 @@ async def webhook(tenant_id: str, agent_id: str):
             data=False, message=str(e),
             code=RetCode.EXCEPTION_ERROR)
 
-    def sse():
+    async def sse():
         nonlocal canvas
         try:
-            for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req):
+            async for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req):
                 yield "data:" + json.dumps(ans, ensure_ascii=False) + "\n\n"
 
             cvs.dsl = json.loads(str(canvas))
diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py
index b86fb5c1e..55ea54faf 100644
--- a/api/apps/sdk/dify_retrieval.py
+++ b/api/apps/sdk/dify_retrieval.py
@@ -120,7 +120,7 @@ async def retrieval(tenant_id):
     retrieval_setting = req.get("retrieval_setting", {})
     similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
     top = int(retrieval_setting.get("top_k", 1024))
-    metadata_condition = req.get("metadata_condition", {})
+    metadata_condition = req.get("metadata_condition", {}) or {}
     metas = DocumentService.get_meta_by_kbs([kb_id])
 
     doc_ids = []
@@ -132,7 +132,7 @@ async def retrieval(tenant_id):
 
         embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
         if metadata_condition:
-            doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
+            doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
         if not doc_ids and metadata_condition:
             doc_ids = ["-999"]
         ranks = settings.retriever.retrieval(
diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 8593667d7..84300ac3c 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -1442,9 +1442,9 @@ async def retrieval_test(tenant_id):
         if doc_id not in doc_ids_list:
             return get_error_data_result(f"The datasets don't own the document {doc_id}")
     if not doc_ids:
-        metadata_condition = req.get("metadata_condition", {})
+        metadata_condition = req.get("metadata_condition", {}) or {}
         metas = DocumentService.get_meta_by_kbs(kb_ids)
-        doc_ids = meta_filter(metas, convert_conditions(metadata_condition))
+        doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
     similarity_threshold = float(req.get("similarity_threshold", 0.2))
     vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
     top = int(req.get("top_k", 1024))
diff --git a/api/apps/sdk/session.py b/api/apps/sdk/session.py
index 98151a5fe..533375622 100644
--- a/api/apps/sdk/session.py
+++ b/api/apps/sdk/session.py
@@ -428,17 +428,15 @@ async def agents_completion_openai_compatibility(tenant_id, agent_id):
         return resp
     else:
         # For non-streaming, just return the response directly
-        response = next(
-            completion_openai(
+        async for response in completion_openai(
                 tenant_id,
                 agent_id,
                 question,
                 session_id=req.pop("session_id", req.get("id", "")) or req.get("metadata", {}).get("id", ""),
                 stream=False,
                 **req,
-            )
-        )
-        return jsonify(response)
+            ):
+            return jsonify(response)
 
 
 @manager.route("/agents/<agent_id>/completions", methods=["POST"])  # noqa: F821
@@ -977,12 +975,12 @@ async def retrieval_test_embedded():
         metas = DocumentService.get_meta_by_kbs(kb_ids)
         if meta_data_filter.get("method") == "auto":
             chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
         elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
 
diff --git a/api/db/services/canvas_service.py b/api/db/services/canvas_service.py
index db8e16068..57b4b5c2a 100644
--- a/api/db/services/canvas_service.py
+++ b/api/db/services/canvas_service.py
@@ -177,7 +177,7 @@ class UserCanvasService(CommonService):
         return True
 
 
-def completion(tenant_id, agent_id, session_id=None, **kwargs):
+async def completion(tenant_id, agent_id, session_id=None, **kwargs):
     query = kwargs.get("query", "") or kwargs.get("question", "")
     files = kwargs.get("files", [])
     inputs = kwargs.get("inputs", {})
@@ -219,7 +219,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
         "id": message_id
     })
     txt = ""
-    for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs):
+    async for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs):
         ans["session_id"] = session_id
         if ans["event"] == "message":
             txt += ans["data"]["content"]
@@ -237,7 +237,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
     API4ConversationService.append_message(conv["id"], conv)
 
 
-def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):
+async def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):
     tiktoken_encoder = tiktoken.get_encoding("cl100k_base")
     prompt_tokens = len(tiktoken_encoder.encode(str(question)))
     user_id = kwargs.get("user_id", "")
@@ -245,7 +245,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
     if stream:
         completion_tokens = 0
         try:
-            for ans in completion(
+            async for ans in completion(
                 tenant_id=tenant_id,
                 agent_id=agent_id,
                 session_id=session_id,
@@ -304,7 +304,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
         try:
             all_content = ""
             reference = {}
-            for ans in completion(
+            async for ans in completion(
                 tenant_id=tenant_id,
                 agent_id=agent_id,
                 session_id=session_id,
diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py
index d2f3b9bc1..db878574d 100644
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -287,7 +287,7 @@ def convert_conditions(metadata_condition):
     ]
 
 
-def meta_filter(metas: dict, filters: list[dict]):
+def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
     doc_ids = set([])
 
     def filter_out(v2docs, operator, value):
@@ -331,7 +331,10 @@ def meta_filter(metas: dict, filters: list[dict]):
             if not doc_ids:
                 doc_ids = set(ids)
             else:
-                doc_ids = doc_ids & set(ids)
+                if logic == "and":
+                    doc_ids = doc_ids & set(ids)
+                else:
+                    doc_ids = doc_ids | set(ids)
             if not doc_ids:
                 return []
     return list(doc_ids)
@@ -407,12 +410,12 @@ def chat(dialog, messages, stream=True, **kwargs):
     if dialog.meta_data_filter:
         metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
         if dialog.meta_data_filter.get("method") == "auto":
-            filters = gen_meta_filter(chat_mdl, metas, questions[-1])
-            attachments.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, questions[-1])
+            attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not attachments:
                 attachments = None
         elif dialog.meta_data_filter.get("method") == "manual":
-            attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"]))
+            attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"], dialog.meta_data_filter.get("logic", "and")))
             if not attachments:
                 attachments = None
 
@@ -778,12 +781,12 @@ def ask(question, kb_ids, tenant_id, chat_llm_name=None, search_config={}):
     if meta_data_filter:
         metas = DocumentService.get_meta_by_kbs(kb_ids)
         if meta_data_filter.get("method") == "auto":
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
         elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
 
@@ -853,12 +856,12 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
     if meta_data_filter:
         metas = DocumentService.get_meta_by_kbs(kb_ids)
         if meta_data_filter.get("method") == "auto":
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
         elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
 
diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md
index 481614d13..bc1b15670 100644
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -2085,6 +2085,7 @@ curl --request POST \
           "dataset_ids": ["b2a62730759d11ef987d0242ac120004"],
           "document_ids": ["77df9ef4759a11ef8bdd0242ac120004"],
           "metadata_condition": {
+            "logic": "and",
             "conditions": [
               {
                 "name": "author",
diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py
index 3d8438f4a..82c6466a2 100644
--- a/rag/prompts/generator.py
+++ b/rag/prompts/generator.py
@@ -429,7 +429,7 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st
     return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
 
 
-def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
+def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> dict:
     sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render(
         current_date=datetime.datetime.today().strftime('%Y-%m-%d'),
         metadata_keys=json.dumps(meta_data),
@@ -440,11 +440,13 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
     ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
     try:
         ans = json_repair.loads(ans)
-        assert isinstance(ans, list), ans
+        assert isinstance(ans, dict), ans
+        assert "conditions" in ans and isinstance(ans["conditions"], list), ans
         return ans
     except Exception:
         logging.exception(f"Loading json failure: {ans}")
-    return []
+
+    return {"conditions": []}
 
 
 def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):
diff --git a/rag/prompts/meta_filter.md b/rag/prompts/meta_filter.md
index 89e322fe5..7df3c4885 100644
--- a/rag/prompts/meta_filter.md
+++ b/rag/prompts/meta_filter.md
@@ -9,11 +9,13 @@ You are a metadata filtering condition generator. Analyze the user's question an
      }
 
 2. **Output Requirements**:
-   - Always output a JSON array of filter objects
-   - Each object must have:
+   - Always output a JSON dictionary with only 2 keys: 'conditions'(filter objects) and 'logic' between the conditions ('and' or 'or').
+   - Each filter object in conditions must have:
         "key": (metadata attribute name),
         "value": (string value to compare),
         "op": (operator from allowed list)
+   - Logic between all the conditions: 'and'(Intersection of results for each condition) / 'or' (union of results for all conditions)
+
 
 3. **Operator Guide**:
    - Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"]
@@ -32,22 +34,97 @@ You are a metadata filtering condition generator. Analyze the user's question an
         - Attribute doesn't exist in metadata
         - Value has no match in metadata
 
-5. **Example**:
+5. **Example A**:
    - User query: "上市日期七月份的有哪些商品，不要蓝色的"
    - Metadata: { "color": {...}, "listing_date": {...} }
    - Output: 
-        [
+   {
+        "logic": "and",
+        "conditions": [
           {"key": "listing_date", "value": "2025-07-01", "op": "≥"},
           {"key": "listing_date", "value": "2025-08-01", "op": "<"},
           {"key": "color", "value": "blue", "op": "≠"}
         ]
+   }
 
-6. **Final Output**:
-   - ONLY output valid JSON array
+6. **Example B**:
+   - User query: "Both blue and red are acceptable."
+   - Metadata: { "color": {...}, "listing_date": {...} }
+   - Output: 
+   {
+        "logic": "or",
+        "conditions": [
+          {"key": "color", "value": "blue", "op": "="},
+          {"key": "color", "value": "red", "op": "="}
+        ]
+   }
+
+7. **Final Output**:
+   - ONLY output valid JSON dictionary
    - NO additional text/explanations
+   - Json schema is as following:
+```json
+{
+  "type": "object",
+  "properties": {
+    "logic": {
+      "type": "string",
+      "description": "Logic relationship between all the conditions, the default is 'and'.",
+      "enum": [
+        "and",
+        "or"
+      ]
+    },
+    "conditions": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "key": {
+            "type": "string",
+            "description": "Metadata attribute name."
+          },
+          "value": {
+            "type": "string",
+            "description": "Value to compare."
+          },
+          "op": {
+            "type": "string",
+            "description": "Operator from allowed list.",
+            "enum": [
+              "contains",
+              "not contains",
+              "start with",
+              "end with",
+              "empty",
+              "not empty",
+              "=",
+              "≠",
+              ">",
+              "<",
+              "≥",
+              "≤"
+            ]
+          }
+        },
+        "required": [
+          "key",
+          "value",
+          "op"
+        ],
+        "additionalProperties": false
+      }
+    }
+  },
+  "required": [
+    "conditions"
+  ],
+  "additionalProperties": false
+}
+```
 
 **Current Task**:
-- Today's date: {{current_date}}
-- Available metadata keys: {{metadata_keys}}
-- User query: "{{user_question}}"
+- Today's date: {{ current_date }}
+- Available metadata keys: {{ metadata_keys }}
+- User query: "{{ user_question }}"
 

From 69578ebfceec893f4622403f38e0fe509dbd7e25 Mon Sep 17 00:00:00 2001
From: chanx <1243304602@qq.com>
Date: Thu, 20 Nov 2025 15:32:41 +0800
Subject: [PATCH 2/6] Fix: Change package-lock.json (#11407)

### What problem does this PR solve?

Fix: Change package-lock.json

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 web/package-lock.json | 96 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/web/package-lock.json b/web/package-lock.json
index e8d87301b..7bae0ed0f 100644
--- a/web/package-lock.json
+++ b/web/package-lock.json
@@ -66,6 +66,7 @@
         "input-otp": "^1.4.1",
         "js-base64": "^3.7.5",
         "jsencrypt": "^3.3.2",
+        "jsoneditor": "^10.4.2",
         "lexical": "^0.23.1",
         "lodash": "^4.17.21",
         "lucide-react": "^0.546.0",
@@ -8998,6 +8999,12 @@
         "@sinonjs/commons": "^3.0.0"
       }
     },
+    "node_modules/@sphinxxxx/color-conversion": {
+      "version": "2.2.2",
+      "resolved": "https://registry.npmmirror.com/@sphinxxxx/color-conversion/-/color-conversion-2.2.2.tgz",
+      "integrity": "sha512-XExJS3cLqgrmNBIP3bBw6+1oQ1ksGjFh0+oClDKFYpCCqx/hlqwWO5KO/S63fzUo67SxI9dMrF0y5T/Ey7h8Zw==",
+      "license": "ISC"
+    },
     "node_modules/@storybook/addon-docs": {
       "version": "9.1.4",
       "resolved": "https://registry.npmmirror.com/@storybook/addon-docs/-/addon-docs-9.1.4.tgz",
@@ -12962,6 +12969,12 @@
         "node": ">= 0.6"
       }
     },
+    "node_modules/ace-builds": {
+      "version": "1.43.4",
+      "resolved": "https://registry.npmmirror.com/ace-builds/-/ace-builds-1.43.4.tgz",
+      "integrity": "sha512-8hAxVfo2ImICd69BWlZwZlxe9rxDGDjuUhh+WeWgGDvfBCE+r3lkynkQvIovDz4jcMi8O7bsEaFygaDT+h9sBA==",
+      "license": "BSD-3-Clause"
+    },
     "node_modules/acorn": {
       "version": "8.15.0",
       "resolved": "https://registry.npmmirror.com/acorn/-/acorn-8.15.0.tgz",
@@ -21894,6 +21907,12 @@
         "@pkgjs/parseargs": "^0.11.0"
       }
     },
+    "node_modules/javascript-natural-sort": {
+      "version": "0.7.1",
+      "resolved": "https://registry.npmmirror.com/javascript-natural-sort/-/javascript-natural-sort-0.7.1.tgz",
+      "integrity": "sha512-nO6jcEfZWQXDhOiBtG2KvKyEptz7RVbpGP4vTD2hLBdmNQSsCiicO2Ioinv6UI4y9ukqnBpy+XZ9H6uLNgJTlw==",
+      "license": "MIT"
+    },
     "node_modules/javascript-stringify": {
       "version": "2.1.0",
       "resolved": "https://registry.npmmirror.com/javascript-stringify/-/javascript-stringify-2.1.0.tgz",
@@ -24253,6 +24272,15 @@
         "jiti": "bin/jiti.js"
       }
     },
+    "node_modules/jmespath": {
+      "version": "0.16.0",
+      "resolved": "https://registry.npmmirror.com/jmespath/-/jmespath-0.16.0.tgz",
+      "integrity": "sha512-9FzQjJ7MATs1tSpnco1K6ayiYE3figslrXA72G2HQ/n76RzvYlofyi5QM+iX4YRs/pu3yzxlVQSST23+dMDknw==",
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">= 0.6.0"
+      }
+    },
     "node_modules/js-base64": {
       "version": "3.7.5",
       "resolved": "https://registry.npmmirror.com/js-base64/-/js-base64-3.7.5.tgz",
@@ -24357,6 +24385,12 @@
       "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
       "license": "MIT"
     },
+    "node_modules/json-source-map": {
+      "version": "0.6.1",
+      "resolved": "https://registry.npmmirror.com/json-source-map/-/json-source-map-0.6.1.tgz",
+      "integrity": "sha512-1QoztHPsMQqhDq0hlXY5ZqcEdUzxQEIxgFkKl4WUp2pgShObl+9ovi4kRh2TfvAfxAoHOJ9vIMEqk3k4iex7tg==",
+      "license": "MIT"
+    },
     "node_modules/json-stable-stringify-without-jsonify": {
       "version": "1.0.1",
       "resolved": "https://registry.npmmirror.com/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz",
@@ -24393,6 +24427,44 @@
         "node": ">=6"
       }
     },
+    "node_modules/jsoneditor": {
+      "version": "10.4.2",
+      "resolved": "https://registry.npmmirror.com/jsoneditor/-/jsoneditor-10.4.2.tgz",
+      "integrity": "sha512-SQPCXlanU4PqdVsYuj2X7yfbLiiJYjklbksGfMKPsuwLhAIPxDlG43jYfXieGXvxpuq1fkw08YoRbkKXKabcLA==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "ace-builds": "^1.36.2",
+        "ajv": "^6.12.6",
+        "javascript-natural-sort": "^0.7.1",
+        "jmespath": "^0.16.0",
+        "json-source-map": "^0.6.1",
+        "jsonrepair": "^3.8.1",
+        "picomodal": "^3.0.0",
+        "vanilla-picker": "^2.12.3"
+      }
+    },
+    "node_modules/jsoneditor/node_modules/ajv": {
+      "version": "6.12.6",
+      "resolved": "https://registry.npmmirror.com/ajv/-/ajv-6.12.6.tgz",
+      "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
+      "license": "MIT",
+      "dependencies": {
+        "fast-deep-equal": "^3.1.1",
+        "fast-json-stable-stringify": "^2.0.0",
+        "json-schema-traverse": "^0.4.1",
+        "uri-js": "^4.2.2"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/epoberezkin"
+      }
+    },
+    "node_modules/jsoneditor/node_modules/json-schema-traverse": {
+      "version": "0.4.1",
+      "resolved": "https://registry.npmmirror.com/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
+      "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
+      "license": "MIT"
+    },
     "node_modules/jsonfile": {
       "version": "6.1.0",
       "resolved": "https://registry.npmmirror.com/jsonfile/-/jsonfile-6.1.0.tgz",
@@ -24404,6 +24476,15 @@
         "graceful-fs": "^4.1.6"
       }
     },
+    "node_modules/jsonrepair": {
+      "version": "3.13.1",
+      "resolved": "https://registry.npmmirror.com/jsonrepair/-/jsonrepair-3.13.1.tgz",
+      "integrity": "sha512-WJeiE0jGfxYmtLwBTEk8+y/mYcaleyLXWaqp5bJu0/ZTSeG0KQq/wWQ8pmnkKenEdN6pdnn6QtcoSUkbqDHWNw==",
+      "license": "ISC",
+      "bin": {
+        "jsonrepair": "bin/cli.js"
+      }
+    },
     "node_modules/jsx-ast-utils": {
       "version": "3.3.5",
       "resolved": "https://registry.npmmirror.com/jsx-ast-utils/-/jsx-ast-utils-3.3.5.tgz",
@@ -27499,6 +27580,12 @@
         "node": ">=8.6"
       }
     },
+    "node_modules/picomodal": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmmirror.com/picomodal/-/picomodal-3.0.0.tgz",
+      "integrity": "sha512-FoR3TDfuLlqUvcEeK5ifpKSVVns6B4BQvc8SDF6THVMuadya6LLtji0QgUDSStw0ZR2J7I6UGi5V2V23rnPWTw==",
+      "license": "MIT"
+    },
     "node_modules/pidtree": {
       "version": "0.6.0",
       "resolved": "https://registry.npmmirror.com/pidtree/-/pidtree-0.6.0.tgz",
@@ -36235,6 +36322,15 @@
       "dev": true,
       "peer": true
     },
+    "node_modules/vanilla-picker": {
+      "version": "2.12.3",
+      "resolved": "https://registry.npmmirror.com/vanilla-picker/-/vanilla-picker-2.12.3.tgz",
+      "integrity": "sha512-qVkT1E7yMbUsB2mmJNFmaXMWE2hF8ffqzMMwe9zdAikd8u2VfnsVY2HQcOUi2F38bgbxzlJBEdS1UUhOXdF9GQ==",
+      "license": "ISC",
+      "dependencies": {
+        "@sphinxxxx/color-conversion": "^2.2.2"
+      }
+    },
     "node_modules/vary": {
       "version": "1.1.2",
       "resolved": "https://registry.npmmirror.com/vary/-/vary-1.1.2.tgz",

From b846a0f547252689b7b8fc2a334c01141bd4d905 Mon Sep 17 00:00:00 2001
From: Yongteng Lei <yongtengrey@outlook.com>
Date: Thu, 20 Nov 2025 15:35:09 +0800
Subject: [PATCH 3/6] Fix: incorrect retrieval total count with pagination
 enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/nlp/search.py | 150 +++++++++++++++++++++++++++++-----------------
 1 file changed, 96 insertions(+), 54 deletions(-)

diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index 4dbd9945c..a479e5d3f 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -355,75 +355,102 @@ class Dealer:
                                            rag_tokenizer.tokenize(ans).split(),
                                            rag_tokenizer.tokenize(inst).split())
 
-    def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
-                  vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True,
-                  rerank_mdl=None, highlight=False,
-                  rank_feature: dict | None = {PAGERANK_FLD: 10}):
+    def retrieval(
+        self,
+        question,
+        embd_mdl,
+        tenant_ids,
+        kb_ids,
+        page,
+        page_size,
+        similarity_threshold=0.2,
+        vector_similarity_weight=0.3,
+        top=1024,
+        doc_ids=None,
+        aggs=True,
+        rerank_mdl=None,
+        highlight=False,
+        rank_feature: dict | None = {PAGERANK_FLD: 10},
+    ):
         ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
         if not question:
             return ranks
 
         # Ensure RERANK_LIMIT is multiple of page_size
-        RERANK_LIMIT = math.ceil(64/page_size) * page_size if page_size>1 else 1
-        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "page": math.ceil(page_size*page/RERANK_LIMIT), "size": RERANK_LIMIT,
-               "question": question, "vector": True, "topk": top,
-               "similarity": similarity_threshold,
-               "available_int": 1}
-
+        RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1
+        req = {
+            "kb_ids": kb_ids,
+            "doc_ids": doc_ids,
+            "page": math.ceil(page_size * page / RERANK_LIMIT),
+            "size": RERANK_LIMIT,
+            "question": question,
+            "vector": True,
+            "topk": top,
+            "similarity": similarity_threshold,
+            "available_int": 1,
+        }
 
         if isinstance(tenant_ids, str):
             tenant_ids = tenant_ids.split(",")
 
-        sres = self.search(req, [index_name(tid) for tid in tenant_ids],
-                           kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
+        sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
 
         if rerank_mdl and sres.total > 0:
-            sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
-                                                   sres, question, 1 - vector_similarity_weight,
-                                                   vector_similarity_weight,
-                                                   rank_feature=rank_feature)
+            sim, tsim, vsim = self.rerank_by_model(
+                rerank_mdl,
+                sres,
+                question,
+                1 - vector_similarity_weight,
+                vector_similarity_weight,
+                rank_feature=rank_feature,
+            )
         else:
-            lower_case_doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
-            if lower_case_doc_engine in ["elasticsearch","opensearch"]:
+            lower_case_doc_engine = os.getenv("DOC_ENGINE", "elasticsearch")
+            if lower_case_doc_engine in ["elasticsearch", "opensearch"]:
                 # ElasticSearch doesn't normalize each way score before fusion.
                 sim, tsim, vsim = self.rerank(
-                    sres, question, 1 - vector_similarity_weight, vector_similarity_weight,
-                    rank_feature=rank_feature)
+                    sres,
+                    question,
+                    1 - vector_similarity_weight,
+                    vector_similarity_weight,
+                    rank_feature=rank_feature,
+                )
             else:
                 # Don't need rerank here since Infinity normalizes each way score before fusion.
                 sim = [sres.field[id].get("_score", 0.0) for id in sres.ids]
-                sim = [s if s is not None else 0. for s in sim]
+                sim = [s if s is not None else 0.0 for s in sim]
                 tsim = sim
                 vsim = sim
-        # Already paginated in search function
-        max_pages = RERANK_LIMIT // page_size
-        page_index = (page % max_pages) - 1
-        begin = max(page_index * page_size, 0)
-        sim = sim[begin : begin + page_size]
+
         sim_np = np.array(sim, dtype=np.float64)
-        idx = np.argsort(sim_np * -1)
+        if sim_np.size == 0:
+            return ranks
+
+        sorted_idx = np.argsort(sim_np * -1)
+
+        valid_idx = [int(i) for i in sorted_idx if sim_np[i] >= similarity_threshold]
+        filtered_count = len(valid_idx)
+        ranks["total"] = int(filtered_count)
+
+        if filtered_count == 0:
+            return ranks
+
+        max_pages = max(RERANK_LIMIT // max(page_size, 1), 1)
+        page_index = (page - 1) % max_pages
+        begin = page_index * page_size
+        end = begin + page_size
+        page_idx = valid_idx[begin:end]
+
         dim = len(sres.query_vector)
         vector_column = f"q_{dim}_vec"
         zero_vector = [0.0] * dim
-        filtered_count = (sim_np >= similarity_threshold).sum()
-        ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error
-        for i in idx:
-            if np.float64(sim[i]) < similarity_threshold:
-                break
 
+        for i in page_idx:
             id = sres.ids[i]
             chunk = sres.field[id]
             dnm = chunk.get("docnm_kwd", "")
             did = chunk.get("doc_id", "")
 
-            if len(ranks["chunks"]) >= page_size:
-                if aggs:
-                    if dnm not in ranks["doc_aggs"]:
-                        ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
-                    ranks["doc_aggs"][dnm]["count"] += 1
-                    continue
-                break
-
             position_int = chunk.get("position_int", [])
             d = {
                 "chunk_id": id,
@@ -434,12 +461,12 @@ class Dealer:
                 "kb_id": chunk["kb_id"],
                 "important_kwd": chunk.get("important_kwd", []),
                 "image_id": chunk.get("img_id", ""),
-                "similarity": sim[i],
-                "vector_similarity": vsim[i],
-                "term_similarity": tsim[i],
+                "similarity": float(sim_np[i]),
+                "vector_similarity": float(vsim[i]),
+                "term_similarity": float(tsim[i]),
                 "vector": chunk.get(vector_column, zero_vector),
                 "positions": position_int,
-                "doc_type_kwd": chunk.get("doc_type_kwd", "")
+                "doc_type_kwd": chunk.get("doc_type_kwd", ""),
             }
             if highlight and sres.highlight:
                 if id in sres.highlight:
@@ -447,15 +474,30 @@ class Dealer:
                 else:
                     d["highlight"] = d["content_with_weight"]
             ranks["chunks"].append(d)
-            if dnm not in ranks["doc_aggs"]:
-                ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
-            ranks["doc_aggs"][dnm]["count"] += 1
-        ranks["doc_aggs"] = [{"doc_name": k,
-                              "doc_id": v["doc_id"],
-                              "count": v["count"]} for k,
-                                                       v in sorted(ranks["doc_aggs"].items(),
-                                                                   key=lambda x: x[1]["count"] * -1)]
-        ranks["chunks"] = ranks["chunks"][:page_size]
+
+        if aggs:
+            for i in valid_idx:
+                id = sres.ids[i]
+                chunk = sres.field[id]
+                dnm = chunk.get("docnm_kwd", "")
+                did = chunk.get("doc_id", "")
+                if dnm not in ranks["doc_aggs"]:
+                    ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
+                ranks["doc_aggs"][dnm]["count"] += 1
+
+            ranks["doc_aggs"] = [
+                {
+                    "doc_name": k,
+                    "doc_id": v["doc_id"],
+                    "count": v["count"],
+                }
+                for k, v in sorted(
+                    ranks["doc_aggs"].items(),
+                    key=lambda x: x[1]["count"] * -1,
+                )
+            ]
+        else:
+            ranks["doc_aggs"] = []
 
         return ranks
 
@@ -564,7 +606,7 @@ class Dealer:
         ids = relevant_chunks_with_toc(query, toc, chat_mdl, topn*2)
         if not ids:
             return chunks
-        
+
         vector_size = 1024
         id2idx = {ck["chunk_id"]: i for i, ck in enumerate(chunks)}
         for cid, sim in ids:

From 0d5589bfda95a5f649722a12741d7195a5440d89 Mon Sep 17 00:00:00 2001
From: balibabu <cike8899@users.noreply.github.com>
Date: Thu, 20 Nov 2025 15:35:28 +0800
Subject: [PATCH 4/6] Feat: Outputs data is directly synchronized to the canvas
 without going through the form. #10427 (#11406)

### What problem does this PR solve?

Feat: Outputs data is directly synchronized to the canvas without going
through the form. #10427

### Type of change


- [x] New Feature (non-breaking change which adds functionality)
---
 web/src/pages/agent/form/agent-form/index.tsx | 27 +++++++++--------
 .../use-show-structured-output-dialog.ts      | 30 +++++++++++++++----
 .../pages/agent/form/agent-form/use-values.ts |  4 ++-
 .../agent/form/agent-form/use-watch-change.ts | 11 +------
 4 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/web/src/pages/agent/form/agent-form/index.tsx b/web/src/pages/agent/form/agent-form/index.tsx
index 2b23010cd..38c49b666 100644
--- a/web/src/pages/agent/form/agent-form/index.tsx
+++ b/web/src/pages/agent/form/agent-form/index.tsx
@@ -22,7 +22,8 @@ import { Switch } from '@/components/ui/switch';
 import { LlmModelType } from '@/constants/knowledge';
 import { useFindLlmByUuid } from '@/hooks/use-llm-request';
 import { zodResolver } from '@hookform/resolvers/zod';
-import { memo, useCallback, useEffect, useMemo } from 'react';
+import { get } from 'lodash';
+import { memo, useEffect, useMemo } from 'react';
 import { useForm, useWatch } from 'react-hook-form';
 import { useTranslation } from 'react-i18next';
 import { z } from 'zod';
@@ -45,7 +46,10 @@ import { AgentTools, Agents } from './agent-tools';
 import { StructuredOutputDialog } from './structured-output-dialog';
 import { StructuredOutputPanel } from './structured-output-panel';
 import { useBuildPromptExtraPromptOptions } from './use-build-prompt-options';
-import { useShowStructuredOutputDialog } from './use-show-structured-output-dialog';
+import {
+  useHandleShowStructuredOutput,
+  useShowStructuredOutputDialog,
+} from './use-show-structured-output-dialog';
 import { useValues } from './use-values';
 import { useWatchFormChange } from './use-watch-change';
 
@@ -121,22 +125,19 @@ function AgentForm({ node }: INextOperatorForm) {
   });
 
   const {
-    initialStructuredOutput,
     showStructuredOutputDialog,
     structuredOutputDialogVisible,
     hideStructuredOutputDialog,
     handleStructuredOutputDialogOk,
   } = useShowStructuredOutputDialog(node?.id);
 
-  const updateNodeForm = useGraphStore((state) => state.updateNodeForm);
+  const structuredOutput = get(
+    node,
+    `data.form.outputs.${AgentStructuredOutputField}`,
+  );
 
-  const handleShowStructuredOutput = useCallback(
-    (val: boolean) => {
-      if (node?.id && val) {
-        updateNodeForm(node?.id, {}, ['outputs', AgentStructuredOutputField]);
-      }
-    },
-    [node?.id, updateNodeForm],
+  const { handleShowStructuredOutput } = useHandleShowStructuredOutput(
+    node?.id,
   );
 
   useEffect(() => {
@@ -327,7 +328,7 @@ function AgentForm({ node }: INextOperatorForm) {
               </div>
 
               <StructuredOutputPanel
-                value={initialStructuredOutput}
+                value={structuredOutput}
               ></StructuredOutputPanel>
             </section>
           )}
@@ -337,7 +338,7 @@ function AgentForm({ node }: INextOperatorForm) {
         <StructuredOutputDialog
           hideModal={hideStructuredOutputDialog}
           onOk={handleStructuredOutputDialogOk}
-          initialValues={initialStructuredOutput}
+          initialValues={structuredOutput}
         ></StructuredOutputDialog>
       )}
     </>
diff --git a/web/src/pages/agent/form/agent-form/use-show-structured-output-dialog.ts b/web/src/pages/agent/form/agent-form/use-show-structured-output-dialog.ts
index 19e38cefe..d66fcfb45 100644
--- a/web/src/pages/agent/form/agent-form/use-show-structured-output-dialog.ts
+++ b/web/src/pages/agent/form/agent-form/use-show-structured-output-dialog.ts
@@ -1,6 +1,8 @@
 import { JSONSchema } from '@/components/jsonjoy-builder';
+import { AgentStructuredOutputField } from '@/constants/agent';
 import { useSetModalState } from '@/hooks/common-hooks';
 import { useCallback } from 'react';
+import { initialAgentValues } from '../../constant';
 import useGraphStore from '../../store';
 
 export function useShowStructuredOutputDialog(nodeId?: string) {
@@ -9,15 +11,13 @@ export function useShowStructuredOutputDialog(nodeId?: string) {
     showModal: showStructuredOutputDialog,
     hideModal: hideStructuredOutputDialog,
   } = useSetModalState();
-  const { updateNodeForm, getNode } = useGraphStore((state) => state);
-
-  const initialStructuredOutput = getNode(nodeId)?.data.form.outputs.structured;
+  const { updateNodeForm } = useGraphStore((state) => state);
 
   const handleStructuredOutputDialogOk = useCallback(
     (values: JSONSchema) => {
       // Sync data to canvas
       if (nodeId) {
-        updateNodeForm(nodeId, values, ['outputs', 'structured']);
+        updateNodeForm(nodeId, values, ['outputs', AgentStructuredOutputField]);
       }
       hideStructuredOutputDialog();
     },
@@ -25,10 +25,30 @@ export function useShowStructuredOutputDialog(nodeId?: string) {
   );
 
   return {
-    initialStructuredOutput,
     structuredOutputDialogVisible,
     showStructuredOutputDialog,
     hideStructuredOutputDialog,
     handleStructuredOutputDialogOk,
   };
 }
+
+export function useHandleShowStructuredOutput(nodeId?: string) {
+  const updateNodeForm = useGraphStore((state) => state.updateNodeForm);
+
+  const handleShowStructuredOutput = useCallback(
+    (val: boolean) => {
+      if (nodeId) {
+        if (val) {
+          updateNodeForm(nodeId, {}, ['outputs', AgentStructuredOutputField]);
+        } else {
+          updateNodeForm(nodeId, initialAgentValues.outputs, ['outputs']);
+        }
+      }
+    },
+    [nodeId, updateNodeForm],
+  );
+
+  return {
+    handleShowStructuredOutput,
+  };
+}
diff --git a/web/src/pages/agent/form/agent-form/use-values.ts b/web/src/pages/agent/form/agent-form/use-values.ts
index f8747e4b4..fb7f94861 100644
--- a/web/src/pages/agent/form/agent-form/use-values.ts
+++ b/web/src/pages/agent/form/agent-form/use-values.ts
@@ -6,8 +6,10 @@ import { initialAgentValues } from '../../constant';
 
 // You need to exclude the mcp and tools fields that are not in the form,
 // otherwise the form data update will reset the tools or mcp data to an array
+// Exclude data that is not in the form to avoid writing this data to the canvas when using useWatch.
+// Outputs, tools, and MCP data are directly synchronized to the canvas without going through the form.
 function omitToolsAndMcp(values: Record<string, any>) {
-  return omit(values, ['mcp', 'tools']);
+  return omit(values, ['mcp', 'tools', 'outputs']);
 }
 
 export function useValues(node?: RAGFlowNodeType) {
diff --git a/web/src/pages/agent/form/agent-form/use-watch-change.ts b/web/src/pages/agent/form/agent-form/use-watch-change.ts
index 7c53a8d40..98b0ecf31 100644
--- a/web/src/pages/agent/form/agent-form/use-watch-change.ts
+++ b/web/src/pages/agent/form/agent-form/use-watch-change.ts
@@ -1,7 +1,6 @@
-import { omit } from 'lodash';
 import { useEffect } from 'react';
 import { UseFormReturn, useWatch } from 'react-hook-form';
-import { AgentStructuredOutputField, PromptRole } from '../../constant';
+import { PromptRole } from '../../constant';
 import useGraphStore from '../../store';
 
 export function useWatchFormChange(id?: string, form?: UseFormReturn<any>) {
@@ -17,14 +16,6 @@ export function useWatchFormChange(id?: string, form?: UseFormReturn<any>) {
         prompts: [{ role: PromptRole.User, content: values.prompts }],
       };
 
-      if (!values.showStructuredOutput) {
-        nextValues = {
-          ...nextValues,
-          outputs: omit(values.outputs, [AgentStructuredOutputField]),
-        };
-      } else {
-        nextValues = omit(nextValues, 'outputs');
-      }
       updateNodeForm(id, nextValues);
     }
   }, [form?.formState.isDirty, id, updateNodeForm, values]);

From c8ab9079b3debc7595d456de809755ba5fcf8515 Mon Sep 17 00:00:00 2001
From: buua436 <66937541+buua436@users.noreply.github.com>
Date: Thu, 20 Nov 2025 19:00:38 +0800
Subject: [PATCH 5/6] Fix:improve multi-column document detection (#11415)

### What problem does this PR solve?

change:
improve multi-column document detection

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 deepdoc/parser/pdf_parser.py | 99 ++++++++++++++++++++++--------------
 1 file changed, 61 insertions(+), 38 deletions(-)

diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 5bc877a6a..6d8431c82 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -33,6 +33,8 @@ import xgboost as xgb
 from huggingface_hub import snapshot_download
 from PIL import Image
 from pypdf import PdfReader as pdf2_read
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
 
 from common.file_utils import get_project_base_directory
 from common.misc_utils import pip_install_torch
@@ -353,7 +355,6 @@ class RAGFlowPdfParser:
     def _assign_column(self, boxes, zoomin=3):
         if not boxes:
             return boxes
-
         if all("col_id" in b for b in boxes):
             return boxes
 
@@ -361,61 +362,80 @@ class RAGFlowPdfParser:
         for b in boxes:
             by_page[b["page_number"]].append(b)
 
-        page_info = {}  # pg -> dict(page_w, left_edge, cand_cols)
-        counter = Counter()
+        page_cols = {}
 
         for pg, bxs in by_page.items():
             if not bxs:
-                page_info[pg] = {"page_w": 1.0, "left_edge": 0.0, "cand": 1}
-                counter[1] += 1
+                page_cols[pg] = 1
                 continue
 
-            if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= pg:
-                page_w = self.page_images[pg - 1].size[0] / max(1, zoomin)
-                left_edge = 0.0
-            else:
-                xs0 = [box["x0"] for box in bxs]
-                xs1 = [box["x1"] for box in bxs]
-                left_edge = float(min(xs0))
-                page_w = max(1.0, float(max(xs1) - left_edge))
+            x0s_raw = np.array([b["x0"] for b in bxs], dtype=float)
 
-            widths = [max(1.0, (box["x1"] - box["x0"])) for box in bxs]
-            median_w = float(np.median(widths)) if widths else 1.0
+            min_x0 = np.min(x0s_raw)
+            max_x1 = np.max([b["x1"] for b in bxs])
+            width = max_x1 - min_x0
 
-            raw_cols = int(page_w / max(1.0, median_w))
+            INDENT_TOL = width * 0.12
+            x0s = []
+            for x in x0s_raw:
+                if abs(x - min_x0) < INDENT_TOL:
+                    x0s.append([min_x0])
+                else:
+                    x0s.append([x])
+            x0s = np.array(x0s, dtype=float)
+            
+            max_try = min(4, len(bxs))
+            if max_try < 2:
+                max_try = 1
+            best_k = 1
+            best_score = -1
 
-            # cand = raw_cols if (raw_cols >= 2 and median_w < page_w / raw_cols * 0.8) else 1
-            cand = raw_cols
+            for k in range(1, max_try + 1):
+                km = KMeans(n_clusters=k, n_init="auto")
+                labels = km.fit_predict(x0s)
 
-            page_info[pg] = {"page_w": page_w, "left_edge": left_edge, "cand": cand}
-            counter[cand] += 1
+                centers = np.sort(km.cluster_centers_.flatten())
+                if len(centers) > 1:
+                    try:
+                        score = silhouette_score(x0s, labels)
+                    except ValueError:
+                        continue
+                else:
+                    score = 0
+                print(f"{k=},{score=}",flush=True)
+                if score > best_score:
+                    best_score = score
+                    best_k = k
 
-            logging.info(f"[Page {pg}] median_w={median_w:.2f}, page_w={page_w:.2f}, raw_cols={raw_cols}, cand={cand}")
+            page_cols[pg] = best_k
+            logging.info(f"[Page {pg}] best_score={best_score:.2f}, best_k={best_k}")
 
-        global_cols = counter.most_common(1)[0][0]
+
+        global_cols = Counter(page_cols.values()).most_common(1)[0][0]
         logging.info(f"Global column_num decided by majority: {global_cols}")
 
+
         for pg, bxs in by_page.items():
             if not bxs:
                 continue
+            k = page_cols[pg] 
+            if len(bxs) < k:
+                k = 1
+            x0s = np.array([[b["x0"]] for b in bxs], dtype=float)
+            km = KMeans(n_clusters=k, n_init="auto")
+            labels = km.fit_predict(x0s)
 
-            page_w = page_info[pg]["page_w"]
-            left_edge = page_info[pg]["left_edge"]
+            centers = km.cluster_centers_.flatten()
+            order = np.argsort(centers)
 
-            if global_cols == 1:
-                for box in bxs:
-                    box["col_id"] = 0
-                continue
+            remap = {orig: new for new, orig in enumerate(order)}
 
-            for box in bxs:
-                w = box["x1"] - box["x0"]
-                if w >= 0.8 * page_w:
-                    box["col_id"] = 0
-                    continue
-                cx = 0.5 * (box["x0"] + box["x1"])
-                norm_cx = (cx - left_edge) / page_w
-                norm_cx = max(0.0, min(norm_cx, 0.999999))
-                box["col_id"] = int(min(global_cols - 1, norm_cx * global_cols))
+            for b, lb in zip(bxs, labels):
+                b["col_id"] = remap[lb]
+            
+            grouped = defaultdict(list)
+            for b in bxs:
+                grouped[b["col_id"]].append(b)
 
         return boxes
 
@@ -1303,7 +1323,10 @@ class RAGFlowPdfParser:
 
         positions = []
         for ii, (pns, left, right, top, bottom) in enumerate(poss):
-            right = left + max_width
+            if 0 < ii < len(poss) - 1:
+                right = max(left + 10, right)
+            else:
+                right = left + max_width
             bottom *= ZM
             for pn in pns[1:]:
                 if 0 <= pn - 1 < page_count:

From d3d2ccc76c1078dd401707eb6b3bf6db51200d1d Mon Sep 17 00:00:00 2001
From: Billy Bao <newyorkupperbay@gmail.com>
Date: Thu, 20 Nov 2025 19:07:17 +0800
Subject: [PATCH 6/6] Feat: add more chunking method (#11413)

### What problem does this PR solve?

Feat: add more chunking method #11311

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 deepdoc/parser/docling_parser.py | 14 +++++++----
 deepdoc/parser/mineru_parser.py  | 12 +++++++---
 docker/.env                      |  9 ++++++-
 rag/app/manual.py                |  5 ++--
 rag/app/naive.py                 |  4 ++++
 rag/app/paper.py                 | 41 +++++++++++++++++++++++++-------
 6 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py
index 59fec9250..965f82265 100644
--- a/deepdoc/parser/docling_parser.py
+++ b/deepdoc/parser/docling_parser.py
@@ -187,7 +187,7 @@ class DoclingParser(RAGFlowPdfParser):
                         bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
                 yield (DoclingContentType.EQUATION.value, text, bbox)
 
-    def _transfer_to_sections(self, doc) -> list[tuple[str, str]]:
+    def _transfer_to_sections(self, doc, parse_method: str) -> list[tuple[str, str]]:
         sections: list[tuple[str, str]] = []
         for typ, payload, bbox in self._iter_doc_items(doc):
             if typ == DoclingContentType.TEXT.value:
@@ -200,7 +200,12 @@ class DoclingParser(RAGFlowPdfParser):
                 continue
             
             tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
-            sections.append((section, tag))
+            if parse_method == "manual":
+                sections.append((section, typ, tag))
+            elif parse_method == "paper":
+                sections.append((section + tag, typ))
+            else:
+                sections.append((section, tag))
         return sections
 
     def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1):
@@ -282,7 +287,8 @@ class DoclingParser(RAGFlowPdfParser):
         output_dir: Optional[str] = None, 
         lang: Optional[str] = None,        
         method: str = "auto",             
-        delete_output: bool = True,       
+        delete_output: bool = True,
+        parse_method: str = "raw"     
     ):
 
         if not self.check_installation():
@@ -318,7 +324,7 @@ class DoclingParser(RAGFlowPdfParser):
         if callback:
             callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages")
 
-        sections = self._transfer_to_sections(doc)
+        sections = self._transfer_to_sections(doc, parse_method=parse_method)
         tables = self._transfer_to_tables(doc)
 
         if callback:
diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index d2b694188..d4834de39 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -476,7 +476,7 @@ class MinerUParser(RAGFlowPdfParser):
                     item[key] = str((subdir / item[key]).resolve())
         return data
 
-    def _transfer_to_sections(self, outputs: list[dict[str, Any]]):
+    def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
         sections = []
         for output in outputs:
             match output["type"]:
@@ -497,7 +497,11 @@ class MinerUParser(RAGFlowPdfParser):
                 case MinerUContentType.DISCARDED:
                     pass
 
-            if section:
+            if section and parse_method == "manual":
+                sections.append((section, output["type"], self._line_tag(output)))
+            elif section and parse_method == "paper":
+                sections.append((section + self._line_tag(output), output["type"]))
+            else:
                 sections.append((section, self._line_tag(output)))
         return sections
 
@@ -516,6 +520,7 @@ class MinerUParser(RAGFlowPdfParser):
         method: str = "auto",
         server_url: Optional[str] = None,
         delete_output: bool = True,
+        parse_method: str = "raw"
     ) -> tuple:
         import shutil
 
@@ -565,7 +570,8 @@ class MinerUParser(RAGFlowPdfParser):
             self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
             if callback:
                 callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
-            return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
+                
+            return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
         finally:
             if temp_pdf and temp_pdf.exists():
                 try:
diff --git a/docker/.env b/docker/.env
index d7e4b025f..6423b7824 100644
--- a/docker/.env
+++ b/docker/.env
@@ -230,9 +230,16 @@ REGISTER_ENABLED=1
 # SANDBOX_MAX_MEMORY=256m # b, k, m, g
 # SANDBOX_TIMEOUT=10s # s, m, 1m30s
 
-# Enable DocLing and Mineru
+# Enable DocLing
 USE_DOCLING=false
+
+# Enable Mineru
 USE_MINERU=false
+MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
+MINERU_DELETE_OUTPUT=0   # keep output directory
+MINERU_BACKEND=pipeline  # or another backend you prefer
+
+
 
 # pptx support
 DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1
\ No newline at end of file
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 5808e2498..124864041 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -213,6 +213,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             lang = lang,
             callback = callback,
             pdf_cls = Pdf,
+            parse_method = "manual",
             **kwargs
         )
 
@@ -225,7 +226,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             elif len(section) != 3:
                 raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
 
-            txt, sec_id, poss = section
+            txt, layoutno, poss = section
             if isinstance(poss, str):
                 poss = pdf_parser.extract_positions(poss)
                 first = poss[0]          # tuple: ([pn], x1, x2, y1, y2)
@@ -235,7 +236,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                     pn = pn[0]           # [pn] -> pn
                     poss[0] = (pn, *first[1:])
 
-            return (txt, sec_id, poss)
+            return (txt, layoutno, poss)
         
 
         sections = [_normalize_section(sec) for sec in sections]
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 49dca17af..562336d7f 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -59,6 +59,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
     mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
     mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
     pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
+    parse_method = kwargs.get("parse_method", "raw")
 
     if not pdf_parser.check_installation():
         callback(-1, "MinerU not found.")
@@ -72,12 +73,14 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
         backend=os.environ.get("MINERU_BACKEND", "pipeline"),
         server_url=os.environ.get("MINERU_SERVER_URL", ""),
         delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+        parse_method=parse_method
     )
     return sections, tables, pdf_parser
 
 
 def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
     pdf_parser = DoclingParser()
+    parse_method = kwargs.get("parse_method", "raw")
 
     if not pdf_parser.check_installation():
         callback(-1, "Docling not found.")
@@ -89,6 +92,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
         callback=callback,
         output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
         delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+        parse_method=parse_method
     )
     return sections, tables, pdf_parser
 
diff --git a/rag/app/paper.py b/rag/app/paper.py
index d95976c9f..222be0762 100644
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -21,8 +21,10 @@ import re
 from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
 from common.constants import ParserType
 from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
-from deepdoc.parser import PdfParser, PlainParser
+from deepdoc.parser import PdfParser
 import numpy as np
+from rag.app.naive import by_plaintext, PARSERS
+
 
 class Pdf(PdfParser):
     def __init__(self):
@@ -147,19 +149,40 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         "parser_config", {
             "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
-            pdf_parser = PlainParser()
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        pdf_parser = PARSERS.get(name, by_plaintext)
+        callback(0.1, "Start to parse.")
+
+        if name == "deepdoc":
+            pdf_parser = Pdf()
+            paper = pdf_parser(filename if not binary else binary,
+                               from_page=from_page, to_page=to_page, callback=callback)
+        else:
+            sections, tables, pdf_parser = pdf_parser(
+                filename=filename,
+                binary=binary,
+                from_page=from_page,
+                to_page=to_page,
+                lang=lang,
+                callback=callback,
+                pdf_cls=Pdf,
+                parse_method="paper",
+                **kwargs
+            )
+
             paper = {
                 "title": filename,
                 "authors": " ",
                 "abstract": "",
-                "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
-                "tables": []
+                "sections": sections,
+                "tables": tables
             }
-        else:
-            pdf_parser = Pdf()
-            paper = pdf_parser(filename if not binary else binary,
-                               from_page=from_page, to_page=to_page, callback=callback)
+
         tbls=paper["tables"]
         tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
         paper["tables"] = tbls