From 73fb8ee154799a24da39ff6b208ef9185cec3882 Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Thu, 20 Nov 2025 12:41:25 +0800
Subject: [PATCH] Feat: add or logic operations for meta data filters.

---
 agent/tools/retrieval.py              |  6 +-
 api/apps/chunk_app.py                 |  6 +-
 api/apps/sdk/dify_retrieval.py        |  2 +-
 api/apps/sdk/doc.py                   |  2 +-
 api/apps/sdk/session.py               |  6 +-
 api/db/services/dialog_service.py     | 25 +++----
 docs/references/http_api_reference.md |  1 +
 rag/prompts/generator.py              |  8 ++-
 rag/prompts/meta_filter.md            | 95 ++++++++++++++++++++++++---
 9 files changed, 117 insertions(+), 34 deletions(-)

diff --git a/agent/tools/retrieval.py b/agent/tools/retrieval.py
index ab388a08e..c3a01e517 100644
--- a/agent/tools/retrieval.py
+++ b/agent/tools/retrieval.py
@@ -132,8 +132,8 @@ class Retrieval(ToolBase, ABC):
             metas = DocumentService.get_meta_by_kbs(kb_ids)
             if self._param.meta_data_filter.get("method") == "auto":
                 chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT)
-                filters = gen_meta_filter(chat_mdl, metas, query)
-                doc_ids.extend(meta_filter(metas, filters))
+                filters: dict = gen_meta_filter(chat_mdl, metas, query)
+                doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
                 if not doc_ids:
                     doc_ids = None
             elif self._param.meta_data_filter.get("method") == "manual":
@@ -165,7 +165,7 @@ class Retrieval(ToolBase, ABC):
 
                     out_parts.append(s[last:])
                     flt["value"] = "".join(out_parts)
-                doc_ids.extend(meta_filter(metas, filters))
+                doc_ids.extend(meta_filter(metas, filters, self._param.meta_data_filter.get("logic", "and")))
                 if not doc_ids:
                     doc_ids = None
 
diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index 7341e336a..e121bcba7 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -305,12 +305,12 @@ async def retrieval_test():
         metas = DocumentService.get_meta_by_kbs(kb_ids)
         if meta_data_filter.get("method") == "auto":
             chat_mdl = LLMBundle(current_user.id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
         elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
 
diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py
index b86fb5c1e..8ea24a6d5 100644
--- a/api/apps/sdk/dify_retrieval.py
+++ b/api/apps/sdk/dify_retrieval.py
@@ -132,7 +132,7 @@ async def retrieval(tenant_id):
 
         embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
         if metadata_condition:
-            doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
+            doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
         if not doc_ids and metadata_condition:
             doc_ids = ["-999"]
         ranks = settings.retriever.retrieval(
diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 8593667d7..f4653aafc 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -1444,7 +1444,7 @@ async def retrieval_test(tenant_id):
     if not doc_ids:
         metadata_condition = req.get("metadata_condition", {})
         metas = DocumentService.get_meta_by_kbs(kb_ids)
-        doc_ids = meta_filter(metas, convert_conditions(metadata_condition))
+        doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
     similarity_threshold = float(req.get("similarity_threshold", 0.2))
     vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
     top = int(req.get("top_k", 1024))
diff --git a/api/apps/sdk/session.py b/api/apps/sdk/session.py
index 98151a5fe..8a853f848 100644
--- a/api/apps/sdk/session.py
+++ b/api/apps/sdk/session.py
@@ -977,12 +977,12 @@ async def retrieval_test_embedded():
         metas = DocumentService.get_meta_by_kbs(kb_ids)
         if meta_data_filter.get("method") == "auto":
             chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
         elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
 
diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py
index d2f3b9bc1..db878574d 100644
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -287,7 +287,7 @@ def convert_conditions(metadata_condition):
     ]
 
 
-def meta_filter(metas: dict, filters: list[dict]):
+def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
     doc_ids = set([])
 
     def filter_out(v2docs, operator, value):
@@ -331,7 +331,10 @@ def meta_filter(metas: dict, filters: list[dict]):
             if not doc_ids:
                 doc_ids = set(ids)
             else:
-                doc_ids = doc_ids & set(ids)
+                if logic == "and":
+                    doc_ids = doc_ids & set(ids)
+                else:
+                    doc_ids = doc_ids | set(ids)
             if not doc_ids:
                 return []
     return list(doc_ids)
@@ -407,12 +410,12 @@ def chat(dialog, messages, stream=True, **kwargs):
     if dialog.meta_data_filter:
         metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
         if dialog.meta_data_filter.get("method") == "auto":
-            filters = gen_meta_filter(chat_mdl, metas, questions[-1])
-            attachments.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, questions[-1])
+            attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not attachments:
                 attachments = None
         elif dialog.meta_data_filter.get("method") == "manual":
-            attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"]))
+            attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"], dialog.meta_data_filter.get("logic", "and")))
             if not attachments:
                 attachments = None
 
@@ -778,12 +781,12 @@ def ask(question, kb_ids, tenant_id, chat_llm_name=None, search_config={}):
     if meta_data_filter:
         metas = DocumentService.get_meta_by_kbs(kb_ids)
         if meta_data_filter.get("method") == "auto":
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
         elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
 
@@ -853,12 +856,12 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
     if meta_data_filter:
         metas = DocumentService.get_meta_by_kbs(kb_ids)
         if meta_data_filter.get("method") == "auto":
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
         elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
             if not doc_ids:
                 doc_ids = None
 
diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md
index 481614d13..bc1b15670 100644
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -2085,6 +2085,7 @@ curl --request POST \
           "dataset_ids": ["b2a62730759d11ef987d0242ac120004"],
           "document_ids": ["77df9ef4759a11ef8bdd0242ac120004"],
           "metadata_condition": {
+            "logic": "and",
             "conditions": [
               {
                 "name": "author",
diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py
index 3d8438f4a..82c6466a2 100644
--- a/rag/prompts/generator.py
+++ b/rag/prompts/generator.py
@@ -429,7 +429,7 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st
     return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
 
 
-def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
+def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> dict:
     sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render(
         current_date=datetime.datetime.today().strftime('%Y-%m-%d'),
         metadata_keys=json.dumps(meta_data),
@@ -440,11 +440,13 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
     ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
     try:
         ans = json_repair.loads(ans)
-        assert isinstance(ans, list), ans
+        assert isinstance(ans, dict), ans
+        assert "conditions" in ans and isinstance(ans["conditions"], list), ans
         return ans
     except Exception:
         logging.exception(f"Loading json failure: {ans}")
-    return []
+
+    return {"conditions": []}
 
 
 def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):
diff --git a/rag/prompts/meta_filter.md b/rag/prompts/meta_filter.md
index 89e322fe5..7df3c4885 100644
--- a/rag/prompts/meta_filter.md
+++ b/rag/prompts/meta_filter.md
@@ -9,11 +9,13 @@ You are a metadata filtering condition generator. Analyze the user's question an
      }
 
 2. **Output Requirements**:
-   - Always output a JSON array of filter objects
-   - Each object must have:
+   - Always output a JSON dictionary with only 2 keys: 'conditions'(filter objects) and 'logic' between the conditions ('and' or 'or').
+   - Each filter object in conditions must have:
         "key": (metadata attribute name),
         "value": (string value to compare),
         "op": (operator from allowed list)
+   - Logic between all the conditions: 'and'(Intersection of results for each condition) / 'or' (union of results for all conditions)
+
 
 3. **Operator Guide**:
    - Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"]
@@ -32,22 +34,97 @@ You are a metadata filtering condition generator. Analyze the user's question an
         - Attribute doesn't exist in metadata
         - Value has no match in metadata
 
-5. **Example**:
+5. **Example A**:
    - User query: "上市日期七月份的有哪些商品，不要蓝色的"
    - Metadata: { "color": {...}, "listing_date": {...} }
    - Output: 
-        [
+   {
+        "logic": "and",
+        "conditions": [
           {"key": "listing_date", "value": "2025-07-01", "op": "≥"},
           {"key": "listing_date", "value": "2025-08-01", "op": "<"},
           {"key": "color", "value": "blue", "op": "≠"}
         ]
+   }
 
-6. **Final Output**:
-   - ONLY output valid JSON array
+6. **Example B**:
+   - User query: "Both blue and red are acceptable."
+   - Metadata: { "color": {...}, "listing_date": {...} }
+   - Output: 
+   {
+        "logic": "or",
+        "conditions": [
+          {"key": "color", "value": "blue", "op": "="},
+          {"key": "color", "value": "red", "op": "="}
+        ]
+   }
+
+7. **Final Output**:
+   - ONLY output valid JSON dictionary
    - NO additional text/explanations
+   - Json schema is as following:
+```json
+{
+  "type": "object",
+  "properties": {
+    "logic": {
+      "type": "string",
+      "description": "Logic relationship between all the conditions, the default is 'and'.",
+      "enum": [
+        "and",
+        "or"
+      ]
+    },
+    "conditions": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "key": {
+            "type": "string",
+            "description": "Metadata attribute name."
+          },
+          "value": {
+            "type": "string",
+            "description": "Value to compare."
+          },
+          "op": {
+            "type": "string",
+            "description": "Operator from allowed list.",
+            "enum": [
+              "contains",
+              "not contains",
+              "start with",
+              "end with",
+              "empty",
+              "not empty",
+              "=",
+              "≠",
+              ">",
+              "<",
+              "≥",
+              "≤"
+            ]
+          }
+        },
+        "required": [
+          "key",
+          "value",
+          "op"
+        ],
+        "additionalProperties": false
+      }
+    }
+  },
+  "required": [
+    "conditions"
+  ],
+  "additionalProperties": false
+}
+```
 
 **Current Task**:
-- Today's date: {{current_date}}
-- Available metadata keys: {{metadata_keys}}
-- User query: "{{user_question}}"
+- Today's date: {{ current_date }}
+- Available metadata keys: {{ metadata_keys }}
+- User query: "{{ user_question }}"