From 73fb8ee154799a24da39ff6b208ef9185cec3882 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Thu, 20 Nov 2025 12:41:25 +0800 Subject: [PATCH] Feat: add or logic operations for meta data filters. --- agent/tools/retrieval.py | 6 +- api/apps/chunk_app.py | 6 +- api/apps/sdk/dify_retrieval.py | 2 +- api/apps/sdk/doc.py | 2 +- api/apps/sdk/session.py | 6 +- api/db/services/dialog_service.py | 25 +++---- docs/references/http_api_reference.md | 1 + rag/prompts/generator.py | 8 ++- rag/prompts/meta_filter.md | 95 ++++++++++++++++++++++++--- 9 files changed, 117 insertions(+), 34 deletions(-) diff --git a/agent/tools/retrieval.py b/agent/tools/retrieval.py index ab388a08e..c3a01e517 100644 --- a/agent/tools/retrieval.py +++ b/agent/tools/retrieval.py @@ -132,8 +132,8 @@ class Retrieval(ToolBase, ABC): metas = DocumentService.get_meta_by_kbs(kb_ids) if self._param.meta_data_filter.get("method") == "auto": chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT) - filters = gen_meta_filter(chat_mdl, metas, query) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, query) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif self._param.meta_data_filter.get("method") == "manual": @@ -165,7 +165,7 @@ class Retrieval(ToolBase, ABC): out_parts.append(s[last:]) flt["value"] = "".join(out_parts) - doc_ids.extend(meta_filter(metas, filters)) + doc_ids.extend(meta_filter(metas, filters, self._param.meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index 7341e336a..e121bcba7 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -305,12 +305,12 @@ async def retrieval_test(): metas = DocumentService.get_meta_by_kbs(kb_ids) if meta_data_filter.get("method") == "auto": chat_mdl = LLMBundle(current_user.id, LLMType.CHAT, llm_name=search_config.get("chat_id", "")) - filters = 
gen_meta_filter(chat_mdl, metas, question) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif meta_data_filter.get("method") == "manual": - doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) + doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py index b86fb5c1e..8ea24a6d5 100644 --- a/api/apps/sdk/dify_retrieval.py +++ b/api/apps/sdk/dify_retrieval.py @@ -132,7 +132,7 @@ async def retrieval(tenant_id): embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id) if metadata_condition: - doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition))) + doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))) if not doc_ids and metadata_condition: doc_ids = ["-999"] ranks = settings.retriever.retrieval( diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 8593667d7..f4653aafc 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -1444,7 +1444,7 @@ async def retrieval_test(tenant_id): if not doc_ids: metadata_condition = req.get("metadata_condition", {}) metas = DocumentService.get_meta_by_kbs(kb_ids) - doc_ids = meta_filter(metas, convert_conditions(metadata_condition)) + doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) similarity_threshold = float(req.get("similarity_threshold", 0.2)) vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3)) top = int(req.get("top_k", 1024)) diff --git a/api/apps/sdk/session.py b/api/apps/sdk/session.py index 98151a5fe..8a853f848 100644 --- a/api/apps/sdk/session.py +++ b/api/apps/sdk/session.py @@ -977,12 
+977,12 @@ async def retrieval_test_embedded(): metas = DocumentService.get_meta_by_kbs(kb_ids) if meta_data_filter.get("method") == "auto": chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", "")) - filters = gen_meta_filter(chat_mdl, metas, question) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif meta_data_filter.get("method") == "manual": - doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) + doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py index d2f3b9bc1..db878574d 100644 --- a/api/db/services/dialog_service.py +++ b/api/db/services/dialog_service.py @@ -287,7 +287,7 @@ def convert_conditions(metadata_condition): ] -def meta_filter(metas: dict, filters: list[dict]): +def meta_filter(metas: dict, filters: list[dict], logic: str = "and"): doc_ids = set([]) def filter_out(v2docs, operator, value): @@ -331,7 +331,10 @@ def meta_filter(metas: dict, filters: list[dict]): if not doc_ids: doc_ids = set(ids) else: - doc_ids = doc_ids & set(ids) + if logic == "and": + doc_ids = doc_ids & set(ids) + else: + doc_ids = doc_ids | set(ids) if not doc_ids: return [] return list(doc_ids) @@ -407,12 +410,12 @@ def chat(dialog, messages, stream=True, **kwargs): if dialog.meta_data_filter: metas = DocumentService.get_meta_by_kbs(dialog.kb_ids) if dialog.meta_data_filter.get("method") == "auto": - filters = gen_meta_filter(chat_mdl, metas, questions[-1]) - attachments.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, questions[-1]) + attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not attachments: attachments = 
None elif dialog.meta_data_filter.get("method") == "manual": - attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"])) + attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"], dialog.meta_data_filter.get("logic", "and"))) if not attachments: attachments = None @@ -778,12 +781,12 @@ def ask(question, kb_ids, tenant_id, chat_llm_name=None, search_config={}): if meta_data_filter: metas = DocumentService.get_meta_by_kbs(kb_ids) if meta_data_filter.get("method") == "auto": - filters = gen_meta_filter(chat_mdl, metas, question) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif meta_data_filter.get("method") == "manual": - doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) + doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None @@ -853,12 +856,12 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}): if meta_data_filter: metas = DocumentService.get_meta_by_kbs(kb_ids) if meta_data_filter.get("method") == "auto": - filters = gen_meta_filter(chat_mdl, metas, question) - doc_ids.extend(meta_filter(metas, filters)) + filters: dict = gen_meta_filter(chat_mdl, metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None elif meta_data_filter.get("method") == "manual": - doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) + doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if not doc_ids: doc_ids = None diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 481614d13..bc1b15670 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -2085,6 
+2085,7 @@ curl --request POST \ "dataset_ids": ["b2a62730759d11ef987d0242ac120004"], "document_ids": ["77df9ef4759a11ef8bdd0242ac120004"], "metadata_condition": { + "logic": "and", "conditions": [ { "name": "author", diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py index 3d8438f4a..82c6466a2 100644 --- a/rag/prompts/generator.py +++ b/rag/prompts/generator.py @@ -429,7 +429,7 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st return re.sub(r"^.*", "", ans, flags=re.DOTALL) -def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list: +def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> dict: sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render( current_date=datetime.datetime.today().strftime('%Y-%m-%d'), metadata_keys=json.dumps(meta_data), @@ -440,11 +440,13 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list: ans = re.sub(r"(^.*|```json\n|```\n*$)", "", ans, flags=re.DOTALL) try: ans = json_repair.loads(ans) - assert isinstance(ans, list), ans + assert isinstance(ans, dict), ans + assert "conditions" in ans and isinstance(ans["conditions"], list), ans return ans except Exception: logging.exception(f"Loading json failure: {ans}") - return [] + + return {"conditions": []} def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None): diff --git a/rag/prompts/meta_filter.md b/rag/prompts/meta_filter.md index 89e322fe5..7df3c4885 100644 --- a/rag/prompts/meta_filter.md +++ b/rag/prompts/meta_filter.md @@ -9,11 +9,13 @@ You are a metadata filtering condition generator. Analyze the user's question an } 2. **Output Requirements**: - - Always output a JSON array of filter objects - - Each object must have: + - Always output a JSON dictionary with only 2 keys: 'conditions'(filter objects) and 'logic' between the conditions ('and' or 'or'). 
+ - Each filter object in conditions must have: "key": (metadata attribute name), "value": (string value to compare), "op": (operator from allowed list) + - Logic between all the conditions: 'and' (intersection of results for each condition) / 'or' (union of results for all conditions) + 3. **Operator Guide**: - Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"] @@ -32,22 +34,97 @@ You are a metadata filtering condition generator. Analyze the user's question an - Attribute doesn't exist in metadata - Value has no match in metadata -5. **Example**: +5. **Example A**: - User query: "上市日期七月份的有哪些商品,不要蓝色的" - Metadata: { "color": {...}, "listing_date": {...} } - Output: - [ + { + "logic": "and", + "conditions": [ {"key": "listing_date", "value": "2025-07-01", "op": "≥"}, {"key": "listing_date", "value": "2025-08-01", "op": "<"}, {"key": "color", "value": "blue", "op": "≠"} ] + } -6. **Final Output**: - - ONLY output valid JSON array +6. **Example B**: + - User query: "Both blue and red are acceptable." + - Metadata: { "color": {...}, "listing_date": {...} } + - Output: + { + "logic": "or", + "conditions": [ + {"key": "color", "value": "blue", "op": "="}, + {"key": "color", "value": "red", "op": "="} + ] + } + +7. **Final Output**: + - ONLY output valid JSON dictionary + - NO additional text/explanations + - The JSON schema is as follows: +```json +{ + "type": "object", + "properties": { + "logic": { + "type": "string", + "description": "Logic relationship between all the conditions, the default is 'and'.", + "enum": [ + "and", + "or" + ] + }, + "conditions": { + "type": "array", + "items": { + "type": "object", + "properties": { + "key": { + "type": "string", + "description": "Metadata attribute name." + }, + "value": { + "type": "string", + "description": "Value to compare."
+ }, + "op": { + "type": "string", + "description": "Operator from allowed list.", + "enum": [ + "contains", + "not contains", + "start with", + "end with", + "empty", + "not empty", + "=", + "≠", + ">", + "<", + "≥", + "≤" + ] + } + }, + "required": [ + "key", + "value", + "op" + ], + "additionalProperties": false + } + } + }, + "required": [ + "conditions" + ], + "additionalProperties": false +} +``` **Current Task**: -- Today's date: {{current_date}} -- Available metadata keys: {{metadata_keys}} -- User query: "{{user_question}}" +- Today's date: {{ current_date }} +- Available metadata keys: {{ metadata_keys }} +- User query: "{{ user_question }}"