diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5341d83ae..a5bdc1735 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,6 @@ name: tests +permissions: + contents: read on: push: diff --git a/admin/client/admin_client.py b/admin/client/admin_client.py index 4b210d2b5..8cad14bab 100644 --- a/admin/client/admin_client.py +++ b/admin/client/admin_client.py @@ -351,7 +351,7 @@ class AdminCLI(Cmd): def verify_admin(self, arguments: dict, single_command: bool): self.host = arguments['host'] self.port = arguments['port'] - print(f"Attempt to access ip: {self.host}, port: {self.port}") + print("Attempt to access server for admin login") url = f"http://{self.host}:{self.port}/api/v1/admin/login" attempt_count = 3 @@ -390,7 +390,7 @@ class AdminCLI(Cmd): print(f"Bad response,status: {response.status_code}, password is wrong") except Exception as e: print(str(e)) - print(f"Can't access {self.host}, port: {self.port}") + print("Can't access server for admin login (connection failed)") def _format_service_detail_table(self, data): if isinstance(data, list): @@ -674,7 +674,7 @@ class AdminCLI(Cmd): user_name: str = user_name_tree.children[0].strip("'\"") password_tree: Tree = command['password'] password: str = password_tree.children[0].strip("'\"") - print(f"Alter user: {user_name}, password: {password}") + print(f"Alter user: {user_name}, password: ******") url = f'http://{self.host}:{self.port}/api/v1/admin/users/{user_name}/password' response = self.session.put(url, json={'new_password': encrypt(password)}) res_json = response.json() @@ -689,7 +689,7 @@ class AdminCLI(Cmd): password_tree: Tree = command['password'] password: str = password_tree.children[0].strip("'\"") role: str = command['role'] - print(f"Create user: {user_name}, password: {password}, role: {role}") + print(f"Create user: {user_name}, password: ******, role: {role}") url = f'http://{self.host}:{self.port}/api/v1/admin/users' response = 
self.session.post( url, @@ -951,7 +951,7 @@ def main(): args = cli.parse_connection_args(sys.argv) if 'error' in args: - print(f"Error: {args['error']}") + print("Error: Invalid connection arguments") return if 'command' in args: @@ -960,7 +960,7 @@ def main(): return if cli.verify_admin(args, single_command=True): command: str = args['command'] - print(f"Run single command: {command}") + # print(f"Run single command: {command}") cli.run_single_command(command) else: if cli.verify_admin(args, single_command=False): diff --git a/admin/server/auth.py b/admin/server/auth.py index 6c8bc2cb8..486b9a4fb 100644 --- a/admin/server/auth.py +++ b/admin/server/auth.py @@ -176,11 +176,11 @@ def login_verify(f): "message": "Access denied", "data": None }), 200 - except Exception as e: - error_msg = str(e) + except Exception: + logging.exception("An error occurred during admin login verification.") return jsonify({ "code": 500, - "message": error_msg + "message": "An internal server error occurred." }), 200 return f(*args, **kwargs) diff --git a/agent/tools/retrieval.py b/agent/tools/retrieval.py index 52e295fd0..a0c990a81 100644 --- a/agent/tools/retrieval.py +++ b/agent/tools/retrieval.py @@ -136,6 +136,16 @@ class Retrieval(ToolBase, ABC): doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None + elif self._param.meta_data_filter.get("method") == "semi_auto": + selected_keys = self._param.meta_data_filter.get("semi_auto", []) + if selected_keys: + filtered_metas = {key: metas[key] for key in selected_keys if key in metas} + if filtered_metas: + chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT) + filters: dict = gen_meta_filter(chat_mdl, filtered_metas, query) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not doc_ids: + doc_ids = None elif self._param.meta_data_filter.get("method") == "manual": filters = self._param.meta_data_filter["manual"] for flt in 
filters: diff --git a/api/apps/canvas_app.py b/api/apps/canvas_app.py index fe32dca0b..ed8c8c7a0 100644 --- a/api/apps/canvas_app.py +++ b/api/apps/canvas_app.py @@ -342,7 +342,15 @@ async def test_db_connect(): f"UID={req['username']};" f"PWD={req['password']};" ) - logging.info(conn_str) + redacted_conn_str = ( + f"DATABASE={req['database']};" + f"HOSTNAME={req['host']};" + f"PORT={req['port']};" + f"PROTOCOL=TCPIP;" + f"UID={req['username']};" + f"PWD=****;" + ) + logging.info(redacted_conn_str) conn = ibm_db.connect(conn_str, "", "") stmt = ibm_db.exec_immediate(conn, "SELECT 1 FROM sysibm.sysdummy1") ibm_db.fetch_assoc(stmt) diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index d96de64d0..37cd0c7a1 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -327,10 +327,44 @@ async def retrieval_test(): local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not local_doc_ids: local_doc_ids = None + elif meta_data_filter.get("method") == "semi_auto": + selected_keys = meta_data_filter.get("semi_auto", []) + if selected_keys: + filtered_metas = {key: metas[key] for key in selected_keys if key in metas} + if filtered_metas: + chat_mdl = LLMBundle(user_id, LLMType.CHAT, llm_name=search_config.get("chat_id", "")) + filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question) + local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not local_doc_ids: + local_doc_ids = None elif meta_data_filter.get("method") == "manual": local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if meta_data_filter["manual"] and not local_doc_ids: local_doc_ids = ["-999"] + else: + meta_data_filter = req.get("meta_data_filter") + if meta_data_filter: + metas = DocumentService.get_meta_by_kbs(kb_ids) + if meta_data_filter.get("method") == "auto": + chat_mdl = LLMBundle(user_id, LLMType.CHAT) + filters: dict = 
gen_meta_filter(chat_mdl, metas, question) + local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not local_doc_ids: + local_doc_ids = None + elif meta_data_filter.get("method") == "semi_auto": + selected_keys = meta_data_filter.get("semi_auto", []) + if selected_keys: + filtered_metas = {key: metas[key] for key in selected_keys if key in metas} + if filtered_metas: + chat_mdl = LLMBundle(user_id, LLMType.CHAT) + filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question) + local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not local_doc_ids: + local_doc_ids = None + elif meta_data_filter.get("method") == "manual": + local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) + if meta_data_filter["manual"] and not local_doc_ids: + local_doc_ids = ["-999"] tenants = UserTenantService.query(user_id=user_id) for kb_id in kb_ids: diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py index 337cb74df..b85921115 100644 --- a/api/apps/conversation_app.py +++ b/api/apps/conversation_app.py @@ -435,7 +435,7 @@ async def mindmap(): kb_ids.extend(req["kb_ids"]) kb_ids = list(set(kb_ids)) - mind_map = gen_mindmap(req["question"], kb_ids, search_app.get("tenant_id", current_user.id), search_config) + mind_map = await gen_mindmap(req["question"], kb_ids, search_app.get("tenant_id", current_user.id), search_config) if "error" in mind_map: return server_error_response(Exception(mind_map["error"])) return get_json_result(data=mind_map) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index ba52bd61c..1a987cd39 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -27,6 +27,7 @@ from api.db import VALID_FILE_TYPES, FileType from api.db.db_models import Task from api.db.services import duplicate_name from api.db.services.document_service import DocumentService, doc_upload_and_parse +from 
api.db.services.dialog_service import meta_filter, convert_conditions from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService @@ -246,9 +247,19 @@ async def list_docs(): return get_data_error_result(message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}") suffix = req.get("suffix", []) + metadata_condition = req.get("metadata_condition", {}) or {} + if metadata_condition and not isinstance(metadata_condition, dict): + return get_data_error_result(message="metadata_condition must be an object.") + + doc_ids_filter = None + if metadata_condition: + metas = DocumentService.get_flatted_meta_by_kbs([kb_id]) + doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) + if metadata_condition.get("conditions") and not doc_ids_filter: + return get_json_result(data={"total": 0, "docs": []}) try: - docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix) + docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_ids_filter) if create_time_from or create_time_to: filtered_docs = [] @@ -319,6 +330,87 @@ async def doc_infos(): return get_json_result(data=list(docs.dicts())) +@manager.route("/metadata/summary", methods=["POST"]) # noqa: F821 +@login_required +async def metadata_summary(): + req = await get_request_json() + kb_id = req.get("kb_id") + if not kb_id: + return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR) + + tenants = UserTenantService.query(user_id=current_user.id) + for tenant in tenants: + if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id): + break + else: + return get_json_result(data=False, message="Only owner of 
knowledgebase authorized for this operation.", code=RetCode.OPERATING_ERROR) + + try: + summary = DocumentService.get_metadata_summary(kb_id) + return get_json_result(data={"summary": summary}) + except Exception as e: + return server_error_response(e) + + +@manager.route("/metadata/update", methods=["POST"]) # noqa: F821 +@login_required +async def metadata_update(): + req = await get_request_json() + kb_id = req.get("kb_id") + if not kb_id: + return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR) + + tenants = UserTenantService.query(user_id=current_user.id) + for tenant in tenants: + if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id): + break + else: + return get_json_result(data=False, message="Only owner of knowledgebase authorized for this operation.", code=RetCode.OPERATING_ERROR) + + selector = req.get("selector", {}) or {} + updates = req.get("updates", []) or [] + deletes = req.get("deletes", []) or [] + + if not isinstance(selector, dict): + return get_json_result(data=False, message="selector must be an object.", code=RetCode.ARGUMENT_ERROR) + if not isinstance(updates, list) or not isinstance(deletes, list): + return get_json_result(data=False, message="updates and deletes must be lists.", code=RetCode.ARGUMENT_ERROR) + + metadata_condition = selector.get("metadata_condition", {}) or {} + if metadata_condition and not isinstance(metadata_condition, dict): + return get_json_result(data=False, message="metadata_condition must be an object.", code=RetCode.ARGUMENT_ERROR) + + document_ids = selector.get("document_ids", []) or [] + if document_ids and not isinstance(document_ids, list): + return get_json_result(data=False, message="document_ids must be a list.", code=RetCode.ARGUMENT_ERROR) + + for upd in updates: + if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd: + return get_json_result(data=False, message="Each update requires key and value.", code=RetCode.ARGUMENT_ERROR) + for 
d in deletes: + if not isinstance(d, dict) or not d.get("key"): + return get_json_result(data=False, message="Each delete requires key.", code=RetCode.ARGUMENT_ERROR) + + kb_doc_ids = KnowledgebaseService.list_documents_by_ids([kb_id]) + target_doc_ids = set(kb_doc_ids) + if document_ids: + invalid_ids = set(document_ids) - set(kb_doc_ids) + if invalid_ids: + return get_json_result(data=False, message=f"These documents do not belong to dataset {kb_id}: {', '.join(invalid_ids)}", code=RetCode.ARGUMENT_ERROR) + target_doc_ids = set(document_ids) + + if metadata_condition: + metas = DocumentService.get_flatted_meta_by_kbs([kb_id]) + filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))) + target_doc_ids = target_doc_ids & filtered_ids + if metadata_condition.get("conditions") and not target_doc_ids: + return get_json_result(data={"updated": 0, "matched_docs": 0}) + + target_doc_ids = list(target_doc_ids) + updated = DocumentService.batch_update_metadata(kb_id, target_doc_ids, updates, deletes) + return get_json_result(data={"updated": updated, "matched_docs": len(target_doc_ids)}) + + @manager.route("/thumbnails", methods=["GET"]) # noqa: F821 # @login_required def thumbnails(): @@ -698,7 +790,10 @@ async def set_meta(): if not isinstance(meta, dict): return get_json_result(data=False, message="Only dictionary type supported.", code=RetCode.ARGUMENT_ERROR) for k, v in meta.items(): - if not isinstance(v, str) and not isinstance(v, int) and not isinstance(v, float): + if isinstance(v, list): + if not all(isinstance(i, (str, int, float)) for i in v): + return get_json_result(data=False, message=f"The type is not supported in list: {v}", code=RetCode.ARGUMENT_ERROR) + elif not isinstance(v, (str, int, float)): return get_json_result(data=False, message=f"The type is not supported: {v}", code=RetCode.ARGUMENT_ERROR) except Exception as e: return get_json_result(data=False, message=f"Json syntax error: {e}", 
code=RetCode.ARGUMENT_ERROR) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 0019e2a42..b65a20133 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -14,6 +14,7 @@ # limitations under the License. # import datetime +import json import logging import pathlib import re @@ -551,13 +552,29 @@ def list_docs(dataset_id, tenant_id): run_status = q.getlist("run") create_time_from = int(q.get("create_time_from", 0)) create_time_to = int(q.get("create_time_to", 0)) + metadata_condition_raw = q.get("metadata_condition") + metadata_condition = {} + if metadata_condition_raw: + try: + metadata_condition = json.loads(metadata_condition_raw) + except Exception: + return get_error_data_result(message="metadata_condition must be valid JSON.") + if metadata_condition and not isinstance(metadata_condition, dict): + return get_error_data_result(message="metadata_condition must be an object.") # map run status (text or numeric) - align with API parameter run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"} run_status_converted = [run_status_text_to_numeric.get(v, v) for v in run_status] + doc_ids_filter = None + if metadata_condition: + metas = DocumentService.get_flatted_meta_by_kbs([dataset_id]) + doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) + if metadata_condition.get("conditions") and not doc_ids_filter: + return get_result(data={"total": 0, "docs": []}) + docs, total = DocumentService.get_list( - dataset_id, page, page_size, orderby, desc, keywords, document_id, name, suffix, run_status_converted + dataset_id, page, page_size, orderby, desc, keywords, document_id, name, suffix, run_status_converted, doc_ids_filter ) # time range filter (0 means no bound) @@ -586,6 +603,70 @@ def list_docs(dataset_id, tenant_id): return get_result(data={"total": total, "docs": output_docs}) + +@manager.route("/datasets//metadata/summary", methods=["GET"]) 
# noqa: F821 +@token_required +def metadata_summary(dataset_id, tenant_id): + if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): + return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ") + + try: + summary = DocumentService.get_metadata_summary(dataset_id) + return get_result(data={"summary": summary}) + except Exception as e: + return server_error_response(e) + + +@manager.route("/datasets//metadata/update", methods=["POST"]) # noqa: F821 +@token_required +async def metadata_batch_update(dataset_id, tenant_id): + if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): + return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ") + + req = await get_request_json() + selector = req.get("selector", {}) or {} + updates = req.get("updates", []) or [] + deletes = req.get("deletes", []) or [] + + if not isinstance(selector, dict): + return get_error_data_result(message="selector must be an object.") + if not isinstance(updates, list) or not isinstance(deletes, list): + return get_error_data_result(message="updates and deletes must be lists.") + + metadata_condition = selector.get("metadata_condition", {}) or {} + if metadata_condition and not isinstance(metadata_condition, dict): + return get_error_data_result(message="metadata_condition must be an object.") + + document_ids = selector.get("document_ids", []) or [] + if document_ids and not isinstance(document_ids, list): + return get_error_data_result(message="document_ids must be a list.") + + for upd in updates: + if not isinstance(upd, dict) or not upd.get("key") or "value" not in upd: + return get_error_data_result(message="Each update requires key and value.") + for d in deletes: + if not isinstance(d, dict) or not d.get("key"): + return get_error_data_result(message="Each delete requires key.") + + kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id]) + target_doc_ids = set(kb_doc_ids) + if document_ids: 
+ invalid_ids = set(document_ids) - set(kb_doc_ids) + if invalid_ids: + return get_error_data_result(message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}") + target_doc_ids = set(document_ids) + + if metadata_condition: + metas = DocumentService.get_flatted_meta_by_kbs([dataset_id]) + filtered_ids = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))) + target_doc_ids = target_doc_ids & filtered_ids + if metadata_condition.get("conditions") and not target_doc_ids: + return get_result(data={"updated": 0, "matched_docs": 0}) + + target_doc_ids = list(target_doc_ids) + updated = DocumentService.batch_update_metadata(dataset_id, target_doc_ids, updates, deletes) + return get_result(data={"updated": updated, "matched_docs": len(target_doc_ids)}) + @manager.route("/datasets//documents", methods=["DELETE"]) # noqa: F821 @token_required async def delete(tenant_id, dataset_id): diff --git a/api/apps/sdk/session.py b/api/apps/sdk/session.py index fe4723984..cb4d78f3b 100644 --- a/api/apps/sdk/session.py +++ b/api/apps/sdk/session.py @@ -984,10 +984,44 @@ async def retrieval_test_embedded(): local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not local_doc_ids: local_doc_ids = None + elif meta_data_filter.get("method") == "semi_auto": + selected_keys = meta_data_filter.get("semi_auto", []) + if selected_keys: + filtered_metas = {key: metas[key] for key in selected_keys if key in metas} + if filtered_metas: + chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", "")) + filters: dict = gen_meta_filter(chat_mdl, filtered_metas, _question) + local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not local_doc_ids: + local_doc_ids = None elif meta_data_filter.get("method") == "manual": local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], 
meta_data_filter.get("logic", "and"))) if meta_data_filter["manual"] and not local_doc_ids: local_doc_ids = ["-999"] + else: + meta_data_filter = req.get("meta_data_filter") + if meta_data_filter: + metas = DocumentService.get_meta_by_kbs(kb_ids) + if meta_data_filter.get("method") == "auto": + chat_mdl = LLMBundle(tenant_id, LLMType.CHAT) + filters: dict = gen_meta_filter(chat_mdl, metas, question) + local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not local_doc_ids: + local_doc_ids = None + elif meta_data_filter.get("method") == "semi_auto": + selected_keys = meta_data_filter.get("semi_auto", []) + if selected_keys: + filtered_metas = {key: metas[key] for key in selected_keys if key in metas} + if filtered_metas: + chat_mdl = LLMBundle(tenant_id, LLMType.CHAT) + filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question) + local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not local_doc_ids: + local_doc_ids = None + elif meta_data_filter.get("method") == "manual": + local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) + if meta_data_filter["manual"] and not local_doc_ids: + local_doc_ids = ["-999"] tenants = UserTenantService.query(user_id=tenant_id) for kb_id in kb_ids: @@ -1135,7 +1169,7 @@ async def mindmap(): search_id = req.get("search_id", "") search_app = SearchService.get_detail(search_id) if search_id else {} - mind_map = gen_mindmap(req["question"], req["kb_ids"], tenant_id, search_app.get("search_config", {})) + mind_map = await gen_mindmap(req["question"], req["kb_ids"], tenant_id, search_app.get("search_config", {})) if "error" in mind_map: return server_error_response(Exception(mind_map["error"])) return get_json_result(data=mind_map) diff --git a/api/db/init_data.py b/api/db/init_data.py index d4873d332..7454965eb 100644 --- a/api/db/init_data.py +++ b/api/db/init_data.py @@ -73,7 +73,7 @@ def 
init_superuser(nickname=DEFAULT_SUPERUSER_NICKNAME, email=DEFAULT_SUPERUSER_ UserTenantService.insert(**usr_tenant) TenantLLMService.insert_many(tenant_llm) logging.info( - f"Super user initialized. email: {email}, password: {password}. Changing the password after login is strongly recommended.") + f"Super user initialized. email: {email}. A default password has been set; changing the password after login is strongly recommended.") chat_mdl = LLMBundle(tenant["id"], LLMType.CHAT, tenant["llm_id"]) msg = chat_mdl.chat(system="", history=[ diff --git a/api/db/joint_services/user_account_service.py b/api/db/joint_services/user_account_service.py index 34ceee648..48937653e 100644 --- a/api/db/joint_services/user_account_service.py +++ b/api/db/joint_services/user_account_service.py @@ -273,7 +273,7 @@ def delete_user_data(user_id: str) -> dict: except Exception as e: logging.exception(e) - return {"success": False, "message": f"Error: {str(e)}. Already done:\n{done_msg}"} + return {"success": False, "message": "An internal error occurred during user deletion. Some operations may have completed.", "details": done_msg} def delete_user_agents(user_id: str) -> dict: diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py index cd6a9a4ba..88f61f190 100644 --- a/api/db/services/dialog_service.py +++ b/api/db/services/dialog_service.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -import asyncio import binascii import logging import re @@ -426,6 +425,15 @@ async def async_chat(dialog, messages, stream=True, **kwargs): attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not attachments: attachments = None + elif dialog.meta_data_filter.get("method") == "semi_auto": + selected_keys = dialog.meta_data_filter.get("semi_auto", []) + if selected_keys: + filtered_metas = {key: metas[key] for key in selected_keys if key in metas} + if filtered_metas: + filters: dict = gen_meta_filter(chat_mdl, filtered_metas, questions[-1]) + attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not attachments: + attachments = None elif dialog.meta_data_filter.get("method") == "manual": conds = dialog.meta_data_filter["manual"] attachments.extend(meta_filter(metas, conds, dialog.meta_data_filter.get("logic", "and"))) @@ -835,6 +843,15 @@ async def async_ask(question, kb_ids, tenant_id, chat_llm_name=None, search_conf doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None + elif meta_data_filter.get("method") == "semi_auto": + selected_keys = meta_data_filter.get("semi_auto", []) + if selected_keys: + filtered_metas = {key: metas[key] for key in selected_keys if key in metas} + if filtered_metas: + filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not doc_ids: + doc_ids = None elif meta_data_filter.get("method") == "manual": doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if meta_data_filter["manual"] and not doc_ids: @@ -887,7 +904,7 @@ async def async_ask(question, kb_ids, tenant_id, chat_llm_name=None, search_conf yield decorate_answer(answer) -def gen_mindmap(question, kb_ids, tenant_id, search_config={}): +async def gen_mindmap(question, kb_ids, 
tenant_id, search_config={}): meta_data_filter = search_config.get("meta_data_filter", {}) doc_ids = search_config.get("doc_ids", []) rerank_id = search_config.get("rerank_id", "") @@ -910,6 +927,15 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}): doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) if not doc_ids: doc_ids = None + elif meta_data_filter.get("method") == "semi_auto": + selected_keys = meta_data_filter.get("semi_auto", []) + if selected_keys: + filtered_metas = {key: metas[key] for key in selected_keys if key in metas} + if filtered_metas: + filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question) + doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) + if not doc_ids: + doc_ids = None elif meta_data_filter.get("method") == "manual": doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) if meta_data_filter["manual"] and not doc_ids: @@ -931,5 +957,5 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}): rank_feature=label_question(question, kbs), ) mindmap = MindMapExtractor(chat_mdl) - mind_map = asyncio.run(mindmap([c["content_with_weight"] for c in ranks["chunks"]])) + mind_map = await mindmap([c["content_with_weight"] for c in ranks["chunks"]]) return mind_map.output diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 43adf5d8e..6326ef5de 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -79,7 +79,7 @@ class DocumentService(CommonService): @classmethod @DB.connection_context() def get_list(cls, kb_id, page_number, items_per_page, - orderby, desc, keywords, id, name, suffix=None, run = None): + orderby, desc, keywords, id, name, suffix=None, run = None, doc_ids=None): fields = cls.get_cls_model_fields() docs = cls.model.select(*[*fields, UserCanvas.title]).join(File2Document, on = (File2Document.document_id == 
cls.model.id))\ .join(File, on = (File.id == File2Document.file_id))\ @@ -96,6 +96,8 @@ class DocumentService(CommonService): docs = docs.where( fn.LOWER(cls.model.name).contains(keywords.lower()) ) + if doc_ids: + docs = docs.where(cls.model.id.in_(doc_ids)) if suffix: docs = docs.where(cls.model.suffix.in_(suffix)) if run: @@ -123,7 +125,7 @@ class DocumentService(CommonService): @classmethod @DB.connection_context() def get_by_kb_id(cls, kb_id, page_number, items_per_page, - orderby, desc, keywords, run_status, types, suffix): + orderby, desc, keywords, run_status, types, suffix, doc_ids=None): fields = cls.get_cls_model_fields() if keywords: docs = cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])\ @@ -143,6 +145,8 @@ class DocumentService(CommonService): .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)\ .where(cls.model.kb_id == kb_id) + if doc_ids: + docs = docs.where(cls.model.id.in_(doc_ids)) if run_status: docs = docs.where(cls.model.run.in_(run_status)) if types: @@ -644,6 +648,13 @@ class DocumentService(CommonService): @classmethod @DB.connection_context() def get_meta_by_kbs(cls, kb_ids): + """ + Legacy metadata aggregator (backward-compatible). + - Does NOT expand list values and a list is kept as one string key. + Example: {"tags": ["foo","bar"]} -> meta["tags"]["['foo', 'bar']"] = [doc_id] + - Expects meta_fields is a dict. + Use when existing callers rely on the old list-as-string semantics. + """ fields = [ cls.model.id, cls.model.meta_fields, @@ -660,6 +671,162 @@ class DocumentService(CommonService): meta[k][v].append(doc_id) return meta + @classmethod + @DB.connection_context() + def get_flatted_meta_by_kbs(cls, kb_ids): + """ + - Parses stringified JSON meta_fields when possible and skips non-dict or unparsable values. + - Expands list values into individual entries. 
+ Example: {"tags": ["foo","bar"], "author": "alice"} -> + meta["tags"]["foo"] = [doc_id], meta["tags"]["bar"] = [doc_id], meta["author"]["alice"] = [doc_id] + Prefer for metadata_condition filtering and scenarios that must respect list semantics. + """ + fields = [ + cls.model.id, + cls.model.meta_fields, + ] + meta = {} + for r in cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)): + doc_id = r.id + meta_fields = r.meta_fields or {} + if isinstance(meta_fields, str): + try: + meta_fields = json.loads(meta_fields) + except Exception: + continue + if not isinstance(meta_fields, dict): + continue + for k, v in meta_fields.items(): + if k not in meta: + meta[k] = {} + values = v if isinstance(v, list) else [v] + for vv in values: + if vv is None: + continue + sv = str(vv) + if sv not in meta[k]: + meta[k][sv] = [] + meta[k][sv].append(doc_id) + return meta + + @classmethod + @DB.connection_context() + def get_metadata_summary(cls, kb_id): + fields = [cls.model.id, cls.model.meta_fields] + summary = {} + for r in cls.model.select(*fields).where(cls.model.kb_id == kb_id): + meta_fields = r.meta_fields or {} + if isinstance(meta_fields, str): + try: + meta_fields = json.loads(meta_fields) + except Exception: + continue + if not isinstance(meta_fields, dict): + continue + for k, v in meta_fields.items(): + values = v if isinstance(v, list) else [v] + for vv in values: + if not vv: + continue + sv = str(vv) + if k not in summary: + summary[k] = {} + summary[k][sv] = summary[k].get(sv, 0) + 1 + return {k: sorted([(val, cnt) for val, cnt in v.items()], key=lambda x: x[1], reverse=True) for k, v in summary.items()} + + @classmethod + @DB.connection_context() + def batch_update_metadata(cls, kb_id, doc_ids, updates=None, deletes=None): + updates = updates or [] + deletes = deletes or [] + if not doc_ids: + return 0 + + def _normalize_meta(meta): + if isinstance(meta, str): + try: + meta = json.loads(meta) + except Exception: + return {} + if not isinstance(meta, 
dict): + return {} + return deepcopy(meta) + + def _str_equal(a, b): + return str(a) == str(b) + + def _apply_updates(meta): + changed = False + for upd in updates: + key = upd.get("key") + if not key or key not in meta: + continue + new_value = upd.get("value") + match_value = upd.get("match", new_value) + if isinstance(meta[key], list): + replaced = False + new_list = [] + for item in meta[key]: + if match_value and _str_equal(item, match_value): + new_list.append(new_value) + replaced = True + else: + new_list.append(item) + if replaced: + meta[key] = new_list + changed = True + else: + if not match_value: + continue + if _str_equal(meta[key], match_value): + meta[key] = new_value + changed = True + return changed + + def _apply_deletes(meta): + changed = False + for d in deletes: + key = d.get("key") + if not key or key not in meta: + continue + value = d.get("value", None) + if isinstance(meta[key], list): + if value is None: + del meta[key] + changed = True + continue + new_list = [item for item in meta[key] if not _str_equal(item, value)] + if len(new_list) != len(meta[key]): + if new_list: + meta[key] = new_list + else: + del meta[key] + changed = True + else: + if value is None or _str_equal(meta[key], value): + del meta[key] + changed = True + return changed + + updated_docs = 0 + with DB.atomic(): + rows = cls.model.select(cls.model.id, cls.model.meta_fields).where( + (cls.model.id.in_(doc_ids)) & (cls.model.kb_id == kb_id) + ) + for r in rows: + meta = _normalize_meta(r.meta_fields or {}) + original_meta = deepcopy(meta) + changed = _apply_updates(meta) + changed = _apply_deletes(meta) or changed + if changed and meta != original_meta: + cls.model.update( + meta_fields=meta, + update_time=current_timestamp(), + update_date=get_format_time() + ).where(cls.model.id == r.id).execute() + updated_docs += 1 + return updated_docs + @classmethod @DB.connection_context() def update_progress(cls): diff --git a/api/db/services/llm_service.py 
b/api/db/services/llm_service.py index 86356a7a7..e4bf64aac 100644 --- a/api/db/services/llm_service.py +++ b/api/db/services/llm_service.py @@ -109,7 +109,7 @@ class LLMBundle(LLM4Tenant): llm_name = getattr(self, "llm_name", None) if not TenantLLMService.increase_usage(self.tenant_id, self.llm_type, used_tokens, llm_name): - logging.error("LLMBundle.encode can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens)) + logging.error("LLMBundle.encode can't update token usage for /EMBEDDING used_tokens: {}".format(used_tokens)) if self.langfuse: generation.update(usage_details={"total_tokens": used_tokens}) @@ -124,7 +124,7 @@ class LLMBundle(LLM4Tenant): emd, used_tokens = self.mdl.encode_queries(query) llm_name = getattr(self, "llm_name", None) if not TenantLLMService.increase_usage(self.tenant_id, self.llm_type, used_tokens, llm_name): - logging.error("LLMBundle.encode_queries can't update token usage for {}/EMBEDDING used_tokens: {}".format(self.tenant_id, used_tokens)) + logging.error("LLMBundle.encode_queries can't update token usage for /EMBEDDING used_tokens: {}".format(used_tokens)) if self.langfuse: generation.update(usage_details={"total_tokens": used_tokens}) diff --git a/common/data_source/confluence_connector.py b/common/data_source/confluence_connector.py index a057d0694..aff225703 100644 --- a/common/data_source/confluence_connector.py +++ b/common/data_source/confluence_connector.py @@ -1110,7 +1110,10 @@ def _make_attachment_link( ) -> str | None: download_link = "" - if "api.atlassian.com" in confluence_client.url: + from urllib.parse import urlparse + netloc =urlparse(confluence_client.url).hostname + if netloc == "api.atlassian.com" or (netloc and netloc.endswith(".api.atlassian.com")): + # if "api.atlassian.com" in confluence_client.url: # https://developer.atlassian.com/cloud/confluence/rest/v1/api-group-content---attachments/#api-wiki-rest-api-content-id-child-attachment-attachmentid-download-get if not 
parent_content_id: logging.warning( diff --git a/common/data_source/jira/connector.py b/common/data_source/jira/connector.py index 06a0a9069..2a93aaf51 100644 --- a/common/data_source/jira/connector.py +++ b/common/data_source/jira/connector.py @@ -135,7 +135,7 @@ class JiraConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync except ValueError as exc: raise ConnectorValidationError(str(exc)) from exc else: - logger.warning(f"[Jira] Scoped token requested but Jira base URL {self.jira_base_url} does not appear to be an Atlassian Cloud domain; scoped token ignored.") + logger.warning("[Jira] Scoped token requested but Jira base URL does not appear to be an Atlassian Cloud domain; scoped token ignored.") user_email = credentials.get("jira_user_email") or credentials.get("username") api_token = credentials.get("jira_api_token") or credentials.get("token") or credentials.get("api_token") @@ -245,7 +245,7 @@ class JiraConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync while True: attempt += 1 jql = self._build_jql(attempt_start, end) - logger.info(f"[Jira] Executing Jira JQL attempt {attempt} (start={attempt_start}, end={end}, buffered_retry={retried_with_buffer}): {jql}") + logger.info(f"[Jira] Executing Jira JQL attempt {attempt} (start={attempt_start}, end={end}, buffered_retry={retried_with_buffer})") try: return (yield from self._load_from_checkpoint_internal(jql, checkpoint, start_filter=start)) except Exception as exc: @@ -927,9 +927,6 @@ def main(config: dict[str, Any] | None = None) -> None: base_url = config.get("base_url") credentials = config.get("credentials", {}) - print(f"[Jira] {config=}", flush=True) - print(f"[Jira] {credentials=}", flush=True) - if not base_url: raise RuntimeError("Jira base URL must be provided via config or CLI arguments.") if not (credentials.get("jira_api_token") or (credentials.get("jira_user_email") and credentials.get("jira_password"))): diff --git a/common/http_client.py 
b/common/http_client.py index 91ac0cadc..5c633d78d 100644 --- a/common/http_client.py +++ b/common/http_client.py @@ -16,7 +16,9 @@ import logging import os import time from typing import Any, Dict, Optional +from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse +from common import settings import httpx logger = logging.getLogger(__name__) @@ -52,6 +54,55 @@ def _get_delay(backoff_factor: float, attempt: int) -> float: return backoff_factor * (2**attempt) +# List of sensitive parameters to redact from URLs before logging +_SENSITIVE_QUERY_KEYS = {"client_secret", "secret", "code", "access_token", "refresh_token", "password", "token", "app_secret"} + +def _redact_sensitive_url_params(url: str) -> str: + try: + parsed = urlparse(url) + if not parsed.query: + return url + clean_query = [] + for k, v in parse_qsl(parsed.query, keep_blank_values=True): + if k.lower() in _SENSITIVE_QUERY_KEYS: + clean_query.append((k, "***REDACTED***")) + else: + clean_query.append((k, v)) + new_query = urlencode(clean_query, doseq=True) + redacted_url = urlunparse(parsed._replace(query=new_query)) + return redacted_url + except Exception: + return url + +def _is_sensitive_url(url: str) -> bool: + """Return True if URL is one of the configured OAuth endpoints.""" + # Collect known sensitive endpoint URLs from settings + oauth_urls = set() + # GitHub OAuth endpoints + try: + if settings.GITHUB_OAUTH is not None: + url_val = settings.GITHUB_OAUTH.get("url") + if url_val: + oauth_urls.add(url_val) + except Exception: + pass + # Feishu OAuth endpoints + try: + if settings.FEISHU_OAUTH is not None: + for k in ("app_access_token_url", "user_access_token_url"): + url_val = settings.FEISHU_OAUTH.get(k) + if url_val: + oauth_urls.add(url_val) + except Exception: + pass + # Defensive normalization: compare only scheme+netloc+path + url_obj = urlparse(url) + for sensitive_url in oauth_urls: + sensitive_obj = urlparse(sensitive_url) + if (url_obj.scheme, url_obj.netloc, url_obj.path) 
== (sensitive_obj.scheme, sensitive_obj.netloc, sensitive_obj.path): + return True + return False + async def async_request( method: str, url: str, @@ -93,20 +144,23 @@ async def async_request( method=method, url=url, headers=headers, **kwargs ) duration = time.monotonic() - start + log_url = "" if _is_sensitive_url(url) else _redact_sensitive_url_params(url) logger.debug( - f"async_request {method} {url} -> {response.status_code} in {duration:.3f}s" + f"async_request {method} {log_url} -> {response.status_code} in {duration:.3f}s" ) return response except httpx.RequestError as exc: last_exc = exc if attempt >= retries: + log_url = "" if _is_sensitive_url(url) else _redact_sensitive_url_params(url) logger.warning( - f"async_request exhausted retries for {method} {url}: {exc}" + f"async_request exhausted retries for {method} {log_url}" ) raise delay = _get_delay(backoff_factor, attempt) + log_url = "" if _is_sensitive_url(url) else _redact_sensitive_url_params(url) logger.warning( - f"async_request attempt {attempt + 1}/{retries + 1} failed for {method} {url}: {exc}; retrying in {delay:.2f}s" + f"async_request attempt {attempt + 1}/{retries + 1} failed for {method} {log_url}; retrying in {delay:.2f}s" ) await asyncio.sleep(delay) raise last_exc # pragma: no cover diff --git a/conf/llm_factories.json b/conf/llm_factories.json index 4d784d33c..d363e7f06 100644 --- a/conf/llm_factories.json +++ b/conf/llm_factories.json @@ -369,6 +369,13 @@ "model_type": "chat", "is_tools": true }, + { + "llm_name": "deepseek-v3.2", + "tags": "LLM,CHAT,128K", + "max_tokens": 128000, + "model_type": "chat", + "is_tools": true + }, { "llm_name": "deepseek-r1", "tags": "LLM,CHAT,64K", diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 7e3919bbd..57840ebb8 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -106,7 +106,7 @@ class MinerUParser(RAGFlowPdfParser): def check_installation(self, backend: str = "pipeline", server_url:
Optional[str] = None) -> tuple[bool, str]: reason = "" - valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"] + valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine"] if backend not in valid_backends: reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}" self.logger.warning(reason) diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index e33919c7e..718760dda 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -1477,7 +1477,7 @@ Failure: ### List documents -**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}` +**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}&metadata_condition={json}` Lists documents in a specified dataset. @@ -1492,6 +1492,7 @@ Lists documents in a specified dataset. ##### Request examples **A basic request with pagination:** + ```bash curl --request GET \ --url http://{address}/api/v1/datasets/{dataset_id}/documents?page=1&page_size=10 \ @@ -1534,6 +1535,11 @@ curl --request GET \ - `3` / `DONE`: Document processing completed successfully - `4` / `FAIL`: Document processing failed Defaults to all statuses. +- `metadata_condition`: (*Filter parameter*), `object` (JSON in query) + Optional metadata filter applied to documents when `document_ids` is not provided. 
Uses the same structure as retrieval: + - `logic`: `"and"` (default) or `"or"` + - `conditions`: array of `{ "name": string, "comparison_operator": string, "value": string }` + - `comparison_operator` supports: `is`, `not is`, `contains`, `not contains`, `in`, `not in`, `start with`, `end with`, `>`, `<`, `≥`, `≤`, `empty`, `not empty` ##### Usage examples @@ -1545,6 +1551,15 @@ curl --request GET \ --header 'Authorization: Bearer ' ``` +**Filter by metadata (query JSON):** + +```bash +curl -G \ + --url "http://localhost:9222/api/v1/datasets/{{KB_ID}}/documents" \ + --header 'Authorization: Bearer ' \ + --data-urlencode 'metadata_condition={"logic":"and","conditions":[{"name":"tags","comparison_operator":"is","value":"bar"},{"name":"author","comparison_operator":"is","value":"alice"}]}' +``` + #### Response Success: @@ -2088,6 +2103,108 @@ Failure: --- +### Dataset metadata summary + +**GET** `/api/v1/datasets/{dataset_id}/metadata/summary` + +Aggregates metadata values across all documents in a dataset. + +#### Request + +- Method: GET +- URL: `/api/v1/datasets/{dataset_id}/metadata/summary` +- Headers: + - `'Authorization: Bearer '` + +##### Response + +Success: + +```json +{ + "code": 0, + "data": { + "summary": { + "tags": [["bar", 2], ["foo", 1], ["baz", 1]], + "author": [["alice", 2], ["bob", 1]] + } + } +} +``` + +--- + +### Dataset metadata update + +**POST** `/api/v1/datasets/{dataset_id}/metadata/update` + +Batch update or delete document-level metadata in a dataset. If both `document_ids` and `metadata_condition` are omitted, all documents in the dataset are selected. When both are provided, the intersection is used. 
+ +#### Request + +- Method: POST +- URL: `/api/v1/datasets/{dataset_id}/metadata/update` +- Headers: + - `'content-Type: application/json'` + - `'Authorization: Bearer '` +- Body: + - `selector`: `object`, optional + - `document_ids`: `list[string]`, optional + - `metadata_condition`: `object`, optional + - `logic`: `"and"` (default) or `"or"` + - `conditions`: array of `{ "name": string, "comparison_operator": string, "value": string }` + - `comparison_operator` supports: `is`, `not is`, `contains`, `not contains`, `in`, `not in`, `start with`, `end with`, `>`, `<`, `≥`, `≤`, `empty`, `not empty` + - `updates`: `array`, optional + - items: `{ "key": string, "value": any, "match": any (optional) }` + - For lists: replace elements equal to `match` (or `value` when `match` omitted) with `value`. + - For scalars: replace when current value equals `match` (or `value` when `match` omitted). + - `deletes`: `array`, optional + - items: `{ "key": string, "value": any (optional) }` + - For lists: remove elements equal to `value`; if list becomes empty, remove the key. + - For scalars: remove the key when `value` matches or when `value` is omitted. + +##### Request example + +```bash +curl --request POST \ + --url http://{address}/api/v1/datasets/{dataset_id}/metadata/update \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer ' \ + --data '{ + "selector": { + "metadata_condition": { + "logic": "and", + "conditions": [ + {"name": "author", "comparison_operator": "is", "value": "alice"} + ] + } + }, + "updates": [ + {"key": "tags", "match": "foo", "value": "foo_new"} + ], + "deletes": [ + {"key": "obsolete_key"}, + {"key": "author", "value": "alice"} + ] + }' +``` + +##### Response + +Success: + +```json +{ + "code": 0, + "data": { + "updated": 1, + "matched_docs": 2 + } +} +``` + +--- + ### Retrieve chunks **POST** `/api/v1/retrieval` @@ -2117,6 +2234,7 @@ Retrieves chunks from specified datasets. 
- `"metadata_condition"`: `object` - `"use_kg"`: `boolean` - `"toc_enhance"`: `boolean` + ##### Request example ```bash @@ -2189,7 +2307,7 @@ curl --request POST \ - `"conditions"`: (*Body parameter*), `array` A list of metadata filter conditions. - `"name"`: `string` - The metadata field name to filter by, e.g., `"author"`, `"company"`, `"url"`. Ensure this parameter before use. See [Set metadata](../guides/dataset/set_metadata.md) for details. - - `comparison_operator`: `string` - The comparison operator. Can be one of: + - `comparison_operator`: `string` - The comparison operator. Can be one of: - `"contains"` - `"not contains"` - `"start with"` @@ -2203,7 +2321,6 @@ curl --request POST \ - `"≤"` - `"value"`: `string` - The value to compare. - #### Response Success: @@ -4450,7 +4567,9 @@ Failure: --- ### System + --- + ### Check system health **GET** `/v1/system/healthz` @@ -4519,6 +4638,7 @@ Content-Type: application/json ``` Explanation: + - Each service is reported as "ok" or "nok". - The top-level `status` reflects overall health. - If any service is "nok", detailed error info appears in `_meta`. diff --git a/graphrag/search.py b/graphrag/search.py index 7399ea393..860c58906 100644 --- a/graphrag/search.py +++ b/graphrag/search.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -import asyncio import json import logging from collections import defaultdict @@ -44,7 +43,7 @@ class KGSearch(Dealer): return response def query_rewrite(self, llm, question, idxnms, kb_ids): - ty2ents = asyncio.run(get_entity_type2samples(idxnms, kb_ids)) + ty2ents = get_entity_type2samples(idxnms, kb_ids) hint_prompt = PROMPTS["minirag_query2kwd"].format(query=question, TYPE_POOL=json.dumps(ty2ents, ensure_ascii=False, indent=2)) result = self._chat(llm, hint_prompt, [{"role": "user", "content": "Output:"}], {}) diff --git a/graphrag/utils.py b/graphrag/utils.py index 9b3dc2c2b..7e3fec1a9 100644 --- a/graphrag/utils.py +++ b/graphrag/utils.py @@ -626,8 +626,8 @@ def merge_tuples(list1, list2): return result -async def get_entity_type2samples(idxnms, kb_ids: list): - es_res = await asyncio.to_thread(settings.retriever.search,{"knowledge_graph_kwd": "ty2ents", "kb_id": kb_ids, "size": 10000, "fields": ["content_with_weight"]},idxnms,kb_ids) +def get_entity_type2samples(idxnms, kb_ids: list): + es_res = settings.retriever.search({"knowledge_graph_kwd": "ty2ents", "kb_id": kb_ids, "size": 10000, "fields": ["content_with_weight"]},idxnms,kb_ids) res = defaultdict(list) for id in es_res.ids: diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index 400f90370..d6603f68f 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -714,4 +714,4 @@ async def main(): if __name__ == "__main__": faulthandler.enable() init_root_logger(CONSUMER_NAME) - asyncio.run(main) + asyncio.run(main()) diff --git a/rag/utils/ob_conn.py b/rag/utils/ob_conn.py index 6218a8c4e..3c00be421 100644 --- a/rag/utils/ob_conn.py +++ b/rag/utils/ob_conn.py @@ -17,13 +17,16 @@ import json import logging import os import re +import threading import time from typing import Any, Optional +import numpy as np from elasticsearch_dsl import Q, Search from pydantic import BaseModel from pymysql.converters import escape_string from pyobvector import ObVecClient, 
FtsIndexParam, FtsParser, ARRAY, VECTOR +from pyobvector.client import ClusterVersionException from pyobvector.client.hybrid_search import HybridSearch from pyobvector.util import ObVersion from sqlalchemy import text, Column, String, Integer, JSON, Double, Row, Table @@ -106,17 +109,6 @@ index_columns: list[str] = [ "removed_kwd", ] -fulltext_search_columns: list[str] = [ - "docnm_kwd", - "content_with_weight", - "title_tks", - "title_sm_tks", - "important_tks", - "question_tks", - "content_ltks", - "content_sm_ltks" -] - fts_columns_origin: list[str] = [ "docnm_kwd^10", "content_with_weight", @@ -138,7 +130,7 @@ fulltext_index_name_template = "fts_idx_%s" # MATCH AGAINST: https://www.oceanbase.com/docs/common-oceanbase-database-cn-1000000002017607 fulltext_search_template = "MATCH (%s) AGAINST ('%s' IN NATURAL LANGUAGE MODE)" # cosine_distance: https://www.oceanbase.com/docs/common-oceanbase-database-cn-1000000002012938 -vector_search_template = "cosine_distance(%s, %s)" +vector_search_template = "cosine_distance(%s, '%s')" class SearchResult(BaseModel): @@ -362,18 +354,28 @@ class OBConnection(DocStoreConnection): port = mysql_config.get("port", 2881) self.username = mysql_config.get("user", "root@test") self.password = mysql_config.get("password", "infini_rag_flow") + max_connections = mysql_config.get("max_connections", 300) else: logger.info("Use customized config to create OceanBase connection.") host = ob_config.get("host", "localhost") port = ob_config.get("port", 2881) self.username = ob_config.get("user", "root@test") self.password = ob_config.get("password", "infini_rag_flow") + max_connections = ob_config.get("max_connections", 300) self.db_name = ob_config.get("db_name", "test") self.uri = f"{host}:{port}" logger.info(f"Use OceanBase '{self.uri}' as the doc engine.") + # Set the maximum number of connections that can be created above the pool_size. + # By default, this is half of max_connections, but at least 10. 
+ # This allows the pool to handle temporary spikes in demand without exhausting resources. + max_overflow = int(os.environ.get("OB_MAX_OVERFLOW", max(max_connections // 2, 10))) + # Set the number of seconds to wait before giving up when trying to get a connection from the pool. + # Default is 30 seconds, but can be overridden with the OB_POOL_TIMEOUT environment variable. + pool_timeout = int(os.environ.get("OB_POOL_TIMEOUT", "30")) + for _ in range(ATTEMPT_TIME): try: self.client = ObVecClient( @@ -383,6 +385,9 @@ class OBConnection(DocStoreConnection): db_name=self.db_name, pool_pre_ping=True, pool_recycle=3600, + pool_size=max_connections, + max_overflow=max_overflow, + pool_timeout=pool_timeout, ) break except Exception as e: @@ -398,6 +403,37 @@ class OBConnection(DocStoreConnection): self._check_ob_version() self._try_to_update_ob_query_timeout() + self.es = None + if self.enable_hybrid_search: + try: + self.es = HybridSearch( + uri=self.uri, + user=self.username, + password=self.password, + db_name=self.db_name, + pool_pre_ping=True, + pool_recycle=3600, + pool_size=max_connections, + max_overflow=max_overflow, + pool_timeout=pool_timeout, + ) + logger.info("OceanBase Hybrid Search feature is enabled") + except ClusterVersionException as e: + logger.info("Failed to initialize HybridSearch client, fallback to use SQL", exc_info=e) + self.es = None + + if self.es is not None and self.search_original_content: + logger.info("HybridSearch is enabled, forcing search_original_content to False") + self.search_original_content = False + # Determine which columns to use for full-text search dynamically: + # If HybridSearch is enabled (self.es is not None), we must use tokenized columns (fts_columns_tks) + # for compatibility and performance with HybridSearch. Otherwise, we use the original content columns + # (fts_columns_origin), which may be controlled by an environment variable. 
+ self.fulltext_search_columns = fts_columns_origin if self.search_original_content else fts_columns_tks + + self._table_exists_cache: set[str] = set() + self._table_exists_cache_lock = threading.RLock() + logger.info(f"OceanBase {self.uri} is healthy.") def _check_ob_version(self): @@ -417,18 +453,6 @@ class OBConnection(DocStoreConnection): f"The version of OceanBase needs to be higher than or equal to 4.3.5.1, current version is {version_str}" ) - self.es = None - if not ob_version < ObVersion.from_db_version_nums(4, 4, 1, 0) and self.enable_hybrid_search: - self.es = HybridSearch( - uri=self.uri, - user=self.username, - password=self.password, - db_name=self.db_name, - pool_pre_ping=True, - pool_recycle=3600, - ) - logger.info("OceanBase Hybrid Search feature is enabled") - def _try_to_update_ob_query_timeout(self): try: val = self._get_variable_value("ob_query_timeout") @@ -455,9 +479,19 @@ class OBConnection(DocStoreConnection): return os.getenv(var, default).lower() in ['true', '1', 'yes', 'y'] self.enable_fulltext_search = is_true('ENABLE_FULLTEXT_SEARCH', 'true') + logger.info(f"ENABLE_FULLTEXT_SEARCH={self.enable_fulltext_search}") + self.use_fulltext_hint = is_true('USE_FULLTEXT_HINT', 'true') + logger.info(f"USE_FULLTEXT_HINT={self.use_fulltext_hint}") + self.search_original_content = is_true("SEARCH_ORIGINAL_CONTENT", 'true') + logger.info(f"SEARCH_ORIGINAL_CONTENT={self.search_original_content}") + self.enable_hybrid_search = is_true('ENABLE_HYBRID_SEARCH', 'false') + logger.info(f"ENABLE_HYBRID_SEARCH={self.enable_hybrid_search}") + + self.use_fulltext_first_fusion_search = is_true('USE_FULLTEXT_FIRST_FUSION_SEARCH', 'true') + logger.info(f"USE_FULLTEXT_FIRST_FUSION_SEARCH={self.use_fulltext_first_fusion_search}") """ Database operations @@ -478,6 +512,43 @@ class OBConnection(DocStoreConnection): return row[1] raise Exception(f"Variable '{var_name}' not found.") + def _check_table_exists_cached(self, table_name: str) -> bool: + """ + Check table 
existence with cache to reduce INFORMATION_SCHEMA queries under high concurrency. + Only caches when table exists. Does not cache when table does not exist. + Thread-safe implementation: read operations are lock-free (GIL-protected), + write operations are protected by RLock to ensure cache consistency. + + Args: + table_name: Table name + + Returns: + Whether the table exists with all required indexes and columns + """ + if table_name in self._table_exists_cache: + return True + + try: + if not self.client.check_table_exists(table_name): + return False + for column_name in index_columns: + if not self._index_exists(table_name, index_name_template % (table_name, column_name)): + return False + for fts_column in self.fulltext_search_columns: + column_name = fts_column.split("^")[0] + if not self._index_exists(table_name, fulltext_index_name_template % column_name): + return False + for column in [column_order_id, column_group_id]: + if not self._column_exist(table_name, column.name): + return False + except Exception as e: + raise Exception(f"OBConnection._check_table_exists_cached error: {str(e)}") + + with self._table_exists_cache_lock: + if table_name not in self._table_exists_cache: + self._table_exists_cache.add(table_name) + return True + """ Table operations """ @@ -500,8 +571,7 @@ class OBConnection(DocStoreConnection): process_func=lambda: self._add_index(indexName, column_name), ) - fts_columns = fts_columns_origin if self.search_original_content else fts_columns_tks - for fts_column in fts_columns: + for fts_column in self.fulltext_search_columns: column_name = fts_column.split("^")[0] _try_with_lock( lock_name=f"ob_add_fulltext_idx_{indexName}_{column_name}", @@ -546,24 +616,7 @@ class OBConnection(DocStoreConnection): raise Exception(f"OBConnection.deleteIndex error: {str(e)}") def indexExist(self, indexName: str, knowledgebaseId: str = None) -> bool: - try: - if not self.client.check_table_exists(indexName): - return False - for column_name in 
index_columns: - if not self._index_exists(indexName, index_name_template % (indexName, column_name)): - return False - fts_columns = fts_columns_origin if self.search_original_content else fts_columns_tks - for fts_column in fts_columns: - column_name = fts_column.split("^")[0] - if not self._index_exists(indexName, fulltext_index_name_template % column_name): - return False - for column in [column_order_id, column_group_id]: - if not self._column_exist(indexName, column.name): - return False - except Exception as e: - raise Exception(f"OBConnection.indexExist error: {str(e)}") - - return True + return self._check_table_exists_cached(indexName) def _get_count(self, table_name: str, filter_list: list[str] = None) -> int: where_clause = "WHERE " + " AND ".join(filter_list) if len(filter_list) > 0 else "" @@ -853,10 +906,8 @@ class OBConnection(DocStoreConnection): fulltext_query = escape_string(fulltext_query.strip()) fulltext_topn = m.topn - fts_columns = fts_columns_origin if self.search_original_content else fts_columns_tks - # get fulltext match expression and weight values - for field in fts_columns: + for field in self.fulltext_search_columns: parts = field.split("^") column_name: str = parts[0] column_weight: float = float(parts[1]) if (len(parts) > 1 and parts[1]) else 1.0 @@ -885,7 +936,8 @@ class OBConnection(DocStoreConnection): fulltext_search_score_expr = f"({' + '.join(f'{expr} * {fulltext_search_weight.get(col, 0)}' for col, expr in fulltext_search_expr.items())})" if vector_data: - vector_search_expr = vector_search_template % (vector_column_name, vector_data) + vector_data_str = "[" + ",".join([str(np.float32(v)) for v in vector_data]) + "]" + vector_search_expr = vector_search_template % (vector_column_name, vector_data_str) # use (1 - cosine_distance) as score, which should be [-1, 1] # https://www.oceanbase.com/docs/common-oceanbase-database-standalone-1000000003577323 vector_search_score_expr = f"(1 - {vector_search_expr})" @@ -910,11 +962,15 @@ 
class OBConnection(DocStoreConnection): if search_type in ["fusion", "fulltext", "vector"] and "_score" not in output_fields: output_fields.append("_score") - group_results = kwargs.get("group_results", False) + if limit: + if vector_topn is not None: + limit = min(vector_topn, limit) + if fulltext_topn is not None: + limit = min(fulltext_topn, limit) for index_name in indexNames: - if not self.client.check_table_exists(index_name): + if not self._check_table_exists_cached(index_name): continue fulltext_search_hint = f"/*+ UNION_MERGE({index_name} {' '.join(fulltext_search_idx_list)}) */" if self.use_fulltext_hint else "" @@ -922,29 +978,7 @@ class OBConnection(DocStoreConnection): if search_type == "fusion": # fusion search, usually for chat num_candidates = vector_topn + fulltext_topn - if group_results: - count_sql = ( - f"WITH fulltext_results AS (" - f" SELECT {fulltext_search_hint} *, {fulltext_search_score_expr} AS relevance" - f" FROM {index_name}" - f" WHERE {filters_expr} AND {fulltext_search_filter}" - f" ORDER BY relevance DESC" - f" LIMIT {num_candidates}" - f")," - f" scored_results AS (" - f" SELECT *" - f" FROM fulltext_results" - f" WHERE {vector_search_filter}" - f")," - f" group_results AS (" - f" SELECT *, ROW_NUMBER() OVER (PARTITION BY group_id) as rn" - f" FROM scored_results" - f")" - f" SELECT COUNT(*)" - f" FROM group_results" - f" WHERE rn = 1" - ) - else: + if self.use_fulltext_first_fusion_search: count_sql = ( f"WITH fulltext_results AS (" f" SELECT {fulltext_search_hint} *, {fulltext_search_score_expr} AS relevance" @@ -955,6 +989,22 @@ class OBConnection(DocStoreConnection): f")" f" SELECT COUNT(*) FROM fulltext_results WHERE {vector_search_filter}" ) + else: + count_sql = ( + f"WITH fulltext_results AS (" + f" SELECT {fulltext_search_hint} id FROM {index_name}" + f" WHERE {filters_expr} AND {fulltext_search_filter}" + f" ORDER BY {fulltext_search_score_expr}" + f" LIMIT {fulltext_topn}" + f")," + f"vector_results AS (" + f" SELECT 
id FROM {index_name}" + f" WHERE {filters_expr} AND {vector_search_filter}" + f" ORDER BY {vector_search_expr}" + f" APPROXIMATE LIMIT {vector_topn}" + f")" + f" SELECT COUNT(*) FROM fulltext_results f FULL OUTER JOIN vector_results v ON f.id = v.id" + ) logger.debug("OBConnection.search with count sql: %s", count_sql) start_time = time.time() @@ -976,32 +1026,8 @@ class OBConnection(DocStoreConnection): if total_count == 0: continue - score_expr = f"(relevance * {1 - vector_similarity_weight} + {vector_search_score_expr} * {vector_similarity_weight} + {pagerank_score_expr})" - if group_results: - fusion_sql = ( - f"WITH fulltext_results AS (" - f" SELECT {fulltext_search_hint} *, {fulltext_search_score_expr} AS relevance" - f" FROM {index_name}" - f" WHERE {filters_expr} AND {fulltext_search_filter}" - f" ORDER BY relevance DESC" - f" LIMIT {num_candidates}" - f")," - f" scored_results AS (" - f" SELECT *, {score_expr} AS _score" - f" FROM fulltext_results" - f" WHERE {vector_search_filter}" - f")," - f" group_results AS (" - f" SELECT *, ROW_NUMBER() OVER (PARTITION BY group_id ORDER BY _score DESC) as rn" - f" FROM scored_results" - f")" - f" SELECT {fields_expr}, _score" - f" FROM group_results" - f" WHERE rn = 1" - f" ORDER BY _score DESC" - f" LIMIT {offset}, {limit}" - ) - else: + if self.use_fulltext_first_fusion_search: + score_expr = f"(relevance * {1 - vector_similarity_weight} + {vector_search_score_expr} * {vector_similarity_weight} + {pagerank_score_expr})" fusion_sql = ( f"WITH fulltext_results AS (" f" SELECT {fulltext_search_hint} *, {fulltext_search_score_expr} AS relevance" @@ -1016,6 +1042,38 @@ class OBConnection(DocStoreConnection): f" ORDER BY _score DESC" f" LIMIT {offset}, {limit}" ) + else: + pagerank_score_expr = f"(CAST(IFNULL(f.{PAGERANK_FLD}, 0) AS DECIMAL(10, 2)) / 100)" + score_expr = f"(f.relevance * {1 - vector_similarity_weight} + v.similarity * {vector_similarity_weight} + {pagerank_score_expr})" + fields_expr = ", 
".join([f"t.{f} as {f}" for f in output_fields if f != "_score"]) + fusion_sql = ( + f"WITH fulltext_results AS (" + f" SELECT {fulltext_search_hint} id, pagerank_fea, {fulltext_search_score_expr} AS relevance" + f" FROM {index_name}" + f" WHERE {filters_expr} AND {fulltext_search_filter}" + f" ORDER BY relevance DESC" + f" LIMIT {fulltext_topn}" + f")," + f"vector_results AS (" + f" SELECT id, pagerank_fea, {vector_search_score_expr} AS similarity" + f" FROM {index_name}" + f" WHERE {filters_expr} AND {vector_search_filter}" + f" ORDER BY {vector_search_expr}" + f" APPROXIMATE LIMIT {vector_topn}" + f")," + f"combined_results AS (" + f" SELECT COALESCE(f.id, v.id) AS id, {score_expr} AS score" + f" FROM fulltext_results f" + f" FULL OUTER JOIN vector_results v" + f" ON f.id = v.id" + f")" + f" SELECT {fields_expr}, c.score as _score" + f" FROM combined_results c" + f" JOIN {index_name} t" + f" ON c.id = t.id" + f" ORDER BY score DESC" + f" LIMIT {offset}, {limit}" + ) logger.debug("OBConnection.search with fusion sql: %s", fusion_sql) start_time = time.time() @@ -1234,10 +1292,14 @@ class OBConnection(DocStoreConnection): for row in rows: result.chunks.append(self._row_to_entity(row, output_fields)) + + if result.total == 0: + result.total = len(result.chunks) + return result def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None: - if not self.client.check_table_exists(indexName): + if not self._check_table_exists_cached(indexName): return None try: @@ -1336,7 +1398,7 @@ class OBConnection(DocStoreConnection): return res def update(self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str) -> bool: - if not self.client.check_table_exists(indexName): + if not self._check_table_exists_cached(indexName): return True condition["kb_id"] = knowledgebaseId @@ -1387,7 +1449,7 @@ class OBConnection(DocStoreConnection): return False def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int: - if not 
self.client.check_table_exists(indexName): + if not self._check_table_exists_cached(indexName): return 0 condition["kb_id"] = knowledgebaseId diff --git a/rag/utils/opendal_conn.py b/rag/utils/opendal_conn.py index c6cebf9ca..1f52f6f63 100644 --- a/rag/utils/opendal_conn.py +++ b/rag/utils/opendal_conn.py @@ -41,7 +41,9 @@ def get_opendal_config(): scheme = opendal_config.get("scheme") config_data = opendal_config.get("config", {}) kwargs = {"scheme": scheme, **config_data} - logging.info("Loaded OpenDAL configuration from yaml: %s", kwargs) + safe_log_keys=['scheme', 'host', 'port', 'database', 'table'] + loggable_kwargs = {k: v for k, v in kwargs.items() if k in safe_log_keys} + logging.info("Loaded OpenDAL configuration(non sensitive): %s", loggable_kwargs) return kwargs except Exception as e: logging.error("Failed to load OpenDAL configuration from yaml: %s", str(e)) diff --git a/web/src/components/jsonjoy-builder/components/schema-editor/add-field-button.tsx b/web/src/components/jsonjoy-builder/components/schema-editor/add-field-button.tsx index 7a25705f9..fe06c1952 100644 --- a/web/src/components/jsonjoy-builder/components/schema-editor/add-field-button.tsx +++ b/web/src/components/jsonjoy-builder/components/schema-editor/add-field-button.tsx @@ -20,6 +20,7 @@ import { CirclePlus, HelpCircle, Info } from 'lucide-react'; import { useId, useState, type FC, type FormEvent } from 'react'; import { useTranslation } from '../../hooks/use-translation'; import type { NewField, SchemaType } from '../../types/json-schema'; +import { KeyInputProps } from './interface'; import SchemaTypeSelector from './schema-type-selector'; interface AddFieldButtonProps { @@ -27,9 +28,10 @@ interface AddFieldButtonProps { variant?: 'primary' | 'secondary'; } -const AddFieldButton: FC = ({ +const AddFieldButton: FC = ({ onAddField, variant = 'primary', + pattern, }) => { const [dialogOpen, setDialogOpen] = useState(false); const [fieldName, setFieldName] = useState(''); @@ -120,6 +122,7 
@@ const AddFieldButton: FC = ({ placeholder={t.fieldNamePlaceholder} className="font-mono text-sm w-full" required + searchValue={pattern} /> diff --git a/web/src/components/jsonjoy-builder/components/schema-editor/context.ts b/web/src/components/jsonjoy-builder/components/schema-editor/context.ts new file mode 100644 index 000000000..3fbb14a26 --- /dev/null +++ b/web/src/components/jsonjoy-builder/components/schema-editor/context.ts @@ -0,0 +1,9 @@ +import React, { useContext } from 'react'; +import { KeyInputProps } from './interface'; + +export const KeyInputContext = React.createContext({}); + +export function useInputPattern() { + const x = useContext(KeyInputContext); + return x.pattern; +} diff --git a/web/src/components/jsonjoy-builder/components/schema-editor/interface.ts b/web/src/components/jsonjoy-builder/components/schema-editor/interface.ts new file mode 100644 index 000000000..39e74a641 --- /dev/null +++ b/web/src/components/jsonjoy-builder/components/schema-editor/interface.ts @@ -0,0 +1 @@ +export type KeyInputProps = { pattern?: RegExp | string }; diff --git a/web/src/components/jsonjoy-builder/components/schema-editor/schema-property-editor.tsx b/web/src/components/jsonjoy-builder/components/schema-editor/schema-property-editor.tsx index f95031e9c..347d69d26 100644 --- a/web/src/components/jsonjoy-builder/components/schema-editor/schema-property-editor.tsx +++ b/web/src/components/jsonjoy-builder/components/schema-editor/schema-property-editor.tsx @@ -16,6 +16,7 @@ import { withObjectSchema, } from '../../types/json-schema'; import type { ValidationTreeNode } from '../../types/validation'; +import { useInputPattern } from './context'; import TypeDropdown from './type-dropdown'; import TypeEditor from './type-editor'; @@ -54,6 +55,8 @@ export const SchemaPropertyEditor: React.FC = ({ 'object' as SchemaType, ); + const pattern = useInputPattern(); + // Update temp values when props change useEffect(() => { setTempName(name); @@ -123,6 +126,7 @@ 
export const SchemaPropertyEditor: React.FC = ({ className="h-8 text-sm font-medium min-w-[120px] max-w-full z-10" autoFocus onFocus={(e) => e.target.select()} + searchValue={pattern} /> ) : ( + + + {Object.keys(metadata.data).map((key, idx) => { + return ( + + {key} + + ); + })} + + + +
+ {fields.map((field, index) => { + const typeField = `${name}.${index}`; + return ( +
+
+ ( + + + + + + + )} + /> +
+ +
+ ); + })} +
+ + ); +} diff --git a/web/src/constants/agent.tsx b/web/src/constants/agent.tsx index ec42c0f2c..2f51e24f6 100644 --- a/web/src/constants/agent.tsx +++ b/web/src/constants/agent.tsx @@ -193,3 +193,19 @@ export enum SwitchLogicOperator { And = 'and', Or = 'or', } + +export const WebhookAlgorithmList = [ + 'hs256', + 'hs384', + 'hs512', + 'rs256', + 'rs384', + 'rs512', + 'es256', + 'es384', + 'es512', + 'ps256', + 'ps384', + 'ps512', + 'none', +] as const; diff --git a/web/src/constants/chat.ts b/web/src/constants/chat.ts index 02d23b652..f102d9e79 100644 --- a/web/src/constants/chat.ts +++ b/web/src/constants/chat.ts @@ -36,5 +36,6 @@ export const EmptyConversationId = 'empty'; export enum DatasetMetadata { Disabled = 'disabled', Automatic = 'auto', + SemiAutomatic = 'semi_auto', Manual = 'manual', } diff --git a/web/src/interfaces/request/knowledge.ts b/web/src/interfaces/request/knowledge.ts index de1b00b43..ea60888de 100644 --- a/web/src/interfaces/request/knowledge.ts +++ b/web/src/interfaces/request/knowledge.ts @@ -7,6 +7,16 @@ export interface ITestRetrievalRequestBody { use_kg?: boolean; highlight?: boolean; kb_id?: string[]; + meta_data_filter?: { + logic?: string; + method?: string; + manual?: Array<{ + key: string; + op: string; + value: string; + }>; + semi_auto?: string[]; + }; } export interface IFetchKnowledgeListRequestBody { diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index c03408beb..6d1049eda 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -737,11 +737,13 @@ This auto-tagging feature enhances retrieval by adding another layer of domain-s metadataTip: 'Metadata filtering is the process of using metadata attributes (such as tags, categories, or access permissions) to refine and control the retrieval of relevant information within a system.', conditions: 'Conditions', + metadataKeys: 'Filterable items', addCondition: 'Add condition', meta: { disabled: 'Disabled', auto: 'Automatic', manual: 'Manual', + semi_auto: 
'Semi-automatic', }, cancel: 'Cancel', chatSetting: 'Chat setting', @@ -1961,6 +1963,37 @@ Important structured information may include: names, dates, locations, events, k removeFirst: 'Remove first', removeLast: 'Remove last', }, + webhook: { + name: 'Webhook', + methods: 'Methods', + contentTypes: 'Content types', + security: 'Security', + schema: 'Schema', + response: 'Response', + executionMode: 'Execution mode', + authMethods: 'Authentication Methods', + authType: 'Authentication Type', + limit: 'Request Limit', + per: 'Time Period', + maxBodySize: 'Maximum Body Size', + ipWhitelist: 'IP Whitelist', + tokenHeader: 'Token Header', + tokenValue: 'Token Value', + username: 'Username', + password: 'Password', + algorithm: 'Algorithm', + secret: 'Secret', + issuer: 'Issuer', + audience: 'Audience', + requiredClaims: 'Required Claims', + header: 'Header', + status: 'Status', + headersTemplate: 'Headers Template', + bodyTemplate: 'Body Template', + basic: 'Basic', + bearer: 'Bearer', + apiKey: 'Api Key', + }, }, llmTools: { bad_calculator: { diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 8c879ffc5..c06ec2886 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -673,11 +673,13 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 metadataTip: '元数据过滤是使用元数据属性(例如标签、类别或访问权限)来优化和控制系统内相关信息检索的过程。', conditions: '条件', + metadataKeys: '可选过滤项', addCondition: '增加条件', meta: { disabled: '禁用', auto: '自动', manual: '手动', + semi_auto: '半自动', }, cancel: '取消', chatSetting: '聊天设置', @@ -1755,6 +1757,37 @@ Tokenizer 会根据所选方式将内容存储为对应的数据结构。`, removeFirst: '移除第一个', removeLast: '移除最后一个', }, + webhook: { + name: '网络钩子', + methods: '方法', + contentTypes: '内容类型', + security: '安全配置', + schema: '模式', + response: '响应', + executionMode: '执行模式', + authMethods: '认证方法', + authType: '认证类型', + limit: '请求限制', + per: '时间周期', + maxBodySize: '最大主体大小', + ipWhitelist: 'IP白名单', + tokenHeader: '令牌头部', + tokenValue: '令牌值', + username: '用户名', + password: '密码', + algorithm: '算法', + 
secret: '密钥', + issuer: '签发者', + audience: '受众', + requiredClaims: '必需声明', + header: '头部', + status: '状态', + headersTemplate: '头部模板', + bodyTemplate: '主体模板', + basic: '基础认证', + bearer: '承载令牌', + apiKey: 'API密钥', + }, }, footer: { profile: 'All rights reserved @ React', diff --git a/web/src/pages/agent/canvas/node/variable-display.tsx b/web/src/pages/agent/canvas/node/variable-display.tsx index ceca80faa..cfca809f7 100644 --- a/web/src/pages/agent/canvas/node/variable-display.tsx +++ b/web/src/pages/agent/canvas/node/variable-display.tsx @@ -1,32 +1,14 @@ -import i18n from '@/locales/config'; -import { BeginId } from '@/pages/agent/constant'; import { ReactNode } from 'react'; -const prefix = BeginId + '@'; - interface VariableDisplayProps { content: string; getLabel?: (value?: string) => string | ReactNode; } // This component mimics the VariableNode's decorate function from PromptEditor -function VariableNodeDisplay({ - value, - label, -}: { - value: string; - label: ReactNode; -}) { +function VariableNodeDisplay({ label }: { label: ReactNode }) { let content: ReactNode = {label}; - if (value.startsWith(prefix)) { - content = ( -
- {i18n.t(`flow.begin`)} / {content} -
- ); - } - return
{content}
; } @@ -63,11 +45,7 @@ export function VariableDisplay({ content, getLabel }: VariableDisplayProps) { if (label && label !== variableValue) { // If we found a valid label, render as variable node elements.push( - , + , ); } else { // If no label found, keep as original text diff --git a/web/src/pages/agent/constant/index.tsx b/web/src/pages/agent/constant/index.tsx index abef76ab3..5c25b7fe0 100644 --- a/web/src/pages/agent/constant/index.tsx +++ b/web/src/pages/agent/constant/index.tsx @@ -25,6 +25,7 @@ export * from './pipeline'; export enum AgentDialogueMode { Conversational = 'conversational', Task = 'task', + Webhook = 'Webhook', } import { ModelVariableType } from '@/constants/knowledge'; @@ -930,3 +931,37 @@ export enum AgentVariableType { Begin = 'begin', Conversation = 'conversation', } + +export enum WebhookMethod { + Post = 'POST', + Get = 'GET', + Put = 'PUT', + Patch = 'PATCH', + Delete = 'DELETE', + Head = 'HEAD', +} + +export enum WebhookContentType { + ApplicationJson = 'application/json', + MultipartFormData = 'multipart/form-data', + ApplicationXWwwFormUrlencoded = 'application/x-www-form-urlencoded', + TextPlain = 'text/plain', + ApplicationOctetStream = 'application/octet-stream', +} + +export enum WebhookExecutionMode { + Immediately = 'Immediately', + Streaming = 'Streaming', +} + +export enum WebhookSecurityAuthType { + None = 'none', + Token = 'token', + Basic = 'basic', + Jwt = 'jwt', + Hmac = 'hmac', +} + +export const RateLimitPerList = ['minute', 'hour', 'day']; + +export const WebhookMaxBodySize = ['10MB', '50MB', '100MB', '1000MB']; diff --git a/web/src/pages/agent/form/agent-form/index.tsx b/web/src/pages/agent/form/agent-form/index.tsx index 38c49b666..a78dc1f77 100644 --- a/web/src/pages/agent/form/agent-form/index.tsx +++ b/web/src/pages/agent/form/agent-form/index.tsx @@ -42,9 +42,9 @@ import { FormWrapper } from '../components/form-wrapper'; import { Output } from '../components/output'; import { PromptEditor } from 
'../components/prompt-editor'; import { QueryVariable } from '../components/query-variable'; +import { SchemaDialog } from '../components/schema-dialog'; +import { SchemaPanel } from '../components/schema-panel'; import { AgentTools, Agents } from './agent-tools'; -import { StructuredOutputDialog } from './structured-output-dialog'; -import { StructuredOutputPanel } from './structured-output-panel'; import { useBuildPromptExtraPromptOptions } from './use-build-prompt-options'; import { useHandleShowStructuredOutput, @@ -327,19 +327,17 @@ function AgentForm({ node }: INextOperatorForm) { - + )} {structuredOutputDialogVisible && ( - + > )} ); diff --git a/web/src/pages/agent/form/begin-form/index.tsx b/web/src/pages/agent/form/begin-form/index.tsx index ad4eb9d3e..c86f24cac 100644 --- a/web/src/pages/agent/form/begin-form/index.tsx +++ b/web/src/pages/agent/form/begin-form/index.tsx @@ -12,6 +12,7 @@ import { RAGFlowSelect } from '@/components/ui/select'; import { Switch } from '@/components/ui/switch'; import { Textarea } from '@/components/ui/textarea'; import { FormTooltip } from '@/components/ui/tooltip'; +import { WebhookAlgorithmList } from '@/constants/agent'; import { zodResolver } from '@hookform/resolvers/zod'; import { t } from 'i18next'; import { Plus } from 'lucide-react'; @@ -24,37 +25,71 @@ import { INextOperatorForm } from '../../interface'; import { ParameterDialog } from './parameter-dialog'; import { QueryTable } from './query-table'; import { useEditQueryRecord } from './use-edit-query'; +import { useHandleModeChange } from './use-handle-mode-change'; import { useValues } from './use-values'; import { useWatchFormChange } from './use-watch-change'; +import { WebHook } from './webhook'; const ModeOptions = [ { value: AgentDialogueMode.Conversational, label: t('flow.conversational') }, { value: AgentDialogueMode.Task, label: t('flow.task') }, + { value: AgentDialogueMode.Webhook, label: t('flow.webhook.name') }, ]; +const FormSchema = z.object({ + 
enablePrologue: z.boolean().optional(), + prologue: z.string().trim().optional(), + mode: z.string(), + inputs: z + .array( + z.object({ + key: z.string(), + type: z.string(), + value: z.string(), + optional: z.boolean(), + name: z.string(), + options: z.array(z.union([z.number(), z.string(), z.boolean()])), + }), + ) + .optional(), + methods: z.string().optional(), + content_types: z.string().optional(), + security: z + .object({ + auth_type: z.string(), + ip_whitelist: z.array(z.object({ value: z.string() })), + rate_limit: z.object({ + limit: z.number(), + per: z.string().optional(), + }), + max_body_size: z.string(), + jwt: z + .object({ + algorithm: z.string().default(WebhookAlgorithmList[0]).optional(), + }) + .optional(), + }) + .optional(), + schema: z.record(z.any()).optional(), + response: z + .object({ + status: z.number(), + headers_template: z.array( + z.object({ key: z.string(), value: z.string() }), + ), + body_template: z.array(z.object({ key: z.string(), value: z.string() })), + }) + .optional(), + execution_mode: z.string().optional(), +}); + +export type BeginFormSchemaType = z.infer; + function BeginForm({ node }: INextOperatorForm) { const { t } = useTranslation(); const values = useValues(node); - const FormSchema = z.object({ - enablePrologue: z.boolean().optional(), - prologue: z.string().trim().optional(), - mode: z.string(), - inputs: z - .array( - z.object({ - key: z.string(), - type: z.string(), - value: z.string(), - optional: z.boolean(), - name: z.string(), - options: z.array(z.union([z.number(), z.string(), z.boolean()])), - }), - ) - .optional(), - }); - const form = useForm({ defaultValues: values, resolver: zodResolver(FormSchema), @@ -72,6 +107,8 @@ function BeginForm({ node }: INextOperatorForm) { const previousModeRef = useRef(mode); + const { handleModeChange } = useHandleModeChange(form); + useEffect(() => { if ( previousModeRef.current === AgentDialogueMode.Task && @@ -111,6 +148,10 @@ function BeginForm({ node }: 
INextOperatorForm) { placeholder={t('common.pleaseSelect')} options={ModeOptions} {...field} + onChange={(val) => { + handleModeChange(val); + field.onChange(val); + }} > @@ -158,44 +199,49 @@ function BeginForm({ node }: INextOperatorForm) { )} /> )} - {/* Create a hidden field to make Form instance record this */} -
} - /> - - {t('flow.input')} - - - } - rightContent={ - + } > - - - } - > - - - {visible && ( - + + + {visible && ( + + )} + )} diff --git a/web/src/pages/agent/form/begin-form/use-handle-mode-change.ts b/web/src/pages/agent/form/begin-form/use-handle-mode-change.ts new file mode 100644 index 000000000..e85ed5a6e --- /dev/null +++ b/web/src/pages/agent/form/begin-form/use-handle-mode-change.ts @@ -0,0 +1,76 @@ +import { useCallback } from 'react'; +import { UseFormReturn } from 'react-hook-form'; +import { + AgentDialogueMode, + RateLimitPerList, + WebhookExecutionMode, + WebhookMaxBodySize, + WebhookSecurityAuthType, +} from '../../constant'; + +// const WebhookSchema = { +// query: { +// type: 'object', +// required: [], +// properties: { +// // debug: { type: 'boolean' }, +// // event: { type: 'string' }, +// }, +// }, + +// headers: { +// type: 'object', +// required: [], +// properties: { +// // 'X-Trace-ID': { type: 'string' }, +// }, +// }, + +// body: { +// type: 'object', +// required: [], +// properties: { +// id: { type: 'string' }, +// payload: { type: 'object' }, +// }, +// }, +// }; + +const schema = { + properties: { + query: { + type: 'object', + description: '', + }, + headers: { + type: 'object', + description: '', + }, + body: { + type: 'object', + description: '', + }, + }, +}; + +const initialFormValuesMap = { + schema: schema, + 'security.auth_type': WebhookSecurityAuthType.Basic, + 'security.rate_limit.per': RateLimitPerList[0], + 'security.max_body_size': WebhookMaxBodySize[0], + execution_mode: WebhookExecutionMode.Immediately, +}; + +export function useHandleModeChange(form: UseFormReturn) { + const handleModeChange = useCallback( + (mode: AgentDialogueMode) => { + if (mode === AgentDialogueMode.Webhook) { + Object.entries(initialFormValuesMap).forEach(([key, value]) => { + form.setValue(key, value, { shouldDirty: true }); + }); + } + }, + [form], + ); + return { handleModeChange }; +} diff --git 
a/web/src/pages/agent/form/begin-form/use-show-schema-dialog.tsx b/web/src/pages/agent/form/begin-form/use-show-schema-dialog.tsx new file mode 100644 index 000000000..0bc6261e5 --- /dev/null +++ b/web/src/pages/agent/form/begin-form/use-show-schema-dialog.tsx @@ -0,0 +1,28 @@ +import { JSONSchema } from '@/components/jsonjoy-builder'; +import { useSetModalState } from '@/hooks/common-hooks'; +import { useCallback } from 'react'; +import { UseFormReturn } from 'react-hook-form'; + +export function useShowSchemaDialog(form: UseFormReturn) { + const { + visible: schemaDialogVisible, + showModal: showSchemaDialog, + hideModal: hideSchemaDialog, + } = useSetModalState(); + + const handleSchemaDialogOk = useCallback( + (values: JSONSchema) => { + // Sync data to canvas + form.setValue('schema', values); + hideSchemaDialog(); + }, + [form, hideSchemaDialog], + ); + + return { + schemaDialogVisible, + showSchemaDialog, + hideSchemaDialog, + handleSchemaDialogOk, + }; +} diff --git a/web/src/pages/agent/form/begin-form/use-watch-change.ts b/web/src/pages/agent/form/begin-form/use-watch-change.ts index f0da58068..02158e969 100644 --- a/web/src/pages/agent/form/begin-form/use-watch-change.ts +++ b/web/src/pages/agent/form/begin-form/use-watch-change.ts @@ -1,6 +1,7 @@ import { omit } from 'lodash'; import { useEffect } from 'react'; import { UseFormReturn, useWatch } from 'react-hook-form'; +import { AgentDialogueMode } from '../../constant'; import { BeginQuery } from '../../interface'; import useGraphStore from '../../store'; @@ -20,9 +21,21 @@ export function useWatchFormChange(id?: string, form?: UseFormReturn) { if (id) { values = form?.getValues() || {}; + let outputs: Record = {}; + + // For webhook mode, use schema properties as direct outputs + // Each property (body, headers, query) should be able to show secondary menu + if ( + values.mode === AgentDialogueMode.Webhook && + values.schema?.properties + ) { + outputs = { ...values.schema.properties }; + } + const 
nextValues = { ...values, inputs: transferInputsArrayToObject(values.inputs), + outputs, }; updateNodeForm(id, nextValues); diff --git a/web/src/pages/agent/form/begin-form/webhook/auth.tsx b/web/src/pages/agent/form/begin-form/webhook/auth.tsx new file mode 100644 index 000000000..4a739b491 --- /dev/null +++ b/web/src/pages/agent/form/begin-form/webhook/auth.tsx @@ -0,0 +1,139 @@ +import { SelectWithSearch } from '@/components/originui/select-with-search'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { Input } from '@/components/ui/input'; +import { WebhookAlgorithmList } from '@/constants/agent'; +import { WebhookSecurityAuthType } from '@/pages/agent/constant'; +import { buildOptions } from '@/utils/form'; +import { useCallback } from 'react'; +import { useFormContext, useWatch } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; + +const AlgorithmOptions = buildOptions(WebhookAlgorithmList); + +const RequiredClaimsOptions = buildOptions(['exp', 'sub']); + +export function Auth() { + const { t } = useTranslation(); + const form = useFormContext(); + + const authType = useWatch({ + name: 'security.auth_type', + control: form.control, + }); + + const renderTokenAuth = useCallback( + () => ( + <> + + + + + + + + ), + [t], + ); + + const renderBasicAuth = useCallback( + () => ( + <> + + + + + + + + ), + [t], + ); + + const renderJwtAuth = useCallback( + () => ( + <> + + + + + + + + + + + + + + + + + ), + [t], + ); + + const renderHmacAuth = useCallback( + () => ( + <> + + + + + + + + + + + ), + [t], + ); + + const AuthMap = { + [WebhookSecurityAuthType.Token]: renderTokenAuth, + [WebhookSecurityAuthType.Basic]: renderBasicAuth, + [WebhookSecurityAuthType.Jwt]: renderJwtAuth, + [WebhookSecurityAuthType.Hmac]: renderHmacAuth, + [WebhookSecurityAuthType.None]: () => null, + }; + + return AuthMap[ + (authType ?? 
WebhookSecurityAuthType.None) as WebhookSecurityAuthType + ](); +} diff --git a/web/src/pages/agent/form/begin-form/webhook/dynamic-response.tsx b/web/src/pages/agent/form/begin-form/webhook/dynamic-response.tsx new file mode 100644 index 000000000..18030feff --- /dev/null +++ b/web/src/pages/agent/form/begin-form/webhook/dynamic-response.tsx @@ -0,0 +1,213 @@ +import { BoolSegmented } from '@/components/bool-segmented'; +import { KeyInput } from '@/components/key-input'; +import { SelectWithSearch } from '@/components/originui/select-with-search'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { useIsDarkTheme } from '@/components/theme-provider'; +import { Button } from '@/components/ui/button'; +import { Input } from '@/components/ui/input'; +import { Separator } from '@/components/ui/separator'; +import { Textarea } from '@/components/ui/textarea'; +import { Editor, loader } from '@monaco-editor/react'; +import { X } from 'lucide-react'; +import { ReactNode, useCallback } from 'react'; +import { useFieldArray, useFormContext } from 'react-hook-form'; +import { InputMode, TypesWithArray } from '../../../constant'; +import { + InputModeOptions, + buildConversationVariableSelectOptions, +} from '../../../utils'; +import { DynamicFormHeader } from '../../components/dynamic-fom-header'; +import { QueryVariable } from '../../components/query-variable'; + +loader.config({ paths: { vs: '/vs' } }); + +type SelectKeysProps = { + name: string; + label: ReactNode; + tooltip?: string; + keyField?: string; + valueField?: string; + operatorField?: string; + nodeId?: string; +}; + +const VariableTypeOptions = buildConversationVariableSelectOptions(); + +const modeField = 'input_mode'; + +const ConstantValueMap = { + [TypesWithArray.Boolean]: true, + [TypesWithArray.Number]: 0, + [TypesWithArray.String]: '', + [TypesWithArray.ArrayBoolean]: '[]', + [TypesWithArray.ArrayNumber]: '[]', + [TypesWithArray.ArrayString]: '[]', + [TypesWithArray.ArrayObject]: 
'[]', + [TypesWithArray.Object]: '{}', +}; + +export function DynamicResponse({ + name, + label, + tooltip, + keyField = 'key', + valueField = 'value', + operatorField = 'type', +}: SelectKeysProps) { + const form = useFormContext(); + const isDarkTheme = useIsDarkTheme(); + + const { fields, remove, append } = useFieldArray({ + name: name, + control: form.control, + }); + + const initializeValue = useCallback( + (mode: string, variableType: string, valueFieldAlias: string) => { + if (mode === InputMode.Variable) { + form.setValue(valueFieldAlias, '', { shouldDirty: true }); + } else { + const val = ConstantValueMap[variableType as TypesWithArray]; + form.setValue(valueFieldAlias, val, { shouldDirty: true }); + } + }, + [form], + ); + + const handleModeChange = useCallback( + (mode: string, valueFieldAlias: string, operatorFieldAlias: string) => { + const variableType = form.getValues(operatorFieldAlias); + initializeValue(mode, variableType, valueFieldAlias); + }, + [form, initializeValue], + ); + + const handleVariableTypeChange = useCallback( + (variableType: string, valueFieldAlias: string, modeFieldAlias: string) => { + const mode = form.getValues(modeFieldAlias); + + initializeValue(mode, variableType, valueFieldAlias); + }, + [form, initializeValue], + ); + + const renderParameter = useCallback( + (operatorFieldName: string, modeFieldName: string) => { + const mode = form.getValues(modeFieldName); + const logicalOperator = form.getValues(operatorFieldName); + + if (mode === InputMode.Constant) { + if (logicalOperator === TypesWithArray.Boolean) { + return ; + } + + if (logicalOperator === TypesWithArray.Number) { + return ; + } + + if (logicalOperator === TypesWithArray.String) { + return ; + } + + return ( + + ); + } + + return ( + + ); + }, + [form, isDarkTheme], + ); + + return ( +
+ + append({ + [keyField]: '', + [valueField]: '', + [modeField]: InputMode.Constant, + [operatorField]: TypesWithArray.String, + }) + } + > +
+ {fields.map((field, index) => { + const keyFieldAlias = `${name}.${index}.${keyField}`; + const valueFieldAlias = `${name}.${index}.${valueField}`; + const operatorFieldAlias = `${name}.${index}.${operatorField}`; + const modeFieldAlias = `${name}.${index}.${modeField}`; + + return ( +
+
+
+ + + + + + {(field) => ( + { + handleVariableTypeChange( + val, + valueFieldAlias, + modeFieldAlias, + ); + field.onChange(val); + }} + options={VariableTypeOptions} + > + )} + + + + {(field) => ( + { + handleModeChange( + val, + valueFieldAlias, + operatorFieldAlias, + ); + field.onChange(val); + }} + options={InputModeOptions} + > + )} + +
+ + {renderParameter(operatorFieldAlias, modeFieldAlias)} + +
+ + +
+ ); + })} +
+
+ ); +} diff --git a/web/src/pages/agent/form/begin-form/webhook/index.tsx b/web/src/pages/agent/form/begin-form/webhook/index.tsx new file mode 100644 index 000000000..86e844b07 --- /dev/null +++ b/web/src/pages/agent/form/begin-form/webhook/index.tsx @@ -0,0 +1,134 @@ +import { Collapse } from '@/components/collapse'; +import { SelectWithSearch } from '@/components/originui/select-with-search'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { Button } from '@/components/ui/button'; +import { Input } from '@/components/ui/input'; +import { Separator } from '@/components/ui/separator'; +import { Textarea } from '@/components/ui/textarea'; +import { buildOptions } from '@/utils/form'; +import { useFormContext, useWatch } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { + RateLimitPerList, + WebhookContentType, + WebhookExecutionMode, + WebhookMaxBodySize, + WebhookMethod, + WebhookSecurityAuthType, +} from '../../../constant'; +import { DynamicStringForm } from '../../components/dynamic-string-form'; +import { SchemaDialog } from '../../components/schema-dialog'; +import { SchemaPanel } from '../../components/schema-panel'; +import { useShowSchemaDialog } from '../use-show-schema-dialog'; +import { Auth } from './auth'; +import { WebhookResponse } from './response'; + +const RateLimitPerOptions = buildOptions(RateLimitPerList); + +export function WebHook() { + const { t } = useTranslation(); + const form = useFormContext(); + + const executionMode = useWatch({ + control: form.control, + name: 'execution_mode', + }); + + const { + showSchemaDialog, + schemaDialogVisible, + hideSchemaDialog, + handleSchemaDialogOk, + } = useShowSchemaDialog(form); + + const schema = form.getValues('schema'); + + return ( + <> + + + + + + + Security}> +
+ + + + + + + + + + + + + + +
+
+ + + + + + + {executionMode === WebhookExecutionMode.Immediately && ( + + )} + +
+ Schema + +
+ + {schemaDialogVisible && ( + + )} + + ); +} diff --git a/web/src/pages/agent/form/begin-form/webhook/response.tsx b/web/src/pages/agent/form/begin-form/webhook/response.tsx new file mode 100644 index 000000000..a50d212e0 --- /dev/null +++ b/web/src/pages/agent/form/begin-form/webhook/response.tsx @@ -0,0 +1,30 @@ +import { Collapse } from '@/components/collapse'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { Input } from '@/components/ui/input'; +import { useTranslation } from 'react-i18next'; +import { DynamicResponse } from './dynamic-response'; + +export function WebhookResponse() { + const { t } = useTranslation(); + + return ( + Response}> +
+ + + + + +
+
+ ); +} diff --git a/web/src/pages/agent/form/components/dynamic-string-form.tsx b/web/src/pages/agent/form/components/dynamic-string-form.tsx new file mode 100644 index 000000000..224e92310 --- /dev/null +++ b/web/src/pages/agent/form/components/dynamic-string-form.tsx @@ -0,0 +1,46 @@ +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { Button } from '@/components/ui/button'; +import { Input } from '@/components/ui/input'; +import { Trash2 } from 'lucide-react'; +import { useFieldArray, useFormContext } from 'react-hook-form'; +import { DynamicFormHeader, FormListHeaderProps } from './dynamic-fom-header'; + +type DynamicStringFormProps = { name: string } & FormListHeaderProps; +export function DynamicStringForm({ name, label }: DynamicStringFormProps) { + const form = useFormContext(); + + const { fields, append, remove } = useFieldArray({ + name: name, + control: form.control, + }); + + return ( +
+ append({ value: '' })} + > +
+ {fields.map((field, index) => ( +
+ + + + +
+ ))} +
+
+ ); +} diff --git a/web/src/pages/agent/form/agent-form/structured-output-dialog.tsx b/web/src/pages/agent/form/components/schema-dialog.tsx similarity index 81% rename from web/src/pages/agent/form/agent-form/structured-output-dialog.tsx rename to web/src/pages/agent/form/components/schema-dialog.tsx index 6ce305bff..4d67e00c0 100644 --- a/web/src/pages/agent/form/agent-form/structured-output-dialog.tsx +++ b/web/src/pages/agent/form/components/schema-dialog.tsx @@ -3,6 +3,7 @@ import { JsonSchemaVisualizer, SchemaVisualEditor, } from '@/components/jsonjoy-builder'; +import { KeyInputProps } from '@/components/jsonjoy-builder/components/schema-editor/interface'; import { Button } from '@/components/ui/button'; import { Dialog, @@ -16,11 +17,12 @@ import { IModalProps } from '@/interfaces/common'; import { useCallback, useState } from 'react'; import { useTranslation } from 'react-i18next'; -export function StructuredOutputDialog({ +export function SchemaDialog({ hideModal, onOk, initialValues, -}: IModalProps) { + pattern, +}: IModalProps & KeyInputProps) { const { t } = useTranslation(); const [schema, setSchema] = useState(initialValues); @@ -36,7 +38,11 @@ export function StructuredOutputDialog({
- +
diff --git a/web/src/pages/agent/form/agent-form/structured-output-panel.tsx b/web/src/pages/agent/form/components/schema-panel.tsx similarity index 78% rename from web/src/pages/agent/form/agent-form/structured-output-panel.tsx rename to web/src/pages/agent/form/components/schema-panel.tsx index 64e13c6eb..e76ff726e 100644 --- a/web/src/pages/agent/form/agent-form/structured-output-panel.tsx +++ b/web/src/pages/agent/form/components/schema-panel.tsx @@ -1,6 +1,6 @@ import { JSONSchema, JsonSchemaVisualizer } from '@/components/jsonjoy-builder'; -export function StructuredOutputPanel({ value }: { value: JSONSchema }) { +export function SchemaPanel({ value }: { value: JSONSchema }) { return (
state); + const { getOperatorTypeFromId, getNode } = useGraphStore((state) => state); const showSecondaryMenu = useCallback( (value: string, outputLabel: string) => { const nodeId = getNodeId(value); - return ( - getOperatorTypeFromId(nodeId) === Operator.Agent && + const operatorType = getOperatorTypeFromId(nodeId); + + // For Agent nodes, show secondary menu for 'structured' field + if ( + operatorType === Operator.Agent && outputLabel === AgentStructuredOutputField - ); + ) { + return true; + } + + // For Begin nodes in webhook mode, show secondary menu for schema properties (body, headers, query, etc.) + if (operatorType === Operator.Begin) { + const node = getNode(nodeId); + const mode = get(node, 'data.form.mode'); + if (mode === AgentDialogueMode.Webhook) { + // Check if this output field is from the schema + const outputs = get(node, 'data.form.outputs', {}); + const outputField = outputs[outputLabel]; + // Show secondary menu if the field is an object or has properties + return ( + outputField && + (outputField.type === 'object' || + (outputField.properties && + Object.keys(outputField.properties).length > 0)) + ); + } + } + + return false; }, - [getOperatorTypeFromId], + [getOperatorTypeFromId, getNode], ); return showSecondaryMenu; } +function useGetBeginOutputsOrSchema() { + const { getNode } = useGraphStore((state) => state); + + const getBeginOutputs = useCallback(() => { + const node = getNode(BeginId); + const outputs = get(node, 'data.form.outputs', {}); + return outputs; + }, [getNode]); + + const getBeginSchema = useCallback(() => { + const node = getNode(BeginId); + const outputs = get(node, 'data.form.schema', {}); + return outputs; + }, [getNode]); + + return { getBeginOutputs, getBeginSchema }; +} export function useGetStructuredOutputByValue() { - const { getNode } = useGraphStore((state) => state); + const { getNode, getOperatorTypeFromId } = useGraphStore((state) => state); + + const { getBeginOutputs } = useGetBeginOutputsOrSchema(); 
const getStructuredOutput = useCallback( (value: string) => { - const node = getNode(getNodeId(value)); - const structuredOutput = get( - node, - `data.form.outputs.${AgentStructuredOutputField}`, - ); + const nodeId = getNodeId(value); + const node = getNode(nodeId); + const operatorType = getOperatorTypeFromId(nodeId); + const fields = splitValue(value); + const outputLabel = fields.at(1); + + let structuredOutput; + if (operatorType === Operator.Agent) { + structuredOutput = get( + node, + `data.form.outputs.${AgentStructuredOutputField}`, + ); + } else if (operatorType === Operator.Begin) { + // For Begin nodes in webhook mode, get the specific schema property + const outputs = getBeginOutputs(); + if (outputLabel) { + structuredOutput = outputs[outputLabel]; + } + } return structuredOutput; }, - [getNode], + [getBeginOutputs, getNode, getOperatorTypeFromId], ); return getStructuredOutput; @@ -66,13 +126,14 @@ export function useFindAgentStructuredOutputLabel() { icon?: ReactNode; }>, ) => { - // agent structured output const fields = splitValue(value); + const operatorType = getOperatorTypeFromId(fields.at(0)); + + // Handle Agent structured fields if ( - getOperatorTypeFromId(fields.at(0)) === Operator.Agent && + operatorType === Operator.Agent && fields.at(1)?.startsWith(AgentStructuredOutputField) ) { - // is agent structured output const agentOption = options.find((x) => value.includes(x.value)); const jsonSchemaFields = fields .at(1) @@ -84,6 +145,19 @@ export function useFindAgentStructuredOutputLabel() { value: value, }; } + + // Handle Begin webhook fields + if (operatorType === Operator.Begin && fields.at(1)) { + const fieldOption = options + .filter((x) => x.parentLabel === BeginId) + .find((x) => value.startsWith(x.value)); + + return { + ...fieldOption, + label: fields.at(1), + value: value, + }; + } }, [getOperatorTypeFromId], ); @@ -94,6 +168,7 @@ export function useFindAgentStructuredOutputLabel() { export function 
useFindAgentStructuredOutputTypeByValue() { const { getOperatorTypeFromId } = useGraphStore((state) => state); const filterStructuredOutput = useGetStructuredOutputByValue(); + const { getBeginSchema } = useGetBeginOutputsOrSchema(); const findTypeByValue = useCallback( ( @@ -136,10 +211,12 @@ export function useFindAgentStructuredOutputTypeByValue() { } const fields = splitValue(value); const nodeId = fields.at(0); + const operatorType = getOperatorTypeFromId(nodeId); const jsonSchema = filterStructuredOutput(value); + // Handle Agent structured fields if ( - getOperatorTypeFromId(nodeId) === Operator.Agent && + operatorType === Operator.Agent && fields.at(1)?.startsWith(AgentStructuredOutputField) ) { const jsonSchemaFields = fields @@ -151,13 +228,32 @@ export function useFindAgentStructuredOutputTypeByValue() { return type; } } + + // Handle Begin webhook fields (body, headers, query, etc.) + if (operatorType === Operator.Begin) { + const outputLabel = fields.at(1); + const schema = getBeginSchema(); + if (outputLabel && schema) { + const jsonSchemaFields = fields.at(1); + if (jsonSchemaFields) { + const type = findTypeByValue(schema, jsonSchemaFields); + return type; + } + } + } }, - [filterStructuredOutput, findTypeByValue, getOperatorTypeFromId], + [ + filterStructuredOutput, + findTypeByValue, + getBeginSchema, + getOperatorTypeFromId, + ], ); return findAgentStructuredOutputTypeByValue; } +// TODO: Consider merging with useFindAgentStructuredOutputLabel export function useFindAgentStructuredOutputLabelByValue() { const { getNode } = useGraphStore((state) => state); diff --git a/web/src/pages/agent/hooks/use-get-begin-query.tsx b/web/src/pages/agent/hooks/use-get-begin-query.tsx index 387f59821..46825e5a4 100644 --- a/web/src/pages/agent/hooks/use-get-begin-query.tsx +++ b/web/src/pages/agent/hooks/use-get-begin-query.tsx @@ -314,10 +314,12 @@ export function useFilterQueryVariableOptionsByTypes({ ? 
toLower(y.type).includes(toLower(x)) : toLower(y.type) === toLower(x), ) || + // agent structured output isAgentStructured( y.value, y.value.slice(-AgentStructuredOutputField.length), - ), // agent structured output + ) || + y.value.startsWith(BeginId), // begin node outputs ), }; }) diff --git a/web/src/pages/agent/utils.ts b/web/src/pages/agent/utils.ts index 6825dd9f5..592d92e45 100644 --- a/web/src/pages/agent/utils.ts +++ b/web/src/pages/agent/utils.ts @@ -24,6 +24,7 @@ import { import pipe from 'lodash/fp/pipe'; import isObject from 'lodash/isObject'; import { + AgentDialogueMode, CategorizeAnchorPointPositions, FileType, FileTypeSuffixMap, @@ -34,6 +35,7 @@ import { Operator, TypesWithArray, } from './constant'; +import { BeginFormSchemaType } from './form/begin-form'; import { DataOperationsFormSchemaType } from './form/data-operations-form'; import { ExtractorFormSchemaType } from './form/extractor-form'; import { HierarchicalMergerFormSchemaType } from './form/hierarchical-merger-form'; @@ -312,6 +314,41 @@ function transformDataOperationsParams(params: DataOperationsFormSchemaType) { }; } +export function transformArrayToObject( + list?: Array<{ key: string; value: string }>, +) { + if (!Array.isArray(list)) return {}; + return list?.reduce>((pre, cur) => { + if (cur.key) { + pre[cur.key] = cur.value; + } + return pre; + }, {}); +} + +function transformBeginParams(params: BeginFormSchemaType) { + if (params.mode === AgentDialogueMode.Webhook) { + return { + ...params, + security: { + ...params.security, + ip_whitelist: params.security?.ip_whitelist.map((x) => x.value), + }, + response: { + ...params.response, + headers_template: transformArrayToObject( + params.response?.headers_template, + ), + body_template: transformArrayToObject(params.response?.body_template), + }, + }; + } + + return { + ...params, + }; +} + // construct a dsl based on the node information of the graph export const buildDslComponentsByGraph = ( nodes: RAGFlowNodeType[], @@ -361,6 
+398,9 @@ export const buildDslComponentsByGraph = ( case Operator.DataOperations: params = transformDataOperationsParams(params); break; + case Operator.Begin: + params = transformBeginParams(params); + break; default: break; } diff --git a/web/src/pages/dataset/testing/testing-form.tsx b/web/src/pages/dataset/testing/testing-form.tsx index f6dc195b0..6d856b7d5 100644 --- a/web/src/pages/dataset/testing/testing-form.tsx +++ b/web/src/pages/dataset/testing/testing-form.tsx @@ -7,14 +7,18 @@ import { z } from 'zod'; import { CrossLanguageFormField } from '@/components/cross-language-form-field'; import { FormContainer } from '@/components/form-container'; import { - initialTopKValue, + MetadataFilter, + MetadataFilterSchema, +} from '@/components/metadata-filter'; +import { RerankFormFields, + initialTopKValue, topKSchema, } from '@/components/rerank'; import { + SimilaritySliderFormField, initialSimilarityThresholdValue, initialVectorSimilarityWeightValue, - SimilaritySliderFormField, similarityThresholdSchema, vectorSimilarityWeightSchema, } from '@/components/similarity-slider'; @@ -33,6 +37,7 @@ import { trim } from 'lodash'; import { Send } from 'lucide-react'; import { useEffect } from 'react'; import { useTranslation } from 'react-i18next'; +import { useParams } from 'umi'; type TestingFormProps = Pick< ReturnType, @@ -45,6 +50,8 @@ export default function TestingForm({ setValues, }: TestingFormProps) { const { t } = useTranslation(); + const { id } = useParams(); + const knowledgeBaseId = id; const formSchema = z.object({ question: z.string().min(1, { @@ -54,6 +61,8 @@ export default function TestingForm({ ...vectorSimilarityWeightSchema, ...topKSchema, use_kg: z.boolean().optional(), + kb_ids: z.array(z.string()).optional(), + ...MetadataFilterSchema, }); const form = useForm>({ @@ -63,6 +72,7 @@ export default function TestingForm({ ...initialVectorSimilarityWeightValue, ...initialTopKValue, use_kg: false, + kb_ids: [knowledgeBaseId], }, }); @@ -90,6 
+100,7 @@ export default function TestingForm({ +