From 89b80c66d0f87cef2fa6812576ff4d147cbf7541 Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Thu, 11 Dec 2025 14:33:42 +0800
Subject: [PATCH] Fix: tokenizer issue.

---
 api/apps/sdk/files.py    |  1 +
 rag/nlp/rag_tokenizer.py | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/api/apps/sdk/files.py b/api/apps/sdk/files.py
index 2e9fd6df3..8bac19ccd 100644
--- a/api/apps/sdk/files.py
+++ b/api/apps/sdk/files.py
@@ -33,6 +33,7 @@ from api.utils.web_utils import CONTENT_TYPE_MAP
 from common import settings
 from common.constants import RetCode
 
+
 @manager.route('/file/upload', methods=['POST'])  # noqa: F821
 @token_required
 async def upload(tenant_id):
diff --git a/rag/nlp/rag_tokenizer.py b/rag/nlp/rag_tokenizer.py
index c50e84ebc..494e1915b 100644
--- a/rag/nlp/rag_tokenizer.py
+++ b/rag/nlp/rag_tokenizer.py
@@ -33,6 +33,22 @@ class RagTokenizer(infinity.rag_tokenizer.RagTokenizer):
         return super().fine_grained_tokenize(tks)
 
 
+def is_chinese(s):
+    return infinity.rag_tokenizer.is_chinese(s)
+
+
+def is_number(s):
+    return infinity.rag_tokenizer.is_number(s)
+
+
+def is_alphabet(s):
+    return infinity.rag_tokenizer.is_alphabet(s)
+
+
+def naive_qie(txt):
+    return infinity.rag_tokenizer.naive_qie(txt)
+
+
 tokenizer = RagTokenizer()
 tokenize = tokenizer.tokenize
 fine_grained_tokenize = tokenizer.fine_grained_tokenize