Code format.

This commit is contained in:
Kevin Hu 2025-11-28 17:57:49 +08:00
parent bb46dcbbe3
commit 4e9d8f8e6f
2 changed files with 1 addition and 2 deletions

View file

@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import json
import logging
import re

View file

@ -266,7 +266,7 @@ def is_chinese(text):
def tokenize(d, txt, eng):
    """Store ``txt`` on chunk dict ``d`` together with its token fields.

    Args:
        d: Mutable mapping updated in place. The keys
            ``content_with_weight``, ``content_ltks`` and
            ``content_sm_ltks`` are written.
        txt: Chunk text. Stored unmodified under ``content_with_weight``.
        eng: Not used by this function; kept in the signature so existing
            callers keep working.
    """
    d["content_with_weight"] = txt
    # Replace opening/closing table, td, caption, tr and th tags (with at
    # most a short attribute part) by a space, so only the tokenized copy
    # loses the markup. The substitution must read from ``txt``: ``t`` does
    # not exist before this assignment.
    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)
    # NOTE(review): rag_tokenizer is not defined in the visible part of the
    # file; presumably imported at module level - confirm.
    d["content_ltks"] = rag_tokenizer.tokenize(t)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])