Code format.
This commit is contained in:
parent
bb46dcbbe3
commit
4e9d8f8e6f
2 changed files with 1 addition and 2 deletions
|
|
@@ -13,7 +13,6 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
|
|
|
|||
|
|
@@ -266,7 +266,7 @@ def is_chinese(text):
|
|||
|
||||
def tokenize(d, txt, eng):
|
||||
d["content_with_weight"] = txt
|
||||
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
||||
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)
|
||||
d["content_ltks"] = rag_tokenizer.tokenize(t)
|
||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue