Code format.
This commit is contained in:
parent
bb46dcbbe3
commit
4e9d8f8e6f
2 changed files with 1 addition and 2 deletions
|
|
@@ -13,7 +13,6 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
import base64
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
|
||||||
|
|
@@ -266,7 +266,7 @@ def is_chinese(text):
|
||||||
|
|
||||||
def tokenize(d, txt, eng):
|
def tokenize(d, txt, eng):
|
||||||
d["content_with_weight"] = txt
|
d["content_with_weight"] = txt
|
||||||
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)
|
||||||
d["content_ltks"] = rag_tokenizer.tokenize(t)
|
d["content_ltks"] = rag_tokenizer.tokenize(t)
|
||||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue