respect customized delimiters
This commit is contained in:
parent
0d5589bfda
commit
0074df72bc
1 changed file with 68 additions and 8 deletions
|
|
@ -608,6 +608,26 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
|
|||
cks[-1] += t
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
|
||||
has_custom = bool(custom_delimiters)
|
||||
if has_custom:
|
||||
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
|
||||
cks, tk_nums = [], []
|
||||
for sec, pos in sections:
|
||||
split_sec = re.split(r"(%s)" % custom_pattern, sec, flags=re.DOTALL)
|
||||
for sub_sec in split_sec:
|
||||
if re.fullmatch(custom_pattern, sub_sec or ""):
|
||||
continue
|
||||
text = "\n" + sub_sec
|
||||
local_pos = pos
|
||||
if num_tokens_from_string(text) < 8:
|
||||
local_pos = ""
|
||||
if local_pos and text.find(local_pos) < 0:
|
||||
text += local_pos
|
||||
cks.append(text)
|
||||
tk_nums.append(num_tokens_from_string(text))
|
||||
return cks
|
||||
|
||||
dels = get_delimiters(delimiter)
|
||||
for sec, pos in sections:
|
||||
if num_tokens_from_string(sec) < chunk_token_num:
|
||||
|
|
@ -657,6 +677,29 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
|||
result_images[-1] = concat_img(result_images[-1], image)
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
|
||||
has_custom = bool(custom_delimiters)
|
||||
if has_custom:
|
||||
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
|
||||
cks, result_images, tk_nums = [], [], []
|
||||
for text, image in zip(texts, images):
|
||||
text_str = text[0] if isinstance(text, tuple) else text
|
||||
text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else ""
|
||||
split_sec = re.split(r"(%s)" % custom_pattern, text_str)
|
||||
for sub_sec in split_sec:
|
||||
if re.fullmatch(custom_pattern, sub_sec or ""):
|
||||
continue
|
||||
text_seg = "\n" + sub_sec
|
||||
local_pos = text_pos
|
||||
if num_tokens_from_string(text_seg) < 8:
|
||||
local_pos = ""
|
||||
if local_pos and text_seg.find(local_pos) < 0:
|
||||
text_seg += local_pos
|
||||
cks.append(text_seg)
|
||||
result_images.append(image)
|
||||
tk_nums.append(num_tokens_from_string(text_seg))
|
||||
return cks, result_images
|
||||
|
||||
dels = get_delimiters(delimiter)
|
||||
for text, image in zip(texts, images):
|
||||
# if text is tuple, unpack it
|
||||
|
|
@ -748,6 +791,23 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
|||
images[-1] = concat_img(images[-1], image)
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
|
||||
has_custom = bool(custom_delimiters)
|
||||
if has_custom:
|
||||
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
|
||||
cks, images, tk_nums = [], [], []
|
||||
pattern = r"(%s)" % custom_pattern
|
||||
for sec, image in sections:
|
||||
split_sec = re.split(pattern, sec)
|
||||
for sub_sec in split_sec:
|
||||
if not sub_sec or re.fullmatch(custom_pattern, sub_sec):
|
||||
continue
|
||||
text_seg = "\n" + sub_sec
|
||||
cks.append(text_seg)
|
||||
images.append(image)
|
||||
tk_nums.append(num_tokens_from_string(text_seg))
|
||||
return cks, images
|
||||
|
||||
dels = get_delimiters(delimiter)
|
||||
pattern = r"(%s)" % dels
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue