diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py
index 0d392bccf..f4d327993 100644
--- a/api/apps/conversation_app.py
+++ b/api/apps/conversation_app.py
@@ -163,6 +163,7 @@ def completion():
del req["conversation_id"]
del req["messages"]
ans = chat(dia, msg, **req)
+ if not conv.reference: conv.reference = []
conv.reference.append(ans["reference"])
conv.message.append({"role": "assistant", "content": ans["answer"]})
ConversationService.update_by_id(conv.id, conv.to_dict())
diff --git a/api/apps/dialog_app.py b/api/apps/dialog_app.py
index cc6f9810a..fccc0ecbb 100644
--- a/api/apps/dialog_app.py
+++ b/api/apps/dialog_app.py
@@ -32,7 +32,6 @@ def set_dialog():
dialog_id = req.get("dialog_id")
name = req.get("name", "New Dialog")
description = req.get("description", "A helpful Dialog")
- language = req.get("language", "Chinese")
top_n = req.get("top_n", 6)
similarity_threshold = req.get("similarity_threshold", 0.1)
vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
@@ -80,7 +79,6 @@ def set_dialog():
"name": name,
"kb_ids": req["kb_ids"],
"description": description,
- "language": language,
"llm_id": llm_id,
"llm_setting": llm_setting,
"prompt_config": prompt_config,
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 0649adaaa..9ce7e2b11 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -272,7 +272,9 @@ def get(doc_id):
response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
ext = re.search(r"\.([^.]+)$", doc.name)
if ext:
- response.headers.set('Content-Type', 'application/%s'%ext.group(1))
+ if doc.type == FileType.VISUAL.value:
+ response.headers.set('Content-Type', 'image/%s' % ext.group(1))
+ else: response.headers.set('Content-Type', 'application/%s' % ext.group(1))
return response
except Exception as e:
return server_error_response(e)
diff --git a/api/db/db_models.py b/api/db/db_models.py
index 49aa169cf..9b2dc7ffb 100644
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@@ -464,6 +464,7 @@ class Knowledgebase(DataBaseModel):
avatar = TextField(null=True, help_text="avatar base64 string")
tenant_id = CharField(max_length=32, null=False)
name = CharField(max_length=128, null=False, help_text="KB name", index=True)
+ language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese")
description = TextField(null=True, help_text="KB description")
embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID")
permission = CharField(max_length=16, null=False, help_text="me|team", default="me")
diff --git a/api/db/services/llm_service.py b/api/db/services/llm_service.py
index 6bc1150d7..2a9647402 100644
--- a/api/db/services/llm_service.py
+++ b/api/db/services/llm_service.py
@@ -57,7 +57,7 @@ class TenantLLMService(CommonService):
@classmethod
@DB.connection_context()
- def model_instance(cls, tenant_id, llm_type, llm_name=None):
+ def model_instance(cls, tenant_id, llm_type, llm_name=None, lang="Chinese"):
e, tenant = TenantService.get_by_id(tenant_id)
if not e:
raise LookupError("Tenant not found")
@@ -87,7 +87,7 @@ class TenantLLMService(CommonService):
if model_config["llm_factory"] not in CvModel:
return
return CvModel[model_config["llm_factory"]](
- model_config["api_key"], model_config["llm_name"])
+ model_config["api_key"], model_config["llm_name"], lang)
if llm_type == LLMType.CHAT.value:
if model_config["llm_factory"] not in ChatModel:
@@ -120,11 +120,11 @@ class TenantLLMService(CommonService):
class LLMBundle(object):
- def __init__(self, tenant_id, llm_type, llm_name=None):
+ def __init__(self, tenant_id, llm_type, llm_name=None, lang="Chinese"):
self.tenant_id = tenant_id
self.llm_type = llm_type
self.llm_name = llm_name
- self.mdl = TenantLLMService.model_instance(tenant_id, llm_type, llm_name)
+ self.mdl = TenantLLMService.model_instance(tenant_id, llm_type, llm_name, lang=lang)
assert self.mdl, "Can't find model for {}/{}/{}".format(tenant_id, llm_type, llm_name)
def encode(self, texts: list, batch_size=32):
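The `lang` argument now threads from `LLMBundle` through `model_instance` into the CV model constructors, so vision prompts follow the knowledge base language. A minimal call-site sketch, as `rag/app/picture.py` uses it; the tenant ID and image file are placeholders:

```python
from api.db import LLMType
from api.db.services.llm_service import LLMBundle

# "tenant_123" and photo.jpg are placeholders for illustration.
cv_mdl = LLMBundle("tenant_123", LLMType.IMAGE2TEXT, lang="English")
with open("photo.jpg", "rb") as f:
    ans = cv_mdl.describe(f.read())  # description text, in English per lang="English"
```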
diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py
index 87e84a12a..a63130150 100644
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@@ -27,7 +27,24 @@ class TaskService(CommonService):
@classmethod
@DB.connection_context()
def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
- fields = [cls.model.id, cls.model.doc_id, cls.model.from_page,cls.model.to_page, Document.kb_id, Document.parser_id, Document.parser_config, Document.name, Document.type, Document.location, Document.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
+ fields = [
+ cls.model.id,
+ cls.model.doc_id,
+ cls.model.from_page,
+ cls.model.to_page,
+ Document.kb_id,
+ Document.parser_id,
+ Document.parser_config,
+ Document.name,
+ Document.type,
+ Document.location,
+ Document.size,
+ Knowledgebase.tenant_id,
+ Knowledgebase.language,
+ Tenant.embd_id,
+ Tenant.img2txt_id,
+ Tenant.asr_id,
+ cls.model.update_time]
docs = cls.model.select(*fields) \
.join(Document, on=(cls.model.doc_id == Document.id)) \
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
@@ -42,7 +59,6 @@ class TaskService(CommonService):
.paginate(1, items_per_page)
return list(docs.dicts())
-
@classmethod
@DB.connection_context()
def do_cancel(cls, id):
@@ -54,12 +70,11 @@ class TaskService(CommonService):
pass
return True
-
@classmethod
@DB.connection_context()
def update_progress(cls, id, info):
- cls.model.update(progress_msg=cls.model.progress_msg + "\n"+info["progress_msg"]).where(
+ cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
cls.model.id == id).execute()
if "progress" in info:
cls.model.update(progress=info["progress"]).where(
- cls.model.id == id).execute()
+ cls.model.id == id).execute()
diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py
index b504cfef3..1d6f15f33 100644
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@@ -167,7 +167,11 @@ def thumbnail(filename, blob):
return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
- return ("data:image/%s;base64,"%filename.split(".")[-1]) + base64.b64encode(Image.open(BytesIO(blob)).thumbnail((30, 30)).tobytes()).decode("utf-8")
+ image = Image.open(BytesIO(blob))
+ image.thumbnail((30, 30))
+ buffered = BytesIO()
+ image.save(buffered, format="png")
+ return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
if re.match(r".*\.(ppt|pptx)$", filename):
import aspose.slides as slides
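The removed one-liner had two bugs: Pillow's `Image.thumbnail()` resizes in place and returns `None`, so chaining `.tobytes()` raised `AttributeError`; and `tobytes()` yields raw pixel data, not an encoded PNG. The replacement buffers the resized image through `save()`. A self-contained sketch of the corrected pattern:

```python
import base64
from io import BytesIO
from PIL import Image

def png_data_url(blob: bytes, size=(30, 30)) -> str:
    image = Image.open(BytesIO(blob))
    image.thumbnail(size)               # in-place resize; returns None
    buffered = BytesIO()
    image.save(buffered, format="png")  # proper PNG encoding, not raw pixels
    return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
```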
diff --git a/deepdoc/README.md b/deepdoc/README.md
new file mode 100644
index 000000000..b2ad032dc
--- /dev/null
+++ b/deepdoc/README.md
@@ -0,0 +1,82 @@
+English | [简体中文](./README_zh.md)
+
+# *Deep*Doc
+
+---
+
+- [1. Introduction](#1-introduction)
+- [2. Vision](#2-vision)
+- [3. Parser](#3-parser)
+
+
+## 1. Introduction
+
+---
+With documents from various domains, in various formats, and with diverse retrieval requirements,
+accurate analysis becomes a very challenging task. *Deep*Doc was born for that purpose.
+There are 2 parts in *Deep*Doc so far: vision and parser.
+
+
+## 2. Vision
+
+---
+
+We use visual information to resolve problems the way a human being does.
+ - OCR. Since many documents are presented as images, or can at least be transformed into images,
+ OCR is an essential, fundamental, even universal solution for text extraction.
+
+
+

+
+
+ - Layout recognition. Documents from different domains may have various layouts;
+ newspapers, magazines, books and résumés, for example, are distinct in terms of layout.
+ Only with an accurate layout analysis can a machine decide whether text parts are successive,
+ whether a part needs Table Structure Recognition (TSR), or whether a part is a figure described by a nearby caption.
+ We have 10 basic layout components which cover most cases:
+ - Text
+ - Title
+ - Figure
+ - Figure caption
+ - Table
+ - Table caption
+ - Header
+ - Footer
+ - Reference
+ - Equation
+
+

+
+
+ - Table Structure Recognition (TSR). A data table is a frequently used structure for presenting data, including numbers and text.
+ The structure of a table can be very complex, with hierarchical headers, spanning cells and projected row headers.
+ Along with TSR, we also reassemble the content into sentences that an LLM can comprehend well.
+ We have five labels for the TSR task:
+ - Column
+ - Row
+ - Column header
+ - Projected row header
+ - Spanning cell
+
+

+
+
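+The OCR engine can be driven directly, as the new `rag/app/picture.py` chunker does. A minimal sketch; the image path is a placeholder:
+
+```python
+import numpy as np
+from PIL import Image
+from deepdoc.vision import OCR
+
+ocr = OCR()
+img = Image.open("sample.jpg")  # placeholder path
+boxes = ocr(np.array(img))      # (box, text_info) pairs; text_info[0] is the recognized text
+text = "\n".join([t[0] for _, t in boxes if t[0]])
+```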
+
+## 3. Parser
+
+---
+
+Four kinds of document formats, PDF, DOCX, EXCEL and PPT, have their corresponding parsers.
+The most complex one is the PDF parser, owing to PDF's flexibility. The output of the PDF parser includes:
+ - Text chunks with their own positions in the PDF (page number and rectangular coordinates).
+ - Tables, with images cropped from the PDF and contents already translated into natural language sentences.
+ - Figures, with captions and the text inside the figures.
+
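+Each parser is driven through a chunker in `rag/app`. A minimal sketch of the naive PDF chunker; the file name and callback are placeholders:
+
+```python
+from rag.app import naive
+
+def progress(prog=None, msg=""):  # placeholder progress callback
+    print(prog, msg)
+
+chunks = naive.chunk("manual.pdf", lang="English", callback=progress)
+```
+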
+### Résumé
+
+---
+The résumé is a very complicated kind of document. A résumé composed of unstructured text
+with various layouts can be resolved into structured data comprising nearly a hundred fields.
+We haven't open-sourced the parser yet, but we do open-source the processing method applied after the parsing procedure.
+
+
\ No newline at end of file
diff --git a/deepdoc/README_zh.md b/deepdoc/README_zh.md
new file mode 100644
index 000000000..c43cc5665
--- /dev/null
+++ b/deepdoc/README_zh.md
@@ -0,0 +1 @@
+[English](./README.md) | 简体中文
\ No newline at end of file
diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py
index 886f4ab9e..bbec40425 100644
--- a/deepdoc/parser/__init__.py
+++ b/deepdoc/parser/__init__.py
@@ -1,223 +1,8 @@
-import random
+
from .pdf_parser import HuParser as PdfParser
from .docx_parser import HuDocxParser as DocxParser
from .excel_parser import HuExcelParser as ExcelParser
-
-import re
-
-from nltk import word_tokenize
-
-from rag.nlp import stemmer, huqie
-from rag.utils import num_tokens_from_string
-
-BULLET_PATTERN = [[
- r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
- r"第[零一二三四五六七八九十百0-9]+章",
- r"第[零一二三四五六七八九十百0-9]+节",
- r"第[零一二三四五六七八九十百0-9]+条",
- r"[\((][零一二三四五六七八九十百]+[\))]",
-], [
- r"第[0-9]+章",
- r"第[0-9]+节",
- r"[0-9]{,3}[\. 、]",
- r"[0-9]{,2}\.[0-9]{,2}",
- r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
- r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
-], [
- r"第[零一二三四五六七八九十百0-9]+章",
- r"第[零一二三四五六七八九十百0-9]+节",
- r"[零一二三四五六七八九十百]+[ 、]",
- r"[\((][零一二三四五六七八九十百]+[\))]",
- r"[\((][0-9]{,2}[\))]",
-], [
- r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
- r"Chapter (I+V?|VI*|XI|IX|X)",
- r"Section [0-9]+",
- r"Article [0-9]+"
-]
-]
-
-def random_choices(arr, k):
- k = min(len(arr), k)
- return random.choices(arr, k=k)
-
-def bullets_category(sections):
- global BULLET_PATTERN
- hits = [0] * len(BULLET_PATTERN)
- for i, pro in enumerate(BULLET_PATTERN):
- for sec in sections:
- for p in pro:
- if re.match(p, sec):
- hits[i] += 1
- break
- maxium = 0
- res = -1
- for i, h in enumerate(hits):
- if h <= maxium: continue
- res = i
- maxium = h
- return res
-
-
-def is_english(texts):
- eng = 0
- for t in texts:
- if re.match(r"[a-zA-Z]{2,}", t.strip()):
- eng += 1
- if eng / len(texts) > 0.8:
- return True
- return False
-
-
-def tokenize(d, t, eng):
- d["content_with_weight"] = t
- if eng:
- t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
- d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
- else:
- d["content_ltks"] = huqie.qie(t)
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
-
-
-def remove_contents_table(sections, eng=False):
- i = 0
- while i < len(sections):
- def get(i):
- nonlocal sections
- return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
-
- if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
- re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
- i += 1
- continue
- sections.pop(i)
- if i >= len(sections): break
- prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
- while not prefix:
- sections.pop(i)
- if i >= len(sections): break
- prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
- sections.pop(i)
- if i >= len(sections) or not prefix: break
- for j in range(i, min(i + 128, len(sections))):
- if not re.match(prefix, get(j)):
- continue
- for _ in range(i, j): sections.pop(i)
- break
-
-
-def make_colon_as_title(sections):
- if not sections: return []
- if type(sections[0]) == type(""): return sections
- i = 0
- while i < len(sections):
- txt, layout = sections[i]
- i += 1
- txt = txt.split("@")[0].strip()
- if not txt:
- continue
- if txt[-1] not in "::":
- continue
- txt = txt[::-1]
- arr = re.split(r"([。?!!?;;]| .)", txt)
- if len(arr) < 2 or len(arr[1]) < 32:
- continue
- sections.insert(i - 1, (arr[0][::-1], "title"))
- i += 1
-
-
-def hierarchical_merge(bull, sections, depth):
- if not sections or bull < 0: return []
- if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
- sections = [(t,o) for t, o in sections if t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
- bullets_size = len(BULLET_PATTERN[bull])
- levels = [[] for _ in range(bullets_size + 2)]
-
- def not_title(txt):
- if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
- if len(txt) >= 128: return True
- return re.search(r"[,;,。;!!]", txt)
-
- for i, (txt, layout) in enumerate(sections):
- for j, p in enumerate(BULLET_PATTERN[bull]):
- if re.match(p, txt.strip()) and not not_title(txt):
- levels[j].append(i)
- break
- else:
- if re.search(r"(title|head)", layout):
- levels[bullets_size].append(i)
- else:
- levels[bullets_size + 1].append(i)
- sections = [t for t, _ in sections]
- for s in sections: print("--", s)
-
- def binary_search(arr, target):
- if not arr: return -1
- if target > arr[-1]: return len(arr) - 1
- if target < arr[0]: return -1
- s, e = 0, len(arr)
- while e - s > 1:
- i = (e + s) // 2
- if target > arr[i]:
- s = i
- continue
- elif target < arr[i]:
- e = i
- continue
- else:
- assert False
- return s
-
- cks = []
- readed = [False] * len(sections)
- levels = levels[::-1]
- for i, arr in enumerate(levels[:depth]):
- for j in arr:
- if readed[j]: continue
- readed[j] = True
- cks.append([j])
- if i + 1 == len(levels) - 1: continue
- for ii in range(i + 1, len(levels)):
- jj = binary_search(levels[ii], j)
- if jj < 0: continue
- if jj > cks[-1][-1]: cks[-1].pop(-1)
- cks[-1].append(levels[ii][jj])
- for ii in cks[-1]: readed[ii] = True
- for i in range(len(cks)):
- cks[i] = [sections[j] for j in cks[i][::-1]]
- print("--------------\n", "\n* ".join(cks[i]))
-
- return cks
-
-
-def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
- if not sections: return []
- if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
- cks = [""]
- tk_nums = [0]
- def add_chunk(t, pos):
- nonlocal cks, tk_nums, delimiter
- tnum = num_tokens_from_string(t)
- if tnum < 8: pos = ""
- if tk_nums[-1] > chunk_token_num:
- cks.append(t + pos)
- tk_nums.append(tnum)
- else:
- cks[-1] += t + pos
- tk_nums[-1] += tnum
-
- for sec, pos in sections:
- s, e = 0, 1
- while e < len(sec):
- if sec[e] in delimiter:
- add_chunk(sec[s: e+1], pos)
- s = e + 1
- e = s + 1
- else:
- e += 1
- if s < e: add_chunk(sec[s: e], pos)
-
- return cks
+from .ppt_parser import HuPptParser as PptParser
diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
new file mode 100644
index 000000000..80c6df34d
--- /dev/null
+++ b/deepdoc/parser/ppt_parser.py
@@ -0,0 +1,52 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from io import BytesIO
+from pptx import Presentation
+
+
+class HuPptParser(object):
+ def __init__(self):
+ super().__init__()
+
+ def __extract(self, shape):
+ if shape.shape_type == 19:
+ tb = shape.table
+ rows = []
+ for i in range(1, len(tb.rows)):
+ rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+ return "\n".join(rows)
+
+ if shape.has_text_frame:
+ return shape.text_frame.text
+
+ if shape.shape_type == 6:
+ texts = []
+ for p in shape.shapes:
+ t = self.__extract(p)
+ if t: texts.append(t)
+ return "\n".join(texts)
+
+ def __call__(self, fnm, from_page, to_page, callback=None):
+ ppt = Presentation(fnm) if isinstance(
+ fnm, str) else Presentation(
+ BytesIO(fnm))
+ txts = []
+ self.total_page = len(ppt.slides)
+ for i, slide in enumerate(ppt.slides[from_page: to_page]):
+ texts = []
+ for shape in slide.shapes:
+ txt = self.__extract(shape)
+ if txt: texts.append(txt)
+ txts.append("\n".join(texts))
+
+ return txts
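
The extracted parser keeps the callable interface of the old inline `Ppt` class from `rag/app/presentation.py`. A minimal usage sketch, with a placeholder file name:

```python
from deepdoc.parser import PptParser  # alias of HuPptParser

parser = PptParser()
# One text blob per slide in [from_page, to_page).
slide_texts = parser("deck.pptx", from_page=0, to_page=100000)
for i, txt in enumerate(slide_texts):
    print(f"--- slide {i} ---\n{txt}")
```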
diff --git a/deepdoc/vision/ocr.py b/deepdoc/vision/ocr.py
index 09c8a7a91..6e08d7c22 100644
--- a/deepdoc/vision/ocr.py
+++ b/deepdoc/vision/ocr.py
@@ -64,7 +64,11 @@ def load_model(model_dir, nm):
if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format(
model_file_path))
- sess = ort.InferenceSession(model_file_path)
+
+ if ort.get_device() == "GPU":
+ sess = ort.InferenceSession(model_file_path, providers=['CUDAExecutionProvider'])
+ else:
+ sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider'])
return sess, sess.get_inputs()[0]
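`onnxruntime.get_device()` reports "GPU" only when the GPU build is installed, so CUDA is requested only when it can actually be used. An equivalent sketch with the provider choice folded into one expression; the model path is a placeholder:

```python
import onnxruntime as ort

model_file_path = "det.onnx"  # placeholder
providers = (['CUDAExecutionProvider'] if ort.get_device() == "GPU"
             else ['CPUExecutionProvider'])
sess = ort.InferenceSession(model_file_path, providers=providers)
```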
diff --git a/rag/app/book.py b/rag/app/book.py
index c9996aeb3..2f0066889 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -12,7 +12,7 @@
#
import copy
import re
-from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, \
+from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices
from rag.nlp import huqie
from deepdoc.parser import PdfParser, DocxParser
@@ -47,7 +47,7 @@ class Pdf(PdfParser):
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
@@ -94,7 +94,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
sections = [t for t, _ in sections]
# is it English
- eng = is_english(random_choices(sections, k=218))
+ eng = lang.lower() == "english"  # is_english(random_choices(sections, k=218))
res = []
# add tables
diff --git a/rag/app/laws.py b/rag/app/laws.py
index f24987086..0f2066d5c 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -14,7 +14,7 @@ import copy
import re
from io import BytesIO
from docx import Document
-from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
+from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
make_colon_as_title
from rag.nlp import huqie
from deepdoc.parser import PdfParser, DocxParser
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
"""
@@ -106,7 +106,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
# is it English
- eng = is_english(sections)
+ eng = lang.lower() == "english"  # is_english(sections)
# Remove 'Contents' part
remove_contents_table(sections, eng)
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 9b051ec93..14fbf0be8 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -1,7 +1,6 @@
import copy
import re
-from deepdoc.parser import tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, tokenize
from deepdoc.parser import PdfParser
from rag.utils import num_tokens_from_string
@@ -57,7 +56,7 @@ class Pdf(PdfParser):
return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
"""
@@ -74,7 +73,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
# is it English
- eng = pdf_parser.is_english
+ eng = lang.lower() == "english"  # pdf_parser.is_english
res = []
# add tables
diff --git a/rag/app/naive.py b/rag/app/naive.py
index aceb22f26..0446217c8 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -13,8 +13,7 @@
import copy
import re
from rag.app import laws
-from deepdoc.parser import is_english, tokenize, naive_merge
-from rag.nlp import huqie
+from rag.nlp import huqie, is_english, tokenize, naive_merge
from deepdoc.parser import PdfParser
from rag.settings import cron_logger
@@ -38,7 +37,7 @@ class Pdf(PdfParser):
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
This method apply the naive ways to chunk files.
@@ -80,7 +79,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
- eng = is_english(cks)
+ eng = lang.lower() == "english"  # is_english(cks)
res = []
# wrap up to es documents
for ck in cks:
diff --git a/rag/app/paper.py b/rag/app/paper.py
index ac9afd226..a050c1105 100644
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -15,8 +15,7 @@ import re
from collections import Counter
from api.db import ParserType
-from deepdoc.parser import tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, tokenize
from deepdoc.parser import PdfParser
import numpy as np
from rag.utils import num_tokens_from_string
@@ -140,7 +139,7 @@ class Pdf(PdfParser):
}
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
@@ -156,7 +155,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
# is it English
- eng = pdf_parser.is_english
+ eng = lang.lower() == "english"  # pdf_parser.is_english
print("It's English.....", eng)
res = []
diff --git a/rag/app/picture.py b/rag/app/picture.py
new file mode 100644
index 000000000..fdaccc258
--- /dev/null
+++ b/rag/app/picture.py
@@ -0,0 +1,56 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import io
+
+import numpy as np
+from PIL import Image
+
+from api.db import LLMType
+from api.db.services.llm_service import LLMBundle
+from rag.nlp import tokenize
+from deepdoc.vision import OCR
+
+ocr = OCR()
+
+
+def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
+ try:
+ cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
+ except Exception as e:
+ callback(prog=-1, msg=str(e))
+ return []
+ img = Image.open(io.BytesIO(binary))
+ doc = {
+ "docnm_kwd": filename,
+ "image": img
+ }
+ bxs = ocr(np.array(img))
+ txt = "\n".join([t[0] for _, t in bxs if t[0]])
+ eng = lang.lower() == "english"
+ callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
+ if (eng and len(txt.split(" ")) > 32) or len(txt) > 32:
+ tokenize(doc, txt, eng)
+ callback(0.8, "OCR results is too long to use CV LLM.")
+ return [doc]
+
+ try:
+ callback(0.4, "Use CV LLM to describe the picture.")
+ ans = cv_mdl.describe(binary)
+ callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
+ txt += "\n" + ans
+ tokenize(doc, txt, eng)
+ return [doc]
+ except Exception as e:
+ callback(prog=-1, msg=str(e))
+
+ return []
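The chunker prefers OCR text and falls back to the image-to-text LLM only when OCR yields too little. A minimal usage sketch; the tenant ID, file, and callback are placeholders:

```python
from rag.app import picture

def progress(prog=None, msg=""):  # placeholder callback
    print(prog, msg)

with open("photo.jpg", "rb") as f:  # placeholder image
    docs = picture.chunk("photo.jpg", f.read(), "tenant_123",
                         lang="English", callback=progress)
```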
diff --git a/rag/app/presentation.py b/rag/app/presentation.py
index 2cb660663..a82e514a0 100644
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -13,46 +13,14 @@
import copy
import re
from io import BytesIO
-from pptx import Presentation
-from deepdoc.parser import tokenize, is_english
+from rag.nlp import tokenize, is_english
from rag.nlp import huqie
-from deepdoc.parser import PdfParser
+from deepdoc.parser import PdfParser, PptParser
-class Ppt(object):
- def __init__(self):
- super().__init__()
-
- def __extract(self, shape):
- if shape.shape_type == 19:
- tb = shape.table
- rows = []
- for i in range(1, len(tb.rows)):
- rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
- return "\n".join(rows)
-
- if shape.has_text_frame:
- return shape.text_frame.text
-
- if shape.shape_type == 6:
- texts = []
- for p in shape.shapes:
- t = self.__extract(p)
- if t: texts.append(t)
- return "\n".join(texts)
-
+class Ppt(PptParser):
def __call__(self, fnm, from_page, to_page, callback=None):
- ppt = Presentation(fnm) if isinstance(
- fnm, str) else Presentation(
- BytesIO(fnm))
- txts = []
- self.total_page = len(ppt.slides)
- for i, slide in enumerate(ppt.slides[from_page: to_page]):
- texts = []
- for shape in slide.shapes:
- txt = self.__extract(shape)
- if txt: texts.append(txt)
- txts.append("\n".join(texts))
+ txts = super().__call__(fnm, from_page, to_page)
callback(0.5, "Text extraction finished.")
import aspose.slides as slides
diff --git a/rag/app/qa.py b/rag/app/qa.py
index 34615a8e0..e649b4a23 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -14,7 +14,7 @@ import re
from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
-from deepdoc.parser import is_english, random_choices
+from rag.nlp import is_english, random_choices
from rag.nlp import huqie, stemmer
from deepdoc.parser import ExcelParser
@@ -81,7 +81,7 @@ def beAdoc(d, q, a, eng):
return d
-def chunk(filename, binary=None, callback=None, **kwargs):
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
If the file is in excel format, there should be 2 column question and answer without header.
@@ -113,7 +113,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
break
txt += l
lines = txt.split("\n")
- eng = is_english([rmPrefix(l) for l in lines[:100]])
+ eng = lang.lower() == "english"  # is_english([rmPrefix(l) for l in lines[:100]])
fails = []
for i, line in enumerate(lines):
arr = [l for l in line.split("\t") if len(l) > 1]
diff --git a/rag/app/table.py b/rag/app/table.py
index 635284308..68b2d3350 100644
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -20,8 +20,7 @@ from openpyxl import load_workbook
from dateutil.parser import parse as datetime_parse
from api.db.services.knowledgebase_service import KnowledgebaseService
-from deepdoc.parser import is_english, tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, is_english, tokenize
from deepdoc.parser import ExcelParser
@@ -112,7 +111,7 @@ def column_data_type(arr):
return arr, ty
-def chunk(filename, binary=None, callback=None, **kwargs):
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
For csv or txt file, the delimiter between columns is TAB.
@@ -192,7 +191,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j])
for i in range(len(clmns))]
- eng = is_english(txts)
+ eng = lang.lower() == "english"  # is_english(txts)
for ii, row in df.iterrows():
d = {}
row_txt = []
diff --git a/rag/llm/cv_model.py b/rag/llm/cv_model.py
index d663f90cc..02298c6a6 100644
--- a/rag/llm/cv_model.py
+++ b/rag/llm/cv_model.py
@@ -13,12 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import io
from abc import ABC
+
+from PIL import Image
from openai import OpenAI
import os
import base64
from io import BytesIO
+from api.utils import get_uuid
+from api.utils.file_utils import get_project_base_directory
+
class Base(ABC):
def __init__(self, key, model_name):
@@ -44,25 +50,26 @@ class Base(ABC):
{
"role": "user",
"content": [
- {
- "type": "text",
- "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等。",
- },
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{b64}"
},
},
+ {
+ "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
+ "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
+ },
],
}
]
class GptV4(Base):
- def __init__(self, key, model_name="gpt-4-vision-preview"):
+ def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese"):
self.client = OpenAI(api_key=key)
self.model_name = model_name
+ self.lang = lang
def describe(self, image, max_tokens=300):
b64 = self.image2base64(image)
@@ -76,18 +83,40 @@ class GptV4(Base):
class QWenCV(Base):
- def __init__(self, key, model_name="qwen-vl-chat-v1"):
+ def __init__(self, key, model_name="qwen-vl-chat-v1", lang="Chinese"):
import dashscope
dashscope.api_key = key
self.model_name = model_name
+ self.lang = lang
+
+ def prompt(self, binary):
+ # dashscope's multimodal API expects a local file path, so buffer the image to a temp file first
+ tmp_dir = get_project_base_directory("tmp")
+ if not os.path.exists(tmp_dir): os.mkdir(tmp_dir)
+ path = os.path.join(tmp_dir, "%s.jpg" % get_uuid())
+ Image.open(io.BytesIO(binary)).save(path)
+ return [
+ {
+ "role": "user",
+ "content": [
+ {
+ "image": f"file://{path}"
+ },
+ {
+ "text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
+ "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
+ },
+ ],
+ }
+ ]
def describe(self, image, max_tokens=300):
from http import HTTPStatus
from dashscope import MultiModalConversation
response = MultiModalConversation.call(model=self.model_name,
- messages=self.prompt(self.image2base64(image)))
+ messages=self.prompt(image))
if response.status_code == HTTPStatus.OK:
- return response.output.choices[0]['message']['content'], response.usage.output_tokens
+ return response.output.choices[0]['message']['content'][0]["text"], response.usage.output_tokens
return response.message, 0
@@ -95,9 +124,10 @@ from zhipuai import ZhipuAI
class Zhipu4V(Base):
- def __init__(self, key, model_name="glm-4v"):
+ def __init__(self, key, model_name="glm-4v", lang="Chinese"):
self.client = ZhipuAI(api_key=key)
self.model_name = model_name
+ self.lang = lang
def describe(self, image, max_tokens=1024):
b64 = self.image2base64(image)
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 37f5d0e21..2d4bb2d25 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -5,3 +5,219 @@ retrievaler = search.Dealer(ELASTICSEARCH)
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
+
+import re
+from nltk import word_tokenize
+from . import huqie
+from rag.utils import num_tokens_from_string
+import random
+
+BULLET_PATTERN = [[
+ r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
+ r"第[零一二三四五六七八九十百0-9]+章",
+ r"第[零一二三四五六七八九十百0-9]+节",
+ r"第[零一二三四五六七八九十百0-9]+条",
+ r"[\((][零一二三四五六七八九十百]+[\))]",
+], [
+ r"第[0-9]+章",
+ r"第[0-9]+节",
+ r"[0-9]{,3}[\. 、]",
+ r"[0-9]{,2}\.[0-9]{,2}",
+ r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
+ r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
+], [
+ r"第[零一二三四五六七八九十百0-9]+章",
+ r"第[零一二三四五六七八九十百0-9]+节",
+ r"[零一二三四五六七八九十百]+[ 、]",
+ r"[\((][零一二三四五六七八九十百]+[\))]",
+ r"[\((][0-9]{,2}[\))]",
+], [
+ r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
+ r"Chapter (I+V?|VI*|XI|IX|X)",
+ r"Section [0-9]+",
+ r"Article [0-9]+"
+]
+]
+
+def random_choices(arr, k):
+ k = min(len(arr), k)
+ return random.choices(arr, k=k)
+
+def bullets_category(sections):
+ global BULLET_PATTERN
+ hits = [0] * len(BULLET_PATTERN)
+ for i, pro in enumerate(BULLET_PATTERN):
+ for sec in sections:
+ for p in pro:
+ if re.match(p, sec):
+ hits[i] += 1
+ break
+ maxium = 0
+ res = -1
+ for i, h in enumerate(hits):
+ if h <= maxium: continue
+ res = i
+ maxium = h
+ return res
+
+
+def is_english(texts):
+ eng = 0
+ for t in texts:
+ if re.match(r"[a-zA-Z]{2,}", t.strip()):
+ eng += 1
+ if eng / len(texts) > 0.8:
+ return True
+ return False
+
+
+def tokenize(d, t, eng):
+ d["content_with_weight"] = t
+ if eng:
+ t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
+ d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
+ else:
+ d["content_ltks"] = huqie.qie(t)
+ d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+
+
+def remove_contents_table(sections, eng=False):
+ i = 0
+ while i < len(sections):
+ def get(i):
+ nonlocal sections
+ return (sections[i] if type(sections[i]) == type("") else sections[i][0]).strip()
+
+ if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
+ re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0], re.IGNORECASE)):
+ i += 1
+ continue
+ sections.pop(i)
+ if i >= len(sections): break
+ prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+ while not prefix:
+ sections.pop(i)
+ if i >= len(sections): break
+ prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+ sections.pop(i)
+ if i >= len(sections) or not prefix: break
+ for j in range(i, min(i + 128, len(sections))):
+ if not re.match(prefix, get(j)):
+ continue
+ for _ in range(i, j): sections.pop(i)
+ break
+
+
+def make_colon_as_title(sections):
+ if not sections: return []
+ if type(sections[0]) == type(""): return sections
+ i = 0
+ while i < len(sections):
+ txt, layout = sections[i]
+ i += 1
+ txt = txt.split("@")[0].strip()
+ if not txt:
+ continue
+ if txt[-1] not in "::":
+ continue
+ txt = txt[::-1]
+ arr = re.split(r"([。?!!?;;]| .)", txt)
+ if len(arr) < 2 or len(arr[1]) < 32:
+ continue
+ sections.insert(i - 1, (arr[0][::-1], "title"))
+ i += 1
+
+
+def hierarchical_merge(bull, sections, depth):
+ if not sections or bull < 0: return []
+ if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
+ sections = [(t,o) for t, o in sections if t and len(t.split("@")[0].strip()) > 1 and not re.match(r"[0-9]+$", t.split("@")[0].strip())]
+ bullets_size = len(BULLET_PATTERN[bull])
+ levels = [[] for _ in range(bullets_size + 2)]
+
+ def not_title(txt):
+ if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
+ if len(txt) >= 128: return True
+ return re.search(r"[,;,。;!!]", txt)
+
+ for i, (txt, layout) in enumerate(sections):
+ for j, p in enumerate(BULLET_PATTERN[bull]):
+ if re.match(p, txt.strip()) and not not_title(txt):
+ levels[j].append(i)
+ break
+ else:
+ if re.search(r"(title|head)", layout):
+ levels[bullets_size].append(i)
+ else:
+ levels[bullets_size + 1].append(i)
+ sections = [t for t, _ in sections]
+ for s in sections: print("--", s)
+
+ def binary_search(arr, target):
+ if not arr: return -1
+ if target > arr[-1]: return len(arr) - 1
+ if target < arr[0]: return -1
+ s, e = 0, len(arr)
+ while e - s > 1:
+ i = (e + s) // 2
+ if target > arr[i]:
+ s = i
+ continue
+ elif target < arr[i]:
+ e = i
+ continue
+ else:
+ assert False
+ return s
+
+ cks = []
+ readed = [False] * len(sections)
+ levels = levels[::-1]
+ for i, arr in enumerate(levels[:depth]):
+ for j in arr:
+ if readed[j]: continue
+ readed[j] = True
+ cks.append([j])
+ if i + 1 == len(levels) - 1: continue
+ for ii in range(i + 1, len(levels)):
+ jj = binary_search(levels[ii], j)
+ if jj < 0: continue
+ if jj > cks[-1][-1]: cks[-1].pop(-1)
+ cks[-1].append(levels[ii][jj])
+ for ii in cks[-1]: readed[ii] = True
+ for i in range(len(cks)):
+ cks[i] = [sections[j] for j in cks[i][::-1]]
+ print("--------------\n", "\n* ".join(cks[i]))
+
+ return cks
+
+
+def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
+ if not sections: return []
+ if type(sections[0]) == type(""): sections = [(s, "") for s in sections]
+ cks = [""]
+ tk_nums = [0]
+ def add_chunk(t, pos):
+ nonlocal cks, tk_nums, delimiter
+ tnum = num_tokens_from_string(t)
+ if tnum < 8: pos = ""
+ if tk_nums[-1] > chunk_token_num:
+ cks.append(t + pos)
+ tk_nums.append(tnum)
+ else:
+ cks[-1] += t + pos
+ tk_nums[-1] += tnum
+
+ for sec, pos in sections:
+ s, e = 0, 1
+ while e < len(sec):
+ if sec[e] in delimiter:
+ add_chunk(sec[s: e+1], pos)
+ s = e + 1
+ e = s + 1
+ else:
+ e += 1
+ if s < e: add_chunk(sec[s: e], pos)
+
+ return cks
+
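With the helpers relocated to `rag.nlp`, chunkers import them alongside `huqie`. A minimal sketch of the merge-and-tokenize flow, with made-up sample sections:

```python
from rag.nlp import naive_merge, is_english, tokenize

sections = ["Chapter I. Introduction.", "Some body text follows here."]  # made-up samples
cks = naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?")
d = {}
tokenize(d, cks[0], eng=is_english(sections))  # fills content_ltks / content_sm_ltks
```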
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index c75260a02..042e2d42e 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -21,6 +21,7 @@ import hashlib
import copy
import re
import sys
+import traceback
from functools import partial
from timeit import default_timer as timer
@@ -36,7 +37,7 @@ from rag.nlp import search
from io import BytesIO
import pandas as pd
-from rag.app import laws, paper, presentation, manual, qa, table, book, resume
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture
from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService
@@ -56,47 +57,31 @@ FACTORY = {
ParserType.QA.value: qa,
ParserType.TABLE.value: table,
ParserType.RESUME.value: resume,
+ ParserType.PICTURE.value: picture,
}
-def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
+def set_progress(task_id, from_page=0, to_page=-1,
+ prog=None, msg="Processing..."):
+ if prog is not None and prog < 0:
+ msg = "[ERROR]"+msg
cancel = TaskService.do_cancel(task_id)
if cancel:
msg += " [Canceled]"
prog = -1
- if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
+ if to_page > 0:
+ msg = f"Page({from_page}~{to_page}): " + msg
d = {"progress_msg": msg}
- if prog is not None: d["progress"] = prog
+ if prog is not None:
+ d["progress"] = prog
try:
TaskService.update_progress(task_id, d)
except Exception as e:
cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
- if cancel:sys.exit()
-
-
-"""
-def chuck_doc(name, binary, tenant_id, cvmdl=None):
- suff = os.path.split(name)[-1].lower().split(".")[-1]
- if suff.find("pdf") >= 0:
- return PDF(binary)
- if suff.find("doc") >= 0:
- return DOC(binary)
- if re.match(r"(xlsx|xlsm|xltx|xltm)", suff):
- return EXC(binary)
- if suff.find("ppt") >= 0:
- return PPT(binary)
- if cvmdl and re.search(r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$",
- name.lower()):
- txt = cvmdl.describe(binary)
- field = TextChunker.Fields()
- field.text_chunks = [(txt, binary)]
- field.table_chunks = []
- return field
-
- return TextChunker()(binary)
-"""
+ if cancel:
+ sys.exit()
def collect(comm, mod, tm):
@@ -109,29 +94,38 @@ def collect(comm, mod, tm):
return tasks
-def build(row, cvmdl):
+def build(row):
if row["size"] > DOC_MAXIMUM_SIZE:
set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
(int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
return []
- callback = partial(set_progress, row["id"], row["from_page"], row["to_page"])
+ callback = partial(
+ set_progress,
+ row["id"],
+ row["from_page"],
+ row["to_page"])
chunker = FACTORY[row["parser_id"].lower()]
try:
- cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
- cks = chunker.chunk(row["name"], binary = MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], to_page=row["to_page"],
- callback = callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
+ cron_logger.info(
+ "Chunkking {}/{}".format(row["location"], row["name"]))
+ cks = chunker.chunk(row["name"], binary=MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"],
+ to_page=row["to_page"], lang=row["language"], callback=callback,
+ kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
except Exception as e:
if re.search("(No such file|not found)", str(e)):
callback(-1, "Can not find file <%s>" % row["doc_name"])
else:
- callback(-1, f"Internal server error: %s" % str(e).replace("'", ""))
+ callback(-1, f"Internal server error: %s" %
+ str(e).replace("'", ""))
+ traceback.print_exc()
- cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
+ cron_logger.warn(
+ "Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
return
- callback(msg="Finished slicing files. Start to embedding the content.")
+ callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
docs = []
doc = {
@@ -142,7 +136,8 @@ def build(row, cvmdl):
d = copy.deepcopy(doc)
d.update(ck)
md5 = hashlib.md5()
- md5.update((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8"))
+ md5.update((ck["content_with_weight"] +
+ str(d["doc_id"])).encode("utf-8"))
d["_id"] = md5.hexdigest()
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
@@ -173,7 +168,8 @@ def init_kb(row):
def embedding(docs, mdl, parser_config={}):
- tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [d["content_with_weight"] for d in docs]
+ tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
+ d["content_with_weight"] for d in docs]
tk_count = 0
if len(tts) == len(cnts):
tts, c = mdl.encode(tts)
@@ -182,7 +178,8 @@ def embedding(docs, mdl, parser_config={}):
cnts, c = mdl.encode(cnts)
tk_count += c
title_w = float(parser_config.get("filename_embd_weight", 0.1))
- vects = (title_w * tts + (1-title_w) * cnts) if len(tts) == len(cnts) else cnts
+ vects = (title_w * tts + (1 - title_w) *
+ cnts) if len(tts) == len(cnts) else cnts
assert len(vects) == len(docs)
for i, d in enumerate(docs):
@@ -192,7 +189,10 @@ def embedding(docs, mdl, parser_config={}):
def main(comm, mod):
- tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"{comm}-{mod}.tm")
+ tm_fnm = os.path.join(
+ get_project_base_directory(),
+ "rag/res",
+ f"{comm}-{mod}.tm")
tm = findMaxTm(tm_fnm)
rows = collect(comm, mod, tm)
if len(rows) == 0:
@@ -203,15 +203,13 @@ def main(comm, mod):
callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
try:
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING)
- cv_mdl = LLMBundle(r["tenant_id"], LLMType.IMAGE2TEXT)
- # TODO: sequence2text model
except Exception as e:
callback(prog=-1, msg=str(e))
continue
- st_tm = timer()
- cks = build(r, cv_mdl)
- if cks is None:continue
+ cks = build(r)
+ if cks is None:
+ continue
if not cks:
tmf.write(str(r["update_time"]) + "\n")
callback(1., "No chunk! Done!")
@@ -233,11 +231,15 @@ def main(comm, mod):
cron_logger.error(str(es_r))
else:
if TaskService.do_cancel(r["id"]):
- ELASTICSEARCH.deleteByQuery(Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
+ ELASTICSEARCH.deleteByQuery(
+ Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
continue
callback(1., "Done!")
- DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
- cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))
+ DocumentService.increment_chunk_num(
+ r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
+ cron_logger.info(
+ "Chunk doc({}), token({}), chunks({})".format(
+ r["id"], tk_count, len(cks)))
tmf.write(str(r["update_time"]) + "\n")
tmf.close()