add field progress msg into docinfo; add file processing procedure

2023-12-18 16:51:55 +08:00 · 2023-12-18 16:51:55 +08:00 · 080a29912c
commit 080a29912c
parent b72bbdb76d
12 changed files with 275 additions and 21 deletions
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@ -1,7 +1,7 @@
 version: '2.2'
 services:
  es01:
-    container_name: docass-es-01
+    container_name: docgpt-es-01
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - esdata01:/usr/share/elasticsearch/data
@ -20,14 +20,14 @@ services:
        soft: -1
        hard: -1
    networks:
-      - docass
+      - docgpt
    restart: always

  kibana:
    depends_on: 
      - es01
    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
-    container_name: docass-kibana
+    container_name: docgpt-kibana
    volumes:
      - kibanadata:/usr/share/kibana/data
    ports:
@ -37,21 +37,21 @@ services:
      - ELASTICSEARCH_HOSTS=http://es01:9200
    mem_limit: ${MEM_LIMIT}
    networks:
-      - docass
+      - docgpt

  postgres:
    image: postgres
-    container_name: docass-postgres
+    container_name: docgpt-postgres
    environment:
      - POSTGRES_USER=${POSTGRES_USER}
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
      - POSTGRES_DB=${POSTGRES_DB}
    ports:
-      - 5455:5455
+      - 5455:5432
    volumes:
-      - pg_data:/usr/share/elasticsearch/data
+      - pg_data:/var/lib/postgresql/data
    networks:
-      - docass
+      - docgpt
    restart: always


@ -64,5 +64,5 @@ volumes:
    driver: local

 networks:
-  docass:
+  docgpt:
    driver: bridge
--- a/migration/src/m20220101_000001_create_table.rs
+++ b/migration/src/m20220101_000001_create_table.rs
@ -134,6 +134,7 @@ impl MigrationTrait for Migration {
                    .col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
                    .col(ColumnDef::new(DocInfo::Type).string().not_null()).comment("doc|folder")
                    .col(ColumnDef::new(DocInfo::KbProgress).float().default(0))
+                    .col(ColumnDef::new(DocInfo::KbProgressMsg).string().default(""))
                    .col(ColumnDef::new(DocInfo::CreatedAt).date().not_null())
                    .col(ColumnDef::new(DocInfo::UpdatedAt).date().not_null())
                    .col(ColumnDef::new(DocInfo::IsDeleted).boolean().default(false))
@ -285,6 +286,7 @@ enum DocInfo {
    Size,
    Type,
    KbProgress,
+    KbProgressMsg,
    CreatedAt,
    UpdatedAt,
    IsDeleted,
@ -300,4 +302,4 @@ enum DialogInfo {
    CreatedAt,
    UpdatedAt,
    IsDeleted,
-}
+}
--- a/python/conf/sys.cnf
+++ b/python/conf/sys.cnf
@ -1,8 +1,7 @@
-[online]
+[infiniflow]
 es=127.0.0.1:9200
-idx_nm=toxic
 pgdb_usr=root
-pgdb_pwd=infiniflow_docass
+pgdb_pwd=infiniflow_docgpt
 pgdb_host=127.0.0.1
-pgdb_port=5432
+pgdb_port=5455

--- a/python/nlp/huchunk.py
+++ b/python/nlp/huchunk.py
@ -359,6 +359,47 @@ class ExcelChunker(HuChunker):
        return flds


+class PptChunker(HuChunker):
+
+    @dataclass
+    class Fields:
+        text_chunks: List = None
+        table_chunks: List = None
+
+    def __init__(self):
+        super().__init__()
+
+    def __call__(self, fnm):
+        from pptx import Presentation
+        ppt = Presentation(fnm)
+        flds = self.Fields()
+        for slide in ppt.slides:
+            for shape in slide.shapes:
+                if hasattr(shape, "text"):
+                    flds.text_chunks.append((shape.text, None))
+        flds.table_chunks = []
+        return flds
+
+
+class TextChunker(HuChunker):
+
+    @dataclass
+    class Fields:
+        text_chunks: List = None
+        table_chunks: List = None
+
+    def __init__(self):
+        super().__init__()
+
+    def __call__(self, fnm):
+        flds = self.Fields()
+        with open(fnm, "r") as f:
+            txt = f.read()
+            flds.text_chunks = self.naive_text_chunk(txt)
+        flds.table_chunks = []
+        return flds
+
+
 if __name__ == "__main__":
    import sys
    sys.path.append(os.path.dirname(__file__) + "/../")
--- a/python/svr/parse_user_docs.py
+++ b/python/svr/parse_user_docs.py
@ -0,0 +1,171 @@
+import json, re, sys, os, hashlib, copy, glob, util, time, random
+from util.es_conn import HuEs, Postgres
+from util import rmSpace, findMaxDt
+from FlagEmbedding import FlagModel
+from nlp import huchunk, huqie
+import base64, hashlib
+from io import BytesIO
+from elasticsearch_dsl import Q
+from parser import (
+    PdfParser,
+    DocxParser,
+    ExcelParser
+)
+from nlp.huchunk import (
+    PdfChunker,
+    DocxChunker,
+    ExcelChunker,
+    PptChunker,
+    TextChunker
+)
+
+ES = HuEs("infiniflow")
+BATCH_SIZE = 64
+PG = Postgres("infiniflow", "docgpt")
+
+PDF = PdfChunker(PdfParser())
+DOC = DocxChunker(DocxParser())
+EXC = ExcelChunker(ExcelParser())
+PPT = PptChunker()
+
+
+def chuck_doc(name):
+    name = os.path.split(name)[-1].lower().split(".")[-1]
+    if name.find("pdf") >= 0: return PDF(name)
+    if name.find("doc") >= 0: return DOC(name)
+    if name.find("xlsx") >= 0: return EXC(name)
+    if name.find("ppt") >= 0: return PDF(name)
+    if name.find("pdf") >= 0: return PPT(name)
+    
+    if re.match(r"(txt|csv)", name): return TextChunker(name)
+
+
+def collect(comm, mod, tm):
+    sql = f"""
+    select 
+    did,
+    uid,
+    doc_name,
+    location,
+    updated_at
+    from docinfo
+    where 
+    updated_at >= '{tm}' 
+    and kb_progress = 0
+    and type = 'doc'
+    and MOD(uid, {comm}) = {mod}
+    order by updated_at asc
+    limit 1000
+    """
+    df = PG.select(sql)
+    df = df.fillna("")
+    mtm = str(df["updated_at"].max())[:19]
+    print("TOTAL:", len(df), "To: ", mtm)
+    return df, mtm
+
+
+def set_progress(did, prog, msg):
+    sql = f"""
+    update docinfo set kb_progress={prog}, kb_progress_msg='{msg}' where did={did}
+    """
+    PG.update(sql)
+
+
+def build(row):
+    if row["size"] > 256000000:
+        set_progress(row["did"], -1, "File size exceeds( <= 256Mb )")
+        return  []
+    doc = {
+        "doc_id": row["did"],
+        "title_tks": huqie.qie(os.path.split(row["location"])[-1]),
+        "updated_at": row["updated_at"]
+    }
+    random.seed(time.time())
+    set_progress(row["did"], random.randint(0, 20)/100., "Finished preparing! Start to slice file!")
+    obj = chuck_doc(row["location"])
+    if not obj: 
+        set_progress(row["did"], -1, "Unsuported file type.")
+        return  []
+
+    set_progress(row["did"], random.randint(20, 60)/100.)
+
+    output_buffer = BytesIO()
+    docs = []
+    md5 = hashlib.md5()
+    for txt, img in obj.text_chunks:
+        d = copy.deepcopy(doc)
+        md5.update((txt + str(d["doc_id"])).encode("utf-8"))
+        d["_id"] = md5.hexdigest()
+        d["content_ltks"] = huqie.qie(txt)
+        d["docnm_kwd"] = rmSpace(d["docnm_tks"])
+        if not img:
+            docs.append(d)
+            continue
+        img.save(output_buffer, format='JPEG')
+        d["img_bin"] = base64.b64encode(output_buffer.getvalue())
+        docs.append(d)
+
+    for arr, img in obj.table_chunks:
+        for i, txt in enumerate(arr):
+            d = copy.deepcopy(doc)
+            d["content_ltks"] = huqie.qie(txt)
+            md5.update((txt + str(d["doc_id"])).encode("utf-8"))
+            d["_id"] = md5.hexdigest()
+            if not img:
+                docs.append(d)
+                continue
+            img.save(output_buffer, format='JPEG')
+            d["img_bin"] = base64.b64encode(output_buffer.getvalue())
+            docs.append(d)
+    set_progress(row["did"], random.randint(60, 70)/100., "Finished slicing. Start to embedding the content.")
+
+    return docs
+
+
+def index_name(uid):return f"docgpt_{uid}"
+
+def init_kb(row):
+    idxnm = index_name(row["uid"])
+    if ES.indexExist(idxnm): return
+    return ES.createIdx(idxnm, json.load(open("res/mapping.json", "r")))
+
+
+model = None
+def embedding(docs):
+    global model
+    tts = model.encode([rmSpace(d["title_tks"]) for d in docs])
+    cnts = model.encode([rmSpace(d["content_ltks"]) for d in docs])
+    vects = 0.1 * tts + 0.9 * cnts
+    assert len(vects) == len(docs)
+    for i,d in enumerate(docs):d["q_vec"] = vects[i].tolist()
+    for d in docs:
+        set_progress(d["doc_id"], random.randint(70, 95)/100., 
+                     "Finished embedding! Start to build index!")
+
+
+def main(comm, mod):
+    tm_fnm = f"res/{comm}-{mod}.tm"
+    tmf = open(tm_fnm, "a+")
+    tm = findMaxDt(tm_fnm)
+    rows, tm = collect(comm, mod, tm)
+    for r in rows:
+        if r["is_deleted"]: 
+            ES.deleteByQuery(Q("term", dock_id=r["did"]), index_name(r["uid"]))
+            continue
+
+        cks = build(r)
+        ## TODO: exception handler
+        ## set_progress(r["did"], -1, "ERROR: ")
+        embedding(cks)
+        if cks: init_kb(r)
+        ES.bulk(cks, index_name(r["uid"]))
+        tmf.write(str(r["updated_at"]) + "\n")
+    tmf.close()
+
+
+if __name__ == "__main__":
+    from mpi4py import MPI
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    main(comm, rank)
+
--- a/python/util/init.py
+++ b/python/util/init.py
@ -0,0 +1,19 @@
+import re
+
+def rmSpace(txt):
+    txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt)
+    return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt)
+
+def findMaxDt(fnm):
+    m = "1970-01-01 00:00:00"
+    try:
+        with open(fnm, "r") as f:
+            while True:
+                l = f.readline()
+                if not l:break
+                l = l.strip("\n")
+                if l == 'nan':continue
+                if l > m:m = l
+    except Exception as e:
+        print("WARNING: can't find "+ fnm)
+    return m
--- a/python/util/config.py
+++ b/python/util/config.py
@ -9,7 +9,6 @@ if not os.path.exists(__fnm): __fnm = "./sys.cnf"

 CF.read(__fnm)

-
 class Config:
    def __init__(self, env):
        self.env = env
--- a/python/util/db_conn.py
+++ b/python/util/db_conn.py
@ -3,7 +3,7 @@ import time
 from util import config
 import pandas as pd

-class Postgre(object):
+class Postgres(object):
    def __init__(self, env, dbnm):
        self.config = config.init(env)
        self.conn = None
@ -36,9 +36,28 @@ class Postgre(object):
            try:
                return pd.read_sql(sql, self.conn)
            except Exception as e:
-                logging.error(f"Fail to exec {sql}l  "+str(e))
+                logging.error(f"Fail to exec {sql}  "+str(e))
                self.__open__()
                time.sleep(1)

        return pd.DataFrame()

+
+    def update(self, sql):
+        for _ in range(10):
+            try:
+                cur = self.conn.cursor()
+                cur.execute(sql)
+                updated_rows = cur.rowcount
+                conn.commit()
+                cur.close()
+                return updated_rows
+            except Exception as e:
+                logging.error(f"Fail to exec {sql}  "+str(e))
+                self.__open__()
+                time.sleep(1)
+        return 0
+
+if __name__ == "__main__":
+    Postgres("infiniflow", "docgpt")
+
--- a/python/util/es_conn.py
+++ b/python/util/es_conn.py
@ -31,7 +31,7 @@ class HuEs:
        self.info = {}
        self.config = config.init(env)
        self.conn()
-        self.idxnm = self.config.get("idx_nm")
+        self.idxnm = self.config.get("idx_nm","")
        if not self.es.ping():
            raise Exception("Can't connect to ES cluster")

--- a/src/api/doc_info.rs
+++ b/src/api/doc_info.rs
@ -85,6 +85,7 @@ async fn upload(mut payload: Multipart, filename: web::Data<String>, did: web::D
        size,
        kb_infos: Vec::new(),
        kb_progress: 0.0,
+        kb_progress_msg: "".to_string(),
        location: "".to_string(),
        r#type: "".to_string(),
        created_at: Local::now().date_naive(),
@ -124,4 +125,4 @@ async fn mv(params: web::Json<MvParams>, data: web::Data<AppState>) -> Result<Ht
    Ok(HttpResponse::Ok()
        .content_type("application/json")
        .body(serde_json::to_string(&json_response).unwrap()))
-}
+}
--- a/src/entity/doc_info.rs
+++ b/src/entity/doc_info.rs
@ -14,6 +14,7 @@ pub struct Model {
    #[sea_orm(column_name = "type")]
    pub r#type: String,
    pub kb_progress: f64,
+    pub kb_progress_msg: String,
    pub location: String,
    #[sea_orm(ignore)]
    pub kb_infos: Vec<kb_info::Model>,
@ -57,4 +58,4 @@ impl Related<Entity> for Entity {
    }
 }

-impl ActiveModelBehavior for ActiveModel {}
+impl ActiveModelBehavior for ActiveModel {}
--- a/src/service/doc_info.rs
+++ b/src/service/doc_info.rs
@ -104,6 +104,7 @@ impl Mutation {
            size: Set(form_data.size.to_owned()),
            r#type: Set(form_data.r#type.to_owned()),
            kb_progress: Set(form_data.kb_progress.to_owned()),
+            kb_progress_msg: Set(form_data.kb_progress_msg.to_owned()),
            location: Set(form_data.location.to_owned()),
            created_at: Set(Local::now().date_naive()),
            updated_at: Set(Local::now().date_naive()),
@ -130,6 +131,7 @@ impl Mutation {
            size: Set(form_data.size.to_owned()),
            r#type: Set(form_data.r#type.to_owned()),
            kb_progress: Set(form_data.kb_progress.to_owned()),
+            kb_progress_msg: Set(form_data.kb_progress_msg.to_owned()),
            location: Set(form_data.location.to_owned()),
            created_at: Default::default(),
            updated_at: Set(Local::now().date_naive()),
@ -151,4 +153,4 @@ impl Mutation {
    pub async fn delete_all_doc_infos(db: &DbConn) -> Result<DeleteResult, DbErr> {
        Entity::delete_many().exec(db).await
    }
-}
+}