add field progress msg into docinfo; add file processing procedure
This commit is contained in:
parent
b72bbdb76d
commit
080a29912c
12 changed files with 275 additions and 21 deletions
|
|
@ -1,7 +1,7 @@
|
|||
version: '2.2'
|
||||
services:
|
||||
es01:
|
||||
container_name: docass-es-01
|
||||
container_name: docgpt-es-01
|
||||
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
|
||||
volumes:
|
||||
- esdata01:/usr/share/elasticsearch/data
|
||||
|
|
@ -20,14 +20,14 @@ services:
|
|||
soft: -1
|
||||
hard: -1
|
||||
networks:
|
||||
- docass
|
||||
- docgpt
|
||||
restart: always
|
||||
|
||||
kibana:
|
||||
depends_on:
|
||||
- es01
|
||||
image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
|
||||
container_name: docass-kibana
|
||||
container_name: docgpt-kibana
|
||||
volumes:
|
||||
- kibanadata:/usr/share/kibana/data
|
||||
ports:
|
||||
|
|
@ -37,21 +37,21 @@ services:
|
|||
- ELASTICSEARCH_HOSTS=http://es01:9200
|
||||
mem_limit: ${MEM_LIMIT}
|
||||
networks:
|
||||
- docass
|
||||
- docgpt
|
||||
|
||||
postgres:
|
||||
image: postgres
|
||||
container_name: docass-postgres
|
||||
container_name: docgpt-postgres
|
||||
environment:
|
||||
- POSTGRES_USER=${POSTGRES_USER}
|
||||
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
|
||||
- POSTGRES_DB=${POSTGRES_DB}
|
||||
ports:
|
||||
- 5455:5455
|
||||
- 5455:5432
|
||||
volumes:
|
||||
- pg_data:/usr/share/elasticsearch/data
|
||||
- pg_data:/var/lib/postgresql/data
|
||||
networks:
|
||||
- docass
|
||||
- docgpt
|
||||
restart: always
|
||||
|
||||
|
||||
|
|
@ -64,5 +64,5 @@ volumes:
|
|||
driver: local
|
||||
|
||||
networks:
|
||||
docass:
|
||||
docgpt:
|
||||
driver: bridge
|
||||
|
|
|
|||
|
|
@ -134,6 +134,7 @@ impl MigrationTrait for Migration {
|
|||
.col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
|
||||
.col(ColumnDef::new(DocInfo::Type).string().not_null()).comment("doc|folder")
|
||||
.col(ColumnDef::new(DocInfo::KbProgress).float().default(0))
|
||||
.col(ColumnDef::new(DocInfo::KbProgressMsg).string().default(""))
|
||||
.col(ColumnDef::new(DocInfo::CreatedAt).date().not_null())
|
||||
.col(ColumnDef::new(DocInfo::UpdatedAt).date().not_null())
|
||||
.col(ColumnDef::new(DocInfo::IsDeleted).boolean().default(false))
|
||||
|
|
@ -285,6 +286,7 @@ enum DocInfo {
|
|||
Size,
|
||||
Type,
|
||||
KbProgress,
|
||||
KbProgressMsg,
|
||||
CreatedAt,
|
||||
UpdatedAt,
|
||||
IsDeleted,
|
||||
|
|
@ -300,4 +302,4 @@ enum DialogInfo {
|
|||
CreatedAt,
|
||||
UpdatedAt,
|
||||
IsDeleted,
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
[online]
|
||||
[infiniflow]
|
||||
es=127.0.0.1:9200
|
||||
idx_nm=toxic
|
||||
pgdb_usr=root
|
||||
pgdb_pwd=infiniflow_docass
|
||||
pgdb_pwd=infiniflow_docgpt
|
||||
pgdb_host=127.0.0.1
|
||||
pgdb_port=5432
|
||||
pgdb_port=5455
|
||||
|
||||
|
|
|
|||
|
|
@ -359,6 +359,47 @@ class ExcelChunker(HuChunker):
|
|||
return flds
|
||||
|
||||
|
||||
class PptChunker(HuChunker):
|
||||
|
||||
@dataclass
|
||||
class Fields:
|
||||
text_chunks: List = None
|
||||
table_chunks: List = None
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def __call__(self, fnm):
|
||||
from pptx import Presentation
|
||||
ppt = Presentation(fnm)
|
||||
flds = self.Fields()
|
||||
for slide in ppt.slides:
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text"):
|
||||
flds.text_chunks.append((shape.text, None))
|
||||
flds.table_chunks = []
|
||||
return flds
|
||||
|
||||
|
||||
class TextChunker(HuChunker):
|
||||
|
||||
@dataclass
|
||||
class Fields:
|
||||
text_chunks: List = None
|
||||
table_chunks: List = None
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def __call__(self, fnm):
|
||||
flds = self.Fields()
|
||||
with open(fnm, "r") as f:
|
||||
txt = f.read()
|
||||
flds.text_chunks = self.naive_text_chunk(txt)
|
||||
flds.table_chunks = []
|
||||
return flds
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
sys.path.append(os.path.dirname(__file__) + "/../")
|
||||
|
|
|
|||
171
python/svr/parse_user_docs.py
Normal file
171
python/svr/parse_user_docs.py
Normal file
|
|
@ -0,0 +1,171 @@
|
|||
import json, re, sys, os, hashlib, copy, glob, util, time, random
|
||||
from util.es_conn import HuEs, Postgres
|
||||
from util import rmSpace, findMaxDt
|
||||
from FlagEmbedding import FlagModel
|
||||
from nlp import huchunk, huqie
|
||||
import base64, hashlib
|
||||
from io import BytesIO
|
||||
from elasticsearch_dsl import Q
|
||||
from parser import (
|
||||
PdfParser,
|
||||
DocxParser,
|
||||
ExcelParser
|
||||
)
|
||||
from nlp.huchunk import (
|
||||
PdfChunker,
|
||||
DocxChunker,
|
||||
ExcelChunker,
|
||||
PptChunker,
|
||||
TextChunker
|
||||
)
|
||||
|
||||
ES = HuEs("infiniflow")
|
||||
BATCH_SIZE = 64
|
||||
PG = Postgres("infiniflow", "docgpt")
|
||||
|
||||
PDF = PdfChunker(PdfParser())
|
||||
DOC = DocxChunker(DocxParser())
|
||||
EXC = ExcelChunker(ExcelParser())
|
||||
PPT = PptChunker()
|
||||
|
||||
|
||||
def chuck_doc(name):
|
||||
name = os.path.split(name)[-1].lower().split(".")[-1]
|
||||
if name.find("pdf") >= 0: return PDF(name)
|
||||
if name.find("doc") >= 0: return DOC(name)
|
||||
if name.find("xlsx") >= 0: return EXC(name)
|
||||
if name.find("ppt") >= 0: return PDF(name)
|
||||
if name.find("pdf") >= 0: return PPT(name)
|
||||
|
||||
if re.match(r"(txt|csv)", name): return TextChunker(name)
|
||||
|
||||
|
||||
def collect(comm, mod, tm):
|
||||
sql = f"""
|
||||
select
|
||||
did,
|
||||
uid,
|
||||
doc_name,
|
||||
location,
|
||||
updated_at
|
||||
from docinfo
|
||||
where
|
||||
updated_at >= '{tm}'
|
||||
and kb_progress = 0
|
||||
and type = 'doc'
|
||||
and MOD(uid, {comm}) = {mod}
|
||||
order by updated_at asc
|
||||
limit 1000
|
||||
"""
|
||||
df = PG.select(sql)
|
||||
df = df.fillna("")
|
||||
mtm = str(df["updated_at"].max())[:19]
|
||||
print("TOTAL:", len(df), "To: ", mtm)
|
||||
return df, mtm
|
||||
|
||||
|
||||
def set_progress(did, prog, msg):
|
||||
sql = f"""
|
||||
update docinfo set kb_progress={prog}, kb_progress_msg='{msg}' where did={did}
|
||||
"""
|
||||
PG.update(sql)
|
||||
|
||||
|
||||
def build(row):
|
||||
if row["size"] > 256000000:
|
||||
set_progress(row["did"], -1, "File size exceeds( <= 256Mb )")
|
||||
return []
|
||||
doc = {
|
||||
"doc_id": row["did"],
|
||||
"title_tks": huqie.qie(os.path.split(row["location"])[-1]),
|
||||
"updated_at": row["updated_at"]
|
||||
}
|
||||
random.seed(time.time())
|
||||
set_progress(row["did"], random.randint(0, 20)/100., "Finished preparing! Start to slice file!")
|
||||
obj = chuck_doc(row["location"])
|
||||
if not obj:
|
||||
set_progress(row["did"], -1, "Unsuported file type.")
|
||||
return []
|
||||
|
||||
set_progress(row["did"], random.randint(20, 60)/100.)
|
||||
|
||||
output_buffer = BytesIO()
|
||||
docs = []
|
||||
md5 = hashlib.md5()
|
||||
for txt, img in obj.text_chunks:
|
||||
d = copy.deepcopy(doc)
|
||||
md5.update((txt + str(d["doc_id"])).encode("utf-8"))
|
||||
d["_id"] = md5.hexdigest()
|
||||
d["content_ltks"] = huqie.qie(txt)
|
||||
d["docnm_kwd"] = rmSpace(d["docnm_tks"])
|
||||
if not img:
|
||||
docs.append(d)
|
||||
continue
|
||||
img.save(output_buffer, format='JPEG')
|
||||
d["img_bin"] = base64.b64encode(output_buffer.getvalue())
|
||||
docs.append(d)
|
||||
|
||||
for arr, img in obj.table_chunks:
|
||||
for i, txt in enumerate(arr):
|
||||
d = copy.deepcopy(doc)
|
||||
d["content_ltks"] = huqie.qie(txt)
|
||||
md5.update((txt + str(d["doc_id"])).encode("utf-8"))
|
||||
d["_id"] = md5.hexdigest()
|
||||
if not img:
|
||||
docs.append(d)
|
||||
continue
|
||||
img.save(output_buffer, format='JPEG')
|
||||
d["img_bin"] = base64.b64encode(output_buffer.getvalue())
|
||||
docs.append(d)
|
||||
set_progress(row["did"], random.randint(60, 70)/100., "Finished slicing. Start to embedding the content.")
|
||||
|
||||
return docs
|
||||
|
||||
|
||||
def index_name(uid):return f"docgpt_{uid}"
|
||||
|
||||
def init_kb(row):
|
||||
idxnm = index_name(row["uid"])
|
||||
if ES.indexExist(idxnm): return
|
||||
return ES.createIdx(idxnm, json.load(open("res/mapping.json", "r")))
|
||||
|
||||
|
||||
model = None
|
||||
def embedding(docs):
|
||||
global model
|
||||
tts = model.encode([rmSpace(d["title_tks"]) for d in docs])
|
||||
cnts = model.encode([rmSpace(d["content_ltks"]) for d in docs])
|
||||
vects = 0.1 * tts + 0.9 * cnts
|
||||
assert len(vects) == len(docs)
|
||||
for i,d in enumerate(docs):d["q_vec"] = vects[i].tolist()
|
||||
for d in docs:
|
||||
set_progress(d["doc_id"], random.randint(70, 95)/100.,
|
||||
"Finished embedding! Start to build index!")
|
||||
|
||||
|
||||
def main(comm, mod):
|
||||
tm_fnm = f"res/{comm}-{mod}.tm"
|
||||
tmf = open(tm_fnm, "a+")
|
||||
tm = findMaxDt(tm_fnm)
|
||||
rows, tm = collect(comm, mod, tm)
|
||||
for r in rows:
|
||||
if r["is_deleted"]:
|
||||
ES.deleteByQuery(Q("term", dock_id=r["did"]), index_name(r["uid"]))
|
||||
continue
|
||||
|
||||
cks = build(r)
|
||||
## TODO: exception handler
|
||||
## set_progress(r["did"], -1, "ERROR: ")
|
||||
embedding(cks)
|
||||
if cks: init_kb(r)
|
||||
ES.bulk(cks, index_name(r["uid"]))
|
||||
tmf.write(str(r["updated_at"]) + "\n")
|
||||
tmf.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from mpi4py import MPI
|
||||
comm = MPI.COMM_WORLD
|
||||
rank = comm.Get_rank()
|
||||
main(comm, rank)
|
||||
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
import re
|
||||
|
||||
def rmSpace(txt):
|
||||
txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt)
|
||||
return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt)
|
||||
|
||||
def findMaxDt(fnm):
|
||||
m = "1970-01-01 00:00:00"
|
||||
try:
|
||||
with open(fnm, "r") as f:
|
||||
while True:
|
||||
l = f.readline()
|
||||
if not l:break
|
||||
l = l.strip("\n")
|
||||
if l == 'nan':continue
|
||||
if l > m:m = l
|
||||
except Exception as e:
|
||||
print("WARNING: can't find "+ fnm)
|
||||
return m
|
||||
|
|
@ -9,7 +9,6 @@ if not os.path.exists(__fnm): __fnm = "./sys.cnf"
|
|||
|
||||
CF.read(__fnm)
|
||||
|
||||
|
||||
class Config:
|
||||
def __init__(self, env):
|
||||
self.env = env
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import time
|
|||
from util import config
|
||||
import pandas as pd
|
||||
|
||||
class Postgre(object):
|
||||
class Postgres(object):
|
||||
def __init__(self, env, dbnm):
|
||||
self.config = config.init(env)
|
||||
self.conn = None
|
||||
|
|
@ -36,9 +36,28 @@ class Postgre(object):
|
|||
try:
|
||||
return pd.read_sql(sql, self.conn)
|
||||
except Exception as e:
|
||||
logging.error(f"Fail to exec {sql}l "+str(e))
|
||||
logging.error(f"Fail to exec {sql} "+str(e))
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
|
||||
return pd.DataFrame()
|
||||
|
||||
|
||||
def update(self, sql):
|
||||
for _ in range(10):
|
||||
try:
|
||||
cur = self.conn.cursor()
|
||||
cur.execute(sql)
|
||||
updated_rows = cur.rowcount
|
||||
conn.commit()
|
||||
cur.close()
|
||||
return updated_rows
|
||||
except Exception as e:
|
||||
logging.error(f"Fail to exec {sql} "+str(e))
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
Postgres("infiniflow", "docgpt")
|
||||
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ class HuEs:
|
|||
self.info = {}
|
||||
self.config = config.init(env)
|
||||
self.conn()
|
||||
self.idxnm = self.config.get("idx_nm")
|
||||
self.idxnm = self.config.get("idx_nm","")
|
||||
if not self.es.ping():
|
||||
raise Exception("Can't connect to ES cluster")
|
||||
|
||||
|
|
|
|||
|
|
@ -85,6 +85,7 @@ async fn upload(mut payload: Multipart, filename: web::Data<String>, did: web::D
|
|||
size,
|
||||
kb_infos: Vec::new(),
|
||||
kb_progress: 0.0,
|
||||
kb_progress_msg: "".to_string(),
|
||||
location: "".to_string(),
|
||||
r#type: "".to_string(),
|
||||
created_at: Local::now().date_naive(),
|
||||
|
|
@ -124,4 +125,4 @@ async fn mv(params: web::Json<MvParams>, data: web::Data<AppState>) -> Result<Ht
|
|||
Ok(HttpResponse::Ok()
|
||||
.content_type("application/json")
|
||||
.body(serde_json::to_string(&json_response).unwrap()))
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ pub struct Model {
|
|||
#[sea_orm(column_name = "type")]
|
||||
pub r#type: String,
|
||||
pub kb_progress: f64,
|
||||
pub kb_progress_msg: String,
|
||||
pub location: String,
|
||||
#[sea_orm(ignore)]
|
||||
pub kb_infos: Vec<kb_info::Model>,
|
||||
|
|
@ -57,4 +58,4 @@ impl Related<Entity> for Entity {
|
|||
}
|
||||
}
|
||||
|
||||
impl ActiveModelBehavior for ActiveModel {}
|
||||
impl ActiveModelBehavior for ActiveModel {}
|
||||
|
|
|
|||
|
|
@ -104,6 +104,7 @@ impl Mutation {
|
|||
size: Set(form_data.size.to_owned()),
|
||||
r#type: Set(form_data.r#type.to_owned()),
|
||||
kb_progress: Set(form_data.kb_progress.to_owned()),
|
||||
kb_progress_msg: Set(form_data.kb_progress_msg.to_owned()),
|
||||
location: Set(form_data.location.to_owned()),
|
||||
created_at: Set(Local::now().date_naive()),
|
||||
updated_at: Set(Local::now().date_naive()),
|
||||
|
|
@ -130,6 +131,7 @@ impl Mutation {
|
|||
size: Set(form_data.size.to_owned()),
|
||||
r#type: Set(form_data.r#type.to_owned()),
|
||||
kb_progress: Set(form_data.kb_progress.to_owned()),
|
||||
kb_progress_msg: Set(form_data.kb_progress_msg.to_owned()),
|
||||
location: Set(form_data.location.to_owned()),
|
||||
created_at: Default::default(),
|
||||
updated_at: Set(Local::now().date_naive()),
|
||||
|
|
@ -151,4 +153,4 @@ impl Mutation {
|
|||
pub async fn delete_all_doc_infos(db: &DbConn) -> Result<DeleteResult, DbErr> {
|
||||
Entity::delete_many().exec(db).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue