diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 000000000..6bdc0f472 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,68 @@ +version: '2.2' +services: + es01: + container_name: docass-es-01 + image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION} + volumes: + - esdata01:/usr/share/elasticsearch/data + ports: + - ${ES_PORT}:9200 + environment: + - node.name=es01 + - cluster.name=${CLUSTER_NAME} + - cluster.initial_master_nodes=es01 + - ELASTIC_PASSWORD=${ELASTIC_PASSWORD} + - bootstrap.memory_lock=false + - xpack.security.enabled=false + mem_limit: ${MEM_LIMIT} + ulimits: + memlock: + soft: -1 + hard: -1 + networks: + - docass + restart: always + + kibana: + depends_on: + - es01 + image: docker.elastic.co/kibana/kibana:${STACK_VERSION} + container_name: docass-kibana + volumes: + - kibanadata:/usr/share/kibana/data + ports: + - ${KIBANA_PORT}:5601 + environment: + - SERVERNAME=kibana + - ELASTICSEARCH_HOSTS=http://es01:9200 + mem_limit: ${MEM_LIMIT} + networks: + - docass + + postgres: + image: postgres + container_name: docass-postgres + environment: + - POSTGRES_USER=${POSTGRES_USER} + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} + - POSTGRES_DB=${POSTGRES_DB} + ports: + - 5455:5455 + volumes: + - pg_data:/usr/share/elasticsearch/data + networks: + - docass + restart: always + + +volumes: + esdata01: + driver: local + kibanadata: + driver: local + pg_data: + driver: local + +networks: + docass: + driver: bridge diff --git a/python/README.md b/python/README.md new file mode 100644 index 000000000..4f351eb9a --- /dev/null +++ b/python/README.md @@ -0,0 +1,22 @@ + +```shell + +docker pull postgres + +LOCAL_POSTGRES_DATA=./postgres-data + +docker run + --name docass-postgres + -p 5455:5432 + -v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data + -e POSTGRES_USER=root + -e POSTGRES_PASSWORD=infiniflow_docass + -e POSTGRES_DB=docass + -d + postgres + +docker network create elastic +docker pull 
elasticsearch:8.11.3; +docker pull docker.elastic.co/kibana/kibana:8.11.3 + +``` diff --git a/python/conf/sys.cnf b/python/conf/sys.cnf index 375573651..fc0d64c41 100755 --- a/python/conf/sys.cnf +++ b/python/conf/sys.cnf @@ -1,4 +1,8 @@ [online] es=127.0.0.1:9200 idx_nm=toxic +pgdb_usr=root +pgdb_pwd=infiniflow_docass +pgdb_host=127.0.0.1 +pgdb_port=5432 diff --git a/python/cv/table_recognize.py b/python/cv/table_recognize.py index de4f90164..7cb7925da 100644 --- a/python/cv/table_recognize.py +++ b/python/cv/table_recognize.py @@ -37,19 +37,10 @@ class TableTransformer: continue box = [round(x, 2) for x in box.tolist()] feas.append({ - "top": box[1], "bottom": box[-1], - "x0": box[0], "x1": box[2], + "type": id2label[label.item()], "score": score.item(), - "label": id2label[label.item()] + "bbox": box }) - wids = [f["x1"] - f["x0"] - for f in feas if f["label"].find("row") > 0] - if wids: - mw = max(wids) / 2 - for f in feas: - if f["label"].find("row") > 0 and f["x1"] - f["x0"] < mw: - f["x1"] += mw - res.append(feas) return res @@ -68,7 +59,7 @@ class TableTransformer: )] + ":{:.2f}".format(score), fill=(r, g, b)) img.save(f"./t{i}.%d.jpg" % randint(0, 1000)) - def __call__(self, images): + def __call__(self, images, threshold=0.8): res = [] for i in range(0, len(images), self.batch_size): imgs = images[i: i + self.batch_size] @@ -81,9 +72,9 @@ class TableTransformer: # [scores, labels, boxes}] with torch.no_grad(): bres = self.rec_img_pro.post_process_object_detection(outputs, - threshold=0.80, + threshold=threshold, target_sizes=target_sizes) - self.__draw(bres, imgs, self.rec_mdl.config.id2label) + #self.__draw(bres, imgs, self.rec_mdl.config.id2label) res.extend(self.__friendly(bres, self.rec_mdl.config.id2label)) return res diff --git a/python/nlp/huchunk.py b/python/nlp/huchunk.py index ba47545fc..619640227 100644 --- a/python/nlp/huchunk.py +++ b/python/nlp/huchunk.py @@ -291,6 +291,12 @@ class PdfChunker(HuChunker): class DocxChunker(HuChunker): + + 
@dataclass + class Fields: + text_chunks: List = None + table_chunks: List = None + def __init__(self, doc_parser): self.doc = doc_parser super().__init__() @@ -336,6 +342,12 @@ class DocxChunker(HuChunker): class ExcelChunker(HuChunker): + + @dataclass + class Fields: + text_chunks: List = None + table_chunks: List = None + def __init__(self, excel_parser): self.excel = excel_parser super().__init__() @@ -354,10 +366,10 @@ if __name__ == "__main__": from parser import PdfParser ckr = PdfChunker(PdfParser()) if sys.argv[1].split(".")[-1].lower().find("doc") >= 0: - from .parser import DocxParser + from parser import DocxParser ckr = DocxChunker(DocxParser()) if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0: - from .parser import ExcelParser + from parser import ExcelParser ckr = ExcelChunker(ExcelParser()) # ckr.html(sys.argv[1]) diff --git a/python/parser/pdf_parser.py b/python/parser/pdf_parser.py index 744716aab..7fd341518 100644 --- a/python/parser/pdf_parser.py +++ b/python/parser/pdf_parser.py @@ -323,7 +323,7 @@ class HuParser: return layouts def __table_paddle(self, images): - tbls = self.tbl_det([np.array(img) for img in images], thr=0.5) + tbls = self.tbl_det([img for img in images], threshold=0.5) res = [] # align left&right for rows, align top&bottom for columns for tbl in tbls: diff --git a/python/util/db_conn.py b/python/util/db_conn.py new file mode 100644 index 000000000..b67e13e92 --- /dev/null +++ b/python/util/db_conn.py @@ -0,0 +1,44 @@ +import logging +import time +from util import config +import pandas as pd + +class Postgre(object): + def __init__(self, env, dbnm): + self.config = config.init(env) + self.conn = None + self.dbnm = dbnm + self.__open__() + + def __open__(self): + import psycopg2 + try: + if self.conn:self.__close__() + del self.conn + except Exception as e: + pass + + try: + self.conn = psycopg2.connect(f"dbname={self.dbnm} user={self.config.get('pgdb_usr')} password={self.config.get('pgdb_pwd')} 
host={self.config.get('pgdb_host')} port={self.config.get('pgdb_port')}") + except Exception as e: + logging.error("Fail to connect %s "%self.config.get("pgdb_host") + str(e)) + + + def __close__(self): + try: + self.conn.close() + except Exception as e: + logging.error("Fail to close %s "%self.config.get("pgdb_host") + str(e)) + + + def select(self, sql): + for _ in range(10): + try: + return pd.read_sql(sql, self.conn) + except Exception as e: + logging.error(f"Fail to exec {sql} "+str(e)) + self.__open__() + time.sleep(1) + + return pd.DataFrame() +