add docker compose
This commit is contained in:
parent
6e69a39c9e
commit
53d3a2c1c1
8 changed files with 322 additions and 17 deletions
68
docker/docker-compose.yml
Normal file
68
docker/docker-compose.yml
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
version: '2.2'
|
||||||
|
services:
|
||||||
|
es01:
|
||||||
|
container_name: docass-es-01
|
||||||
|
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
|
||||||
|
volumes:
|
||||||
|
- esdata01:/usr/share/elasticsearch/data
|
||||||
|
ports:
|
||||||
|
- "${ES_PORT}:9200"
|
||||||
|
environment:
|
||||||
|
- node.name=es01
|
||||||
|
- cluster.name=${CLUSTER_NAME}
|
||||||
|
- cluster.initial_master_nodes=es01
|
||||||
|
- ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
|
||||||
|
- bootstrap.memory_lock=false
|
||||||
|
- xpack.security.enabled=false
|
||||||
|
mem_limit: ${MEM_LIMIT}
|
||||||
|
ulimits:
|
||||||
|
memlock:
|
||||||
|
soft: -1
|
||||||
|
hard: -1
|
||||||
|
networks:
|
||||||
|
- docass
|
||||||
|
restart: always
|
||||||
|
|
||||||
|
kibana:
|
||||||
|
depends_on:
|
||||||
|
- es01
|
||||||
|
image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
|
||||||
|
container_name: docass-kibana
|
||||||
|
volumes:
|
||||||
|
- kibanadata:/usr/share/kibana/data
|
||||||
|
ports:
|
||||||
|
- "${KIBANA_PORT}:5601"
|
||||||
|
environment:
|
||||||
|
- SERVERNAME=kibana
|
||||||
|
- ELASTICSEARCH_HOSTS=http://es01:9200
|
||||||
|
mem_limit: ${MEM_LIMIT}
|
||||||
|
networks:
|
||||||
|
- docass
|
||||||
|
|
||||||
|
postgres:
|
||||||
|
image: postgres
|
||||||
|
container_name: docass-postgres
|
||||||
|
environment:
|
||||||
|
- POSTGRES_USER=${POSTGRES_USER}
|
||||||
|
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
|
||||||
|
- POSTGRES_DB=${POSTGRES_DB}
|
||||||
|
ports:
|
||||||
|
- "5455:5432"
|
||||||
|
volumes:
|
||||||
|
- pg_data:/var/lib/postgresql/data
|
||||||
|
networks:
|
||||||
|
- docass
|
||||||
|
restart: always
|
||||||
|
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
esdata01:
|
||||||
|
driver: local
|
||||||
|
kibanadata:
|
||||||
|
driver: local
|
||||||
|
pg_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
|
networks:
|
||||||
|
docass:
|
||||||
|
driver: bridge
|
||||||
164
docker/docker-compose.yml.bk
Normal file
164
docker/docker-compose.yml.bk
Normal file
|
|
@ -0,0 +1,164 @@
|
||||||
|
version: '2.2'
|
||||||
|
services:
|
||||||
|
setup:
|
||||||
|
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
|
||||||
|
volumes:
|
||||||
|
- certs:/usr/share/elasticsearch/config/certs
|
||||||
|
user: "0"
|
||||||
|
command: >
|
||||||
|
bash -c '
|
||||||
|
if [ x${ELASTIC_PASSWORD} == x ]; then
|
||||||
|
echo "Set the ELASTIC_PASSWORD environment variable in the .env file";
|
||||||
|
exit 1;
|
||||||
|
elif [ x${KIBANA_PASSWORD} == x ]; then
|
||||||
|
echo "Set the KIBANA_PASSWORD environment variable in the .env file";
|
||||||
|
exit 1;
|
||||||
|
elif [ x${POSTGRES_USER} == x ]; then
|
||||||
|
echo "Set the POSTGRES_USER environment variable in the .env file";
|
||||||
|
exit 1;
|
||||||
|
elif [ x${POSTGRES_PASSWORD} == x ]; then
|
||||||
|
echo "Set the POSTGRES_PASSWORD environment variable in the .env file";
|
||||||
|
exit 1;
|
||||||
|
elif [ x${POSTGRES_DB} == x ]; then
|
||||||
|
echo "Set the POSTGRES_DB environment variable in the .env file";
|
||||||
|
exit 1;
|
||||||
|
fi;
|
||||||
|
if [ ! -f config/certs/ca.zip ]; then
|
||||||
|
echo "Creating CA";
|
||||||
|
bin/elasticsearch-certutil ca --silent --pem -out config/certs/ca.zip;
|
||||||
|
unzip config/certs/ca.zip -d config/certs;
|
||||||
|
fi;
|
||||||
|
if [ ! -f config/certs/certs.zip ]; then
|
||||||
|
echo "Creating certs";
|
||||||
|
echo -ne \
|
||||||
|
"instances:\n"\
|
||||||
|
" - name: es01\n"\
|
||||||
|
" dns:\n"\
|
||||||
|
" - es01\n"\
|
||||||
|
" - localhost\n"\
|
||||||
|
" ip:\n"\
|
||||||
|
" - 127.0.0.1\n"\
|
||||||
|
> config/certs/instances.yml;
|
||||||
|
bin/elasticsearch-certutil cert --silent --pem -out config/certs/certs.zip --in config/certs/instances.yml --ca-cert config/certs/ca/ca.crt --ca-key config/certs/ca/ca.key;
|
||||||
|
unzip config/certs/certs.zip -d config/certs;
|
||||||
|
fi;
|
||||||
|
echo "Setting file permissions"
|
||||||
|
chown -R root:root config/certs;
|
||||||
|
find . -type d -exec chmod 750 \{\} \;;
|
||||||
|
find . -type f -exec chmod 640 \{\} \;;
|
||||||
|
echo "Waiting for Elasticsearch availability";
|
||||||
|
until curl -s --cacert config/certs/ca/ca.crt https://es01:9200 | grep -q "missing authentication credentials"; do sleep 30; done;
|
||||||
|
echo "Setting kibana_system password";
|
||||||
|
until curl -s -X POST --cacert config/certs/ca/ca.crt -u "elastic:${ELASTIC_PASSWORD}" -H "Content-Type: application/json" https://es01:9200/_security/user/kibana_system/_password -d "{\"password\":\"${KIBANA_PASSWORD}\"}" | grep -q "^{}"; do sleep 10; done;
|
||||||
|
echo "All done!";
|
||||||
|
'
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "[ -f config/certs/es01/es01.crt ]"]
|
||||||
|
interval: 1s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 120
|
||||||
|
|
||||||
|
es01:
|
||||||
|
depends_on:
|
||||||
|
setup:
|
||||||
|
condition: service_healthy
|
||||||
|
container_name: docass-es-01
|
||||||
|
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
|
||||||
|
volumes:
|
||||||
|
- certs:/usr/share/elasticsearch/config/certs
|
||||||
|
- esdata01:/usr/share/elasticsearch/data
|
||||||
|
ports:
|
||||||
|
- "${ES_PORT}:9200"
|
||||||
|
environment:
|
||||||
|
- node.name=es01
|
||||||
|
- cluster.name=${CLUSTER_NAME}
|
||||||
|
- cluster.initial_master_nodes=es01
|
||||||
|
- discovery.seed_hosts=es01
|
||||||
|
- ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
|
||||||
|
- bootstrap.memory_lock=true
|
||||||
|
#- xpack.security.enabled=false
|
||||||
|
#- xpack.security.http.ssl.enabled=false
|
||||||
|
#- xpack.security.http.ssl.key=certs/es01/es01.key
|
||||||
|
#- xpack.security.http.ssl.certificate=certs/es01/es01.crt
|
||||||
|
#- xpack.security.http.ssl.certificate_authorities=certs/ca/ca.crt
|
||||||
|
#- xpack.security.transport.ssl.enabled=false
|
||||||
|
#- xpack.security.transport.ssl.key=certs/es01/es01.key
|
||||||
|
#- xpack.security.transport.ssl.certificate=certs/es01/es01.crt
|
||||||
|
#- xpack.security.transport.ssl.certificate_authorities=certs/ca/ca.crt
|
||||||
|
#- xpack.security.transport.ssl.verification_mode=certificate
|
||||||
|
#- xpack.license.self_generated.type=${LICENSE}
|
||||||
|
mem_limit: ${MEM_LIMIT}
|
||||||
|
ulimits:
|
||||||
|
memlock:
|
||||||
|
soft: -1
|
||||||
|
hard: -1
|
||||||
|
healthcheck:
|
||||||
|
test:
|
||||||
|
[
|
||||||
|
"CMD-SHELL",
|
||||||
|
"curl -s --cacert config/certs/ca/ca.crt https://localhost:9200 | grep -q 'missing authentication credentials'",
|
||||||
|
]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 120
|
||||||
|
networks:
|
||||||
|
- docass
|
||||||
|
#restart: always
|
||||||
|
|
||||||
|
kibana:
|
||||||
|
depends_on:
|
||||||
|
- es01
|
||||||
|
image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
|
||||||
|
container_name: docass-kibana
|
||||||
|
volumes:
|
||||||
|
- certs:/usr/share/kibana/config/certs
|
||||||
|
- kibanadata:/usr/share/kibana/data
|
||||||
|
ports:
|
||||||
|
- "${KIBANA_PORT}:5601"
|
||||||
|
environment:
|
||||||
|
- SERVERNAME=kibana
|
||||||
|
- ELASTICSEARCH_HOSTS=https://es01:9200
|
||||||
|
- ELASTICSEARCH_USERNAME=kibana_system
|
||||||
|
- ELASTICSEARCH_PASSWORD=${KIBANA_PASSWORD}
|
||||||
|
- ELASTICSEARCH_SSL_CERTIFICATEAUTHORITIES=config/certs/ca/ca.crt
|
||||||
|
mem_limit: ${MEM_LIMIT}
|
||||||
|
healthcheck:
|
||||||
|
test:
|
||||||
|
[
|
||||||
|
"CMD-SHELL",
|
||||||
|
"curl -s -I http://localhost:5601 | grep -q 'HTTP/1.1 302 Found'",
|
||||||
|
]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 120
|
||||||
|
networks:
|
||||||
|
- docass
|
||||||
|
|
||||||
|
postgres:
|
||||||
|
image: postgres
|
||||||
|
container_name: docass-postgres
|
||||||
|
environment:
|
||||||
|
- POSTGRES_USER=${POSTGRES_USER}
|
||||||
|
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
|
||||||
|
- POSTGRES_DB=${POSTGRES_DB}
|
||||||
|
ports:
|
||||||
|
- "5455:5432"
|
||||||
|
volumes:
|
||||||
|
- pg_data:/var/lib/postgresql/data
|
||||||
|
networks:
|
||||||
|
- docass
|
||||||
|
restart: always
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
certs:
|
||||||
|
driver: local
|
||||||
|
esdata01:
|
||||||
|
driver: local
|
||||||
|
kibanadata:
|
||||||
|
driver: local
|
||||||
|
pg_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
|
networks:
|
||||||
|
docass:
|
||||||
|
driver: bridge
|
||||||
22
python/README.md
Normal file
22
python/README.md
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
|
||||||
|
```shell
|
||||||
|
|
||||||
|
docker pull postgres
|
||||||
|
|
||||||
|
LOCAL_POSTGRES_DATA=./postgres-data
|
||||||
|
|
||||||
|
docker run
|
||||||
|
--name docass-postgres
|
||||||
|
-p 5455:5432
|
||||||
|
-v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data
|
||||||
|
-e POSTGRES_USER=root
|
||||||
|
-e POSTGRES_PASSWORD=infiniflow_docass
|
||||||
|
-e POSTGRES_DB=docass
|
||||||
|
-d
|
||||||
|
postgres
|
||||||
|
|
||||||
|
docker network create elastic
|
||||||
|
docker pull elasticsearch:8.11.3;
|
||||||
|
docker pull docker.elastic.co/kibana/kibana:8.11.3
|
||||||
|
|
||||||
|
```
|
||||||
|
|
@ -1,4 +1,8 @@
|
||||||
[online]
|
[online]
|
||||||
es=127.0.0.1:9200
|
es=127.0.0.1:9200
|
||||||
idx_nm=toxic
|
idx_nm=toxic
|
||||||
|
pgdb_usr=root
|
||||||
|
pgdb_pwd=infiniflow_docass
|
||||||
|
pgdb_host=127.0.0.1
|
||||||
|
pgdb_port=5432
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,19 +37,10 @@ class TableTransformer:
|
||||||
continue
|
continue
|
||||||
box = [round(x, 2) for x in box.tolist()]
|
box = [round(x, 2) for x in box.tolist()]
|
||||||
feas.append({
|
feas.append({
|
||||||
"top": box[1], "bottom": box[-1],
|
"type": id2label[label.item()],
|
||||||
"x0": box[0], "x1": box[2],
|
|
||||||
"score": score.item(),
|
"score": score.item(),
|
||||||
"label": id2label[label.item()]
|
"bbox": box
|
||||||
})
|
})
|
||||||
wids = [f["x1"] - f["x0"]
|
|
||||||
for f in feas if f["label"].find("row") > 0]
|
|
||||||
if wids:
|
|
||||||
mw = max(wids) / 2
|
|
||||||
for f in feas:
|
|
||||||
if f["label"].find("row") > 0 and f["x1"] - f["x0"] < mw:
|
|
||||||
f["x1"] += mw
|
|
||||||
|
|
||||||
res.append(feas)
|
res.append(feas)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
@ -68,7 +59,7 @@ class TableTransformer:
|
||||||
)] + ":{:.2f}".format(score), fill=(r, g, b))
|
)] + ":{:.2f}".format(score), fill=(r, g, b))
|
||||||
img.save(f"./t{i}.%d.jpg" % randint(0, 1000))
|
img.save(f"./t{i}.%d.jpg" % randint(0, 1000))
|
||||||
|
|
||||||
def __call__(self, images):
|
def __call__(self, images, threshold=0.8):
|
||||||
res = []
|
res = []
|
||||||
for i in range(0, len(images), self.batch_size):
|
for i in range(0, len(images), self.batch_size):
|
||||||
imgs = images[i: i + self.batch_size]
|
imgs = images[i: i + self.batch_size]
|
||||||
|
|
@ -81,9 +72,9 @@ class TableTransformer:
|
||||||
# [scores, labels, boxes}]
|
# [scores, labels, boxes}]
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
bres = self.rec_img_pro.post_process_object_detection(outputs,
|
bres = self.rec_img_pro.post_process_object_detection(outputs,
|
||||||
threshold=0.80,
|
threshold=threshold,
|
||||||
target_sizes=target_sizes)
|
target_sizes=target_sizes)
|
||||||
self.__draw(bres, imgs, self.rec_mdl.config.id2label)
|
#self.__draw(bres, imgs, self.rec_mdl.config.id2label)
|
||||||
res.extend(self.__friendly(bres, self.rec_mdl.config.id2label))
|
res.extend(self.__friendly(bres, self.rec_mdl.config.id2label))
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -291,6 +291,12 @@ class PdfChunker(HuChunker):
|
||||||
|
|
||||||
|
|
||||||
class DocxChunker(HuChunker):
|
class DocxChunker(HuChunker):
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Fields:
|
||||||
|
text_chunks: List = None
|
||||||
|
table_chunks: List = None
|
||||||
|
|
||||||
def __init__(self, doc_parser):
|
def __init__(self, doc_parser):
|
||||||
self.doc = doc_parser
|
self.doc = doc_parser
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
@ -336,6 +342,12 @@ class DocxChunker(HuChunker):
|
||||||
|
|
||||||
|
|
||||||
class ExcelChunker(HuChunker):
|
class ExcelChunker(HuChunker):
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Fields:
|
||||||
|
text_chunks: List = None
|
||||||
|
table_chunks: List = None
|
||||||
|
|
||||||
def __init__(self, excel_parser):
|
def __init__(self, excel_parser):
|
||||||
self.excel = excel_parser
|
self.excel = excel_parser
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
@ -354,10 +366,10 @@ if __name__ == "__main__":
|
||||||
from parser import PdfParser
|
from parser import PdfParser
|
||||||
ckr = PdfChunker(PdfParser())
|
ckr = PdfChunker(PdfParser())
|
||||||
if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
|
if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
|
||||||
from .parser import DocxParser
|
from parser import DocxParser
|
||||||
ckr = DocxChunker(DocxParser())
|
ckr = DocxChunker(DocxParser())
|
||||||
if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
|
if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
|
||||||
from .parser import ExcelParser
|
from parser import ExcelParser
|
||||||
ckr = ExcelChunker(ExcelParser())
|
ckr = ExcelChunker(ExcelParser())
|
||||||
|
|
||||||
# ckr.html(sys.argv[1])
|
# ckr.html(sys.argv[1])
|
||||||
|
|
|
||||||
|
|
@ -323,7 +323,7 @@ class HuParser:
|
||||||
return layouts
|
return layouts
|
||||||
|
|
||||||
def __table_paddle(self, images):
|
def __table_paddle(self, images):
|
||||||
tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
|
tbls = self.tbl_det([img for img in images], threshold=0.5)
|
||||||
res = []
|
res = []
|
||||||
# align left&right for rows, align top&bottom for columns
|
# align left&right for rows, align top&bottom for columns
|
||||||
for tbl in tbls:
|
for tbl in tbls:
|
||||||
|
|
|
||||||
44
python/util/db_conn.py
Normal file
44
python/util/db_conn.py
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from util import config
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class Postgre(object):
|
||||||
|
def __init__(self, env, dbnm):
|
||||||
|
self.config = config.init(env)
|
||||||
|
self.conn = None
|
||||||
|
self.dbnm = dbnm
|
||||||
|
self.__open__()
|
||||||
|
|
||||||
|
def __open__(self):
|
||||||
|
import psycopg2
|
||||||
|
try:
|
||||||
|
if self.conn:self.__close__()
|
||||||
|
del self.conn
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.conn = psycopg2.connect(f"dbname={self.dbnm} user={self.config.get('pgdb_usr')} password={self.config.get('pgdb_pwd')} host={self.config.get('pgdb_host')} port={self.config.get('pgdb_port')}")
|
||||||
|
except Exception as e:
|
||||||
|
logging.error("Fail to connect %s "%self.config.get("pgdb_host") + str(e))
|
||||||
|
|
||||||
|
|
||||||
|
def __close__(self):
|
||||||
|
try:
|
||||||
|
self.conn.close()
|
||||||
|
except Exception as e:
|
||||||
|
logging.error("Fail to close %s "%self.config.get("pgdb_host") + str(e))
|
||||||
|
|
||||||
|
|
||||||
|
def select(self, sql):
|
||||||
|
for _ in range(10):
|
||||||
|
try:
|
||||||
|
return pd.read_sql(sql, self.conn)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Fail to exec {sql}l "+str(e))
|
||||||
|
self.__open__()
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
return pd.DataFrame()
|
||||||
|
|
||||||
Loading…
Add table
Reference in a new issue