KevinHuSh
committed on
Commit
·
738c322
1
Parent(s):
f4456af
add docker compose (#8)
Browse files
* add docker compose
* add docker compose
- docker/docker-compose.yml +68 -0
- python/README.md +22 -0
- python/conf/sys.cnf +4 -0
- python/nlp/huchunk.py +14 -2
- python/parser/pdf_parser.py +1 -1
- python/util/db_conn.py +44 -0
docker/docker-compose.yml
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '2.2'
|
| 2 |
+
services:
|
| 3 |
+
es01:
|
| 4 |
+
container_name: docass-es-01
|
| 5 |
+
image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
|
| 6 |
+
volumes:
|
| 7 |
+
- esdata01:/usr/share/elasticsearch/data
|
| 8 |
+
ports:
|
| 9 |
+
- ${ES_PORT}:9200
|
| 10 |
+
environment:
|
| 11 |
+
- node.name=es01
|
| 12 |
+
- cluster.name=${CLUSTER_NAME}
|
| 13 |
+
- cluster.initial_master_nodes=es01
|
| 14 |
+
- ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
|
| 15 |
+
- bootstrap.memory_lock=false
|
| 16 |
+
- xpack.security.enabled=false
|
| 17 |
+
mem_limit: ${MEM_LIMIT}
|
| 18 |
+
ulimits:
|
| 19 |
+
memlock:
|
| 20 |
+
soft: -1
|
| 21 |
+
hard: -1
|
| 22 |
+
networks:
|
| 23 |
+
- docass
|
| 24 |
+
restart: always
|
| 25 |
+
|
| 26 |
+
kibana:
|
| 27 |
+
depends_on:
|
| 28 |
+
- es01
|
| 29 |
+
image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
|
| 30 |
+
container_name: docass-kibana
|
| 31 |
+
volumes:
|
| 32 |
+
- kibanadata:/usr/share/kibana/data
|
| 33 |
+
ports:
|
| 34 |
+
- ${KIBANA_PORT}:5601
|
| 35 |
+
environment:
|
| 36 |
+
- SERVERNAME=kibana
|
| 37 |
+
- ELASTICSEARCH_HOSTS=http://es01:9200
|
| 38 |
+
mem_limit: ${MEM_LIMIT}
|
| 39 |
+
networks:
|
| 40 |
+
- docass
|
| 41 |
+
|
| 42 |
+
postgres:
  # PostgreSQL service; credentials and database name come from the
  # compose environment (.env) variables referenced below.
  image: postgres
  container_name: docass-postgres
  environment:
    - POSTGRES_USER=${POSTGRES_USER}
    - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
    - POSTGRES_DB=${POSTGRES_DB}
  ports:
    # Host port 5455 -> container port 5432. PostgreSQL listens on 5432
    # inside the container, so mapping to 5455 would expose nothing.
    - 5455:5432
  volumes:
    # Mount the named volume on PostgreSQL's data directory so the
    # database survives container re-creation (the previous target was
    # the Elasticsearch data path, copied from the es01 service).
    - pg_data:/var/lib/postgresql/data
  networks:
    - docass
  restart: always
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
volumes:
|
| 59 |
+
esdata01:
|
| 60 |
+
driver: local
|
| 61 |
+
kibanadata:
|
| 62 |
+
driver: local
|
| 63 |
+
pg_data:
|
| 64 |
+
driver: local
|
| 65 |
+
|
| 66 |
+
networks:
|
| 67 |
+
docass:
|
| 68 |
+
driver: bridge
|
python/README.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
```shell
|
| 3 |
+
|
| 4 |
+
docker pull postgres
|
| 5 |
+
|
| 6 |
+
LOCAL_POSTGRES_DATA=./postgres-data
|
| 7 |
+
|
| 8 |
+
docker run \
|
| 9 |
+
--name docass-postgres \
|
| 10 |
+
-p 5455:5432 \
|
| 11 |
+
-v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data \
|
| 12 |
+
-e POSTGRES_USER=root \
|
| 13 |
+
-e POSTGRES_PASSWORD=infiniflow_docass \
|
| 14 |
+
-e POSTGRES_DB=docass \
|
| 15 |
+
-d \
|
| 16 |
+
postgres
|
| 17 |
+
|
| 18 |
+
docker network create elastic
|
| 19 |
+
docker pull elasticsearch:8.11.3;
|
| 20 |
+
docker pull docker.elastic.co/kibana/kibana:8.11.3
|
| 21 |
+
|
| 22 |
+
```
|
python/conf/sys.cnf
CHANGED
|
@@ -1,4 +1,8 @@
|
|
| 1 |
[online]
|
| 2 |
es=127.0.0.1:9200
|
| 3 |
idx_nm=toxic
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
|
|
|
| 1 |
[online]
|
| 2 |
es=127.0.0.1:9200
|
| 3 |
idx_nm=toxic
|
| 4 |
+
pgdb_usr=root
|
| 5 |
+
pgdb_pwd=infiniflow_docass
|
| 6 |
+
pgdb_host=127.0.0.1
|
| 7 |
+
pgdb_port=5432
|
| 8 |
|
python/nlp/huchunk.py
CHANGED
|
@@ -291,6 +291,12 @@ class PdfChunker(HuChunker):
|
|
| 291 |
|
| 292 |
|
| 293 |
class DocxChunker(HuChunker):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
def __init__(self, doc_parser):
|
| 295 |
self.doc = doc_parser
|
| 296 |
super().__init__()
|
|
@@ -336,6 +342,12 @@ class DocxChunker(HuChunker):
|
|
| 336 |
|
| 337 |
|
| 338 |
class ExcelChunker(HuChunker):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
def __init__(self, excel_parser):
|
| 340 |
self.excel = excel_parser
|
| 341 |
super().__init__()
|
|
@@ -354,10 +366,10 @@ if __name__ == "__main__":
|
|
| 354 |
from parser import PdfParser
|
| 355 |
ckr = PdfChunker(PdfParser())
|
| 356 |
if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
|
| 357 |
-
from
|
| 358 |
ckr = DocxChunker(DocxParser())
|
| 359 |
if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
|
| 360 |
-
from
|
| 361 |
ckr = ExcelChunker(ExcelParser())
|
| 362 |
|
| 363 |
# ckr.html(sys.argv[1])
|
|
|
|
| 291 |
|
| 292 |
|
| 293 |
class DocxChunker(HuChunker):
|
| 294 |
+
|
| 295 |
+
@dataclass
|
| 296 |
+
class Fields:
|
| 297 |
+
text_chunks: List = None
|
| 298 |
+
table_chunks: List = None
|
| 299 |
+
|
| 300 |
def __init__(self, doc_parser):
|
| 301 |
self.doc = doc_parser
|
| 302 |
super().__init__()
|
|
|
|
| 342 |
|
| 343 |
|
| 344 |
class ExcelChunker(HuChunker):
|
| 345 |
+
|
| 346 |
+
@dataclass
|
| 347 |
+
class Fields:
|
| 348 |
+
text_chunks: List = None
|
| 349 |
+
table_chunks: List = None
|
| 350 |
+
|
| 351 |
def __init__(self, excel_parser):
|
| 352 |
self.excel = excel_parser
|
| 353 |
super().__init__()
|
|
|
|
| 366 |
from parser import PdfParser
|
| 367 |
ckr = PdfChunker(PdfParser())
|
| 368 |
if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
|
| 369 |
+
from parser import DocxParser
|
| 370 |
ckr = DocxChunker(DocxParser())
|
| 371 |
if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
|
| 372 |
+
from parser import ExcelParser
|
| 373 |
ckr = ExcelChunker(ExcelParser())
|
| 374 |
|
| 375 |
# ckr.html(sys.argv[1])
|
python/parser/pdf_parser.py
CHANGED
|
@@ -323,7 +323,7 @@ class HuParser:
|
|
| 323 |
return layouts
|
| 324 |
|
| 325 |
def __table_paddle(self, images):
|
| 326 |
-
tbls = self.tbl_det([
|
| 327 |
res = []
|
| 328 |
# align left&right for rows, align top&bottom for columns
|
| 329 |
for tbl in tbls:
|
|
|
|
| 323 |
return layouts
|
| 324 |
|
| 325 |
def __table_paddle(self, images):
|
| 326 |
+
tbls = self.tbl_det([img for img in images], threshold=0.5)
|
| 327 |
res = []
|
| 328 |
# align left&right for rows, align top&bottom for columns
|
| 329 |
for tbl in tbls:
|
python/util/db_conn.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
from util import config
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
class Postgre(object):
    """Small PostgreSQL helper holding a single psycopg2 connection.

    Connection parameters (``pgdb_usr``, ``pgdb_pwd``, ``pgdb_host``,
    ``pgdb_port``) are read from the configuration returned by
    ``config.init(env)``; the database name is supplied by the caller.
    ``self.conn`` is always present: it is either a live connection or
    ``None`` when the last connection attempt failed.
    """

    def __init__(self, env, dbnm):
        """Load the configuration for *env* and connect to database *dbnm*."""
        self.config = config.init(env)
        self.conn = None
        self.dbnm = dbnm
        self.__open__()

    def __open__(self):
        """(Re)open the connection, closing any previous one first.

        A connection failure is logged, not raised; ``self.conn`` is then
        left as ``None``.
        """
        # Imported lazily so the module can be imported without psycopg2.
        import psycopg2

        self.__close__()
        try:
            # Keyword arguments instead of a hand-built DSN string, so that
            # values containing spaces or quotes are passed through intact.
            self.conn = psycopg2.connect(
                dbname=self.dbnm,
                user=self.config.get("pgdb_usr"),
                password=self.config.get("pgdb_pwd"),
                host=self.config.get("pgdb_host"),
                port=self.config.get("pgdb_port"),
            )
        except Exception as e:
            logging.error("Fail to connect %s %s",
                          self.config.get("pgdb_host"), e)

    def __close__(self):
        """Close the current connection, if any, and reset it to ``None``."""
        conn = getattr(self, "conn", None)
        # Keep the attribute defined so later calls never hit AttributeError.
        self.conn = None
        if conn is None:
            return
        try:
            conn.close()
        except Exception as e:
            logging.error("Fail to close %s %s",
                          self.config.get("pgdb_host"), e)

    def select(self, sql, retries=10, retry_delay=1):
        """Run *sql* and return the result as a ``pandas.DataFrame``.

        On failure the error is logged, the connection is re-opened and the
        query is retried, up to *retries* attempts with *retry_delay*
        seconds between them. An empty DataFrame is returned when every
        attempt fails.
        """
        for attempt in range(retries):
            try:
                return pd.read_sql(sql, self.conn)
            except Exception as e:
                logging.error("Fail to exec %s %s", sql, e)
            # Refresh the connection for the next attempt (or next call).
            self.__open__()
            # No point in waiting after the final attempt.
            if attempt + 1 < retries:
                time.sleep(retry_delay)

        return pd.DataFrame()
|
| 44 |
+
|