KevinHuSh
committed on
Commit
·
cdd9565
1
Parent(s):
3fc700a
Finish adding thumbnails to video, image, and PDF files (#18)
Browse files- Cargo.toml +1 -1
- migration/src/m20220101_000001_create_table.rs +5 -5
- python/svr/add_thumbnail2file.py +118 -0
- python/util/minio_conn.py +14 -1
- src/api/doc_info.rs +7 -5
Cargo.toml
CHANGED
|
@@ -32,4 +32,4 @@ regex = "1.10.2"
|
|
| 32 |
name = "doc_gpt"
|
| 33 |
|
| 34 |
[workspace]
|
| 35 |
-
members = [".", "migration"]
|
|
|
|
| 32 |
name = "doc_gpt"
|
| 33 |
|
| 34 |
[workspace]
|
| 35 |
+
members = [".", "migration"]
|
migration/src/m20220101_000001_create_table.rs
CHANGED
|
@@ -201,7 +201,7 @@ impl MigrationTrait for Migration {
|
|
| 201 |
.col(ColumnDef::new(DocInfo::Location).string().not_null())
|
| 202 |
.col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
|
| 203 |
.col(ColumnDef::new(DocInfo::Type).string().not_null())
|
| 204 |
-
.col(ColumnDef::new(DocInfo::ThumbnailBase64).string().
|
| 205 |
.comment("doc type|folder")
|
| 206 |
.col(
|
| 207 |
ColumnDef::new(DocInfo::CreatedAt)
|
|
@@ -274,28 +274,28 @@ impl MigrationTrait for Migration {
|
|
| 274 |
.values_panic([
|
| 275 |
(1).into(),
|
| 276 |
"Video".into(),
|
| 277 |
-
".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)".into(),
|
| 278 |
(1).into(),
|
| 279 |
(1).into(),
|
| 280 |
])
|
| 281 |
.values_panic([
|
| 282 |
(1).into(),
|
| 283 |
"Picture".into(),
|
| 284 |
-
".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)".into(),
|
| 285 |
(2).into(),
|
| 286 |
(2).into(),
|
| 287 |
])
|
| 288 |
.values_panic([
|
| 289 |
(1).into(),
|
| 290 |
"Music".into(),
|
| 291 |
-
".*\\.(
|
| 292 |
(3).into(),
|
| 293 |
(3).into(),
|
| 294 |
])
|
| 295 |
.values_panic([
|
| 296 |
(1).into(),
|
| 297 |
"Document".into(),
|
| 298 |
-
".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)".into(),
|
| 299 |
(3).into(),
|
| 300 |
(3).into(),
|
| 301 |
])
|
|
|
|
| 201 |
.col(ColumnDef::new(DocInfo::Location).string().not_null())
|
| 202 |
.col(ColumnDef::new(DocInfo::Size).big_integer().not_null())
|
| 203 |
.col(ColumnDef::new(DocInfo::Type).string().not_null())
|
| 204 |
+
.col(ColumnDef::new(DocInfo::ThumbnailBase64).string().default(""))
|
| 205 |
.comment("doc type|folder")
|
| 206 |
.col(
|
| 207 |
ColumnDef::new(DocInfo::CreatedAt)
|
|
|
|
| 274 |
.values_panic([
|
| 275 |
(1).into(),
|
| 276 |
"Video".into(),
|
| 277 |
+
".*\\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)".into(),
|
| 278 |
(1).into(),
|
| 279 |
(1).into(),
|
| 280 |
])
|
| 281 |
.values_panic([
|
| 282 |
(1).into(),
|
| 283 |
"Picture".into(),
|
| 284 |
+
".*\\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)".into(),
|
| 285 |
(2).into(),
|
| 286 |
(2).into(),
|
| 287 |
])
|
| 288 |
.values_panic([
|
| 289 |
(1).into(),
|
| 290 |
"Music".into(),
|
| 291 |
+
".*\\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)".into(),
|
| 292 |
(3).into(),
|
| 293 |
(3).into(),
|
| 294 |
])
|
| 295 |
.values_panic([
|
| 296 |
(1).into(),
|
| 297 |
"Document".into(),
|
| 298 |
+
".*\\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)".into(),
|
| 299 |
(3).into(),
|
| 300 |
(3).into(),
|
| 301 |
])
|
python/svr/add_thumbnail2file.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys, datetime, random, re, cv2
|
| 2 |
+
from os.path import dirname, realpath
|
| 3 |
+
sys.path.append(dirname(realpath(__file__)) + "/../")
|
| 4 |
+
from util.db_conn import Postgres
|
| 5 |
+
from util.minio_conn import HuMinio
|
| 6 |
+
from util import findMaxDt
|
| 7 |
+
import base64
|
| 8 |
+
from io import BytesIO
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from PIL import Image
|
| 11 |
+
import pdfplumber
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
PG = Postgres("infiniflow", "docgpt")
|
| 15 |
+
MINIO = HuMinio("infiniflow")
|
| 16 |
+
def set_thumbnail(did, base64):
    """Write a thumbnail payload onto one ``doc_info`` row.

    Args:
        did: Id of the document row to update. It is coerced to ``int`` before
            being placed in the statement.
        base64: Text stored in ``thumbnail_base64``. Callers in this file pass
            either a base64-encoded image or the marker ``"_"``. Inside this
            function the parameter name shadows the ``base64`` module.
    """
    # The statement is assembled as text, so single quotes in the payload are
    # doubled to keep the value inside the SQL string literal.
    payload = str(base64).replace("'", "''")
    sql = f"""
    update doc_info set thumbnail_base64='{payload}'
    where
        did={int(did)}
    """
    PG.update(sql)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def collect(comm, mod, tm):
    """Fetch the next batch of documents that still lack a thumbnail.

    The query selects rows from ``doc_info`` that were updated at or after
    ``tm``, whose ``did`` modulo ``comm`` equals ``mod``, that are not deleted,
    are not folders, and have an empty ``thumbnail_base64``. Rows are ordered
    by ``updated_at`` ascending and capped at ten.

    Args:
        comm: Divisor applied to ``did`` in the ``MOD`` filter.
        mod: Remainder a row's ``did`` must have to be selected.
        tm: Lower bound for ``updated_at``, inserted into the query as text.

    Returns:
        Whatever ``PG.select`` returns for the query (used here like a pandas
        DataFrame), or an empty ``pd.DataFrame()`` when nothing matched.
    """
    query = f"""
select
did, uid, doc_name, location, updated_at
from doc_info
where
updated_at >= '{tm}'
and MOD(did, {comm}) = {mod}
and is_deleted=false
and type <> 'folder'
and thumbnail_base64=''
order by updated_at asc
limit 10
"""
    pending = PG.select(query)
    if len(pending) == 0:
        return pd.DataFrame()

    # Progress line: batch size and the newest timestamp in it, cut to 19 chars.
    newest = str(pending["updated_at"].max())[:19]
    print("TOTAL:", len(pending), "To: ", newest)
    return pending
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def build(row):
    """Generate and store a thumbnail for a single document row.

    Files whose name does not end in a supported PDF, image, or video
    extension are marked with the thumbnail value ``"_"``. For supported files
    a preview image is scaled down, base64-encoded, and written through
    ``set_thumbnail``.

    Args:
        row: Mapping-like record providing ``did``, ``uid``, ``doc_name`` and
            ``location``.
    """
    name = row["doc_name"].lower().strip()
    if not re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|apng|icon|ico|webp|mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$",
                     name):
        set_thumbnail(row["did"], "_")
        return

    def thumbnail(img, SIZE=128):
        """Scale ``img`` so its longer side is at most ``SIZE`` and return it base64-encoded."""
        w, h = img.size
        p = SIZE / max(w, h)
        # Keep both sides at least 1 pixel for extreme aspect ratios.
        w, h = max(1, int(w * p)), max(1, int(h * p))
        img.thumbnail((w, h))
        # Try JPEG first, then PNG. Each attempt gets a fresh buffer so a
        # partially written failed attempt cannot corrupt the next one.
        for fmt in ("JPEG", "PNG"):
            buffered = BytesIO()
            try:
                img.save(buffered, format=fmt)
            except Exception:
                continue
            return base64.b64encode(buffered.getvalue()).decode("utf-8")
        return ""

    bucket = "%s-upload" % str(row["uid"])

    if re.search(r"\.pdf$", name):
        iobytes = BytesIO(MINIO.get(bucket, row["location"]))
        # Render the first page; the context manager closes the PDF afterwards.
        with pdfplumber.open(iobytes) as pdf:
            img = pdf.pages[0].to_image().annotated
            set_thumbnail(row["did"], thumbnail(img))

    if re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", name):
        iobytes = BytesIO(MINIO.get(bucket, row["location"]))
        img = Image.open(iobytes)
        set_thumbnail(row["did"], thumbnail(img))

    if re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", name):
        # Videos are read through a short-lived presigned URL, so the object
        # is not downloaded into memory first.
        url = MINIO.get_presigned_url(bucket,
                                      row["location"],
                                      expires=datetime.timedelta(seconds=60)
                                      )
        cap = cv2.VideoCapture(url)
        try:
            succ = cap.isOpened()
            # Skip a random number of leading frames before taking the preview.
            i = random.randint(1, 11)
            while succ:
                ret, frame = cap.read()
                if not ret:
                    break
                if i > 0:
                    i -= 1
                    continue
                img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                print(img.size)
                set_thumbnail(row["did"], thumbnail(img))
                # One frame is enough; stop instead of rewriting the thumbnail
                # for every remaining frame.
                break
        finally:
            cap.release()
            cv2.destroyAllWindows()
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def main(comm, mod):
    """Process one batch of documents for this worker.

    Reads the starting timestamp via ``findMaxDt`` from the worker's marker
    file, collects the matching rows, builds a thumbnail for each, and appends
    each row's ``updated_at`` to the marker file.

    Args:
        comm: Total number of workers, used as the ``MOD`` divisor in ``collect``.
        mod: This worker's remainder in that ``MOD`` filter.
    """
    tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
    # presumably the latest timestamp recorded in the marker file — confirm in util.findMaxDt
    tm = findMaxDt(tm_fnm)
    rows = collect(comm, mod, tm)
    if len(rows) == 0:
        return

    # The context manager closes the marker file even if build() raises.
    with open(tm_fnm, "a+") as tmf:
        for _, r in rows.iterrows():
            build(r)
            tmf.write(str(r["updated_at"]) + "\n")
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
if __name__ == "__main__":
    # Run under MPI: the world size and this process's rank are passed to
    # main(), which uses them as the MOD divisor and remainder.
    from mpi4py import MPI
    world = MPI.COMM_WORLD
    main(world.Get_size(), world.Get_rank())
|
| 118 |
+
|
python/util/minio_conn.py
CHANGED
|
@@ -54,11 +54,24 @@ class HuMinio(object):
|
|
| 54 |
r = self.conn.get_object(bucket, fnm)
|
| 55 |
return r.read()
|
| 56 |
except Exception as e:
|
| 57 |
-
logging.error(f"
|
| 58 |
self.__open__()
|
| 59 |
time.sleep(1)
|
| 60 |
return
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
if __name__ == "__main__":
|
| 63 |
conn = HuMinio("infiniflow")
|
| 64 |
fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
|
|
|
|
| 54 |
r = self.conn.get_object(bucket, fnm)
|
| 55 |
return r.read()
|
| 56 |
except Exception as e:
|
| 57 |
+
logging.error(f"fail get {bucket}/{fnm}: "+str(e))
|
| 58 |
self.__open__()
|
| 59 |
time.sleep(1)
|
| 60 |
return
|
| 61 |
|
| 62 |
+
|
| 63 |
+
def get_presigned_url(self, bucket, fnm, expires):
    """Return a presigned GET URL for an object, retrying on failure.

    Up to ten attempts are made. After each failure the error is logged and
    the connection is reopened via ``self.__open__()``; between attempts the
    method waits one second.

    Args:
        bucket: Bucket name.
        fnm: Object name within the bucket.
        expires: Expiry passed through unchanged to the client's
            ``get_presigned_url``.

    Returns:
        The URL returned by the client, or ``None`` if every attempt failed.
    """
    attempts = 10
    for attempt in range(1, attempts + 1):
        try:
            return self.conn.get_presigned_url("GET", bucket, fnm, expires)
        except Exception as e:
            logging.error(f"fail get presigned url {bucket}/{fnm}: " + str(e))
            self.__open__()
            # No point waiting after the last attempt; nothing follows it.
            if attempt < attempts:
                time.sleep(1)
    return None
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
if __name__ == "__main__":
|
| 76 |
conn = HuMinio("infiniflow")
|
| 77 |
fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
|
src/api/doc_info.rs
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
-
use std::collections::HashMap;
|
| 2 |
use std::io::BufReader;
|
| 3 |
use actix_multipart_extract::{ File, Multipart, MultipartForm };
|
|
|
|
| 4 |
use actix_web::{ HttpResponse, post, web };
|
| 5 |
use chrono::{ Utc, FixedOffset };
|
| 6 |
use minio::s3::args::{ BucketExistsArgs, MakeBucketArgs, PutObjectArgs };
|
|
@@ -68,7 +69,7 @@ pub struct UploadForm {
|
|
| 68 |
fn file_type(filename: &String) -> String {
|
| 69 |
let fnm = filename.to_lowercase();
|
| 70 |
if
|
| 71 |
-
let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa)$")
|
| 72 |
.unwrap()
|
| 73 |
.captures(&fnm)
|
| 74 |
{
|
|
@@ -76,7 +77,7 @@ fn file_type(filename: &String) -> String {
|
|
| 76 |
}
|
| 77 |
if
|
| 78 |
let Some(_) = Regex::new(
|
| 79 |
-
r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng)$"
|
| 80 |
)
|
| 81 |
.unwrap()
|
| 82 |
.captures(&fnm)
|
|
@@ -84,14 +85,14 @@ fn file_type(filename: &String) -> String {
|
|
| 84 |
return "Picture".to_owned();
|
| 85 |
}
|
| 86 |
if
|
| 87 |
-
let Some(_) = Regex::new(r"\.(
|
| 88 |
.unwrap()
|
| 89 |
.captures(&fnm)
|
| 90 |
{
|
| 91 |
return "Music".to_owned();
|
| 92 |
}
|
| 93 |
if
|
| 94 |
-
let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp)$")
|
| 95 |
.unwrap()
|
| 96 |
.captures(&fnm)
|
| 97 |
{
|
|
@@ -100,6 +101,7 @@ fn file_type(filename: &String) -> String {
|
|
| 100 |
"Other".to_owned()
|
| 101 |
}
|
| 102 |
|
|
|
|
| 103 |
#[post("/v1.0/upload")]
|
| 104 |
async fn upload(
|
| 105 |
payload: Multipart<UploadForm>,
|
|
|
|
| 1 |
+
use std::collections::{HashMap};
|
| 2 |
use std::io::BufReader;
|
| 3 |
use actix_multipart_extract::{ File, Multipart, MultipartForm };
|
| 4 |
+
use actix_web::web::Bytes;
|
| 5 |
use actix_web::{ HttpResponse, post, web };
|
| 6 |
use chrono::{ Utc, FixedOffset };
|
| 7 |
use minio::s3::args::{ BucketExistsArgs, MakeBucketArgs, PutObjectArgs };
|
|
|
|
| 69 |
fn file_type(filename: &String) -> String {
|
| 70 |
let fnm = filename.to_lowercase();
|
| 71 |
if
|
| 72 |
+
let Some(_) = Regex::new(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$")
|
| 73 |
.unwrap()
|
| 74 |
.captures(&fnm)
|
| 75 |
{
|
|
|
|
| 77 |
}
|
| 78 |
if
|
| 79 |
let Some(_) = Regex::new(
|
| 80 |
+
r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$"
|
| 81 |
)
|
| 82 |
.unwrap()
|
| 83 |
.captures(&fnm)
|
|
|
|
| 85 |
return "Picture".to_owned();
|
| 86 |
}
|
| 87 |
if
|
| 88 |
+
let Some(_) = Regex::new(r"\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$")
|
| 89 |
.unwrap()
|
| 90 |
.captures(&fnm)
|
| 91 |
{
|
| 92 |
return "Music".to_owned();
|
| 93 |
}
|
| 94 |
if
|
| 95 |
+
let Some(_) = Regex::new(r"\.(pdf|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key)$")
|
| 96 |
.unwrap()
|
| 97 |
.captures(&fnm)
|
| 98 |
{
|
|
|
|
| 101 |
"Other".to_owned()
|
| 102 |
}
|
| 103 |
|
| 104 |
+
|
| 105 |
#[post("/v1.0/upload")]
|
| 106 |
async fn upload(
|
| 107 |
payload: Multipart<UploadForm>,
|