|
@@ -0,0 +1,335 @@
|
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
|
+from .__load__ import *
|
|
|
|
+
|
|
|
|
+class Extract(object):
|
|
|
|
+
|
|
|
|
+ def update(self, site, appid, source_file, source_id, source_type, uid):
|
|
|
|
+
|
|
|
|
+ info = self.getFile(appid, source_file)
|
|
|
|
+
|
|
|
|
+ extract = Demeter.model('extract')
|
|
|
|
+ extract.site_id = site
|
|
|
|
+ extract.key = info['key']
|
|
|
|
+
|
|
|
|
+ data = extract.select(type='fetchone')
|
|
|
|
+ if not data:
|
|
|
|
+ extract.site_id = site
|
|
|
|
+ extract.uid = uid
|
|
|
|
+ extract.key = info['key']
|
|
|
|
+ extract.name = info['name']
|
|
|
|
+ extract.source_id = source_id
|
|
|
|
+ extract.source_type = source_type
|
|
|
|
+ extract.source_size = 0
|
|
|
|
+ extract.source_file = info['source_file']
|
|
|
|
+ extract.local_file = info['local_file']
|
|
|
|
+ extract.local_path = info['local_path']
|
|
|
|
+ id = extract.insert()
|
|
|
|
+ info['status'] = 1
|
|
|
|
+ info['id'] = id
|
|
|
|
+ else:
|
|
|
|
+ info['id'] = data['id']
|
|
|
|
+ info['status'] = data['status']
|
|
|
|
+ if uid:
|
|
|
|
+ self.auth(site, uid, info['id'], 1)
|
|
|
|
+
|
|
|
|
+ return info
|
|
|
|
+
|
|
|
|
+ def getAuth(self, site, uid, extract_id):
|
|
|
|
+ auth = Demeter.model('extract_auth')
|
|
|
|
+ auth.uid = uid
|
|
|
|
+ auth.site_id = site
|
|
|
|
+ auth.extract_id = extract_id
|
|
|
|
+ data = auth.select(type='fetchone')
|
|
|
|
+ return data
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ def auth(self, site, uid, extract_id, status):
|
|
|
|
+ auth = Demeter.model('extract_auth')
|
|
|
|
+ auth.uid = uid
|
|
|
|
+ auth.site_id = site
|
|
|
|
+ auth.extract_id = extract_id
|
|
|
|
+ data = auth.select(type='fetchone')
|
|
|
|
+ if not data:
|
|
|
|
+ auth.site_id = site
|
|
|
|
+ auth.uid = uid
|
|
|
|
+ auth.extract_id = extract_id
|
|
|
|
+ auth.status = status
|
|
|
|
+ auth.insert()
|
|
|
|
+ elif data['status'] != status:
|
|
|
|
+ # 适用于文档转让
|
|
|
|
+ auth.id = data['id']
|
|
|
|
+ update = {}
|
|
|
|
+ update['status'] = status
|
|
|
|
+ auth.update(update)
|
|
|
|
+
|
|
|
|
+ return True
|
|
|
|
+
|
|
|
|
+ def get(self, site, appid, file):
|
|
|
|
+ extract = Demeter.model('extract')
|
|
|
|
+ extract.site_id = site
|
|
|
|
+ #extract.key = self.getKey(appid, file)
|
|
|
|
+ extract.key = file
|
|
|
|
+ data = extract.select(type='fetchone')
|
|
|
|
+ return data
|
|
|
|
+
|
|
|
|
+ def getKey(self, appid, file):
|
|
|
|
+ return Demeter.sha1(str(appid) + '_' + str(file))
|
|
|
|
+
|
|
|
|
+ def getFile(self, appid, file):
|
|
|
|
+ info = {}
|
|
|
|
+
|
|
|
|
+ (filepath,temp) = os.path.split(file)
|
|
|
|
+ (filename,extension) = os.path.splitext(temp)
|
|
|
|
+
|
|
|
|
+ info['source_file'] = file
|
|
|
|
+ info['key'] = self.getKey(appid, file)
|
|
|
|
+ info['ext'] = extension
|
|
|
|
+ info['name'] = filename
|
|
|
|
+
|
|
|
|
+ info = self.getLocalFile(appid, file, info)
|
|
|
|
+
|
|
|
|
+ return info
|
|
|
|
+
|
|
|
|
+ def getLocalFile(self, appid, file, info):
|
|
|
|
+
|
|
|
|
+ day = str(date.today())
|
|
|
|
+ day = day.split('-')
|
|
|
|
+
|
|
|
|
+ #filename = Demeter.md5(str(uuid.uuid5(uuid.uuid1(), info['key'])))
|
|
|
|
+ filename = info['key']
|
|
|
|
+ filepath = str(appid) + '/' + day[0] + '/' + day[1] + '/' + day[2]
|
|
|
|
+ path = ''
|
|
|
|
+
|
|
|
|
+ if 'save' in Demeter.config['setting']:
|
|
|
|
+ filepath = File.mkdirs(os.path.join(Demeter.config['setting']['save'], filepath)) + '/' + filename
|
|
|
|
+ else:
|
|
|
|
+ filepath = File.mkdirs(os.path.join(Demeter.path, 'runtime','files', filepath)) + '/' + filename
|
|
|
|
+
|
|
|
|
+ local = filepath + info['ext']
|
|
|
|
+
|
|
|
|
+ info['local_file'] = local
|
|
|
|
+ info['local_path'] = filepath + '/'
|
|
|
|
+
|
|
|
|
+ return info
|
|
|
|
+ if File.exists(local):
|
|
|
|
+ return info
|
|
|
|
+ else:
|
|
|
|
+ self.download(file, local);
|
|
|
|
+ return info
|
|
|
|
+
|
|
|
|
+ def download(self, file, local):
|
|
|
|
+ if 'http' in file:
|
|
|
|
+ import requests
|
|
|
|
+ r = requests.get(file, stream=True)
|
|
|
|
+ with open(local, 'wb') as up:
|
|
|
|
+ for chunk in r.iter_content(chunk_size=1024):
|
|
|
|
+ if chunk:
|
|
|
|
+ up.write(chunk)
|
|
|
|
+
|
|
|
|
+ else:
|
|
|
|
+ import shutil
|
|
|
|
+ shutil.copyfile(file, local)
|
|
|
|
+
|
|
|
|
+ if File.exists(local):
|
|
|
|
+ return True
|
|
|
|
+ return False
|
|
|
|
+
|
|
|
|
+ def total(self, path):
|
|
|
|
+ page = 0
|
|
|
|
+ for parentdir,dirname,filenames in os.walk(path):
|
|
|
|
+ for filename in filenames:
|
|
|
|
+ if os.path.splitext(filename)[1]=='.page':
|
|
|
|
+ page = page + 1
|
|
|
|
+ return page
|
|
|
|
+
|
|
|
|
    def handle(self, id):
        """Run the conversion pipeline for the ``extract`` record ``id``.

        Only records whose stored status is 1 or 4 are processed.  The record
        is marked 2 before work starts, 3 (with ``file_size`` and ``page``)
        when the HTML output exists afterwards, and 4 otherwise.  In both the
        success and the failure case the owning site is notified via ``api``.

        NOTE(review): the original indentation of this method was lost; the
        nesting below is reconstructed from the statement order -- confirm
        against the real file, in particular whether the HTML-exists check
        belongs inside the ``'pdf'`` branch.
        NOTE(review): ``info`` is read with keys ``ext``, ``html``, ``path``,
        ``local`` and ``url`` (the last one in ``api``) that are not set in
        this method; presumably they come from the row or from the ``docs``
        service ``load`` call -- verify.
        """
        model = Demeter.model('extract')
        model.id = id
        info = model.select(type='fetchone')

        if not info:
            # No such record: nothing to process.
            return
        siteModel = Demeter.model('site')
        siteModel.id = info['site_id']
        site = siteModel.select(type='fetchone')

        # ``status`` stays True when the record must NOT be processed;
        # stored status 1 or 4 flips it to False.
        status = True
        if info['status'] == 1 or info['status'] == 4:
            status = False

        if info and status == False:
            # Mark the record with status 2 before doing any work.
            model.id = id
            update = {}
            update['status'] = 2
            model.update(update)

            # Fetch the source if it is not on disk yet.
            if not File.exists(info['local_file']):
                self.download(info['source_file'], info['local_file'])

            if True:
                File.mkdir(info['local_path'])

                Demeter.service(info['ext'], 'docs').load(info)


                if 'txt' in info['ext']:
                    # Re-encode text files that chardet does not report as utf-8.
                    import chardet
                    file_path = info['local_file']
                    with open(file_path, 'rb') as fp:
                        file_data = fp.read()
                        result = chardet.detect(file_data)
                        fp.close()
                    if result['encoding'] != 'utf-8':
                        file_content = file_data.decode(result['encoding'])
                        # NOTE(review): opened without ``encoding=``, so the
                        # rewrite uses the platform default encoding.
                        f = open(file_path, 'w')
                        f.write(file_content)
                        f.close()
                if 'pdf' in info['ext']:

                    # NOTE(review): ``command`` is not defined in the visible
                    # class -- confirm where it comes from.
                    handle = self.command(info)
                    Shell.popen(handle)
                    if File.exists(info['html']):

                        # Process the images.
                        self.pic(info)
                        #self.string_switch(info['html'], "taste", "tasting")
                        # Count how many pages were produced.
                        page = self.total(info['path'])
                        model.id = id
                        size = os.path.getsize(info['local'])
                        update = {}
                        update['file_size'] = size
                        update['page'] = page
                        update['status'] = 3
                        model.update(update)

                        # Notify the API: tell the application the conversion succeeded.
                        info['page'] = page
                        info['file_size'] = size
                        info['extract_status'] = 1
                        self.api(info, site)
                        return

            # Reaching this point means no HTML output was confirmed: mark as failed.
            model.id = id
            update = {}
            update['status'] = 4
            model.update(update)
            # Notify the API: tell the application the conversion failed.
            info['page'] = 0
            info['file_size'] = 0
            info['extract_status'] = 2
            self.api(info, site)
|
|
|
|
+
|
|
|
|
+ def api(self, info, site):
|
|
|
|
+ if 'file_id' in info and info['file_id']:
|
|
|
|
+ api = site['api']
|
|
|
|
+
|
|
|
|
+ appid = site['appid']
|
|
|
|
+ appsecret = site['appsecret']
|
|
|
|
+ timestamp = Demeter.time()
|
|
|
|
+ nonce = Demeter.hash()
|
|
|
|
+ file = info['key']
|
|
|
|
+ file_id = info['file_id']
|
|
|
|
+ uid = info['uid']
|
|
|
|
+
|
|
|
|
+ param = self.signature(site['id'], appid, appsecret, timestamp, nonce, file, file_id, uid)
|
|
|
|
+
|
|
|
|
+ param['url'] = 'main/view'
|
|
|
|
+ param['img'] = info['url'] + '.jpg'
|
|
|
|
+ param['page'] = info['page']
|
|
|
|
+ param['ext'] = info['ext']
|
|
|
|
+ param['file_size'] = info['file_size']
|
|
|
|
+ param['status'] = info['extract_status']
|
|
|
|
+
|
|
|
|
+ Demeter.curl(api, param, 'post')
|
|
|
|
+
|
|
|
|
+ def pic(self, info):
|
|
|
|
+ from extract2jpg import extract2jpg
|
|
|
|
+ from wand.image import Image
|
|
|
|
+ width = 800
|
|
|
|
+ # 对生成的图片进行缩放
|
|
|
|
+ files = File.getFiles(info['path'])
|
|
|
|
+ if files:
|
|
|
|
+ for file in files:
|
|
|
|
+ if 'jpg' in file or 'png' in file:
|
|
|
|
+ file = info['path'] + file
|
|
|
|
+ with Image(filename=file) as img:
|
|
|
|
+ target_width, target_height = self.getSize(width, img.width, img.height)
|
|
|
|
+ img.sample(target_width, target_height)
|
|
|
|
+ img.save(filename=file)
|
|
|
|
+
|
|
|
|
+ # 生成图片
|
|
|
|
+ dest = info['html'] + '.photo'
|
|
|
|
+ result = extract2jpg.extract_extract2jpg(info['extract'], dest, pages="0,1")
|
|
|
|
+
|
|
|
|
+ i = 0
|
|
|
|
+ for j in result[0]['output_jpgfiles']:
|
|
|
|
+ source = j
|
|
|
|
+ if i == 0:
|
|
|
|
+ dest = info['html'] + '.jpg'
|
|
|
|
+ else:
|
|
|
|
+ dest = info['html'] + '.jpg_' + str(i) + '.jpg'
|
|
|
|
+ command = 'mv '+source+' ' + dest
|
|
|
|
+ Shell.popen(command)
|
|
|
|
+
|
|
|
|
+ with Image(filename=dest) as img:
|
|
|
|
+ target_width, target_height = self.getSize(width, img.width, img.height)
|
|
|
|
+ img.sample(target_width, target_height)
|
|
|
|
+ img.save(filename=dest)
|
|
|
|
+ i = i+1
|
|
|
|
+
|
|
|
|
+ '''
|
|
|
|
+ from wand.image import Image
|
|
|
|
+ extract = Image(filename=source, resolution=50)
|
|
|
|
+ jpg = extract.extract('jpg')
|
|
|
|
+ req_image = []
|
|
|
|
+ i = 0
|
|
|
|
+ for img in jpg.sequence:
|
|
|
|
+ if i == 0:
|
|
|
|
+ img_page = Image(image=img)
|
|
|
|
+ req_image.append(img_page.make_blob('jpg'))
|
|
|
|
+ i = i+1
|
|
|
|
+
|
|
|
|
+ for img in req_image:
|
|
|
|
+ ff = open(dest, 'wb')
|
|
|
|
+ ff.write(img)
|
|
|
|
+ ff.close()
|
|
|
|
+ '''
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ def getSize(self, target_width, img_width, img_height):
|
|
|
|
+ if img_width > target_width:
|
|
|
|
+ ratio = target_width / img_width
|
|
|
|
+ target_height = int(ratio * img_height)
|
|
|
|
+ else:
|
|
|
|
+ target_width = img_width
|
|
|
|
+ target_height = img_height
|
|
|
|
+ return target_width, target_height
|
|
|
|
+
|
|
|
|
+ def string_switch(self, x,y,z,s=1):
|
|
|
|
+ with open(x, "r", encoding="utf-8") as f:
|
|
|
|
+ #readlines以列表的形式将文件读出
|
|
|
|
+ lines = f.readlines()
|
|
|
|
+
|
|
|
|
+ with open(x, "w", encoding="utf-8") as f_w:
|
|
|
|
+ #定义一个数字,用来记录在读取文件时在列表中的位置
|
|
|
|
+ n = 0
|
|
|
|
+ #默认选项,只替换第一次匹配到的行中的字符串
|
|
|
|
+ if s == 1:
|
|
|
|
+ for line in lines:
|
|
|
|
+ if y in line:
|
|
|
|
+ line = line.replace(y,z)
|
|
|
|
+ f_w.write(line)
|
|
|
|
+ n += 1
|
|
|
|
+ break
|
|
|
|
+ f_w.write(line)
|
|
|
|
+ n += 1
|
|
|
|
+ #将剩余的文本内容继续输出
|
|
|
|
+ for i in range(n,len(lines)):
|
|
|
|
+ f_w.write(lines[i])
|
|
|
|
+ #全局匹配替换
|
|
|
|
+ elif s == 'g':
|
|
|
|
+ for line in lines:
|
|
|
|
+ if y in line:
|
|
|
|
+ line = line.replace(y,z)
|
|
|
|
+ f_w.write(line)
|