123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335 |
- # -*- coding: utf-8 -*-
- from .__load__ import *
class Extract(object):
    """Registers source documents, converts them, and reports the result.

    Record ``status`` values written by this class: 1 when a record is
    inserted, 2 when processing starts, 3 when conversion succeeded and
    4 when it failed.

    NOTE(review): ``Demeter``, ``File``, ``Shell``, ``os`` and ``date`` are
    expected to come from the module-level wildcard import.
    """

    def update(self, site, appid, source_file, source_id, source_type, uid):
        """Register ``source_file`` for a site, or look up its existing record.

        Args:
            site: Site id the record belongs to.
            appid: Application id; part of the record key and storage path.
            source_file: Path or URL of the source document.
            source_id: Stored on a newly inserted record.
            source_type: Stored on a newly inserted record.
            uid: Owner id. When truthy, the user is granted access
                (auth status 1) to the record.

        Returns:
            The dict built by ``getFile`` with ``id`` and ``status`` added.
        """
        info = self.getFile(appid, source_file)
        extract = Demeter.model('extract')
        extract.site_id = site
        extract.key = info['key']
        data = extract.select(type='fetchone')
        if data:
            info['id'] = data['id']
            info['status'] = data['status']
        else:
            # The lookup fields are assigned again before the insert, as in
            # the original code (presumably the model does not keep them
            # after select() -- TODO confirm).
            extract.site_id = site
            extract.uid = uid
            extract.key = info['key']
            extract.name = info['name']
            extract.source_id = source_id
            extract.source_type = source_type
            extract.source_size = 0
            extract.source_file = info['source_file']
            extract.local_file = info['local_file']
            extract.local_path = info['local_path']
            info['id'] = extract.insert()
            info['status'] = 1
        # NOTE(review): the source indentation was lost; the grant is assumed
        # to apply to both new and existing records.
        if uid:
            self.auth(site, uid, info['id'], 1)
        return info

    def getAuth(self, site, uid, extract_id):
        """Return the ``extract_auth`` row for (uid, site, extract_id), if any."""
        auth = Demeter.model('extract_auth')
        auth.uid = uid
        auth.site_id = site
        auth.extract_id = extract_id
        return auth.select(type='fetchone')

    def auth(self, site, uid, extract_id, status):
        """Create the auth row for a user, or update its status if it differs.

        Returns:
            Always ``True``.
        """
        auth = Demeter.model('extract_auth')
        auth.uid = uid
        auth.site_id = site
        auth.extract_id = extract_id
        data = auth.select(type='fetchone')
        if not data:
            auth.site_id = site
            auth.uid = uid
            auth.extract_id = extract_id
            auth.status = status
            auth.insert()
        elif data['status'] != status:
            # Used when a document is transferred to another user.
            auth.id = data['id']
            auth.update({'status': status})
        return True

    def get(self, site, appid, file):
        """Fetch the extract record of ``site`` whose key equals ``file``.

        ``appid`` is accepted but not used: ``file`` is taken as the record
        key as given, it is not re-derived through ``getKey``.
        """
        extract = Demeter.model('extract')
        extract.site_id = site
        extract.key = file
        return extract.select(type='fetchone')

    def getKey(self, appid, file):
        """Return the record key: ``Demeter.sha1`` of ``"<appid>_<file>"``."""
        return Demeter.sha1(str(appid) + '_' + str(file))

    def getFile(self, appid, file):
        """Build the info dict describing a source file.

        Returns:
            A dict with ``source_file``, ``key``, ``ext`` (extension including
            the dot), ``name`` (base name without extension), plus the
            ``local_file`` / ``local_path`` entries added by ``getLocalFile``.
        """
        basename = os.path.split(file)[1]
        filename, extension = os.path.splitext(basename)
        info = {
            'source_file': file,
            'key': self.getKey(appid, file),
            'ext': extension,
            'name': filename,
        }
        return self.getLocalFile(appid, file, info)

    def getLocalFile(self, appid, file, info):
        """Add the local storage locations for a source file to ``info``.

        The storage directory is ``<root>/<appid>/<YYYY>/<MM>/<DD>`` where
        ``<root>`` is ``Demeter.config['setting']['save']`` when configured and
        ``<Demeter.path>/runtime/files`` otherwise. The directory is passed
        through ``File.mkdirs``. Nothing is downloaded here; ``handle``
        fetches the file when it is missing.

        Returns:
            ``info`` with ``local_file`` (``<dir>/<key><ext>``) and
            ``local_path`` (``<dir>/<key>/``) set.
        """
        year, month, day = str(date.today()).split('-')
        relative = str(appid) + '/' + year + '/' + month + '/' + day
        if 'save' in Demeter.config['setting']:
            root = os.path.join(Demeter.config['setting']['save'], relative)
        else:
            root = os.path.join(Demeter.path, 'runtime', 'files', relative)
        filepath = File.mkdirs(root) + '/' + info['key']
        info['local_file'] = filepath + info['ext']
        info['local_path'] = filepath + '/'
        return info

    def download(self, file, local):
        """Copy or download ``file`` to the local path ``local``.

        ``file`` is fetched over HTTP only when it starts with ``http://`` or
        ``https://`` (a path that merely contains "http" is a local file).
        An HTTP error response is not written to disk.

        Returns:
            ``True`` when ``local`` exists afterwards, ``False`` otherwise.
        """
        if str(file).lower().startswith(('http://', 'https://')):
            import requests
            r = requests.get(file, stream=True)
            try:
                if not r.ok:
                    # Do not store an error page as if it were the document.
                    return False
                with open(local, 'wb') as up:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            up.write(chunk)
            finally:
                r.close()
        else:
            import shutil
            shutil.copyfile(file, local)
        return bool(File.exists(local))

    def total(self, path):
        """Count the files with extension ``.page`` below ``path`` (recursive)."""
        return sum(
            1
            for _parent, _dirs, filenames in os.walk(path)
            for filename in filenames
            if os.path.splitext(filename)[1] == '.page'
        )

    def handle(self, id):
        """Convert the document of extract record ``id`` and report the result.

        Only records with status 1 or 4 are processed. The record is set to
        status 2, the source is downloaded if the local copy is missing, the
        ``docs`` service for the extension is loaded, and the outcome is
        decided by whether ``info['html']`` exists afterwards: status 3 with
        page count and file size on success, status 4 otherwise. Either way
        the site is notified through ``api``.

        NOTE(review): the source indentation was lost; the failure handling is
        assumed to apply only to records that were actually attempted.
        """
        model = Demeter.model('extract')
        model.id = id
        info = model.select(type='fetchone')
        if not info:
            return
        siteModel = Demeter.model('site')
        siteModel.id = info['site_id']
        site = siteModel.select(type='fetchone')
        if info['status'] != 1 and info['status'] != 4:
            return

        model.id = id
        model.update({'status': 2})
        if not File.exists(info['local_file']):
            self.download(info['source_file'], info['local_file'])
        File.mkdir(info['local_path'])
        # Presumably load() fills in the 'html', 'path', 'local', 'url' and
        # 'extract' entries read below -- TODO confirm against the service.
        Demeter.service(info['ext'], 'docs').load(info)
        if 'txt' in info['ext']:
            self._toUtf8(info['local_file'])
        if 'pdf' in info['ext']:
            # NOTE(review): `command` is not defined in the visible part of
            # this class -- confirm it is provided elsewhere.
            handle = self.command(info)
            Shell.popen(handle)

        if File.exists(info['html']):
            # Process the generated images.
            self.pic(info)
            # Count the generated pages.
            page = self.total(info['path'])
            size = os.path.getsize(info['local'])
            model.id = id
            model.update({'file_size': size, 'page': page, 'status': 3})
            # Notify the application that the conversion succeeded.
            info['page'] = page
            info['file_size'] = size
            info['extract_status'] = 1
            self.api(info, site)
            return

        model.id = id
        model.update({'status': 4})
        # Notify the application that the conversion failed.
        info['page'] = 0
        info['file_size'] = 0
        info['extract_status'] = 2
        self.api(info, site)

    def _toUtf8(self, file_path):
        """Rewrite a text file as UTF-8 when chardet detects another encoding.

        The file is left untouched when no encoding can be detected or when
        it is already UTF-8.
        """
        import chardet
        with open(file_path, 'rb') as fp:
            file_data = fp.read()
        encoding = chardet.detect(file_data)['encoding']
        if not encoding or encoding.lower() == 'utf-8':
            return
        file_content = file_data.decode(encoding)
        # Written as bytes so the result is UTF-8 regardless of the locale.
        with open(file_path, 'wb') as f:
            f.write(file_content.encode('utf-8'))

    def api(self, info, site):
        """POST the conversion result to the site's callback API.

        Nothing is sent unless ``info`` carries a truthy ``file_id``.

        NOTE(review): `signature` is not defined in the visible part of this
        class -- confirm it is provided elsewhere.
        """
        if not ('file_id' in info and info['file_id']):
            return
        api = site['api']
        appid = site['appid']
        appsecret = site['appsecret']
        timestamp = Demeter.time()
        nonce = Demeter.hash()
        file = info['key']
        file_id = info['file_id']
        uid = info['uid']
        param = self.signature(site['id'], appid, appsecret, timestamp, nonce, file, file_id, uid)
        param['url'] = 'main/view'
        param['img'] = info['url'] + '.jpg'
        param['page'] = info['page']
        param['ext'] = info['ext']
        param['file_size'] = info['file_size']
        param['status'] = info['extract_status']
        Demeter.curl(api, param, 'post')

    def pic(self, info):
        """Scale the generated page images and create the cover images.

        Every file in ``info['path']`` whose name contains ``jpg`` or ``png``
        is scaled down to at most 800 px wide. Pages "0,1" of
        ``info['extract']`` are then rendered to JPEG, moved to
        ``<html>.jpg`` (first) and ``<html>.jpg_<i>.jpg`` (following), and
        scaled the same way.
        """
        from extract2jpg import extract2jpg
        from wand.image import Image
        import shutil

        width = 800

        def shrink(path):
            # Scale the image in place; getSize never enlarges it.
            with Image(filename=path) as img:
                target_width, target_height = self.getSize(width, img.width, img.height)
                img.sample(target_width, target_height)
                img.save(filename=path)

        # Scale the images produced by the conversion.
        files = File.getFiles(info['path'])
        if files:
            for name in files:
                if 'jpg' in name or 'png' in name:
                    shrink(info['path'] + name)

        # Render the cover images.
        dest = info['html'] + '.photo'
        result = extract2jpg.extract_extract2jpg(info['extract'], dest, pages="0,1")
        for i, source in enumerate(result[0]['output_jpgfiles']):
            if i == 0:
                dest = info['html'] + '.jpg'
            else:
                dest = info['html'] + '.jpg_' + str(i) + '.jpg'
            # Moved in-process rather than through a shell `mv` string, so
            # paths with spaces work and the file is in place before it is
            # opened below.
            shutil.move(source, dest)
            shrink(dest)

    def getSize(self, target_width, img_width, img_height):
        """Return the (width, height) to scale an image to.

        Images wider than ``target_width`` are reduced to that width with the
        height scaled proportionally (truncated to an int); other images keep
        their own size.
        """
        if img_width <= target_width:
            return img_width, img_height
        ratio = target_width / img_width
        return target_width, int(ratio * img_height)

    def string_switch(self, x, y, z, s=1):
        """Replace ``y`` with ``z`` in the UTF-8 text file ``x``, in place.

        Args:
            x: Path of the file to edit.
            y: Text to search for.
            z: Replacement text.
            s: ``1`` (default) replaces every occurrence on the first line
                that contains ``y``; ``'g'`` does so on all lines.

        Raises:
            ValueError: If ``s`` is neither ``1`` nor ``'g'``. The file is not
                touched in that case (it used to be truncated to empty).
        """
        if s != 1 and s != 'g':
            raise ValueError("s must be 1 or 'g'")

        with open(x, "r", encoding="utf-8") as f:
            lines = f.readlines()

        if s == 1:
            # Only the first matching line is rewritten.
            for index, line in enumerate(lines):
                if y in line:
                    lines[index] = line.replace(y, z)
                    break
        else:
            # Global mode: rewrite every line.
            lines = [line.replace(y, z) for line in lines]

        with open(x, "w", encoding="utf-8") as f_w:
            f_w.writelines(lines)
|