extract.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. class Extract(object):
  def update(self, site, appid, source_file, source_id, source_type, uid):
      """Register ``source_file`` for extraction and return its descriptor.

      The ``extract`` table is searched for a record matching ``site`` and
      the key derived from ``appid``/``source_file``. When none is found a
      new record is inserted. When ``uid`` is truthy, ``auth`` is called to
      grant that user access with status ``1``.

      Returns the dict built by ``getFile`` extended with ``id`` and
      ``status`` (``1`` for a freshly inserted record, otherwise the
      stored status).
      """
      info = self.getFile(appid, source_file)

      record = Demeter.model('extract')
      record.site_id = site
      record.key = info['key']
      existing = record.select(type='fetchone')

      if existing:
          info['id'] = existing['id']
          info['status'] = existing['status']
      else:
          # The lookup fields are assigned again before inserting, together
          # with the remaining columns of the new record.
          columns = (
              ('site_id', site),
              ('uid', uid),
              ('key', info['key']),
              ('name', info['name']),
              ('source_id', source_id),
              ('source_type', source_type),
              ('source_size', 0),
              ('source_file', info['source_file']),
              ('local_file', info['local_file']),
              ('local_path', info['local_path']),
          )
          for column, value in columns:
              setattr(record, column, value)
          new_id = record.insert()
          info['status'] = 1
          info['id'] = new_id

      if uid:
          self.auth(site, uid, info['id'], 1)
      return info
  def getAuth(self, site, uid, extract_id):
      """Fetch the ``extract_auth`` row for the given user, site and extract.

      Returns whatever ``select(type='fetchone')`` yields for that filter.
      """
      query = Demeter.model('extract_auth')
      for field, value in (('uid', uid), ('site_id', site), ('extract_id', extract_id)):
          setattr(query, field, value)
      return query.select(type='fetchone')
  def auth(self, site, uid, extract_id, status):
      """Make sure the user's ``extract_auth`` row carries ``status``.

      Inserts the row when it does not exist yet, updates it when the stored
      status differs, and leaves it alone otherwise. Always returns ``True``.
      """
      grant = Demeter.model('extract_auth')
      grant.uid = uid
      grant.site_id = site
      grant.extract_id = extract_id
      current = grant.select(type='fetchone')

      if current:
          if current['status'] != status:
              # Applies to document transfers.
              grant.id = current['id']
              grant.update({'status': status})
      else:
          # The filter fields are assigned again before the insert.
          grant.site_id = site
          grant.uid = uid
          grant.extract_id = extract_id
          grant.status = status
          grant.insert()
      return True
  def get(self, site, appid, file):
      """Fetch the ``extract`` record of ``site`` whose key equals ``file``.

      ``file`` is used as the key as-is (it is not hashed through
      ``getKey``), and ``appid`` is currently unused.
      """
      lookup = Demeter.model('extract')
      lookup.site_id = site
      lookup.key = file
      return lookup.select(type='fetchone')
  def getKey(self, appid, file):
      """Return ``Demeter.sha1`` of the string ``"<appid>_<file>"``."""
      raw_key = '_'.join((str(appid), str(file)))
      return Demeter.sha1(raw_key)
  def getFile(self, appid, file):
      """Build the descriptor dict for ``file``.

      The dict holds ``source_file``, ``key``, ``ext`` (extension including
      the dot) and ``name`` (base name without extension), and is then
      completed by ``getLocalFile``.
      """
      base_name = os.path.split(file)[1]
      stem, suffix = os.path.splitext(base_name)
      info = {
          'source_file': file,
          'key': self.getKey(appid, file),
          'ext': suffix,
          'name': stem,
      }
      return self.getLocalFile(appid, file, info)
  def getLocalFile(self, appid, file, info):
      """Add the local storage locations for a source file to ``info``.

      The storage directory is ``<root>/<appid>/<YYYY>/<MM>/<DD>``, where
      ``<root>`` is ``Demeter.config['setting']['save']`` when configured
      and ``<Demeter.path>/runtime/files`` otherwise. The directory is
      created through ``File.mkdirs``.

      Sets ``info['local_file']`` (directory + key + extension) and
      ``info['local_path']`` (directory + key + ``'/'``) and returns ``info``.

      The file is NOT downloaded here. The original body had an
      exists-check and download branch after an unconditional ``return``,
      which could never run and has been removed. ``file`` is kept in the
      signature for compatibility but is no longer used.
      """
      # "<appid>/<YYYY>/<MM>/<DD>", taken from the ISO form of today's date.
      relative_dir = '/'.join([str(appid)] + str(date.today()).split('-'))

      settings = Demeter.config['setting']
      if 'save' in settings:
          root = os.path.join(settings['save'], relative_dir)
      else:
          root = os.path.join(Demeter.path, 'runtime', 'files', relative_dir)

      # The key doubles as the local file stem.
      filepath = File.mkdirs(root) + '/' + info['key']
      info['local_file'] = filepath + info['ext']
      info['local_path'] = filepath + '/'
      return info
  def download(self, file, local):
      """Copy ``file`` to the local path ``local``.

      ``file`` is fetched over HTTP(S) when it starts with ``http://`` or
      ``https://`` (case-insensitive); anything else is treated as a local
      path and copied with ``shutil.copyfile``. The previous check,
      ``'http' in file``, also matched local paths that merely contained
      the text "http".

      Returns ``True`` when ``local`` exists afterwards, else ``False``.
      """
      if file.lower().startswith(('http://', 'https://')):
          import requests
          r = requests.get(file, stream=True)
          try:
              with open(local, 'wb') as up:
                  for chunk in r.iter_content(chunk_size=1024):
                      # Empty chunks are skipped.
                      if chunk:
                          up.write(chunk)
          finally:
              # A streamed response keeps its connection until it is closed.
              r.close()
      else:
          import shutil
          shutil.copyfile(file, local)
      return bool(File.exists(local))
  def total(self, path):
      """Count the files with a ``.page`` extension anywhere below ``path``."""
      return sum(
          1
          for _root, _dirs, names in os.walk(path)
          for name in names
          if os.path.splitext(name)[1] == '.page'
      )
  def handle(self, id):
      """Run the extraction pipeline for the ``extract`` record ``id``.

      Nothing happens when the record does not exist or its status is
      neither ``1`` nor ``4``. Otherwise:

      1. the record is set to status ``2``;
      2. the source is downloaded when the local copy is missing;
      3. the ``docs`` service for the file extension is loaded with the
         record (presumably this fills ``info['html']``, ``info['path']``
         and ``info['local']`` - confirm against the service);
      4. ``txt`` files are normalised to UTF-8, and for ``pdf`` files the
         command built by ``self.command`` is run;
      5. if ``info['html']`` exists, the images are processed, the record
         is stored with page count, file size and status ``3``, and the
         site API is notified with ``extract_status`` ``1``;
      6. otherwise the record is set to status ``4`` and the site API is
         notified with ``extract_status`` ``2``.
      """
      model = Demeter.model('extract')
      model.id = id
      info = model.select(type='fetchone')
      if not info:
          return

      siteModel = Demeter.model('site')
      siteModel.id = info['site_id']
      site = siteModel.select(type='fetchone')

      # Only records in status 1 or 4 are (re)processed.
      if info['status'] not in (1, 4):
          return

      model.id = id
      model.update({'status': 2})

      if not File.exists(info['local_file']):
          self.download(info['source_file'], info['local_file'])

      File.mkdir(info['local_path'])
      Demeter.service(info['ext'], 'docs').load(info)

      if 'txt' in info['ext']:
          self._normalize_txt_encoding(info['local_file'])

      if 'pdf' in info['ext']:
          Shell.popen(self.command(info))

      if File.exists(info['html']):
          # Process the images.
          self.pic(info)
          # Count the generated pages.
          page = self.total(info['path'])
          size = os.path.getsize(info['local'])
          model.id = id
          model.update({'file_size': size, 'page': page, 'status': 3})
          # Notify the application that the conversion succeeded.
          info['page'] = page
          info['file_size'] = size
          info['extract_status'] = 1
          self.api(info, site)
          return

      model.id = id
      model.update({'status': 4})
      # Notify the application that the conversion failed.
      info['page'] = 0
      info['file_size'] = 0
      info['extract_status'] = 2
      self.api(info, site)

  def _normalize_txt_encoding(self, file_path):
      """Rewrite the text file at ``file_path`` as UTF-8 if it is not already."""
      import chardet
      with open(file_path, 'rb') as fp:
          file_data = fp.read()
      encoding = chardet.detect(file_data)['encoding']
      # chardet reports None when it cannot guess (e.g. an empty file);
      # decoding with None would raise, so the file is left untouched.
      if not encoding or encoding.lower() == 'utf-8':
          return
      file_content = file_data.decode(encoding)
      # Write bytes explicitly encoded as UTF-8: a text-mode open() without
      # ``encoding`` would use the locale's default encoding instead.
      with open(file_path, 'wb') as out:
          out.write(file_content.encode('utf-8'))
  def api(self, info, site):
      """Report the extraction result to the site's API.

      Nothing is sent unless ``info`` contains a truthy ``file_id``. The
      request parameters come from ``self.signature`` and are extended with
      the view URL, preview image, page count, extension, file size and
      status before being posted to ``site['api']``.
      """
      if 'file_id' not in info or not info['file_id']:
          return

      endpoint = site['api']
      app_id = site['appid']
      app_secret = site['appsecret']
      stamp = Demeter.time()
      nonce = Demeter.hash()
      key = info['key']
      file_id = info['file_id']
      uid = info['uid']

      param = self.signature(site['id'], app_id, app_secret, stamp, nonce, key, file_id, uid)
      extras = (
          ('url', 'main/view'),
          ('img', info['url'] + '.jpg'),
          ('page', info['page']),
          ('ext', info['ext']),
          ('file_size', info['file_size']),
          ('status', info['extract_status']),
      )
      for name, value in extras:
          param[name] = value
      Demeter.curl(endpoint, param, 'post')
  def pic(self, info):
      """Scale the generated images and produce the preview JPEG files.

      Every file under ``info['path']`` whose name contains ``jpg`` or
      ``png`` is resampled in place to at most 800 pixels wide. Then
      ``extract2jpg`` renders ``info['extract']``, and each output file is
      moved next to ``info['html']`` (``<html>.jpg`` for the first,
      ``<html>.jpg_<n>.jpg`` for the rest) and resampled the same way.
      """
      from extract2jpg import extract2jpg
      from wand.image import Image

      max_width = 800

      # Scale the images that were generated.
      names = File.getFiles(info['path'])
      if names:
          for name in names:
              if not ('jpg' in name or 'png' in name):
                  continue
              image_path = info['path'] + name
              with Image(filename=image_path) as img:
                  new_width, new_height = self.getSize(max_width, img.width, img.height)
                  img.sample(new_width, new_height)
                  img.save(filename=image_path)

      # Generate the preview images.
      photo_dest = info['html'] + '.photo'
      result = extract2jpg.extract_extract2jpg(info['extract'], photo_dest, pages="0,1")
      for index, source in enumerate(result[0]['output_jpgfiles']):
          if index == 0:
              dest = info['html'] + '.jpg'
          else:
              dest = info['html'] + '.jpg_' + str(index) + '.jpg'
          Shell.popen('mv ' + source + ' ' + dest)
          with Image(filename=dest) as img:
              new_width, new_height = self.getSize(max_width, img.width, img.height)
              img.sample(new_width, new_height)
              img.save(filename=dest)
  def getSize(self, target_width, img_width, img_height):
      """Return the ``(width, height)`` to which an image should be scaled.

      Images wider than ``target_width`` are scaled down to that width with
      the aspect ratio preserved; narrower or equal images keep their size.

      The height is computed as ``target_width * img_height // img_width``
      rather than through a floating-point ratio, which could land just
      below the exact value and truncate one pixel short (58/100*100 gave
      57). It is also clamped to at least 1 so that extremely wide images
      never yield a zero height.
      """
      if img_width > target_width:
          target_height = max(1, int(target_width * img_height // img_width))
          return target_width, target_height
      return img_width, img_height
  def string_switch(self, x, y, z, s=1):
      """Replace the text ``y`` with ``z`` inside the UTF-8 file ``x``.

      Args:
          x: path of the file to rewrite in place.
          y: text to search for.
          z: replacement text.
          s: ``1`` (default) replaces every occurrence on the first
              matching line only; ``'g'`` does so on every matching line.

      Raises:
          ValueError: if ``s`` is neither ``1`` nor ``'g'``. Previously such
              a value opened the file for writing and wrote nothing,
              silently emptying it.
      """
      # Validate the mode before the file is opened for writing.
      if s != 1 and s != 'g':
          raise ValueError("s must be 1 or 'g', got %r" % (s,))

      with open(x, "r", encoding="utf-8") as f:
          lines = f.readlines()

      replaced = False
      with open(x, "w", encoding="utf-8") as f_w:
          for line in lines:
              # In single mode only the first matching line is rewritten.
              if y in line and (s == 'g' or not replaced):
                  line = line.replace(y, z)
                  replaced = True
              f_w.write(line)