parser.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
# Extractor
  4. class Parser(object):
  5. def get(self, id=0, site_id = 0, uid = 0, source_id = 0, source = '', notify='', sync=True, method='json'):
  6. extract = Demeter.model('extract')
  7. if int(id) > 0:
  8. extract.id = id
  9. else:
  10. extract.key = self.getKey(site_id, source)
  11. data = extract.select(type='fetchone')
  12. if not data:
  13. if not source:
  14. return source
  15. info = self.getFile(site_id, source)
  16. info['uid'] = uid
  17. info['source_id'] = source_id
  18. info['notify'] = notify
  19. info['status'] = 1
  20. info['id'] = Demeter.service('common').update('extract', False, info)
  21. else:
  22. info = data
  23. if sync:
  24. return self.handle(info, method);
  25. return info['id']
  26. def getFile(self, site_id, file):
  27. info = {}
  28. (filepath,temp) = os.path.split(file)
  29. (filename,extension) = os.path.splitext(temp)
  30. info['site_id'] = site_id;
  31. info['source'] = file
  32. info['key'] = self.getKey(site_id, file)
  33. info['ext'] = extension
  34. info['name'] = filename
  35. info['file'] = file
  36. info['path'] = filepath
  37. info = self.getLocal(info)
  38. info['size'] = os.path.getsize(info['file'])
  39. return info
  40. def getKey(self, site_id, file):
  41. return Demeter.md5(str(site_id) + '_' + str(file))
  42. def getLocal(self, info):
  43. if 'http' in info['source']:
  44. day = str(date.today())
  45. day = day.split('-')
  46. filename = info['key']
  47. filepath = str(info['site_id']) + '/' + day[0] + '/' + day[1] + '/' + day[2]
  48. else:
  49. filename = info['name']
  50. filepath = info['path']
  51. if 'save' in Demeter.config['setting']:
  52. filepath = File.mkdirs(os.path.join(Demeter.config['setting']['save'], filepath)) + '/' + filename
  53. else:
  54. filepath = File.mkdirs(os.path.join(Demeter.path, 'runtime','files', filepath)) + '/' + filename
  55. info['file'] = filepath + info['ext']
  56. info['path'] = filepath + '/'
  57. if File.exists(info['file']):
  58. return info
  59. else:
  60. self.download(info['source'], info['file']);
  61. return info
  62. def download(self, file, local):
  63. if 'http' in file:
  64. import requests
  65. r = requests.get(file, stream=True)
  66. with open(local, 'wb') as up:
  67. for chunk in r.iter_content(chunk_size=1024):
  68. if chunk:
  69. up.write(chunk)
  70. else:
  71. import shutil
  72. shutil.copyfile(file, local)
  73. if File.exists(local):
  74. return True
  75. return False
  76. def handle(self, info, method):
  77. param = {}
  78. param['method'] = 'extract'
  79. param['page'] = 0
  80. param['status'] = 4
  81. param['source_id'] = info['source_id']
  82. status = True
  83. if info['status'] == 1 or info['status'] == 4:
  84. status = False
  85. #if info and status == False:
  86. if info:
  87. Demeter.service('common').update('extract', info['id'], {'status':2})
  88. if not File.exists(info['file']):
  89. self.download(info['source'], info['file'])
  90. if True:
  91. File.mkdir(info['path'])
  92. obj = Demeter.service('loader', 'extract').get(info['file'], {'path':info['path']})
  93. func = getattr(obj, method)
  94. result = func()
  95. if result and 'page' in result and result['page'] > 0:
  96. param['content'] = result['content']
  97. param['page'] = result['page']
  98. param['status'] = 3
  99. if method == 'json':
  100. method = 1
  101. else:
  102. method = 2
  103. data = {'extract_id': info['id'], 'method' : method}
  104. content = Demeter.service('common').one('extract_content', **data)
  105. if not content:
  106. result = json.dumps(result, ensure_ascii=False)
  107. data['content'] = result
  108. Demeter.service('common').update('extract_content', False, data)
  109. Demeter.service('common').update('extract', info['id'], {'status':param['status'], 'page':param['page']})
  110. Demeter.service('callback').send(info['site_id'], param)
  111. return param