parser.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. # 提取器
  4. class Parser(object):
  def get(self, host = '', id=0, site_id = 0, uid = 0, source_id = 0, source = '', notify='', sync=True, method='json', **kwargs):
      """Find or create the extract record for a source, then optionally process it.

      The record is selected by ``id`` when ``int(id) > 0``; otherwise it is
      selected by the key that ``getKey(site_id, source)`` produces. When no
      record is found, one is built from ``source`` with ``getFile``, tagged
      with ``uid``, ``source_id``, ``notify`` and status 1, and handed to the
      'common' service's ``update('extract', False, info)``; the value that
      call returns is stored as ``info['id']``.

      Returns:
          ``'error'`` when no record exists and ``source`` is empty;
          the result of ``handle(info, method)`` when ``sync`` is truthy;
          otherwise ``info['id']``.

      Extra keyword arguments are accepted and not used.
      """
      record = Demeter.model('extract')
      if int(id) <= 0:
          record.key = self.getKey(site_id, source)
      else:
          record.id = id

      found = record.select(type='fetchone')
      if found:
          info = found
      elif not source:
          # Nothing stored and nothing to build a new record from.
          return 'error'
      else:
          info = self.getFile(site_id, source, host)
          extras = (
              ('uid', uid),
              ('source_id', source_id),
              ('notify', notify),
              ('status', 1),
          )
          for field, value in extras:
              info[field] = value
          info['id'] = Demeter.service('common').update('extract', False, info)

      return self.handle(info, method) if sync else info['id']
  def getFile(self, site_id, file, host):
      """Build the info dict describing ``file`` for site ``site_id``.

      The source path/URL is split into directory, base name and extension
      with ``os.path``; the dict is then passed through ``getLocal`` and
      finally gets a 'size' entry holding ``os.path.getsize`` of the
      resulting ``info['file']``.

      Returns:
          The dict returned by ``getLocal`` with 'size' added.
      """
      directory, basename = os.path.split(file)
      stem, suffix = os.path.splitext(basename)
      record = {
          'site_id': site_id,
          'source': file,
          'key': self.getKey(site_id, file),
          'ext': suffix,
          'name': stem,
          'host': host,
          'file': file,
          'path': directory,
      }
      record = self.getLocal(record)
      record['size'] = os.path.getsize(record['file'])
      return record
  def getKey(self, site_id, file):
      """Return ``Demeter.md5`` of the string ``"<site_id>_<file>"``."""
      raw_key = '_'.join((str(site_id), str(file)))
      return Demeter.md5(raw_key)
  def getLocal(self, info):
      """Resolve where the source lives on disk and make sure it is there.

      Reads ``info['source']``, ``info['key']``, ``info['site_id']``,
      ``info['name']``, ``info['path']`` and ``info['ext']``; rewrites
      ``info['file']`` and ``info['path']`` (and clears ``info['host']`` for
      local sources).

      * Remote source (starts with ``http://`` or ``https://``): the target is
        ``<Demeter.path>/runtime/files/<site_id>/<YYYY>/<MM>/<DD>/<key>``.
      * Local source: the target is ``<path>/<name>``.

      ``info['file']`` becomes the target plus the extension, and
      ``info['path']`` becomes the target plus a trailing '/'. If
      ``info['file']`` does not exist yet it is fetched with ``download``.

      Returns:
          The same ``info`` mapping, updated in place.
      """
      # A scheme-prefix test is used on purpose: a plain substring test
      # ('http' in source) misclassifies local paths such as
      # '/data/http_cache/a.pdf' as remote.
      if info['source'].startswith(('http://', 'https://')):
          # Remote source: file it under a per-site, per-day directory.
          year, month, day = str(date.today()).split('-')
          filename = info['key']
          filepath = '/'.join([str(info['site_id']), year, month, day])
          filepath = File.mkdirs(os.path.join(Demeter.path, 'runtime', 'files', filepath))
      else:
          # Local source: keep it next to the original file.
          filename = info['name']
          filepath = info['path']
          info['host'] = ''

      # File.mkdirs' return value is used as the directory path here
      # (presumably it also creates the directory -- confirm in File).
      filepath = File.mkdirs(filepath) + '/' + filename
      info['file'] = filepath + info['ext']
      info['path'] = filepath + '/'

      if not File.exists(info['file']):
          self.download(info['source'], info['file'])
      return info
  def download(self, file, local):
      """Copy ``file`` to the local path ``local``.

      A source starting with ``http://`` or ``https://`` is streamed with
      ``requests`` in 1024-byte chunks; anything else is treated as a local
      path and copied with ``shutil.copyfile``.

      Returns:
          True when ``File.exists(local)`` is truthy afterwards, else False.

      Errors raised by ``requests``, ``open`` or ``shutil.copyfile`` are not
      caught here.
      """
      # Scheme-prefix test instead of a substring test, so that a local path
      # which merely contains "http" is copied rather than sent to requests.
      if file.startswith(('http://', 'https://')):
          import requests
          response = requests.get(file, stream=True)
          # NOTE(review): the HTTP status is not checked, so an error page
          # body would be written to `local` -- decide whether that is wanted.
          try:
              with open(local, 'wb') as up:
                  for chunk in response.iter_content(chunk_size=1024):
                      if chunk:
                          up.write(chunk)
          finally:
              # With stream=True the connection stays open until the body is
              # consumed or the response is closed; close it even if writing
              # fails part-way.
              response.close()
      else:
          import shutil
          shutil.copyfile(file, local)

      if File.exists(local):
          return True
      return False
  # handle(info, method): run the extraction for one extract record, store
  # the outcome and notify the callback service.
  #
  # NOTE(review): this copy of the file has lost its indentation and carries
  # a listing number at the start of every line. The code lines below are
  # left exactly as found; any nesting mentioned in these comments is
  # inferred from statement order and must be confirmed against the real
  # source.
  #
  # Parameters:
  #   info   -- mapping for the extract record; this method reads its
  #             'source_id', 'status', 'id', 'file', 'source', 'path',
  #             'host' and 'site_id' entries.
  #   method -- name of the attribute fetched from the loader object with
  #             getattr() and called without arguments.
  #
  # Returns `param`, a dict with 'method' ('extract'), 'page', 'status',
  # 'source_id' and -- when the loader result is truthy and has a positive
  # 'page' -- 'content'.
  78. def handle(self, info, method):
  # Initial outcome: page 0 and status 4 stay in place unless overwritten below.
  79. param = {}
  80. param['method'] = 'extract'
  81. param['page'] = 0
  82. param['status'] = 4
  83. param['source_id'] = info['source_id']
  # `status` is assigned here but not read again in the visible code; its
  # only other mention is the commented-out condition below.
  84. status = True
  85. if info['status'] == 1 or info['status'] == 4:
  86. status = False
  87. #if info and status == False:
  88. if info:
  # Set the extract row's status to 2 before working on it
  # (presumably "in progress" -- TODO confirm the status codes).
  89. Demeter.service('common').update('extract', info['id'], {'status':2})
  # Fetch the source again when the local file is missing.
  90. if not File.exists(info['file']):
  91. self.download(info['source'], info['file'])
  # `if True:` is an always-taken branch.
  92. if True:
  93. File.mkdir(info['path'])
  # Get an object for this file from the 'loader' service ('extract'), then
  # call the attribute named by `method` on it.
  94. obj = Demeter.service('loader', 'extract').get(info['file'], {'path':info['path'], 'host':info['host']})
  95. func = getattr(obj, method)
  96. result = func()
  # A truthy result with a positive 'page' supplies content/page and
  # (presumably in the same branch) switches the status to 3.
  97. if result and 'page' in result and result['page'] > 0:
  98. param['content'] = result['content']
  99. param['page'] = result['page']
  100. param['status'] = 3
  # `method` is rebound from its name to a numeric code: 1 for 'json',
  # 2 for anything else.
  101. if method == 'json':
  102. method = 1
  103. else:
  104. method = 2
  # Look for an extract_content row matching extract_id + method code; when
  # none is found the result is serialized with json.dumps and stored.
  105. data = {'extract_id': info['id'], 'method' : method}
  106. content = Demeter.service('common').one('extract_content', **data)
  107. if not content:
  108. result = json.dumps(result, ensure_ascii=False)
  109. data['content'] = result
  110. Demeter.service('common').update('extract_content', False, data)
  # Write the final status/page to the extract row and send `param` to the
  # callback service for this site.
  111. Demeter.service('common').update('extract', info['id'], {'status':param['status'], 'page':param['page']})
  112. Demeter.service('callback').send(info['site_id'], param)
  113. return param