Doc.php 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. <?php
  2. namespace Spider\Lib;
  3. use Dever;
  4. class Doc
  5. {
  6. private $url;
  7. private $host;
  8. private $rule;
  9. public $type = 'dom';
  10. private $log = false;
  11. static protected $instance;
  12. static public function getInstance($url, $rule = '')
  13. {
  14. $key = $url . md5($rule);
  15. if (empty(self::$instance[$key])) {
  16. # 此处开协程:Dever::coroutine(self::$instance[$key]);
  17. self::$instance[$key] = new self($url, $rule);
  18. }
  19. return self::$instance[$key];
  20. }
  21. public function __construct($url, $rule = '')
  22. {
  23. $this->url($url);
  24. $this->rule = $rule;
  25. if (strpos($this->rule, '$json') !== false) {
  26. $this->type = 'json';
  27. } else {
  28. $this->type = 'dom';
  29. }
  30. }
  31. private function url($url)
  32. {
  33. $this->url = $url;
  34. $value = parse_url($this->url);
  35. $this->host = $value['scheme'] . '://' . $value['host'];
  36. }
  37. public function get()
  38. {
  39. $doc = $this->doc();
  40. if ($this->rule) {
  41. $doc = $this->find($doc, $this->rule);
  42. }
  43. return $doc;
  44. }
  45. public function doc($url = false)
  46. {
  47. $url = $url ? $url : $this->url;
  48. $html = $this->download($url);
  49. return ($this->getClass())::init($html);
  50. }
  51. private function download($url)
  52. {
  53. $this->addLog($url . '下载中...');
  54. $download = new Download($url);
  55. $this->addLog($url . '下载完成');
  56. return $download->get();
  57. }
  58. private function collect($data, $include, $exclude, $filter)
  59. {
  60. if ($include) {
  61. $include = explode("\n", str_replace("\r", '', $include));
  62. foreach ($include as $k => $v) {
  63. $state = preg_match('/' . $v . '/i', $data);
  64. if ($state) {
  65. break;
  66. }
  67. }
  68. if (!$state) {
  69. return 'error';
  70. }
  71. }
  72. if ($exclude) {
  73. $exclude = explode("\n", str_replace("\r", '', $exclude));
  74. foreach ($exclude as $k => $v) {
  75. $state = preg_match('/' . $v . '/i', $data);
  76. if (!$state) {
  77. return 'error';
  78. }
  79. }
  80. }
  81. if ($filter) {
  82. $filter = explode("\n", str_replace("\r", '', $filter));
  83. foreach ($filter as $k => $v) {
  84. $s = '';
  85. if (strstr($v, '=>')) {
  86. $temp = explode('=>', $v);
  87. $v = $temp[0];
  88. $s = $temp[1];
  89. }
  90. $data = preg_replace('/' . $v . '/i', $s, $data);
  91. }
  92. }
  93. return $data;
  94. }
  95. private function getClass()
  96. {
  97. return 'Spider\\Lib\\Doc\\' . ucfirst($this->type);
  98. }
  99. public function find($doc, $rule)
  100. {
  101. return ($this->getClass())::find($doc, $rule);
  102. }
  103. public function init($data)
  104. {
  105. if (is_string($data) && filter_var($data, FILTER_VALIDATE_URL) !== false) {
  106. $data = $this->doc($data);
  107. } else {
  108. $data = ($this->getClass())::init($data);
  109. }
  110. return $data;
  111. }
  112. public function rule($data, $col, $config)
  113. {
  114. $name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
  115. $this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');
  116. $method = 'rule_' . $this->type;
  117. $data = ($this->getClass())::rule($this, $data, $col, $config['collect_rule'], $config['key']);
  118. $this->addLog($name . '解析完成');
  119. return $this->collect($data, $config['collect_include'], $config['collect_exclude'], $config['collect_filter']);
  120. }
  121. public function getUrl($data, $col, $config)
  122. {
  123. $url = $this->rule($data, $col, $config);
  124. if (strpos($url, 'http') === false) {
  125. if ($url[0] == '/') {
  126. $url = $this->host . $url;
  127. } else {
  128. $url = $this->url . $url;
  129. }
  130. }
  131. return $url;
  132. }
  133. public function addLog($string)
  134. {
  135. if ($this->log) {
  136. $this->log->add($string);
  137. }
  138. }
  139. public function saveLog()
  140. {
  141. if ($this->log) {
  142. $this->log->save();
  143. }
  144. }
  145. public function outLog()
  146. {
  147. if ($this->log) {
  148. $this->log->out();
  149. }
  150. }
  151. public function log(Log $log)
  152. {
  153. $this->log = $log;
  154. }
  155. }