Doc.php 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. <?php
  2. namespace Spider\Lib;
  3. use Dever;
  4. class Doc
  5. {
  6. private $url;
  7. private $host;
  8. private $path;
  9. private $rule;
  10. public $type = 'dom';
  11. private $log = false;
  12. static protected $instance;
  13. static public function getInstance($url, $rule = '')
  14. {
  15. $key = $url . md5($rule);
  16. if (empty(self::$instance[$key])) {
  17. # 此处开协程:Dever::coroutine(self::$instance[$key]);
  18. self::$instance[$key] = new self($url, $rule);
  19. }
  20. return self::$instance[$key];
  21. }
  22. public function __construct($url, $rule = '')
  23. {
  24. $this->url($url);
  25. $this->rule = $rule;
  26. if (strpos($this->rule, '$json') !== false) {
  27. $this->type = 'json';
  28. } else {
  29. $this->type = 'dom';
  30. }
  31. }
  32. private function url($url = false)
  33. {
  34. if (!$url) {
  35. return;
  36. }
  37. $this->url = $url;
  38. $value = parse_url($this->url);
  39. $this->path = $this->host = $value['scheme'] . '://' . $value['host'];
  40. if (isset($value['path']) && $value['path']) {
  41. $temp = explode('/', $value['path']);
  42. unset($temp[count($temp)-1]);
  43. $this->path .= implode('/', $temp);
  44. }
  45. $this->path .= '/';
  46. }
  47. public function get($param = array())
  48. {
  49. $doc = $this->doc(false, $param);
  50. if ($this->rule) {
  51. $doc = $this->find($doc, $this->rule);
  52. }
  53. return $doc;
  54. }
  55. public function doc($url = false, $param = array())
  56. {
  57. $this->url($url);
  58. $html = $this->download($this->url, $param);
  59. return ($this->getClass())::init($html);
  60. }
  61. private function download($url, $header = '', $param = '')
  62. {
  63. $this->addLog($url . '下载中...');
  64. $download = new Download($url, $header, $param);
  65. $this->addLog($url . '下载完成');
  66. return $download->get($this->type);
  67. }
  68. private function collect($data, $include, $exclude, $filter)
  69. {
  70. if ($include) {
  71. $include = explode("\n", str_replace("\r", '', $include));
  72. foreach ($include as $k => $v) {
  73. $state = preg_match('/' . $v . '/i', $data);
  74. if ($state) {
  75. break;
  76. }
  77. }
  78. if (!$state) {
  79. return 'error';
  80. }
  81. }
  82. if ($exclude) {
  83. $exclude = explode("\n", str_replace("\r", '', $exclude));
  84. foreach ($exclude as $k => $v) {
  85. $state = preg_match('/' . $v . '/i', $data);
  86. if (!$state) {
  87. return 'error';
  88. }
  89. }
  90. }
  91. if ($filter) {
  92. $filter = explode("\n", str_replace("\r", '', $filter));
  93. foreach ($filter as $k => $v) {
  94. $s = '';
  95. if (strstr($v, '=>')) {
  96. $temp = explode('=>', $v);
  97. $v = $temp[0];
  98. $s = $temp[1];
  99. }
  100. $data = preg_replace('/' . $v . '/i', $s, $data);
  101. }
  102. }
  103. return $data;
  104. }
  105. private function getClass()
  106. {
  107. return 'Spider\\Lib\\Doc\\' . ucfirst($this->type);
  108. }
  109. public function find($doc, $rule)
  110. {
  111. return ($this->getClass())::find($doc, $rule);
  112. }
  113. public function init($data)
  114. {
  115. if (is_string($data) && filter_var($data, FILTER_VALIDATE_URL) !== false) {
  116. $data = $this->doc($data);
  117. } else {
  118. $data = ($this->getClass())::init($data);
  119. }
  120. return $data;
  121. }
  122. public function rule($data, $col, $config)
  123. {
  124. $name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
  125. $this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');
  126. $method = 'rule_' . $this->type;
  127. $result = $this->getRule($data, $col, $config['collect_rule'], $config['key']);
  128. if (isset($config['collect_url']) && $config['collect_url']) {
  129. $collect_url = explode("\n", str_replace("\r", '', $config['collect_url']));
  130. if (!isset($collect_url[1])) {
  131. $collect_url[1] = '';
  132. }
  133. $temp = array();
  134. $temp[] = $result;
  135. $this->getNext($temp, $data, $col, $collect_url[0], $collect_url[1], $config['collect_rule'], $config['key']);
  136. $result = implode(',', $temp);
  137. }
  138. $this->addLog($name . '解析完成');
  139. return $this->collect($result, $config['collect_include'], $config['collect_exclude'], $config['collect_filter']);
  140. }
  141. public function getNext(&$result, $data, $col, $collect_url, $collect_include, $collect_rule, $key)
  142. {
  143. $url = $this->getUrl($data, $col, $collect_url, $key);
  144. if ($url) {
  145. if ($collect_include && !strstr($url, $collect_include)) {
  146. return;
  147. }
  148. $data = $this->init($url);
  149. if ($data) {
  150. $temp = $this->getRule($data, $col, $collect_rule, $key);
  151. if ($temp) {
  152. $result[] = $temp;
  153. $this->getNext($result, $data, $col, $collect_url, $collect_include, $collect_rule, $key);
  154. }
  155. }
  156. }
  157. }
  158. public function getUrl($data, $col, $collect_rule, $key)
  159. {
  160. $url = $this->getRule($data, $col, $collect_rule, $key);
  161. if (!$url) {
  162. return '';
  163. }
  164. if (strpos($url, 'http') === false) {
  165. if ($url[0] == '/') {
  166. $url = $this->host . $url;
  167. } elseif (strstr($url, '.')) {
  168. $url = $this->path . $url;
  169. } else {
  170. $url = $this->url . $url;
  171. }
  172. }
  173. return $url;
  174. }
  175. public function getRule($data, $col, $collect_rule, $key)
  176. {
  177. return ($this->getClass())::rule($this, $data, $col, $collect_rule, $key);
  178. }
  179. public function addLog($string)
  180. {
  181. if ($this->log) {
  182. $this->log->add($string);
  183. }
  184. }
  185. public function saveLog()
  186. {
  187. if ($this->log) {
  188. $this->log->save();
  189. }
  190. }
  191. public function outLog()
  192. {
  193. if ($this->log) {
  194. $this->log->out();
  195. }
  196. }
  197. public function log(Log $log)
  198. {
  199. $this->log = $log;
  200. }
  201. }