Doc.php 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. <?php
  2. namespace Spider\Lib;
  3. use Dever;
  4. class Doc
  5. {
  6. private $url;
  7. private $host;
  8. private $path;
  9. private $rule;
  10. public $type = 'dom';
  11. private $log = false;
  12. private $cur = '';
  13. static protected $instance;
  14. static public function getInstance($url, $rule = '')
  15. {
  16. $key = $url . md5($rule);
  17. if (empty(self::$instance[$key])) {
  18. # 此处开协程:Dever::coroutine(self::$instance[$key]);
  19. self::$instance[$key] = new self($url, $rule);
  20. }
  21. return self::$instance[$key];
  22. }
  23. public function __construct($url, $rule = '')
  24. {
  25. $this->url($url);
  26. $this->rule = $rule;
  27. if (strpos($this->rule, '$json') !== false) {
  28. $this->type = 'json';
  29. } else {
  30. $this->type = 'dom';
  31. }
  32. }
  33. private function url($url = false)
  34. {
  35. if (!$url) {
  36. return;
  37. }
  38. $this->url = $url;
  39. $value = parse_url($this->url);
  40. $this->path = $this->host = $value['scheme'] . '://' . $value['host'];
  41. if (isset($value['path']) && $value['path']) {
  42. $temp = explode('/', $value['path']);
  43. unset($temp[count($temp)-1]);
  44. $this->path .= implode('/', $temp);
  45. }
  46. $this->path .= '/';
  47. }
  48. public function get($param = array())
  49. {
  50. $doc = $this->doc(false, $param);
  51. if (!$this->cur) {
  52. $this->cur = $doc;
  53. }
  54. if ($this->rule) {
  55. $doc = $this->find($doc, $this->rule);
  56. }
  57. return $doc;
  58. }
  59. public function doc($url = false, $param = array())
  60. {
  61. $this->url($url);
  62. $html = $this->download($this->url, $param);
  63. return ($this->getClass())::init($html);
  64. }
  65. public function getCur()
  66. {
  67. return $this->cur;
  68. }
  69. private function download($url, $header = '', $param = '')
  70. {
  71. $this->addLog($url . '下载中...');
  72. $download = new Download($url, $header, $param);
  73. $this->addLog($url . '下载完成');
  74. return $download->get($this->type);
  75. }
  76. private function collect($data, $include, $exclude, $filter, $attr)
  77. {
  78. if ($include) {
  79. $include = Dever::split($include);
  80. foreach ($include as $k => $v) {
  81. $state = preg_match('/' . $v . '/i', $data);
  82. if ($state) {
  83. break;
  84. }
  85. }
  86. if (!$state) {
  87. return 'error';
  88. }
  89. }
  90. if ($exclude) {
  91. $exclude = Dever::split($exclude);
  92. foreach ($exclude as $k => $v) {
  93. $state = preg_match('/' . $v . '/i', $data);
  94. if (!$state) {
  95. return 'error';
  96. }
  97. }
  98. }
  99. if ($filter) {
  100. $filter = Dever::split($filter);
  101. foreach ($filter as $k => $v) {
  102. $s = '';
  103. if (strstr($v, '=>')) {
  104. $temp = explode('=>', $v);
  105. $v = $temp[0];
  106. $s = $temp[1];
  107. }
  108. $data = preg_replace('/' . $v . '/i', $s, $data);
  109. }
  110. }
  111. if ($attr) {
  112. $attr = Dever::split($attr);
  113. foreach ($attr as $k => $v) {
  114. $s = '';
  115. $v = $v . '="(.*?)"';
  116. echo $v;
  117. echo "\r\n";
  118. $data = preg_replace('/' . $v . '/i', $s, $data);
  119. }
  120. echo $data;die;
  121. }
  122. return $data;
  123. }
  124. private function getClass()
  125. {
  126. return 'Spider\\Lib\\Doc\\' . ucfirst($this->type);
  127. }
  128. public function find($doc, $rule)
  129. {
  130. return ($this->getClass())::find($doc, $rule);
  131. }
  132. public function init($data)
  133. {
  134. if (is_string($data) && filter_var($data, FILTER_VALIDATE_URL) !== false) {
  135. $data = $this->doc($data);
  136. } else {
  137. $data = ($this->getClass())::init($data);
  138. }
  139. return $data;
  140. }
  141. public function rule($data, $col, $config)
  142. {
  143. $name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
  144. $this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');
  145. $method = 'rule_' . $this->type;
  146. $result = $this->getRule($data, $col, $config['collect_rule'], $config['key']);
  147. if (isset($config['collect_url']) && $config['collect_url']) {
  148. $collect_url = Dever::split($config['collect_url']);
  149. if (!isset($collect_url[1])) {
  150. $collect_url[1] = '';
  151. }
  152. $temp = array();
  153. $temp[] = $result;
  154. $this->getNext($temp, $data, $col, $collect_url[0], $collect_url[1], $config['collect_rule'], $config['key']);
  155. $result = implode(',', $temp);
  156. }
  157. $this->addLog($name . '解析完成');
  158. return $this->collect($result, $config['collect_include'], $config['collect_exclude'], $config['collect_filter'], $config['collect_attr']);
  159. }
  160. public function getNext(&$result, $data, $col, $collect_url, $collect_include, $collect_rule, $key)
  161. {
  162. $url = $this->getUrl($data, $col, $collect_url, $key);
  163. if ($url) {
  164. if ($collect_include && !strstr($url, $collect_include)) {
  165. return;
  166. }
  167. $data = $this->init($url);
  168. if ($data) {
  169. $temp = $this->getRule($data, $col, $collect_rule, $key);
  170. if ($temp) {
  171. $result[] = $temp;
  172. $this->getNext($result, $data, $col, $collect_url, $collect_include, $collect_rule, $key);
  173. }
  174. }
  175. }
  176. }
  177. public function getUrl($data, $col, $collect_rule, $key)
  178. {
  179. $url = $this->getRule($data, $col, $collect_rule, $key);
  180. if (!$url) {
  181. return '';
  182. }
  183. if (strpos($url, 'http') === false) {
  184. if ($url[0] == '/') {
  185. $url = $this->host . $url;
  186. } elseif (strstr($url, '.')) {
  187. $url = $this->path . $url;
  188. } else {
  189. $url = $this->url . $url;
  190. }
  191. }
  192. return $url;
  193. }
  194. public function getRule($data, $col, $collect_rule, $key)
  195. {
  196. return ($this->getClass())::rule($this, $data, $col, $collect_rule, $key);
  197. }
  198. public function addLog($string)
  199. {
  200. if ($this->log) {
  201. $this->log->add($string);
  202. }
  203. }
  204. public function saveLog()
  205. {
  206. if ($this->log) {
  207. $this->log->save();
  208. }
  209. }
  210. public function outLog()
  211. {
  212. if ($this->log) {
  213. $this->log->out();
  214. }
  215. }
  216. public function log(Log $log)
  217. {
  218. $this->log = $log;
  219. }
  220. }