123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245 |
- <?php
- namespace Spider\Lib;
- use Dever;
/**
 * Fetches one remote document (HTML or JSON) and extracts values from it
 * by delegating to a type-specific handler class.
 *
 * The handler is `Spider\Lib\Doc\Dom` or `Spider\Lib\Doc\Json`, picked from
 * the rule string: a rule containing the literal `$json` selects JSON mode.
 * The handler is expected to expose static `init()`, `find()` and `rule()`
 * methods (they are called below; their implementations live elsewhere).
 */
class Doc
{
    /** @var string|null URL of the document most recently targeted. */
    private $url;

    /** @var string URL scheme of $url (e.g. "http"), used for "//host/..." links. */
    private $scheme = '';

    /** @var string Origin of $url: scheme://host[:port], no trailing slash. */
    private $host;

    /** @var string Origin plus the directory part of the URL path, with a trailing slash. */
    private $path;

    /** @var string Extraction rule given at construction time. */
    private $rule;

    /** @var string Handler type, 'dom' or 'json'. */
    public $type = 'dom';

    /** @var Log|false Attached logger, or false when logging is off. */
    private $log = false;

    /** @var mixed First document returned by get(); '' until then. */
    private $cur = '';

    /** @var array<string, self> Instances cached per url + rule. */
    protected static $instance = [];

    /**
     * Returns the cached instance for this url/rule pair, creating it on first use.
     *
     * @param string $url
     * @param string $rule
     * @return self
     */
    public static function getInstance($url, $rule = '')
    {
        $key = $url . md5((string) $rule);
        if (!isset(self::$instance[$key])) {
            // A coroutine could be started here: Dever::coroutine(self::$instance[$key]);
            self::$instance[$key] = new self($url, $rule);
        }

        return self::$instance[$key];
    }

    /**
     * @param string $url  Document URL.
     * @param string $rule Extraction rule; containing '$json' switches to JSON mode.
     */
    public function __construct($url, $rule = '')
    {
        $this->url($url);
        $this->rule = $rule;
        $this->type = strpos((string) $rule, '$json') !== false ? 'json' : 'dom';
    }

    /**
     * Records $url as the current URL and derives scheme, host and path from it.
     * A falsy $url leaves the current state untouched.
     *
     * @param string|false $url
     * @return void
     */
    private function url($url = false)
    {
        if (!$url) {
            return;
        }

        $this->url = $url;

        // parse_url() returns false on seriously malformed input; treat that
        // like "no components" rather than indexing into a boolean.
        $parts = parse_url($url);
        if (!is_array($parts)) {
            $parts = [];
        }

        $this->scheme = $parts['scheme'] ?? '';
        $origin = $this->scheme . '://' . ($parts['host'] ?? '');
        if (isset($parts['port'])) {
            // Keep a non-default port, otherwise relative links would be
            // resolved against the wrong origin.
            $origin .= ':' . $parts['port'];
        }

        $this->host = $origin;
        $this->path = $origin;

        if (!empty($parts['path'])) {
            // Drop the last path segment (the file name) to get the directory.
            $segments = explode('/', $parts['path']);
            array_pop($segments);
            $this->path .= implode('/', $segments);
        }
        $this->path .= '/';
    }

    /**
     * Downloads the current URL and, when a rule was configured, applies it.
     * The first document fetched is remembered and exposed through getCur().
     *
     * @param mixed $param Forwarded to doc().
     * @return mixed The parsed document, or the result of find() when a rule is set.
     */
    public function get($param = array())
    {
        $doc = $this->doc(false, $param);

        if (!$this->cur) {
            $this->cur = $doc;
        }

        if ($this->rule) {
            return $this->find($doc, $this->rule);
        }

        return $doc;
    }

    /**
     * Downloads a document and hands the payload to the type handler's init().
     *
     * @param string|false $url   New URL to switch to, or false to reuse the current one.
     * @param mixed        $param Passed to download() as its second argument.
     * @return mixed Whatever the handler's init() returns.
     */
    public function doc($url = false, $param = array())
    {
        $this->url($url);

        // NOTE(review): $param lands in download()'s $header slot, not its
        // $param slot. Kept as-is because Download's contract is not visible
        // here; confirm this is intended.
        $html = $this->download($this->url, $param);

        return ($this->getClass())::init($html);
    }

    /**
     * @return mixed The first document returned by get(), or '' if none yet.
     */
    public function getCur()
    {
        return $this->cur;
    }

    /**
     * Fetches $url through the Download helper, logging start and completion.
     *
     * @param string $url
     * @param mixed  $header
     * @param mixed  $param
     * @return mixed Result of Download::get() for the current type.
     */
    private function download($url, $header = '', $param = '')
    {
        $this->addLog($url . '下载中...');
        $download = new Download($url, $header, $param);
        $content = $download->get($this->type);
        // Log completion only once the payload has actually been retrieved.
        $this->addLog($url . '下载完成');

        return $content;
    }

    /**
     * Validates and cleans an extracted value.
     *
     * Each of $include/$exclude/$filter/$attr is a list in whatever format
     * Dever::split() accepts; entries are used as case-insensitive regex
     * fragments wrapped in '/' delimiters.
     *
     * @param string $data    Extracted value.
     * @param string $include Value must match at least one of these patterns.
     * @param string $exclude Value must match none of these patterns.
     * @param string $filter  Patterns to replace; "pattern=>replacement" sets the replacement, default ''.
     * @param string $attr    Attribute names whose `name="..."` occurrences are stripped.
     * @return string The cleaned value, or the literal 'error' when validation fails.
     */
    private function collect($data, $include, $exclude, $filter, $attr)
    {
        if ($include) {
            $matched = false;
            foreach (Dever::split($include) as $pattern) {
                if (preg_match('/' . $pattern . '/i', $data)) {
                    $matched = true;
                    break;
                }
            }
            if (!$matched) {
                return 'error';
            }
        }

        if ($exclude) {
            foreach (Dever::split($exclude) as $pattern) {
                // Reject as soon as an excluded pattern IS present. The
                // previous check was inverted and rejected values that did
                // not contain the excluded pattern.
                if (preg_match('/' . $pattern . '/i', $data)) {
                    return 'error';
                }
            }
        }

        if ($filter) {
            foreach (Dever::split($filter) as $entry) {
                $replacement = '';
                if (strpos($entry, '=>') !== false) {
                    $pair = explode('=>', $entry);
                    $entry = $pair[0];
                    $replacement = $pair[1];
                }
                $data = preg_replace('/' . $entry . '/i', $replacement, $data);
            }
        }

        if ($attr) {
            foreach (Dever::split($attr) as $attribute) {
                $data = preg_replace('/' . $attribute . '="(.*?)"' . '/i', '', $data);
            }
        }

        return $data;
    }

    /**
     * @return string Fully-qualified name of the handler class for the current type.
     */
    private function getClass()
    {
        return 'Spider\\Lib\\Doc\\' . ucfirst($this->type);
    }

    /**
     * Applies $rule to $doc through the type handler.
     *
     * @param mixed  $doc
     * @param string $rule
     * @return mixed
     */
    public function find($doc, $rule)
    {
        return ($this->getClass())::find($doc, $rule);
    }

    /**
     * Turns $data into a parsed document. A string that validates as a URL
     * is downloaded first; anything else is passed straight to the handler.
     *
     * @param mixed $data URL or raw payload.
     * @return mixed
     */
    public function init($data)
    {
        if (is_string($data) && filter_var($data, FILTER_VALIDATE_URL) !== false) {
            return $this->doc($data);
        }

        return ($this->getClass())::init($data);
    }

    /**
     * Extracts one configured field from $data, optionally following a chain
     * of "next page" links, then validates/cleans the result via collect().
     *
     * Reads these $config keys: name, key, collect_rule (required);
     * collect_url, collect_include, collect_exclude, collect_filter,
     * collect_attr (optional).
     *
     * @param mixed $data   Parsed document.
     * @param mixed $col    Passed through to the handler's rule().
     * @param array $config Field configuration.
     * @return string Cleaned value, or 'error' when collect() rejects it.
     */
    public function rule($data, $col, $config)
    {
        $name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
        $this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');

        $result = $this->getRule($data, $col, $config['collect_rule'], $config['key']);

        if (!empty($config['collect_url'])) {
            // collect_url is "<rule for the next link>[, <substring the link must contain>]".
            $collect_url = Dever::split($config['collect_url']);
            if (!isset($collect_url[1])) {
                $collect_url[1] = '';
            }

            $pages = array($result);
            $this->getNext($pages, $data, $col, $collect_url[0], $collect_url[1], $config['collect_rule'], $config['key']);
            $result = implode(',', $pages);
        }

        $this->addLog($name . '解析完成');

        return $this->collect(
            $result,
            $config['collect_include'] ?? '',
            $config['collect_exclude'] ?? '',
            $config['collect_filter'] ?? '',
            $config['collect_attr'] ?? ''
        );
    }

    /**
     * Follows "next page" links starting from $data and appends the value
     * extracted from every followed page to $result.
     *
     * Stops when no link is found, the link fails the $collect_include
     * substring check, the page cannot be initialised, the page yields no
     * value, or the link points at a page already fetched during this call
     * (guards against pagination cycles that would otherwise never end).
     *
     * @param array  $result          Accumulator, extended in place.
     * @param mixed  $data            Parsed document to start from.
     * @param mixed  $col
     * @param string $collect_url     Rule that extracts the next link.
     * @param string $collect_include Substring the link must contain ('' disables the check).
     * @param string $collect_rule    Rule that extracts the value on each page.
     * @param mixed  $key
     * @return void
     */
    public function getNext(&$result, $data, $col, $collect_url, $collect_include, $collect_rule, $key)
    {
        $fetched = [];

        while (true) {
            $url = $this->getUrl($data, $col, $collect_url, $key);
            if (!$url || isset($fetched[$url])) {
                return;
            }
            if ($collect_include && strpos($url, $collect_include) === false) {
                return;
            }

            $fetched[$url] = true;

            $data = $this->init($url);
            if (!$data) {
                return;
            }

            $value = $this->getRule($data, $col, $collect_rule, $key);
            if (!$value) {
                return;
            }

            $result[] = $value;
        }
    }

    /**
     * Extracts a link from $data with $collect_rule and makes it absolute
     * relative to the current URL.
     *
     * @param mixed  $data
     * @param mixed  $col
     * @param string $collect_rule
     * @param mixed  $key
     * @return string Absolute URL, or '' when the rule produced nothing.
     */
    public function getUrl($data, $col, $collect_rule, $key)
    {
        $url = $this->getRule($data, $col, $collect_rule, $key);
        if (!$url) {
            return '';
        }

        return $this->resolveUrl($url);
    }

    /**
     * Resolves a link found in the current document against the current URL.
     *
     * @param string $url Non-empty link.
     * @return string
     */
    private function resolveUrl($url)
    {
        // Only a leading http(s):// makes the link absolute; "http" appearing
        // later (e.g. inside a query string) must not.
        if (preg_match('#^https?://#i', $url)) {
            return $url;
        }

        // Protocol-relative link: reuse the current scheme.
        if (strpos($url, '//') === 0) {
            return ($this->scheme !== '' ? $this->scheme : 'http') . ':' . $url;
        }

        // Root-relative link.
        if ($url[0] === '/') {
            return $this->host . $url;
        }

        // Looks like a file name: relative to the current directory.
        if (strpos($url, '.') !== false) {
            return $this->path . $url;
        }

        // Anything else (e.g. "?page=2") is appended to the current URL.
        return $this->url . $url;
    }

    /**
     * Runs the type handler's rule() for one field.
     *
     * @param mixed  $data
     * @param mixed  $col
     * @param string $collect_rule
     * @param mixed  $key
     * @return mixed
     */
    public function getRule($data, $col, $collect_rule, $key)
    {
        return ($this->getClass())::rule($this, $data, $col, $collect_rule, $key);
    }

    /**
     * Adds a line to the attached log; does nothing when no log is attached.
     *
     * @param string $string
     * @return void
     */
    public function addLog($string)
    {
        if ($this->log) {
            $this->log->add($string);
        }
    }

    /**
     * Calls save() on the attached log, if any.
     *
     * @return void
     */
    public function saveLog()
    {
        if ($this->log) {
            $this->log->save();
        }
    }

    /**
     * Calls out() on the attached log, if any.
     *
     * @return void
     */
    public function outLog()
    {
        if ($this->log) {
            $this->log->out();
        }
    }

    /**
     * Attaches the logger used by addLog(), saveLog() and outLog().
     *
     * @param Log $log
     * @return void
     */
    public function log(Log $log)
    {
        $this->log = $log;
    }
}
|