<?php

namespace Spider\Lib;

use Dever;

/**
 * Downloads a document and extracts values from it with collection rules.
 *
 * Parsing is delegated to a static helper class chosen by {@see Doc::$type}:
 * Spider\Lib\Doc\Dom or Spider\Lib\Doc\Json (see getClass()).
 */
class Doc
{
    /** @var string|null URL most recently passed to url(). */
    private $url;

    /** @var string|null Origin of $url: scheme://host[:port], without a trailing slash. */
    private $host;

    /** @var string|null Origin plus the directory part of the URL path, with a trailing slash. */
    private $path;

    /** @var string Rule applied by get(); a rule containing '$json' selects the json parser. */
    private $rule;

    /** @var string Parser type, 'dom' or 'json'. */
    public $type = 'dom';

    /** @var Log|false Logger set through log(); false disables logging. */
    private $log = false;

    /** @var mixed First document returned by get(), or '' when none has been loaded. */
    private $cur = '';

    /** @var array<string, Doc> Instances created by getInstance(), keyed by URL + md5(rule). */
    protected static $instance = array();

    /**
     * Returns the shared instance for a URL/rule pair, creating it on first use.
     *
     * @param string $url
     * @param string $rule
     * @return Doc
     */
    public static function getInstance($url, $rule = '')
    {
        $key = $url . md5($rule);
        if (empty(self::$instance[$key])) {
            // A coroutine could be started here: Dever::coroutine(self::$instance[$key]);
            self::$instance[$key] = new self($url, $rule);
        }

        return self::$instance[$key];
    }

    /**
     * @param string $url  Page to work on.
     * @param string $rule Rule used by get(); selects the json parser when it contains '$json'.
     */
    public function __construct($url, $rule = '')
    {
        $this->url($url);
        $this->rule = $rule;
        $this->type = strpos($this->rule, '$json') !== false ? 'json' : 'dom';
    }

    /**
     * Records the current URL and derives the bases used to resolve relative links.
     *
     * A falsy $url leaves the current state untouched.
     *
     * @param string|false $url
     * @return void
     */
    private function url($url = false)
    {
        if (!$url) {
            return;
        }

        $this->url = $url;
        $parts = parse_url($this->url);

        // `??` keeps the result for incomplete URLs ('://') without raising warnings.
        $origin = ($parts['scheme'] ?? '') . '://' . ($parts['host'] ?? '');
        if (isset($parts['port'])) {
            // Keep a non-default port, otherwise relative links resolve against the wrong origin.
            $origin .= ':' . $parts['port'];
        }
        $this->host = $origin;
        $this->path = $origin;

        if (!empty($parts['path'])) {
            // Drop the last path segment (the file name) to get the directory.
            $segments = explode('/', $parts['path']);
            array_pop($segments);
            $this->path .= implode('/', $segments);
        }
        $this->path .= '/';
    }

    /**
     * Loads the current URL and, when the instance has a rule, applies it.
     *
     * The first loaded document is remembered and exposed through getCur().
     *
     * @param mixed $param Forwarded to doc().
     * @return mixed The parsed document, or the result of find() when a rule is set.
     */
    public function get($param = array())
    {
        $doc = $this->doc(false, $param);
        if (!$this->cur) {
            $this->cur = $doc;
        }
        if ($this->rule) {
            $doc = $this->find($doc, $this->rule);
        }

        return $doc;
    }

    /**
     * Downloads a URL (or the current one when $url is falsy) and parses it.
     *
     * @param string|false $url
     * @param mixed        $param
     * @return mixed Result of the parser class's init().
     */
    public function doc($url = false, $param = array())
    {
        $this->url($url);
        // NOTE(review): $param lands in download()'s $header slot - confirm this is intended.
        $html = $this->download($this->url, $param);

        return ($this->getClass())::init($html);
    }

    /**
     * @return mixed The document stored by the first get() call, or ''.
     */
    public function getCur()
    {
        return $this->cur;
    }

    /**
     * Builds a Download for the URL, logs progress, and returns its get() result.
     *
     * @param string $url
     * @param mixed  $header
     * @param mixed  $param
     * @return mixed
     */
    private function download($url, $header = '', $param = '')
    {
        $this->addLog($url . '下载中...');
        $download = new Download($url, $header, $param);
        $this->addLog($url . '下载完成');

        return $download->get($this->type);
    }

    /**
     * Validates and cleans a collected value.
     *
     * Each of $include, $exclude and $filter holds one case-insensitive regular
     * expression (without delimiters) per line; blank lines are ignored.
     *  - include: at least one pattern must match, otherwise 'error' is returned.
     *  - exclude: if any pattern matches, 'error' is returned.
     *  - filter:  every match is removed, or replaced when the line has the form
     *             "pattern=>replacement".
     *
     * @param string $data
     * @param string $include
     * @param string $exclude
     * @param string $filter
     * @return string The filtered data, or the literal string 'error'.
     */
    private function collect($data, $include, $exclude, $filter)
    {
        $required = $this->splitLines($include);
        if ($required && !$this->matchesAny($required, $data)) {
            return 'error';
        }

        // Reject on a match: an exclude pattern that is absent from the data is the good case.
        if ($this->matchesAny($this->splitLines($exclude), $data)) {
            return 'error';
        }

        foreach ($this->splitLines($filter) as $line) {
            $pattern = $line;
            $replacement = '';
            if (strpos($line, '=>') !== false) {
                // Limit to 2 so a replacement may itself contain '=>'.
                list($pattern, $replacement) = explode('=>', $line, 2);
            }

            $replaced = preg_replace('/' . $pattern . '/i', $replacement, $data);
            if ($replaced !== null) {
                // preg_replace() returns null on failure; keep the data in that case.
                $data = $replaced;
            }
        }

        return $data;
    }

    /**
     * Splits a multi-line setting into its non-empty lines ("\r" is stripped).
     *
     * @param string $text
     * @return string[]
     */
    private function splitLines($text)
    {
        if (!$text) {
            return array();
        }

        $lines = explode("\n", str_replace("\r", '', $text));

        // An empty line would become the pattern '//i', which matches everything.
        return array_values(array_filter($lines, function ($line) {
            return $line !== '';
        }));
    }

    /**
     * Tells whether at least one of the patterns matches the data (case-insensitive).
     *
     * @param string[] $patterns Regular expressions without delimiters.
     * @param string   $data
     * @return bool
     */
    private function matchesAny(array $patterns, $data)
    {
        foreach ($patterns as $pattern) {
            if (preg_match('/' . $pattern . '/i', $data)) {
                return true;
            }
        }

        return false;
    }

    /**
     * @return string Fully-qualified name of the parser class for the current type.
     */
    private function getClass()
    {
        return 'Spider\\Lib\\Doc\\' . ucfirst($this->type);
    }

    /**
     * Applies a rule to a document through the parser class.
     *
     * @param mixed  $doc
     * @param string $rule
     * @return mixed
     */
    public function find($doc, $rule)
    {
        return ($this->getClass())::find($doc, $rule);
    }

    /**
     * Turns $data into a parsed document.
     *
     * A string that validates as a URL is downloaded first; anything else is
     * handed to the parser class directly.
     *
     * @param mixed $data
     * @return mixed
     */
    public function init($data)
    {
        if (is_string($data) && filter_var($data, FILTER_VALIDATE_URL) !== false) {
            return $this->doc($data);
        }

        return ($this->getClass())::init($data);
    }

    /**
     * Extracts one field from $data according to its configuration.
     *
     * Reads the keys 'name', 'key' and 'collect_rule' from $config, plus the optional
     * 'collect_url', 'collect_include', 'collect_exclude' and 'collect_filter'.
     * When 'collect_url' is set, its first line is the rule locating the next page
     * and its optional second line is a substring the next URL must contain; the
     * values of all followed pages are joined with ','.
     *
     * @param mixed $data
     * @param mixed $col
     * @param array $config
     * @return string Result of collect(): the cleaned value or 'error'.
     */
    public function rule($data, $col, $config)
    {
        $name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
        $this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');

        $result = $this->getRule($data, $col, $config['collect_rule'], $config['key']);

        if (!empty($config['collect_url'])) {
            $lines = explode("\n", str_replace("\r", '', $config['collect_url']));
            $nextRule = $lines[0];
            $nextInclude = isset($lines[1]) ? $lines[1] : '';

            $pages = array($result);
            $this->getNext($pages, $data, $col, $nextRule, $nextInclude, $config['collect_rule'], $config['key']);
            $result = implode(',', $pages);
        }

        $this->addLog($name . '解析完成');

        return $this->collect(
            $result,
            $config['collect_include'] ?? '',
            $config['collect_exclude'] ?? '',
            $config['collect_filter'] ?? ''
        );
    }

    /**
     * Follows "next page" links starting from $data and appends each page's value to $result.
     *
     * The walk stops when no next URL is found, when the URL does not contain
     * $collect_include, when the page or its value is empty, or when the URL was
     * already fetched during this walk (which would otherwise never terminate).
     *
     * @param array  $result          Collected values; appended to in place.
     * @param mixed  $data            Document to read the first "next" link from.
     * @param mixed  $col
     * @param string $collect_url     Rule that extracts the next page URL.
     * @param string $collect_include Substring the next URL must contain ('' = no check).
     * @param string $collect_rule    Rule that extracts the value on each page.
     * @param mixed  $key
     * @return void
     */
    public function getNext(&$result, $data, $col, $collect_url, $collect_include, $collect_rule, $key)
    {
        // Only URLs fetched by this walk are tracked; $this->url may be stale from an earlier walk.
        $visited = array();

        while (true) {
            $url = $this->getUrl($data, $col, $collect_url, $key);
            if (!$url) {
                return;
            }
            if ($collect_include && strpos($url, $collect_include) === false) {
                return;
            }
            if (isset($visited[$url])) {
                return;
            }
            $visited[$url] = true;

            $data = $this->init($url);
            if (!$data) {
                return;
            }

            $value = $this->getRule($data, $col, $collect_rule, $key);
            if (!$value) {
                return;
            }
            $result[] = $value;
        }
    }

    /**
     * Extracts a link with a rule and makes it absolute.
     *
     * Resolution, in order:
     *  - http(s):// links are returned as they are;
     *  - //host/... links get the scheme of the current page;
     *  - /... links are appended to the current origin;
     *  - links containing a '.' are appended to the current directory;
     *  - anything else is appended to the current URL.
     *
     * @param mixed  $data
     * @param mixed  $col
     * @param string $collect_rule
     * @param mixed  $key
     * @return string Absolute URL, or '' when the rule yields nothing.
     */
    public function getUrl($data, $col, $collect_rule, $key)
    {
        $url = $this->getRule($data, $col, $collect_rule, $key);
        if (!$url) {
            return '';
        }

        $url = trim($url);
        if ($url === '') {
            return '';
        }

        // Anchor the scheme test: a relative link may carry "http" in its query string.
        if (preg_match('#^https?://#i', $url)) {
            return $url;
        }

        if (strpos($url, '//') === 0) {
            $scheme = strstr((string) $this->host, '://', true);

            return ($scheme ? $scheme : 'http') . ':' . $url;
        }

        if ($url[0] === '/') {
            return $this->host . $url;
        }

        if (strpos($url, '.') !== false) {
            return $this->path . $url;
        }

        return $this->url . $url;
    }

    /**
     * Evaluates a collection rule through the parser class.
     *
     * @param mixed  $data
     * @param mixed  $col
     * @param string $collect_rule
     * @param mixed  $key
     * @return mixed
     */
    public function getRule($data, $col, $collect_rule, $key)
    {
        return ($this->getClass())::rule($this, $data, $col, $collect_rule, $key);
    }

    /**
     * Adds a message to the logger, if one is attached.
     *
     * @param string $string
     * @return void
     */
    public function addLog($string)
    {
        if ($this->log) {
            $this->log->add($string);
        }
    }

    /**
     * Calls save() on the attached logger, if any.
     *
     * @return void
     */
    public function saveLog()
    {
        if ($this->log) {
            $this->log->save();
        }
    }

    /**
     * Calls out() on the attached logger, if any.
     *
     * @return void
     */
    public function outLog()
    {
        if ($this->log) {
            $this->log->out();
        }
    }

    /**
     * Attaches the logger used by addLog(), saveLog() and outLog().
     *
     * @param Log $log
     * @return void
     */
    public function log(Log $log)
    {
        $this->log = $log;
    }
}