<?php

namespace Spider\Lib;

use Dever;
 
/**
 * Downloads a document for a URL and extracts values from it by rule.
 *
 * Parsing is delegated to a type-specific helper class under
 * Spider\Lib\Doc\ (see getClass()); the type is 'json' when the rule text
 * contains '$json' and 'dom' otherwise.
 */
class Doc
{
	/** @var string URL this instance was created for. */
	private $url;

	/** @var string "scheme://host[:port]" of $url, or '' when it cannot be parsed. */
	private $host;

	/** @var string Default rule applied by get(). */
	private $rule;

	/** @var string Parser type ('dom' or 'json'); selects the helper class. */
	public $type = 'dom';

	/** @var Log|false Logger set through log(); false disables logging. */
	private $log = false;

	/** @var array Instances created by getInstance(), keyed by URL + md5(rule). */
	static protected $instance = [];

	/**
	 * Return the cached instance for a URL/rule pair, creating it on first use.
	 *
	 * @param string $url
	 * @param string $rule
	 * @return self
	 */
	static public function getInstance($url, $rule = '')
	{
		$key = $url . md5($rule);
		if (empty(self::$instance[$key])) {
			# Coroutine hook point: Dever::coroutine(self::$instance[$key]);
			self::$instance[$key] = new self($url, $rule);
		}

		return self::$instance[$key];
	}

	/**
	 * @param string $url  Page URL; also used as the base for relative links.
	 * @param string $rule Default extraction rule; '$json' inside it selects the JSON parser.
	 */
	public function __construct($url, $rule = '')
	{
		$this->url($url);
		$this->rule = $rule;
		$this->type = strpos($this->rule, '$json') !== false ? 'json' : 'dom';
	}

	/**
	 * Store the URL and derive the origin used to resolve root-relative links.
	 *
	 * @param string $url
	 * @return void
	 */
	private function url($url)
	{
		$this->url = $url;
		$this->host = '';

		// parse_url() returns false for seriously malformed URLs and omits
		// components that are absent, so guard before indexing.
		$parts = parse_url($this->url);
		if (is_array($parts) && isset($parts['scheme'], $parts['host'])) {
			$this->host = $parts['scheme'] . '://' . $parts['host'];
			// Keep an explicit port, otherwise root-relative links would
			// point at the default port of the host.
			if (isset($parts['port'])) {
				$this->host .= ':' . $parts['port'];
			}
		}
	}

	/**
	 * Download the instance URL and, when a default rule is set, apply it.
	 *
	 * @return mixed Parsed document, or the result of find() when a rule is set.
	 */
	public function get()
	{
		$doc = $this->doc();
		if ($this->rule) {
			$doc = $this->find($doc, $this->rule);
		}

		return $doc;
	}

	/**
	 * Download a URL and hand the body to the type-specific parser.
	 *
	 * @param string|false $url URL to fetch; falls back to the instance URL when falsy.
	 * @return mixed Whatever the helper class's init() returns.
	 */
	public function doc($url = false)
	{
		$url = $url ? $url : $this->url;
		$html = $this->download($url);

		return ($this->getClass())::init($html);
	}

	/**
	 * Fetch the raw content of a URL through Download, logging around it.
	 *
	 * NOTE(review): the "complete" message is logged before get() is called;
	 * whether the transfer happens in the constructor or in get() is not
	 * visible here, so the order is left as it was.
	 *
	 * @param string $url
	 * @return mixed Result of Download::get().
	 */
	private function download($url)
	{
		$this->addLog($url . '下载中...');
		$download = new Download($url);
		$this->addLog($url . '下载完成');

		return $download->get();
	}

	/**
	 * Split a newline-separated pattern list into its non-empty lines.
	 *
	 * Lines are not trimmed because leading/trailing spaces can be part of a
	 * regular expression; only completely empty lines are dropped, since an
	 * empty pattern matches every input.
	 *
	 * @param string $text
	 * @return array
	 */
	private function splitLines($text)
	{
		$lines = explode("\n", str_replace("\r", '', $text));

		return array_values(array_filter($lines, function ($line) {
			return $line !== '';
		}));
	}

	/**
	 * Validate and clean an extracted value.
	 *
	 * Each of $include, $exclude and $filter is a newline-separated list of
	 * regular-expression bodies (wrapped as /.../i here).
	 *
	 * @param string $data    Extracted value.
	 * @param string $include At least one pattern must match, otherwise 'error'.
	 * @param string $exclude No pattern may match, otherwise 'error'.
	 * @param string $filter  Patterns to strip; "pattern=>replacement" substitutes instead.
	 * @return string The cleaned value, or the literal 'error' when rejected.
	 */
	private function collect($data, $include, $exclude, $filter)
	{
		if ($include) {
			$patterns = $this->splitLines($include);
			$matched = false;
			foreach ($patterns as $pattern) {
				if (preg_match('/' . $pattern . '/i', $data)) {
					$matched = true;
					break;
				}
			}
			// A list made only of blank lines imposes no requirement.
			if ($patterns && !$matched) {
				return 'error';
			}
		}

		if ($exclude) {
			foreach ($this->splitLines($exclude) as $pattern) {
				// Reject as soon as an excluded pattern is present. The
				// previous check was inverted and rejected data that did
				// NOT contain the pattern.
				if (preg_match('/' . $pattern . '/i', $data)) {
					return 'error';
				}
			}
		}

		if ($filter) {
			foreach ($this->splitLines($filter) as $line) {
				$replacement = '';
				if (strpos($line, '=>') !== false) {
					$parts = explode('=>', $line);
					$line = $parts[0];
					$replacement = $parts[1];
				}
				$result = preg_replace('/' . $line . '/i', $replacement, $data);
				// preg_replace() yields null on a bad pattern; keep the
				// value instead of wiping it.
				if ($result !== null) {
					$data = $result;
				}
			}
		}

		return $data;
	}

	/**
	 * Fully-qualified name of the helper class for the current type.
	 *
	 * @return string
	 */
	private function getClass()
	{
		return 'Spider\\Lib\\Doc\\' . ucfirst($this->type);
	}

	/**
	 * Apply a rule to a parsed document via the type-specific helper.
	 *
	 * @param mixed  $doc
	 * @param string $rule
	 * @return mixed
	 */
	public function find($doc, $rule)
	{
		return ($this->getClass())::find($doc, $rule);
	}

	/**
	 * Turn raw input into a parsed document.
	 *
	 * A string that validates as a URL is downloaded first; anything else is
	 * passed straight to the helper's init().
	 *
	 * @param mixed $data
	 * @return mixed
	 */
	public function init($data)
	{
		if (is_string($data) && filter_var($data, FILTER_VALIDATE_URL) !== false) {
			return $this->doc($data);
		}

		return ($this->getClass())::init($data);
	}

	/**
	 * Extract one configured field from $data and run it through collect().
	 *
	 * @param mixed $data
	 * @param mixed $col
	 * @param array $config Reads 'name', 'key', 'collect_rule' and the optional
	 *                      'collect_include', 'collect_exclude', 'collect_filter'.
	 * @return string Cleaned value, or 'error' when collect() rejects it.
	 */
	public function rule($data, $col, $config)
	{
		$name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
		$this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');

		$data = ($this->getClass())::rule($this, $data, $col, $config['collect_rule'], $config['key']);
		$this->addLog($name . '解析完成');

		// The three collect_* lists are optional; treat a missing key as empty.
		return $this->collect(
			$data,
			$config['collect_include'] ?? '',
			$config['collect_exclude'] ?? '',
			$config['collect_filter'] ?? ''
		);
	}

	/**
	 * Extract a link field and make it absolute.
	 *
	 * NOTE(review): when collect() rejects the value, the literal 'error' is
	 * treated like a relative path and appended to the page URL, exactly as
	 * before; confirm with callers whether it should be passed through.
	 *
	 * @param mixed $data
	 * @param mixed $col
	 * @param array $config See rule().
	 * @return string
	 */
	public function getUrl($data, $col, $config)
	{
		$url = $this->rule($data, $col, $config);

		// Only a leading http(s):// makes the link absolute; "http" appearing
		// later (e.g. in a query string) does not.
		if (preg_match('#^https?://#i', $url)) {
			return $url;
		}

		// Protocol-relative link: reuse the page's scheme.
		if (strncmp($url, '//', 2) === 0) {
			$scheme = parse_url($this->url, PHP_URL_SCHEME);
			return ($scheme ? $scheme : 'http') . ':' . $url;
		}

		// Root-relative link. strncmp() is safe on an empty string, unlike $url[0].
		if (strncmp($url, '/', 1) === 0) {
			return $this->host . $url;
		}

		return $this->url . $url;
	}

	/**
	 * Append a message to the attached logger, if any.
	 *
	 * @param string $string
	 * @return void
	 */
	public function addLog($string)
	{
		if ($this->log) {
			$this->log->add($string);
		}
	}

	/**
	 * Ask the attached logger, if any, to save.
	 *
	 * @return void
	 */
	public function saveLog()
	{
		if ($this->log) {
			$this->log->save();
		}
	}

	/**
	 * Ask the attached logger, if any, to output.
	 *
	 * @return void
	 */
	public function outLog()
	{
		if ($this->log) {
			$this->log->out();
		}
	}

	/**
	 * Attach the logger used by addLog(), saveLog() and outLog().
	 *
	 * @param Log $log
	 * @return void
	 */
	public function log(Log $log)
	{
		$this->log = $log;
	}
}
 
 
  |