dever 3 年之前
父節點
當前提交
a4e0429bc5
共有 3 個文件被更改,包括 59 次插入和 11 次删除
  1. 2 2
      database/col.php
  2. 56 9
      lib/Doc.php
  3. 1 0
      lib/doc/Core.php

+ 2 - 2
database/col.php

@@ -170,12 +170,12 @@ return array
 		'collect_url'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集链接规则-如果相同规则下还有其他页面,这里填写获取该页面的规则',
+			'name' 		=> '采集链接规则-如果当前页面里仍然下一页内容,这里设置获取下一页内容的链接规则,第二行可以输入链接中要包含的字符',
 			'default' 	=> '',
 			'desc' 		=> '采集链接规则',
 			'match' 	=> 'option',
 			'update'	=> 'textarea',
-			'list'		=> true,
+			//'list'		=> true,
 			'edit'		=> 'textarea',
 		),
 

+ 56 - 9
lib/Doc.php

@@ -6,6 +6,7 @@ class Doc
 {
 	private $url;
 	private $host;
+	private $path;
 	private $rule;
 	public $type = 'dom';
 	private $log = false;
@@ -33,11 +34,20 @@ class Doc
 		}
 	}
 
-	private function url($url)
+	private function url($url = false)
 	{
+		if (!$url) {
+			return;
+		}
 		$this->url = $url;
 		$value = parse_url($this->url);
-		$this->host = $value['scheme'] . '://' . $value['host'];
+		$this->path = $this->host = $value['scheme'] . '://' . $value['host'];
+		if (isset($value['path']) && $value['path']) {
+			$temp = explode('/', $value['path']);
+			unset($temp[count($temp)-1]);
+			$this->path .= implode('/', $temp);
+		}
+		$this->path .= '/';
 	}
 
 	public function get($param = array())
@@ -51,8 +61,8 @@ class Doc
 
 	public function doc($url = false, $param = array())
 	{
-		$url = $url ? $url : $this->url;
-		$html = $this->download($url, $param);
+		$this->url($url);
+		$html = $this->download($this->url, $param);
 		return ($this->getClass())::init($html);
 	}
 
@@ -127,25 +137,58 @@ class Doc
 
 	public function rule($data, $col, $config)
 	{
-
 		$name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
 		$this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');
 
 		$method = 'rule_' . $this->type;
 
-		$data = ($this->getClass())::rule($this, $data, $col, $config['collect_rule'], $config['key']);
+		$result = $this->getRule($data, $col, $config['collect_rule'], $config['key']);
 
+		if (isset($config['collect_url']) && $config['collect_url']) {
+			$collect_url = explode("\n", str_replace("\r", '', $config['collect_url']));
+			if (!isset($collect_url[1])) {
+				$collect_url[1] = '';
+			}
+			$temp = array();
+			$temp[] = $result;
+			$this->getNext($temp, $data, $col, $collect_url[0], $collect_url[1], $config['collect_rule'], $config['key']);
+			$result = implode(',', $temp);
+		}
+		
 		$this->addLog($name . '解析完成');
 
-		return $this->collect($data, $config['collect_include'], $config['collect_exclude'], $config['collect_filter']);
+		return $this->collect($result, $config['collect_include'], $config['collect_exclude'], $config['collect_filter']);
 	}
 
-	public function getUrl($data, $col, $config)
+	public function getNext(&$result, $data, $col, $collect_url, $collect_include, $collect_rule, $key)
 	{
-		$url = $this->rule($data, $col, $config);
+		$url = $this->getUrl($data, $col, $collect_url, $key);
+		if ($url) {
+			if ($collect_include && !strstr($url, $collect_include)) {
+				return;
+			}
+			$data = $this->init($url);
+			if ($data) {
+				$temp = $this->getRule($data, $col, $collect_rule, $key);
+				if ($temp) {
+					$result[] = $temp;
+					$this->getNext($result, $data, $col, $collect_url, $collect_include, $collect_rule, $key);
+				}
+			}
+		}
+	}
+
+	public function getUrl($data, $col, $collect_rule, $key)
+	{
+		$url = $this->getRule($data, $col, $collect_rule, $key);
+		if (!$url) {
+			return '';
+		}
 		if (strpos($url, 'http') === false) {
 			if ($url[0] == '/') {
 				$url = $this->host . $url;
+			} elseif (strstr($url, '.')) {
+				$url = $this->path . $url;
 			} else {
 				$url = $this->url . $url;
 			}
@@ -153,6 +196,10 @@ class Doc
 		return $url;
 	}
 
+	public function getRule($data, $col, $collect_rule, $key)
+	{
+		return ($this->getClass())::rule($this, $data, $col, $collect_rule, $key);
+	}
 
 	public function addLog($string)
 	{

+ 1 - 0
lib/doc/Core.php

@@ -20,6 +20,7 @@ class Core
 
 	public static function rule($doc, $data, $col, $rule, $key)
 	{
+		$result = '';
 		$rule = explode("\n", str_replace("\r", '', $rule));
 		if (isset($rule[0]) && $rule[0]) {
 			if (isset($col[$rule[0]]) && $rule[0] != $key) {