rabin 7 years ago
parent
commit
804208c20a
15 changed files with 229 additions and 157 deletions
  1. 3 1
      database/col.php
  2. 2 12
      database/project.php
  3. 11 0
      lib/Api.php
  4. 10 10
      lib/Doc.php
  5. 6 3
      lib/Download.php
  6. 1 1
      lib/Log.php
  7. 11 10
      lib/Parse.php
  8. 2 2
      lib/Project.php
  9. 29 2
      lib/Queue.php
  10. 20 0
      lib/doc/Core.php
  11. 38 23
      lib/doc/Dom.php
  12. 15 12
      lib/doc/Json.php
  13. 18 18
      src/Cate.php
  14. 39 39
      src/Collect.php
  15. 24 24
      src/Data.php

+ 3 - 1
database/col.php

@@ -15,7 +15,9 @@ return array
 	'menu' => false,
 	# 后台菜单排序
 	'order' => 6,
-	'desc' => '字段唯一标识符:如果用逗号隔开,则前面的是dever公共函数,如maketime.date,则会调用Dever::maketime()来处理date的数据',
+	'desc' => '字段唯一标识符:如果用逗号隔开,则前面的是dever公共函数,如maketime.date,则会调用Dever::maketime()来处理date的数据
+	<br />
+	<a href="'.Dever::url('spider/lib/api.run?id=' . Dever::input('search_option_project_id', -1) . '&test=1').'" target="_blank">点此进行测试</a>:请按住ctrl打开新页面并查看源代码',
 	# 数据结构
 	'struct' => array
 	(

+ 2 - 12
database/project.php

@@ -138,21 +138,11 @@ return array
 			'list'		=> true,
 		),
 
-		'page_rule'		=> array
-		(
-			'type' 		=> 'varchar-300',
-			'name' 		=> '采集页数规则-填写采集的页数的正则表达式',
-			'default' 	=> '',
-			'desc' 		=> '采集页数规则',
-			'match' 	=> 'option',
-			'update'	=> 'textarea',
-		),
-
 		'page_num'		=> array
 		(
 			'type' 		=> 'int-11',
-			'name' 		=> '采集页数',
-			'default' 	=> '10',
+			'name' 		=> '采集页数-值为0则默认采集1000页',
+			'default' 	=> '0',
 			'desc' 		=> '采集页数',
 			'match' 	=> 'option',
 			'update'	=> 'text',

+ 11 - 0
lib/Api.php

@@ -15,6 +15,17 @@ class Api
 		return 'yes';
 	}
 
+	/**
+	 * Run the queue of one project until it reports nothing more to do.
+	 *
+	 * Builds a Project for the given id, reads its configuration with get()
+	 * and calls queue() repeatedly until that call returns a falsy value.
+	 *
+	 * @param mixed $id   Project identifier handed to the Project constructor.
+	 * @param bool  $ajax Accepted for the caller's convenience; not read here.
+	 * @return string Always the literal 'yes'.
+	 */
+	public function test_api($id, $ajax = false)
+	{
+		$spiderProject = new Project($id);
+		$settings = $spiderProject->get();
+
+		// queue() always runs at least once; its first falsy result ends the loop.
+		do {
+			$more = $this->queue($settings);
+		} while ($more);
+
+		return 'yes';
+	}
+
 	private function queue($config)
 	{
 		$url = $config['queue']->pop();

+ 10 - 10
lib/Doc.php

@@ -12,14 +12,14 @@ class Doc
 	static protected $instance;
 
 	static public function getInstance($url, $rule = '')
-    {
-        $key = $url . md5($rule);
-        if (empty(self::$instance[$key])) {
-            self::$instance[$key] = new self($url, $rule);
-        }
+	{
+		$key = $url . md5($rule);
+		if (empty(self::$instance[$key])) {
+			self::$instance[$key] = new self($url, $rule);
+		}
 
-        return self::$instance[$key];
-    }
+		return self::$instance[$key];
+	}
 
 	public function __construct($url, $rule = '')
 	{
@@ -65,10 +65,10 @@ class Doc
 
 	private function collect($data, $include, $exclude, $filter)
 	{
-		if ($include && strpos($data, $include) === false) {
+		if ($include && !preg_match('/' . $include . '/i', $data)) {
 			return 'error';
 		}
-		if ($exclude && strpos($data, $exclude) !== false) {
+		if ($exclude && preg_match('/' . $exclude . '/i', $data)) {
 			return 'error';
 		}
 		if ($filter) {
@@ -95,7 +95,7 @@ class Doc
 
 		$method = 'rule_' . $this->type;
 
-		$data = ($this->getClass())::rule($this, $data, $col, $config['collect_rule']);
+		$data = ($this->getClass())::rule($this, $data, $col, $config['collect_rule'], $config['key']);
 
 		$this->addLog($name . '解析完成');
 

+ 6 - 3
lib/Download.php

@@ -19,12 +19,15 @@ class Download
 	private function filter($string)
 	{
 		$encode = mb_detect_encoding($string, array('GB2312','GBK','UTF-8'));
-		if ($encode == 'GB2312' || $encode == 'GBK' || $encode == 'EUC-CN') {
-			$string = \iconv('GBK', 'UTF-8', $string);
+		$config = array('GB2312', 'GBK', 'EUC-CN', 'CP936');
+		if (in_array($encode, $config)) {
+			$string = iconv('GBK', 'UTF-8', $string);
 		}
+		/*
 		if ($encode == 'CP936') {
-			$string = \iconv('SJIS', 'UTF-8', $string);
+			$string = iconv('SJIS', 'UTF-8', $string);
 		}
+		*/
 		$string = str_replace(PHP_EOL, '', $string);
 		return $string;
 	}

+ 1 - 1
lib/Log.php

@@ -41,7 +41,7 @@ class Log
 
 	public function out()
 	{
-		print_r($this->content);
+		print_r(Dever::table($this->content));
 	}
 
 	public function save()

+ 11 - 10
lib/Parse.php

@@ -15,9 +15,10 @@ class Parse
 		$doc = Doc::getInstance($url, $rule);
 		$doc->log(new Log($project));
 		$data = $doc->get();
-		foreach ($data as $k => $v) {
-			print_r($data);die;
-			$this->data[$k] = $this->load($doc, $v, $col);
+		if ($data) {
+			foreach ($data as $k => $v) {
+				$this->data[$k] = $this->load($doc, $v, $col, $project);
+			}
 		}
 		$doc->saveLog();
 	}
@@ -27,16 +28,16 @@ class Parse
 		return $this->data;
 	}
 
-	private function load($doc, $data, $col)
+	private function load($doc, $data, $col, $project)
 	{
-		foreach ($data as $v) {
+		$result = array();
+		foreach ($col as $v) {
 			$callback = false;
 			if (strpos($v['key'], '.') !== false) {
 				$temp = explode('.', $v['key']);
 				$v['key'] = $temp[1];
 				$callback = $temp[0];
 			}
-
 			$value = $doc->rule($data, $col, $v);
 			if ($value == 'error') {
 				break;
@@ -45,14 +46,14 @@ class Parse
 				$value = Dever::{$callback}($value);
 			}
 
-			$data[$v['key']] = $value;
+			$result[$v['key']] = $value;
 		}
 		if (Dever::input('test') == 1) {
 			$doc->outLog();
-			print_r($data);die;
+			print_r(Dever::table($result));die;
 		}
-		$this->update($data, $project);
-		return $data;
+		$this->update($result, $project);
+		return $result;
 	}
 
 	private function update($data, $project)

+ 2 - 2
lib/Project.php

@@ -45,6 +45,6 @@ class Project
 		$this->config['col'] = Dever::db('spider/col')->getList(['where_project_id' => $this->config['id']]);
 		$this->config['url'] = explode("\r\n", $this->config['site']);
 		$this->config['queue'] = new Queue();
-		array_walk($this->config['url'], [$this->config['queue'], 'push']);
+		array_walk($this->config['url'], [$this->config['queue'], 'push'], $this->config['page_num']);
 	}
-}
+}

+ 29 - 2
lib/Queue.php

@@ -8,9 +8,13 @@ class Queue
 {
 	private $data = array();
 
-	public function push($value)
+	public function push($value, $key = 0, $num = 0)
 	{
-		array_push($this->data, $value);
+		if (strpos($value, '{') !== false && strpos($value, '}') !== false) {
+			$this->preg($value, $num);
+		} else {
+			array_push($this->data, $value);
+		}
 		return true;
 	}
 
@@ -18,4 +22,27 @@ class Queue
 	{
 		return array_shift($this->data);
 	}
+
+	/**
+	 * Read the first "{...}" placeholder of a URL template and hand it to page().
+	 *
+	 * The text between the braces is parsed as a query string with parse_str().
+	 * Templates whose first placeholder is missing or empty are ignored.
+	 *
+	 * @param string $value URL template containing a "{...}" placeholder.
+	 * @param int    $num   Upper page bound; any value <= 0 is replaced by 1000.
+	 * @return void
+	 */
+	private function preg($value, $num)
+	{
+		preg_match_all('/{(.*?)}/i', $value, $found);
+
+		// Nothing to expand when the first capture is absent or falsy.
+		if (empty($found[1][0])) {
+			return;
+		}
+
+		$limit = $num <= 0 ? 1000 : $num;
+		parse_str($found[1][0], $query);
+		$this->page($query, $found[0][0], $value, $limit);
+	}
+
+	/**
+	 * Push the concrete URLs produced from one placeholder.
+	 *
+	 * With a 'page' entry in $param, one URL is pushed for every counter value
+	 * from $param['page'] up to and including $num, the placeholder text being
+	 * replaced by the counter. Without it, the placeholder is simply removed
+	 * and the resulting single URL is pushed.
+	 *
+	 * @param array  $param   Parsed contents of the placeholder.
+	 * @param string $replace The placeholder text, braces included.
+	 * @param string $value   URL template that contains $replace.
+	 * @param int    $num     Last counter value to generate.
+	 * @return void
+	 */
+	private function page($param, $replace, $value, $num)
+	{
+		if (!isset($param['page'])) {
+			$this->push(str_replace($replace, '', $value));
+			return;
+		}
+
+		// The counter starts at the value written inside the placeholder.
+		for ($page = $param['page']; $page <= $num; $page++) {
+			$this->push(str_replace($replace, $page, $value));
+		}
+	}
 }

+ 20 - 0
lib/doc/Core.php

@@ -1,6 +1,7 @@
 <?php
 namespace Spider\Lib\Doc;
 use Dever;
+use Spider\Lib\Doc as Doc;
 
 class Core
 {
@@ -8,8 +9,27 @@ class Core
 	{
 		$temp = explode('||', $pattern);
 		$index = isset($temp[1]) ? $temp[1] : 1;
+		$temp[0] = '来源:互联网 时间:(.*?)<a href="#comments">';
 		preg_match_all('/' . $temp[0] . '/i', $string, $match);
 		$result = $match[$index][0];
 		return $result;
 	}
+
+	/**
+	 * Evaluate one multi-line collect rule against the current data.
+	 *
+	 * The rule text is split on newlines (carriage returns are stripped first).
+	 * Its first line selects the source of the value:
+	 *  - the key of another column (and not this column's own key): the URL
+	 *    built by $doc->getUrl() is loaded through Doc::getInstance() using the
+	 *    second line as that document's rule, and the first line is dropped so
+	 *    the remaining lines shift up by one;
+	 *  - an existing key of $data: that entry is used as is;
+	 *  - anything else: the line is passed to Dom::find().
+	 * If a second line remains afterwards it is applied with self::match().
+	 *
+	 * @param object $doc  Document object providing getUrl().
+	 * @param mixed  $data Current item; only read through isset()/index access
+	 *                     here or forwarded to getUrl() and Dom::find().
+	 * @param array  $col  Column definitions indexed by column key.
+	 * @param string $rule Raw rule text, one instruction per line.
+	 * @param string $key  Key of the column being resolved.
+	 * @return mixed The resolved value, or null when the first rule line is empty.
+	 */
+	public static function rule($doc, $data, $col, $rule, $key)
+	{
+		$rule = explode("\n", str_replace("\r", '', $rule));
+
+		// Defined up front so an empty first line no longer returns an undefined variable.
+		$result = null;
+
+		if (isset($rule[0]) && $rule[0]) {
+			if (isset($col[$rule[0]]) && $rule[0] != $key) {
+				// A linked column may come without a follow-up rule line; fall back
+				// to the empty rule that Doc::getInstance() uses by default.
+				$next = isset($rule[1]) ? $rule[1] : '';
+				$result = Doc::getInstance($doc->getUrl($data, $col, $col[$rule[0]]), $next)->get();
+				array_shift($rule);
+			} elseif (isset($data[$rule[0]])) {
+				$result = $data[$rule[0]];
+			} else {
+				$result = Dom::find($data, $rule[0]);
+			}
+		}
+
+		if (isset($rule[1]) && $rule[1]) {
+			$result = self::match($rule[1], $result);
+		}
+
+		return $result;
+	}
 }

+ 38 - 23
lib/doc/Dom.php

@@ -2,38 +2,53 @@
 namespace Spider\Lib\Doc;
 include(DEVER_APP_PATH . 'third/phpQuery.php');
 use Dever;
-use Spider\Lib\Doc as Doc;
 use phpQuery;
 
-class Dom extends Core
+class Dom
 {
 	public static function init($html)
-    {
-        return phpQuery::newDocumentHTML($html);
-    }
+	{
+		return phpQuery::newDocumentHTML($html);
+	}
 
 	public static function find($dom, $rule)
-    {
-        $rule = str_replace(array('$', ').'), array('$dom->find', ')->'), $rule);
+	{
+		list($rule, $attr) = self::each($rule);
+		$rule = str_replace(array('$', ').', '$dom->find.'), array('$dom->find', ')->', '$dom->'), $rule);
 		$cmd = '$dom = ' . $rule . ';';
 		eval($cmd);
-		return $dom;
-    }
+		return self::findAttr($dom, $attr);
+	}
 
-	public static function rule($doc, $dom, $col, $rule)
-    {
-    	$dom = pq($dom);
-        $result = $dom->html();
-		$rule = explode("\n", $rule);
-		if (isset($rule[0]) && $rule[0]) {
-			if (isset($col[$rule[0]])) {
-				$url = $doc->getUrl($dom, $col, $col[$rule[0]]);
-				$doc = Doc::getInstance($url, '')->get();
-				array_shift($rule);
-			}
-			$result = self::find($doc, $rule[0]);
+	/**
+	 * Split a rule at its ".each()" marker.
+	 *
+	 * Declared static because find() invokes it as self::each(); calling a
+	 * non-static method that way is an Error on PHP 8.
+	 *
+	 * When the rule contains ".each()." the part before the marker becomes the
+	 * selector rule and the part after it, prefixed with "$", becomes the rule
+	 * applied to every matched item. Otherwise the rule is returned untouched
+	 * together with an empty per-item rule.
+	 *
+	 * @param string $rule Rule text as written by the user.
+	 * @return array Two elements: the selector rule and the per-item rule ('' if none).
+	 */
+	public static function each($rule)
+	{
+		$attr = '';
+		if (strpos($rule, '.each().') !== false) {
+			$temp = explode('.each()', $rule);
+			$rule = $temp[0];
+			$attr = '$' . $temp[1];
+		}
+		return array($rule, $attr);
+	}
+
+	/**
+	 * Apply a per-item rule to every element of a result set.
+	 *
+	 * Declared static because find() invokes it as self::findAttr() and it
+	 * calls self::find() itself; a non-static declaration makes that static
+	 * call an Error on PHP 8.
+	 *
+	 * @param mixed  $dom  Result of the selector rule; iterated with foreach
+	 *                     when a per-item rule is given.
+	 * @param string $attr Per-item rule, or '' when there is none.
+	 * @return mixed $dom unchanged when $attr is empty; otherwise the
+	 *               json_encode() output for the list of per-item results.
+	 */
+	public static function findAttr($dom, $attr)
+	{
+		if (!$attr) {
+			return $dom;
+		}
+		$data = array();
+		foreach ($dom as $k => $v) {
+			// Each element is wrapped with pq() before the per-item rule runs on it.
+			$data[] = self::find(pq($v), $attr);
+		}
+		return json_encode($data, JSON_UNESCAPED_UNICODE);
+	}
+
+	public static function rule($doc, $dom, $col, $rule, $key)
+	{
+		$dom = pq($dom);
+		$result = $dom->html();
+
+		$result = Core::rule($doc, $dom, $col, $rule, $key);
 		return $result;
-    }
+	}
 }

+ 15 - 12
lib/doc/Json.php

@@ -1,22 +1,25 @@
 <?php
 namespace Spider\Lib\Doc;
 use Dever;
-use Spider\Lib\Doc as Doc;
 
-class Dom extends Core
+class Json
 {
 	public static function init($html)
-    {
-        return json_decode($html, true);
-    }
+	{
+		return json_decode($html, true);
+	}
 
 	public static function find($data, $rule)
-    {
-        print_r($data);die;
-    }
+	{
+		$rule = str_replace('$json', '$data', $rule);
+		$cmd = '$data = ' . $rule . ';';
+		eval($cmd);
+		return $data;
+	}
 
-	public static function rule($doc, $dom, $col, $rule)
-    {
-        print_r($data);die;
-    }
+	public static function rule($doc, $data, $col, $rule, $key)
+	{
+		$result = Core::rule($doc, $data, $col, $rule, $key);
+		return $result;
+	}
 }

+ 18 - 18
src/Cate.php

@@ -6,25 +6,25 @@ use Dever;
 
 class Cate
 {
-    /**
-     * 获取栏目列表
-     *
-     * @return mixed
-     */
-    public function get()
-    {
-        $data = Dever::load('spider/cate-main');
+	/**
+	 * 获取栏目列表
+	 *
+	 * @return mixed
+	 */
+	public function get()
+	{
+		$data = Dever::load('spider/cate-main');
 
-        if ($data) {
-            $child = Dever::load('spider/cate-child');
+		if ($data) {
+			$child = Dever::load('spider/cate-child');
 
-            foreach ($data as $k => $v) {
-                if (isset($child[$k])) {
-                    $data[$k]['child'] = $child[$k];
-                }
-            }
-        }
+			foreach ($data as $k => $v) {
+				if (isset($child[$k])) {
+					$data[$k]['child'] = $child[$k];
+				}
+			}
+		}
 
-        return $data;
-    }
+		return $data;
+	}
 }

+ 39 - 39
src/Collect.php

@@ -575,50 +575,50 @@ class Collect
 	{
 		global $zbp;
 		//http开头验证
-        if(strpos($file, "http") !== 0)
-        {
-            return $file;
-        }
-        
+		if(strpos($file, "http") !== 0)
+		{
+			return $file;
+		}
+		
 		//获取请求头并检测死链
-        $heads = get_headers($file);
-        if(!(stristr($heads[0], "200") && stristr($heads[0], "OK")))
-        {
-            return $file;
-        }
-        
-        
-        //格式验证(扩展名验证和Content-Type验证)
-        $fileType = strtolower(strrchr($file, '.'));
-        if(stristr($heads['Content-Type'], "image"))
-        {
-            return $file;
-        }
-
-        //打开输出缓冲区并获取远程图片
-        ob_start();
-        $context = stream_context_create
-        (
-            array('http' => array
-            (
-                'follow_location' => false // don't follow redirects
-            ))
-        );
-        readfile($file, false, $context);
-        $img = ob_get_contents();
-        ob_end_clean();
-        preg_match("/[\/]([^\/]*)[\.]?[^\.\/]*$/", $file, $m);
-        
-        $filename = $m ? $m[1] : "";
-        $filesize = strlen($img);
-        $filetype = strtolower(strrchr($filename, '.'));
+		$heads = get_headers($file);
+		if(!(stristr($heads[0], "200") && stristr($heads[0], "OK")))
+		{
+			return $file;
+		}
+		
+		
+		//格式验证(扩展名验证和Content-Type验证)
+		$fileType = strtolower(strrchr($file, '.'));
+		if(stristr($heads['Content-Type'], "image"))
+		{
+			return $file;
+		}
+
+		//打开输出缓冲区并获取远程图片
+		ob_start();
+		$context = stream_context_create
+		(
+			array('http' => array
+			(
+				'follow_location' => false // don't follow redirects
+			))
+		);
+		readfile($file, false, $context);
+		$img = ob_get_contents();
+		ob_end_clean();
+		preg_match("/[\/]([^\/]*)[\.]?[^\.\/]*$/", $file, $m);
+		
+		$filename = $m ? $m[1] : "";
+		$filesize = strlen($img);
+		$filetype = strtolower(strrchr($filename, '.'));
 
 		# 这里要修改upload类 算了
-        $root = Dever::path($zbp->usersdir . 'upload/collect/');
-        $id = ceil($id/1000);
+		$root = Dever::path($zbp->usersdir . 'upload/collect/');
+		$id = ceil($id/1000);
 		$filepath = Dever::path($root . $id . '/');
 		$filepath = 'upload/collect/' . $id . '/';
-        $filename = md5($file) . $filetype;
+		$filename = md5($file) . $filetype;
 
 		$upload = new Upload;
 		$upload->Dir = $filepath;

+ 24 - 24
src/Data.php

@@ -7,28 +7,28 @@ use Spider\Lib\Log;
 
 class Data
 {
-    public function value($id)
-    {
-        $data = Dever::load('spider/data-one', $id);
-        $data['value'] = Dever::table(json_decode($data['value'], true));
-
-        return $data['value'];
-    }
-
-    public function get($project_id = 1)
-    {
-        $data = Dever::load('spider/data-all', array('option_project_id' => $project_id));
-
-        return $data;
-    }
-
-    public function log($project_id = 1)
-    {
-        if (is_numeric($project_id)) {
-            $log = new Log($project_id);
-            $content = explode("\r\n", $log->get());
-
-            return $content;
-        }
-    }
+	public function value($id)
+	{
+		$data = Dever::load('spider/data-one', $id);
+		$data['value'] = Dever::table(json_decode($data['value'], true));
+
+		return $data['value'];
+	}
+
+	public function get($project_id = 1)
+	{
+		$data = Dever::load('spider/data-all', array('option_project_id' => $project_id));
+
+		return $data;
+	}
+
+	public function log($project_id = 1)
+	{
+		if (is_numeric($project_id)) {
+			$log = new Log($project_id);
+			$content = explode("\r\n", $log->get());
+
+			return $content;
+		}
+	}
 }