rabin 7 years ago
parent
commit
065d438e8c
10 changed files with 311 additions and 136 deletions
  1. 3 2
      database/data.php
  2. 5 4
      database/project.php
  3. 144 0
      lib/Doc.php
  4. 0 41
      lib/Dom.php
  5. 52 0
      lib/Log.php
  6. 20 89
      lib/Parse.php
  7. 15 0
      lib/doc/Core.php
  8. 39 0
      lib/doc/Dom.php
  9. 22 0
      lib/doc/Json.php
  10. 11 0
      src/Data.php

+ 3 - 2
database/data.php

@@ -14,6 +14,7 @@ return array
 	'lang' => '数据管理',
 	# 后台菜单排序
 	'order' => 6,
+	'menu'	=> false,
 	# 数据结构
 	'struct' => array
 	(
@@ -51,8 +52,8 @@ return array
 			'match' 	=> 'is_string',
 			//'update'	=> 'editor',
 			'search'	=> 'fulltext',
-			'list'		=> 'Dever::load("spider/data.value", {id})',
-			'modal'		=> '查看详情',
+			//'list'		=> 'Dever::load("spider/data.value", {id})',
+			//'modal'		=> '查看详情',
 		),
 
 		'log_id'		=> array

+ 5 - 4
database/project.php

@@ -54,6 +54,7 @@ return array
 	'lang' => '项目管理',
 	# 后台菜单排序
 	'order' => 20,
+	//'desc' => 'API:http://192.168.15.10/plant/spider/??project_id=1',
 	# 数据结构
 	'struct' => array
 	(
@@ -128,7 +129,7 @@ return array
 		'collect_rule'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集规则-支持dom解析、正则',
+			'name' 		=> '采集规则-支持dom解析、正则、json格式',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
 			'match' 	=> 'is_string',
@@ -191,7 +192,7 @@ return array
 			'callback'	=> 'maketime',
 		),
 
-		'intervals'		=> array
+		'interval'		=> array
 		(
 			'type' 		=> 'int-11',
 			'name' 		=> '抓取间隔秒数-填写开始时间之后的间隔抓取的秒数,为0则只抓取一次',
@@ -255,8 +256,8 @@ return array
 		# 可以删除
 		'list_button' => array
 		(
-			'list_col' => array('字段设置', '"col&search_option_project_id={id}"'),
-			'list_data' => array('数据列表', '"data&search_option_project_id={id}&oper_save_jump=project&oper_parent=project"'),
+			'list_col' => array('字段设置', '"col&search_option_project_id={id}&oper_parent=project"'),
+			//'list_data' => array('数据列表', '"data&search_option_project_id={id}&oper_save_jump=project&oper_parent=project"'),
 			//'delete' => array('采集数据', 'Dever::url("spider/data.daemon?id={id}&")'),
 		),
 		

+ 144 - 0
lib/Doc.php

@@ -0,0 +1,144 @@
+<?php
+namespace Spider\Lib;
+use Dever;
+
+/**
+ * Fetches a single URL and exposes the response as a parsed document.
+ *
+ * The parser backend is chosen from the rule text: a rule containing
+ * "$json" selects Spider\Lib\Doc\Json, anything else Spider\Lib\Doc\Dom.
+ * A Log instance may be attached with log() to trace each step.
+ */
+class Doc
+{
+	/** @var string URL this document was created for. */
+	private $url;
+	/** @var string scheme://host[:port] of $url; '' when the URL cannot be parsed. */
+	private $host;
+	/** @var string Top-level rule that get() applies to the document. */
+	private $rule;
+	/** @var string Parser type, 'dom' or 'json'; set by doc(). */
+	public $type = 'dom';
+	/** @var Log|false Attached logger, false while none is attached. */
+	private $log = false;
+	/** @var Doc[] Instances cached by getInstance(), keyed by URL + md5(rule). */
+	static protected $instance = array();
+
+	/**
+	 * Returns the cached instance for this URL/rule pair, creating it on first use.
+	 *
+	 * @param string $url
+	 * @param string $rule
+	 * @return Doc
+	 */
+	static public function getInstance($url, $rule = '')
+	{
+		$key = $url . md5($rule);
+		if (empty(self::$instance[$key])) {
+			self::$instance[$key] = new self($url, $rule);
+		}
+
+		return self::$instance[$key];
+	}
+
+	/**
+	 * @param string $url  Page to download.
+	 * @param string $rule Rule applied by get(); empty means "whole document".
+	 */
+	public function __construct($url, $rule = '')
+	{
+		$this->url($url);
+		$this->rule = $rule;
+	}
+
+	/**
+	 * Stores the URL and derives the host prefix used for root-relative links.
+	 *
+	 * @param string $url
+	 */
+	private function url($url)
+	{
+		$this->url = $url;
+		$this->host = '';
+		$value = parse_url($this->url);
+		// parse_url() returns false for malformed URLs and omits absent parts.
+		if (is_array($value) && isset($value['scheme'], $value['host'])) {
+			$this->host = $value['scheme'] . '://' . $value['host'];
+			if (isset($value['port'])) {
+				$this->host .= ':' . $value['port'];
+			}
+		}
+	}
+
+	/**
+	 * Downloads and parses the page, then applies the top-level rule if one was given.
+	 *
+	 * @return mixed Whatever the parser backend's init()/find() returns.
+	 */
+	public function get()
+	{
+		$doc = $this->doc();
+		if ($this->rule) {
+			$doc = $this->find($doc, $this->rule);
+		}
+		return $doc;
+	}
+
+	/**
+	 * Downloads the page, picks the parser type from the rule and parses the body.
+	 *
+	 * @return mixed Parsed document as produced by the backend's init().
+	 */
+	private function doc()
+	{
+		$html = $this->download($this->url);
+		if (strpos($this->rule, '$json') !== false) {
+			$this->type = 'json';
+		} else {
+			$this->type = 'dom';
+		}
+		return ($this->getClass())::init($html);
+	}
+
+	/**
+	 * Fetches the raw body of $url through the Download helper.
+	 *
+	 * @param string $url
+	 * @return mixed Whatever Download::get() returns.
+	 */
+	private function download($url)
+	{
+		$this->addLog($url . '下载中...');
+		$download = new Download($url);
+		$html = $download->get();
+		// Log completion only once the body has actually been retrieved.
+		$this->addLog($url . '下载完成');
+		return $html;
+	}
+
+	/**
+	 * Applies the include / exclude / filter settings of a column to a value.
+	 *
+	 * @param string $data    Extracted value.
+	 * @param string $include Substring that must be present, or empty.
+	 * @param string $exclude Substring that must be absent, or empty.
+	 * @param string $filter  Regex body (no delimiters) removed case-insensitively, or empty.
+	 * @return string The filtered value, or the literal 'error' when a check fails.
+	 */
+	private function collect($data, $include, $exclude, $filter)
+	{
+		if ($include && strpos($data, $include) === false) {
+			return 'error';
+		}
+		if ($exclude && strpos($data, $exclude) !== false) {
+			return 'error';
+		}
+		if ($filter) {
+			$data = preg_replace('/' . $filter . '/i', '', $data);
+		}
+
+		return $data;
+	}
+
+	/**
+	 * @return string Fully qualified class name of the backend for the current type.
+	 */
+	private function getClass()
+	{
+		return 'Spider\\Lib\\Doc\\' . ucfirst($this->type);
+	}
+
+	/**
+	 * Runs a selection rule against a parsed document using the current backend.
+	 *
+	 * @param mixed  $doc
+	 * @param string $rule
+	 * @return mixed
+	 */
+	public function find($doc, $rule)
+	{
+		return ($this->getClass())::find($doc, $rule);
+	}
+
+	/**
+	 * Extracts one column value from a row and applies the column's filters.
+	 *
+	 * @param mixed $data   Row element handed to the backend.
+	 * @param array $col    All column configs, passed through to the backend.
+	 * @param array $config Column config; reads name, key, collect_rule,
+	 *                      collect_include, collect_exclude and collect_filter.
+	 * @return string Extracted value, or 'error' when include/exclude rejects it.
+	 */
+	public function rule($data, $col, $config)
+	{
+		$name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
+		$this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');
+
+		$data = ($this->getClass())::rule($this, $data, $col, $config['collect_rule']);
+
+		$this->addLog($name . '解析完成');
+
+		return $this->collect($data, $config['collect_include'], $config['collect_exclude'], $config['collect_filter']);
+	}
+
+	/**
+	 * Extracts a column value and turns it into an absolute URL.
+	 *
+	 * Absolute http(s) URLs are returned unchanged, protocol-relative ones get
+	 * this document's scheme, root-relative ones get the host prefix, and
+	 * anything else is appended to this document's URL.
+	 *
+	 * NOTE(review): the 'error' marker from rule() is not special-cased here
+	 * and is resolved like any other relative value — confirm that is intended.
+	 *
+	 * @param mixed $data
+	 * @param array $col
+	 * @param array $config
+	 * @return string
+	 */
+	public function getUrl($data, $col, $config)
+	{
+		$url = (string) $this->rule($data, $col, $config);
+		// Only a leading scheme makes a URL absolute; "http" later in the string does not.
+		if (preg_match('#^https?://#i', $url)) {
+			return $url;
+		}
+		if (strncmp($url, '//', 2) === 0) {
+			$scheme = parse_url($this->url, PHP_URL_SCHEME);
+			return ($scheme ? $scheme : 'http') . ':' . $url;
+		}
+		// substr() is safe on an empty string, unlike $url[0].
+		if (substr($url, 0, 1) === '/') {
+			return $this->host . $url;
+		}
+		return $this->url . $url;
+	}
+
+	/**
+	 * Adds a line to the attached log; does nothing when no log is attached.
+	 *
+	 * @param string $string
+	 */
+	public function addLog($string)
+	{
+		if ($this->log) {
+			$this->log->add($string);
+		}
+	}
+
+	/**
+	 * Persists the attached log, if any.
+	 */
+	public function saveLog()
+	{
+		if ($this->log) {
+			$this->log->save();
+		}
+	}
+
+	/**
+	 * Prints the attached log, if any.
+	 */
+	public function outLog()
+	{
+		if ($this->log) {
+			$this->log->out();
+		}
+	}
+
+	/**
+	 * Attaches the logger used by addLog(), saveLog() and outLog().
+	 *
+	 * @param Log $log
+	 */
+	public function log(Log $log)
+	{
+		$this->log = $log;
+	}
+}

+ 0 - 41
lib/Dom.php

@@ -1,41 +0,0 @@
-<?php
-namespace Spider\Lib;
-include(DEVER_APP_PATH . 'third/phpQuery.php');
-use Dever;
-use phpQuery;
-
-class Dom
-{
-	private $query;
-
-	public function __construct($url, $rule = '')
-	{
-		$html = $this->download($url);
-		$dom = phpQuery::newDocumentHTML($html);
-		if ($rule) {
-			$dom = $this->find($dom, $rule);
-		}
-		
-		$this->query = $dom;
-	}
-
-
-	private function download($url)
-	{
-		$download = new Download($url);
-		return $download->get();
-	}
-
-	public function get()
-	{
-		return $this->query;
-	}
-
-	public function find($dom, $rule)
-	{
-		$rule = str_replace('$', '$dom->find', $rule);
-		$cmd = '$dom = ' . $rule . ';';
-		eval($cmd);
-		return $dom;
-	}
-}

+ 52 - 0
lib/Log.php

@@ -0,0 +1,52 @@
+<?php
+namespace Spider\Lib;
+use Dever;
+
+/**
+ * Buffers spider log entries in memory and appends them to a per-day file.
+ */
+class Log
+{
+	/** @var float|int Timestamp of the first add() call; 0 until then. */
+	private $time = 0;
+	/** @var string Log file path built in the constructor. */
+	private $file;
+	/** @var string[] Formatted entries waiting to be saved. */
+	private $content = array();
+
+	/**
+	 * @param string|int $key Suffix of the log file name; the current date is
+	 *                        prepended so each key gets one file per day.
+	 */
+	public function __construct($key)
+	{
+		$key = date('Y-m-d') . '_' . $key;
+		$this->file = Dever::path(Dever::data() . 'log/', 'spider/' . $key);
+	}
+
+	/**
+	 * Reads today's log file.
+	 *
+	 * @return string File contents, or '' when the file is missing or unreadable.
+	 */
+	public function get()
+	{
+		if (!is_file($this->file)) {
+			return '';
+		}
+		$content = file_get_contents($this->file);
+		return $content === false ? '' : $content;
+	}
+
+	/**
+	 * Buffers one entry with a timestamp and the time elapsed since the first entry.
+	 *
+	 * @param string $string Message; line feeds are replaced by '<--' so each
+	 *                       entry stays on a single line.
+	 */
+	public function add($string)
+	{
+		// microtime() so the value really is milliseconds, as the label says.
+		$now = microtime(true);
+		if ($this->time == 0) {
+			$hs = 0;
+			$this->time = $now;
+		} else {
+			$hs = (int) round(($now - $this->time) * 1000);
+		}
+		$time = date('Y-m-d H:i:s', (int) $now);
+		$content = array
+		(
+			'时间:' . $time,
+			'耗时:' . $hs . 'MS',
+			'内容:' . str_replace("\n", '<--', $string),
+		);
+		$this->content[] = implode(' |-| ', $content);
+	}
+
+	/**
+	 * Prints the buffered entries.
+	 */
+	public function out()
+	{
+		print_r($this->content);
+	}
+
+	/**
+	 * Appends the buffered entries to the log file, one entry per CRLF-separated line.
+	 */
+	public function save()
+	{
+		if (!$this->content) {
+			return;
+		}
+		$content = implode("\r\n", $this->content);
+		// Separate this batch from what is already in the file; otherwise its
+		// first entry would be glued onto the previous last line.
+		clearstatcache(true, $this->file);
+		if (is_file($this->file) && filesize($this->file) > 0) {
+			$content = "\r\n" . $content;
+		}
+		file_put_contents($this->file, $content, FILE_APPEND);
+	}
+}

+ 20 - 89
lib/Parse.php

@@ -6,16 +6,20 @@ class Parse
 {
 	private $url = '';
 	private $host = '';
-	private $dom = array();
+	private $log;
+	private $doc = array();
 	private $data = array();
 
 	public function __construct($url, $project, $rule, $col)
 	{
-		$this->url($url);
-		$dom = $this->dom($rule);
-		foreach ($dom as $k => $v) {
-			$this->handle(pq($v), $k, $col, $project);
+		$doc = Doc::getInstance($url, $rule);
+		$doc->log(new Log($project));
+		$data = $doc->get();
+		foreach ($data as $k => $v) {
+			print_r($data);die;
+			$this->data[$k] = $this->load($doc, $v, $col);
 		}
+		$doc->saveLog();
 	}
 
 	public function get()
@@ -23,34 +27,17 @@ class Parse
 		return $this->data;
 	}
 
-	private function url($url)
+	private function load($doc, $data, $col)
 	{
-		$this->url = $url;
-		$value = parse_url($this->url);
-		$this->host = $value['scheme'] . '://' . $value['host'];
-	}
-
-	private function dom($rule, $url = '')
-	{
-		$url = $url ? $url : $this->url;
-		if (empty($this->dom[$url])) {
-			$dom = new Dom($url, $rule);
-			$this->dom[$url] = $dom->get();
-		}
-		
-		return $this->dom[$url];
-	}
-
-	private function handle($dom, $index, $col, $project)
-	{
-		foreach ($col as $v) {
+		foreach ($data as $v) {
 			$callback = false;
 			if (strpos($v['key'], '.') !== false) {
 				$temp = explode('.', $v['key']);
 				$v['key'] = $temp[1];
 				$callback = $temp[0];
 			}
-			$value = $this->load($dom, $col, $v);
+
+			$value = $doc->rule($data, $col, $v);
 			if ($value == 'error') {
 				break;
 			}
@@ -58,9 +45,14 @@ class Parse
 				$value = Dever::{$callback}($value);
 			}
 
-			$this->data[$index][$v['key']] = $value;
+			$data[$v['key']] = $value;
+		}
+		if (Dever::input('test') == 1) {
+			$doc->outLog();
+			print_r($data);die;
 		}
-		$this->update($this->data[$index], $project);
+		$this->update($data, $project);
+		return $data;
 	}
 
 	private function update($data, $project)
@@ -85,65 +77,4 @@ class Parse
 			$id = Dever::db('spider/data')->insert($update);
 		}
 	}
-
-	private function load($dom, $col, $config)
-	{
-		$data = $this->rule($dom, $col, $config['collect_rule'], $config['collect_include'], $config['collect_exclude']);
-		if ($config['collect_include'] && strpos($data, $config['collect_include']) === false) {
-			return 'error';
-		}
-		if ($config['collect_exclude'] && strpos($data, $config['collect_exclude']) !== false) {
-			return 'error';
-		}
-		if ($config['collect_filter']) {
-			$data = preg_replace('/' . $config['collect_filter'] . '/i', '', $data);
-		}
-		return $data;
-	}
-
-	private function rule($dom, $col, $rule, $include, $exclude)
-	{
-		$result = $dom->html();
-		$rule = explode("\n", $rule);
-		if (isset($rule[0]) && $rule[0]) {
-			if (isset($col[$rule[0]])) {
-				$url = $this->getUrl($dom, $col, $col[$rule[0]]);
-				$dom = $this->dom('', $url);
-				array_shift($rule);
-			}
-			$result = $this->find($dom, $rule[0], $result);
-		}
-		if (isset($rule[1]) && $rule[1]) $result = $this->match($rule[1], $result);
-		return $result;
-	}
-
-	private function find($dom, $string, $result)
-	{
-		$string = str_replace(array('$', ').'), array('$dom->find', ')->'), $string);
-		$cmd = '$result = ' . $string . ';';
-		eval($cmd);
-		return $result;
-	}
-
-	private function getUrl($dom, $col, $config)
-	{
-		$url = $this->load($dom, $col, $config);
-		if (strpos($url, 'http') === false) {
-			if ($url[0] == '/') {
-				$url = $this->host . $url;
-			} else {
-				$url = $this->url . $url;
-			}
-		}
-		return $url;
-	}
-
-	private function match($pattern, $string)
-	{
-		$temp = explode('||', $pattern);
-		$index = isset($temp[1]) ? $temp[1] : 1;
-		preg_match_all('/' . $temp[0] . '/i', $string, $match);
-		$result = $match[$index][0];
-		return $result;
-	}
 }

+ 15 - 0
lib/doc/Core.php

@@ -0,0 +1,15 @@
+<?php
+namespace Spider\Lib\Doc;
+use Dever;
+
+/**
+ * Helpers shared by the document parser backends.
+ */
+class Core
+{
+	/**
+	 * Returns the first match of a capture group.
+	 *
+	 * @param string $pattern Regex body without delimiters, matched
+	 *                        case-insensitively. An optional "||N" suffix
+	 *                        selects capture group N; the default is group 1.
+	 * @param string $string  Text to search.
+	 * @return string|null First match of the selected group, or null when the
+	 *                     pattern does not match or the group does not exist.
+	 */
+	public static function match($pattern, $string)
+	{
+		$temp = explode('||', $pattern);
+		$index = isset($temp[1]) ? $temp[1] : 1;
+		// preg_match_all() returns 0 for no match and false on error.
+		if (!preg_match_all('/' . $temp[0] . '/i', (string) $string, $match)) {
+			return null;
+		}
+		return isset($match[$index][0]) ? $match[$index][0] : null;
+	}
+}

+ 39 - 0
lib/doc/Dom.php

@@ -0,0 +1,39 @@
+<?php
+namespace Spider\Lib\Doc;
+include(DEVER_APP_PATH . 'third/phpQuery.php');
+use Dever;
+use Spider\Lib\Doc as Doc;
+use phpQuery;
+
+/**
+ * HTML parser backend built on phpQuery.
+ */
+class Dom extends Core
+{
+	/**
+	 * Parses an HTML string into a phpQuery document.
+	 *
+	 * @param string $html
+	 * @return mixed Result of phpQuery::newDocumentHTML().
+	 */
+	public static function init($html)
+	{
+		return phpQuery::newDocumentHTML($html);
+	}
+
+	/**
+	 * Runs a selector expression against $dom.
+	 *
+	 * Every '$' in the rule becomes '$dom->find' and every ').' becomes ')->',
+	 * and the resulting PHP expression is evaluated.
+	 *
+	 * NOTE(review): the rule is executed with eval(); it must only ever come
+	 * from trusted configuration, never from external input.
+	 *
+	 * @param mixed  $dom  Document or element to search in.
+	 * @param string $rule Selector expression, e.g. a chain starting with $('...').
+	 * @return mixed Value of the evaluated expression.
+	 */
+	public static function find($dom, $rule)
+	{
+		$rule = str_replace(array('$', ').'), array('$dom->find', ')->'), $rule);
+		$cmd = '$dom = ' . $rule . ';';
+		eval($cmd);
+		return $dom;
+	}
+
+	/**
+	 * Extracts one column value from a row element.
+	 *
+	 * The rule is split on line feeds. If its first line is the key of another
+	 * column, that column is resolved to a URL and the rest of the rule is
+	 * applied to the page behind it. The next line is a selector expression
+	 * for find(); an optional line after that is a regex for Core::match().
+	 *
+	 * @param Doc    $doc  Document wrapper, used to resolve linked columns.
+	 * @param mixed  $dom  Row element.
+	 * @param array  $col  Column configs indexed by the names a rule may reference.
+	 * @param string $rule Column rule text.
+	 * @return mixed Extracted value; the element's HTML when the rule is empty.
+	 */
+	public static function rule($doc, $dom, $col, $rule)
+	{
+		$dom = pq($dom);
+		$result = $dom->html();
+		$rule = explode("\n", $rule);
+		if (isset($rule[0]) && $rule[0]) {
+			if (isset($col[$rule[0]])) {
+				$url = $doc->getUrl($dom, $col, $col[$rule[0]]);
+				// Search the linked page from here on; $doc stays the wrapper.
+				$dom = Doc::getInstance($url, '')->get();
+				$result = $dom->html();
+				array_shift($rule);
+			}
+			// The selector must run on the element, and never on an empty
+			// rule, which would make eval() fail.
+			if (isset($rule[0]) && $rule[0]) {
+				$result = self::find($dom, $rule[0]);
+			}
+		}
+		if (isset($rule[1]) && $rule[1]) $result = parent::match($rule[1], $result);
+		return $result;
+	}
+}

+ 22 - 0
lib/doc/Json.php

@@ -0,0 +1,22 @@
+<?php
+namespace Spider\Lib\Doc;
+use Dever;
+use Spider\Lib\Doc as Doc;
+
+/**
+ * JSON parser backend.
+ *
+ * Named Json so that it does not redeclare the Dom backend of this namespace
+ * and can be resolved by type name.
+ *
+ * NOTE(review): find() and rule() are still debugging placeholders that dump
+ * their input and stop the script; JSON rule handling is not implemented yet.
+ */
+class Json extends Core
+{
+	/**
+	 * Decodes a JSON string.
+	 *
+	 * @param string $html Raw response body.
+	 * @return mixed Decoded value with objects as associative arrays; null
+	 *               when the body is not valid JSON.
+	 */
+	public static function init($html)
+	{
+		return json_decode($html, true);
+	}
+
+	/**
+	 * Placeholder: dumps the decoded data and terminates the script.
+	 *
+	 * @param mixed  $data Decoded JSON.
+	 * @param string $rule Unused for now.
+	 */
+	public static function find($data, $rule)
+	{
+		print_r($data);die;
+	}
+
+	/**
+	 * Placeholder: dumps the row data and terminates the script.
+	 *
+	 * @param mixed  $doc  Unused for now.
+	 * @param mixed  $dom  Row data.
+	 * @param array  $col  Unused for now.
+	 * @param string $rule Unused for now.
+	 */
+	public static function rule($doc, $dom, $col, $rule)
+	{
+		// Dump the row that was passed in; no $data variable exists in this scope.
+		print_r($dom);die;
+	}
+}

+ 11 - 0
src/Data.php

@@ -3,6 +3,7 @@
 namespace Spider\Src;
 
 use Dever;
+use Spider\Lib\Log;
 
 class Data
 {
@@ -20,4 +21,14 @@ class Data
 
         return $data;
     }
+
+    /**
+     * Returns today's spider log for a project, one entry per array element.
+     *
+     * @param int|string $project_id Project identifier. Only numeric values
+     *                               are accepted because the value becomes
+     *                               part of the log file name.
+     * @return array Log entries; empty when the id is not numeric or nothing
+     *               has been logged.
+     */
+    public function log($project_id = 1)
+    {
+        if (!is_numeric($project_id)) {
+            return array();
+        }
+
+        $log = new Log($project_id);
+        $content = $log->get();
+        // A missing or empty log must not turn into a single blank entry.
+        if ($content === false || $content === '') {
+            return array();
+        }
+
+        return explode("\r\n", $content);
+    }
 }