dever 4 年 前
コミット
3205313125
6 ファイル変更、154 行追加、48 行削除
  1. 63 1
      database/project.php
  2. 27 21
      lib/Api.php
  3. 7 7
      lib/Doc.php
  4. 35 15
      lib/Download.php
  5. 2 2
      lib/Parse.php
  6. 20 2
      lib/doc/Core.php

+ 63 - 1
database/project.php

@@ -16,6 +16,18 @@ $status = array
 	4 => '运行中',
 );
 
+$request_type = array
+(
+	1 => 'get',
+	2 => 'post',
+);
+
+$content_type = array
+(
+	1 => '普通表单',
+	2 => 'JSON格式',
+);
+
 $project = function()
 {
 	$array = array
@@ -105,10 +117,36 @@ return array
 			//'edit'		=> 'textarea',
 		),
 
+		'request_type'		=> array
+		(
+			'type' 		=> 'tinyint-1',
+			'name' 		=> '请求方式',
+			'default' 	=> '1',
+			'desc' 		=> '请求方式',
+			'match' 	=> 'is_numeric',
+			'option' 	=> $request_type,
+			'list'		=> true,
+			'update'	=> $id < 0 ? 'hidden': 'radio',
+			//'edit'		=> true,
+		),
+
+		'content_type'		=> array
+		(
+			'type' 		=> 'tinyint-1',
+			'name' 		=> '请求媒体类型',
+			'default' 	=> '1',
+			'desc' 		=> '请求媒体类型',
+			'match' 	=> 'is_numeric',
+			'option' 	=> $content_type,
+			'list'		=> true,
+			'update'	=> $id < 0 ? 'hidden': 'radio',
+			//'edit'		=> true,
+		),
+
 		'collect_rule'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集规则-为空则获取整个页面的内容,支持dom解析、json格式,如$(".info .title a").each().attr("href")',
+			'name' 		=> '采集规则-为空则获取整个页面的内容,支持dom解析、json格式,dom解析$(".info .title a").each().attr("href"),json格式$json[\'data\']',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
 			'match' 	=> 'option',
@@ -179,6 +217,30 @@ return array
 			'update'	=> $id < 0 ? 'hidden' : 'text',
 		),
 
+		'header'		=> array
+		(
+			'type' 		=> 'varchar-8000',
+			'name' 		=> 'Header参数-换行为多个参数,格式cookie: 11',
+			'default' 	=> '',
+			'desc' 		=> 'Header参数',
+			'match' 	=> 'option',
+			'update'	=> $id < 0 ? 'hidden': 'textarea',
+			//'edit'		=> true,
+			//'list'		=> true,
+		),
+
+		'param'		=> array
+		(
+			'type' 		=> 'varchar-8000',
+			'name' 		=> '其他参数-一般为post传入的参数,必须是json格式',
+			'default' 	=> '',
+			'desc' 		=> '其他参数',
+			'match' 	=> 'option',
+			'update'	=> $id < 0 ? 'hidden': 'textarea',
+			//'edit'		=> true,
+			//'list'		=> true,
+		),
+
 		'push'		=> array
 		(
 			'type' 		=> 'varchar-2000',

+ 27 - 21
lib/Api.php

@@ -94,11 +94,20 @@ class Api
 		$col = $this->col($config['id']);
 		$set = $this->set($config['id']);
 
-		if (strpos($config['site'], '{') !== false && strpos($config['site'], '}') !== false) {
-			$this->preg($config, $col, $set);
+		$config['curl'] = array
+		(
+			'request_type' => $config['request_type'],
+			'content_type' => $config['content_type'],
+			'header' => $config['header'],
+			'param' => $config['param'],
+		);
+		if (strpos($config['site'], '{page=') !== false) { // {page=N} in the URL: paginate over the URL
+			$this->page($config['site'], 1, $config, $col, $set);
+		} elseif (strpos($config['param'], '{page=') !== false) { // elseif keeps the three branches mutually exclusive, so a paged run never falls into the single-fetch branch
+			$this->page($config['param'], 2, $config, $col, $set);
 		} else {
 			Dever::load('spider/lib/project')->set($config, 4, 1);
-			$this->parse($config['site'], $config['id'], $config['collect_rule'], $col, $set, $config['push']);
+			$this->parse($config['site'], $config['id'], $config['collect_rule'], $config['curl'], $col, $set, $config['push']);
 		}
 		/*
 		Dever::task(function() use($config, $this)
@@ -120,34 +129,31 @@ class Api
 		return Dever::db('spider/set')->getList(['where_pid' => $project]);
 	}
 
-	private function parse($url, $project, $rule, $col, $set, $push)
+	private function parse($url, $project, $rule, $param, $col, $set, $push)
 	{
-		$parse = new Parse($url, $project, $rule, $col, $set, $push);
+		$parse = new Parse($url, $project, $rule, $param, $col, $set, $push);
 		return $parse->get();
 	}
 
-	private function preg($config, $col, $set)
+	private function page($source, $type, $config, $col, $set) // $type 1: $source is the site URL; any other value: $source is the request param string
 	{
-		$pat = '/{(.*?)}/i';
-		preg_match_all($pat, $config['site'], $match);
+		$pat = '/{page=(.*?)}/i';
+		preg_match_all($pat, $source, $match);
 		if (isset($match[1][0]) && $match[1][0]) {
 			if ($config['page_num'] <= 0) $config['page_num'] = 100;
-			parse_str($match[1][0], $param);
-			$this->page($param, $match[0][0], $config, $col, $set);
-		}
-	}
+			// First page number, captured from the first {page=N} placeholder in $source.
+			$page = $match[1][0];

-	private function page($param, $replace, $config, $col, $set)
-	{
-		if (isset($param['page']) && $param['page']) {
-			for ($i = $param['page']; $i <= $config['page_num']; $i++) {
-				$url = str_replace($replace, $i, $config['site']);
+			for ($i = $page; $i <= $config['page_num']; $i++) {
+				$current = str_replace($match[0][0], $i, $source); // leave $source untouched so the placeholder is still present for the next page
 				Dever::load('spider/lib/project')->set($config, 4, $i);
-				$this->parse($url, $config['id'], $config['collect_rule'], $col, $set, $config['push']);
+				if ($type == 1) {
+					$config['site'] = $current;
+				} else {
+					$config['curl']['param'] = $current;
+				}
+				$this->parse($config['site'], $config['id'], $config['collect_rule'], $config['curl'], $col, $set, $config['push']);
 			}
-		} else {
-			Dever::load('spider/lib/project')->set($config, 4, 1);
-			$this->parse(str_replace($replace, '', $value), $config['id'], $config['collect_rule'], $col, $set, $config['push']);
 		}
 	}
 }

+ 7 - 7
lib/Doc.php

@@ -40,29 +40,29 @@ class Doc
 		$this->host = $value['scheme'] . '://' . $value['host'];
 	}
 
-	public function get()
+	public function get($param = array())
 	{
-		$doc = $this->doc();
+		$doc = $this->doc(false, $param);
 		if ($this->rule) {
 			$doc = $this->find($doc, $this->rule);
 		}
 		return $doc;
 	}
 
-	public function doc($url = false)
+	public function doc($url = false, $param = array())
 	{
 		$url = $url ? $url : $this->url;
-		$html = $this->download($url);
+		$html = $this->download($url, $param);
 		return ($this->getClass())::init($html);
 	}
 
 
-	private function download($url)
+	private function download($url, $param = array()) // $param: request settings array, forwarded unchanged to Download
 	{
 		$this->addLog($url . '下载中...');
-		$download = new Download($url);
+		$download = new Download($url, $param); // matches Download::__construct($url, $param); no separate header argument exists
 		$this->addLog($url . '下载完成');
-		return $download->get();
+		return $download->get($this->type);
 	}
 
 	private function collect($data, $include, $exclude, $filter)

+ 35 - 15
lib/Download.php

@@ -6,29 +6,49 @@ class Download
 {
 	private $data;
 
-	public function __construct($url)
+	public function __construct($url, $param)
 	{
-		$this->data = Dever::curl($url);
+		if (isset($param['request_type']) && $param['request_type'] == 2) {
+			$param['request_type'] = 'post';
+		} else {
+			$param['request_type'] = 'get';
+		}
+
+		if (isset($param['content_type']) && $param['content_type'] == 2) {
+			$param['content_type'] = true;
+		} else {
+			$param['content_type'] = false;
+		}
+		if (!isset($param['param'])) {
+			$param['param'] = array();
+		}
+		if (!isset($param['header'])) {
+			$param['header'] = false;
+		}
+		$this->data = Dever::curl($url, $param['param'], $param['request_type'], $param['content_type'], $param['header']);
 	}
 
-	public function get()
+	public function get($type)
 	{
-		return $this->filter($this->data);
+		return $this->filter($this->data, $type);
 	}
 
-	private function filter($string)
+	private function filter($string, $type)
 	{
-		$encode = mb_detect_encoding($string, array('GB2312','GBK','UTF-8'));
-		$config = array('GB2312', 'GBK', 'EUC-CN', 'CP936');
-		if (in_array($encode, $config)) {
-			$string = iconv('GBK', 'UTF-8', $string);
-		}
-		/*
-		if ($encode == 'CP936') {
-			$string = iconv('SJIS', 'UTF-8', $string);
+		if ($type != 'json') {
+			$encode = mb_detect_encoding($string, array('GB2312','GBK','UTF-8'));
+			$config = array('GB2312', 'GBK', 'EUC-CN', 'CP936');
+			if (in_array($encode, $config)) {
+				$string = iconv('GBK', 'UTF-8', $string);
+			}
+			/*
+			if ($encode == 'CP936') {
+				$string = iconv('SJIS', 'UTF-8', $string);
+			}
+			*/
+			$string = str_replace(PHP_EOL, '', $string);
 		}
-		*/
-		$string = str_replace(PHP_EOL, '', $string);
+		
 		return $string;
 	}
 }

+ 2 - 2
lib/Parse.php

@@ -10,11 +10,11 @@ class Parse
 	private $doc = array();
 	private $data = array();
 
-	public function __construct($url, $project, $rule, $col, $set, $push)
+	public function __construct($url, $project, $rule, $param, $col, $set, $push)
 	{
 		$doc = Doc::getInstance($url, $rule);
 		$doc->log(new Log($project));
-		$data = $doc->get();
+		$data = $doc->get($param);
 		if ($data) {
 			if (!is_array($data) && !is_object($data)) {
 				$state = Dever::json_decode($data);

+ 20 - 2
lib/doc/Core.php

@@ -21,8 +21,25 @@ class Core
 	public static function rule($doc, $data, $col, $rule, $key)
 	{
 		$rule = explode("\n", str_replace("\r", '', $rule));
-		print_r($data[$rule[0]]);die;
-		print_r($rule[0]);die;
+		if (isset($rule[0]) && $rule[0]) {
+			if (is_string($data)) {
+				$data = json_decode($data, true);
+			}
+			if (is_object($data)) {
+				$result = Dom::find($data, $rule[0]);
+			} elseif (is_array($data)) {
+				$temp = explode('.', $rule[0]);
+
+				$result = $data;
+				foreach ($temp as $k => $v) {
+					if (isset($result[$v])) {
+						$result = $result[$v];
+					}
+				}
+			}
+		}
+
+		/*
 		if (isset($rule[0]) && $rule[0]) {
 			if (isset($col[$rule[0]]) && $rule[0] != $key) {
 				# 此处开task
@@ -38,6 +55,7 @@ class Core
 				$result = Dom::find($data, $rule[0]);
 			}
 		}
+		*/
 		if (isset($rule[1]) && $rule[1]) $result = self::match($rule[1], $result);
 		
 		return $result;