dever 3 years ago
parent
commit
53604e4d77
8 changed files with 117 additions and 37 deletions
  1. 1 1
      database/cate.php
  2. 31 4
      database/col.php
  3. 4 4
      database/project.php
  4. 9 1
      lib/Api.php
  5. 8 0
      lib/Doc.php
  6. 57 26
      lib/Parse.php
  7. 3 0
      lib/doc/Dom.php
  8. 4 1
      src/Data.php

+ 1 - 1
database/cate.php

@@ -5,7 +5,7 @@ return array
     # 表名
     'name' => 'cate',
     # 显示给用户看的名称
-    'lang' => '采集源设置',
+    'lang' => '采集源',
     # 后台菜单排序
     'order' => 9,
     # 数据结构

+ 31 - 4
database/col.php

@@ -5,6 +5,7 @@ $type = array
 (
 	1 => '文本内容',
 	2 => '文件资源',
+	3 => '富文本内容',
 );
 
 $local = array
@@ -12,6 +13,19 @@ $local = array
 	1 => '是',
 	2 => '否',
 );
+
+$source = array # Option labels offered by the 'source' column of this table definition.
+(
+	1 => '详情页', # 1 = detail page
+	2 => '列表页', # 2 = list page
+);
+
+$upload = function() # Closure handed to the 'res_key' select field as its 'option' source.
+{
+	$info = Dever::db('upload/upload')->select(); # Presumably every upload (resource library) config row; confirm against the upload component.
+	return $info; # Returned as-is, without reshaping.
+};
+
 return array
 (
 	# 表名
@@ -77,6 +91,18 @@ return array
 			'update'	=> 'hidden',
 		),
 
+		'source'		=> array
+		(
+			'type' 		=> 'tinyint-1',
+			'name' 		=> '字段来源',
+			'default' 	=> '1',
+			'desc' 		=> '字段来源',
+			'match' 	=> 'is_numeric',
+			'option' 	=> $source,
+			'update'	=> 'radio',
+			'list'		=> true,
+		),
+
 		'type'		=> array
 		(
 			'type' 		=> 'tinyint-1',
@@ -105,11 +131,12 @@ return array
 		'res_key'		=> array
 		(
 			'type' 		=> 'int-11',
-			'name' 		=> '资源库ID-直接输入资源库的配置ID,将根据配置传入到相应的资源库里',
+			'name' 		=> '资源库-选择资源库,将根据配置传入到相应的资源库里',
 			'default' 	=> '1',
 			'desc' 		=> '资源库ID',
 			'match' 	=> 'is_numeric',
-			'update'	=> 'text',
+			'update'	=> 'select',
+			'option' 	=> $upload,
 			'show'		=> 'local=1',
 		),
 		
@@ -170,7 +197,7 @@ return array
 		'collect_url'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集链接规则-如果当前页面里仍然下一页内容,这里设置获取下一页内容的链接规则,第二行可以输入链接中要包含的字符',
+			'name' 		=> '采集链接规则-如果当前页面里需要采集下一页内容,这里可以设置获取下一页内容的链接规则,第二行可以输入链接中要包含的字符',
 			'default' 	=> '',
 			'desc' 		=> '采集链接规则',
 			'match' 	=> 'option',
@@ -214,7 +241,7 @@ return array
 			(
 				'id' => 'desc',
 			),
-			'col' => '*|key',
+			'col' => '*|source|key|',
 		),
 	)
 );

+ 4 - 4
database/project.php

@@ -61,7 +61,7 @@ return array
 	# 表名
 	'name' => 'project',
 	# 显示给用户看的名称
-	'lang' => '项目配置',
+	'lang' => '采集规则',
 	'status' => $status,
 	'path' => $path,
 	# 后台菜单排序
@@ -100,7 +100,7 @@ return array
 		'name'		=> array
 		(
 			'type' 		=> 'varchar-100',
-			'name' 		=> '项目名称',
+			'name' 		=> '名称',
 			'default' 	=> '',
 			'desc' 		=> '请输入项目名称',
 			'match' 	=> 'is_string',
@@ -125,7 +125,7 @@ return array
 		'site'		=> array
 		(
 			'type' 		=> 'text-255',
-			'name' 		=> '采集网址-如有分页,请写成这样{page=1}',
+			'name' 		=> '采集网址-如有分页,请写成这样{page=1},如首页和第n页链接不同,请用||隔开,如首页是test,第二页是test/page/1,就写成test||/page/{page=1}',
 			'default' 	=> '',
 			'desc' 		=> '采集网址',
 			'match' 	=> 'is_string',
@@ -163,7 +163,7 @@ return array
 		'collect_rule'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集规则-为空则获取整个页面的内容,支持dom解析、json格式,dom解析$(".info .title a").each().attr("href"),json格式$json["data"]',
+			'name' 		=> '采集规则-采集页面链接规则,为空则获取整个页面的内容,支持dom解析、json格式,dom解析$(".info .title a").each().attr("href"),json格式$json["data"]',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
 			'match' 	=> 'option',

+ 9 - 1
lib/Api.php

@@ -119,7 +119,7 @@ class Api
 		Dever::load('spider/lib/project')->set($config, 2);
 	}
 
-	private function col($project)
+	private function col($project, $source = 1) // NOTE(review): $source is accepted but not used by the query below; confirm whether filtering by it was intended.
 	{
 		return Dever::db('spider/col')->getList(['where_pid' => $project]); // Looks up spider/col entries via the where_pid => $project condition.
 	}
@@ -148,6 +148,14 @@ class Api
 				$source = str_replace($match[0][0], $i, $source);
 				Dever::load('spider/lib/project')->set($config, 4, $i);
 				if ($type == 1) {
+					if (strstr($source, '||')) {
+						$temp = explode('||', $source);
+						if ($i == 1) {
+							$source = $temp[0];
+						} else {
+							$source = $temp[0] . $temp[1];
+						}
+					}
 					$config['site'] = $source;
 				} else {
 					$config['curl']['param'] = $source;

+ 8 - 0
lib/Doc.php

@@ -10,6 +10,7 @@ class Doc
 	private $rule;
 	public $type = 'dom';
 	private $log = false;
+	private $cur = '';
 	static protected $instance;
 
 	static public function getInstance($url, $rule = '')
@@ -53,6 +54,9 @@ class Doc
 	public function get($param = array())
 	{
 		$doc = $this->doc(false, $param);
+		if (!$this->cur) {
+			$this->cur = $doc;
+		}
 		if ($this->rule) {
 			$doc = $this->find($doc, $this->rule);
 		}
@@ -66,6 +70,10 @@ class Doc
 		return ($this->getClass())::init($html);
 	}
 
+	public function getCur() // Accessor for the document that get() stored in $this->cur.
+	{
+		return $this->cur; // get() only assigns cur while it is still empty, so a set value is not overwritten by later get() calls.
+	}
 
 	private function download($url, $header = '', $param = '')
 	{

+ 57 - 26
lib/Parse.php

@@ -48,8 +48,6 @@ class Parse
 
 	private function load($doc, $index, $data, $col, $set, $push, $project)
 	{
-		$result = $table = array();
-		$data = $doc->init($data);
 		if (!$col) {
 			if (Dever::input('test') == 1) {
 				$doc->outLog();
@@ -57,6 +55,44 @@ class Parse
 			}
 			return false;
 		}
+		$result = $table = array();
+
+		if (isset($col[1])) {
+			$data = $doc->init($data);
+			$this->getCol($doc, $col[1], $data, $result, $table);
+		}
+
+		if (isset($col[2])) {
+			$data = $doc->getCur();
+			$this->getCol($doc, $col[2], $data, $result, $table);
+		}
+
+		if ($set) {
+			foreach ($set as $v) {
+				$value = $this->set($index, $v, $project);
+				$result[$v['key']] = $value;
+				if (Dever::input('test') == 1) {
+					$table[$v['name']] = $value;
+				}
+			}
+		}
+
+		if ($push) {
+			$result['test'] = Dever::input('test');
+			$this->push($push, $result, $project);
+		}
+
+		if (Dever::input('test') == 1) {
+			$doc->outLog();
+			echo Dever::table($table);die;
+		}
+
+		$this->update($result, $project);
+		return $result;
+	}
+
+	private function getCol($doc, $col, $data, &$result, &$table)
+	{
 		foreach ($col as $v) {
 			$callback = false;
 			if (strpos($v['key'], '.') !== false) {
@@ -86,39 +122,19 @@ class Parse
 				$value = $this->filter($value);
 			}
 
-			$result[$v['key']] = $value;
-			if (Dever::input('test') == 1) {
-				$table[$v['name']] = $value;
-			}
-		}
-
-		if ($set) {
-			foreach ($set as $v) {
-				$value = $this->set($index, $v, $project);
+			if ($value) {
 				$result[$v['key']] = $value;
 				if (Dever::input('test') == 1) {
 					$table[$v['name']] = $value;
 				}
 			}
 		}
-
-		if ($push) {
-			$result['test'] = Dever::input('test');
-			$this->push($push, $result);
-		}
-
-		if (Dever::input('test') == 1) {
-			$doc->outLog();
-			echo Dever::table($table);die;
-		}
-
-		$this->update($result, $project);
-		return $result;
 	}
 
-	private function push($push, $data)
+	private function push($push, $data, $project)
 	{
 		$push = explode("\n", str_replace("\r", '', $push));
+		$data['project_id'] = $project;
 		foreach ($push as $k => $v) {
 			if (strstr($v, 'http')) {
 				Dever::curl($v, $data, 'post');
@@ -196,16 +212,31 @@ class Parse
 
 	private function copy($file) // Sends $file to upload/save.copy and returns the URL from the response, or '' when the response has no 'url'.
 	{
+		if (strstr($file, ',')) { // Comma-separated list: each entry is copied on its own via recursion.
+			$temp = explode(',', $file);
+			$file = array();
+			foreach($temp as $k => $v) {
+				$f = $this->copy($v);
+				if ($f) { // Entries whose copy came back empty are dropped from the result.
+					$file[] = $f;
+				}
+			}
+			$file = implode(',', $file); // Re-join the surviving URLs with commas.
+			return $file;
+		}
 		$data = Dever::load('upload/save.copy?file=' . $file . '&key=' . $this->res . '&state=1'); // NOTE(review): $file is concatenated without urlencode; confirm the loader tolerates query characters inside file URLs.
 		if (isset($data['url'])) {
 			return $data['url'];
 		} else {
-			return $file;
+			return ''; // A failed copy now yields '' (the removed line above returned the original $file).
 		}
 	}
 
 	private function update($data, $project)
 	{
+		if (!$data) {
+			return;
+		}
 		$param['option_pid'] = $project;
 		$param['option_value'] = json_encode($data, JSON_UNESCAPED_UNICODE);
 		$info = Dever::db('spider/data')->one($param);

+ 3 - 0
lib/doc/Dom.php

@@ -8,6 +8,9 @@ class Dom
 {
 	public static function init($html) // Builds a phpQuery document from an HTML string.
 	{
+		if (strstr($html, 'gb2312')) { // strstr is case-sensitive, so only the lowercase text 'gb2312' triggers this branch.
+			$html = str_replace('gb2312', 'utf-8', $html); // Replaces every occurrence of the label; the bytes are not transcoded here. NOTE(review): confirm $html is already UTF-8 at this point.
+		}
 		return phpQuery::newDocumentHTML($html);
 	}
 

+ 4 - 1
src/Data.php

@@ -12,9 +12,12 @@ class Data
 		$data = Dever::load('spider/data-one', $id);
 		$data = json_decode($data['value'], true);
 		$result = array();
+		if (!$data) {
+			return '';
+		}
 		foreach ($data as $k => $v) {
 			$col = Dever::db('spider/col')->one(array('key' => $k));
-			if ($col) {
+			if ($col && $col['type'] < 3 && strlen($v) < 1000) {
 				$result[$col['name'] . '('.$k.')'] = $v;
 			}
 		}