dever 4 years ago
parent
commit
d00a8236da
5 changed files with 205 additions and 13 deletions
  1. 15 3
      database/col.php
  2. 5 2
      database/project.php
  3. 138 0
      database/set.php
  4. 27 5
      lib/Doc.php
  5. 20 3
      lib/Parse.php

+ 15 - 3
database/col.php

@@ -128,7 +128,7 @@ return array
 		'collect_include'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '包含字符-如果填写该项,则进一步过滤,只保留包含有该字符的',
+			'name' 		=> '包含字符-如果填写该项,则只保留包含有该字符的,多个换行隔开,多个是或的关系',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
 			'match' 	=> 'option',
@@ -138,7 +138,7 @@ return array
 		'collect_exclude'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '不包含字符-如果填写该项,则进一步过滤,只保留不包含有该字符的',
+			'name' 		=> '不包含字符-如果填写该项,则只保留不包含有该字符的,多个换行隔开,多个是且的关系',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
 			'match' 	=> 'option',
@@ -148,13 +148,25 @@ return array
 		'collect_filter'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '过滤规则-如果填写该项,则过滤掉符合该规则的字符,仅支持字符串和正则',
+			'name' 		=> '过滤规则-如果填写该项,则过滤掉符合该规则的字符,仅支持字符串和正则,输入=>可以替换成后边的字符,如不如输入则过滤为空,多个换行隔开',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
 			'match' 	=> 'option',
 			'update'	=> 'textarea',
 		),
 
+		'collect_filter_link'		=> array
+		(
+			'type' 		=> 'tinyint-1',
+			'name' 		=> '是否过滤链接-将过滤内容中的链接',
+			'default' 	=> '2',
+			'desc' 		=> '是否过滤链接',
+			'match' 	=> 'is_numeric',
+			'option' 	=> $local,
+			'update'	=> 'radio',
+			'control'	=> 'local',
+		),
+
 		'state'		=> array
 		(
 			'type' 		=> 'tinyint-1',

+ 5 - 2
database/project.php

@@ -136,7 +136,7 @@ return array
 			'match' 	=> 'is_numeric',
 			'option' 	=> $status,
 			'list'		=> 'Dever::load("spider/lib/project.status", {id})',
-			//'update'	=> $id > 0 ? 'hidden' : 'radio',
+			'update'	=> 'radio',
 			//'edit'		=> true,
 		),
 
@@ -236,12 +236,15 @@ return array
 		'list_button' => array
 		(
 			'update' => array('编辑', '"project&option_pid={project_id}"'),
+			'list_data' => array('查看采集数据', '"data&search_option_pid={id}&oper_save_jump=project&oper_parent=project"', '{project_id} > 0'),
 			'delete' => '删除',
 			'br1' => array('<br /><br />'),
 			'add' => array('新增子项目', '"project&option_pid={id}&oper_parent=project&oper_save_jump=project"', '{project_id} == -1'),
 
 			'list_col' => array('设置采集字段', '"col&search_option_pid={id}&oper_parent=project"', '{project_id} > 0 && {status} <= 2'),
-			'list_data' => array('查看采集数据', '"data&search_option_pid={id}&oper_save_jump=project&oper_parent=project"', '{project_id} > 0'),
+
+			'list_col1' => array('设置自定义字段', '"set&search_option_pid={id}&oper_parent=project"', '{project_id} > 0 && {status} <= 2'),
+			
 			'br2' => array('<br /><br />'),
 			'new' => array('测试采集', 'Dever::url("spider/lib/api.test?id={id}")', '{project_id} > 0'),
 			'oper1' => array('开始采集', 'Dever::url("spider/lib/api.add?id={id}")', '{project_id} > 0 && {status} <= 2'),

+ 138 - 0
database/set.php

@@ -0,0 +1,138 @@
+<?php
+
+# Schema definition for the "set" table (custom fields attached to a project). Common option lists come first.
+$type = array
+(
+	1 => '固定值',	# fixed value
+	2 => '自增值',	# auto-increment value
+	3 => '区间随机值',	# random value within a range
+	4 => '时间随机值',	# random time value
+	5 => '公式算法',	# formula / algorithm
+);
+
+return array
+(
+	# Table name
+	'name' => 'set',
+	# Display name shown to users ("custom field management")
+	'lang' => '自定义字段管理',
+	'menu' => false,	# NOTE(review): presumably keeps this table out of the admin menu - confirm against the framework
+	# Sort position in the admin menu
+	'order' => 7,
+	# Data structure: one entry per column
+	'struct' => array
+	(
+	
+		'id' 		=> array
+		(
+			'type' 		=> 'int-11',
+			'name' 		=> 'ID',
+			'default' 	=> '',
+			'desc' 		=> '',
+			'match' 	=> 'is_numeric',
+			'search'	=> 'order',
+			'list'		=> true,
+			'order'		=> 'desc',
+		),
+
+		'name'		=> array
+		(
+			'type' 		=> 'varchar-100',
+			'name' 		=> '字段名称',	# label: "field name"
+			'default' 	=> '',
+			'desc' 		=> '字段名称',
+			'match' 	=> 'is_string',
+			'update'	=> 'text',
+			'search'	=> 'fulltext',
+			'list'		=> true,
+			'edit'		=> true,
+		),
+
+		'key'		=> array
+		(
+			'type' 		=> 'varchar-100',
+			'name' 		=> '字段唯一标识符',	# label: "unique field identifier"
+			'default' 	=> '',
+			'desc' 		=> '字段唯一标识符',
+			'match' 	=> 'is_string',
+			'update'	=> 'text',
+			'search'	=> 'fulltext',
+			'list'		=> true,
+			'edit'		=> true,
+		),
+
+		'pid'		=> array
+		(
+			'type' 		=> 'int-11',
+			'name' 		=> '项目id',	# label: "project id"
+			'default' 	=> Dever::input('search_option_pid', -1),	# read from the search_option_pid request input; -1 is presumably the fallback when it is absent - confirm
+			'desc' 		=> '请选择项目id',
+			'match' 	=> 'is_numeric',
+			'search'	=> 'order',
+			'update'	=> 'hidden',
+		),
+
+		'type'		=> array
+		(
+			'type' 		=> 'tinyint-1',
+			'name' 		=> '字段类型-如果选择文件类型,将只会保存文件的网络地址',	# NOTE(review): the label mentions a "file type", but $type has no such option - looks copied from another schema, confirm
+			'default' 	=> '1',
+			'desc' 		=> '字段类型',
+			'match' 	=> 'is_numeric',
+			'option' 	=> $type,	# choices come from the $type list defined at the top of this file
+			'update'	=> 'radio',
+			'list'		=> true,
+			'control'	=> 'type',
+		),
+
+		'value'		=> array
+		(
+			'type' 		=> 'varchar-2000',
+			'name' 		=> '字段值',	# label: "field value"
+			'default' 	=> '',
+			'desc' 		=> '采集规则',	# NOTE(review): reads "collection rule" while the label is "field value" - possibly copied from another schema, confirm
+			'match' 	=> 'is_string',
+			'update'	=> 'textarea',
+			'list'		=> true,
+			'edit'		=> 'textarea',
+		),
+
+		'state'		=> array
+		(
+			'type' 		=> 'tinyint-1',
+			'name' 		=> '数据状态',	# label: "data state"
+			'default' 	=> '1',
+			'desc' 		=> '请选择状态',
+			'match' 	=> 'is_numeric',
+		),
+		
+		'cdate'		=> array
+		(
+			'type' 		=> 'int-11',
+			'name' 		=> '录入时间',	# label: "entry time"
+			'match' 	=> array('is_numeric', time()),	# second element is the Unix timestamp at the moment this file is evaluated
+			'desc' 		=> '',
+			# Only takes effect on insert
+			'insert'	=> true,
+			'list'		=> 'date("Y-m-d H:i:s", {cdate})',
+		),
+	),
+
+	'request' => array
+	(
+		'getList' => array
+		(
+			'where' => array
+			(
+				'pid' => 'yes',	# NOTE(review): 'yes' presumably marks pid as a caller-supplied filter - confirm against the framework
+				'state' => 1,
+			),
+			'type' => 'all',
+			'order' => array
+			(
+				'id' => 'desc',
+			),
+			'col' => '*|key',	# NOTE(review): presumably selects all columns and indexes the result by "key" - confirm
+		),
+	)
+);

+ 27 - 5
lib/Doc.php

@@ -67,16 +67,38 @@ class Doc
 
 	private function collect($data, $include, $exclude, $filter)
 	{
-		if ($include && !preg_match('/' . $include . '/i', $data)) {
-			return 'error';
+		if ($include) {
+			$include = explode("\n", str_replace("\r", '', $include));
+			foreach ($include as $k => $v) {
+				$state = preg_match('/' . $v . '/i', $data);
+				if ($state) {
+					break;
+				}
+			}
+
+			if (!$state) {
+				return 'error';
+			}
 		}
-		if ($exclude && preg_match('/' . $exclude . '/i', $data)) {
-			return 'error';
+		if ($exclude) {
+			// One pattern per line. The data is rejected as soon as ANY pattern matches,
+			// i.e. it is kept only when it contains none of them.
+			$exclude = explode("\n", str_replace("\r", '', $exclude));
+			foreach ($exclude as $k => $v) {
+				if ($v === '') {
+					// A blank line would build the empty pattern //, which matches everything.
+					continue;
+				}
+				if (preg_match('/' . $v . '/i', $data)) {
+					return 'error';
+				}
+			}
+		}
 		if ($filter) {
 			$filter = explode("\n", str_replace("\r", '', $filter));
 			foreach ($filter as $k => $v) {
-				$data = preg_replace('/' . $v . '/i', '', $data);
+				$s = '';
+				if (strstr($v, '=>')) {
+					$temp = explode('=>', $v);
+					$v = $temp[0];
+					$s = $temp[1];
+				}
+				$data = preg_replace('/' . $v . '/i', $s, $data);
 			}
 		}
 		

+ 20 - 3
lib/Parse.php

@@ -81,6 +81,10 @@ class Parse
 				$value = $this->local($value, $v['type']);
 			}
 
+			if ($v['collect_filter_link'] == 1) {
+				$value = $this->filter($value);
+			}
+
 			$result[$v['key']] = $value;
 			if (Dever::input('test') == 1) {
 				$table[$v['name']] = $value;
@@ -88,12 +92,26 @@ class Parse
 		}
 		if (Dever::input('test') == 1) {
 			$doc->outLog();
-			print_r(Dever::table($table));die;
+			echo Dever::table($table);die;
 		}
 		$this->update($result, $project);
 		return $result;
 	}
 
+	/**
+	 * Strip hyperlinks from a piece of HTML, keeping only each link's inner text.
+	 *
+	 * @param string $content HTML fragment to clean
+	 * @return string content with every <a ... href="...">text</a> replaced by text
+	 */
+	private function filter($content)
+	{
+		// Capture groups stay in the positions filter_replace() reads:
+		// 1 = tag name, 2 = href value, 3 = remaining attributes, 4 = link text.
+		// [^>]*? and [^"]* keep each match inside a single <a ...> tag; a greedy .+ would run
+		// to the last href on the line and swallow every earlier link and the text between them.
+		$rule = '<(a)\s[^>]*?href="([^"]*)"([^>]*)>(.*?)<\/a>';
+		$result = preg_replace_callback('/' . $rule . '/i', array($this, 'filter_replace'), $content);
+		// preg_replace_callback() returns null on a PCRE error; keep the original content in that case.
+		return $result === null ? $content : $result;
+	}
+
+	/**
+	 * preg_replace_callback() handler used by filter(): replaces a matched link with its inner text.
+	 *
+	 * @param array $result match groups; index 4 holds the text between <a ...> and </a>
+	 * @return string the link text, or '' when the match carries none
+	 */
+	private function filter_replace($result)
+	{
+		// Use isset() rather than a truthiness check so a link whose text is "0" is kept,
+		// and always return a string instead of falling off the end with null.
+		return isset($result[4]) ? $result[4] : '';
+	}
+
 	private function local($content, $type = 1)
 	{
 		if ($type == 1) {
@@ -102,7 +120,6 @@ class Parse
 		} else {
 			$content = $this->copy($content);
 		}
-		
 
 		return $content;
 	}
@@ -124,7 +141,7 @@ class Parse
 		if (isset($data['url'])) {
 			return $data['url'];
 		} else {
-			return '';
+			return $file;
 		}
 	}