dever 3 年之前
父節點
當前提交
07ffc4b376
共有 9 個文件被更改,包括 309 次插入和 56 次刪除
  1. 25 1
      database/cate.php
  2. 14 4
      database/col.php
  3. 17 1
      database/data.php
  4. 2 2
      database/project.php
  5. 86 17
      lib/Api.php
  6. 14 6
      lib/Doc.php
  7. 1 1
      lib/Doc/Core.php
  8. 51 23
      lib/Parse.php
  9. 99 1
      src/Data.php

+ 25 - 1
database/cate.php

@@ -26,7 +26,7 @@ return array
         'name'      => array
         (
             'type'      => 'varchar-32',
-            'name'      => '分类名称',
+            'name'      => '采集源名称',
             'default'   => '',
             'desc'      => '请输入名称',
             'match'     => 'is_string',
@@ -34,6 +34,30 @@ return array
             'search'    => 'fulltext',
             'list'      => true,
         ),
+
+        'site'      => array
+        (
+            'type'      => 'text-255',
+            'name'      => '采集源网址',
+            'default'   => '',
+            'desc'      => '采集源网址',
+            'match'     => 'is_string',
+            'update'    => 'text',
+            //'list'        => true,
+            //'edit'        => 'textarea',
+        ),
+
+        'collect_rule'      => array
+        (
+            'type'      => 'varchar-500',
+            'name'      => '采集列表链接-采集列表页面链接规则,为空则不采集列表页面链接,第二行可以输入链接中要包含的字符',
+            'default'   => '',
+            'desc'      => '采集规则',
+            'match'     => 'option',
+            'update'    => 'textarea',
+            //'edit'        => true,
+            //'list'        => true,
+        ),
         
         'reorder'       => array
         (

+ 14 - 4
database/col.php

@@ -175,9 +175,19 @@ return array
 		'collect_filter'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '过滤规则-如果填写该项,则过滤掉符合该规则的字符,仅支持字符串和正则,输入=>可以替换成后边的字符,如不如输入则过滤为空,多个换行隔开',
+			'name' 		=> '过滤字符串-如果填写该项,则过滤掉符合该规则的字符,仅支持字符串和正则,输入=>可以替换成后边的字符,如不如输入则过滤为空,多个换行隔开',
 			'default' 	=> '',
-			'desc' 		=> '采集规则',
+			'desc' 		=> '过滤字符串',
+			'match' 	=> 'option',
+			'update'	=> 'textarea',
+		),
+
+		'collect_attr'		=> array
+		(
+			'type' 		=> 'varchar-500',
+			'name' 		=> '过滤属性-如果填写该项,则过滤掉html中的属性,这里直接填写属性的名称,如style,多个换行隔开',
+			'default' 	=> '',
+			'desc' 		=> '过滤属性',
 			'match' 	=> 'option',
 			'update'	=> 'textarea',
 		),
@@ -197,9 +207,9 @@ return array
 		'collect_url'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集链接规则-如果当前页面里需要采集下一页内容,这里可以设置获取下一页内容的链接规则,第二行可以输入链接中要包含的字符',
+			'name' 		=> '采集下级链接-如果当前页面里需要采集下一页内容,这里可以设置获取下一页内容的链接规则,第二行可以输入链接中要包含的字符',
 			'default' 	=> '',
-			'desc' 		=> '采集链接规则',
+			'desc' 		=> '采集下级链接',
 			'match' 	=> 'option',
 			'update'	=> 'textarea',
 			//'list'		=> true,

+ 17 - 1
database/data.php

@@ -6,6 +6,8 @@ $option = array
 	1 => '正常',
 	2 => '删除',
 );
+
+$pid = Dever::input('search_option_pid');
 return array
 (
 	# 表名
@@ -43,6 +45,18 @@ return array
 			//'show'		=> 'cate_id',
 		),
 
+		'source'		=> array
+		(
+			'type' 		=> 'varchar-2000',
+			'name' 		=> '来源',
+			'default' 	=> '',
+			'desc' 		=> '来源',
+			'match' 	=> 'is_string',
+			'update'	=> 'text',
+			'search'	=> 'fulltext',
+			'list'		=> true,
+		),
+
 		'value'		=> array
 		(
 			'type' 		=> 'text-1000',
@@ -101,7 +115,9 @@ return array
         'button' => array
         (
             //'新增兑换码' => array('fast', 1, 'config&where_id=1'),
-            '导出采集数据' => '',
+            //'导出EXCEL' => array('location', 'spider/data.excel?id=' . $pid),
+            '导出SQL' => array('location', 'spider/data.sql?id=' . $pid . '&time=' . time()),
+            '清空数据' => array('oper', 'spider/data.drop?id=' . $pid),
         ),
 	),
 

+ 2 - 2
database/project.php

@@ -125,7 +125,7 @@ return array
 		'site'		=> array
 		(
 			'type' 		=> 'text-255',
-			'name' 		=> '采集网址-如有分页,请写成这样{page=1},如首页和第n页链接不同,请用||隔开,如首页是test,第二页是test/page/1,就写成test||/page/{page=1}',
+			'name' 		=> '采集网址-第一行填写首页链接,第二行填写后续的分页部分,分页写成{page=1},分类写成{cate=1}',
 			'default' 	=> '',
 			'desc' 		=> '采集网址',
 			'match' 	=> 'is_string',
@@ -163,7 +163,7 @@ return array
 		'collect_rule'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集规则-采集页面链接规则,为空则获取整个页面的内容,支持dom解析、json格式,dom解析$(".info .title a").each().attr("href"),json格式$json["data"]',
+			'name' 		=> '采集详情链接-采集详情页面链接规则,为空则进行单页采集,直接获取整个页面的内容,支持dom解析、json格式,dom解析$(".info .title a").each().attr("href"),json格式$json["data"]',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
 			'match' 	=> 'option',

+ 86 - 17
lib/Api.php

@@ -26,7 +26,7 @@ class Api
 
 	public function test_api($id)
 	{
-		Dever::setInput('test', 1);
+		//Dever::setInput('test', 1);
 		$this->run($id);
 		return 'reload';
 	}
@@ -91,6 +91,10 @@ class Api
 		if (!$config) {
 			return false;
 		}
+		$cate = Dever::db('spider/cate')->find($config['cate_id']);
+		if (!$cate) {
+			return false;
+		}
 		$col = $this->col($config['id']);
 		$set = $this->set($config['id']);
 
@@ -101,10 +105,77 @@ class Api
 			'header' => $config['header'],
 			'param' => $config['param'],
 		);
-		if (strpos($config['site'], '{page=') !== false) {
-			$this->page($config['site'], 1, $config, $col, $set);
-		} if (strpos($config['param'], '{page=') !== false) {
-			$this->page($config['param'], 2, $config, $col, $set);
+		
+		$site = Dever::split($config['site']);
+
+		$config['site'] = $site[0];
+		$config['page'] = '';
+
+		if (!strstr($config['site'], 'http')) {
+			$config['site'] = $cate['site'] . $config['site'];
+		}
+
+		if (isset($site[1]) && $site[1]) {
+			$config['page'] = $site[1];
+		}
+
+		if ($cate['collect_rule'] && $cate['site']) {
+			$rule = Dever::split($cate['collect_rule']);
+			if (!isset($rule[1])) {
+				$rule[1] = '';
+			}
+
+			$doc = Doc::getInstance($cate['site'], $rule[0]);
+
+			$doc->log(new Log($id));
+			$data = $doc->get($config['curl']);
+			$data = Dever::json_decode($data);
+			if ($data) {
+				foreach ($data as $k => $v) {
+					if (!$v) {
+						continue;
+					}
+					if ($rule[1] && !strstr($v, $rule[1])) {
+						continue;
+					}
+					$config['site'] = $v;
+					$this->task($config, $col, $set, $v);
+				}
+			}
+		} else {
+			$this->task($config, $col, $set);
+		}
+
+		
+		Dever::load('spider/lib/project')->set($config, 2);
+	}
+
+	private function task($config, $col, $set, $cate = false)
+	{
+		if (strpos($config['site'], '{cate=') !== false) {
+			$pat = '/{cate=(.*?)}/i';
+			preg_match_all($pat, $config['site'], $match);
+			if (isset($match[1][0]) && $match[1][0]) {
+				$cate = $cate ? $cate : $match[1][0];
+				$config['site'] = str_replace($match[0][0], $cate, $config['site']);
+			}
+		}
+
+		if ($config['page'] && strpos($config['page'], '{cate=') !== false) {
+			$pat = '/{cate=(.*?)}/i';
+			preg_match_all($pat, $config['page'], $match);
+			if (isset($match[1][0]) && $match[1][0]) {
+				$cate = $cate ? $cate : $match[1][0];
+				$config['page'] = str_replace($match[0][0], $cate, $config['page']);
+			}
+		}
+
+		if ($config['page'] && strpos($config['page'], '{page=') !== false) {
+			$this->page($config['page'], 1, $config, $col, $set);
+		} elseif (strpos($config['site'], '{page=') !== false) {
+			$this->page($config['site'], 2, $config, $col, $set);
+		} elseif ($config['param'] && strpos($config['param'], '{page=') !== false) {
+			$this->page($config['param'], 3, $config, $col, $set);
 		} else {
 			Dever::load('spider/lib/project')->set($config, 4, 1);
 			$this->parse($config['site'], $config['id'], $config['collect_rule'], $config['curl'], $col, $set, $config['push']);
@@ -116,7 +187,6 @@ class Api
 			$this->parse($config['url'], $config['id'], $config['collect_rule'], $col);
 		});
 		*/
-		Dever::load('spider/lib/project')->set($config, 2);
 	}
 
 	private function col($project, $source = 1)
@@ -137,6 +207,7 @@ class Api
 
 	private function page($source, $type, $config, $col, $set)
 	{
+		$site = $config['site'];
 		$pat = '/{page=(.*?)}/i';
 		preg_match_all($pat, $source, $match);
 		if (isset($match[1][0]) && $match[1][0]) {
@@ -145,22 +216,20 @@ class Api
 			$page = $match[1][0];
 
 			for ($i = $page; $i <= $config['page_num']; $i++) {
-				$source = str_replace($match[0][0], $i, $source);
+				$site_page = str_replace($match[0][0], $i, $source);
 				Dever::load('spider/lib/project')->set($config, 4, $i);
 				if ($type == 1) {
-					if (strstr($source, '||')) {
-						$temp = explode('||', $source);
-						if ($i == 1) {
-							$source = $temp[0];
-						} else {
-							$source = $temp[0] . $temp[1];
-						}
+					if ($i == 1) {
+						$site = $config['site'];
+					} else {
+						$site = $config['site'] . $site_page;
 					}
-					$config['site'] = $source;
+				} elseif ($type == 2) {
+					$site = $site_page;
 				} else {
-					$config['curl']['param'] = $source;
+					$config['curl']['param'] = $site_page;
 				}
-				$this->parse($config['site'], $config['id'], $config['collect_rule'], $config['curl'], $col, $set, $config['push']);
+				$this->parse($site, $config['id'], $config['collect_rule'], $config['curl'], $col, $set, $config['push']);
 			}
 		}
 	}

+ 14 - 6
lib/Doc.php

@@ -83,10 +83,10 @@ class Doc
 		return $download->get($this->type);
 	}
 
-	private function collect($data, $include, $exclude, $filter)
+	private function collect($data, $include, $exclude, $filter, $attr)
 	{
 		if ($include) {
-			$include = explode("\n", str_replace("\r", '', $include));
+			$include = Dever::split($include);
 			foreach ($include as $k => $v) {
 				$state = preg_match('/' . $v . '/i', $data);
 				if ($state) {
@@ -99,7 +99,7 @@ class Doc
 			}
 		}
 		if ($exclude) {
-			$exclude = explode("\n", str_replace("\r", '', $exclude));
+			$exclude = Dever::split($exclude);
 			foreach ($exclude as $k => $v) {
 				$state = preg_match('/' . $v . '/i', $data);
 				if (!$state) {
@@ -108,7 +108,7 @@ class Doc
 			}
 		}
 		if ($filter) {
-			$filter = explode("\n", str_replace("\r", '', $filter));
+			$filter = Dever::split($filter);
 			foreach ($filter as $k => $v) {
 				$s = '';
 				if (strstr($v, '=>')) {
@@ -119,6 +119,14 @@ class Doc
 				$data = preg_replace('/' . $v . '/i', $s, $data);
 			}
 		}
+		if ($attr) {
+			$attr = Dever::split($attr);
+			foreach ($attr as $k => $v) {
+				$s = '';
+				$v = $v . '="(.*?)"';
+				$data = preg_replace('/' . $v . '/i', $s, $data);
+			}
+		}
 		
 		return $data;
 	}
@@ -153,7 +161,7 @@ class Doc
 		$result = $this->getRule($data, $col, $config['collect_rule'], $config['key']);
 
 		if (isset($config['collect_url']) && $config['collect_url']) {
-			$collect_url = explode("\n", str_replace("\r", '', $config['collect_url']));
+			$collect_url = Dever::split($config['collect_url']);
 			if (!isset($collect_url[1])) {
 				$collect_url[1] = '';
 			}
@@ -165,7 +173,7 @@ class Doc
 		
 		$this->addLog($name . '解析完成');
 
-		return $this->collect($result, $config['collect_include'], $config['collect_exclude'], $config['collect_filter']);
+		return $this->collect($result, $config['collect_include'], $config['collect_exclude'], $config['collect_filter'], $config['collect_attr']);
 	}
 
 	public function getNext(&$result, $data, $col, $collect_url, $collect_include, $collect_rule, $key)

+ 1 - 1
lib/Doc/Core.php

@@ -21,7 +21,7 @@ class Core
 	public static function rule($doc, $data, $col, $rule, $key)
 	{
 		$result = '';
-		$rule = explode("\n", str_replace("\r", '', $rule));
+		$rule = Dever::split($rule);
 		if (isset($rule[0]) && $rule[0]) {
 			if (isset($col[$rule[0]]) && $rule[0] != $key) {
 				if (isset($rule[1]) && $rule[1]) {

+ 51 - 23
lib/Parse.php

@@ -1,7 +1,7 @@
 <?php
 namespace Spider\Lib;
 use Dever;
-
+use Spider\Lib\Doc\Dom;
 class Parse
 {
 	private $url = '';
@@ -31,10 +31,10 @@ class Parse
 						if (is_string($v) && !strstr($v, 'http')) {
 							$v = $host . ltrim($v, '/');
 						}
-						$this->data[$k] = $this->load($doc, $k, $v, $col, $set, $push, $project);
+						$this->data[$k] = $this->load($doc, $k, $v, $col, $set, $push, $project, $v);
 					}
 				} else {
-					$this->data = $this->load($doc, 0, $data, $col, $set, $push, $project);
+					$this->data = $this->load($doc, 0, $data, $col, $set, $push, $project, $url);
 				}
 			}
 		}
@@ -46,7 +46,7 @@ class Parse
 		return $this->data;
 	}
 
-	private function load($doc, $index, $data, $col, $set, $push, $project)
+	private function load($doc, $index, $data, $col, $set, $push, $project, $source)
 	{
 		if (!$col) {
 			if (Dever::input('test') == 1) {
@@ -87,7 +87,7 @@ class Parse
 			echo Dever::table($table);die;
 		}
 
-		$this->update($result, $project);
+		$this->update($result, $project, $source);
 		return $result;
 	}
 
@@ -133,7 +133,7 @@ class Parse
 
 	private function push($push, $data, $project)
 	{
-		$push = explode("\n", str_replace("\r", '', $push));
+		$push = Dever::split($push);
 		$data['project_id'] = $project;
 		foreach ($push as $k => $v) {
 			if (strstr($v, 'http')) {
@@ -163,10 +163,10 @@ class Parse
 			eval($eval);
 			return $value;
 		} elseif ($data['type'] == 4) {
-			$temp = explode("\n", str_replace("\r", '', $data['value']));
+			$temp = Dever::split($data['value']);
 			return mt_rand($temp[0], $temp[1]);
 		} elseif ($data['type'] == 5) {
-			$temp = explode("\n", str_replace("\r", '', $data['value']));
+			$temp = Dever::split($data['value']);
 			$temp[0] = Dever::maketime($temp[0]);
 			$temp[1] = Dever::maketime($temp[1]);
 			return mt_rand($temp[0], $temp[1]);
@@ -190,8 +190,23 @@ class Parse
 	private function local($content, $type = 1)
 	{
 		if ($type == 1) {
+			$doc = Dom::init($content);
+			$pic = Dom::find($doc, '$("img").each().attr("src")');
+			if ($pic) {
+				$content = $this->local_replace($pic, $content);
+			}
+			$video = Dom::find($doc, '$("video").each().attr("src")');
+			if ($video) {
+				$content = $this->local_replace($video, $content);
+			}
+			$audio = Dom::find($doc, '$("audio").each().attr("src")');
+			if ($audio) {
+				$content = $this->local_replace($audio, $content);
+			}
+			/*
 			$rule = '<(img|video|audio).+src=\"?(.+\.(jpg|gif|bmp|bnp|png))\"?.+>';
-			$content = preg_replace_callback('/' . $rule . '/i', array($this, 'local_replace'), $content);
+			$content = preg_replace_callback('/' . $rule . '/i', array($this, 'local_rule_replace'), $content);
+			*/
 		} else {
 			$content = $this->copy($content);
 		}
@@ -199,7 +214,25 @@ class Parse
 		return $content;
 	}
 
-	private function local_replace($result)
+	/**
+	 * Replace remote file addresses inside $content with their local copies.
+	 *
+	 * $file may be a single address, a list of addresses, or a JSON-encoded
+	 * list of addresses; lists are processed recursively, one entry at a time.
+	 *
+	 * @param mixed  $file    Address, list of addresses, or JSON list of addresses.
+	 * @param string $content Text in which the addresses are replaced.
+	 * @return string $content with every successfully copied address replaced.
+	 */
+	private function local_replace($file, $content)
+	{
+		if (is_string($file) && strstr($file, '[')) {
+			// A "[" only hints at a JSON list. Keep the original string when the
+			// decode does not produce a list, so an address that merely contains
+			// a bracket is still copied instead of being lost.
+			// NOTE(review): assumes Dever::json_decode returns an array on success — confirm.
+			$decoded = Dever::json_decode($file);
+			if (is_array($decoded)) {
+				$file = $decoded;
+			}
+		}
+
+		if (is_array($file)) {
+			foreach ($file as $item) {
+				$content = $this->local_replace($item, $content);
+			}
+			return $content;
+		}
+
+		// Nothing to copy or replace for an empty source.
+		if ($file === null || $file === false || $file === '') {
+			return $content;
+		}
+
+		// NOTE(review): copy() is defined outside this view; presumably it returns
+		// the local address, or a falsy value on failure — confirm.
+		$result = $this->copy($file);
+		if ($result) {
+			$content = str_replace($file, $result, $content);
+		}
+
+		return $content;
+	}
+
+	private function local_rule_replace($result)
 	{
 		if (isset($result[2]) && $result[2]) {
 			$file = $this->copy($result[2]);
@@ -232,29 +265,24 @@ class Parse
 		}
 	}
 
-	private function update($data, $project)
+	private function update($data, $project, $source)
 	{
 		if (!$data) {
 			return;
 		}
-		$param['option_pid'] = $project;
-		$param['option_value'] = json_encode($data, JSON_UNESCAPED_UNICODE);
+		$param['pid'] = $project;
+		$param['source'] = $source;
 		$info = Dever::db('spider/data')->one($param);
+		$param['value'] = json_encode($data, JSON_UNESCAPED_UNICODE);
 		if ($info) {
-			$update = array();
-			foreach ($param as $i => $j) {
-				$i = str_replace('option_', 'set_', $i);
-				$update[$i] = $j;
-			}
+			$update = $param;
 			$id = $update['where_id'] = $info['id'];
 			Dever::db('spider/data')->update($update);
 		} else {
-			$update = array();
-			foreach ($param as $i => $j) {
-				$i = str_replace('option_', 'add_', $i);
-				$update[$i] = $j;
-			}
+			$update = $param;
 			$id = Dever::db('spider/data')->insert($update);
 		}
+		echo $id;
+		echo "\r\n";
 	}
 }

+ 99 - 1
src/Data.php

@@ -44,7 +44,16 @@ class Data
 		}
 	}
 
-	# 导出对账单
+	# 清空数据
+	/**
+	 * Delete the collected data rows of one project.
+	 *
+	 * Reads the project id from the `id` request input and deletes the
+	 * `spider/data` rows whose `pid` equals it.
+	 *
+	 * @return string Always 'reload'.
+	 */
+	public function drop()
+	{
+		$pid = Dever::input('id');
+
+		// Never issue the delete without a project id: the button that links
+		// here can be rendered with an empty id.
+		// NOTE(review): how delete() treats an empty `pid` condition is not
+		// visible here — confirm it cannot match every row.
+		if (!$pid) {
+			return 'reload';
+		}
+
+		$where['pid'] = $pid;
+		Dever::db('spider/data')->delete($where);
+
+		return 'reload';
+	}
+
+	# 导出excel
     public function excel()
     {
     	$where['pid'] = Dever::input('id');
@@ -101,4 +110,93 @@ class Data
 
         Dever::excelExport($body, $header, $file);
     }
+
+    # 导出sql
+    /**
+     * Export the collected data of one project as a SQL INSERT statement.
+     *
+     * Reads the project id from the `id` request input, loads the project's
+     * `spider/data` rows together with its `spider/col` and `spider/set`
+     * definitions, and builds one multi-row INSERT whose columns are the
+     * `key` of every col followed by the `key` of every set. The statement
+     * is written to `upload/data.sql` below Dever::data() and handed to
+     * Dever::upload() and Dever::location().
+     *
+     * @return mixed '' when there is nothing to export, otherwise the result
+     *               of Dever::location().
+     */
+    public function sql()
+    {
+        $where['pid'] = Dever::input('id');
+        $data = Dever::db('spider/data')->select($where);
+
+        if (!$data) {
+            return '';
+        }
+        $col = Dever::db('spider/col')->select($where);
+        $set = Dever::db('spider/set')->select($where);
+
+        // Column keys in export order: every col first, then every set.
+        $keys = array();
+        foreach (array($col, $set) as $fields) {
+            if (!$fields) {
+                continue;
+            }
+            foreach ($fields as $field) {
+                $keys[] = $field['key'];
+            }
+        }
+        if (!$keys) {
+            // Without columns the INSERT statement would be invalid.
+            return '';
+        }
+
+        $header = $body = array();
+        foreach ($keys as $key) {
+            $header[] = '`' . $key . '`';
+        }
+
+        foreach ($data as $item) {
+            if (!$item['value']) {
+                continue;
+            }
+            $value = Dever::json_decode($item['value']);
+            if (!$value) {
+                continue;
+            }
+
+            // One output row per stored record; missing keys export as ''.
+            $row = array();
+            foreach ($keys as $key) {
+                $row[] = isset($value[$key]) ? $value[$key] : '';
+            }
+            $body[] = $row;
+        }
+        if (!$body) {
+            // Every stored value was empty or undecodable: nothing to insert.
+            return '';
+        }
+
+        $sql = $this->createInsertSql('data', implode(',', $header), $body);
+
+        // NOTE(review): the file name is fixed, so two exports running at the
+        // same time write to the same path.
+        $file = Dever::data() . 'upload/data.sql';
+        if (is_file($file)) {
+            unlink($file);
+        }
+        file_put_contents($file, $sql);
+
+        $file = Dever::upload($file);
+
+        return Dever::location($file);
+    }
+
+    /**
+     * Build a single multi-row INSERT statement.
+     *
+     * Every value is emitted as a single-quoted string literal. Quotes,
+     * backslashes and NUL bytes inside a value are backslash-escaped so that
+     * arbitrary collected text cannot terminate the literal early.
+     * NOTE(review): backslash escaping matches MySQL's default SQL mode;
+     * confirm the target database before importing elsewhere.
+     *
+     * @param string $table Table name; wrapped in backticks as given.
+     * @param string $col   Comma-separated column list, already quoted by the caller.
+     * @param array  $value List of rows, each row a list of column values.
+     * @return string The INSERT statement, or '' when there are no rows.
+     */
+    public function createInsertSql($table, $col, $value)
+    {
+        if (!$value) {
+            // "INSERT ... VALUES" without any tuple is not valid SQL.
+            return '';
+        }
+
+        $rows = array();
+        foreach ($value as $row) {
+            $cells = array();
+            foreach ($row as $cell) {
+                if (is_array($cell) || is_object($cell)) {
+                    // Nested structures would otherwise be exported as "Array".
+                    $cell = json_encode($cell, JSON_UNESCAPED_UNICODE);
+                }
+                $cells[] = addslashes((string) $cell);
+            }
+            $rows[] = '(\'' . implode('\',\'', $cells) . '\')';
+        }
+
+        return 'INSERT INTO `' . $table . '` (' . $col . ') VALUES ' . implode(',', $rows);
+    }
 }