dever 4 years ago
parent
commit
b550889b32
7 changed files with 139 additions and 60 deletions
  1. 46 4
      database/col.php
  2. 10 11
      database/project.php
  3. 41 30
      lib/Api.php
  4. 8 7
      lib/Doc.php
  5. 22 7
      lib/Parse.php
  6. 11 1
      lib/Project.php
  7. 1 0
      src/Data.php

+ 46 - 4
database/col.php

@@ -1,10 +1,16 @@
 <?php
 
 # Define a few commonly used option maps (value => label); $type and $local feed the 'option' key of the fields below
-$option = array
+$type = array
 (
-	1 => '正常',
-	2 => '删除',
+	1 => '文本内容',	# label reads "text content"
+	2 => '文件资源',	# label reads "file resource"
+);
+
+$local = array
+(
+	1 => '是',	# label reads "yes"
+	2 => '否',	# label reads "no"
 );
 return array
 (
@@ -70,11 +76,47 @@ return array
 			'search'	=> 'order',
 			'update'	=> 'hidden',
 		),
+
+		'type'		=> array
+		(
+			'type' 		=> 'tinyint-1',
+			'name' 		=> '字段类型-如果选择文件类型,将只会保存文件的网络地址',
+			'default' 	=> '1',
+			'desc' 		=> '字段类型',
+			'match' 	=> 'is_numeric',
+			'option' 	=> $type,
+			'update'	=> 'radio',
+			'list'		=> true,
+			'control'	=> 'type',
+		),
+
+		'local'		=> array
+		(
+			'type' 		=> 'tinyint-1',
+			'name' 		=> '资源是否本地化-资源包括图片、视频等,本地化后将转成本地地址',
+			'default' 	=> '1',
+			'desc' 		=> '资源是否本地化',
+			'match' 	=> 'is_numeric',
+			'option' 	=> $local,
+			'update'	=> 'radio',
+			'control'	=> 'local',
+		),
+
+		'res_key'		=> array
+		(
+			'type' 		=> 'int-11',
+			'name' 		=> '资源库ID-直接输入资源库的配置ID,将根据配置传入到相应的资源库里',
+			'default' 	=> '1',
+			'desc' 		=> '资源库ID',
+			'match' 	=> 'is_numeric',
+			'update'	=> 'text',
+			'show'		=> 'local=1',
+		),
 		
 		'collect_rule'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集规则-支持dom解析、正则',
+			'name' 		=> '采集规则-支持dom解析、正则,如$(".info .title a").html(),或者直接输入正则表达式',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
 			'match' 	=> 'is_string',

+ 10 - 11
database/project.php

@@ -50,9 +50,8 @@ return array
 	'path' => $path,
 	# 后台菜单排序
 	'order' => 20,
-	'desc' => Dever::markdown('**启动守护进程的方法,以下三项任选其一:**
-	1. 常规任务:请将 ```* * * * * root php '.$path.'daemon/main.php``` 放到cron中[建议每分钟执行一次] 
-	2. 长期运行:请执行 ```php '.$path.'daemon/main.php 1>/dev/null 2>&1 &``` 指令,放置后台运行[一般用于队列等需要长期运行的项目]'),
+	'desc' => Dever::markdown('**启动守护进程的方法:**
+	常规任务:请将 ```* * * * * root php '.$path.'daemon/main.php``` 放到crontab中[建议每分钟执行一次]'),
 	# 数据结构
 	'struct' => array
 	(
@@ -100,7 +99,7 @@ return array
 			'name' 		=> '采集网址-如有分页,请写成这样{page=1}',
 			'default' 	=> '',
 			'desc' 		=> '采集网址',
-			'match' 	=> 'option',
+			'match' 	=> 'is_string',
 			'update'	=> $id < 0 ? 'hidden': 'textarea',
 			//'list'		=> true,
 			//'edit'		=> 'textarea',
@@ -109,10 +108,10 @@ return array
 		'collect_rule'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集规则-支持dom解析、json格式,如$(".info .title a").each().attr("href")',
+			'name' 		=> '采集规则-为空则获取整个页面的内容,支持dom解析、json格式,如$(".info .title a").each().attr("href")',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
-			'match' 	=> 'is_string',
+			'match' 	=> 'option',
 			'update'	=> $id < 0 ? 'hidden': 'textarea',
 			//'edit'		=> true,
 			//'list'		=> true,
@@ -268,14 +267,15 @@ return array
 			),
 			'col' => '*|id',
 		),
-		# 获取所有入队并且符合当前时间的配置
-		'get' => array
+		# 获取所有已完成,并且有时间间隔的数据,重新入队
+		'getAll' => array
 		(
 			'option' => array
 			(
 				'id' => 'yes',
 				'status' => 2,
-				'sdate' => array('yes-sdate`+`interval', '<='),
+				'interval' => array('yes', '>='),
+				'sdate' => array('yes-sdate', '<='),
 				'state' => 1,
 			),
 			'type' => 'all',
@@ -283,14 +283,13 @@ return array
 			'col' => '*',
 		),
 
-		# 获取所有运行中的数据
 		'getOne' => array
 		(
 			'option' => array
 			(
 				'id' => 'yes',
 				'status' => array('yes'),
-				'sdate' => array('yes-sdate`+`interval', '<='),
+				'sdate' => array('yes-sdate', '<='),
 				'state' => 1,
 			),
 			'type' => 'one',

+ 41 - 30
lib/Api.php

@@ -8,48 +8,71 @@ class Api
 	public function add_api($id)	# $id: a project id, or an already-loaded project config array
 	{
 		# Push the project onto the queue
-		$config = Dever::load('spider/lib/project')->get($id);
+		if (is_array($id)) {	# caller already holds the config row (daemon() passes rows straight in)
+			$config = $id;
+		} else {
+			$config = Dever::load('spider/lib/project')->get($id);
+		}
+		
 		if (!$config) {
 			Dever::alert('项目不存在');	# message reads "project does not exist"
 		}
-		Dever::load('spider/lib/project')->set($config, 3);
-		Dever::load('spider/lib/queue')->push($config['id']);
-		return 'yes';
+		if ($config['status'] <= 2) {	# projects whose status is above 2 are left alone, so they are not pushed twice
+			Dever::load('spider/lib/project')->set($config, 3);	# status 3 presumably means "queued" - TODO confirm; set() also bumps the num counter for status 3
+			Dever::load('spider/lib/queue')->push($config['id']);
+		}
+		return 'reload';	# returned whether or not the project was pushed
 	}
 
 	public function test_api($id)
 	{
 		Dever::setInput('test', 1);
 		$this->run($id);
-		return 'yes';
+		return 'reload';
 	}
 
-	public function cron()
+	# Daemon entry point; running it once per minute is enough
+	public function daemon()
 	{
-		if (!$this->queue) {
-			$this->queue = new Queue();
+		# Check whether the lib/api.cron worker process exists, and start it if not
+		$state = Dever::process('lib/api.cron', true);	# presumably the number of matching running processes - TODO confirm
+		if ($state <= 0) {
+			Dever::daemon('lib/api.cron', 'spider');
+		}
+
+		# Check which projects may start running now and enqueue each of them
+		$data = Dever::load('spider/lib/project')->getAll();
+		if ($data) {
+			foreach ($data as $k => $v) {
+				$this->add_api($v);	# $v is a full project row, which add_api() accepts in place of an id
+			}
 		}
-		Dever::import('task');
+	}
+
+	public function cron()	# worker loop: creates the queue, then calls load() forever and never returns
+	{
+		$this->queue = new Queue();
+		//Dever::import('task');
 		while (1) {
 			$this->load();
 		}
 	}
 
-	private function load()
+	public function load()
 	{
 		try {
 			$id = $this->queue->pop();
 			if ($id) {
 				$config = Dever::load('spider/lib/project')->get($id);
-				if ($config && $config['status'] <= 2 && time() >= $config['sdate']) {
+				if ($config) {
 					# 推到后台运行
 					# 获取当前执行的进程数量
-		            $num = $this->getNum();
+		            $num = Dever::process('lib/api.run', true);
 		            if ($num >= 1000) {
 		                # 等会儿再执行
 		                sleep(60);
 		            }
-		            $this->popen($id);
+		            Dever::daemon('lib/api.run?id=' . $id, 'spider');
 				}
 			}
 			return true;
@@ -58,8 +81,12 @@ class Api
         }
 	}
 
-	public function run($id)
+	public function run()
 	{
+		$id = Dever::input('id');
+		if (!$id) {
+			return false;
+		}
 		$config = Dever::load('spider/lib/project')->get($id);
 		if (!$config) {
 			return false;
@@ -83,22 +110,6 @@ class Api
 		Dever::load('spider/lib/project')->set($config, 2);
 	}
 
-	# 将数据推到子进程处理
-    public function popen($id)
-    {
-    	$path = Dever::db('spider/project')->config['path'] . 'daemon/run.php';
-        $command = 'php '.$path.' -send id=' . $id . ' 1>/dev/null 2>&1 &';
-        exec($command);
-    }
-
-	# 获取当前执行的子进程数量
-    public function getNum()
-    {
-        $command = 'ps -ef | grep gateway/api.task/runOne | grep -v grep | wc -l';
-        $num = exec($command);
-        return $num;
-    }
-
 	private function col($project)
 	{
 		return Dever::db('spider/col')->getList(['where_pid' => $project]);

+ 8 - 7
lib/Doc.php

@@ -26,6 +26,11 @@ class Doc
 	{
 		$this->url($url);
 		$this->rule = $rule;
+		if (strpos($this->rule, '$json') !== false) {
+			$this->type = 'json';
+		} else {
+			$this->type = 'dom';
+		}
 	}
 
 	private function url($url)
@@ -48,11 +53,6 @@ class Doc
 	{
 		$url = $url ? $url : $this->url;
 		$html = $this->download($url);
-		if (strpos($this->rule, '$json') !== false) {
-			$this->type = 'json';
-		} else {
-			$this->type = 'dom';
-		}
 		return ($this->getClass())::init($html);
 	}
 
@@ -95,16 +95,17 @@ class Doc
 
 	public function init($data)
 	{
-		if (is_string($data) && strstr($data, 'http')) {
+		if (is_string($data) && filter_var($data, FILTER_VALIDATE_URL) !== false) {
 			$data = $this->doc($data);
 		} else {
-			$data = ($this->getClass())::get($data);
+			$data = ($this->getClass())::init($data);
 		}
 		return $data;
 	}
 
 	public function rule($data, $col, $config)
 	{
+
 		$name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
 		$this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');
 

+ 22 - 7
lib/Parse.php

@@ -17,16 +17,23 @@ class Parse
 		$data = $doc->get();
 		if ($data) {
 			if (!is_array($data) && !is_object($data)) {
-				$data = Dever::json_decode($data);
+				$state = Dever::json_decode($data);
+				if ($state) {
+					$data = $state;
+				}
 			}
 			if ($data) {
-				$domain = parse_url($url);
-				$host = $domain['scheme'] . '://' . $domain['host'];
-				foreach ($data as $k => $v) {
-					if (!strstr($v, 'http')) {
-						$v = $host . $v;
+				if (is_array($data)) {
+					$domain = parse_url($url);
+					$host = $domain['scheme'] . '://' . $domain['host'];
+					foreach ($data as $k => $v) {
+						if (!strstr($v, 'http')) {
+							$v = $host . $v;
+						}
+						$this->data[$k] = $this->load($doc, $v, $col, $project);
 					}
-					$this->data[$k] = $this->load($doc, $v, $col, $project);
+				} else {
+					$this->data = $this->load($doc, $data, $col, $project);
 				}
 			}
 		}
@@ -42,6 +49,13 @@ class Parse
 	{
 		$result = $table = array();
 		$data = $doc->init($data);
+		if (!$col) {
+			if (Dever::input('test') == 1) {
+				$doc->outLog();
+				echo 'error';die;
+			}
+			return false;
+		}
 		foreach ($col as $v) {
 			$callback = false;
 			if (strpos($v['key'], '.') !== false) {
@@ -49,6 +63,7 @@ class Parse
 				$v['key'] = $temp[1];
 				$callback = $temp[0];
 			}
+
 			$value = $doc->rule($data, $col, $v);
 			if ($value == 'error') {
 				break;

+ 11 - 1
lib/Project.php

@@ -56,7 +56,7 @@ class Project
 		
 		$data['where_id'] = $config['id'];
 		if ($status == 3) {
-			$data['num'] += 1;
+			$data['num'] = $config['num'] + 1;
 		}
 		if ($status == 2 && $config['interval'] > 0) {
 			# 已结束,设置下次的时间
@@ -64,4 +64,14 @@ class Project
 		}
 		return Dever::db('spider/project')->update($data);
 	}
+
+	# Fetch the projects that can run now.
+	# Passes interval = 1 and sdate = current time to the 'getAll' query of spider/project.
+	public function getAll()
+	{
+		$now = time();
+		$criteria = array('interval' => 1, 'sdate' => $now);
+
+		return Dever::db('spider/project')->getAll($criteria);
+	}
 }

+ 1 - 0
src/Data.php

@@ -25,6 +25,7 @@ class Data
 
 	public function get($project_id = 1)
 	{
+		echo Dever::input('id');die;
 		$data = Dever::load('spider/data-all', array('option_project_id' => $project_id));
 
 		return $data;