rabin 7 years ago
parent
commit
ac1bf04c6d
8 changed files with 84 additions and 678 deletions
  1. 7 0
      daemon/main.php
  2. 34 13
      lib/Api.php
  3. 1 0
      lib/Doc.php
  4. 2 10
      lib/Project.php
  5. 31 11
      lib/Queue.php
  6. 6 2
      lib/doc/Core.php
  7. 3 0
      package.json
  8. 0 642
      src/Collect.php

+ 7 - 0
daemon/main.php

@@ -0,0 +1,7 @@
+<?php
+
+define('DEVER_DAEMON', true);
+
+include(dirname(__FILE__) . DIRECTORY_SEPARATOR . '../index.php');
+
+Dever::load('spider/api.cron');

+ 34 - 13
lib/Api.php

@@ -4,33 +4,49 @@ use Dever;
 
 class Api
 {
-	public function run_api($id, $ajax = false)
+	private $queue;
+	public function run($id)
 	{
+		# 写入队列
 		$project = new Project($id);
 		$config = $project->get();
-		$state = true;
-		while ($state) {
-			$state = $this->queue($config);
+		if (!$this->queue) {
+			$this->queue = new Queue('db');
 		}
+		$send['id'] = $config['id'];
+		$send['collect_rule'] = $config['collect_rule'];
+		foreach ($config['url'] as $k => $v) {
+			$this->queue->push($v, $send, $config['page_num']);
+		}
+	}
+
+	public function test_api($id)
+	{
+		Dever::setInput('test', 1);
+		$this->queue = new Queue('data');
+		$this->run($id);
+		$this->cron();
 		return 'yes';
 	}
 
-	public function test_api($id, $ajax = false)
+	private function cron()
 	{
-		$project = new Project($id);
-		$config = $project->get();
+		if (!$this->queue) {
+			$this->queue = new Queue('db');
+		}
 		$state = true;
 		while ($state) {
-			$state = $this->queue($config);
+			$state = $this->load();
 		}
-		return 'yes';
 	}
 
-	private function queue($config)
+	private function load()
 	{
-		$url = $config['queue']->pop();
-		if ($url) {
-			$this->parse($url, $config['id'], $config['collect_rule'], $config['col']);
+		$config = $this->queue->pop();
+		if ($config) {
+			# 此处开task
+			$col = $this->col($config['id']);
+			$this->parse($config['url'], $config['id'], $config['collect_rule'], $col);
 			$state = true;
 		} else {
 			$state = false;
@@ -38,6 +54,11 @@ class Api
 		return $state;
 	}
 
+	private function col($project)
+	{
+		return Dever::db('spider/col')->getList(['where_project_id' => $project]);
+	}
+
 	private function parse($url, $project, $rule, $col)
 	{
 		$parse = new Parse($url, $project, $rule, $col);

+ 1 - 0
lib/Doc.php

@@ -15,6 +15,7 @@ class Doc
 	{
 		$key = $url . md5($rule);
 		if (empty(self::$instance[$key])) {
+			# 此处开协程:Dever::coroutine(self::$instance[$key]);
 			self::$instance[$key] = new self($url, $rule);
 		}
 

+ 2 - 10
lib/Project.php

@@ -15,11 +15,11 @@ class Project
 		}
 		$this->config = Dever::db('spider/project')->one($id);
 		$this->check();
-		$this->runing();
 	}
 
 	public function get()
 	{
+		$this->setting();
 		return $this->config;
 	}
 
@@ -34,17 +34,9 @@ class Project
 		}
 	}
 
-	private function runing()
-	{
-		Dever::db('spider/project')->update(['status' => 2, 'where_id' => $this->config['id']]);
-		$this->setting();
-	}
-
 	private function setting()
 	{
-		$this->config['col'] = Dever::db('spider/col')->getList(['where_project_id' => $this->config['id']]);
+		Dever::db('spider/project')->update(['status' => 2, 'where_id' => $this->config['id']]);
 		$this->config['url'] = explode("\r\n", $this->config['site']);
-		$this->config['queue'] = new Queue();
-		array_walk($this->config['url'], [$this->config['queue'], 'push'], $this->config['page_num']);
 	}
 }

+ 31 - 11
lib/Queue.php

@@ -6,43 +6,63 @@ use Dever;
 
 class Queue
 {
-	private $data = array();
+	public function __construct($method)
+	{
+		if (!Dever::config('base')->queue) {
+			Dever::config('base')->queue = array('method' => $method);
+		} else {
+			Dever::config('base')->queue['method'] = $method;
+		}
+		
+		Dever::import('queue');
+	}
 
-	public function push($value, $key = 0, $num = 0)
+	public function push($url, $config, $num = 0)
 	{
-		if (strpos($value, '{') !== false && strpos($value, '}') !== false) {
-			$this->preg($value, $num);
+		if (strpos($url, '{') !== false && strpos($url, '}') !== false) {
+			$this->preg($url, $num, $config);
 		} else {
-			array_push($this->data, $value);
+			$this->push_db($url, $config);
 		}
 		return true;
 	}
 
 	public function pop()
 	{
-		return array_shift($this->data);
+		return $this->pop_db();
 	}
 
-	private function preg($value, $num)
+	private function preg($value, $num, $config)
 	{
 		$pat = '/{(.*?)}/i';
 		preg_match_all($pat, $value, $match);
 		if (isset($match[1][0]) && $match[1][0]) {
 			if ($num <= 0) $num = 1000;
 			parse_str($match[1][0], $param);
-			$this->page($param, $match[0][0], $value, $num);
+			$this->page($param, $match[0][0], $value, $num, $config);
 		}
 	}
 
-	private function page($param, $replace, $value, $num)
+	private function page($param, $replace, $value, $num, $config)
 	{
 		if (isset($param['page'])) {
 			for ($i = $param['page']; $i <= $num; $i++) {
 				$url = str_replace($replace, $i, $value);
-				$this->push($url);
+				$this->push($url, $config);
 			}
 		} else {
-			$this->push(str_replace($replace, '', $value));
+			$this->push(str_replace($replace, '', $value), $config);
 		}
 	}
+
+	private function push_db($value, $config)
+	{
+		$config['url'] = $value;
+		Dever::push($config);
+	}
+
+	private function pop_db()
+	{
+		return Dever::pop();
+	}
 }

+ 6 - 2
lib/doc/Core.php

@@ -9,9 +9,12 @@ class Core
 	{
 		$temp = explode('||', $pattern);
 		$index = isset($temp[1]) ? $temp[1] : 1;
-		$temp[0] = '来源:互联网 时间:(.*?)<a href="#comments">';
 		preg_match_all('/' . $temp[0] . '/i', $string, $match);
-		$result = $match[$index][0];
+		$result = '';
+		if (isset($match[$index][0])) {
+			$result = $match[$index][0];
+		}
+		
 		return $result;
 	}
 
@@ -20,6 +23,7 @@ class Core
 		$rule = explode("\n", str_replace("\r", '', $rule));
 		if (isset($rule[0]) && $rule[0]) {
 			if (isset($col[$rule[0]]) && $rule[0] != $key) {
+				# 此处开task
 				$result = Doc::getInstance($doc->getUrl($data, $col, $col[$rule[0]]), $rule[1])->get();
 				array_shift($rule);
 			} elseif (isset($data[$rule[0]])) {

+ 3 - 0
package.json

@@ -0,0 +1,3 @@
+{
+	"rely": "queue,process"
+}

+ 0 - 642
src/Collect.php

@@ -1,642 +0,0 @@
-<?php
-
-namespace Spider\Src;
-
-use Dever;
-
-class Collect
-{
-	# 保存数据
-	protected $_data;
-	
-	# 停止采集
-	public function stop()
-	{
-		$id	= Dever::input('id');
-		Dever::load('collect/config-update', array('set_state' => 1, 'where_id' => $id));
-	}
-	
-	# 生成配置文件
-	public function create()
-	{
-		$id	= Dever::input('id');
-		if($id > 0)
-		{
-			$data = Dever::load('collect/config-check', $id);
-			if($data)
-			{
-				foreach($data as $k => $v)
-				{
-					if(is_string($k) && $k != 'id')
-					{
-						$result[$k] = $v;
-					}
-				}
-				
-				$result = base64_encode(serialize($result));
-				$html = '<div style="width:100%; table-layout:fixed; word-break: break-all; overflow:hidden;">'.$result.'</div>';
-				echo $html;die;
-			}
-		}
-		echo '没有配置数据';die;
-	}
-	
-	# 开始后台采集数据
-	public function daemon()
-	{
-		$id	= Dever::input('id');
-		if($id > 0)
-		{
-			$data = Dever::load('collect/config-check', $id);
-
-			//Dever::daemon('data.run?id=' . $id, 'collect');
-
-			# 加入cron
-			Dever::cron($data['name'], $data['sdate'], 'data.run?id=' . $id, $data['time'], 'collect');
-			
-			Dever::abert('已经开始自动采集数据,关闭本窗口即可!');
-		}
-	}
-	
-	# 开始实时采集数据
-	public function run()
-	{
-		$id	= Dever::input('id');
-		
-		$where = array();
-		# 启动
-		if($id > 0)
-		{
-			$param['option_id'] = $id;
-
-			$config = Dever::load('collect/config-check', $id);
-
-			$this->_run($config);
-		}
-	}
-	
-	private function _run($config)
-	{
-		# 运行中
-		Dever::load('collect/config-update', array('set_status' => 2, 'where_id' => $config['id']));
-
-		//while(true);
-		if($config)
-		{
-			//$config = Dever::collect_decode($config);
-			if(strstr($config['site'], '|g|'))
-			{
-				$temp = explode('|g|', $config['site']);
-				$config['url'] = $temp[0] . $temp[1];
-				$config['site'] = $temp[0];
-
-				$this->_create($config);
-			}
-			else
-			{
-				$config['url'] = $config['site'];
-
-				$url = explode("\r\n", $config['url']);
-				foreach($url as $k => $v)
-				{
-					if($v)
-					{
-						$config['url'] = $v;
-						$this->_create($config);
-					}
-				}
-			}
-			
-			if(Dever::input('test') == 1)
-			{
-				print_r($this->_data);die;
-			}
-			
-			# 继续
-			$status = 4;
-			if($config['time'] <= 0)
-			{
-				# 完成状态
-				$status = 4;
-			}
-			Dever::load('collect/config-update', array('set_status' => $status, 'set_num' => $config['num'] + 1, 'set_sdate' => time(), 'where_id' => $config['id']));
-
-			die;
-		}
-	}
-	
-	private function _create($config, $page = 1)
-	{
-		$result = array();
-		# 分析整个网页
-		if($config['site_rule_content'] == 1)
-		{
-			if(Dever::input('ajax') == 1 && Dever::input('url'))
-			{
-				$config['url'] = Dever::input('url');
-			}
-			list($temp, $result) = $this->_match($config['url'], $config['site_rule'], true);
-
-			if(Dever::input('ajax') == 1)
-			{
-				$data['txt'] = $this->_ajax('网址' . $config['url'] . '分析中,已得到内容链接,分析内容中...');
-				$data['url'] = implode(',', $result);
-				$data['page'] = $config['page_rule'];
-				$data['site'] = $config['site'];
-
-				echo json_encode($data);die;
-			}
-		}
-		else
-		{
-			$result = explode("\r\n", $config['url']);
-		}
-
-		if($result)
-		{
-			foreach($result as $k => $v)
-			{
-				$this->_createOne($v, $config);
-			}
-			
-			if($config['page_rule'])
-			{
-				$config_page = $config['page_rule'];
-				//sleep(2);
-				$max = false;
-				if(strstr($config['page_rule'], '|g|'))
-				{
-					$temp = explode('|g|', $config['page_rule']);
-					$config_page = $temp[0];
-					$max = $temp[1];
-				}
-				$page = $page + 1;
-				if($max && $page > $max)
-				{
-					# 最多只能跑这个页数的数据
-				}
-				else
-				{
-					$config['url'] = $config['site'] . '' . str_replace('(*)', $page, $config_page);
-
-					$this->_create($config, $page);
-				}
-			}
-		}
-		//sleep(2);
-	}
-
-	private function _ajax($txt)
-	{
-		return '[时间:' . date('Y-m-d H:i:s') . ']' . $txt;
-	}
-
-	# ajax 取一条
-	public function cronOne()
-	{
-		$id	= Dever::input('id');
-		$where = array
-		(
-			array('=', 'id', $id),
-		);
-		# 验证当前配置的状态
-		$data = DEVER_Db::select('config', '', $where);
-		$config = $data[0];
-		$config = Dever::collect_decode($config);
-		if(strstr($config['site'], '|g|'))
-		{
-			$temp = explode('|g|', $config['site']);
-			$config['url'] = $temp[0] . $temp[1];
-			$config['site'] = $temp[0];
-		}
-		else
-		{
-			$config['url'] = $config['site'];
-		}
-
-		$url = Dever::input('url');
-		Dever::input('ajax', 1);
-
-		if($url && $config)
-		{
-			$this->_createOne($url, $config);
-		}
-	}
-
-	private function _createOne($v, $config)
-	{
-		if(!strstr($v, 'http://'))
-		{
-			if(strstr($v, '/'))
-			{
-				$t = explode('/', $v);
-			}
-			else
-			{
-				$v = '/' . $v;
-			}
-
-			if(!$t[0])
-			{
-				$u = parse_url($config['url']);
-				$v = 'http://' . $u['host'] . $v;
-			} 
-			else
-			{
-				$v = $config['url'] . $v;
-			}
-		}
-
-		# 分析标题
-		$link = $v;
-
-		list($html, $name) = $this->_match($v, $config['name_rule'], true);
-
-		if($name && isset($name[0]) && $name[0])
-		{
-			if($config['name_include'] && !strstr($name[0], $config['name_include']))
-			{
-				return;
-			}
-			# 分析内容
-			$content = '';
-			if($config['content_rule'])
-			{
-				$content = $this->_loadContent($v,$config, $html);
-			}
-
-			//print_r($content);die;
-			
-			# 分析摘要
-			$intro = '';
-			if($config['intro_rule'])
-			{
-				list($temp, $string) = $this->_match($html, $config['intro_rule']);
-				
-				if($string && $string[0])
-				{
-					$intro = $string[0];
-				}
-			}
-			
-			$cdate = '';
-			# 分析时间
-			if($config['date_rule'])
-			{
-				list($temp, $date) = $this->_match($html, $config['date_rule']);
-				
-				if($date && $date[0])
-				{
-					$cdate = $date[0];
-				}
-			}
-
-			$extend = '';
-			# 分析扩展字段
-			if($config['extend_rule'])
-			{
-				list($temp, $extend) = $this->_match($html, $config['extend_rule'], false, true);
-			}
-			
-			# 入库
-			$this->_data($name[0], $content, $intro, $cdate, $config, $config['url'], $link, $extend);
-		}
-	}
-
-	private function _loadContent($url, $config, $html)
-	{
-
-		$content = '';
-		if(strstr($config['content_rule'], '|page|'))
-		{
-			$page = explode('|page|', $config['content_rule']);
-			$config['content_rule'] = $page[0];
-			$content = $this->_getContent($config, $html, false);
-			if(strstr($url, '.html'))
-			{
-				$url = str_replace('.html', '', $url);
-			}
-			for($i = 2; $i<= 100 ; $i++)
-			{
-				$u = $url . str_replace('(*)', $i, $page[1]);
-
-				$ct = $this->_getContent($config, $u, true);
-
-				if($ct && !strstr($content, $ct))
-				{
-					$content = $content . "\r\n" . $ct;
-				}
-				else
-				{
-					break;
-				}
-			}
-		}
-		else
-		{
-			$content = $this->_getContent($config, $html, false);
-		}
-
-		return $content;
-
-	}
-
-	private function _getContent($config, $url, $state)
-	{
-		list($temp, $content) = $this->_match($url, $config['content_rule'], $state);
-						
-		$contents = array();
-
-		if($content && is_array($content))
-		{
-			foreach($content as $a => $b)
-			{
-				$contents[] = '<p>'.rtrim(ltrim($b)).'</p>';
-			}
-		}
-						
-		if($contents)
-		{
-			$content = implode('', $contents);
-		}
-						
-		# 过滤不想要的字符
-		if($content && $config['content_include'])
-		{
-			$pic = '';
-			$temp = explode("\r\n", $config['content_include']);
-			foreach($temp as $tk => $tv)
-			{
-				$gg =  '|g|0';
-				$method = false;
-				if(strstr($tv, '||'))
-				{
-					$temp = explode('||', $tv);
-					$tv = $temp[0];
-					$method = $temp[1];
-					$gg = '';
-				}
-				list($t, $include) = $this->_match($content, $tv . $gg);
-				if($method == 'pic' && $include)
-				{
-					foreach($include as $ik => $iv)
-					{
-						$u = parse_url($config['url']);
-						$iv = 'http://' . $u['host'] . '/'. $iv;
-						$pic = '<img src="'.$iv.'">';
-					}
-				}
-				elseif($include)
-				{
-					foreach($include as $ik => $iv)
-					{
-						if($pic)
-						{
-							$content = str_replace($iv, $pic, $content);
-							$pic = '';
-						}
-						else
-						{
-							$content = str_replace($iv, '', $content);
-						}
-					}
-				}
-			}
-		}
-		return $content;
-	}
-	
-	private function _match($data, $rule, $type = false, $mul = false)
-	{
-		if(strstr($rule, "\r\n") || $mul == true)
-		{
-			$array = explode("\r\n", $rule);
-			$return = array();
-			foreach($array as $k => $v)
-			{
-				if($v)
-				{
-					$result = $this->_match($data, $v, $type);
-					if($result)
-					{
-						if($mul == true && isset($result[1]))
-						{
-							$return[] = $result[1][0];
-						}
-						else
-						{
-							return $result;
-							break;
-						}
-					}
-				}
-			}
-			if($mul == true && $return)
-			{
-				return array(false, implode('||', $return));
-			}
-			
-			return array(false, false);
-		}
-		
-		$index = 1;
-		if(strstr($rule, '|g|'))
-		{
-			$array = explode('|g|', $rule);
-			$rule = $array[0];
-			$index = $array[1];
-		}
-		
-		# 这里做这个替换是为了防止有人不做转义,而有的又做了转义
-		if(strstr($rule, '\\/'))
-		{
-			$rule = str_replace('\\/', '/', $rule);
-		}
-		if(strstr($rule, '/'))
-		{
-			$rule = str_replace('/', '\\/', $rule);
-		}
-		
-		if($type == true)
-		{
-			if(isset($array[2]) && $array[2])
-			{
-				$data = $array[2] . $data;
-			}
-			//sleep(1);
-			$data = Dever::curl($data);
-
-			if(!$data) return array(false, false);
-
-			
-			$encode = mb_detect_encoding($data, array('GB2312','GBK','UTF-8'));
-
-			//echo $encode;die;
-
-			if($encode == 'GB2312' || $encode == 'GBK' || $encode == 'EUC-CN')
-			{
-				$data = \iconv('GBK', 'UTF-8', $data);
-			}
-
-			if($encode == 'CP936')
-			{
-				$data = \iconv('SJIS', 'UTF-8', $data);
-			}
-		}
-		
-		# 过滤换行
-		$data = str_replace(PHP_EOL, '', $data); 
-
-		preg_match_all('/' . $rule . '/i', $data, $result);
-			
-		return array($data, $result[$index]);
-	}
-	
-	# 将得到的数据生成一份保存下来
-	private function _data($name, $content, $intro, $cdate, $config, $site, $link, $extend)
-	{
-		$data['add_name'] = strip_tags(trim($name));
-		$data['add_cate_id'] = $config['cate_id'] ? $config['cate_id'] : 1;
-		//$data['add_admin'] = 1;
-		
-		# 标签分析工具
-		//$data['log_Tag'] = $this->_tag($name);
-		$data['add_content'] = $content;
-		$data['add_extend'] = $extend;
-		$data['add_info'] = trim($intro);
-		$data['add_cdate'] = $cdate ? Dever::maketime(strip_tags(trim($cdate))) : time();
-		$data['add_source_link'] = $link;
-		$data['add_source_list'] = $site;
-		$data['add_config_id'] = $config['id'];
-
-		//print_r($data);die;
-		
-		if(Dever::input('test') == 1)
-		{
-			$num = count($this->_data);
-			$this->_data[$num] = $data;
-			echo '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> ';
-			print_r($data);die;
-			return;
-		}
-		
-		# 验证是否已经存在数据,用标题不靠谱,原站如果改了标题就不好了
-		$info = Dever::load('collect/data-check', array('option_source_link' => $link));
-
-		//print_r($info);die;
-
-		# 直接入库
-		if($info)
-		{
-			$update['set_name'] = $data['add_name'];
-			$update['set_cate_id'] = $data['add_cate_id'];
-			$update['set_content'] = $data['add_content'];
-			$update['set_info'] = $data['add_info'];
-			$update['set_extend'] = $extend;
-			$update['set_cdate'] = time();
-			$update['set_config_id'] = $config['id'];
-			$update['set_source_link'] = $link;
-			$update['set_source_list'] = $site;
-			$update['where_id'] = $info['id'];
-
-			Dever::load('collect/data-update', $update);
-			$id = $info['id'];
-			$desc = '(<a style="color:red">已采集过,直接覆盖</a>)';
-		}
-		else
-		{
-			$id = Dever::load('collect/data-insert', $data);
-			$desc = '(<a style="color:blue">新采集的内容</a>)';
-		}
-
-		if(Dever::input('ajax') == 1)
-		{
-			if($config['type'] == 1)
-			{
-				$txt = '采集成功';
-			}
-			else
-			{
-				$txt = '采集成功';
-			}
-			$result['txt'] = $this->_ajax('内容[<a href="'.$link.'" target="_blank">' . $data['add_name'] . '</a>]' . $txt . $desc);
-			$result['url'] = $link;
-
-			echo json_encode($result);die;
-		}
-		
-		return true;
-	}
-	
-	/**
-	 * 上传操作 直接参考的ueditor插件
-	 */
-	private function _upload($file, $id)
-	{
-		global $zbp;
-		//http开头验证
-		if(strpos($file, "http") !== 0)
-		{
-			return $file;
-		}
-		
-		//获取请求头并检测死链
-		$heads = get_headers($file);
-		if(!(stristr($heads[0], "200") && stristr($heads[0], "OK")))
-		{
-			return $file;
-		}
-		
-		
-		//格式验证(扩展名验证和Content-Type验证)
-		$fileType = strtolower(strrchr($file, '.'));
-		if(stristr($heads['Content-Type'], "image"))
-		{
-			return $file;
-		}
-
-		//打开输出缓冲区并获取远程图片
-		ob_start();
-		$context = stream_context_create
-		(
-			array('http' => array
-			(
-				'follow_location' => false // don't follow redirects
-			))
-		);
-		readfile($file, false, $context);
-		$img = ob_get_contents();
-		ob_end_clean();
-		preg_match("/[\/]([^\/]*)[\.]?[^\.\/]*$/", $file, $m);
-		
-		$filename = $m ? $m[1] : "";
-		$filesize = strlen($img);
-		$filetype = strtolower(strrchr($filename, '.'));
-
-		# 这里要修改upload类 算了
-		$root = Dever::path($zbp->usersdir . 'upload/collect/');
-		$id = ceil($id/1000);
-		$filepath = Dever::path($root . $id . '/');
-		$filepath = 'upload/collect/' . $id . '/';
-		$filename = md5($file) . $filetype;
-
-		$upload = new Upload;
-		$upload->Dir = $filepath;
-		$upload->Name = $filename;
-		# 这个暂时没用啊
-		//$upload->SourceName = $filepath . $filename;
-		$upload->MimeType = $heads['Content-Type'];
-		$upload->Size = $fileSize;
-		$upload->AuthorID = $zbp->user->ID;
-		
-		if(!$upload->SaveBase64File(base64_encode($img)))
-		{
-			return $file;
-		}
-		
-		$upload->Save();
-		$file = $upload->Url;
-
-		return $file;
-	}
-}