dever 6 years ago
parent
commit
7d0e3a106c
8 changed files with 41 additions and 16 deletions
  1. 1 1
      database/col.php
  2. 2 2
      database/project.php
  3. 4 0
      lib/Api.php
  4. 9 5
      lib/Doc.php
  5. 1 1
      lib/Log.php
  6. 13 3
      lib/Parse.php
  7. 6 2
      lib/doc/Core.php
  8. 5 2
      lib/doc/Dom.php

+ 1 - 1
database/col.php

@@ -17,7 +17,7 @@ return array
 	'order' => 6,
 	'desc' => '字段唯一标识符:如果用逗号隔开,则前面的是dever公共函数,如maketime.date,则会调用Dever::maketime()来处理date的数据
 	<br />
-	<a href="'.Dever::url('spider/lib/api.run?id=' . Dever::input('search_option_project_id', -1) . '&test=1').'" target="_blank">点此进行测试</a>:请按住ctrl打开新页面并查看源代码',
+	<a href="'.Dever::url('spider/lib/api.test?id=' . Dever::input('search_option_project_id', -1)).'" target="_blank">点此进行测试</a>:请按住ctrl打开新页面并查看源代码',
 	# 数据结构
 	'struct' => array
 	(

+ 2 - 2
database/project.php

@@ -143,7 +143,7 @@ return array
 		'site'		=> array
 		(
 			'type' 		=> 'text-255',
-			'name' 		=> '采集网址-多个网址换行隔开',
+			'name' 		=> '采集网址-多个网址换行隔开,如有分页,请写成这样pg{page=1}',
 			'default' 	=> '',
 			'desc' 		=> '采集网址',
 			'match' 	=> 'option',
@@ -155,7 +155,7 @@ return array
 		'collect_rule'		=> array
 		(
 			'type' 		=> 'varchar-500',
-			'name' 		=> '采集规则-支持dom解析、正则、json格式',
+			'name' 		=> '采集规则-支持dom解析、json格式,如$(".info .title a").each().attr("href")',
 			'default' 	=> '',
 			'desc' 		=> '采集规则',
 			'match' 	=> 'is_string',

+ 4 - 0
lib/Api.php

@@ -46,11 +46,15 @@ class Api
 		$config = $this->queue->pop();
 		if ($config) {
 			# 此处开task
+			$col = $this->col($config['id']);
+			$this->parse($config['url'], $config['id'], $config['collect_rule'], $col);
+			/*
 			Dever::task(function() use($config, $this)
 			{
 				$col = $this->col($config['id']);
 				$this->parse($config['url'], $config['id'], $config['collect_rule'], $col);
 			});
+			*/
 			
 			$state = true;
 		} else {

+ 9 - 5
lib/Doc.php

@@ -44,9 +44,10 @@ class Doc
 		return $doc;
 	}
 
-	private function doc()
+	public function doc($url = false)
 	{
-		$html = $this->download($this->url);
+		$url = $url ? $url : $this->url;
+		$html = $this->download($url);
 		if (strpos($this->rule, '$json') !== false) {
 			$this->type = 'json';
 		} else {
@@ -58,9 +59,9 @@ class Doc
 
 	private function download($url)
 	{
-		$this->addLog($this->url . '下载中...');
+		$this->addLog($url . '下载中...');
 		$download = new Download($url);
-		$this->addLog($this->url . '下载完成');
+		$this->addLog($url . '下载完成');
 		return $download->get();
 	}
 
@@ -73,7 +74,10 @@ class Doc
 			return 'error';
 		}
 		if ($filter) {
-			$data = preg_replace('/' . $filter . '/i', '', $data);
+			$filter = explode("\n", str_replace("\r", '', $filter));
+			foreach ($filter as $k => $v) {
+				$data = preg_replace('/' . $v . '/i', '', $data);
+			}
 		}
 		
 		return $data;

+ 1 - 1
lib/Log.php

@@ -11,7 +11,7 @@ class Log
 	public function __construct($key)
 	{
 		$key = date('Y-m-d') . '_' . $key;
-		$this->file = Dever::path(Dever::data() . 'log/', 'spider/' . $key);
+		$this->file = Dever::path(Dever::data() . 'logs/', 'spider/' . $key);
 	}
 
 	public function get()

+ 13 - 3
lib/Parse.php

@@ -16,6 +16,9 @@ class Parse
 		$doc->log(new Log($project));
 		$data = $doc->get();
 		if ($data) {
+			if (!is_array($data) && !is_object($data)) {
+				$data = Dever::json_decode($data);
+			}
 			foreach ($data as $k => $v) {
 				$this->data[$k] = $this->load($doc, $v, $col, $project);
 			}
@@ -30,7 +33,7 @@ class Parse
 
 	private function load($doc, $data, $col, $project)
 	{
-		$result = array();
+		$result = $table = array();
 		foreach ($col as $v) {
 			$callback = false;
 			if (strpos($v['key'], '.') !== false) {
@@ -43,14 +46,21 @@ class Parse
 				break;
 			}
 			if ($callback) {
-				$value = Dever::{$callback}($value);
+				if (function_exists($callback)) {
+					$value = $callback($value);
+				} else {
+					$value = Dever::{$callback}($value);
+				}
 			}
 
 			$result[$v['key']] = $value;
+			if (Dever::input('test') == 1) {
+				$table[$v['name']] = $value;
+			}
 		}
 		if (Dever::input('test') == 1) {
 			$doc->outLog();
-			print_r(Dever::table($result));die;
+			print_r(Dever::table($table));die;
 		}
 		$this->update($result, $project);
 		return $result;

+ 6 - 2
lib/doc/Core.php

@@ -24,8 +24,12 @@ class Core
 		if (isset($rule[0]) && $rule[0]) {
 			if (isset($col[$rule[0]]) && $rule[0] != $key) {
 				# 此处开task
-				$result = Doc::getInstance($doc->getUrl($data, $col, $col[$rule[0]]), $rule[1])->get();
-				array_shift($rule);
+				if (isset($rule[1]) && $rule[1]) {
+					$result = Doc::getInstance($doc->getUrl($data, $col, $col[$rule[0]]), $rule[1])->get();
+					array_shift($rule);
+				} else {
+					$result = self::rule($doc, $data, $col, $col[$rule[0]]['collect_rule'], $key);
+				}
 			} elseif (isset($data[$rule[0]])) {
 				$result = $data[$rule[0]];
 			} else {

+ 5 - 2
lib/doc/Dom.php

@@ -45,8 +45,11 @@ class Dom
 
 	public static function rule($doc, $dom, $col, $rule, $key)
 	{
-		$dom = pq($dom);
-		$result = $dom->html();
+		if (is_string($dom) && strstr($dom, 'http')) {
+			$dom = $doc->doc($dom);
+		} else {
+			$dom = pq($dom);
+		}
 
 		$result = Core::rule($doc, $dom, $col, $rule, $key);
 		return $result;