|
@@ -6,16 +6,20 @@ class Parse
|
|
|
{
|
|
|
private $url = '';
|
|
|
private $host = '';
|
|
|
- private $dom = array();
|
|
|
+ private $log;
|
|
|
+ private $doc = array();
|
|
|
private $data = array();
|
|
|
|
|
|
public function __construct($url, $project, $rule, $col)
|
|
|
{
|
|
|
- $this->url($url);
|
|
|
- $dom = $this->dom($rule);
|
|
|
- foreach ($dom as $k => $v) {
|
|
|
- $this->handle(pq($v), $k, $col, $project);
|
|
|
+ $doc = Doc::getInstance($url, $rule);
|
|
|
+ $doc->log(new Log($project));
|
|
|
+ $data = $doc->get();
|
|
|
+ foreach ($data as $k => $v) {
|
|
|
+ print_r($data);die;
|
|
|
+ $this->data[$k] = $this->load($doc, $v, $col);
|
|
|
}
|
|
|
+ $doc->saveLog();
|
|
|
}
|
|
|
|
|
|
public function get()
|
|
@@ -23,34 +27,17 @@ class Parse
|
|
|
return $this->data;
|
|
|
}
|
|
|
|
|
|
- private function url($url)
|
|
|
+ private function load($doc, $data, $col)
|
|
|
{
|
|
|
- $this->url = $url;
|
|
|
- $value = parse_url($this->url);
|
|
|
- $this->host = $value['scheme'] . '://' . $value['host'];
|
|
|
- }
|
|
|
-
|
|
|
- private function dom($rule, $url = '')
|
|
|
- {
|
|
|
- $url = $url ? $url : $this->url;
|
|
|
- if (empty($this->dom[$url])) {
|
|
|
- $dom = new Dom($url, $rule);
|
|
|
- $this->dom[$url] = $dom->get();
|
|
|
- }
|
|
|
-
|
|
|
- return $this->dom[$url];
|
|
|
- }
|
|
|
-
|
|
|
- private function handle($dom, $index, $col, $project)
|
|
|
- {
|
|
|
- foreach ($col as $v) {
|
|
|
+ foreach ($data as $v) {
|
|
|
$callback = false;
|
|
|
if (strpos($v['key'], '.') !== false) {
|
|
|
$temp = explode('.', $v['key']);
|
|
|
$v['key'] = $temp[1];
|
|
|
$callback = $temp[0];
|
|
|
}
|
|
|
- $value = $this->load($dom, $col, $v);
|
|
|
+
|
|
|
+ $value = $doc->rule($data, $col, $v);
|
|
|
if ($value == 'error') {
|
|
|
break;
|
|
|
}
|
|
@@ -58,9 +45,14 @@ class Parse
|
|
|
$value = Dever::{$callback}($value);
|
|
|
}
|
|
|
|
|
|
- $this->data[$index][$v['key']] = $value;
|
|
|
+ $data[$v['key']] = $value;
|
|
|
+ }
|
|
|
+ if (Dever::input('test') == 1) {
|
|
|
+ $doc->outLog();
|
|
|
+ print_r($data);die;
|
|
|
}
|
|
|
- $this->update($this->data[$index], $project);
|
|
|
+ $this->update($data, $project);
|
|
|
+ return $data;
|
|
|
}
|
|
|
|
|
|
private function update($data, $project)
|
|
@@ -85,65 +77,4 @@ class Parse
|
|
|
$id = Dever::db('spider/data')->insert($update);
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- private function load($dom, $col, $config)
|
|
|
- {
|
|
|
- $data = $this->rule($dom, $col, $config['collect_rule'], $config['collect_include'], $config['collect_exclude']);
|
|
|
- if ($config['collect_include'] && strpos($data, $config['collect_include']) === false) {
|
|
|
- return 'error';
|
|
|
- }
|
|
|
- if ($config['collect_exclude'] && strpos($data, $config['collect_exclude']) !== false) {
|
|
|
- return 'error';
|
|
|
- }
|
|
|
- if ($config['collect_filter']) {
|
|
|
- $data = preg_replace('/' . $config['collect_filter'] . '/i', '', $data);
|
|
|
- }
|
|
|
- return $data;
|
|
|
- }
|
|
|
-
|
|
|
- private function rule($dom, $col, $rule, $include, $exclude)
|
|
|
- {
|
|
|
- $result = $dom->html();
|
|
|
- $rule = explode("\n", $rule);
|
|
|
- if (isset($rule[0]) && $rule[0]) {
|
|
|
- if (isset($col[$rule[0]])) {
|
|
|
- $url = $this->getUrl($dom, $col, $col[$rule[0]]);
|
|
|
- $dom = $this->dom('', $url);
|
|
|
- array_shift($rule);
|
|
|
- }
|
|
|
- $result = $this->find($dom, $rule[0], $result);
|
|
|
- }
|
|
|
- if (isset($rule[1]) && $rule[1]) $result = $this->match($rule[1], $result);
|
|
|
- return $result;
|
|
|
- }
|
|
|
-
|
|
|
- private function find($dom, $string, $result)
|
|
|
- {
|
|
|
- $string = str_replace(array('$', ').'), array('$dom->find', ')->'), $string);
|
|
|
- $cmd = '$result = ' . $string . ';';
|
|
|
- eval($cmd);
|
|
|
- return $result;
|
|
|
- }
|
|
|
-
|
|
|
- private function getUrl($dom, $col, $config)
|
|
|
- {
|
|
|
- $url = $this->load($dom, $col, $config);
|
|
|
- if (strpos($url, 'http') === false) {
|
|
|
- if ($url[0] == '/') {
|
|
|
- $url = $this->host . $url;
|
|
|
- } else {
|
|
|
- $url = $this->url . $url;
|
|
|
- }
|
|
|
- }
|
|
|
- return $url;
|
|
|
- }
|
|
|
-
|
|
|
- private function match($pattern, $string)
|
|
|
- {
|
|
|
- $temp = explode('||', $pattern);
|
|
|
- $index = isset($temp[1]) ? $temp[1] : 1;
|
|
|
- preg_match_all('/' . $temp[0] . '/i', $string, $match);
|
|
|
- $result = $match[$index][0];
|
|
|
- return $result;
|
|
|
- }
|
|
|
}
|