123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149 |
- <?php
- namespace Spider\Lib;
- use Dever;
/**
 * Collects column values from every node of a page and persists each row.
 *
 * On construction the page at $url is loaded through Dom using $rule, each
 * node returned by Dom::get() is wrapped with pq() and every column in $col
 * is extracted from it. Each collected row is written to the 'spider/data'
 * table through Dever::db(); the rows are also available from get().
 *
 * A column configuration is an array with the keys 'key', 'collect_rule',
 * 'collect_include', 'collect_exclude' and 'collect_filter'.
 */
class Parse
{
    /** @var string URL of the page being parsed. */
    private $url = '';

    /** @var string "scheme://host[:port]" of $url; prefix for root-relative links. */
    private $host = '';

    /** @var array Cache of Dom::get() results, keyed by URL. */
    private $dom = array();

    /** @var array Collected rows: node index => (column key => value). */
    private $data = array();

    /**
     * Parses the page and stores every row that yields at least one value.
     *
     * @param string $url     Page to load.
     * @param mixed  $project Project id stored with every row.
     * @param mixed  $rule    Rule handed to Dom to select the nodes.
     * @param array  $col     Column configurations (see class comment).
     */
    public function __construct($url, $project, $rule, $col)
    {
        $this->url($url);
        foreach ($this->dom($rule) as $index => $node) {
            $this->handle(pq($node), $index, $col, $project);
        }
    }

    /**
     * Returns the rows collected by the constructor.
     *
     * @return array node index => (column key => value)
     */
    public function get()
    {
        return $this->data;
    }

    /**
     * Remembers the page URL and derives the host prefix from it.
     *
     * The port is kept so that root-relative links on pages served from a
     * non-default port resolve to the same origin. When the URL has no scheme
     * or host the prefix is left empty instead of becoming a bare "://".
     *
     * @param string $url
     */
    private function url($url)
    {
        $this->url = $url;
        $this->host = '';

        $parts = parse_url($this->url);
        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
            return;
        }

        $this->host = $parts['scheme'] . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $this->host .= ':' . $parts['port'];
        }
    }

    /**
     * Returns the Dom::get() result for a URL, loading it on first use.
     *
     * @param mixed  $rule Rule passed to Dom.
     * @param string $url  URL to load; defaults to the page URL.
     * @return mixed Whatever Dom::get() returned for that URL.
     */
    private function dom($rule, $url = '')
    {
        $url = $url ? $url : $this->url;
        // empty() means an empty result is not cached and is loaded again.
        if (empty($this->dom[$url])) {
            $dom = new Dom($url, $rule);
            $this->dom[$url] = $dom->get();
        }

        return $this->dom[$url];
    }

    /**
     * Extracts every column from one node and persists the resulting row.
     *
     * A column key of the form "name.key" stores the value under "key" after
     * passing it through the static method Dever::name().
     *
     * Extraction stops at the first column whose include/exclude check fails;
     * the columns collected before it are still persisted. If nothing was
     * collected at all, nothing is written.
     *
     * @param mixed $dom     Node wrapped by pq().
     * @param mixed $index   Index of the node; becomes the row key.
     * @param array $col     Column configurations.
     * @param mixed $project Project id stored with the row.
     */
    private function handle($dom, $index, $col, $project)
    {
        foreach ($col as $v) {
            $callback = false;
            if (strpos($v['key'], '.') !== false) {
                $temp = explode('.', $v['key']);
                $v['key'] = $temp[1];
                $callback = $temp[0];
            }

            $value = $this->load($dom, $col, $v);
            // Strict comparison: a loose == would also treat boolean true
            // (and 0 before PHP 8) as the 'error' marker.
            if ($value === 'error') {
                break;
            }

            if ($callback) {
                $value = Dever::{$callback}($value);
            }
            $this->data[$index][$v['key']] = $value;
        }

        // Without this guard a row whose first column failed would be read
        // from an undefined index and stored as the JSON literal "null".
        if (isset($this->data[$index])) {
            $this->update($this->data[$index], $project);
        }
    }

    /**
     * Writes one row to the 'spider/data' table.
     *
     * Looks for a record with the same project id and the same JSON value;
     * when one exists it is updated by id, otherwise a new record is inserted.
     *
     * NOTE(review): json_encode() returns false for data it cannot encode
     * (for example invalid UTF-8); that value is passed on unchanged.
     *
     * @param array $data    Column key => value.
     * @param mixed $project Project id.
     */
    private function update($data, $project)
    {
        $param = array(
            'option_project_id' => $project,
            'option_value' => json_encode($data),
        );

        $info = Dever::db('spider/data')->one($param);
        if ($info) {
            $update = $this->prefix($param, 'set_');
            $update['where_id'] = $info['id'];
            Dever::db('spider/data')->update($update);
        } else {
            Dever::db('spider/data')->insert($this->prefix($param, 'add_'));
        }
    }

    /**
     * Returns a copy of $param with 'option_' in every key replaced by $prefix.
     *
     * @param array  $param
     * @param string $prefix
     * @return array
     */
    private function prefix(array $param, $prefix)
    {
        $result = array();
        foreach ($param as $key => $value) {
            $result[str_replace('option_', $prefix, $key)] = $value;
        }

        return $result;
    }

    /**
     * Extracts one column value and applies its include/exclude/filter settings.
     *
     * @param mixed $dom    Node to read from.
     * @param array $col    All column configurations.
     * @param array $config Configuration of the column to extract.
     * @return mixed The extracted value, or the string 'error' when the value
     *               lacks 'collect_include' or contains 'collect_exclude'.
     */
    private function load($dom, $col, $config)
    {
        $rule = isset($config['collect_rule']) ? $config['collect_rule'] : '';
        $include = isset($config['collect_include']) ? $config['collect_include'] : '';
        $exclude = isset($config['collect_exclude']) ? $config['collect_exclude'] : '';
        $filter = isset($config['collect_filter']) ? $config['collect_filter'] : '';

        $data = $this->rule($dom, $col, $rule, $include, $exclude);
        // match() yields null when nothing matched; search an empty string
        // instead of handing null to the string functions.
        $text = ($data === null) ? '' : $data;

        if ($include && strpos($text, $include) === false) {
            return 'error';
        }
        if ($exclude && strpos($text, $exclude) !== false) {
            return 'error';
        }
        if ($filter) {
            $data = preg_replace($this->regex($filter), '', $text);
        }

        return $data;
    }

    /**
     * Evaluates a collect rule against a node.
     *
     * The rule text has up to two lines. The first is a selector expression
     * for find(); the second is a pattern for match(), applied to the result
     * of the first. With an empty first line the node's html() is used as is.
     *
     * When the first line is a key of $col, that column is extracted as a URL,
     * the page behind it is loaded and the following lines are applied to it.
     *
     * @param mixed  $dom     Node to read from.
     * @param array  $col     All column configurations.
     * @param string $rule    Rule text, lines separated by "\n" or "\r\n".
     * @param mixed  $include Unused; kept for the existing call signature.
     * @param mixed  $exclude Unused; kept for the existing call signature.
     * @return mixed
     */
    private function rule($dom, $col, $rule, $include, $exclude)
    {
        $result = $dom->html();

        $rule = explode("\n", $rule);
        foreach ($rule as $i => $line) {
            // Rule text saved with CRLF line endings would otherwise leave a
            // "\r" on the column name and on the pattern.
            $rule[$i] = rtrim($line, "\r");
        }

        if (isset($rule[0]) && $rule[0]) {
            if (isset($col[$rule[0]])) {
                $url = $this->getUrl($dom, $col, $col[$rule[0]]);
                $dom = $this->dom('', $url);
                array_shift($rule);
            }
            // After the shift there may be no selector left; evaluating an
            // empty one would be a parse error inside find().
            if (isset($rule[0]) && $rule[0] !== '') {
                $result = $this->find($dom, $rule[0], $result);
            }
        }

        if (isset($rule[1]) && $rule[1]) {
            $result = $this->match($rule[1], $result);
        }

        return $result;
    }

    /**
     * Evaluates a selector expression against a node.
     *
     * Every '$' in the expression becomes '$dom->find' and every ').' becomes
     * ')->', so "$('.title').text()" runs as "$dom->find('.title')->text()".
     *
     * SECURITY(review): the expression is executed with eval(). Rule text must
     * only ever come from trusted configuration, never from scraped content
     * or end-user input.
     *
     * @param mixed  $dom    Object the expression is evaluated against.
     * @param string $string Selector expression.
     * @param mixed  $result Value before evaluation; replaced by the expression's value.
     * @return mixed
     */
    private function find($dom, $string, $result)
    {
        $string = str_replace(array('$', ').'), array('$dom->find', ')->'), $string);
        $cmd = '$result = ' . $string . ';';
        eval($cmd);

        return $result;
    }

    /**
     * Extracts a link through a column configuration and makes it absolute.
     *
     * - "http://" and "https://" links are returned unchanged;
     * - "//host/path" links get the scheme of the page URL;
     * - "/path" links are prefixed with the page's scheme, host and port;
     * - anything else is appended to the page URL.
     *
     * NOTE(review): the last case is plain concatenation, so it is only
     * correct when the page URL ends in '/'.
     *
     * @param mixed $dom    Node to read from.
     * @param array $col    All column configurations.
     * @param array $config Configuration of the column holding the link.
     * @return string
     */
    private function getUrl($dom, $col, $config)
    {
        $url = (string) $this->load($dom, $col, $config);

        // Test the prefix only: "http" may also occur inside a relative
        // link, e.g. in a query string.
        if (preg_match('#^https?://#i', $url)) {
            return $url;
        }

        if (strncmp($url, '//', 2) === 0) {
            $scheme = parse_url($this->url, PHP_URL_SCHEME);
            return ($scheme ? $scheme : 'http') . ':' . $url;
        }

        if ($url !== '' && $url[0] === '/') {
            return $this->host . $url;
        }

        return $this->url . $url;
    }

    /**
     * Returns the first capture of a pattern in a string.
     *
     * The pattern may end in "||N" to select capture group N instead of
     * group 1. Matching is case-insensitive.
     *
     * @param string $pattern Regular expression body, optionally followed by "||N".
     * @param string $string  Text to search.
     * @return string|null The first capture, or null when nothing matched.
     */
    private function match($pattern, $string)
    {
        $temp = explode('||', $pattern);
        $index = isset($temp[1]) ? $temp[1] : 1;

        if (!preg_match_all($this->regex($temp[0]), $string, $match)) {
            return null;
        }

        return isset($match[$index][0]) ? $match[$index][0] : null;
    }

    /**
     * Wraps a pattern body in '/' delimiters with the 'i' modifier.
     *
     * Slashes that are not already escaped are escaped, so a pattern such as
     * 'href="/(\d+)"' no longer ends the expression early. Existing escape
     * sequences, including '\/', are left untouched.
     *
     * @param string $pattern Regular expression body without delimiters.
     * @return string
     */
    private function regex($pattern)
    {
        $escaped = preg_replace_callback('~\\\\.|/~s', function ($m) {
            return $m[0] === '/' ? '\\/' : $m[0];
        }, $pattern);

        return '/' . $escaped . '/i';
    }
}
|