url($url); $dom = $this->dom($rule); foreach ($dom as $k => $v) { $this->handle(pq($v), $k, $col, $project); } }

    /**
     * Return the collected data.
     *
     * @return mixed The $data property; rows written by handle() are indexed
     *               first by item index and then by column key.
     */
    public function get()
    {
        return $this->data;
    }

    /**
     * Remember the page URL and derive the origin used for root-relative links.
     *
     * @param string $url URL of the page being collected.
     */
    private function url($url)
    {
        $this->url = $url;
        $value = parse_url($this->url);
        $this->host = $value['scheme'] . '://' . $value['host'];

        // parse_url() reports the port separately from the host. Without it,
        // root-relative links on a site served from a non-default port would
        // be resolved against the wrong origin in getUrl().
        if (isset($value['port'])) {
            $this->host .= ':' . $value['port'];
        }
    }

    /**
     * Build (or fetch from the per-instance cache) the Dom result for a URL.
     *
     * NOTE(review): the cache is keyed by URL only, so a second call for the
     * same URL with a different $rule returns the first call's result.
     *
     * @param string $rule Rule handed to the Dom constructor.
     * @param string $url  URL to load; falls back to the current page URL when empty.
     *
     * @return mixed Whatever Dom::get() returned for that URL.
     */
    private function dom($rule, $url = '')
    {
        $url = $url ? $url : $this->url;

        if (empty($this->dom[$url])) {
            $dom = new Dom($url, $rule);
            $this->dom[$url] = $dom->get();
        }

        return $this->dom[$url];
    }

    /**
     * Extract every configured column from one item node and persist the row.
     *
     * A column key of the form "name.key" stores the value under "key" after
     * passing it through the static method Dever::name().
     *
     * @param mixed $dom     Item node; must offer html() and whatever the rules call.
     * @param mixed $index   Index of the item inside the collected data.
     * @param array $col     Column configurations.
     * @param mixed $project Project id stored alongside the row.
     */
    private function handle($dom, $index, $col, $project)
    {
        foreach ($col as $v) {
            $callback = false;
            if (strpos($v['key'], '.') !== false) {
                $temp = explode('.', $v['key']);
                $v['key'] = $temp[1];
                $callback = $temp[0];
            }

            $value = $this->load($dom, $col, $v);

            // Strict comparison: load() signals rejection with the exact string
            // 'error'. With "==" an integer 0 produced by a rule expression
            // compares equal to 'error' on PHP < 8 and would abort the row.
            if ($value === 'error') {
                break;
            }

            if ($callback) {
                $value = Dever::{$callback}($value);
            }

            $this->data[$index][$v['key']] = $value;
        }

        // NOTE(review): this also runs after an aborted row, so partially
        // filled rows (or nothing at all, when the first column is rejected)
        // are still handed to update(). Confirm whether rejected rows should
        // be skipped instead.
        $this->update($this->data[$index], $project);
    }

    /**
     * Insert the row for a project, or update the existing identical row.
     *
     * @param mixed $data    Row data; stored JSON-encoded.
     * @param mixed $project Project id.
     */
    private function update($data, $project)
    {
        $param = array();
        $param['option_project_id'] = $project;
        $param['option_value'] = json_encode($data);

        $info = Dever::db('spider/data')->one($param);

        // The lookup keys are reused as write keys: "option_" becomes "set_"
        // for an update and "add_" for an insert.
        $prefix = $info ? 'set_' : 'add_';
        $update = array();
        foreach ($param as $i => $j) {
            $update[str_replace('option_', $prefix, $i)] = $j;
        }

        if ($info) {
            $update['where_id'] = $info['id'];
            Dever::db('spider/data')->update($update);
        } else {
            Dever::db('spider/data')->insert($update);
        }
    }

    /**
     * Extract one column value and apply its include / exclude / filter settings.
     *
     * @param mixed $dom    Item node.
     * @param array $col    Column configurations (needed for cross-column rules).
     * @param array $config Configuration of the column being read; the keys
     *                      collect_rule, collect_include, collect_exclude and
     *                      collect_filter are read.
     *
     * @return mixed The extracted value, or the string 'error' when the value
     *               lacks the required text or contains the excluded text.
     */
    private function load($dom, $col, $config)
    {
        $data = $this->rule($dom, $col, $config['collect_rule'], $config['collect_include'], $config['collect_exclude']);

        if ($config['collect_include'] && strpos($data, $config['collect_include']) === false) {
            return 'error';
        }

        if ($config['collect_exclude'] && strpos($data, $config['collect_exclude']) !== false) {
            return 'error';
        }

        if ($config['collect_filter']) {
            // collect_filter is used as a case-insensitive regular expression
            // body; every match is removed from the value.
            $data = preg_replace('/' . $config['collect_filter'] . '/i', '', $data);
        }

        return $data;
    }

    /**
     * Evaluate a newline-separated collect rule against a node.
     *
     * Line layout: an optional first line naming an entry of $col (the value
     * of that column is turned into a URL and the remaining lines run against
     * the document behind it), then a selector expression for find(), then a
     * pattern for match().
     *
     * @param mixed  $dom     Item node; must offer html().
     * @param array  $col     Column configurations.
     * @param string $rule    Rule text, one part per line.
     * @param mixed  $include Unused here; kept for interface compatibility.
     * @param mixed  $exclude Unused here; kept for interface compatibility.
     *
     * @return mixed The node's HTML when no rule part applies, otherwise the
     *               result of the selector and/or pattern.
     */
    private function rule($dom, $col, $rule, $include, $exclude)
    {
        $result = $dom->html();

        $rule = explode("\n", $rule);
        // Rules separated by CRLF would otherwise keep a trailing "\r" on each
        // line, which breaks the $col lookup and the match() pattern.
        foreach ($rule as $i => $line) {
            $rule[$i] = rtrim($line, "\r");
        }

        if (isset($rule[0]) && $rule[0]) {
            if (isset($col[$rule[0]])) {
                $url = $this->getUrl($dom, $col, $col[$rule[0]]);
                $dom = $this->dom('', $url);
                array_shift($rule);
            }

            // After the shift there may be no selector line left. Calling
            // find() with an empty expression would evaluate "$result = ;",
            // which is a ParseError on PHP 7+.
            if (isset($rule[0]) && $rule[0]) {
                $result = $this->find($dom, $rule[0], $result);
            }
        }

        if (isset($rule[1]) && $rule[1]) {
            $result = $this->match($rule[1], $result);
        }

        return $result;
    }

    /**
     * Run a selector expression against a node.
     *
     * In the expression every "$" is rewritten to "$dom->find" and every ")."
     * to ")->", so a rule such as $('a').attr('href') becomes a method chain
     * on $dom.
     *
     * @param mixed  $dom    Node the expression runs against.
     * @param string $string Selector expression from the rule.
     * @param mixed  $result Value returned when the expression does not assign one.
     *
     * @return mixed Result of the expression.
     */
    private function find($dom, $string, $result)
    {
        $string = str_replace(array('$', ').'), array('$dom->find', ')->'), $string);

        // SECURITY NOTE(review): the rule text is executed as PHP code with
        // access to $this. Only rules written by trusted operators may reach
        // this point; replace eval() with a restricted expression parser if
        // rules can ever come from untrusted input.
        $cmd = '$result = ' . $string . ';';
        eval($cmd);

        return $result;
    }

    /**
     * Read a link from another column and turn it into an absolute URL.
     *
     * NOTE(review): load() may return the sentinel 'error'; it is currently
     * treated like any other relative link. Confirm the intended handling.
     *
     * @param mixed $dom    Item node.
     * @param array $col    Column configurations.
     * @param array $config Configuration of the column holding the link.
     *
     * @return string Absolute URL; the current page URL when the link is empty.
     */
    private function getUrl($dom, $col, $config)
    {
        $url = trim((string) $this->load($dom, $col, $config));

        // An empty link resolves to the current page. Indexing $url[0] on an
        // empty string would raise an "uninitialized string offset" warning.
        if ($url === '') {
            return $this->url;
        }

        // Only a leading scheme makes the link absolute. A plain substring
        // test would also accept relative links such as "/go?to=http://...".
        if (preg_match('#^https?://#i', $url)) {
            return $url;
        }

        if ($url[0] == '/') {
            return $this->host . $url;
        }

        return $this->url . $url;
    }

    /**
     * Return the first occurrence of a capture group of a pattern.
     *
     * The pattern may carry a "||N" suffix selecting capture group N; group 1
     * is used by default. The pattern is applied case-insensitively.
     *
     * @param string $pattern Regular expression body, optionally followed by "||N".
     * @param string $string  Subject to search.
     *
     * @return mixed The first match of the chosen group, or null when the
     *               pattern does not match or the group does not exist.
     */
    private function match($pattern, $string)
    {
        $temp = explode('||', $pattern);
        $index = isset($temp[1]) ? $temp[1] : 1;

        preg_match_all('/' . $temp[0] . '/i', $string, $match);

        // Guard the lookup: with no match (or an invalid pattern) the offset
        // does not exist and reading it would raise a warning.
        return isset($match[$index][0]) ? $match[$index][0] : null;
    }
}