|
@@ -6,6 +6,7 @@ class Doc
|
|
|
{
|
|
|
private $url;
|
|
|
private $host;
|
|
|
+ private $path;
|
|
|
private $rule;
|
|
|
public $type = 'dom';
|
|
|
private $log = false;
|
|
@@ -33,11 +34,20 @@ class Doc
|
|
|
}
|
|
|
}
|
|
|
|
|
|
/**
 * Remember the current page URL and derive the bases used to resolve relative links.
 *
 * On success this sets:
 *  - $this->url  to the URL as given,
 *  - $this->host to "scheme://host" (plus ":port" when the URL carries one),
 *  - $this->path to the directory part of the URL, always ending in "/".
 *
 * A falsy $url is a no-op, so callers can pass "no new URL" and keep the
 * previously stored one.
 *
 * @param string|false $url Absolute URL of the page, or a falsy value to keep the current state.
 * @return void
 */
private function url($url = false)
{
    if (!$url) {
        return;
    }
    $this->url = $url;

    // parse_url() returns false on seriously malformed input and simply omits
    // components that are absent, so a non-absolute URL has no scheme/host.
    // In that case no base can be derived; host/path keep their previous values.
    $value = parse_url($url);
    if (!is_array($value) || !isset($value['scheme'], $value['host'])) {
        return;
    }

    $this->host = $value['scheme'] . '://' . $value['host'];
    if (isset($value['port'])) {
        // Keep a non-default port, otherwise relative links resolve to the wrong origin.
        $this->host .= ':' . $value['port'];
    }

    $this->path = $this->host;
    if (isset($value['path']) && $value['path']) {
        // Drop the last segment (the file name, or '' after a trailing slash)
        // so that only the directory part remains.
        $segments = explode('/', $value['path']);
        array_pop($segments);
        $this->path .= implode('/', $segments);
    }
    $this->path .= '/';
}
|
|
|
|
|
|
public function get($param = array())
|
|
@@ -51,8 +61,8 @@ class Doc
|
|
|
|
|
|
/**
 * Download a page and wrap its HTML in the parser selected by getClass().
 *
 * A truthy $url is first stored through url(); a falsy one leaves the
 * previously stored URL in place. The download always uses $this->url.
 *
 * @param string|false $url   URL to fetch, or a falsy value to reuse the stored URL.
 * @param array        $param Options passed through to download().
 * @return mixed Whatever the parser class's static init() returns for the HTML.
 */
public function doc($url = false, $param = array())
{
    $this->url($url);

    $content = $this->download($this->url, $param);
    $parser = $this->getClass();

    return $parser::init($content);
}
|
|
|
|
|
@@ -127,25 +137,58 @@ class Doc
|
|
|
|
|
|
/**
 * Extract one configured field from a document, optionally following pagination.
 *
 * The field is extracted with $config['collect_rule']. When $config['collect_url']
 * is set, its first line is used as the rule that locates the "next page" link and
 * its optional second line as a substring the next-page URL must contain; the values
 * found on all pages are joined with ",". The final value is passed through collect()
 * together with the include/exclude/filter settings.
 *
 * @param mixed $data   Parsed document handed to the parser class.
 * @param mixed $col    Passed through unchanged to the parser class.
 * @param array $config Field settings; reads name, key, collect_rule, collect_url (optional),
 *                      collect_include, collect_exclude and collect_filter.
 * @return mixed Result of collect() for the extracted value.
 */
public function rule($data, $col, $config)
{
    $name = '字段[' . $config['name'] . '('.$config['key'].')]' . '"';
    $this->addLog($name . '正在按照规则['.$config['collect_rule'].']进行解析');

    $result = $this->getRule($data, $col, $config['collect_rule'], $config['key']);

    if (!empty($config['collect_url'])) {
        // Line 1: rule for the next-page link; line 2 (optional): required URL substring.
        $collect_url = explode("\n", str_replace("\r", '', $config['collect_url']));
        $include = isset($collect_url[1]) ? $collect_url[1] : '';

        // getNext() appends the value of every further page by reference.
        $pages = array($result);
        $this->getNext($pages, $data, $col, $collect_url[0], $include, $config['collect_rule'], $config['key']);
        $result = implode(',', $pages);
    }

    $this->addLog($name . '解析完成');

    return $this->collect($result, $config['collect_include'], $config['collect_exclude'], $config['collect_filter']);
}
|
|
|
|
|
|
/**
 * Follow "next page" links and append the field value of each further page to $result.
 *
 * Starting from $data, the next-page URL is located with $collect_url, the target is
 * loaded through init(), the field is extracted with $collect_rule, and the walk
 * continues from the newly loaded page. It stops when:
 *  - no next-page URL is found,
 *  - the URL does not contain $collect_include (when that is non-empty),
 *  - the URL was already fetched during this call (guards against pagination cycles),
 *  - init() returns a falsy document, or
 *  - the extracted value is falsy.
 *
 * @param array  $result          Collected values; extended in place.
 * @param mixed  $data            Document to start from.
 * @param mixed  $col             Passed through unchanged to the parser class.
 * @param string $collect_url     Rule that locates the next-page link.
 * @param string $collect_include Substring the next-page URL must contain; '' disables the check.
 * @param string $collect_rule    Rule that extracts the field value on each page.
 * @param mixed  $key             Field key passed through to the parser class.
 * @return void
 */
public function getNext(&$result, $data, $col, $collect_url, $collect_include, $collect_rule, $key)
{
    // URLs fetched so far; a page linking back to one of them would otherwise loop forever.
    $visited = array();

    while (true) {
        $url = $this->getUrl($data, $col, $collect_url, $key);
        if (!$url || isset($visited[$url])) {
            return;
        }
        // strpos() rather than strstr(): strstr() yields the falsy string "0"
        // when the match is a trailing "0".
        if ($collect_include && strpos($url, $collect_include) === false) {
            return;
        }
        $visited[$url] = true;

        $data = $this->init($url);
        if (!$data) {
            return;
        }

        $value = $this->getRule($data, $col, $collect_rule, $key);
        if (!$value) {
            return;
        }
        $result[] = $value;
    }
}
|
|
|
+
|
|
|
/**
 * Extract a link with the given rule and turn it into an absolute URL.
 *
 * Resolution order:
 *  - "http://" / "https://" prefix: returned unchanged,
 *  - "//host/..." (protocol-relative): prefixed with the scheme of $this->host,
 *  - leading "/": appended to $this->host,
 *  - contains ".": appended to $this->path (directory of the current page),
 *  - anything else: appended to $this->url.
 *
 * @param mixed  $data         Document to extract the link from.
 * @param mixed  $col          Passed through unchanged to the parser class.
 * @param string $collect_rule Rule that locates the link.
 * @param mixed  $key          Field key passed through to the parser class.
 * @return string Absolute URL, or '' when the rule yields nothing.
 */
public function getUrl($data, $col, $collect_rule, $key)
{
    $url = $this->getRule($data, $col, $collect_rule, $key);
    if (!$url) {
        return '';
    }

    // Only a scheme prefix makes the link absolute; "http" elsewhere in the
    // string (e.g. inside a query parameter) must not count.
    if (preg_match('#^https?://#i', $url)) {
        return $url;
    }

    if (strpos($url, '//') === 0) {
        // Protocol-relative link: borrow the scheme of the current page.
        $scheme = parse_url((string) $this->host, PHP_URL_SCHEME);
        return ($scheme ? $scheme : 'http') . ':' . $url;
    }

    if ($url[0] == '/') {
        return $this->host . $url;
    }

    if (strpos($url, '.') !== false) {
        return $this->path . $url;
    }

    return $this->url . $url;
}
|
|
|
|
|
|
/**
 * Apply an extraction rule through the parser class returned by getClass().
 *
 * @param mixed  $data         Document to extract from.
 * @param mixed  $col          Passed through unchanged to the parser class.
 * @param string $collect_rule Extraction rule.
 * @param mixed  $key          Field key passed through to the parser class.
 * @return mixed Whatever the parser class's static rule() returns.
 */
public function getRule($data, $col, $collect_rule, $key)
{
    $parser = $this->getClass();

    return $parser::rule($this, $data, $col, $collect_rule, $key);
}
|
|
|
|
|
|
public function addLog($string)
|
|
|
{
|