Doc.php 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. <?php
  2. namespace Spider\Lib;
  3. use Dever;
  /**
   * Downloads one document and extracts values from it with collection rules.
   *
   * The actual parsing is delegated to a handler class in the
   * `Spider\Lib\Doc` namespace whose name is derived from {@see Doc::$type}
   * ("Dom" or "Json"). Those handlers, as well as `Download` and `Log`, are
   * defined outside this file; only the calls made to them are described here.
   */
  class Doc
  {
      /** @var string URL of the document this instance works on. */
      private $url;

      /** @var string "scheme://host[:port]" of $url, or '' when $url cannot be parsed. */
      private $host;

      /** @var string Rule applied to the whole document by get(). */
      private $rule;

      /** @var string Handler selector: 'dom' or 'json' (re-evaluated by doc()). */
      public $type = 'dom';

      /** @var Log|false Attached logger, or false when logging is disabled. */
      private $log = false;

      /** @var array<string, Doc> Instances cached by getInstance(), keyed by URL + md5(rule). */
      static protected $instance = [];

      /**
       * Returns the cached instance for a URL/rule pair, creating it on first use.
       *
       * @param string $url
       * @param string $rule
       * @return Doc
       */
      static public function getInstance($url, $rule = '')
      {
          $key = $url . md5($rule);
          if (empty(self::$instance[$key])) {
              self::$instance[$key] = new self($url, $rule);
          }
          return self::$instance[$key];
      }

      /**
       * @param string $url  URL of the document to fetch.
       * @param string $rule Optional rule applied to the document by get().
       */
      public function __construct($url, $rule = '')
      {
          $this->url($url);
          $this->rule = $rule;
      }

      /**
       * Stores the URL and derives the origin used to resolve root-relative links.
       *
       * The port is kept so that links found on e.g. "http://host:8080/..." still
       * point at the same server. When the URL has no scheme or host the origin is
       * left empty instead of being built from missing array keys.
       *
       * @param string $url
       * @return void
       */
      private function url($url)
      {
          $this->url = $url;
          $this->host = '';

          $parts = parse_url($this->url);
          if (is_array($parts) && isset($parts['scheme'], $parts['host'])) {
              $this->host = $parts['scheme'] . '://' . $parts['host'];
              if (isset($parts['port'])) {
                  $this->host .= ':' . $parts['port'];
              }
          }
      }

      /**
       * Downloads the document and, when a rule was given, applies it.
       *
       * @return mixed Whatever the handler's init() (no rule) or find() (rule) returns.
       */
      public function get()
      {
          $doc = $this->doc();
          if ($this->rule) {
              $doc = $this->find($doc, $this->rule);
          }
          return $doc;
      }

      /**
       * Downloads the page, selects the handler type and lets the handler
       * initialise itself from the downloaded content.
       *
       * A rule containing the literal text "$json" selects the JSON handler;
       * everything else uses the DOM handler.
       *
       * @return mixed Result of the handler's static init().
       */
      private function doc()
      {
          $html = $this->download($this->url);
          $this->type = strpos($this->rule, '$json') !== false ? 'json' : 'dom';

          $class = $this->getClass();
          return $class::init($html);
      }

      /**
       * Fetches a URL through the project's Download class.
       *
       * The completion message is logged only after get() has returned, so the
       * log never reports a finished download that has not been read yet.
       *
       * @param string $url
       * @return mixed Whatever Download::get() returns (presumably the body — confirm).
       */
      private function download($url)
      {
          $this->addLog($url . '下载中...');
          $download = new Download($url);
          $content = $download->get();
          $this->addLog($url . '下载完成');
          return $content;
      }

      /**
       * Validates and cleans an extracted value.
       *
       * @param string $data    Extracted value.
       * @param string $include Regex body the value must match (empty = no check).
       * @param string $exclude Regex body the value must not match (empty = no check).
       * @param string $filter  Regex body whose matches are removed (empty = no filter).
       * @return mixed The cleaned value, or the string 'error' when a check fails.
       */
      private function collect($data, $include, $exclude, $filter)
      {
          if ($include && !preg_match($this->pattern($include), $data)) {
              return 'error';
          }
          if ($exclude && preg_match($this->pattern($exclude), $data)) {
              return 'error';
          }
          if ($filter) {
              $data = preg_replace($this->pattern($filter), '', $data);
          }
          return $data;
      }

      /**
       * Wraps a regex body in "/.../i", escaping any bare "/" inside it.
       *
       * Without this a configured expression containing "/" ends the pattern
       * early, which makes the preg_* call fail. Slashes that are already
       * escaped (and every other backslash sequence) are left untouched.
       *
       * @param string $expression Regex body without delimiters.
       * @return string Complete case-insensitive pattern.
       */
      private function pattern($expression)
      {
          $escaped = preg_replace_callback(
              '~\\\\.|/~s',
              function ($match) {
                  return $match[0] === '/' ? '\\/' : $match[0];
              },
              (string) $expression
          );

          return '/' . ($escaped === null ? $expression : $escaped) . '/i';
      }

      /**
       * Builds the fully qualified handler class name for the current type.
       *
       * @return string e.g. "Spider\Lib\Doc\Dom".
       */
      private function getClass()
      {
          return 'Spider\\Lib\\Doc\\' . ucfirst($this->type);
      }

      /**
       * Applies a rule to a document through the current handler.
       *
       * @param mixed  $doc  Document as produced by the handler's init().
       * @param string $rule
       * @return mixed Result of the handler's static find().
       */
      public function find($doc, $rule)
      {
          $class = $this->getClass();
          return $class::find($doc, $rule);
      }

      /**
       * Extracts one field and runs it through the include/exclude/filter checks.
       *
       * @param mixed $data   Data the handler extracts from.
       * @param mixed $col    Passed through to the handler unchanged.
       * @param array $config Requires 'name', 'key' and 'collect_rule'; the optional
       *                      'collect_include', 'collect_exclude' and 'collect_filter'
       *                      entries default to "no check".
       * @return mixed The extracted value, or 'error' when a check rejects it.
       */
      public function rule($data, $col, $config)
      {
          $name = '字段[' . $config['name'] . '(' . $config['key'] . ')]' . '"';
          $this->addLog($name . '正在按照规则[' . $config['collect_rule'] . ']进行解析');

          $class = $this->getClass();
          $data = $class::rule($this, $data, $col, $config['collect_rule'], $config['key']);
          $this->addLog($name . '解析完成');

          return $this->collect(
              $data,
              $config['collect_include'] ?? '',
              $config['collect_exclude'] ?? '',
              $config['collect_filter'] ?? ''
          );
      }

      /**
       * Extracts a link with rule() and turns it into an absolute URL.
       *
       * - the 'error' marker and non-string results are returned untouched;
       * - an empty value resolves to the page URL itself;
       * - values that start with a scheme ("http:", "https:", "mailto:", ...) are
       *   returned as they are — a mere occurrence of "http" later in the value
       *   no longer counts as absolute;
       * - "//host/path" inherits the page's scheme;
       * - "/path" is appended to the page's origin;
       * - anything else is resolved against the page's directory.
       *
       * @param mixed $data
       * @param mixed $col
       * @param array $config See rule().
       * @return mixed Absolute URL, or the untouched rule() result as described above.
       */
      public function getUrl($data, $col, $config)
      {
          $url = $this->rule($data, $col, $config);

          if (!is_string($url) || $url === 'error') {
              return $url;
          }
          if ($url === '') {
              return $this->url;
          }
          if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url)) {
              return $url;
          }
          if (strpos($url, '//') === 0) {
              $scheme = strstr($this->host, '://', true);
              return ($scheme === false || $scheme === '') ? $url : $scheme . ':' . $url;
          }
          if ($url[0] === '/') {
              return $this->host . $url;
          }
          return $this->resolveRelative($url);
      }

      /**
       * Resolves a non-empty reference that has neither scheme nor leading slash.
       *
       * Query-only ("?a=1") and fragment-only ("#top") references keep the page
       * path; everything else replaces the last path segment of the page URL.
       * "./" and "../" segments are not collapsed.
       *
       * @param string $reference
       * @return string
       */
      private function resolveRelative($reference)
      {
          if ($this->host === '') {
              // The page URL could not be parsed; plain concatenation is all we can do.
              return $this->url . $reference;
          }

          $base = $this->url;
          $cut = strpos($base, '#');
          if ($cut !== false) {
              $base = substr($base, 0, $cut);
          }
          if ($reference[0] === '#') {
              return $base . $reference;
          }

          $cut = strpos($base, '?');
          if ($cut !== false) {
              $base = substr($base, 0, $cut);
          }
          if ($reference[0] === '?') {
              return $base . $reference;
          }

          $path = (string) parse_url($base, PHP_URL_PATH);
          $slash = strrpos($path, '/');
          $directory = $slash === false ? '/' : substr($path, 0, $slash + 1);

          return $this->host . $directory . $reference;
      }

      /**
       * Adds a message to the attached logger; does nothing without one.
       *
       * @param string $string
       * @return void
       */
      public function addLog($string)
      {
          if ($this->log) {
              $this->log->add($string);
          }
      }

      /**
       * Calls save() on the attached logger, if any.
       *
       * @return void
       */
      public function saveLog()
      {
          if ($this->log) {
              $this->log->save();
          }
      }

      /**
       * Calls out() on the attached logger, if any.
       *
       * @return void
       */
      public function outLog()
      {
          if ($this->log) {
              $this->log->out();
          }
      }

      /**
       * Attaches the logger used by addLog(), saveLog() and outLog().
       *
       * @param Log $log
       * @return void
       */
      public function log(Log $log)
      {
          $this->log = $log;
      }
  }