Parse.php 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. <?php
  2. namespace Spider\Lib;
  3. use Dever;
  4. class Parse
  5. {
  6. private $url = '';
  7. private $host = '';
  8. private $dom = array();
  9. private $data = array();
  10. public function __construct($url, $project, $rule, $col)
  11. {
  12. $this->url($url);
  13. $dom = $this->dom($rule);
  14. foreach ($dom as $k => $v) {
  15. $this->handle(pq($v), $k, $col, $project);
  16. }
  17. }
  18. public function get()
  19. {
  20. return $this->data;
  21. }
  22. private function url($url)
  23. {
  24. $this->url = $url;
  25. $value = parse_url($this->url);
  26. $this->host = $value['scheme'] . '://' . $value['host'];
  27. }
  28. private function dom($rule, $url = '')
  29. {
  30. $url = $url ? $url : $this->url;
  31. if (empty($this->dom[$url])) {
  32. $dom = new Dom($url, $rule);
  33. $this->dom[$url] = $dom->get();
  34. }
  35. return $this->dom[$url];
  36. }
  37. private function handle($dom, $index, $col, $project)
  38. {
  39. foreach ($col as $v) {
  40. $callback = false;
  41. if (strpos($v['key'], '.') !== false) {
  42. $temp = explode('.', $v['key']);
  43. $v['key'] = $temp[1];
  44. $callback = $temp[0];
  45. }
  46. $value = $this->load($dom, $col, $v);
  47. if ($value == 'error') {
  48. break;
  49. }
  50. if ($callback) {
  51. $value = Dever::{$callback}($value);
  52. }
  53. $this->data[$index][$v['key']] = $value;
  54. }
  55. $this->update($this->data[$index], $project);
  56. }
  57. private function update($data, $project)
  58. {
  59. $param['option_project_id'] = $project;
  60. $param['option_value'] = json_encode($data);
  61. $info = Dever::db('spider/data')->one($param);
  62. if ($info) {
  63. $update = array();
  64. foreach ($param as $i => $j) {
  65. $i = str_replace('option_', 'set_', $i);
  66. $update[$i] = $j;
  67. }
  68. $id = $update['where_id'] = $info['id'];
  69. Dever::db('spider/data')->update($update);
  70. } else {
  71. $update = array();
  72. foreach ($param as $i => $j) {
  73. $i = str_replace('option_', 'add_', $i);
  74. $update[$i] = $j;
  75. }
  76. $id = Dever::db('spider/data')->insert($update);
  77. }
  78. }
  79. private function load($dom, $col, $config)
  80. {
  81. $data = $this->rule($dom, $col, $config['collect_rule'], $config['collect_include'], $config['collect_exclude']);
  82. if ($config['collect_include'] && strpos($data, $config['collect_include']) === false) {
  83. return 'error';
  84. }
  85. if ($config['collect_exclude'] && strpos($data, $config['collect_exclude']) !== false) {
  86. return 'error';
  87. }
  88. if ($config['collect_filter']) {
  89. $data = preg_replace('/' . $config['collect_filter'] . '/i', '', $data);
  90. }
  91. return $data;
  92. }
  93. private function rule($dom, $col, $rule, $include, $exclude)
  94. {
  95. $result = $dom->html();
  96. $rule = explode("\n", $rule);
  97. if (isset($rule[0]) && $rule[0]) {
  98. if (isset($col[$rule[0]])) {
  99. $url = $this->getUrl($dom, $col, $col[$rule[0]]);
  100. $dom = $this->dom('', $url);
  101. array_shift($rule);
  102. }
  103. $result = $this->find($dom, $rule[0], $result);
  104. }
  105. if (isset($rule[1]) && $rule[1]) $result = $this->match($rule[1], $result);
  106. return $result;
  107. }
  108. private function find($dom, $string, $result)
  109. {
  110. $string = str_replace(array('$', ').'), array('$dom->find', ')->'), $string);
  111. $cmd = '$result = ' . $string . ';';
  112. eval($cmd);
  113. return $result;
  114. }
  115. private function getUrl($dom, $col, $config)
  116. {
  117. $url = $this->load($dom, $col, $config);
  118. if (strpos($url, 'http') === false) {
  119. if ($url[0] == '/') {
  120. $url = $this->host . $url;
  121. } else {
  122. $url = $this->url . $url;
  123. }
  124. }
  125. return $url;
  126. }
  127. private function match($pattern, $string)
  128. {
  129. $temp = explode('||', $pattern);
  130. $index = isset($temp[1]) ? $temp[1] : 1;
  131. preg_match_all('/' . $temp[0] . '/i', $string, $match);
  132. $result = $match[$index][0];
  133. return $result;
  134. }
  135. }