dever 6 years ago
parent
commit
ab34f6be87
5 changed files with 664 additions and 0 deletions
  1. 27 0
      content/database/article.php
  2. 181 0
      spider/database/data.php
  3. 186 0
      spider/database/wechat.php
  4. 8 0
      spider/index.php
  5. 262 0
      spider/lib/Get.php

+ 27 - 0
content/database/article.php

@@ -48,6 +48,12 @@ $share = array
 	2 => '不显示',
 );
 
+$wechat = array
+(
+	1 => '否',
+	2 => '是',
+);
+
 # 常用的col
 $col = 'id,cate_id,name,pic_cover,pdate,num_add_view+num_view as num_view,num_up+num_add_up as num_up,num_comment,share_yes,share_title,share_pic,share_content,function,content,author_id';
 
@@ -126,6 +132,27 @@ return array
 			'search'	=> 'select',
 		),
 
+		'wechat'		=> array
+		(
+			'type' 		=> 'int-11',
+			'name' 		=> '是否公众号文章',
+			'default' 	=> '1',
+			'desc' 		=> '是否公众号文章',
+			'match' 	=> 'is_numeric',
+			//'update'	=> 'select',
+			'option'	=> $wechat,
+			'search'	=> 'select',
+		),
+
+		'wechat_data_id'		=> array
+		(
+			'type' 		=> 'int-11',
+			'name' 		=> '微信数据id',
+			'default' 	=> '1',
+			'desc' 		=> '微信数据id',
+			'match' 	=> 'is_numeric',
+		),
+
 		'tag'		=> array
 		(
 			'type' 		=> 'text-255',

+ 181 - 0
spider/database/data.php

@@ -0,0 +1,181 @@
+<?php
+
+# Audit option list comes from the shared "base" configuration
+$audit = Dever::config('base')->audit;
+
+# Copyright flag labels (1 / 2)
+$copyright = array
+(
+    1 => '原创',
+    2 => '非原创',
+);
+
+# Option source for the "wechat_id" select: whatever state() yields for spider/wechat,
+# or an empty array when it yields nothing.
+$wechat = function()
+{
+    $accounts = Dever::db('spider/wechat')->state();
+    return $accounts ? array() + $accounts : array();
+};
+
+return array
+(
+    # Table name
+    'name' => 'data',
+    # Display name shown to users
+    'lang' => '文章列表',
+    'order' => 100,
+    'end' => array
+    (
+        'update' => 'spider/lib/get.up',
+        'update_audit' => 'spider/lib/get.up',
+    ),
+
+    # Sync updates to one or more other tables (currently commented out)
+    /*
+    'syncone' => array
+    (
+        'audit/data' => array
+        (
+            # Field of the other table => field of this table
+            'where' => array('data_id' => 'id', 'type' => 1),
+            # Data to update
+            'update' => array('data_id' => 'id', 'type' => 1, 'pdate' => 'pdate', 'reorder' => 'reorder', 'name' => 'name', 'audit' => 'audit', 'status' => 'status', 'cate_id' => 'cate_id'),
+        )
+    ),
+    */
+    # Data structure
+    'struct' => array
+    (
+        'id'        => array
+        (
+            'type'      => 'int-11',
+            'name'      => 'ID',
+            'default'   => '',
+            'desc'      => '',
+            'match'     => 'is_numeric',
+            //'list'        => true,
+        ),
+
+        'wechat_id'     => array
+        (
+            'type'      => 'int-11',
+            'name'      => '所属公众号',
+            'default'   => '1',
+            'desc'      => '所属公众号',
+            'match'     => 'is_numeric',
+            //'update'    => 'select',
+            'option'    => $wechat,
+            'search'    => 'select',
+            'list'      => true,
+        ),
+        
+        'name'      => array
+        (
+            'type'      => 'varchar-180',
+            'name'      => '标题',
+            'default'   => '',
+            'desc'      => '标题',
+            'match'     => 'is_string',
+            'update'    => 'text',
+            'list'      => true,
+            // Enable preview
+            'preview'   => true,
+        ),
+
+        'copyright'     => array
+        (
+            'type'      => 'int-11',
+            'name'      => '原创',
+            'default'   => '1',
+            'desc'      => '原创',
+            'match'     => 'is_numeric',
+            'update'    => 'select',
+            'option'    => $copyright,
+            'search'    => 'select',
+            'list'      => true,
+        ),
+
+        'cover'      => array
+        (
+            'type'      => 'varchar-150',
+            'name'      => '封面图',
+            'default'   => '',
+            'desc'      => '封面图',
+            'match'     => 'option',
+            'update'    => 'image',
+            'key'       => '1',
+            'place'     => '150',
+        ),
+
+        'state'     => array
+        (
+            'type'      => 'tinyint-1',
+            'name'      => '状态',
+            'default'   => '1',
+            'desc'      => '请选择状态',
+            'match'     => 'is_numeric',
+        ),
+
+        'content'       => array
+        (
+            'type'      => 'text-255',
+            'name'      => '内容',
+            'default'   => '',
+            'desc'      => '请输入内容',
+            'match'     => 'is_string',
+            'update'    => 'editor',
+        ),
+
+        'pdate'     => array
+        (
+            'type'      => 'int-11',
+            'name'      => '发布时间',
+            'match'     => 'is_numeric',
+            'default'   => '',
+            'desc'      => '',
+            'update'    => 'date',
+            'callback'  => 'maketime',
+            'order'     => 'desc',
+            'list'      => 'date("Y-m-d H:i:s", {pdate})',
+        ),
+
+        'audit'     => array
+        (
+            'type'      => 'int-11',
+            'name'      => '审核状态',
+            'default'   => '1',
+            'desc'      => '审核',
+            'match'     => 'is_numeric',
+            'update'  => 'select',
+            'option'    => $audit,
+            'search'    => 'select',
+            'list'      => true,
+            'edit'      => true,
+        ),
+
+        'cdate'     => array
+        (
+            'type'      => 'int-11',
+            'name'      => '录入时间',
+            'match'     => array('is_numeric', time()),
+            'desc'      => '',
+            # Only takes effect on insert
+            'insert'    => true,
+        ),
+    ),
+    
+    # Management features
+    'manage' => array
+    (
+        'insert' => false,
+    ),
+
+    # Request interface definitions
+    'request' => array
+    (
+        
+    ),
+);

+ 186 - 0
spider/database/wechat.php

@@ -0,0 +1,186 @@
+<?php
+
+
+# Option source for the "cate_id" select: whatever state() yields for content/cate,
+# or an empty array when it yields nothing.
+$cate = function()
+{
+    $categories = Dever::db('content/cate')->state();
+    return $categories ? array() + $categories : array();
+};
+
+# Option source for the "author_id" select: whatever state() yields for content/author,
+# or an empty array when it yields nothing.
+$author = function()
+{
+    $authors = Dever::db('content/author')->state();
+    return $authors ? array() + $authors : array();
+};
+
+return array
+(
+    # Table name
+    'name' => 'wechat',
+    # Display name shown to users
+    'lang' => '公众号管理',
+    # Sort order in the admin menu
+    'order' => 10,
+    # Data structure
+    'struct' => array
+    (
+        'id'        => array
+        (
+            'type'      => 'int-11',
+            'name'      => 'ID',
+            'default'   => '',
+            'desc'      => '',
+            'match'     => 'is_numeric',
+            'search'    => 'order',
+            //'list'      => true,
+            'order'     => 'desc',
+        ),
+        
+        'name'      => array
+        (
+            'type'      => 'varchar-100',
+            'name'      => '名称-直接输入公众号名称即可,系统会自动抓取该公众号的前10篇文章,每天抓取一次',
+            'default'   => '',
+            'desc'      => '请输入名称',
+            'match'     => 'is_string',
+            'update'    => 'text',
+            'search'    => 'fulltext',
+            'list'      => true,
+        ),
+
+        'wechat'      => array
+        (
+            'type'      => 'varchar-200',
+            'name'      => '微信号',
+            'default'   => '',
+            'desc'      => '微信号',
+            'match'     => 'is_string',
+            //'update'    => 'text',
+            'search'    => 'fulltext',
+            'list'      => true,
+        ),
+
+        'cate_id'       => array
+        (
+            'type'      => 'int-11',
+            'name'      => '所属栏目',
+            'default'   => '1',
+            'desc'      => '所属栏目',
+            'match'     => 'is_numeric',
+            'update'    => 'select',
+            'option'    => $cate,
+            'search'    => 'select',
+            'list'      => true,
+            'edit'      => true,
+        ),
+
+        'author_id'     => array
+        (
+            'type'      => 'int-11',
+            'name'      => '作者',
+            'default'   => '1',
+            'desc'      => '作者',
+            'match'     => 'is_numeric',
+            'update'    => 'select',
+            'option'    => $author,
+            'search'    => 'select',
+            'list'      => true,
+            'edit'      => true,
+        ),
+
+        'zdate'     => array
+        (
+            'type'      => 'int-11',
+            'name'      => '下次执行时间',
+            'match'     => 'is_numeric',
+            'desc'      => '请选择执行时间',
+            'default'   => '',
+            'update'    => 'date',
+            'callback'  => 'maketime',
+            'list'      => '{zdate} > 0 ? date("Y-m-d H:i:s", {zdate}) : "未知"',
+        ),
+
+        'pdate'     => array
+        (
+            'type'      => 'int-11',
+            'name'      => '抓取时间间隔-每个公众号发布的文章时间不同,请设置抓取时间间隔,单位为小时,尽量保证每次抓取都有新数据,也请不要设置间隔时间太短,抓取太频繁会导致抓取失败',
+            'match'     => 'is_numeric',
+            'default'   => '12',
+            'desc'      => '',
+            'update'    => 'text',
+            'list'      => true,
+        ),
+        
+        'reorder'       => array
+        (
+            'type'      => 'int-11',
+            'name'      => '排序(数值越大越靠前)',
+            'default'   => '1',
+            'desc'      => '请输入排序',
+            'match'     => 'option',
+            //'update'    => 'text',
+            'search'    => 'order',
+            //'list'      => true,
+            'order'     => 'desc',
+            'edit'      => true,
+        ),
+
+        'state'     => array
+        (
+            'type'      => 'tinyint-1',
+            'name'      => '状态',
+            'default'   => '1',
+            'desc'      => '请选择状态',
+            'match'     => 'is_numeric',
+        ),
+        
+        'cdate'     => array
+        (
+            'type'      => 'int-11',
+            'name'      => '录入时间',
+            'match'     => array('is_numeric', time()),
+            'desc'      => '',
+            # Only takes effect on insert
+            'insert'    => true,
+            //'list'      => 'date("Y-m-d H:i:s", {cdate})',
+        ),
+    ),
+
+    # Default values
+    'default' => array
+    (
+
+    ),
+
+    'manage' => array
+    (
+        
+    ),
+
+    # Request interface definitions
+    'request' => array
+    (
+        # get: rows filtered on zdate (with the '<=' operator) and state = 1, newest id first
+        'get' => array
+        (
+            'where' => array
+            (
+                'zdate' => array('yes', '<='),
+                'state' => 1,
+            ),
+            'type' => 'all',
+            'order' => array('id', 'desc'),
+            'col' => '*',
+        ),
+    ),
+);

+ 8 - 0
spider/index.php

@@ -0,0 +1,8 @@
+<?php
+
+# Application constants; presumably consumed by boot.php (meanings inferred from their names, confirm against the framework)
+define('DEVER_APP_NAME', 'spider');
+define('DEVER_APP_LANG', '公众号抓取');
+# Absolute path of this directory, with a trailing separator
+define('DEVER_APP_PATH', __DIR__ . DIRECTORY_SEPARATOR);
+define('DEVER_MANAGE_ORDER', 100);
+define('DEVER_MANAGE_ICON', 'glyphicon glyphicon-tower layui-icon-tree');
+# Hand control to the shared bootstrap one directory up
+include DEVER_APP_PATH . '../boot.php';

+ 262 - 0
spider/lib/Get.php

@@ -0,0 +1,262 @@
+<?php
+
+namespace Spider\Lib;
+
+use Dever;
+
+class Get
+{
+
+    /**
+     * Scheduler entry point: start a crawl for every account returned by the
+     * spider/wechat "get" request and push each account's next run time forward.
+     *
+     * The "get" request is called with the current time as `where_zdate`; its
+     * definition filters on `zdate` with '<=' and on `state = 1`.
+     *
+     * @return string Always 'ok'.
+     */
+    public function wechat_api()
+    {
+        $now = time();
+        $dueAccounts = Dever::db('spider/wechat')->get(array('where_zdate' => $now));
+        if (!$dueAccounts) {
+            return 'ok';
+        }
+
+        foreach ($dueAccounts as $account) {
+            // Hand the actual crawl to Dever::daemon (presumably a background run - confirm).
+            Dever::daemon('lib/get.get?id=' . $account['id'], 'spider');
+
+            // An empty or implausibly small zdate is treated as "never scheduled": count from now.
+            $base = $account['zdate'];
+            if (!$base || $base < 100000) {
+                $base = time();
+            }
+
+            // pdate is the crawl interval in hours.
+            $nextRun = $base + ($account['pdate'] * 3600);
+            Dever::db('spider/wechat')->update(array('where_id' => $account['id'], 'zdate' => $nextRun));
+        }
+
+        return 'ok';
+    }
+
+    /**
+     * Mirror the audit decision on a crawled row into content/article.
+     *
+     * Registered as the 'update' / 'update_audit' end-hook of spider/data.
+     * When the row is approved (audit == 2) the matching article is created or
+     * refreshed from the crawled data; otherwise an existing article is set
+     * back to audit = 1.
+     *
+     * @param mixed $id   Id of the spider/data row.
+     * @param mixed $name Unused; kept for the hook signature.
+     * @param mixed $data Hook payload; the 'audit' value is read from it.
+     * @return void
+     */
+    public function up($id, $name, $data)
+    {
+        $audit = Dever::param('audit', $data);
+        $info = Dever::db('spider/data')->one($id);
+        if (!$info) {
+            // Without the crawled row there is nothing to publish; previously this
+            // path would insert an article made of null fields when audit == 2.
+            return;
+        }
+
+        $wechat = Dever::db('spider/wechat')->one($info['wechat_id']);
+
+        # Check whether an article was already created from this crawled row
+        $article = Dever::db('content/article')->one(array('wechat_data_id' => $id));
+
+        if ($audit == 2) {
+            $insert = array(
+                // The owning account may have been removed; fall back to null
+                // (the value the unchecked lookup produced) without raising notices.
+                'cate_id' => isset($wechat['cate_id']) ? $wechat['cate_id'] : null,
+                'author_id' => isset($wechat['author_id']) ? $wechat['author_id'] : null,
+                'name' => $info['name'],
+                'pdate' => $info['pdate'],
+                'pic_cover' => $info['cover'],
+                'content' => $info['content'],
+                'audit' => 2,
+                'wechat' => 2,
+                'wechat_data_id' => $info['id'],
+            );
+
+            if (!$article) {
+                Dever::db('content/article')->insert($insert);
+            } else {
+                $insert['where_id'] = $article['id'];
+                Dever::db('content/article')->update($insert);
+            }
+        } elseif ($article) {
+            // Approval withdrawn: set the published copy back to audit = 1.
+            $where = array(
+                'where_id' => $article['id'],
+                'audit' => 1,
+            );
+            Dever::db('content/article')->update($where);
+        }
+    }
+
+    /**
+     * Crawl one official account: find it through Sogou's WeChat search, open its
+     * profile page, remember its WeChat id if not stored yet, and store every
+     * article listed in the page's `msgList` JavaScript variable.
+     *
+     * The account id is read from the `id` request input.
+     *
+     * @return void
+     */
+    public function get()
+    {
+        $id = Dever::input('id');
+        $info = Dever::db('spider/wechat')->one($id);
+        if (!$info) {
+            // Unknown or deleted account: nothing to crawl.
+            return;
+        }
+        $name = $info['name'];
+        $wechat = $info['wechat'];
+
+        // The account name is free text (typically Chinese, possibly with spaces or '&'),
+        // so it has to be URL-encoded before being placed in the query string.
+        $url = 'https://weixin.sogou.com/weixin?type=1&query=' . urlencode($name) . '&ie=utf8&s_from=input&_sug_=n&_sug_type_=';
+
+        $content = $this->content($name, $url);
+
+        // First search hit (uigs="account_name_0") links to the account profile page.
+        preg_match_all('/<a target="_blank" uigs="account_name_0" href="(.*?)">(.*?)<\/a>/i', $content, $matches);
+        if (!isset($matches[1][0])) {
+            return;
+        }
+
+        $url = str_replace('&amp;', '&', $matches[1][0]);
+        $content = $this->content($name, $url);
+
+        # Extract the WeChat id, e.g. <p class="profile_account">微信号: tesexiaozhenzmg</p>
+        if (!$wechat) {
+            preg_match_all('/<p class="profile_account">微信号: (.*?)<\/p>/i', $content, $matches);
+            if (isset($matches[1][0]) && $matches[1][0]) {
+                $wechat = $matches[1][0];
+                Dever::db('spider/wechat')->update(array('where_id' => $id, 'wechat' => $wechat));
+            }
+        }
+
+        # Extract the recent articles embedded in the page as a JSON object
+        preg_match_all('/var msgList = {(.*?)};/i', $content, $matches);
+        if (!isset($matches[1][0]) || !$matches[1][0]) {
+            return;
+        }
+
+        // The pattern captures the object body without its braces; put them back before decoding.
+        $array = Dever::json_decode('{' . $matches[1][0] . '}');
+        if (!isset($array['list']) || !is_array($array['list'])) {
+            return;
+        }
+
+        foreach ($array['list'] as $item) {
+            // Entries without an article payload are skipped instead of raising an
+            // undefined-index notice and passing null to getArticle().
+            if (isset($item['app_msg_ext_info']) && is_array($item['app_msg_ext_info'])) {
+                $this->getArticle($id, $name, $item['app_msg_ext_info']);
+            }
+        }
+    }
+
+    /**
+     * Download one article and insert or update its spider/data row.
+     *
+     * Rows are matched on (wechat_id, name): an article with the same title
+     * under the same account is updated rather than duplicated.
+     *
+     * @param mixed  $id   Id of the owning spider/wechat account.
+     * @param string $name Account name; used by content() as part of the cache key.
+     * @param array  $data One `app_msg_ext_info` entry; reads `content_url`, `title`,
+     *                     `cover` and `copyright_stat`.
+     * @return void
+     */
+    private function getArticle($id, $name, $data)
+    {
+        // An entry without a link or title cannot be fetched or de-duplicated.
+        if (!is_array($data) || empty($data['content_url']) || !isset($data['title'])) {
+            return;
+        }
+
+        $url = str_replace('&amp;', '&', $data['content_url']);
+        // Only site-relative links need the host; prefixing an absolute URL would corrupt it.
+        if (!preg_match('#^https?://#i', $url)) {
+            $url = 'https://mp.weixin.qq.com' . $url;
+        }
+
+        $content = $this->content($name, $url);
+
+        $result = array(
+            'content' => '',
+            'title' => $data['title'],
+            'cover' => isset($data['cover']) && $data['cover'] ? $this->pic($data['cover']) : '',
+            // copyright_stat 11 is stored as 1, anything else (or absent) as 2.
+            'copyright' => (isset($data['copyright_stat']) && $data['copyright_stat'] == 11) ? 1 : 2,
+            'pdate' => '',
+        );
+
+        preg_match_all('/<div class="rich_media_content " id="js_content">([\s\S]*?)<\/div>/i', $content, $matches);
+        if (isset($matches[1][0]) && $matches[1][0]) {
+            $result['content'] = $this->convertContent($matches[1][0]);
+        }
+
+        # Publish time, embedded in the page as: var ct = "...";
+        preg_match_all('/var ct = "(.*?)";/i', $content, $matches);
+        // isset() is required: a page without the marker has no $matches[1][0].
+        if (isset($matches[1][0]) && $matches[1][0]) {
+            $result['pdate'] = $matches[1][0];
+        }
+
+        $where = array(
+            'wechat_id' => $id,
+            'name' => $result['title'],
+        );
+        $info = Dever::db('spider/data')->one($where);
+
+        $where['cover'] = $result['cover'];
+        $where['content'] = $result['content'];
+        $where['copyright'] = $result['copyright'];
+        $where['pdate'] = $result['pdate'];
+        if (!$info) {
+            Dever::db('spider/data')->insert($where);
+        } else {
+            $where['where_id'] = $info['id'];
+            Dever::db('spider/data')->update($where);
+        }
+    }
+
+    /**
+     * Rewrite an article body so its lazy-loaded images point at locally stored copies.
+     *
+     * Every `data-src="..."` URL is passed through pic() and replaced by the
+     * returned URL (an empty string when pic() fails); afterwards every
+     * `data-src` is renamed to `src`.
+     *
+     * @param string $content Raw HTML of the article body.
+     * @return string Converted HTML.
+     */
+    private function convertContent($content)
+    {
+        $html = trim($content);
+
+        preg_match_all('/data-src="(.*?)"/i', $html, $found);
+        $remoteUrls = empty($found[1]) ? array() : $found[1];
+
+        foreach ($remoteUrls as $remoteUrl) {
+            $html = str_replace($remoteUrl, $this->pic($remoteUrl), $html);
+        }
+
+        return str_replace('data-src', 'src', $html);
+    }
+
+    /**
+     * Copy a remote image through the upload component and return the resulting URL.
+     *
+     * @param string $pic Remote image URL.
+     * @return string The `url` reported by `upload/save.copy`, or '' when none is returned.
+     */
+    private function pic($pic)
+    {
+        $saved = Dever::load('upload/save.copy?key=1&file=' . urlencode($pic));
+
+        if (!$saved || !isset($saved['url'])) {
+            return '';
+        }
+
+        return $saved['url'];
+    }
+
+    /**
+     * Build the request headers for a crawl request.
+     *
+     * The User-Agent comes from agent(); CLIENT-IP and X-FORWARDED-FOR both carry
+     * the same address generated by ip().
+     *
+     * @return array Header name => value.
+     */
+    private function header()
+    {
+        // ip() is called before agent(), matching the order the random generators were used in.
+        $fakeIp = $this->ip();
+
+        return array(
+            'Accept' => 'image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, */*',
+            'Connection' => 'Keep-Alive',
+            'Pragma' => 'no-cache',
+            'Accept-Language' => 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3',
+            'User-Agent' => $this->agent(),
+            'CLIENT-IP' => $fakeIp,
+            'X-FORWARDED-FOR' => $fakeIp,
+        );
+    }
+
+    /**
+     * Pick a random User-Agent string from a fixed pool.
+     *
+     * @return string One of the user agents listed below.
+     */
+    private function agent()
+    {
+        $pool = array(
+            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
+            'Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+            'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
+            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)',
+            'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
+            'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)',
+            'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)',
+            'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.6',
+            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1',
+            'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0',
+            'Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5',
+            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
+            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
+            'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
+        );
+
+        $lastIndex = count($pool) - 1;
+
+        return $pool[rand(0, $lastIndex)];
+    }
+
+    /**
+     * Generate a random IPv4 address from a fixed set of address blocks.
+     *
+     * Each block is a [low, high] pair of integer address bounds; the dotted
+     * range it corresponds to is noted next to it.
+     *
+     * @return string Dotted-quad IPv4 address.
+     */
+    private function ip()
+    {
+        $blocks = array(
+            array('607649792', '608174079'), //36.56.0.0-36.63.255.255
+            array('1038614528', '1039007743'), //61.232.0.0-61.237.255.255
+            array('1783627776', '1784676351'), //106.80.0.0-106.95.255.255
+            array('2035023872', '2035154943'), //121.76.0.0-121.77.255.255
+            array('2078801920', '2079064063'), //123.232.0.0-123.235.255.255
+            array('-1950089216', '-1948778497'), //139.196.0.0-139.215.255.255
+            array('-1425539072', '-1425014785'), //171.8.0.0-171.15.255.255
+            array('-1236271104', '-1235419137'), //182.80.0.0-182.92.255.255
+            array('-770113536', '-768606209'), //210.25.0.0-210.47.255.255
+            array('-569376768', '-564133889'), //222.16.0.0-222.95.255.255
+        );
+
+        // Pick a block first, then an address inside it.
+        list($low, $high) = $blocks[mt_rand(0, count($blocks) - 1)];
+
+        return long2ip(mt_rand($low, $high));
+    }
+
+    /**
+     * Fetch a URL through a small on-disk cache.
+     *
+     * The cache file lives under `<data>/tmp/<Y>/<m>/<d>/` and is named after the
+     * md5 of the account name and of the URL. A cached copy is reused for at most
+     * one hour; older, missing or empty copies trigger a new download. Empty
+     * responses are never cached, so a failed request is retried on the next call
+     * instead of being served for the rest of the day.
+     *
+     * @param string $name Account name (part of the cache key).
+     * @param string $url  URL to download.
+     * @return string Response body; Dever::alert() is invoked when it is empty.
+     */
+    private function content($name, $url)
+    {
+        $key = md5($name) . '_' . md5($url);
+        $time = time();
+        $date = explode('-', date('Y-m-d', $time));
+
+        $file = Dever::path(Dever::data() . 'tmp/', $date[0] . '/' . $date[1] . '/' . $date[2] . '/' . $key);
+        $num = 3600;
+
+        $content = '';
+        // Age is "now minus mtime"; the reversed subtraction could never exceed the TTL.
+        if (is_file($file) && ($time - filemtime($file)) <= $num) {
+            $content = file_get_contents($file);
+        }
+
+        if (!$content) {
+            $header = $this->header();
+            // The last argument is presumably sent as the Referer - confirm against Dever::curl.
+            $content = Dever::curl($url, false, 'get', false, $header, $header['User-Agent'], 'http://weixin.sogou.com/');
+            if ($content) {
+                file_put_contents($file, $content);
+            }
+        }
+
+        if (!$content) {
+            Dever::alert('内容已失效');
+        }
+
+        return $content;
+    }
+}