爬虫10月6号

今天的代码如下：

import base64
import datetime
import json
import logging.handlers
import os
import random
import re
import time
from html.parser import HTMLParser

import requests
import urllib3
from flask import Flask, request
from flask import jsonify
from peewee import * # pylint: disable=unused-import

# Silence urllib3's warnings: the HTTPS requests in this file are sent
# through requests with verify=False.
urllib3.disable_warnings()

# ---- Logging setup: START ----
# The log file name carries the local date at which the module is loaded.
rq = time.strftime('%Y-%m-%d')
LOG_FILE = 'info.%s.log' % rq

fmt = '[%(asctime)s] [%(funcName)s:%(filename)s:%(lineno)d] %(levelname)s - %(message)s'
formatter = logging.Formatter(fmt)

# Size-based rotation: 10 MiB per file, five backups kept, UTF-8 encoded.
handler = logging.handlers.RotatingFileHandler(
    LOG_FILE,
    maxBytes=10 * 1024 * 1024,
    backupCount=5,
    encoding='utf-8',
)
handler.setFormatter(formatter)

# Named logger used throughout this file; records from DEBUG upwards are kept.
logger = logging.getLogger('info')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
# ---- Logging setup: END ----

# ---- Database setup: START ----
# Connection settings can be overridden through environment variables so that
# credentials do not have to live in the source file. The fallbacks are the
# values that used to be hard-coded here, so behaviour is unchanged when no
# variable is set.
db = MySQLDatabase(
    os.environ.get("WECHAT_DB_NAME", "wechat_db"),
    host=os.environ.get("WECHAT_DB_HOST", "192.168.47.249"),
    port=int(os.environ.get("WECHAT_DB_PORT", "3306")),
    user=os.environ.get("WECHAT_DB_USER", "root"),
    passwd=os.environ.get("WECHAT_DB_PASSWORD", "root"),
)
# The connection is opened once, when this module is loaded.
db.connect()

class BaseModel(Model):
    """Common base class that binds the models in this file to ``db``."""
    class Meta:
        database = db

class WechatAuthor(BaseModel):
    """One WeChat official account, stored in table ``wx_author``."""
    id = PrimaryKeyField()
    # Account id; wechat_content() stores the nick name here when the id
    # scraped from the account page is empty.
    wx_id = CharField(null=False)
    # ``biz`` token scraped from the account page; unique per row.
    biz = CharField(null=False, unique=True)
    nick_name = CharField(null=True)
    # wechat_history() downloads the article history only while this is 0.
    status = IntegerField(null=True, default=0)
    # wechat_content() fills this from the base64-decoded ``biz``.
    biz_id = BigIntegerField(null=True)
    create_time = DateTimeField(null=True)
    last_modified_date = DateTimeField(null=True)
    class Meta:
        db_table = 'wx_author'

class WechatContent(BaseModel):
    """One article of an official account, stored in table ``wx_content``."""
    id = PrimaryKeyField()
    # ``biz`` token of the owning account (indexed, not a foreign key).
    biz = CharField(null=False, index=True)
    author = CharField(null=True)
    title = CharField(null=False)
    digest = CharField(null=True)
    content_url = CharField(null=False)
    source_url = CharField(null=True)
    # get_wx_content() derives this from the message's
    # comm_msg_info.datetime timestamp.
    push_time = DateTimeField(null=True)
    # get_wx_content() inserts 0 for the three counters below.
    read_num = IntegerField(null=True)
    like_num = IntegerField(null=True)
    reward_num = IntegerField(null=True)
    create_time = DateTimeField(null=True)
    last_modified_date= DateTimeField(null=True)
    class Meta:
        db_table = 'wx_content'

# DB相关 END#############################################################


# HTTP相关 START#############################################################
# Attribute lists of the matching <a> tags seen so far. MyHTMLParser appends
# to this module-level list and never clears it.
attrs_list = list()


class MyHTMLParser(HTMLParser):
    """Collect the attributes of <a> tags that link to official accounts.

    A tag matches when it is an <a> carrying a ``uigs`` attribute whose value
    contains ``account_name``; the ``href`` of such a tag is the account URL.
    One page may contain several matching tags.
    """

    def handle_starttag(self, tag, attrs):
        """Append ``attrs`` to the module-level ``attrs_list`` on a match.

        :param tag: lower-cased tag name supplied by HTMLParser
        :param attrs: list of (name, value) pairs; value is None for
            attributes written without a value
        """
        if tag != 'a':
            return
        for name, value in attrs:
            # Valueless attributes (``<a uigs>``) arrive as (name, None);
            # guard before the substring test to avoid a TypeError.
            if name == 'uigs' and value and 'account_name' in value:
                attrs_list.append(attrs)
                # Record each tag once even if the attribute is repeated.
                break

def get_sogou_html(wechat_name):
    """Fetch the Sogou WeChat search result page for ``wechat_name``.

    :param wechat_name: search keyword; may be either a wx_id or a nick_name
    :return: response body as text (the HTTP status is logged, not checked)
    :raises requests.RequestException: on connection failure or timeout
    """
    header = {'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    # Let requests build the query string so that characters such as
    # '&', '#' or spaces in the keyword are percent-encoded instead of
    # corrupting the URL.
    response = requests.get(
        'http://weixin.sogou.com/weixin',
        params={'query': wechat_name},
        headers=header,
        verify=False,
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    logger.info('获取搜狗查询页面:%s-状态:%s' % (wechat_name, response.status_code))
    return response.text

def get_wechat_html(wechat_url):
    """Fetch the page of one official account.

    Account URLs are rate limited: requesting them too quickly triggers a
    CAPTCHA page, which this function does not detect.

    :param wechat_url: account URL taken from the Sogou result page
    :return: response body as text (the HTTP status is logged, not checked)
    :raises requests.RequestException: on connection failure or timeout
    """
    header = {'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    # Without a timeout a stalled connection would block forever.
    response = requests.get(wechat_url, headers=header, verify=False, timeout=30)
    logger.info('获取微信公众号页面:%s-状态:%s' % (wechat_url, response.status_code))
    # TODO: handle the CAPTCHA page
    return response.text

def get_wxurl_list(html):
    """Return the account URLs found in a Sogou search result page.

    The page is fed through MyHTMLParser, which records the attributes of
    every matching <a> tag in the module-level ``attrs_list``.

    :param html: HTML text of the search result page
    :return: list of ``href`` values, in document order
    """
    # Drop matches left over from an earlier page so that they do not leak
    # into this result.
    attrs_list.clear()
    parser = MyHTMLParser()
    parser.feed(html)
    wxurl_list = []
    for tag_attrs in attrs_list:
        # Look the href up by name instead of assuming it is always the
        # third attribute of the tag.
        href = dict(tag_attrs).get('href')
        if href:
            wxurl_list.append(href)
    return wxurl_list

def get_biz(wechat_name):
    """Look up the ``biz`` token and name of every account matching a keyword.

    Only the first Sogou result page is used; pagination is not handled.

    :param wechat_name: search keyword (wx_id or nick_name)
    :return: list of ``(biz, name)`` tuples, both taken from the JavaScript
        variables of the account page. ``name`` is the raw capture of
        ``var name="..."``; wechat_content() splits it on ``"||"`` into
        wx_id and nick_name, and the wx_id part may be empty.
    """
    wx_info_list = list()
    html = get_sogou_html(wechat_name)
    for wechat_url in get_wxurl_list(html):
        page = get_wechat_html(wechat_url)
        biz = re.findall(r'var biz = "(\S*)"', page)
        nick_name = re.findall(r'var name="(\S*)"', page)
        if biz and nick_name:
            wx_info_list.append((biz[0], nick_name[0]))
            logger.info('biz:%s nick_name:%s' % (biz, nick_name))
        else:
            # Neither variable is present when the request was answered with
            # something other than the account page (e.g. the CAPTCHA page);
            # skip it instead of failing with IndexError.
            logger.warning('biz/name not found in page, skipped: %s', wechat_url)
        # Random pause between account pages to stay under the rate limit.
        time.sleep(random.randint(5, 20))
    return wx_info_list

def headers_to_dict(headers):
    """Convert a raw block of HTTP header lines into a dict.

    Example input::

        Host: mp.weixin.qq.com
        Connection: keep-alive
        Cache-Control: max-age=

    Every line is split on its first colon; name and value are stripped of
    surrounding whitespace. Blank lines, lines without a colon and lines
    with an empty name are ignored.

    :param headers: str
    :return: dict
    """
    d_headers = dict()
    for line in headers.split("\n"):
        name, sep, value = line.partition(":")
        name = name.strip()
        if not sep or not name:
            # Not a "name: value" line; skip it rather than raise.
            continue
        d_headers[name] = value.strip()
    return d_headers

def _store_wx_article(biz, item, push_time, now_time):
    """Insert one article dict of the history feed into ``wx_content``."""
    author = item['author']
    title = item['title']
    WechatContent.create(biz=biz, author=author, title=title, digest=item['digest'], content_url=item['content_url'], source_url=item['source_url'], push_time=push_time, read_num=0, like_num=0, reward_num=0, create_time=now_time, last_modified_date=now_time)
    logger.info('%s-%s入库' % (title, author))


def get_wx_content(biz, pass_ticket='', appmsg_token='', cookie=''):
    """Download the article history of one account and store it.

    Pages of the ``profile_ext`` feed are requested until the response
    reports ``can_msg_continue == 0``; every article, including the
    sub-articles of a multi-article push, becomes one ``wx_content`` row.

    :param biz: ``biz`` token of the account
    :param pass_ticket: session value captured from the WeChat app traffic;
        inserted into the URL verbatim
    :param appmsg_token: session value captured from the WeChat app traffic;
        inserted into the URL verbatim
    :param cookie: ``Cookie`` header captured from the WeChat app traffic
    :raises ValueError: when a response lacks ``general_msg_list``
    :raises requests.RequestException: on connection failure or timeout
    """
    # The URL has to be captured from live app traffic; several of its
    # values change frequently.
    url = ('https://mp.weixin.qq.com/mp/profile_ext?x5=0&is_ok=1&action=getmsg&scene=126&uin=777&key=777&f=json&count=10&'
           '__biz={}&offset={}&'
           'pass_ticket={}&'
           'appmsg_token={}')
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 MicroMessenger/6.7.0 NetType/WIFI Language/zh_CN',
        'Cookie': cookie}

    next_offset = 0

    while True:
        # All four placeholders must be filled; formatting with only two
        # arguments raises IndexError.
        page_url = url.format(biz, next_offset, pass_ticket, appmsg_token)
        response = requests.get(page_url, headers=headers, verify=False, timeout=30)
        logger.info('content_url:%s - 状态:%s' % (page_url, response.status_code))
        jsonstr = response.text
        rs = json.loads(jsonstr)
        if 'general_msg_list' not in rs:
            # NOTE(review): presumably an error reply such as an expired
            # session - confirm against a captured response.
            raise ValueError('unexpected profile_ext response: %s' % jsonstr[:200])

        # Flags whether another page follows, and where it starts.
        can_msg_continue = rs['can_msg_continue']
        next_offset = rs['next_offset']
        # general_msg_list is itself a JSON document encoded as a string.
        msg_list = json.loads(rs['general_msg_list'])
        for obj in msg_list['list']:
            ext_info = obj.get('app_msg_ext_info')
            if not ext_info:
                # NOTE(review): presumably a push without an article
                # (plain text / image) - confirm; nothing to store.
                continue
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            push_time = datetime.datetime.fromtimestamp(int(obj['comm_msg_info']['datetime'])).strftime('%Y-%m-%d %H:%M:%S')
            _store_wx_article(biz, ext_info, push_time, now_time)

            if ext_info.get('is_multi') == 1:
                # Sub-articles share the push time of their parent message.
                for sub_obj in ext_info.get('multi_app_msg_item_list', []):
                    _store_wx_article(biz, sub_obj, push_time, now_time)

        if can_msg_continue == 0:
            logger.info('%s历史文章获取入库完毕' % (biz))
            break

        # Random pause between pages to stay under the rate limit.
        time.sleep(random.randint(5, 20))
# HTTP相关 END#############################################################


# WEB相关 START############################################################

# Status payloads returned by the HTTP endpoints
R200_OK = {'code': 200, 'message': 'OK'}
R500_ERROR = {'code': 500, 'message': 'ERROR'}

def statusResponse(statu_dic):
    """Build a JSON response that carries only a status payload.

    :param statu_dic: status dict such as R200_OK / R500_ERROR
    :return: the result of ``jsonify`` for ``{'status': statu_dic}``
    """
    body = dict(status=statu_dic)
    return jsonify(body)

def fullResponse(statu_dic, data):
    """Build a JSON response that carries a status payload plus data.

    :param statu_dic: status dict such as R200_OK / R500_ERROR
    :param data: JSON-serialisable payload placed under ``data``
    :return: the result of ``jsonify`` for both keys
    """
    body = dict(status=statu_dic, data=data)
    return jsonify(body)

# Flask application object; the @app.route handlers below register on it.
app = Flask(__name__)

# REST-style endpoint: wx_name is taken from the URL path.
@app.route('/wechat/<string:wx_name>', methods=['PUT', 'GET'])
def wechat_content(wx_name):
    """Search for accounts matching ``wx_name`` and register the new ones.

    Each account returned by get_biz() whose ``biz`` is not yet in
    ``wx_author`` is inserted. Failures for a single account are logged and
    do not abort the loop, so the response is R200_OK whenever the search
    itself did not raise.

    :param wx_name: search keyword (wx_id or nick_name)
    :return: JSON status response
    """
    logger.info('获取wechat_name=%s相关信息开始' % wx_name)
    # TODO: validate wx_name before searching (see Sogou's own rules)

    # The parser accumulates matches in the module-level attrs_list, so it
    # has to be reset before every search.
    global attrs_list
    attrs_list = list()
    wx_info_list = get_biz(wx_name)
    logger.info('待入库微信公众号信息-%s' % wx_info_list)
    for biz, raw_name in wx_info_list:
        # raw_name is captured from ``var name="<wx_id>"||"<nick_name>"``.
        name_parts = raw_name.split('"||"')
        if len(name_parts) > 1:
            wx_id, nick_name = name_parts[0], name_parts[1]
        else:
            # No separator captured: treat the whole value as the nick name
            # instead of failing with IndexError.
            wx_id, nick_name = '', raw_name
        if wx_id == '':
            wx_id = nick_name
            logger.info('wx_id匹配为null,使用nick_name:%s替代' % (nick_name))

        try:
            # biz_id is a BigIntegerField: convert the decoded bytes
            # explicitly instead of handing raw bytes to the ORM.
            biz_id = int(base64.b64decode(biz))
        except ValueError:
            # Covers invalid base64 (binascii.Error) and non-numeric content.
            logger.warning('biz %s does not decode to an integer', biz)
            biz_id = None
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Skip accounts that are already registered.
        try:
            WechatAuthor.get(biz=biz)
        except WechatAuthor.DoesNotExist:
            pass
        except Exception as err:
            # A failed lookup (e.g. lost connection) is not "row missing".
            logger.error('lookup of biz %s failed: %s' % (biz, err), exc_info=True)
            continue
        else:
            logger.info('该biz:%s已存在' % biz)
            continue

        try:
            rs = WechatAuthor.create(wx_id=wx_id, biz=biz, nick_name=nick_name, create_time=now_time, last_modified_date=now_time, biz_id=biz_id)
            logger.info('%s插入自增id:%s' % (nick_name, rs.id))
        except Exception as err:
            logger.error('数据插入错误%s' % (err), exc_info=True)
    return statusResponse(R200_OK)


@app.route('/wechat/history/<string:nick_name>', methods=['PUT', 'GET'])
def wechat_history(nick_name):
    """Download the article history of an already registered account.

    The history is fetched only while the account's ``status`` is 0. After
    a successful fetch the status is set to 1, so that later calls take the
    "already fetched" branch instead of inserting every article again.

    :param nick_name: nick name of a row in ``wx_author``
    :return: R200_OK on success or when already fetched; R500_ERROR when
        the account is unknown or the lookup or download fails
    """
    logger.info("尝试获取%s该公众号历史文章" % (nick_name))
    try:
        wechat_author = WechatAuthor.get(nick_name=nick_name)
    except WechatAuthor.DoesNotExist:
        logger.info("并未获取%s该公众号基本信息" % (nick_name))
        return statusResponse(R500_ERROR)
    except Exception as err:
        # A failed lookup (e.g. lost connection) is not "account unknown".
        logger.error("%s" % (err), exc_info=True)
        return statusResponse(R500_ERROR)

    try:
        if wechat_author.status == 0:
            get_wx_content(wechat_author.biz)
            # Record the completed download.
            wechat_author.status = 1
            wechat_author.last_modified_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            wechat_author.save()
        else:
            logger.info("已获取%s该公众号历史文章" % (nick_name))
    except Exception as err:
        logger.error("%s" % (err), exc_info=True)
        return statusResponse(R500_ERROR)
    return statusResponse(R200_OK)
# WEB相关 END############################################################

if __name__ == '__main__':
    # The server listens on every interface, so the interactive debugger
    # must stay off: with debug=True anyone who can reach the port can run
    # arbitrary Python through it. Switch it on only together with
    # host='127.0.0.1' during local development.
    app.run(host='0.0.0.0', port=8081, debug=False)

以上代码用于爬取微信公众号内容，亲测有效。

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值