Crawling WeChat Official Account Articles

# -*- coding:utf-8 -*-


import json
import random
import re
import time
from bs4 import BeautifulSoup
from datetime import datetime
from pyExcelerator import *  # Excel writing package
import requests
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# from utils import pgs, es


class WxMps(object):

    def __init__(self, _biz, _pass_ticket, _app_msg_token, _cookie, wap_sid2, _offset=0):
        self.start_time = 1575129600  # cutoff timestamp, 2019/12/01 00:00:00; stop once articles are older than this
        self.offset = _offset
        self.biz = _biz  # identifier of the official account
        self.msg_token = _app_msg_token  # ticket (not fixed, expires)
        self.pass_ticket = _pass_ticket  # ticket (not fixed, expires)
        self.wap_sid2 = wap_sid2
        self.headers = {
            'Cookie': _cookie,  # Cookie (not fixed, expires)
            'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 '
        }
        self.LIKE_URL = 'https://mp.weixin.qq.com/mp/getappmsgext?__biz=%s&appmsg_type=9&mid=%s&sn=%s&idx=%s&appmsg_token=%s&is_need_ad=0'

        # header row of the Excel sheet
        self.excel_data = [u'No.', u'Time', u'Title', u'Keywords', u'Reads', u'Likes', u'Comments', u'Article URL', u'Article content']
        # Excel workbook handle
        self.excel_w = Workbook()
        excel_sheet_name = time.strftime('%Y-%m-%d')
        self.excel_content = self.excel_w.add_sheet(excel_sheet_name)

        self.container = []  # all rows collected so far

        self.line = 1  # next Excel row to write

        # wx_mps = 'wxmps'  # database name, user and password are identical here (replace with real values)
        # self.postgres = pgs.Pgs(host='localhost', port='12432', db_name=wx_mps, user=wx_mps, password=wx_mps)
        # self.elastic = es.Es(host='localhost', port=12900, index='mp', doc='article')

    def log(self, msg):
        print u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg)

    def start(self):
        """Page through the article-list API of the official account."""

        # write the header row first
        cols = 0
        for data in self.excel_data:
            self.excel_content.write(0, cols, data)
            cols += 1
        self.excel_w.save('test3.xls')

        count = 0
        offset = self.offset
        while True:
            api = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={0}&f=json&offset={1}' \
                  '&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={2}&wxtoken=&appmsg_token' \
                  '={3}&x5=1&f=json'.format(self.biz, offset, self.pass_ticket, self.msg_token)

            resp = requests.get(api, headers=self.headers, verify=False).json()
            ret, status = resp.get('ret'), resp.get('errmsg')  # status info
            self.log(u'Response status, ret: %s' % ret)

            if 0 == ret or 'ok' == status:
                # print('Crawl article: ' + api)

                general_msg_list = resp['general_msg_list']
                msg_list = json.loads(general_msg_list)['list']  # list of pushed messages
                self.log(u'Messages in this batch: %s' % len(msg_list))

                for msg in msg_list:
                    self.log(u'Article (%d/%d)' % (count, len(msg_list)))

                    # if count >= 11:
                    #     return
                    # try:
                    #     if count % 10 == 0 and count != 0:
                    #         print 'Writing to Excel'
                    #         first = count - 10
                    #         for line in range(first, count):
                    #             cols = 0
                    #             for data in self.excel_data:
                    #                 self.excel_content.write(line, cols, self.container[line][cols])
                    #                 cols += 1
                    #         self.excel_w.save('test3.xls')
                    # except Exception as e:
                    #     print(str(e))

                    comm_msg_info = msg['comm_msg_info']  # data shared by all articles in this push
                    msg_id = comm_msg_info['id']  # article id
                    self.log(u'Article id: %s' % msg_id)
                    post_time = datetime.fromtimestamp(comm_msg_info['datetime'])  # publish time

                    s_time = datetime.fromtimestamp(self.start_time)  # cutoff time
                    if post_time < s_time:
                        print u'Reached the cutoff date, stopping the crawler'
                        return

                    self.log(u'Publish time: %s' % post_time)
                    msg_type = comm_msg_info['type']  # message type
                    self.log(u'Message type: %s' % msg_type)
                    # msg_data = json.dumps(comm_msg_info, ensure_ascii=False)  # raw msg data
                    # self.log(u'Raw message data: %s' % msg_data)


                    if 49 == msg_type:
                        # image-and-text message
                        app_msg_ext_info = msg.get('app_msg_ext_info')  # raw article data
                        if app_msg_ext_info:
                            # first article of this push
                            self._parse_articles(app_msg_ext_info, msg_id, post_time, msg_type, count)
                            count += 1

                            # remaining articles of this push
                            multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list')
                            if multi_app_msg_item_list:
                                for item in multi_app_msg_item_list:
                                    msg_id = item['fileid']  # article id
                                    if not msg_id or not isinstance(msg_id, int):
                                        msg_id = int(time.time())  # fall back to a unique id; some articles have id=0, which breaks a unique index
                                    self._parse_articles(item, msg_id, post_time, msg_type, count)
                                    count += 1
                    elif 1 == msg_type:
                        # plain-text message
                        content = comm_msg_info.get('content')
                        # if content:
                        #     self._save_text_and_image(msg_id, post_time, msg_type, digest=content)
                    elif 3 == msg_type:
                        # image message
                        image_msg_ext_info = msg.get('image_msg_ext_info')
                        cdn_url = image_msg_ext_info.get('cdn_url')
                        # if cdn_url:
                        #     self._save_text_and_image(msg_id, post_time, msg_type, cover=cdn_url)


            # can_msg_continue: 0 = finished, 1 = more to fetch
            can_msg_continue = resp.get('can_msg_continue')
            if not can_msg_continue:
                print('Break, current offset: %d' % offset)
                break
            offset = resp.get('next_offset')  # offset for the next request
            print('Next offset : %d' % offset)




    @staticmethod
    def crawl_article_content(content_url):
        """Fetch the article body text.
        :param content_url: article URL
        """

        try:
            html = requests.get(content_url, verify=False).text
        except Exception as e:
            print(content_url)
            print(str(e))
        else:
            bs = BeautifulSoup(html, 'html.parser')
            js_content = bs.find(id='js_content')  # the article body lives in #js_content
            if js_content:
                p_list = js_content.find_all('p')
                content_list = list(map(lambda p: p.text, filter(lambda p: p.text != '', p_list)))
                content = ''.join(content_list)
                return content

    def _parse_articles(self, info, msg_id, post_time, msg_type, count):
        """Parse one article entry and append a row to the Excel sheet."""

        row = []
        row.append(count)
        row.append(str(post_time))
        self.log(u'post_time: %s' % post_time)

        title = info.get('title')  # title
        row.append(title)
        self.log(u'Title: %s' % title)
        cover = info.get('cover')  # cover image
        author = info.get('author')  # author
        self.log(u'Author: %s' % author)
        digest = info.get('digest')  # digest / keywords
        row.append(digest)
        self.log(u'Keywords: %s' % digest)
        # source_url = info.get('source_url')  # original source URL
        content_url = info.get('content_url')  # WeChat article URL
        # ext_data = json.dumps(info, ensure_ascii=False)  # raw data
        if not content_url:
            return
        content_url = content_url.replace('amp;', '').replace('#wechat_redirect', '')
        if content_url.startswith('http://'):
            content_url = 'https://' + content_url[len('http://'):]

        self._parse_article_detail(content_url, row)

        row.append(content_url)
        self.log(u'Article URL: %s' % content_url)

        content = self.crawl_article_content(content_url)
        self.log(u'Article content: %s' % content)
        row.append(content)

        self.container.append(row)

        try:
            print 'Writing to Excel, line:', self.line
            cols = 0
            for data in row:
                if data:
                    self.excel_content.write(self.line, cols, data)
                cols += 1
            self.line += 1
            self.excel_w.save('test3.xls')
        except Exception as e:
            print(str(e))

        time.sleep(random.randint(2, 3))

    def _parse_article_detail(self, content_url, row):
        """Extract the parameters needed for read/like counts and comments from the article page."""

        try:
            resp = requests.get(content_url, headers=self.headers, verify=False)
        except Exception as e:
            print('Failed to fetch the article page: %s' % str(e))
            row.extend(['', '', ''])  # keep the row aligned with the Excel header
        else:
            readNum, likeNum = self.getMoreInfo(content_url)
            row.append(readNum)
            row.append(likeNum)

            html = resp.text

            str_comment = re.search(r'var comment_id = "(.*)" \|\| "(.*)" \* 1;', html)
            str_msg = re.search(r"var appmsgid = '' \|\| '(.*)'\|\|", html)
            str_token = re.search(r'window.appmsg_token = "(.*)";', html)

            comments = ''
            if str_comment and str_msg and str_token:
                comment_id = str_comment.group(1)  # comment id (fixed per article)
                app_msg_id = str_msg.group(1)  # ticket id (not fixed)
                appmsg_token = str_token.group(1)  # ticket token (not fixed)

                # all three are required to call the comment API
                if appmsg_token and app_msg_id and comment_id:
                    print('Crawl article comments')
                    comments = self._crawl_comments(app_msg_id, comment_id, appmsg_token)
            row.append(comments)

    def getMoreInfo(self, link):
        # extract mid, _biz, idx and sn from the article link
        mid = link.split("&")[1].split("=")[1]
        idx = link.split("&")[2].split("=")[1]
        sn = link.split("&")[3].split("=")[1]
        _biz = link.split("&")[0].split("_biz=")[1]

        # values captured with Fiddler; they stay constant within a session
        req_id = "1121Emzkpus2YzixHv8OZPro"
        pass_ticket = self.pass_ticket
        appmsg_token = self.msg_token

        # target URL
        url = "http://mp.weixin.qq.com/mp/getappmsgext"
        # send the Cookie to avoid a login step; the "User-Agent" should preferably identify a mobile browser
        # phoneCookie = "devicetype=iOS11.3; lang=zh_CN; pass_ticket=AXCuXaypgHXqC3Grop/GDPNNyOIKC/Any1k/BMd35JUK805ZvVRvZ47cKc6S7Xcj; rewardsn=; version=2607033d; wap_sid2=CInatZ8IElwxY0hJQUZPWjFwdkZrYU9kLUw0UUdId0htQkY3VnNzeENXRncyTlkzNVhqcHhtWWJrNENBTGcyQThlRnNkbEhJYmxfb3ZXRG1Rb2RsTTFFWHJlWWM2OThEQUFBfjDclOHhBTgNQAE=; wxtokenkey=777; wxuin=2213375241; pgv_pvid=2213375241"
        phoneCookie = "rewardsn=; wxuin=2213375241; devicetype=android-24; lang=zh_CN; wxtokenkey=777; pgv_info=ssid=s2906320611; pgv_pvid=1873061148; version=2607033d; pass_ticket=%s; wap_sid2=%s" % (pass_ticket, self.wap_sid2)
        headers = {
            "Cookie": phoneCookie,
            # "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E216 MicroMessenger/6.7.2 NetType/WIFI Language/zh_CN"
            "User-Agent": "Mozilla/5.0 (Linux; Android 7.0; JMM-AL10 Build/HONORJMM-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/66.0.3359.126 MQQBrowser/6.2 TBS/044431 Mobile Safari/537.36 MMWEBID/3894 MicroMessenger/6.7.3.1360(0x2607033D) NetType/WIFI Language/zh_CN Process/toolsmp"
        }
        # POST body: `req_id` and `pass_ticket` belong to the article; copy them from Fiddler
        data = {
            "is_only_read": 1,
            "req_id": req_id,
            "pass_ticket": pass_ticket,
            "is_temp_url": 0,
            "appmsg_like_type": 2,
            "more_read_type": 0,
            "item_show_type": 0,
            "appmsg_type": 9
        }
        """
        Query parameters:
        __biz identifies the official account (unique)
        mid, sn, idx identify each article and are extracted from its URL
        key and appmsg_token can be copied from Fiddler
        pass_ticket belongs to the article and can also be copied from Fiddler
        """
        params = {
            "__biz": _biz,
            "mid": mid,
            "sn": sn,
            "idx": idx,
            "key": "777",
            "pass_ticket": pass_ticket,
            "appmsg_token": appmsg_token,
            "uin": "777",
            "wxtoken": "777"
        }

        # submit with a POST request
        content = requests.post(url, headers=headers, data=data, params=params).json()
        # extract the read count and like count
        appmsgstat = content.get("appmsgstat", {})
        readNum = appmsgstat.get("read_num", 0)
        likeNum = appmsgstat.get("like_num", 0)
        print 'Read count: %s, like count: %s' % (readNum, likeNum)
        # optionally sleep here to reduce the risk of being blocked
        # time.sleep(15)
        return readNum, likeNum

    def _crawl_comments(self, app_msg_id, comment_id, appmsg_token):
        """Fetch the comments of an article."""

        commentList = []

        api = 'https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz={0}' \
              '&appmsgid={1}&idx=1&comment_id={2}&offset=0&limit=100&uin=777&key=777' \
              '&pass_ticket={3}&wxtoken=777&devicetype=android-26&clientversion=26060739' \
              '&appmsg_token={4}&x5=1&f=json'.format(self.biz, app_msg_id, comment_id,
                                                     self.pass_ticket, appmsg_token)
        try:
            resp = requests.get(api, headers=self.headers, verify=False).json()
        except Exception:
            print('Article {} has no comments'.format(app_msg_id))
        else:
            ret, status = resp['base_resp']['ret'], resp['base_resp']['errmsg']
            if ret == 0 or status == 'ok':
                elected_comment = resp.get('elected_comment') or []  # featured comments
                for comment in elected_comment:
                    nick_name = comment.get('nick_name')  # commenter nickname
                    # self.log(u'Nickname: %s' % nick_name)
                    logo_url = comment.get('logo_url')  # avatar
                    comment_time = datetime.fromtimestamp(comment.get('create_time'))  # comment time
                    content = comment.get('content')  # comment text
                    # self.log(u'Comment text: %s' % content)
                    content_id = comment.get('content_id')  # id
                    like_num = comment.get('like_num')  # number of likes
                    # self.log(u'Likes: %s' % like_num)

                    commentList.append("{nickname: %s, time: %s, content: %s, likes: %s}" % (nick_name, comment_time, content, like_num))

                    reply_list = comment.get('reply')['reply_list']  # replies to this comment
                    reply_content, reply_like_num, reply_create_time = None, None, None
                    if reply_list:
                        first_reply = reply_list[0]
                        reply_content = first_reply.get('content')
                        reply_like_num = first_reply.get('reply_like_num')
                        reply_create_time = datetime.fromtimestamp(first_reply.get('create_time'))
        return "; ".join(commentList)


if __name__ == '__main__':
    biz = 'MjM5MjAxNDM4MA=='
    app_msg_token = '1051_V0N4pX2JM165AQpdDhHFM3K4cphiT4GBURq37O2BI8nQbqkudDryhnItAI2SzgumjOImQzBdx5wIFaGS'
    # the values above identify the target official account (_id is only a flag in my own database); the values below belong to the current WeChat session
    pass_ticket = '2uTeu/aMHkWuuQNFqUHXlRssyPyzYLOYAYRd9vNuaa72kVwfxe+/GQqdJlTFXVCI'
    wap_sid2 = 'CInatZ8IElxKYnFndm1keTc4cGFwenI1YnVSVzZmalM5Mjh6LTJGZlhrb1VDR1lJZ3kzam4wanhOTm1EYUlMQ0I5LS11ZmFTSFkwOV9qdHdidjFCZ1hLZV9nM2Y4eHNFQUFBfjCJlI3zBTgNQAE='
    cookie = 'wxuin=2213375241; version=2607033d; pass_ticket={}; wap_sid2={}'.format(pass_ticket, wap_sid2)
    wxMps = WxMps(biz, pass_ticket, app_msg_token, cookie, wap_sid2)
    # start crawling articles and comments
    wxMps.start()


 

A web crawler is an automated program that collects information from the Internet: it visits pages, extracts data and stores it for later analysis or display. Crawlers are typically used by search engines, data-mining tools and monitoring systems. The workflow consists of a few key steps:

URL collection: starting from one or more seed URLs, the crawler discovers new URLs recursively or iteratively and builds a URL queue; new URLs come from link analysis, sitemaps, search engines and so on.
Requesting pages: the crawler issues HTTP (or other protocol) requests to the target URLs and retrieves the HTML, usually through an HTTP client library such as Requests in Python.
Parsing content: the HTML is parsed to extract the useful parts such as text, images and links, with tools like regular expressions, XPath or Beautiful Soup.
Storing data: the extracted data is written to a database, a file or another store for later analysis; common choices are relational databases, NoSQL databases and JSON files.
Playing by the rules: to avoid overloading a site or triggering its anti-crawling defences, a crawler should respect robots.txt, limit its request rate and depth, and mimic normal browser behaviour, for example by setting a User-Agent.
Handling anti-crawling measures: some sites respond with CAPTCHAs, IP bans and similar countermeasures, and crawler engineers have to design strategies to cope with them.

Crawlers are widely used for search-engine indexing, data mining, price monitoring, news aggregation and more. Using them still has to comply with the law and with each site's usage policy, and must not place an unreasonable load on the target servers.
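Below is a minimal sketch of that fetch, parse and store loop, using the same requests and BeautifulSoup libraries as the script above. The start URL, page limit and output file name are illustrative assumptions, not part of the WeChat crawler.

# Minimal fetch -> parse -> store loop; start URL, page limit and output file are placeholders.
import json
import time

import requests
from bs4 import BeautifulSoup


def crawl(start_url, max_pages=10):
    seen, queue, results = set(), [start_url], []
    headers = {'User-Agent': 'Mozilla/5.0'}  # identify as a normal browser
    while queue and len(seen) < max_pages:
        url = queue.pop(0)
        if url in seen:
            continue
        seen.add(url)
        html = requests.get(url, headers=headers, timeout=10).text  # request the page
        soup = BeautifulSoup(html, 'html.parser')                   # parse the HTML
        title = soup.title.string if soup.title and soup.title.string else ''
        results.append({'url': url, 'title': title})                # extract and keep the data
        for a in soup.find_all('a', href=True):                     # collect new URLs for the queue
            if a['href'].startswith('http'):
                queue.append(a['href'])
        time.sleep(1)  # rate-limit to stay polite
    with open('results.json', 'w') as f:
        json.dump(results, f)  # ASCII-escaped JSON keeps this Python 2/3 compatible


if __name__ == '__main__':
    crawl('https://example.com')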
