Python 使用Charles爬取APP信息以及公众号信息

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Tenderness4/article/details/80617196

一、Charles使用

​ 这个就不介绍了,自行网上查阅,官网下载然后破解一下,打开手机操作一波,都挺简单的。

注意事项:手机和电脑都需要安装Charles的证书;443端口对应的是HTTPS服务。

二、APP信息抓取

  1. 分析

    前期准备,需要知道url,cookies,response返回的数据,请求的方式

  2. 打开想要抓取的APP,这里以“得到”APP的逻辑思维栏目页面为例。在手机上不断刷新,能在Charles的Structure中看到有条目变成黄色,点进去,如下图一。然后开始分析这个请求,得到自己想要的数据。首先在Overview选项卡中可以得到我们需要请求的url地址,并发现其请求方式是POST。在Request选项卡中,点击底部的标签进行切换,可以看到我们需要的headers数据,不设置headers的话就需要登录。这里要注意,拷贝过来的headers放入代码中时,仅仅是加上引号和逗号,内容中间不要有空格。在Response选项卡中,能发现这就是我们需要的数据,而且还是json格式的数据。接下来就是编写代码了。

  3. 代码

    # coding=utf-8
    import requests
    import json
    from Utils import Utils
    import os
    import time
    
    
    class DeDao(object):
        """Crawl the audio list of one column of the DeDao (得到) app.

        Each page is fetched with a POST request that replays headers captured
        with Charles. Every audio entry on a page is downloaded as an mp3 file
        and written as one row of an Excel sheet through the project-local
        ``Utils`` helper.
        """

        # Seconds to wait for the server before giving up on a request.
        REQUEST_TIMEOUT = 30
        # Seconds to pause between two consecutive page requests.
        PAGE_DELAY = 3

        def __init__(self):
            # Column headers of the Excel sheet, in the same order as the row
            # values assembled in parse_data().
            self.row_title = ['来源目录', '标题', '图片', '分享标题', 'mp3地址', '音频时长', '文件大小']
            sheet_name = '逻辑思维音频'

            # Presumably returns (workbook, sheet) - verify against Utils.
            return_execl = Utils.create_execl(sheet_name, self.row_title)
            self.execl_f = return_execl[0]
            self.sheet_table = return_execl[1]
            # Kept for backward compatibility; rows are now built in a local
            # list so a failure half-way through an item cannot leak partial
            # data into the next row.
            self.audio_info = []
            self.count = 0  # number of rows written so far
            self.base_url = 'https://entree.igetget.com/acropolis/v1/audio/listall'
            self.max_id = 0  # paging cursor sent with the next request
            # Headers copied from one captured request.
            # NOTE(review): X-Sign / X-Nonce / X-Timestamp are fixed values from
            # that capture; presumably they expire - confirm and refresh them.
            self.headers = {
                'Host': 'entree.igetget.com',
                'X-OS': 'iOS',
                'X-NET': 'wifi',
                'Accept': '*/*',
                'X-Nonce': '70291808a4530748',
                'Accept-Encoding': 'br, gzip, deflate',
                'X-TARGET': 'main',
                'User-Agent': '%E5%BE%97%E5%88%B0/4.0.13 CFNetwork/894 Darwin/17.4.0',
                'X-CHIL': 'appstore',
                'Cookie': 'acw_tc=AQAAAPt0EXBorQgA3Tcgb+9WeJpgznSn; aliyungf_tc=AQAAADwDyS2DbAgA3TcgbxkoU3Bb9E7e',
                'X-UID': '224804667',
                'X-AV': '4.0.0',
                'X-SEID': '',
                'X-SCR': '1242*2208',
                'X-DT': 'phone',
                'X-S': '1b3579ace486377b',
                'X-Sign': 'ZjQzMzZkNWI2YmJmOTMzNmUyOWJlNGY5NWRhZDYzNzY=',
                'Accept-Language': 'zh-cn',
                'X-D': 'e74fed5a22924a6ab5702a8a5fff9ef8',
                'X-THUMB': 'l',
                'X-T': 'json',
                'X-Timestamp': '1528304815',
                'X-TS': '1528304815',
                'X-U': '224804667',
                'X-App-Key': 'ios-4.0.0',
                'X-OV': '11.2.6',
                'Connection': 'keep-alive',
                'X-ADV': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'X-V': '2',
                'X-IS_JAILBREAK': 'NO',
                'X-DV': 'iPhone9,2',
            }

        def request_data(self):
            """Fetch and process pages until parse_data() reports the end.

            Pages are requested in a loop (not by mutual recursion with
            parse_data), so the call stack stays flat however many pages the
            column has. Any error is printed and ends the crawl.
            """
            while True:
                data = {
                    'max_id': self.max_id,
                    'since_id': 0,
                    'column_id': 2,
                    'count': 20,
                    'order': 1,
                    'section': 0,
                }
                try:
                    response = requests.post(
                        self.base_url, headers=self.headers, data=data,
                        timeout=self.REQUEST_TIMEOUT)
                    print(response.status_code)
                    if response.status_code != 200:
                        return
                    has_more = self.parse_data(response)
                except Exception as e:
                    # Script-level boundary: report the problem and stop.
                    print(e)
                    return
                if not has_more:
                    return
                # Be polite to the server between pages.
                time.sleep(self.PAGE_DELAY)

        def parse_data(self, response):
            """Process one page: download every mp3 and write one row per item.

            :param response: object whose ``text`` attribute holds the JSON body.
            :return: True when another page should be requested (``self.max_id``
                has been advanced), False when the crawl is finished.
            :raises KeyError: if the JSON lacks one of the expected keys.
            """
            dict_json = json.loads(response.text)
            datas = dict_json['c']['list']
            if not datas:
                # An empty page has no last item to take the next cursor from.
                print("数据抓取完毕")
                return False

            for data in datas:
                detail = data['audio_detail']
                mp3_url = detail['mp3_play_url']
                duration = str(detail['duration']) + '秒'
                # 'size' is divided by 10**6 and shown with two decimals.
                size = '%.2fM' % (detail['size'] / (1000 * 1000))

                self.download_mp3(mp3_url)

                # Same order as self.row_title.
                row = [
                    detail['source_name'],
                    detail['title'],
                    detail['icon'],
                    detail['share_title'],
                    mp3_url,
                    duration,
                    size,
                ]
                self.count += 1
                Utils.write_execl(self.execl_f, self.sheet_table,
                                  self.count, row, '逻辑思维.xlsx')
                print('采集了{}条数据'.format(self.count))

            # The last item's timestamp is the cursor for the next page; an
            # unchanged cursor means the server has nothing newer to return.
            max_id = datas[-1]['publish_time_stamp']
            if self.max_id == max_id:
                print("数据抓取完毕")
                return False
            self.max_id = max_id
            return True

        def download_mp3(self, mp3_url, mp3_path="D:/Photo/mp3/"):
            """Download ``mp3_url`` into the directory ``mp3_path``.

            The file name is the last path segment of the URL with any query
            string removed. Responses other than HTTP 200 are not written to
            disk, so an error page is never saved as an ``.mp3`` file.

            :param mp3_url: URL of the audio file.
            :param mp3_path: target directory, created when missing.
            """
            file_name = mp3_url.split('?', 1)[0].split('/')[-1]
            if not file_name:
                print('无法从地址解析文件名: {}'.format(mp3_url))
                return
            if not os.path.exists(mp3_path):
                os.makedirs(mp3_path)
            audio = requests.get(mp3_url, timeout=self.REQUEST_TIMEOUT)
            if audio.status_code != 200:
                print('下载失败({}): {}'.format(audio.status_code, mp3_url))
                return
            with open(os.path.join(mp3_path, file_name), 'wb') as f:
                f.write(audio.content)
    
    
    if __name__ == '__main__':
        # Script entry point: build the crawler and start paging through the
        # audio list.
        crawler = DeDao()
        crawler.request_data()
    

三、获取公众号信息,文章

​ 1. 分析

​ 和抓取APP信息一样,只不过这里不怎么好弄,最终结果是点击公众号右上角联系人按钮,进入历史消息爬取的,其它方式,例如精选文章什么的,请求传入的数据不固定,这种方式的好处是只需要设置offset,偏移量即可,并且在每次请求后可以得到下一个请求的偏移量。

  2. 由于公众号爬取涉及号主的隐私,这不大好,并且经常爬取容易被封号,这里就不截运行效果图了,自己运行一下就看出来了。注意第一个请求的response不是json,可以从第二条请求开始分析,数据都会爬取到的。

  3. 这里请求头和数据解析与爬取APP信息类似,不同的地方是,base_url中的参数信息是有时效的,一般半个小时会更新一次。爬取APP信息时将信息存入了Excel文件中,这里就不做演示了,做法类似。

    # coding:utf-8
    import requests
    import json
    import time
    
    
    class GZH():
        """Crawl the history-message list of one WeChat official account.

        Pages are fetched from the ``profile_ext?action=getmsg`` endpoint with a
        URL and headers captured with Charles. Only the ``offset`` query
        parameter changes between requests; each response supplies the offset
        of the next page.
        """

        # Seconds to wait for the server before giving up on a request.
        REQUEST_TIMEOUT = 30
        # Seconds to pause between two consecutive page requests.
        PAGE_DELAY = 2

        def __init__(self):
            # The single '{}' placeholder is filled with self.offset.
            # NOTE(review): key / pass_ticket / appmsg_token come from one
            # captured session; presumably they expire - confirm and refresh.
            self.base_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzA4NTQwNDcyMA==&f=json&offset={}&count=10&is_ok=1&scene=124&uin=MjI3NjA3NTMyNA%3D%3D&key=c98d6c02144b06270885a670c2a286663f9642c3ff72f373a00f06810301c8c7a7f3cc6229ddc696d9bccda804f946faf49bdb9c864015d943c50daa854219b3590115d9427bc059598cedb40e9d4613&pass_ticket=lYcbXQqfbHyz0ho29nS7V4jaOV82KM5wZk3wD53mBIPfs5kdYJSOVhwkuIWc18P9&wxtoken=&appmsg_token=960_DqPqoyT1gBzGPQoYtXXQ7vvGbGfE1hZOfXdaDw~~&x5=0&f=json"
            self.headers = {
                'Host': 'mp.weixin.qq.com',
                'Connection': 'keep-alive',
                'Accept': '*/*',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400',
                'X-Requested-With': 'XMLHttpRequest',
                'Referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzA4NTQwNDcyMA==&scene=124&uin=MjI3NjA3NTMyNA%3D%3D&key=80b590d5e3a259312a4b1997f955cf49f1face919a96bf8f306fd5f9319a4cfe97dcce3de77d021ef4c31c24bb796ab3bdca5915daa97fd8450d32a29b328129fc54f66dfa544ea2e003f294d4fb0b32&devicetype=Windows+10&version=6206021b&lang=zh_CN&a8scene=7&pass_ticket=lYcbXQqfbHyz0ho29nS7V4jaOV82KM5wZk3wD53mBIPfs5kdYJSOVhwkuIWc18P9&winzoom=1',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4',
                'Cookie': 'rewardsn=; wxtokenkey=777; wxuin=2276075324; devicetype=Windows10; version=6206021b; lang=zh_CN; pass_ticket=lYcbXQqfbHyz0ho29nS7V4jaOV82KM5wZk3wD53mBIPfs5kdYJSOVhwkuIWc18P9; wap_sid2=CLzOqL0IElxLNWE4dV9EQURLc3JOdU9WLTBlR2Vaa0lHQ1phc1p5ekNfWlZwVmVkeVdpcFJrZkZjU2hNX3RPd3dDeDg4S2Joa3JTblFVWkZaYW9FX3RQRTZGN1Q0c0FEQUFBfjDrj+XYBTgNQJVO',
            }
            self.offset = 10  # offset of the next page to request

        def request_data(self):
            """Fetch and process pages until parse_data() reports the end.

            Pages are requested in a loop (not by mutual recursion with
            parse_data), so the call stack stays flat however long the history
            is. Any error is printed and ends the crawl.
            """
            while True:
                try:
                    response = requests.get(
                        self.base_url.format(self.offset),
                        headers=self.headers,
                        timeout=self.REQUEST_TIMEOUT)
                    if response.status_code != 200:
                        print(response.status_code)
                        return
                    has_more = self.parse_data(response.text)
                except Exception as e:
                    # Script-level boundary: report the problem and stop.
                    print(e)
                    return
                if not has_more:
                    return
                # Pause between pages to keep the request rate low.
                time.sleep(self.PAGE_DELAY)

        def parse_data(self, jsonText):
            """Print the articles of one page and advance ``self.offset``.

            :param jsonText: JSON body of one ``getmsg`` response.
            :return: True when another page should be requested, False when the
                server reported an error (``ret`` != 0) or the page was empty.
            :raises ValueError: if ``jsonText`` or the embedded message list is
                not valid JSON.
            :raises KeyError: if an expected top-level key is missing.
            """
            datas = json.loads(jsonText)
            print(datas['ret'])
            if datas['ret'] != 0:
                print("数据抓取错误")
                return False

            self.offset = datas['next_offset']
            # 'general_msg_list' is itself a JSON-encoded string.
            result = json.loads(datas['general_msg_list'])['list']
            for data in result:
                try:
                    info = data['app_msg_ext_info']
                    title = info['title']
                    digest = info['digest']
                    # Fixed: this used to read the 'digest' key a second time.
                    content_url = info['content_url']
                    cover = info['cover']
                except (KeyError, TypeError) as e:
                    # Entries without article info are reported and skipped.
                    print(e)
                    continue
                print('title:{} digest:{} content_url:{} cover:{}'.format(
                    title, digest, content_url, cover))
            print('***************************************************')

            if not result:
                # An empty page is treated as the end of the history so the
                # crawl terminates instead of requesting forever.
                print("数据抓取完毕")
                return False
            return True
    
    
    if __name__ == '__main__':
        # Script entry point: build the crawler and start paging through the
        # account's history messages.
        crawler = GZH()
        crawler.request_data()

四、只要Python基础扎实,有思路,就能做到。

        代码传送门

展开阅读全文

没有更多推荐了,返回首页