Scraping WeChat Official Account Articles with Python

Most tutorials online scrape through Sogou's WeChat search, but many official accounts simply don't show up there. Using the interface of the WeChat Official Accounts Platform (mp.weixin.qq.com) is more convenient.
Without further ado, here is the code.

import re
import time
import random

import xlrd
import requests
from bs4 import BeautifulSoup


# Return a request header with a randomly chosen User-Agent
def getheaders():
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {'User-Agent': UserAgent}
    return headers

def open_excel(file):
    # Read every row of the first sheet into a list (one sub-list per row)
    data = xlrd.open_workbook(file)
    table = data.sheet_by_index(0)
    nrows = table.nrows
    rows = []
    for i in range(nrows):
        rows.append(table.row_values(i))
    return rows
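
# Optional alternative (not part of the original script): xlrd 2.0+ can only read
# legacy .xls files. If the list of account names lives in an .xlsx file, a rough
# equivalent using openpyxl could look like this:
# from openpyxl import load_workbook
#
# def open_excel_xlsx(file):
#     wb = load_workbook(file, read_only=True)
#     ws = wb.worksheets[0]                                   # first sheet
#     return [list(row) for row in ws.iter_rows(values_only=True)]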

def get_wenzhang(query):
    # cursor, conn = opendb()  # opendb() is not defined in this snippet; see the CSV sketch at the end of the post for a simple way to persist results
    cookie_str = ****************  # cookies from the WeChat Official Accounts Platform after logging in, passed to requests as a dict
    # Request the platform homepage with the logged-in cookies; the token can then be pulled from the redirected URL
    url = 'https://mp.weixin.qq.com'
    req = requests.get(url, headers=getheaders(), cookies=cookie_str)
    token = re.findall(r'token=(\d+)', str(req.url))[0]
    # print(token)
    # Search the official account by name via the searchbiz interface to get its fakeid
    name_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    name_parmas = {
        'action': 'search_biz',
        'begin': '0',
        'count': '5',
        'query': query,
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1'
    }
    name_req = requests.get(name_url, cookies=cookie_str, headers=getheaders(), params=name_parmas)
    lists = name_req.json().get('list')[0]   # take the first match
    fakeid = lists.get('fakeid')
    begin = 0
    num = 1
    has_more = True
    while has_more:
        # Pull the article list for this account via the appmsg interface, 5 articles per page
        lj_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
        lj_parmas = {
            'action': 'list_ex',
            'begin': str(begin),
            'count': '5',
            'fakeid': fakeid,
            'type': '9',
            'query': '',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1'
        }
        lj_req = requests.get(lj_url, params=lj_parmas, headers=getheaders(), cookies=cookie_str)
        max_num = lj_req.json().get('app_msg_cnt')        # total number of articles, useful for paging
        article_lists = lj_req.json().get('app_msg_list')
        if not article_lists:
            break
        else:
            for article in article_lists:
                aid = article.get('aid')
                link = article.get('link')
                cover = article.get('cover')
                digest = article.get('digest')
                create_time = time.localtime(int(article.get('create_time')))
                publish_time = time.strftime("%Y-%m-%d %H:%M:%S", create_time)
                print(aid)
                print(publish_time)
                # Download the article page itself and parse out the title and body
                res = requests.get(link, headers=getheaders())
                res.encoding = 'utf-8'
                soup = BeautifulSoup(res.text, 'html.parser')
                title_tag = soup.find(class_='rich_media_title')
                if title_tag is None:
                    title = ''
                else:
                    title = title_tag.text.replace('\n', '').replace(' ', '')
                content_tag = soup.find(class_='rich_media_content')
                if content_tag is None:
                    continue
                # Make lazy-loaded images visible and disable outbound links
                content = str(content_tag).replace('visibility: hidden;', '').replace('data-src', 'src').replace('href', 'javascript:void(0)')
                if 'video' in content:
                    print('============== contains a video, skip ================')
                    # print(link)
                else:
                    print('============= fetching article %s =================' % num)
                    num += 1
        has_more = False  # only fetch the first page; see the pagination sketch after the code
        # To walk through every page, add 5 to begin after each request:
        # if begin > 5:
        #     break
        # begin = int(begin) + 5

if __name__ == '__main__':
    names = open_excel(**********************)  # the official account names to crawl, one per row in an Excel file
    for row in names:
        print(row[0])
        get_wenzhang(row[0])
    # get_wenzhang('经济观察报')
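
The loop above stops after the first page (has_more is set to False after one pass). To walk through every article, begin goes up by 5 per request and app_msg_cnt tells you how many articles the account has in total. Below is a minimal sketch of such a loop under those assumptions; the function name get_all_pages and the random sleep are my own additions, reusing the same appmsg parameters, token and cookies as above.

def get_all_pages(fakeid, token, cookie_str):
    # Page through the appmsg interface 5 articles at a time
    begin = 0
    while True:
        lj_parmas = {
            'action': 'list_ex', 'begin': str(begin), 'count': '5',
            'fakeid': fakeid, 'type': '9', 'query': '',
            'token': token, 'lang': 'zh_CN', 'f': 'json', 'ajax': '1'
        }
        lj_req = requests.get('https://mp.weixin.qq.com/cgi-bin/appmsg?',
                              params=lj_parmas, headers=getheaders(), cookies=cookie_str)
        data = lj_req.json()
        article_lists = data.get('app_msg_list')
        if not article_lists:
            break
        for article in article_lists:
            print(article.get('aid'), article.get('link'))
        begin += 5                                  # next page
        if begin >= data.get('app_msg_cnt', 0):     # no pages left
            break
        time.sleep(random.uniform(3, 6))            # pause between pages to avoid being rate-limited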

I'm a beginner who has just started learning Python.
If you have any questions, feel free to discuss them in the comments.
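
One more note: the cursor, conn = opendb() line in the original draft refers to a database helper that isn't shown here. If you just want to keep what you scrape, writing the fields to a CSV file is the simplest option. A minimal sketch, assuming you collect (aid, title, publish_time, link, digest) tuples inside get_wenzhang:

import csv

def save_articles(rows, path='articles.csv'):
    # Write one scraped article per row; utf-8-sig so Excel opens it cleanly
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['aid', 'title', 'publish_time', 'link', 'digest'])
        writer.writerows(rows)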
