网上很多都是通过搜狗微信来爬取的,但是很多公众号搜索不到,用公众号平台的接口来获取更方便。
话不多说,直接上代码
import re
import time
import random
import xlrd
import requests
from bs4 import BeautifulSoup
def getheaders():
    """Build a requests-compatible headers dict with a randomly picked User-Agent.

    Rotating the User-Agent on every request makes the crawler look less
    like a single automated client.
    """
    user_agent_pool = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    )
    return {'User-Agent': random.choice(user_agent_pool)}
def open_excel(file):
    """Read every row of the first sheet of an Excel workbook.

    Parameters
    ----------
    file : str
        Path to the .xls/.xlsx workbook (note: recent xlrd versions only
        read .xls).

    Returns
    -------
    list[list]
        One list of cell values per row, in sheet order.
    """
    workbook = xlrd.open_workbook(file)
    sheet = workbook.sheet_by_index(0)
    # Comprehension instead of append loop; also avoids shadowing the
    # builtin `list`, which the original did.
    return [sheet.row_values(i) for i in range(sheet.nrows)]
def get_wenzhang(query, extra=None):
    """Crawl the first page of articles of a WeChat Official Account.

    Uses the mp.weixin.qq.com backend search API (requires a logged-in
    platform cookie) to resolve the account's ``fakeid``, then lists its
    articles and fetches each article page.

    Parameters
    ----------
    query : str
        Official Account name to search for.
    extra :
        Reserved/ignored. Added (with a default) because the ``__main__``
        block calls this function with a second Excel column; the original
        one-parameter signature raised TypeError there.

    Raises
    ------
    ValueError
        If the account search returns no results.
    """
    # NOTE(review): the original called an undefined `opendb()` here (NameError)
    # and the fetched content was never persisted; the DB code is removed.
    #
    # requests' `cookies` argument must be a dict (or a CookieJar), not a
    # string. The original also had a trailing comma that turned the value
    # into a 1-tuple. Fill in your own mp.weixin.qq.com session cookies:
    cookies = {}  # TODO: {'cookie_name': 'cookie_value', ...}

    # Hitting the home page while logged in redirects to a URL carrying the
    # session token, which every API call below requires.
    home = requests.get('https://mp.weixin.qq.com', headers=getheaders(), cookies=cookies)
    token = re.findall(r'token=(\d+)', str(home.url))[0]

    # Resolve the account name to its internal fakeid.
    search_params = {
        'action': 'search_biz',
        'begin': '0',
        'count': '5',
        'query': query,
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1'
    }
    search_req = requests.get('https://mp.weixin.qq.com/cgi-bin/searchbiz?',
                              cookies=cookies, headers=getheaders(), params=search_params)
    accounts = search_req.json().get('list')
    if not accounts:
        # Original indexed [0] unconditionally and raised IndexError here.
        raise ValueError('no official account found for query: %s' % query)
    fakeid = accounts[0].get('fakeid')

    begin = 0
    num = 1
    keep_going = True  # renamed: original shadowed the builtin `bool`
    while keep_going:
        list_params = {
            'action': 'list_ex',
            'begin': str(begin),
            'count': '5',
            'fakeid': fakeid,
            'type': '9',
            'query': '',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1'
        }
        list_req = requests.get('https://mp.weixin.qq.com/cgi-bin/appmsg?',
                                params=list_params, headers=getheaders(), cookies=cookies)
        articles = list_req.json().get('app_msg_list')
        if not articles:
            break
        for article in articles:
            aid = article.get('aid')
            link = article.get('link')
            published = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(int(article.get('create_time'))))
            print(aid)
            print(published)

            # Fetch and parse the article page itself.
            res = requests.get(link, headers=getheaders())
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')
            title_tag = soup.find(class_='rich_media_title')
            title = '' if title_tag is None else title_tag.text.replace('\n', '').replace(' ', '')
            # Make lazy-loaded images display (data-src -> src) and crudely
            # neutralize links by renaming the href attribute.
            content = str(soup.find(class_='rich_media_content')) \
                .replace('visibility: hidden;', '') \
                .replace('data-src', 'src') \
                .replace('href', 'javascript:void(0)')
            if 'video' in content:
                print('==============有视频, 不要================')
            else:
                print('=============正在获取获取第%s篇=================' % num)
                num += 1
        keep_going = False  # 获取第一页
        # begin每翻一次页 +5
        # if begin > 5:
        #     break
        # begin = int(begin) + 5
if __name__ == '__main__':
    # 想要爬取的公众号名称放在excel表里 (one account name per row, first column).
    # The original passed a censored/invalid argument and shadowed the
    # builtin `list`; it also passed two arguments to a one-parameter
    # function (TypeError).
    rows = open_excel('gzh_list.xlsx')  # TODO: path to your Excel file
    for row in rows:
        print(row[0])
        get_wenzhang(row[0])
    # get_wenzhang('经济观察报')
新手刚开始学python
大家有什么问题,可以在评论区交流讨论