网上很多都是通过搜狗微信来爬取的,但是很多公众号搜索不到,用公众号平台的接口来获取更方便。
话不多说,直接上代码
import re
import time
import random
import xlrd
import requests
from bs4 import BeautifulSoup
def getheaders():
    """Build a requests-compatible headers dict with a randomly picked User-Agent.

    Rotating the User-Agent on every request makes the crawler look less
    like a single automated client.
    """
    user_agent_pool = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    )
    return {'User-Agent': random.choice(user_agent_pool)}
def open_excel(file):
    """Read every row of the first sheet of an Excel workbook.

    Parameters
    ----------
    file : str
        Path to the .xls/.xlsx workbook (note: recent xlrd versions only
        read .xls).

    Returns
    -------
    list[list]
        One list of cell values per row, in sheet order.
    """
    workbook = xlrd.open_workbook(file)
    sheet = workbook.sheet_by_index(0)
    # Comprehension instead of append loop; also avoids shadowing the
    # builtin `list`, which the original did.
    return [sheet.row_values(i) for i in range(sheet.nrows)]
def get_wenzhang(query, extra=None):
    """Crawl the first page of articles of a WeChat Official Account.

    Uses the mp.weixin.qq.com backend search API (requires a logged-in
    platform cookie) to resolve the account's ``fakeid``, then lists its
    articles and fetches each article page.

    Parameters
    ----------
    query : str
        Official Account name to search for.
    extra :
        Reserved/ignored. Added (with a default) because the ``__main__``
        block calls this function with a second Excel column; the original
        one-parameter signature raised TypeError there.

    Raises
    ------
    ValueError
        If the account search returns no results.
    """
    # NOTE(review): the original called an undefined `opendb()` here (NameError)
    # and the fetched content was never persisted; the DB code is removed.
    #
    # requests' `cookies` argument must be a dict (or a CookieJar), not a
    # string. The original also had a trailing comma that turned the value
    # into a 1-tuple. Fill in your own mp.weixin.qq.com session cookies:
    cookies = {}  # TODO: {'cookie_name': 'cookie_value', ...}

    # Hitting the home page while logged in redirects to a URL carrying the
    # session token, which every API call below requires.
    home = requests.get('https://mp.weixin.qq.com', headers=getheaders(), cookies=cookies)
    token = re.findall(r'token=(\d+)', str(home.url))[0]

    # Resolve the account name to its internal fakeid.
    search_params = {
        'action': 'search_biz',
        'begin': '0',
        'count': '5',
        'query': query,
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1'
    }
    search_req = requests.get('https://mp.weixin.qq.com/cgi-bin/searchbiz?',
                              cookies=cookies, headers=getheaders(), params=search_params)
    accounts = search_req.json().get('list')
    if not accounts:
        # Original indexed [0] unconditionally and raised IndexError here.
        raise ValueError('no official account found for query: %s' % query)
    fakeid = accounts[0].get('fakeid')

    begin = 0
    num = 1
    keep_going = True  # renamed: original shadowed the builtin `bool`
    while keep_going:
        list_params = {
            'action': 'list_ex',
            'begin': str(begin),
            'count': '5',
            'fakeid': fakeid,
            'type': '9',
            'query': '',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1'
        }
        list_req = requests.get('https://mp.weixin.qq.com/cgi-bin/appmsg?',
                                params=list_params, headers=getheaders(), cookies=cookies)
        articles = list_req.json().get('app_msg_list')
        if not articles:
            break
        for article in articles:
            aid = article.get('aid')
            link = article.get('link')
            published = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(int(article.get('create_time'))))
            print(aid)
            print(published)

            # Fetch and parse the article page itself.
            res = requests.get(link, headers=getheaders())
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')
            title_tag = soup.find(class_='rich_media_title')
            title = '' if title_tag is None else title_tag.text.replace('\n', '').replace(' ', '')
            # Make lazy-loaded images display (data-src -> src) and crudely
            # neutralize links by renaming the href attribute.
            content = str(soup.find(class_='rich_media_content')) \
                .replace('visibility: hidden;', '') \
                .replace('data-src', 'src') \
                .replace('href', 'javascript:void(0)')
            if 'video' in content:
                print('==============有视频, 不要================')
            else:
                print('=============正在获取获取第%s篇=================' % num)
                num += 1
        keep_going = False  # 获取第一页
        # begin每翻一次页 +5
        # if begin > 5:
        #     break
        # begin = int(begin) + 5
if __name__ == '__main__':
    # 想要爬取的公众号名称放在excel表里 (one account name per row, first column).
    # The original passed a censored/invalid argument and shadowed the
    # builtin `list`; it also passed two arguments to a one-parameter
    # function (TypeError).
    rows = open_excel('gzh_list.xlsx')  # TODO: path to your Excel file
    for row in rows:
        print(row[0])
        get_wenzhang(row[0])
    # get_wenzhang('经济观察报')
新手刚开始学python
大家有什么问题,可以在评论区交流讨论