# 按某一关键字爬取相关的微信公众号
# -*- coding: utf-8 -*-
"""
日期:2019-6-29
作者:SuperMary
功能:获取某一关键词的公众号
"""
import requests
import random
import time
import warnings
# Request parameters used to imitate a browser session.
# Endpoint that every search request is sent to.
URL = "https://mp.weixin.qq.com/cgi-bin/searchbiz"
# Value sent verbatim as the "Cookie" request header.
# NOTE(review): this looks like a captured login session containing account
# identifiers and tickets; it presumably expires and probably should not be
# committed to source control -- confirm and consider loading it from the
# environment instead.
Cookie = "noticeLoginFlag=1; remember_acct=349257312%40qq.com; ua_id=1N57cCDXn9T5pQJOAAAAAKQDrCK1DVt_NYTV1_XNW0w=; pgv_pvi=9636206592; noticeLoginFlag=1; mm_lang=zh_CN; pgv_pvid=4703926742; pac_uid=0_0ee28f94e380a; openid2ticket_oW3PwswIGm2EcjSUjE5CjlRnK2LU=h7cRCcAHvtk9AKaK4NKFvo5xY5EFrCxFkcW1daAuuqs=; remember_acct=sujing0913%40163.com; ptui_loginuin=349257312@qq.com; RK=AQSVwvn4a5; ptcz=f3101f74771270cd300f377ed9f4c0a1471815466c27992782ad39e7af8b19d5; tvfe_boss_uuid=576abc8408ab5c22; pgv_si=s6497839104; uuid=b161d801457244ea83c67f8eb37983dc; data_bizuin=3018879835; bizuin=3257228887; data_ticket=N3HbSVHlS+2XBgzjC2F9Bg3I7NHEUZJHg0mR9PMxOAao39d9+82Q694hoomwXeyP; slave_sid=Q2J2Z204NUJmdXpmajUzQm1INEd2SVFwWkZGU3F5OXQ2Nkx1cTBtVUt2XzFHTTJmbWlkMVBJNjZjZFRMNHFJaEVQWkFoN3lHWXk5R1ZuVWtHQ2NlNlZuUV83ZDFRdHRzdEV1bEdIa3pvSzBmUFdaaXJ6QXZCVTNrR2NSbTZUVWx0eEUxUVJFRzB6dWF3RjN4; slave_user=gh_e9824d9bf4d9; xid=5abb4eb1762673d6b1b5a01c78982e28"
# Value sent verbatim as the "User-Agent" request header.
User_Agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
# Value sent as the "token" query parameter.
# NOTE(review): presumably tied to the login session above -- confirm.
Token = "1486999690"
class WxLinkList:
    """Search for WeChat official accounts matching a keyword and save them.

    Every account found is printed and appended, one per line, to a text
    file as ``公众号:<nickname>, FakeID:<fakeid>``.

    Args:
        query_str: Keyword sent as the ``query`` parameter of each search.
        num: Maximum number of accounts to collect.
        filename: Path of the output file; it is opened in append mode.
        min_wait: Lower bound (whole seconds) of the random pause that
            precedes every request. Defaults to 10.
        max_wait: Upper bound (whole seconds) of that pause. Defaults to 65.
    """

    # Number of results requested per page (the "count" request parameter).
    PAGE_SIZE = 10
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self, query_str, num, filename, min_wait=10, max_wait=65):
        self.num = num
        self.query = query_str
        self.filename = filename
        self.min_wait = min_wait
        self.max_wait = max_wait
        # Silences ALL warnings process-wide; presumably meant to hide the
        # warning that requests emits for verify=False -- confirm.
        warnings.filterwarnings('ignore')

    def get_public_info(self, begin_num):
        """Fetch one page of search results for ``self.query``.

        Args:
            begin_num: Zero-based offset of the first result to return.

        Returns:
            The value stored under the ``list`` key of the JSON response.

        Raises:
            requests.RequestException: On a network failure or timeout.
            ValueError: If the response body is not valid JSON.
            KeyError: If the response has no ``list`` key; the message
                carries the decoded response to show what came back instead.
        """
        headers = {
            "Cookie": Cookie,
            "User-Agent": User_Agent,
        }
        params_public = {
            "action": "search_biz",
            "token": Token,
            "lang": "zh_CN",
            "charset": "GBK",
            "f": "json",
            "ajax": "1",
            "begin": str(begin_num),
            "count": str(self.PAGE_SIZE),
            "query": self.query,
        }
        # NOTE(review): verify=False disables TLS certificate checking while
        # the session cookie is being sent; kept as in the original so that
        # existing setups keep working, but it should be reconsidered.
        response = requests.get(
            URL,
            headers=headers,
            params=params_public,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,  # never hang forever on a dead link
        )
        content = response.json()
        try:
            return content["list"]
        except KeyError:
            raise KeyError(
                "response has no 'list' field: %r" % (content,)
            ) from None

    def get_all_public(self):
        """Collect up to ``self.num`` accounts and append them to the file.

        Pages are requested one after another, each preceded by a random
        pause. Collection stops early when a page comes back empty or when
        fetching or processing a page fails; in the failure case the error
        is printed and one more pause is taken before returning.
        ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted.
        """
        page_size = self.PAGE_SIZE
        # Ceiling division so that a trailing partial page is still fetched
        # (e.g. num=15 needs two pages, num=5 needs one).
        page_total = (self.num + page_size - 1) // page_size
        written = 0
        # UTF-8 so that any nickname can be stored regardless of the locale;
        # the with-block guarantees the file is closed on every exit path.
        with open(self.filename, 'a', encoding='utf-8') as out_file:
            for page_index in range(page_total):
                print("\n第%s页:" % (page_index + 1))
                self._pause()
                try:
                    accounts = self.get_public_info(page_index * page_size)
                    if not accounts:
                        # An empty page means there is nothing left to fetch.
                        break
                    # Never store more than the requested total.
                    for account in accounts[:self.num - written]:
                        temp_info = "公众号:%s, FakeID:%s" % (
                            account["nickname"],
                            account["fakeid"],
                        )
                        print(temp_info)
                        out_file.write(temp_info + '\n')
                        written += 1
                except Exception as err:
                    # Best effort: report why the crawl stopped, keep what
                    # was already written, cool down, then give up.
                    print("第%s页获取失败,停止爬取:%r" % (page_index + 1, err))
                    self._pause()
                    break

    def _pause(self):
        """Announce and sleep a random number of seconds in [min_wait, max_wait]."""
        wait_time = random.randint(self.min_wait, self.max_wait)
        print("等待%s秒。。。。。。" % wait_time)
        time.sleep(wait_time)
if __name__ == "__main__":
    # Initial wait before doing anything: restarting the program frequently
    # could be detected by the official-account platform as a malicious
    # attack and lead to an account ban or a suspended search feature.
    start_wait_time = 20
    print("等待%i秒,避免频繁操作被封号" % start_wait_time)
    # Sleep for the announced duration (was a separate hard-coded 20, which
    # would silently diverge from the message if the variable were changed).
    time.sleep(start_wait_time)
    # Crawl keyword, total number of accounts to fetch, and output file path.
    query_content = "Python"
    total_num = 10
    filename = './Python_list.txt'
    # Create the crawler instance and run the main routine.
    test = WxLinkList(query_content, total_num, filename)
    test.get_all_public()