微信公众号、微信号、微信文章爬虫(搜狗搜索)
微信上的内容很难直接爬取，幸好我找到了一个神器：搜狗微信搜索（weixin.sogou.com）！
这个能不能弄点东西出来呢?
搜索结果页的链接很规整：https://weixin.sogou.com/weixin?query=关键字&page=页码
嗯哼，我似乎发现了规律：只要替换 query 和 page 两个参数，就能逐页抓取搜索结果。
直接上代码啦
import requests
import re
import openpyxl
# Sogou WeChat search endpoint; the query string is supplied through `params`
# so that the search term is percent-encoded correctly.
_SEARCH_URL = "https://weixin.sogou.com/weixin"
# Result pages to download (1..19, the range the script has always used).
_PAGES = range(1, 20)
# Raw HTML of all downloaded pages is dumped here before parsing.
_DUMP_PATH = "alldata.txt"
# Destination workbook.
_XLSX_PATH = "data.xlsx"
# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10
# Each search result is assumed to start at this tag.
_ENTRY_MARKER = "<p class=\"tit\">"
# CJK ideographs; used to strip markup from name and description fields.
_CJK_CHARS = re.compile('[\u4e00-\u9fa5]')


def find_all(sub, s):
    """Return every index at which ``sub`` occurs in ``s`` (overlaps included).

    Returns ``-1`` (not an empty list) when ``sub`` does not occur at all;
    callers must check for that value before treating the result as a list.
    """
    positions = []
    index = s.find(sub)
    while index != -1:
        positions.append(index)
        index = s.find(sub, index + 1)
    return positions if positions else -1


def _cut(segment, start_marker, skip, end_marker, trim=0):
    """Return ``segment[start + skip : end - trim]`` located by two markers.

    ``start`` and ``end`` are the first occurrences of the markers in
    ``segment``.  A missing start marker yields ``""`` (instead of slicing
    from a meaningless ``-1 + skip`` offset); a missing end marker means
    "up to the end of the segment".
    """
    start = segment.find(start_marker)
    if start == -1:
        return ""
    end = segment.find(end_marker)
    end = len(segment) if end == -1 else max(end - trim, 0)
    return segment[start + skip:end]


def _parse_account(segment):
    """Extract ``(gzh, wx, gnjs, ewm)`` from the HTML of one search result.

    gzh  -- official-account name: the CJK characters before "微信号".
    wx   -- WeChat ID: text after the label tag up to the first "</label>".
    gnjs -- description: CJK characters between "功能介绍" and "最近文章".
    ewm  -- QR-code link: text after "微信扫一扫关注" up to "data-id".

    NOTE(review): the fixed offsets (31, 5, 47, 2) skip markup that follows
    each marker; they presumably match the page layout at the time the
    script was written -- confirm against the live markup.
    """
    name_end = segment.find("微信号")
    name_source = segment if name_end == -1 else segment[:name_end]
    gzh = "".join(_CJK_CHARS.findall(name_source))
    wx = _cut(segment, "微信号:<label name=", 31, "</label>")
    gnjs = "".join(_CJK_CHARS.findall(_cut(segment, "功能介绍", 5, "最近文章")))
    ewm = _cut(segment, "微信扫一扫关注", 47, "data-id", trim=2)
    return gzh, wx, gnjs, ewm


def parse_accounts(html):
    """Split ``html`` at every result marker and parse each result.

    Returns a list of ``(gzh, wx, gnjs, ewm)`` tuples, one per
    ``<p class="tit">`` occurrence, or ``[]`` when there is none.  The last
    result (from the final marker to the end of ``html``) is included.
    """
    starts = find_all(_ENTRY_MARKER, html)
    if starts == -1:
        return []
    bounds = starts + [len(html)]
    return [_parse_account(html[a:b]) for a, b in zip(bounds, bounds[1:])]


def _download_pages(search, headers):
    """Download all result pages for ``search`` into the dump file as UTF-8."""
    with open(_DUMP_PATH, "w", encoding="utf-8") as dump:
        for page in _PAGES:
            response = requests.get(
                _SEARCH_URL,
                params={"query": search, "page": page},
                headers=headers,
                timeout=_REQUEST_TIMEOUT,
            )
            dump.write(response.content.decode("utf-8"))


def _save_workbook(rows):
    """Write a header row plus ``rows`` to the first sheet of a new workbook."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(['公众号', '微信', '功能介绍', '二维码链接'])
    for row in rows:
        sheet.append(row)
    workbook.save(_XLSX_PATH)


def oh_my_god(search="帅哥", user_agent=""):
    """Search Sogou WeChat for ``search`` and export the results to Excel.

    Side effects: overwrites ``alldata.txt`` (raw HTML of every downloaded
    page) and ``data.xlsx`` (one row per parsed result) in the current
    working directory.

    Args:
        search: keyword to search for.
        user_agent: value of the ``User-Agent`` request header.  It defaults
            to an empty string; supply a real browser string yourself.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        UnicodeDecodeError: if a response body is not valid UTF-8.
    """
    headers = {'User-Agent': user_agent}
    _download_pages(search, headers)
    # Parse what was actually stored on disk, so the dump file and the
    # workbook always describe the same text.
    with open(_DUMP_PATH, encoding="utf-8") as dump:
        html = dump.read()
    _save_workbook(parse_accounts(html))
if __name__ == '__main__':
    # Script entry point: read a search keyword from stdin and pass it on.
    oh_my_god(search=input('输入关键字'))