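# Fetch the page with the requests module.
# Match the fields with the re module.
# Store the results with the xlwt module.
# The approach is fairly simple; I am a beginner who wrote this for fun,
# and I hope it is useful to others.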
import re
import time
from urllib.parse import quote

import requests
import xlwt
# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
}
# Weibo hot-search summary page, fetched with requests.
url1 = 'https://s.weibo.com/top/summary'

# Sample fragments of the page that the patterns below are written against:
#   <td class="td-01 ranktop">1</td>
#   <a href="/weibo?q=%E9%82%93%E4%BC%A6%E7%B2%89%E4%B8%9D%E8%84%B1%E7%B2%89&Refer=top" target="_blank">邓伦粉丝脱粉</a>
#   <span>4691966</span>
# Resulting search link:
#   https://s.weibo.com/weibo?q=%E9%82%93%E4%BC%A6%E7%B2%89%E4%B8%9D%E8%84%B1%E7%B2%89&Refer=top
RANK_RE = re.compile(r'<td class="td-01 ranktop">(.*?)</td>', re.S)
TITLE_RE = re.compile(r'target="_blank">(.*?)</a>', re.S)
COUNT_RE = re.compile(r'<span>(.*?)</span>', re.S)

# The last two target="_blank" links on the page are not hot-search entries.
TRAILING_EXTRA_LINKS = 2


def fetch_page(url=url1, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Page to download; defaults to the hot-search summary page.
        timeout: Seconds to wait before giving up on the request.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def build_search_url(title):
    """Return the Weibo search URL for the topic ``#title#``.

    The title is percent-encoded so that spaces, ``&``, ``#`` and non-ASCII
    characters cannot break the query string.
    """
    return f'https://s.weibo.com/weibo?q=%23{quote(title, safe="")}%23&Refer=top'


def parse_hot_search(html):
    """Extract hot-search rows from the summary page HTML.

    Args:
        html: Text of the summary page.

    Returns:
        A list of ``(rank, title, heat, url)`` string tuples. The lists are
        paired by position and truncated to the shortest one, so mismatched
        lengths no longer raise ``IndexError``.
    """
    ranks = RANK_RE.findall(html)
    # Drop the trailing links that are not hot-search entries.
    titles = TITLE_RE.findall(html)[:-TRAILING_EXTRA_LINKS]
    counts = COUNT_RE.findall(html)
    # NOTE(review): the three patterns are matched independently, so pairing
    # by position assumes every listed row carries a rank, a title and a heat
    # value -- confirm against the live page.
    return [
        (rank, title, heat, build_search_url(title))
        for rank, title, heat in zip(ranks, titles, counts)
    ]


def save_rows(rows, path='d:/123.xls'):
    """Write *rows* to an .xls workbook at *path*, below a header row.

    Args:
        rows: Iterable of ``(rank, title, heat, url)`` tuples.
        path: Destination file for the workbook.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('微博热搜')
    for col, heading in enumerate(('目录', '标题', '热度值', '网址')):
        sheet.write(0, col, heading)
    # Data starts on row 1; row 0 holds the headings.
    for row_index, row in enumerate(rows, start=1):
        for col, value in enumerate(row):
            sheet.write(row_index, col, value)
    workbook.save(path)


def main():
    """Fetch the hot-search page, parse it and save the result to Excel."""
    rows = parse_hot_search(fetch_page())
    # To verify the scrape, uncomment:
    # print(rows)
    save_rows(rows)


if __name__ == '__main__':
    main()