yield 关键字(生成器函数)
个人使用的主要目的是,让获取到的每一行数据都保存到文件中
yield的优点:节约内存,能够一行一行地产出数据,而不必先把全部结果存进列表
将获取到的数据,一行一行地返回
for i in items:# shape each regex match into the record that is handed back
number = i.group('number')
title = i.group('title')
# Hand back one record per match instead of collecting them all in a list.
yield{
# NOTE: the key is spelled 'mumber' (sic); the consuming loop reads the same spelling.
'mumber':number,
'title':title
}
a = 0# row counter used while consuming the records produced by yield
for i in message:
# Column 0 receives the rank text, column 1 the title, both on row a.
worksheet.write(a,0,i['mumber'])
worksheet.write(a,1,i['title'])
a = a + 1
xlsxwriter库
简单好上手,我的常用命令
# Create a workbook file named 微博.xlsx
workbook = xlsxwriter.Workbook('微博.xlsx')
# Add a worksheet (sheet) named 排行榜 to the workbook just created
worksheet = workbook.add_worksheet('排行榜')
# Write one cell: row a, column 0, the value to store
worksheet.write(a,0,i['mumber'])
workbook.close()# close the workbook
完整代码
#@Time:2021/7/22 22:04
#@Author: xsir161
import requests,re
import xlsxwriter,os
#微博热搜爬取
def main():
    """Scrape the Weibo hot-search page and yield one record per ranked entry.

    This is a generator: the HTTP request and the regex scan only run once
    the caller starts iterating over the returned object.

    Yields:
        dict: ``{'mumber': <rank text>, 'title': <entry title>}``. The key
        ``'mumber'`` (sic) is kept unchanged because ``writer`` looks the
        rank up under that exact spelling.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the timeout below expires.
    """
    url = 'https://s.weibo.com/top/summary'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # Bound the wait so a stalled connection cannot hang the scrape forever.
    response = requests.get(url, headers=headers, timeout=10).text
    # The pinned "icon-top" headline is deliberately not extracted: its value
    # was never used, and looking it up raised AttributeError on any page
    # without a pinned entry, aborting the whole scrape.
    rule_2 = re.compile(r'class="".*?td-01 ranktop">(?P<number>.*?)<.*?target="_blank">(?P<title>.*?)</a', re.S)
    for i in rule_2.finditer(response):
        yield {
            'mumber': i.group('number'),
            'title': i.group('title'),
        }
#将爬取到的热搜数据写入 Excel 文件
def writer(message):
    """Write scraped records into the '排行榜' sheet of '微博.xlsx'.

    Any existing '微博.xlsx' in the current directory is deleted first.
    Each record becomes one row: column 0 holds ``record['mumber']`` and
    column 1 holds ``record['title']``, starting at row 0.

    Args:
        message: Iterable of dicts carrying the keys ``'mumber'`` and
            ``'title'``. It is consumed lazily, one record per row.
    """
    # Remove a stale copy without a separate existence check (no
    # check-then-delete race).
    try:
        os.remove('微博.xlsx')
    except FileNotFoundError:
        pass
    # The context manager closes the workbook even when iterating `message`
    # raises (it may be a network-backed generator), so the rows written so
    # far are saved and the workbook is not leaked.
    with xlsxwriter.Workbook('微博.xlsx') as workbook:
        worksheet = workbook.add_worksheet('排行榜')
        for row, record in enumerate(message):
            worksheet.write(row, 0, record['mumber'])
            worksheet.write(row, 1, record['title'])
if __name__ == '__main__':
    # Feed the lazy record generator straight into the spreadsheet writer.
    writer(main())