爬虫新手,边学边用,尝试着爬取百度热搜榜前50的数据,将数据以CSV文件格式保存下来,并以爬取时间作为文件名保存。
(一)代码
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import csv
import datetime
# Baidu realtime hot-search board ("top buzz") page to scrape.
url="http://top.baidu.com/buzz?b=1&fr=topindex"
# Accumulates one [rank, title, heat] row per entry; filled by get_rank(),
# consumed by save_rank().
result=[]
def get_rank(driver):
    """Scrape the Baidu hot-search page and append rows to the global ``result``.

    Each appended row is ``[rank, title, heat]`` (all BeautifulSoup strings,
    any of which may be ``None`` if the cell is absent).

    Args:
        driver: a Selenium WebDriver instance used to fetch the page.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    # Every ranking entry lives in a <tr>; header/filler rows have no rank span.
    for row in soup.find_all('tr'):
        # Top entries use class 'num-top', the rest 'num-normal' — a class
        # list matches either, replacing the original duplicated branches.
        rank = row.find('span', {'class': ['num-top', 'num-normal']})
        if rank is None:
            continue  # not a ranking row
        title = row.find('a', {'class': 'list-title'})
        heat = row.find('span', {'class': ["icon-fall", "icon-rise"]})
        # Guard missing cells so one malformed row can't abort the scrape
        # with an AttributeError on .string.
        result.append([
            rank.string,
            title.string if title is not None else None,
            heat.string if heat is not None else None,
        ])
def save_rank(result):
    """Write the scraped rows to a CSV file named after the current time.

    The file is named ``YYYYMMDD_HHMM.csv`` and starts with a header row.

    Args:
        result: iterable of ``[rank, title, heat]`` rows.

    Returns:
        The name of the CSV file that was written.
    """
    headers = ['排行', '标题', '热度']
    now_time = datetime.datetime.now().strftime('%Y%m%d_%H%M')
    # File name carries the scrape time, e.g. "20240101_0930.csv".
    filename = now_time + '.csv'
    # 'w' (not 'a'): re-running within the same minute would otherwise append
    # a duplicate header and duplicate rows to the same file.
    # utf-8-sig writes a BOM so Excel detects the encoding of Chinese text.
    with open(filename, 'w', newline="", encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        writer.writerows(result)
    return filename
def main():
    """Run the scrape end-to-end: launch Chrome, collect rows, save the CSV."""
    driver = webdriver.Chrome()
    # The original also built an unused WebDriverWait(driver, 10); dropped.
    try:
        get_rank(driver)
        save_rank(result)
    finally:
        # Always close the browser, even if the scrape raises, so the
        # chromedriver process is not leaked.
        driver.quit()
# Entry-point guard: lets the module be imported (e.g. for testing)
# without launching a browser.
if __name__ == "__main__":
    main()
(二)结果