Non-accelerated
The non-accelerated crawler fetches the news pages one at a time. It collects the data in order, but is relatively slow.
Full program
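The roll-news page fills its list (the #d_list container) in with JavaScript, which is why the crawler below drives a headless browser and waits for that element instead of just downloading the HTML. A quick illustration of the difference, assuming the page still renders its list client-side:

import requests
from bs4 import BeautifulSoup

# Fetch the raw HTML without executing any JavaScript
resp = requests.get('https://news.sina.com.cn/roll/')
soup = BeautifulSoup(resp.text, 'html.parser')
d_list = soup.find(id='d_list')
# The news items are injected client-side, so the static HTML has little
# or nothing to parse here; a rendering browser is needed to see them
print(d_list)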
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import os
from time import time
def getinfo(html, news):
    """Extract news entries from the page source."""
    soup = BeautifulSoup(html, 'html.parser')  # parse the HTML text
    titleL = soup.find_all('a', attrs={'target': '_blank'})  # all links that open in a new tab
    timeL = soup.find_all('span', attrs={'class': 'c_time'})  # all timestamp spans
    for i in range(len(titleL) - 1):  # drop the last item, which is not a news link
        try:
            title = titleL[i].get_text()  # news headline
            link = titleL[i]['href']  # article URL
            pub_time = timeL[i].get_text()  # publication time (avoid shadowing the imported time())
            news.append([pub_time, title, link])  # record the entry
        except Exception:
            continue
def gethtml(url):
    """Fetch the rendered page source with a headless browser."""
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # run Chrome without a window
        browser = webdriver.Chrome(options=options)
        browser.get(url)
        wait = WebDriverWait(browser, 10)  # wait at most 10 seconds
        wait.until(EC.presence_of_element_located((By.ID, 'd_list')))  # until the news list is rendered
        text = browser.page_source  # grab the rendered source
        browser.quit()  # quit() (not close()) so the driver process is released
        return text
    except Exception:
        return ''
if __name__ == '__main__':
    start_time = time()
    if not os.path.exists('news_scrape_test'):
        os.mkdir('news_scrape_test')
    print('Ready, starting to crawl....', end=' ')
    url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page='
    n = 8
    news = [['time', 'title', 'link']]
    for i in range(n):
        page_url = url + str(i)  # build each page URL from the base; do not overwrite url itself
        html = gethtml(page_url)
        getinfo(html, news)
        print('\rCrawling....{:0.2f}% done'.format((i + 1) * 100 / n), end=' ')
    df = pd.DataFrame(news)
    df.to_excel(r'news_scrape_test\news.xlsx', header=False, index=False)
    end_time = time()
    print('\rSina news crawl finished! Total time: {:0.2f}s'.format(end_time - start_time))
Multiprocessing acceleration
The accelerated version uses a process pool to crawl several pages at once (the code uses multiprocessing, not threads), with each worker writing its results to its own file. This is considerably faster.
Full program
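One caveat before the listing: Pool.apply_async discards any exception raised in a worker unless an error_callback is attached, so a failed page would simply produce no file and no message. A minimal sketch of the pattern, with an illustrative task function:

from multiprocessing import Pool

def task(i):
    # Stand-in for a per-page crawl; one page fails on purpose
    if i == 3:
        raise ValueError('page {} failed'.format(i))
    return i

def report_error(exc):
    # Runs in the parent process whenever a worker raises
    print('worker failed:', exc)

if __name__ == '__main__':
    pool = Pool(4)
    for i in range(8):
        pool.apply_async(task, args=(i,), error_callback=report_error)
    pool.close()
    pool.join()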
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from multiprocessing import Pool
import pandas as pd
def runfunc(url, i):
    """Crawl one page and save its results to a separate file."""
    news = [['time', 'title', 'link']]
    filename = 'page' + str(i + 1) + 'news.xlsx'
    url = url + str(i)
    html = gethtml(url)
    getinfo(html, news)
    df = pd.DataFrame(news)
    df.to_excel(filename, header=False, index=False)
def getinfo(html, news):
    """Extract news entries from the page source."""
    soup = BeautifulSoup(html, 'html.parser')
    titleL = soup.find_all('a', attrs={'target': '_blank'})
    timeL = soup.find_all('span', attrs={'class': 'c_time'})
    for i in range(len(titleL) - 1):  # drop the last item, which is not a news link
        try:
            title = titleL[i].get_text()
            link = titleL[i]['href']
            pub_time = timeL[i].get_text()
            news.append([pub_time, title, link])
        except Exception:
            continue
def gethtml(url):
    """Fetch the rendered page source with a headless browser."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run Chrome without a window
    browser = webdriver.Chrome(options=options)
    browser.get(url)
    wait = WebDriverWait(browser, 10)  # wait at most 10 seconds
    wait.until(EC.presence_of_element_located((By.ID, 'd_list')))
    text = browser.page_source
    browser.quit()  # release the browser; otherwise every worker leaks a Chrome process
    return text
if __name__ == '__main__':
    pool = Pool(8)  # one worker per page
    url = 'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page='
    n = 8
    for i in range(n):
        pool.apply_async(runfunc, args=(url, i))
    pool.close()  # no more tasks will be submitted
    pool.join()   # block until all workers finish
    print('Sina news crawl finished!')