使用多线程加速你的爬虫
核心部分
1.用thread = threading.Thread(target = ,args = (,))将代码块创建为一个线程
2.用thread.start()开启这个线程
代码块
运行几次后会触发百度的安全验证机制(经测试,使用手机热点可以完美地绕过百度安全验证)
from bs4 import BeautifulSoup
import threading
import requests
import time
def getSoup(url, timeout=10):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits on the server before raising;
            pass ``None`` to wait indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser from the
        response body decoded as UTF-8.
    """
    # Without a timeout a stalled connection would block the calling thread
    # forever.
    resp = requests.get(url, timeout=timeout)
    html = resp.content.decode("utf-8")
    return BeautifulSoup(html, features="lxml")
# The "pn" query parameter pages through the forum in steps of 50.
def getUrls(begin, end, step=50):
    """Build the list-page URLs whose ``pn`` offsets cover ``[begin, end)``.

    Args:
        begin: First ``pn`` offset (inclusive).
        end: Upper bound of the ``pn`` offsets (exclusive).
        step: Distance between consecutive offsets; defaults to 50.

    Returns:
        A list of URL strings, one per offset in ``range(begin, end, step)``.
    """
    base = "https://tieba.baidu.com/f?kw=%E5%8D%97%E4%BA%AC%E4%BF%A1%E6%81%AF%E8%81%8C%E4%B8%9A%E6%8A%80%E6%9C%AF%E5%AD%A6%E9%99%A2&ie=utf-8&pn="
    return [base + str(pn) for pn in range(begin, end, step)]
def printTiezi(url):
    """Print the title of every thread listed on the forum page at ``url``.

    Titles are taken from the ``<a>`` elements that match the ``j_th_tit``
    class together with the ``target``/``rel`` attributes used below.
    Anchors whose ``.string`` is ``None`` or the literal ``'0'`` are skipped.

    Args:
        url: Address of the list page to fetch and scan.
    """
    soup = getSoup(url)
    infos = soup.find_all(
        "a",
        {"target": "_blank", "rel": "noreferrer", "class": "j_th_tit"},
    )
    for info in infos:
        title = info.string
        # None is a singleton, so test it by identity rather than equality.
        if title is not None and title != '0':
            print(title)
# Core of the multi-threaded version.
def multi_thead(urls):
    """Run ``printTiezi`` on every URL concurrently, one thread per URL.

    Args:
        urls: Iterable of page URLs; each one is handed to its own thread.

    Returns:
        None. The call blocks until every started thread has finished.
    """
    threads = [
        threading.Thread(target=printTiezi, args=(url,))
        for url in urls
    ]
    for thread in threads:
        thread.start()
    # Join only after every thread has been started so the fetches overlap.
    for thread in threads:
        thread.join()
if __name__ == '__main__':
    # Multi-threaded run.
    multi_start = time.time()
    urls = getUrls(0, 1200)
    multi_thead(urls)  # fetch every page in its own thread
    multi_end = time.time()

    # Single-threaded run over the same URLs, for comparison.
    start = time.time()
    for url in urls:
        printTiezi(url)
    # Take the end time right after the loop so the summary prints below
    # are not counted in the single-threaded figure.
    end = time.time()

    print("\n开启的线程数字 = %d" % len(urls))
    print("\n多线程执行的时间为: %f s" % (multi_end - multi_start))
    # `%` must directly follow the format string; a comma in between is a
    # SyntaxError.
    print("\n单线程执行的时间为: %f s" % (end - start))
运行结果
开启的线程数字 = 24
多线程执行的时间为: 7.995203256607056 s
单线程执行的时间为: 28.473376512527466 s
结论:由于开启的线程数不多、运行的任务量不大,所以差距不是十分明显(当线程数开得多的时候就可以体现多线程的威力了)