用 threading 模块编写一个简单的多线程爬虫，并与单线程爬虫对比爬取速度。
import requests
import re
import threading
import time
# Single-threaded crawl
def spider(url, headers):
    """Crawl one listing page sequentially and print each movie's rank and title.

    Fetches ``url``, collects the link that follows every
    ``<div class="pic">`` element, then downloads those detail pages one
    after another and prints ``rank : title`` for each.

    Args:
        url: Address of the listing page to crawl.
        headers: Mapping of HTTP request headers sent with every request.
    """
    # ``headers`` has to be passed by keyword: the second positional
    # parameter of ``requests.get`` is ``params`` (the query string).
    response = requests.get(url, headers=headers).text
    link_pattern = re.compile('<div class="pic">.*?<a href="(.*?)">', re.S)
    # Compile the detail-page patterns once instead of once per link.
    rank_pattern = re.compile('<span class="top250-no">(.*?)</span>', re.S)
    title_pattern = re.compile('<span property="v:itemreviewed">(.*?)</span>', re.S)
    for link in link_pattern.findall(response):
        html = requests.get(link, headers=headers).text
        num = rank_pattern.findall(html)
        title = title_pattern.findall(html)
        if not num or not title:
            # Expected markup not found on this page; report and move on
            # instead of failing with IndexError on ``[0]``.
            print('skipped (rank/title not found):', link)
            continue
        print(num[0], ':', title[0])
# Multi-threaded crawl (three threads)
# NOTE: no code visible in this file acquires this lock.
lock = threading.RLock() # re-entrant lock shared at module level
# Fetch the rank and title of a single movie
def infoSpider(link, headers):
    """Download one movie detail page and print its rank and title.

    Args:
        link: URL of the movie's detail page.
        headers: Mapping of HTTP request headers sent with the request.
    """
    # ``headers`` has to be passed by keyword: the second positional
    # parameter of ``requests.get`` is ``params`` (the query string).
    html = requests.get(link, headers=headers).text
    rank_pattern = re.compile('<span class="top250-no">(.*?)</span>', re.S)
    title_pattern = re.compile('<span property="v:itemreviewed">(.*?)</span>', re.S)
    num = rank_pattern.findall(html)
    title = title_pattern.findall(html)
    if not num or not title:
        # Expected markup not found on this page; report it instead of
        # failing with IndexError on ``[0]`` (which would end the calling
        # worker's loop early).
        print('skipped (rank/title not found):', link)
        return
    print(num[0], ':', title[0])
def A(linkList, headers):
    """Worker for the first thread: crawl the links at indices 0, 3, 6, ... below 25.

    Args:
        linkList: Sequence of detail-page URLs taken from one listing page.
        headers: HTTP request headers forwarded to ``infoSpider``.
    """
    # NOTE: wrapping this loop in the module-level ``lock`` would make the
    # three workers run one after another, so no lock is taken here.
    # Slicing keeps the same every-third-link split as indexing over
    # range(0, 25, 3) but does not raise IndexError when fewer than 25
    # links were found.
    for url in linkList[0:25:3]:
        infoSpider(url, headers)
def B(linkList, headers):
    """Worker for the second thread: crawl the links at indices 1, 4, 7, ... below 25.

    Args:
        linkList: Sequence of detail-page URLs taken from one listing page.
        headers: HTTP request headers forwarded to ``infoSpider``.
    """
    # NOTE: wrapping this loop in the module-level ``lock`` would make the
    # three workers run one after another, so no lock is taken here.
    # Slicing keeps the same every-third-link split as indexing over
    # range(1, 25, 3) but does not raise IndexError when fewer than 25
    # links were found.
    for url in linkList[1:25:3]:
        infoSpider(url, headers)
def C(linkList, headers):
    """Worker for the third thread: crawl the links at indices 2, 5, 8, ... below 25.

    Args:
        linkList: Sequence of detail-page URLs taken from one listing page.
        headers: HTTP request headers forwarded to ``infoSpider``.
    """
    # NOTE: wrapping this loop in the module-level ``lock`` would make the
    # three workers run one after another, so no lock is taken here.
    # Slicing keeps the same every-third-link split as indexing over
    # range(2, 25, 3) but does not raise IndexError when fewer than 25
    # links were found.
    for url in linkList[2:25:3]:
        infoSpider(url, headers)
def spider2(url, headers):
    """Crawl one listing page using three worker threads.

    Fetches ``url``, collects the link that follows every
    ``<div class="pic">`` element, and hands the full link list to the
    workers ``A``, ``B`` and ``C``, each of which processes every third link.
    Returns only after all three threads have finished.

    Args:
        url: Address of the listing page to crawl.
        headers: Mapping of HTTP request headers sent with every request.
    """
    # ``headers`` has to be passed by keyword: the second positional
    # parameter of ``requests.get`` is ``params`` (the query string).
    response = requests.get(url, headers=headers).text
    pattern = re.compile('<div class="pic">.*?<a href="(.*?)">', re.S)
    linkList = pattern.findall(response)

    workers = [
        threading.Thread(target=worker, args=(linkList, headers))
        for worker in (A, B, C)
    ]
    # Start every thread before joining any, so the downloads overlap.
    for thread in workers:
        thread.start()
    for thread in workers:
        thread.join()
def main():
    """Time the single-threaded and the three-threaded crawl over the same pages.

    Crawls the first three listing pages with ``spider`` and then again with
    ``spider2``, and prints the elapsed seconds of each run (single-threaded
    first, multi-threaded second).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # Both runs crawl the same three listing pages (25 entries per page).
    urls = ['https://movie.douban.com/top250?start=%d' % (page * 25) for page in range(3)]

    # Single-threaded run.
    # perf_counter() is used for interval timing: unlike time.time() it is
    # not affected by system clock adjustments.
    start1 = time.perf_counter()
    for url in urls:
        spider(url, headers)
    end1 = time.perf_counter()

    # Multi-threaded run.
    start2 = time.perf_counter()
    for url in urls:
        spider2(url, headers)
    end2 = time.perf_counter()

    print(end1 - start1)  # elapsed seconds, single-threaded
    print(end2 - start2)  # elapsed seconds, multi-threaded


if __name__ == '__main__':
    main()
三线程爬取速度基本为单线程的三倍（即耗时约为单线程的三分之一）。