前言
题目的具体分析,可以查看上一篇文章《Glidedsky系列—爬虫基础》。
直接贴源码
import re
import threading
import requests
import time
import queue
from lxml import etree
# Browser-like request headers so the target site serves normal pages
# instead of rejecting the crawler.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
}
# Wall-clock start time; used at the bottom of the script to report
# the total crawl duration.
start = time.time()
# Worker thread: repeatedly pulls page URLs from the shared queue,
# sums the numbers on each page via detail_page(), and appends its
# partial sum to the module-level num_list when the queue is empty.
class myThread(threading.Thread):
    def __init__(self, name, q):
        """
        :param name: thread name (used only in the start/exit log lines)
        :param q: queue.Queue holding the page URLs to crawl
        """
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print("Starting " + self.name)
        total = 0  # renamed from `sum` to avoid shadowing the builtin
        while True:
            try:
                url = self.q.get_nowait()
            except queue.Empty:
                # Queue is drained -- this worker is done. The original
                # bare `except:` also swallowed network/parse errors from
                # detail_page, silently dropping pages; catching only
                # queue.Empty lets real errors surface.
                break
            total += detail_page(url)
        # Each thread stores its partial sum; list.append is atomic under
        # the GIL, and sum(num_list) at the end yields the grand total.
        num_list.append(total)
        print("Exiting " + self.name)
def detail_page(url):
    """Fetch one listing page and return the sum of the numbers on it.

    Uses the module-level session `s` and `headers`.

    :param url: page URL taken from the work queue
    :return: sum of all integers found in the page's col-md-1 cells
    """
    res = s.get(url=url, headers=headers).text
    page_text = etree.HTML(res)
    items = page_text.xpath('//div[@class="col-md-1"]/text()')
    # strip() already removes surrounding whitespace including '\n',
    # so the original extra replace('\n', '') was redundant.
    return sum(int(i.strip()) for i in items)
# Fetch the login page and extract the CSRF token required by the login form.
def get_token(url):
    """
    :param url: login page URL (fetched through the module-level session `s`)
    :return: csrf-token value from the page's <meta> tag
    :raises ValueError: if the page contains no csrf-token meta tag
    """
    resp = s.get(url)
    match = re.search(r'<meta name="csrf-token" content="(.*?)">', resp.text)
    if match is None:
        # Clear failure message instead of the opaque IndexError that
        # re.findall(...)[0] raised when the token was missing.
        raise ValueError("csrf-token not found in page: " + url)
    return match.group(1)
def login(s):
    """Authenticate the given requests.Session against glidedsky.

    :param s: session object; cookies set by the POST persist on it
    """
    login_url = "http://glidedsky.com/login"
    payload = {
        '_token': get_token(login_url),  # CSRF token scraped from the form page
        'email': '账号',
        'password': "密码",
    }
    s.post(url=login_url, data=payload)
# --- script entry: log in, fan out 4 workers over 1000 pages, report ---
s = requests.Session()
login(s)

# Fill the work queue with the 1000 page URLs to crawl.
workQueue = queue.Queue(1000)
for page in range(1, 1001):
    workQueue.put('http://glidedsky.com/level/web/crawler-basic-2?page={}'.format(page))

# Each worker appends its partial sum here; summed at the very end.
num_list = []

# Create the 4 worker threads, then start them.
threads = [myThread("Thread-" + str(n), q=workQueue) for n in range(1, 5)]
for worker in threads:
    worker.start()
# Block until every worker has drained the queue.
for worker in threads:
    worker.join()

print('1000页数字和为:{}'.format(sum(num_list)))
end = time.time()
print("多线程爬虫耗时:{} s".format(end - start))
总结
因为我是在图书馆,网络波动比较大,正常网络下实际耗时应该会更短一点;不过相比于单线程,可以看到多线程的速度直接提高了一倍。