1. Python Multithreaded Crawler
When crawling data in bulk, a single thread is often inefficient; this is where multithreading helps. Python supports multithreading, mainly through the _thread and threading modules (in Python 3 the old thread module was renamed _thread; threading is the higher-level interface you normally use).
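As a quick illustration before the crawler examples, here is a minimal threading sketch (the greet function and its "task" names are made up for demonstration): it starts several threads against one target function and waits for them with join():

import threading

def greet(name):
    # Each call runs in its own worker thread
    print("Hello from", name, "on", threading.current_thread().name)

threads = [threading.Thread(target=greet, args=("task-%d" % i,)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for every worker before the main thread continues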
A single-threaded crawler is comparatively slow. For example:
import requests
from bs4 import BeautifulSoup
import time

start_time = time.time()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

def main():
    s = requests.Session()  # reuse one session so connections are pooled
    for i in range(1, 6):
        url = "https://so.csdn.net/so/search/s.do?p=" + str(i) + "&q=python"
        html = s.get(url, headers=headers)
        html.encoding = "utf-8"
        soup = BeautifulSoup(html.text, "html.parser")
        # Every <a> inside a div.limit_width holds one search result
        for div in soup.find_all('div', class_='limit_width'):
            for tag in div.find_all('a'):
                text = tag.get_text()
                href = tag["href"]
                if "CSDN" not in text:
                    print(text)
                    print(href)

main()
end = time.time()
print("Time-Cost:%s" % (end - start_time))

# Sample output:
# ......
# Time-Cost:2.061112642288208
Now let's run the same crawl with multiple threads. Because the crawler spends most of its time waiting on network I/O, and Python releases the GIL during those waits, several threads can overlap their downloads:
# coding=utf-8
import threading
import queue
import time
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

baseUrl = "https://so.csdn.net/so/search/s.do?p="
urlQueue = queue.Queue()
for i in range(1, 6):
    urlQueue.put(baseUrl + str(i) + "&q=python")

def fetchUrl(urlQueue):
    s = requests.Session()  # one session per worker thread
    while True:
        try:
            # Non-blocking read; raises queue.Empty once the queue is drained
            url = urlQueue.get_nowait()
        except queue.Empty:
            break
        try:
            html = s.get(url, headers=headers)
            html.encoding = "utf-8"
            soup = BeautifulSoup(html.text, "html.parser")
            # Every <a> inside a div.limit_width holds one search result
            for div in soup.find_all('div', class_='limit_width'):
                for tag in div.find_all('a'):
                    text = tag.get_text()
                    href = tag["href"]
                    if "CSDN" not in text:
                        print(text)
                        print(href)
            print("Page crawled!")
        except Exception:
            pass
        # Further processing of the scraped content could go here;
        # to exaggerate the timing difference, add a delay:
        # time.sleep(1)

if __name__ == '__main__':
    startTime = time.time()
    print("Main thread:", threading.current_thread().name)
    threads = []
    # Adjust the thread count to control the crawl speed
    threadNum = 5
    for i in range(threadNum):
        t = threading.Thread(target=fetchUrl, args=(urlQueue,))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        # Calling join() on each worker in turn guarantees the main thread
        # exits last, without the workers blocking one another
        t.join()
    endTime = time.time()
    print("Main thread finished:", threading.current_thread().name)
    print('Done, Time cost: %s' % (endTime - startTime))
# Sample output:
# Main thread: MainThread
# Python游戏开发入门
# https://edu.csdn.net/course/detail/5690
# Python, Python, Python
# https://blog.csdn.net/ww_great/article/details/3057071
# ......
# Page crawled!
# Main thread finished: MainThread
# Done, Time cost: 0.7241780757904053
If we set threadNum = 2, i.e., have only two worker threads draining the queue, the crawl slows down considerably: rerunning gives Time cost: 1.3654978275299072.
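To compare thread counts more systematically, one could wrap the setup above in a small benchmark. This is only a sketch: it assumes the fetchUrl function, headers, and baseUrl defined earlier, and it rebuilds the queue for each run since fetchUrl drains it:

import queue
import threading
import time

def crawl_with(threadNum):
    # Rebuild the URL queue, since each run drains it completely
    urlQueue = queue.Queue()
    for i in range(1, 6):
        urlQueue.put(baseUrl + str(i) + "&q=python")
    start = time.time()
    threads = [threading.Thread(target=fetchUrl, args=(urlQueue,)) for _ in range(threadNum)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return time.time() - start

for n in (1, 2, 5):
    print("%d thread(s): %.2f s" % (n, crawl_with(n)))

Past a certain point, adding threads stops helping: only five pages are queued here, and the target site may throttle concurrent requests.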