Python multithreading suits IO-bound work. Any task dominated by network or disk IO is IO-bound, and threads can noticeably speed it up by overlapping the waiting: multithreaded crawlers, multithreaded file processing, and so on. CPU-bound tasks, by contrast, gain little from threads, because CPython's GIL lets only one thread execute Python bytecode at a time.
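To make that contrast concrete, here is a minimal, hypothetical timing sketch (the task functions and iteration counts are illustrative, not part of the crawler below): a sleep stands in for a network request and scales almost perfectly across threads, while a pure-Python loop does not.

import threading, time

def io_task():
    time.sleep(1)  # stands in for waiting on a network response

def cpu_task():
    sum(i * i for i in range(5_000_000))  # pure computation, serialized by the GIL

def timed(task, n=4):
    threads = [threading.Thread(target=task) for _ in range(n)]
    start = time.time()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return time.time() - start

print('IO-bound, 4 threads:  %.2fs' % timed(io_task))   # ~1s total, not ~4s
print('CPU-bound, 4 threads: %.2fs' % timed(cpu_task))  # roughly serial time, no speedup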
The idea: put all the URLs on a queue, then have worker threads pull URLs off the queue with non-blocking reads and process them; once the queue is drained, every worker simply exits. The script below implements this.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# env: Python 3.x
import queue
import threading
import time

import requests
from bs4 import BeautifulSoup

# Scrape the listing page and put every detail-page URL on the queue
res = requests.get('http://land.fang.com/market/210100________1_0_1.html')
soup = BeautifulSoup(res.text, 'html.parser')
urlQueue = queue.Queue()
for message in soup.select('.list28_text'):
    url = 'http://land.fang.com' + message.select('a')[0]['href']
    urlQueue.put(url)

def fetchUrl(urlQueue):
    while True:
        try:
            url = urlQueue.get_nowait()  # non-blocking read from the queue
        except queue.Empty:
            break  # queue drained: this worker exits
        #print('Current Thread Name %s, Url: %s' % (threading.current_thread().name, url))
        try:
            detail = requests.get(url)  # fetch the detail page once
        except Exception:
            continue  # skip URLs that fail to download
        if detail.status_code == 200:
            # per-page data processing goes here
            soup1 = BeautifulSoup(detail.text, 'html.parser')
            messes = [mess.text for mess in soup1.select('.banbox tr td')]
            print(messes[1:3])
        #time.sleep(1)

if __name__ == '__main__':
    start = time.time()
    threads = []
    threadNum = 10
    for i in range(threadNum):
        t = threading.Thread(target=fetchUrl, args=(urlQueue,))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()  # wait for every worker to finish
    end = time.time()
    print('the total time is: %s' % (end - start))
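For comparison, the standard library's concurrent.futures expresses the same fan-out pattern with less boilerplate: the executor manages the internal work queue and worker threads, so the manual Queue and while-loop disappear. This is only a sketch, assuming a hypothetical list urls built the same way urlQueue was filled above.

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def fetch(url):
    detail = requests.get(url)
    if detail.status_code == 200:
        soup1 = BeautifulSoup(detail.text, 'html.parser')
        return [mess.text for mess in soup1.select('.banbox tr td')][1:3]

# urls: assumed to hold the same detail-page URLs collected from the listing page
with ThreadPoolExecutor(max_workers=10) as pool:
    for messes in pool.map(fetch, urls):  # results come back in input order
        print(messes)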