import requests
import threading
import time
import queue as Queue
# List of target URLs. These are placeholders; in a real run this list
# would hold a large number of URLs to crawl.
link_list = [
    'http://www.baidu.com',
    'http://www.qq.com',
    'http://www.xxx.com',
    'http://www.sogou.com',
    'http://www.dsds.com',
    '...',
]

# Wall-clock start time, used at the end to report total crawl duration.
start = time.time()
class myThread(threading.Thread):
    """Worker thread that repeatedly pulls URLs from a shared queue and
    fetches each one via crawler() until the queue runs dry."""

    def __init__(self, name, q):
        """
        name: thread name, used in log output.
        q:    shared Queue.Queue holding the URLs to fetch.
        """
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print("Starting " + self.name)
        while True:
            try:
                # crawler() lets Queue.Empty propagate when q.get(timeout=2)
                # finds no URL within 2 seconds -- that is the exit signal.
                crawler(self.name, self.q)
            except Queue.Empty:
                # Queue exhausted: this worker is done.
                # (The original bare `except:` swallowed EVERY exception,
                # hiding real bugs and KeyboardInterrupt; catch only the
                # intended termination signal.)
                break
        print("Exiting " + self.name)
def crawler(threadName, q):
    """Take one URL from the queue and fetch it, logging the outcome.

    Lets Queue.Empty (from q.get) propagate when no URL arrives within
    2 seconds; the worker threads use that as their shutdown signal.
    """
    # Block for up to 2 seconds waiting for a URL from the shared queue.
    url = q.get(timeout=2)
    try:
        response = requests.get(url, timeout=20)
    except Exception as exc:
        # Log: remaining queue size, thread name, and the failure.
        print(q.qsize(), threadName, "Error: ", exc)
    else:
        # Log: remaining queue size, thread name, HTTP status, fetched URL.
        print(q.qsize(), threadName, response.status_code, url)
# Five worker-thread names.
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
# Bounded work queue of URLs (capacity 300).
workQueue = Queue.Queue(300)
# Thread pool.
threads = []

# Fill the queue BEFORE starting the workers: in the original order a
# scheduling stall longer than the workers' 2-second get() timeout could
# make them exit before any URL was enqueued.
for url in link_list:
    workQueue.put(url)

# Create and start the worker threads.
for tName in threadList:
    thread = myThread(tName, workQueue)
    thread.start()
    threads.append(thread)

# Wait for every worker to drain the queue and exit.
for t in threads:
    t.join()

end = time.time()
print('Queue多线程爬虫总时间为:', end - start)
# ---------------------
# 作者:大蛇王 (Author: 大蛇王)
# 来源:CSDN (Source: CSDN)
# 原文:https://blog.csdn.net/t8116189520/article/details/81914362
# 版权声明:本文为博主原创文章,转载请附上博文链接!