import requests
import threading
import time
import queue as Queue
# List of target URLs. These are placeholders; in a real run this list
# would hold a large number of URLs to crawl.
link_list = [
    'http://www.baidu.com',
    'http://www.qq.com',
    'http://www.xxx.com',
    'http://www.sogou.com',
    'http://www.dsds.com',
    '...',
]

# Wall-clock start time, used at the end to report total crawl duration.
start = time.time()
class myThread(threading.Thread):
    """Worker thread that repeatedly pulls URLs from a shared queue and
    fetches each one via crawler() until the queue runs dry."""

    def __init__(self, name, q):
        """
        name: thread name, used in log output.
        q:    shared Queue.Queue holding the URLs to fetch.
        """
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print("Starting " + self.name)
        while True:
            try:
                # crawler() lets Queue.Empty propagate when q.get(timeout=2)
                # finds no URL within 2 seconds -- that is the exit signal.
                crawler(self.name, self.q)
            except Queue.Empty:
                # Queue exhausted: this worker is done.
                # (The original bare `except:` swallowed EVERY exception,
                # hiding real bugs and KeyboardInterrupt; catch only the
                # intended termination signal.)
                break
        print("Exiting " + self.name)
def crawler(threadName, q):
    """Take one URL from the queue and fetch it, logging the outcome.

    Lets Queue.Empty (from q.get) propagate when no URL arrives within
    2 seconds; the worker threads use that as their shutdown signal.
    """
    # Block for up to 2 seconds waiting for a URL from the shared queue.
    url = q.get(timeout=2)
    try:
        response = requests.get(url, timeout=20)
    except Exception as exc:
        # Log: remaining queue size, thread name, and the failure.
        print(q.qsize(), threadName, "Error: ", exc)
    else:
        # Log: remaining queue size, thread name, HTTP status, fetched URL.
        print(q.qsize(), threadName, response.status_code, url)
# Five worker-thread names.
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
# Bounded work queue of URLs (capacity 300).
workQueue = Queue.Queue(300)
# Thread pool.
threads = []

# Fill the queue BEFORE starting the workers: in the original order a
# scheduling stall longer than the workers' 2-second get() timeout could
# make them exit before any URL was enqueued.
for url in link_list:
    workQueue.put(url)

# Create and start the worker threads.
for tName in threadList:
    thread = myThread(tName, workQueue)
    thread.start()
    threads.append(thread)

# Wait for every worker to drain the queue and exit.
for t in threads:
    t.join()

end = time.time()
print('Queue多线程爬虫总时间为:', end - start)
# ---------------------
# 作者:大蛇王 (Author: 大蛇王)
# 来源:CSDN (Source: CSDN)
# 原文:https://blog.csdn.net/t8116189520/article/details/81914362
# 版权声明:本文为博主原创文章,转载请附上博文链接!