"""
io密集,多线程请求网页,或则读写文件,计算时间,计算密集
join所完成的工作就是线程同步,即主线程任务结束之后,进入阻塞状态,一直等待其他的子线程执行结束之后,主线程在终止,例子见下面三
join有一个timeout参数:
当设置守护线程时,含义是主线程对于子线程等待timeout的时间将会杀死该子线程,最后退出程序。所以说,
如果有10个子线程,全部的等待时间就是每个timeout的累加和。简单的来说,就是给每个子线程一个timeout的时间,让他去执行,时间一到,不管任务有没有完成,直接杀死。
没有设置守护线程时,主线程将会等待timeout的累加和这样的一段时间,时间一到,主线程结束,但是并没有杀死子线程,子线程依然可以继续执行,直到子线程全部结束,程序退出
"""
from simple_spider_rule.settings import tasks,headers
import time
import requests
import logging
import threading
import multiprocessing
# Module-level setup.
# NOTE: the original passed filename="" to basicConfig; an empty (falsy)
# filename makes logging fall back to a stderr StreamHandler anyway, so we
# drop the argument and make that behavior explicit.
logging.basicConfig(level=logging.INFO)

# Shared task queue: filled once by get_some() and drained by the ordinary /
# threaded / process-based fetchers below. A multiprocessing.Queue so the
# ProcessRequest worker processes can also read from it.
qa = multiprocessing.Queue()
def get_some(clist, q=None):
    """Load every item of *clist* into a queue and return that queue.

    Args:
        clist: iterable of tasks (here: URL strings from settings).
        q: destination queue; defaults to the module-level ``qa`` so existing
           callers (``get_some(tasks)``) keep their original behavior.

    Returns:
        The queue the items were put into.
    """
    if q is None:
        q = qa  # backward-compatible fallback to the shared global queue
    for item in clist:
        q.put(item)
    return q
def defined_requetsts(url, headers):
    """Fetch *url* with the given request *headers* and return the body text.

    Propagates whatever ``requests`` raises on connection errors or timeouts;
    callers are expected to catch those.
    """
    # 10-second timeout so a stuck server cannot block a worker forever.
    return requests.get(url=url, headers=headers, timeout=10).text
class MyRequests(object):
    """Base class for the three fetch strategies.

    Subclasses override :meth:`fetcher`; :meth:`run` invokes it and logs the
    value it returns (the elapsed time, where the subclass measures one).
    """

    def __init__(self):
        # Lengths of the fetched response bodies, one entry per success.
        self.len_data = list()
        # Shared queue pre-filled with the task URLs from settings.
        self.q = get_some(tasks)

    def fetcher(self):
        # To be provided by subclasses.
        pass

    def run(self):
        elapsed = self.fetcher()
        logging.info(f"一个线程七条数据所用时间为{elapsed}")
class OrdinaryRequests(MyRequests):
    """Sequential baseline: a single thread drains the whole queue."""

    def __init__(self):
        super().__init__()

    def fetcher(self):
        """Fetch every queued URL one after another; return elapsed seconds."""
        started = time.time()
        while self.q.qsize():
            url = self.q.get()
            try:
                body = defined_requetsts(url=url, headers=headers)
            except Exception:
                # Best-effort: skip the failing URL, keep draining the queue.
                logging.debug("请求出现异常")
                continue
            self.len_data.append(len(body))
        return time.time() - started
class ThreadRequests(MyRequests):
    """Drain the shared queue with a small pool of worker threads.

    I/O-bound work overlaps well with threads because the GIL is released
    while a request is blocked on the network.
    """

    def __init__(self):
        super().__init__()

    def fetcher(self):
        """Worker loop: pull URLs off the shared queue until it is empty."""
        while self.q.qsize():
            one_task = self.q.get()
            try:
                data = defined_requetsts(url=one_task, headers=headers)
            except Exception:
                logging.debug("请求出现异常")
                # BUG FIX: the original used `break` here, which killed the
                # whole worker thread on the first failed request. `continue`
                # skips only the bad URL, matching the behavior of
                # OrdinaryRequests and ProcessRequest.
                continue
            self.len_data.append(len(data))

    def for_range(self):
        """Start 4 worker threads, wait for all of them, return elapsed seconds."""
        threads = []
        threadNum = 4
        start_time = time.time()
        for _ in range(0, threadNum):
            # Pass args=(urlQueue,) to threading.Thread if the target needs
            # arguments (args is a tuple).
            threads.append(threading.Thread(target=self.fetcher))
        for oneth in threads:
            # oneth.setDaemon(True) would make these daemon threads (killed
            # when the main thread exits); the default is non-daemon.
            oneth.start()
        for oneth in threads:
            # join() blocks until the thread finishes; it does NOT turn the
            # thread into a daemon (the original comment was misleading).
            oneth.join()
        end_time = time.time()
        return end_time - start_time

    def run(self):
        cdata = self.for_range()
        logging.info(f"四个线程七条数据,所用时间为{cdata}")
class ProcessRequest(MyRequests):
    """Drain the shared queue with one worker process per CPU core.

    NOTE(review): each child process works on a *copy* of ``self``, so the
    ``self.len_data.append(...)`` in :meth:`fetcher` mutates the child's copy
    and the lengths never reach the parent. Use a
    ``multiprocessing.Manager().list()`` or a result queue if those values are
    actually needed. The shared ``multiprocessing.Queue`` of tasks, however,
    is correctly visible to all processes.
    """

    def fetcher(self):
        """Worker loop: pull URLs off the process-shared queue until empty."""
        while self.q.qsize():
            one_task = self.q.get()
            try:
                data = defined_requetsts(url=one_task, headers=headers)
            except Exception:
                logging.debug("请求出现异常")
                continue
            self.len_data.append(len(data))

    def process_get(self):
        """Start cpu_count() worker processes, join them all, return elapsed seconds."""
        workers = []
        startTime = time.time()
        for _ in range(multiprocessing.cpu_count()):
            workers.append(multiprocessing.Process(target=self.fetcher))
        for pro in workers:
            pro.start()
        for pro in workers:
            # BUG FIX: the original guarded join() with is_alive(), which is
            # racy — a process that finished between start() and the check was
            # never joined (left un-reaped). join() on an already-finished
            # process returns immediately, so call it unconditionally.
            pro.join()
        endTime = time.time()
        return endTime - startTime

    def run(self):
        atime = self.process_get()
        # Typo fix in the log message: "进程程" -> "进程". Note the count is
        # cpu_count(), which may not actually be four.
        logging.info(f"四个进程七条数据,所用时间为{atime}")
if __name__ == '__main__':
    # Run the three strategies back to back, in the original order:
    # threaded, sequential baseline, then multi-process.
    for strategy in (ThreadRequests, OrdinaryRequests, ProcessRequest):
        strategy().run()
# 执行结果 (execution results — omitted in the original).
#
# 注 (note): the following lines were bare text / pasted settings content and
# made the file a SyntaxError; they are preserved here as a comment. The
# values imported from simple_spider_rule.settings are:
#
# tasks = ["http://www.gdep.gov.cn/wgk/jc/201901/t20190104_247520.html",
#          "http://www.gdep.gov.cn/wgk/jc/201808/t20180830_242337.html",
#          "http://www.gdep.gov.cn/wgk/jc/201808/t20180830_242343.html",
#          "http://www.gdep.gov.cn/wgk/jc/201808/t20180809_241534.html",
#          "http://www.gdep.gov.cn/wgk/jc/201807/t20180724_240933.html",
#          "http://www.gdep.gov.cn/wgk/jc/201807/t20180730_241073.html",
#          "http://www.gdep.gov.cn/wgk/jc/201811/t20181113_245623.html", ]
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/"
#                   "537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
# }