1.多线程,多进程和线程池
1.1 GIL
gil:global interpreter lock(cpython)全局解释器锁;
python中一个线程对应于c语言中的一个线程;
gil使得同一个时刻只有一个线程在一个cpu上执行字节码, 无法将多个线程映射到多个cpu上执行;
gil会根据执行的字节码行数以及时间片释放gil,gil在遇到io的操作时候主动释放。
1.2 多线程
对于io操作来说,多线程和多进程性能差别不大
import time
import threading
def get_detail_html(url):
    """Print a start marker, block for two seconds, then print an end marker.

    The sleep stands in for real work in these notes; ``url`` is accepted
    but never read.
    """
    pause_seconds = 2
    print("get detail html started")
    time.sleep(pause_seconds)
    print("get detail html end")
def get_detail_url(url):
    """Print a start marker, block for four seconds, then print an end marker.

    The sleep stands in for real work in these notes; ``url`` is accepted
    but never read.
    """
    pause_seconds = 4
    print("get detail url started")
    time.sleep(pause_seconds)
    print("get detail url end")
class GetDetailHtml(threading.Thread):
    """Thread whose body is a two-second sleep framed by two prints."""

    def __init__(self, name):
        # Hand the caller-supplied thread name straight to threading.Thread.
        super(GetDetailHtml, self).__init__(name=name)

    def run(self):
        """Executed in the new thread once start() is called."""
        pause_seconds = 2
        print("get detail html started")
        time.sleep(pause_seconds)
        print("get detail html end")
class GetDetailUrl(threading.Thread):
    """Thread whose body is a four-second sleep framed by two prints."""

    def __init__(self, name):
        # Hand the caller-supplied thread name straight to threading.Thread.
        super(GetDetailUrl, self).__init__(name=name)

    def run(self):
        """Executed in the new thread once start() is called."""
        pause_seconds = 4
        print("get detail url started")
        time.sleep(pause_seconds)
        print("get detail url end")
if __name__ == "__main__":
    # Run both demo threads concurrently and report the elapsed wall-clock time.
    workers = [
        GetDetailHtml("get_detail_html"),
        GetDetailUrl("get_detail_url"),
    ]
    start_time = time.time()
    for worker in workers:
        worker.start()
    # Block the main thread until every worker has finished.
    for worker in workers:
        worker.join()
    # Original note: when the main thread exits, the child threads are killed.
    # NOTE(review): that only applies to daemon threads; these threads are not
    # marked as daemon and are joined above.
    print("last time: {}".format(time.time() - start_time))
1.3线程间通信
最好通过queue的方式进行线程间通信
1.4线程同步
from threading import Lock, RLock, Condition #可重入的锁
# Within a single thread an RLock may be acquired several times in a row;
# the number of acquire() calls must equal the number of release() calls.
# Counter shared by the add() and desc() worker functions.
total = 0
# Re-entrant lock taken around every update of `total`.
lock = RLock()
def add():
    """Increment the module-level ``total`` one million times under ``lock``.

    The lock is taken twice per iteration; this only works because ``lock``
    is an RLock, which the owning thread may re-acquire.
    """
    # Outline kept from the original notes:
    # 1. dosomething1
    # 2. I/O operation
    # 3. dosomething3
    global total
    for _ in range(1000000):
        # ``with`` pairs every acquire with a release, even if the body raises,
        # so the lock can never be left held by mistake.
        with lock:
            with lock:
                total += 1
def desc():
    """Decrement the module-level ``total`` one million times under ``lock``."""
    global total
    for _ in range(1000000):
        # ``with`` releases the lock even if the body raises.
        with lock:
            total -= 1
import threading
# One thread increments the shared counter while the other decrements it.
thread1, thread2 = (threading.Thread(target=fn) for fn in (add, desc))
thread1.start()
thread2.start()
参考:
https://blog.csdn.net/ybdesire/article/details/80294638
条件变量condition:
https://www.cnblogs.com/yoyoketang/p/8337118.html
重点:wait(timeout): 线程挂起,直到收到一个notify通知或者超时(可选的,浮点数,单位是秒s)才会被唤醒继续运行。
Semaphore信号量:
Semaphore是用于控制同时进入数量的锁。以文件读写为例:写一般只允许一个线程写,读可以允许多个线程同时进行。
import threading
import time
class HtmlSpider(threading.Thread):
    """Worker thread that sleeps two seconds, prints, and releases a semaphore.

    Args:
        url: Stored on the instance; not read by ``run``.
        sem: Semaphore-like object whose ``release()`` is called when the
            thread's work is over.
    """

    def __init__(self, url, sem):
        super().__init__()
        self.url = url
        self.sem = sem

    def run(self):
        """Do the work, then always give the permit back."""
        try:
            time.sleep(2)
            print("got html text success")
        finally:
            # Release in ``finally`` so a failure in the body cannot leak the
            # permit and stall whoever is blocked in acquire().
            self.sem.release()
class UrlProducer(threading.Thread):
    """Thread that starts twenty HtmlSpider threads, one per acquired permit."""

    def __init__(self, sem):
        super(UrlProducer, self).__init__()
        self.sem = sem

    def run(self):
        """Acquire a permit before launching each spider."""
        page_index = 0
        while page_index < 20:
            # Blocks here while no permit is available.
            self.sem.acquire()
            target = "https://baidu.com/{}".format(page_index)
            HtmlSpider(target, self.sem).start()
            page_index += 1
if __name__ == "__main__":
    # The semaphore starts with three permits, shared by producer and spiders.
    UrlProducer(threading.Semaphore(3)).start()
1.5 线程池
线程池, 为什么要线程池
主线程中可以获取某一个线程的状态或者某一个任务的状态,以及返回值
当一个线程完成的时候我们主线程能立即知道
futures可以让多线程和多进程编码接口一致
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
from concurrent.futures import Future
from multiprocessing import Pool
#未来对象,task的返回容器
import time
def get_html(times):
    """Sleep for ``times`` seconds, print a success line, and return ``times``."""
    time.sleep(times)
    message = "get page {} success".format(times)
    print(message)
    return times
# Pool limited to two worker threads.
executor = ThreadPoolExecutor(max_workers=2)
# submit() hands a callable to the pool and returns a Future immediately.
# task1 = executor.submit(get_html, (3))
# task2 = executor.submit(get_html, (2))

# Submit one task per entry, then block until the first of them completes.
urls = [3,2,4]
all_task = [executor.submit(get_html, seconds) for seconds in urls]
wait(all_task, return_when=FIRST_COMPLETED)
print("main")
1.6 多进程
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import ProcessPoolExecutor
#多进程编程
#耗cpu的操作,用多进程编程, 对于io操作来说, 使用多线程编程,进程切换代价要高于线程
#1. 对于耗费cpu的操作,多进程优于多线程
# def fib(n):
# if n<=2:
# return 1
# return fib(n-1)+fib(n-2)
#
# if __name__ == "__main__":
# with ThreadPoolExecutor(3) as executor:
# all_task = [executor.submit(fib, (num)) for num in range(25,40)]
# start_time = time.time()
# for future in as_completed(all_task):
# data = future.result()
# print("exe result: {}".format(data))
#
# print("last time is: {}".format(time.time()-start_time))
#2. 对于io操作来说,多线程优于多进程
def random_sleep(n):
    """Block for ``n`` seconds and hand ``n`` back to the caller."""
    duration = n
    time.sleep(duration)
    return duration
if __name__ == "__main__":
    # Thirty two-second sleeps spread over a pool of three worker processes.
    with ProcessPoolExecutor(3) as executor:
        all_task = [executor.submit(random_sleep, 2) for _ in range(30)]
        # The timer starts after submission, so submit time is not counted.
        start_time = time.time()
        for future in as_completed(all_task):
            print("exe result: {}".format(future.result()))
        elapsed = time.time() - start_time
        print("last time is: {}".format(elapsed))
import multiprocessing
#多进程编程
import time
def get_html(n):
    """Sleep for ``n`` seconds, print a success line, and return ``n``."""
    time.sleep(n)
    status_line = "sub_progress success"
    print(status_line)
    return n
if __name__ == "__main__":
    # Process pool (multiprocessing.Pool manages processes, not threads),
    # sized to the number of CPUs reported by the OS.
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        # Alternative 1: submit a single task and wait for all work to finish.
        # result = pool.apply_async(get_html, args=(3,))
        # pool.close()
        # pool.join()
        # print(result.get())

        # Alternative 2: imap yields results in input order.
        # for result in pool.imap(get_html, [1,5,3]):
        #     print("{} sleep success".format(result))

        # imap_unordered yields each result as soon as its task completes.
        for result in pool.imap_unordered(get_html, [1,5,3]):
            print("{} sleep success".format(result))
    finally:
        # Stop accepting work and wait for the workers to exit, so no worker
        # processes are left behind even if the loop above raises.
        pool.close()
        pool.join()
1.6.1 进程间通信
import time
from multiprocessing import Process, Queue, Pool, Manager, Pipe
def producer(queue):
    """Put the string "a" on ``queue``, then sleep for two seconds."""
    item = "a"
    queue.put(item)
    time.sleep(2)
def consumer(queue):
    """Sleep for two seconds, then take one item from ``queue`` and print it."""
    time.sleep(2)
    print(queue.get())
if __name__ == "__main__":
    # A multiprocessing.Queue (capacity 10) is handed to both child processes.
    queue = Queue(10)
    workers = [
        Process(target=entry_point, args=(queue,))
        for entry_point in (producer, consumer)
    ]
    # Start the producer first, then the consumer; wait for both in that order.
    for proc in workers:
        proc.start()
    for proc in workers:
        proc.join()
本章对应书的第四章,读过会有更清楚的理解。
书中线程:
import threading
from time import time, ctime
class MyThread(threading.Thread):
    """Thread that calls ``func(*args)`` and keeps the return value.

    Args:
        func: Callable executed in the new thread.
        args: Sequence of positional arguments unpacked into ``func``.
        name: Optional thread name. When empty, the name generated by
            ``threading.Thread`` is kept instead of being overwritten with ''.
        verb: When true, print a line before and after ``func`` runs.
    """

    def __init__(self, func, args, name='', verb=False):
        threading.Thread.__init__(self)
        # Only override the auto-generated name when one was actually given.
        if name:
            self.name = name
        self.func = func
        self.args = args
        self.verb = verb
        # Defined up front so getResult() is safe to call before run() is done.
        self.res = None

    def getResult(self):
        """Return the value produced by ``func``, or None if it has not run yet."""
        return self.res

    def run(self):
        """Call ``func(*args)`` and store its result on the instance."""
        if self.verb:
            print('starting', self.name, 'at:', ctime())
        self.res = self.func(*self.args)
        if self.verb:
            print(self.name, 'finished at:', ctime())