Multiprocessing and multithreading concepts:
https://blog.csdn.net/cyt0906/article/details/107853738
https://blog.csdn.net/cyt0906/article/details/107853743
一、Creating Threads
import threading

# The function name can be anything
def run(n):
    print("current task:", n)

if __name__ == "__main__":
    t1 = threading.Thread(target=run, args=("thread 1",))
    t2 = threading.Thread(target=run, args=("thread 2",))
    t1.start()
    t2.start()
Creating a thread by subclassing threading.Thread and overriding its run method:
import threading

class MyThread(threading.Thread):
    def __init__(self, n):
        super().__init__()  # required when overriding __init__: run the parent class initialization
        self.n = n

    def run(self):
        print("current task:", self.n)

if __name__ == "__main__":
    t1 = MyThread("thread 1")
    t2 = MyThread("thread 2")
    t1.start()
    t2.start()
    # Wait for t1 and t2 to finish before the main thread continues
    t1.join()
    t2.join()
Checking whether a thread is alive
import threading
import time

def start():
    time.sleep(5)

thread1 = threading.Thread(target=start)
print(thread1.is_alive())  # True while the thread is running, False otherwise
thread1.start()
print(thread1.name)        # the thread's name
print(thread1.is_alive())
thread1.join()
print(thread1.is_alive())
# Output:
# False
# Thread-1
# True
# False
二、Thread Locks
Lock
import threading
import time

num = 0
mutex = threading.Lock()

class MyThread(threading.Thread):
    def run(self):
        global num
        time.sleep(1)
        if mutex.acquire(1):  # acquire the lock (blocking)
            num = num + 1
            print(f'{self.name} : num value is {num}')
            mutex.release()   # release the lock

if __name__ == '__main__':
    for i in range(5):
        t = MyThread()
        t.start()
RLock
import threading
import time

# A plain Lock cannot be acquired twice by the same thread; an RLock can
mutex = threading.RLock()

class MyThread(threading.Thread):
    def run(self):
        if mutex.acquire(1):
            print("thread " + self.name + " got mutex")
            time.sleep(1)
            mutex.acquire()   # nested acquire: fine for an RLock
            mutex.release()
            mutex.release()

if __name__ == '__main__':
    for i in range(5):
        t = MyThread()
        t.start()
Both Lock and RLock are mutexes; an RLock can be re-acquired by the thread that already holds it, while nesting acquisitions of a plain Lock in the same thread deadlocks.
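A minimal sketch of that difference, using acquire(timeout=...) so the Lock case returns False instead of hanging:
import threading

lock = threading.Lock()
lock.acquire()
# A second acquire on a plain Lock from the same thread would block forever,
# so use a timeout to observe the failure without deadlocking the demo
print(lock.acquire(timeout=1))   # False: cannot re-acquire a plain Lock
lock.release()

rlock = threading.RLock()
rlock.acquire()
print(rlock.acquire(timeout=1))  # True: RLock is reentrant
rlock.release()
rlock.release()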
Condition
# Condition: makes threads wait, and releases them only when some condition is satisfied
import threading

def condition():
    ret = False
    r = input(">>>")
    if r == "yes":
        ret = True
    return ret

def func(conn, i):
    # print(i)
    conn.acquire()
    conn.wait_for(condition)  # takes a predicate and blocks until it returns True
    print(i + 100)
    conn.release()

c = threading.Condition()
for i in range(10):
    t = threading.Thread(target=func, args=(c, i,))
    t.start()
# A Condition works much like the Producer/Consumer pattern from design patterns
Semaphore
# Semaphore: an internal counter; threads block once the number of holders reaches the limit
import time
import threading

def run(n):
    semaphore.acquire()
    print("run the thread: %s" % n)
    time.sleep(1)
    semaphore.release()

semaphore = threading.BoundedSemaphore(5)  # at most 5 threads run at the same time
for i in range(20):
    t = threading.Thread(target=run, args=(i,))
    t.start()
Event
# Event: a flag; set() makes it True, clear() makes it False
import threading

def func(e, i):
    print(i)
    e.wait()  # block while the flag is False ("red light"); continue once it is True ("green light"). Default is False.
    print(i + 100)

event = threading.Event()
for i in range(10):
    t = threading.Thread(target=func, args=(event, i))
    t.start()

event.clear()  # explicitly set the flag to False (red light)
inp = input(">>>")
if inp == "1":
    event.set()  # set the flag to True (green light)
# Exercise: implement a distributed lock with redis
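A minimal sketch of one way to attempt that exercise, assuming the redis-py package and a local Redis server (the key name and timeout are illustrative):
import uuid
import redis

r = redis.Redis()
LOCK_KEY = "my:lock"  # illustrative key name

def acquire_lock(timeout=10):
    token = str(uuid.uuid4())
    # SET key value NX EX timeout: succeeds only if the key does not exist yet
    if r.set(LOCK_KEY, token, nx=True, ex=timeout):
        return token
    return None

def release_lock(token):
    # Only the holder may release; a production version should make this
    # check-and-delete atomic with a Lua script
    if r.get(LOCK_KEY) == token.encode():
        r.delete(LOCK_KEY)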
Timer
# Timer: run a function after n seconds
from threading import Timer

def hello():
    print("hello, world")

t = Timer(1, hello)  # run hello after 1 second
t.start()
三、Queues
Processes do not share ordinary variables (each has its own address space), while threads in the same process do share them (e.g. via global variables).
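A quick sketch of that difference (the process branch needs the __main__ guard on platforms that spawn child processes):
import threading
import multiprocessing

counter = 0

def bump():
    global counter
    counter += 1

if __name__ == '__main__':
    t = threading.Thread(target=bump)
    t.start(); t.join()
    print(counter)  # 1: the thread modified the shared global

    counter = 0
    p = multiprocessing.Process(target=bump)
    p.start(); p.join()
    print(counter)  # 0: the child process changed only its own copy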
Producer/consumer with a queue:
import queue

q = queue.Queue(5)
q.put(111)  # enqueue
q.put(222)
q.put(333)
print(q.get())  # dequeue
print(q.get())
q.task_done()  # call this after finishing the processing of each item taken with get(),
               # so q.join() knows when to stop blocking and let threads continue or exit
print(q.qsize())  # number of items currently in the queue
print(q.empty())  # is the queue empty?
print(q.full())   # is the queue full?
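A minimal sketch of how task_done() pairs with join(): join() blocks until every item that was put() has been marked done.
import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()
        print('processed', item)
        q.task_done()  # mark this item as finished

threading.Thread(target=worker, daemon=True).start()
for i in range(3):
    q.put(i)
q.join()  # returns only after task_done() has been called 3 times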
###############
import queue
import threading
import random
import time

writelock = threading.Lock()

class Producer(threading.Thread):
    def __init__(self, q, con, name):
        super(Producer, self).__init__()
        self.q = q
        self.name = name
        self.con = con
        print(f'Producer {self.name} Started')

    def run(self):
        while 1:
            global writelock
            self.con.acquire()  # acquire the condition's lock
            if self.q.full():   # queue is full
                with writelock:
                    print('Queue is full , producer wait')
                self.con.wait()  # wait for room
            else:
                value = random.randint(0, 10)
                with writelock:
                    print(f'{self.name} put value {str(value)} in queue')
                self.q.put((f'{self.name} : {str(value)}'))  # enqueue
                self.con.notify()  # notify the consumer
                time.sleep(1)
            self.con.release()

class Consumer(threading.Thread):
    def __init__(self, q, con, name):
        super(Consumer, self).__init__()
        self.q = q
        self.name = name
        self.con = con
        print(f'Consumer {self.name} Started')

    def run(self):
        while 1:
            global writelock
            self.con.acquire()
            if self.q.empty():  # queue is empty
                with writelock:
                    print('Queue is empty , consumer wait')
                self.con.wait()  # wait for items
            else:
                value = self.q.get()
                with writelock:
                    print(f'{self.name} get value {value} from queue')
                self.con.notify()  # notify the producer
                time.sleep(1)
            self.con.release()

if __name__ == '__main__':
    q = queue.Queue(10)
    con = threading.Condition()  # condition variable
    p1 = Producer(q, con, 'P1')
    p1.start()
    p2 = Producer(q, con, 'P2')
    p2.start()
    c1 = Consumer(q, con, 'C1')
    c1.start()
# Exercise: implement a queue using a list
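One possible sketch for that exercise: a tiny thread-safe FIFO queue built on a plain list plus a Condition (names are illustrative; not a full queue.Queue):
import threading

class ListQueue:
    def __init__(self):
        self._items = []
        self._cond = threading.Condition()

    def put(self, item):
        with self._cond:
            self._items.append(item)
            self._cond.notify()  # wake one waiting consumer

    def get(self):
        with self._cond:
            while not self._items:  # wait until an item is available
                self._cond.wait()
            return self._items.pop(0)  # FIFO: take from the front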
Other queue types
import queue

q = queue.PriorityQueue()
# each element is a tuple
# smaller numbers mean higher priority
# equal priorities are served first-in first-out
q.put((1, "work"))
q.put((-1, "life"))
q.put((1, "drink"))
q.put((-2, "sleep"))
print(q.get())
print(q.get())
print(q.get())
print(q.get())
# queue.LifoQueue is a last-in first-out queue, like a stack
# collections.deque is a double-ended queue
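A quick sketch of the LifoQueue and deque behavior mentioned above:
import queue
from collections import deque

lq = queue.LifoQueue()
for x in (1, 2, 3):
    lq.put(x)
print(lq.get())  # 3: last in, first out

d = deque([1, 2, 3])
d.appendleft(0)  # append on the left end
print(d.pop())   # 3: pop from the right end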
Applying threads and queues to a crawler
import os
import queue
import threading
import requests
from fake_useragent import UserAgent

class DownloadThread(threading.Thread):
    def __init__(self, q):
        super().__init__()
        self.q = q

    def run(self):
        while True:
            url = self.q.get()  # take a URL from the queue
            print(f'{self.name} begin download {url}')
            self.download_file(url)  # download the file
            self.q.task_done()  # signal that this item is done
            print(f'{self.name} download completed')

    def download_file(self, url):
        ua = UserAgent()
        headers = {"User-Agent": ua.random}
        r = requests.get(url, stream=True, headers=headers)
        fname = os.path.basename(url) + '.html'
        with open(fname, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if not chunk:
                    break
                f.write(chunk)

if __name__ == '__main__':
    urls = ['http://www.baidu.com',
            'http://www.python.org',
            'http://www.douban.com']
    q = queue.Queue()
    for i in range(5):
        t = DownloadThread(q)  # start 5 worker threads
        t.daemon = True        # daemon threads exit together with the main thread
        t.start()
    for url in urls:
        q.put(url)
    q.join()  # block until every URL has been processed
十一、Thread Pools
# A basic thread pool
from multiprocessing.dummy import Pool as ThreadPool
# A higher-level wrapper for parallel tasks (supported since Python 3.2)
from concurrent.futures import ThreadPoolExecutor
A basic thread pool
import requests
from multiprocessing.dummy import Pool as ThreadPool

urls = [
    'http://www.baidu.com',
    'http://www.sina.com.cn',
    'http://www.163.com',
    'http://www.qq.com',
    'http://www.taobao.com',
]
# create the thread pool
pool = ThreadPool(4)
# fetch the results for urls
results = pool.map(requests.get, urls)
# close the pool and wait for all tasks to finish
pool.close()
pool.join()
for i in results:
    print(i.url)
A higher-level wrapper for parallel tasks
# Python 3.2 introduced the concurrent.futures library, which makes
# multithreading and multiprocessing very convenient to use
from concurrent.futures import ThreadPoolExecutor
import time

def func(args):
    print(f'call func {args}')

if __name__ == "__main__":
    seed = ['a', 'b', 'c', 'd']

    with ThreadPoolExecutor(3) as executor:
        executor.submit(func, seed)  # pass the whole list to func as a single argument
    time.sleep(1)

    with ThreadPoolExecutor(3) as executor2:
        executor2.map(func, seed)  # unpack the list (like a for loop) and pass each item to func
    time.sleep(1)

    with ThreadPoolExecutor(max_workers=1) as executor:  # at most 1 worker thread
        future = executor.submit(pow, 2, 3)  # submit passes extra positional arguments through
        print(future.result())

# Output:
# call func ['a', 'b', 'c', 'd']
# call func a
# call func b
# call func c
# call func d
# 8
Avoid having pool tasks wait on each other; it produces deadlock
import time
from concurrent.futures import ThreadPoolExecutor

def wait_on_b():
    time.sleep(5)
    print(b.result())  # b will never complete because it is waiting on a
    return 5

def wait_on_a():
    time.sleep(5)
    print(a.result())  # a will never complete because it is waiting on b
    return 6

executor = ThreadPoolExecutor(max_workers=2)
a = executor.submit(wait_on_b)
b = executor.submit(wait_on_a)
# Deadlock occurs when a callable associated with one Future waits on the
# result of another Future
# https://docs.python.org/zh-cn/3.7/library/concurrent.futures.html#threadpoolexecutor
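One way to avoid this: keep the submitted tasks independent and let the caller, not a worker, wait for and combine the results (a sketch with illustrative task names):
import time
from concurrent.futures import ThreadPoolExecutor

def task_a():
    time.sleep(1)
    return 5

def task_b():
    time.sleep(1)
    return 6

with ThreadPoolExecutor(max_workers=2) as executor:
    a = executor.submit(task_a)
    b = executor.submit(task_b)
    print(a.result() + b.result())  # the caller waits; no worker blocks on another future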
十二、The GIL
GIL (Global Interpreter Lock): before executing, a Python thread must acquire the GIL; the interpreter then periodically releases it so other threads get a chance to run (in CPython 2 this happened every 100 bytecode instructions, in CPython 3 after a time interval, 5 ms by default). The GIL effectively serializes the execution of all threads' bytecode, so Python threads can only run alternately: even 100 threads on a 100-core CPU use only 1 core.
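The Python 3 release interval mentioned above can be inspected and tuned (a small sketch):
import sys

print(sys.getswitchinterval())  # 0.005 by default in CPython 3
sys.setswitchinterval(0.001)    # ask the interpreter to consider switching more often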
Comparing the run time of plain / multithreaded / multiprocess versions
# process vs thread
import multiprocessing as mp

def job(q):
    res = 0
    for i in range(1000000):
        res += i + i**2 + i**3
    q.put(res)  # queue

# multiprocess version
def multicore():
    q = mp.Queue()
    p1 = mp.Process(target=job, args=(q,))
    p2 = mp.Process(target=job, args=(q,))
    p1.start()
    p2.start()
    p1.join()
    p2.join()
    res1 = q.get()
    res2 = q.get()
    print('multicore:', res1 + res2)

# Multithreaded version: creating threads looks much like creating processes.
# Import threading, then define multithread() to do the same work.
import threading as td

def multithread():
    q = mp.Queue()  # threads can use the same kind of queue as processes
    t1 = td.Thread(target=job, args=(q,))
    t2 = td.Thread(target=job, args=(q,))
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    res1 = q.get()
    res2 = q.get()
    print('multithread:', res1 + res2)

# Plain serial version: the two processes/threads above each run job() once,
# so normal() loops twice to do the same total amount of work
def normal():
    res = 0
    for _ in range(2):
        for i in range(1000000):
            res += i + i**2 + i**3
    print('normal:', res)

# timing
import time

if __name__ == '__main__':
    st = time.time()
    normal()
    st1 = time.time()
    print('normal time:', st1 - st)
    multithread()
    st2 = time.time()
    print('multithread time:', st2 - st1)
    multicore()
    print('multicore time:', time.time() - st2)
# The plain / multithreaded / multiprocess run times were 1.41, 1.47 and 0.75
# seconds respectively. Multiprocess is fastest, showing that the tasks really
# ran at the same time. The multithreaded version is even slightly slower than
# the plain one, exposing a real weakness of threads here (the GIL).
Threads cannot take advantage of multiple cores; a common practice is to combine threads with processes.
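A hedged sketch of that combination: processes for CPU-bound work, threads for I/O-bound work (the function names and URL are illustrative):
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import urllib.request

def cpu_task(n):  # CPU-bound: benefits from multiple processes
    return sum(i * i for i in range(n))

def io_task(url):  # I/O-bound: benefits from threads (GIL is released during I/O)
    return urllib.request.urlopen(url).status

if __name__ == '__main__':
    with ProcessPoolExecutor() as pp:  # one worker per core by default
        print(list(pp.map(cpu_task, [10**6] * 4)))
    with ThreadPoolExecutor(4) as tp:
        print(list(tp.map(io_task, ['http://www.python.org'] * 2)))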