1. Python Multithreaded Programming
(1) By instantiating the Thread class
import time
import threading

def get_detail_html(url):
    print("get detail html started")
    time.sleep(2)
    print("get detail html end")

def get_detail_url(url):
    print("get detail url started")
    time.sleep(4)
    print("get detail url end")

if __name__ == "__main__":
    thread1 = threading.Thread(target=get_detail_html, args=("",))
    thread2 = threading.Thread(target=get_detail_url, args=("",))
    # thread1.setDaemon(True)
    thread2.setDaemon(True)  # mark as a daemon thread (equivalent to thread2.daemon = True)
    start_time = time.time()
    thread1.start()  # start the thread
    thread2.start()
    thread1.join()  # wait for the thread to finish
    thread2.join()
    print("\nlast time:{}".format(time.time() - start_time))
Output:
get detail html started
get detail url started
get detail html end
get detail url end
last time:4.006971120834351
In this code we create two threads, but the program actually runs three: the main thread runs alongside them. setDaemon(True) marks a thread as a daemon thread, which is killed when the main thread exits; join() makes the main thread wait for a child thread to finish before executing the code that follows. Because both threads are joined here, the daemon flag has no visible effect on the output.
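To see the daemon flag by itself, here is a minimal sketch (the name background_task is hypothetical): without join(), the main thread exits immediately and the daemon thread is killed before it can print.

import threading
import time

def background_task():
    time.sleep(1)
    print("this line never prints")  # the daemon is killed when the main thread exits

t = threading.Thread(target=background_task)
t.daemon = True  # same effect as setDaemon(True), which is deprecated since Python 3.10
t.start()
print("main thread exits immediately")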
(2) By subclassing Thread
class GetDetailHtml(threading.Thread):
    def __init__(self, name):
        super().__init__(name=name)

    def run(self):
        print("get detail html started")
        time.sleep(2)
        print("get detail html end")

class GetDetailUrl(threading.Thread):
    def __init__(self, name):
        super().__init__(name=name)

    def run(self):
        print("get detail url started")
        time.sleep(4)
        print("get detail url end")

if __name__ == "__main__":
    thread1 = GetDetailHtml("get_detail_html")
    thread2 = GetDetailUrl("get_detail_url")
    start_time = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    # daemon child threads are killed when the main thread exits
    print("last time: {}".format(time.time() - start_time))
Output:
get detail html started
get detail url started
get detail html end
get detail url end
last time: 4.009343147277832
2. Inter-thread Communication
(1) Via a shared variable
import time
import threading

detail_url_list = []

def get_detail_html():
    # crawl article detail pages
    while True:
        if len(detail_url_list):
            url = detail_url_list.pop()
            print("get detail html started")
            time.sleep(2)
            print("get detail html end")

def get_detail_url():
    # crawl the article list page
    while True:
        print("get detail url started")
        time.sleep(4)
        for i in range(20):
            detail_url_list.append("http://projectsedu.com/{id}".format(id=i))
        print("get detail url end")

if __name__ == "__main__":
    thread_detail_url = threading.Thread(target=get_detail_url)
    thread_detail_url.start()
    for i in range(10):
        html_thread = threading.Thread(target=get_detail_html)
        html_thread.start()
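Note that the check-then-pop above is racy: another consumer may empty the list between the length check and pop(). A minimal sketch of guarding the shared list with a Lock (the helper pop_url and the name list_lock are made up for illustration):

import threading

detail_url_list = []
list_lock = threading.Lock()

def pop_url():
    # check and pop atomically, under one lock
    with list_lock:
        if detail_url_list:
            return detail_url_list.pop()
    return None

This is exactly the bookkeeping that Queue does for us internally, which is why the Queue version below is preferred.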
(2) Via Queue
from queue import Queue
import time
import threading
def get_detail_html(queue):
    # crawl article detail pages
    while True:
        url = queue.get()  # get a url; blocks while the queue is empty
        print("get detail html started")
        time.sleep(2)
        print("get detail html end")
        queue.task_done()  # pair every get() with task_done() so join() can return

def get_detail_url(queue):
    # crawl the article list page
    while True:
        print("get detail url started")
        time.sleep(4)
        for i in range(20):
            queue.put("http://projectsedu.com/{id}".format(id=i))
        print("get detail url end")

if __name__ == "__main__":
    detail_url_queue = Queue(maxsize=1000)
    thread_detail_url = threading.Thread(target=get_detail_url, args=(detail_url_queue,))
    thread_detail_url.start()
    for i in range(10):
        html_thread = threading.Thread(target=get_detail_html, args=(detail_url_queue,))
        html_thread.start()
    start_time = time.time()
    detail_url_queue.join()  # blocks until task_done() has been called for every put()
    print("last time: {}".format(time.time() - start_time))
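Since the workers above loop forever, the program never exits on its own. A common shutdown pattern (a sketch; the STOP sentinel is made up) is to put one sentinel per consumer and break on it; the consumer above could be replaced with:

STOP = object()  # hypothetical sentinel value

def get_detail_html(queue):
    while True:
        url = queue.get()
        if url is STOP:  # sentinel received: stop this consumer
            queue.task_done()
            break
        print("get detail html started")
        time.sleep(2)
        print("get detail html end")
        queue.task_done()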
3. Thread Synchronization
(1) Lock
Let's first look at how these two functions execute:
a = 0

def add1(a):
    a += 1

def desc1(a):
    a -= 1

import dis
print(dis.dis(add1))
print(dis.dis(desc1))
28 0 LOAD_FAST 0 (a)
2 LOAD_CONST 1 (1)
4 INPLACE_ADD
6 STORE_FAST 0 (a)
8 LOAD_CONST 0 (None)
10 RETURN_VALUE
None
31 0 LOAD_FAST 0 (a)
2 LOAD_CONST 1 (1)
4 INPLACE_SUBTRACT
6 STORE_FAST 0 (a)
8 LOAD_CONST 0 (None)
10 RETURN_VALUE
None
The bytecode above does the following:
- load a
- load 1
- perform the operation
- assign the result back to a
add
"""
1. load a        a = 0
2. load 1        1
3. +             1
4. assign to a   a = 1
"""
desc
"""
1. load a        a = 0
2. load 1        1
3. -             -1
4. assign to a   a = -1
"""
If a thread switch happens between these steps, both threads can load the same initial value of a; whichever stores last wins, so a ends up as 1 or -1 rather than 0.
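Because a switch can occur between any two of these bytecode steps, the unlocked version below (a minimal sketch) usually prints a nonzero total:

import threading

total = 0

def add():
    global total
    for i in range(100000):
        total += 1  # not atomic: LOAD, ADD and STORE can interleave

def desc():
    global total
    for i in range(100000):
        total -= 1

thread1 = threading.Thread(target=add)
thread2 = threading.Thread(target=desc)
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print(total)  # typically nonzero, and different on every run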
Using a thread lock fixes this:
from threading import Lock

total = 0
lock = Lock()

def add():
    global total
    for i in range(100000):
        lock.acquire()  # acquire the lock
        total += 1
        lock.release()  # release the lock

def desc():
    global total
    for i in range(100000):
        lock.acquire()
        total -= 1
        lock.release()

import threading
thread1 = threading.Thread(target=add)
thread2 = threading.Thread(target=desc)
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print(total)
Output:
0
- Locks hurt performance
- Locks can cause deadlock

A deadlock scenario: threads A and B both need resources a and b. A acquires a first and B acquires b first; now A must wait for B to release b, while B must wait for A to release a. Each waits on the other, neither releases, and both block forever.
A(a, b)
    acquire(a)
    acquire(b)
B(a, b)
    acquire(b)
    acquire(a)
Another situation that causes deadlock is acquiring the same lock twice:
from threading import Lock

total = 0
lock = Lock()

def add(lock):
    global total
    for i in range(10000):
        lock.acquire()     # acquire the lock
        dosomething(lock)  # acquires the same lock again: deadlock
        total += 1
        lock.release()

def dosomething(lock):
    lock.acquire()
    # do something
    lock.release()
(2) RLock
To solve this problem, Python provides RLock, a reentrant lock. Within the same thread, acquire can be called multiple times in a row, but the number of acquire calls must equal the number of release calls.
from threading import RLock

total = 0
lock = RLock()

def add(lock):
    global total
    for i in range(10000):
        lock.acquire()
        lock.acquire()  # re-acquiring in the same thread is fine with RLock
        total += 1
        lock.release()
        lock.release()
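A typical use case is a function that holds the lock while calling other code that takes the same lock, recursion being the simplest example (a sketch; the factorial function is made up for illustration):

from threading import RLock

lock = RLock()

def factorial(n):
    # every recursive call re-acquires the same RLock in the same thread
    with lock:
        if n <= 1:
            return 1
        return n * factorial(n - 1)

print(factorial(5))  # 120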
(3) Condition
Let's make two voice assistants hold a conversation using threads.
import threading

class XiaoAi(threading.Thread):
    def __init__(self, lock):
        super().__init__(name="小爱")
        self.lock = lock

    def run(self):
        self.lock.acquire()
        print("{} : 在 ".format(self.name))
        self.lock.release()

        self.lock.acquire()
        print("{} : 好啊 ".format(self.name))
        self.lock.release()

class TianMao(threading.Thread):
    def __init__(self, lock):
        super().__init__(name="天猫精灵")
        self.lock = lock

    def run(self):
        self.lock.acquire()
        print("{} : 小爱同学 ".format(self.name))
        self.lock.release()

        self.lock.acquire()
        print("{} : 我们来对古诗吧 ".format(self.name))
        self.lock.release()

if __name__ == "__main__":
    lock = threading.Lock()
    xiaoai = XiaoAi(lock)
    tianmao = TianMao(lock)
    tianmao.start()
    xiaoai.start()
Output:
天猫精灵 : 小爱同学
天猫精灵 : 我们来对古诗吧
小爱 : 在
小爱 : 好啊
Written this way we don't get a conversation: one thread runs to completion before the other starts. To solve this we need Condition.
import threading

class XiaoAi(threading.Thread):
    def __init__(self, cond):
        super().__init__(name="小爱")
        self.cond = cond

    def run(self):
        with self.cond:
            self.cond.wait()
            print("{} : 在 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 好啊 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 君住长江尾 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 共饮长江水 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 此恨何时已 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 定不负相思意 ".format(self.name))
            self.cond.notify()

class TianMao(threading.Thread):
    def __init__(self, cond):
        super().__init__(name="天猫精灵")
        self.cond = cond

    def run(self):
        with self.cond:
            print("{} : 小爱同学 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 我们来对古诗吧 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 我住长江头 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 日日思君不见君 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 此水几时休 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 只愿君心似我心 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

if __name__ == "__main__":
    cond = threading.Condition()
    xiaoai = XiaoAi(cond)
    tianmao = TianMao(cond)
    xiaoai.start()  # XiaoAi starts first and goes into the waiting state
    tianmao.start() # TianMao then wakes XiaoAi and waits to be woken in turn
Output:
天猫精灵 : 小爱同学
小爱 : 在
天猫精灵 : 我们来对古诗吧
小爱 : 好啊
天猫精灵 : 我住长江头
小爱 : 君住长江尾
天猫精灵 : 日日思君不见君
小爱 : 共饮长江水
天猫精灵 : 此水几时休
小爱 : 此恨何时已
天猫精灵 : 只愿君心似我心
小爱 : 定不负相思意
- The startup order matters: XiaoAi must be waiting before TianMao calls notify
- wait and notify can only be called after acquiring the condition, i.e. inside `with cond` (or between acquire and release)
- Condition works with two layers of locks: the underlying lock is released when a thread calls wait; each wait call allocates a second lock that is placed into the condition's waiting queue until a notify call wakes it (a wait_for sketch follows)
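Besides pairing wait and notify by hand, Condition also provides wait_for, which blocks until a predicate becomes true; a minimal producer/consumer sketch (the names are made up):

import threading

cond = threading.Condition()
items = []

def consumer():
    with cond:
        # releases the lock while waiting, re-checks the predicate on every wakeup
        cond.wait_for(lambda: len(items) > 0)
        print("got", items.pop())

def producer():
    with cond:
        items.append("data")
        cond.notify()

c = threading.Thread(target=consumer)
c.start()
threading.Thread(target=producer).start()
c.join()

Because wait_for re-checks the predicate, this version works regardless of which thread runs first, avoiding the fragile startup ordering of the example above.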
(4) Semaphore
Semaphore is a lock that controls how many threads can enter at once. For file access, for example, writing is usually restricted to a single thread, while several threads may be allowed to read.
import threading
import time

class HtmlSpider(threading.Thread):
    def __init__(self, url, sem):
        super().__init__()
        self.url = url
        self.sem = sem

    def run(self):
        time.sleep(2)
        print("got html text success")
        self.sem.release()

class UrlProducer(threading.Thread):
    def __init__(self, sem):
        super().__init__()
        self.sem = sem

    def run(self):
        for i in range(20):
            self.sem.acquire()
            html_thread = HtmlSpider("https://baidu.com/{}".format(i), self.sem)
            html_thread.start()

if __name__ == "__main__":
    sem = threading.Semaphore(3)  # at most 3 spiders at a time
    url_producer = UrlProducer(sem)
    url_producer.start()
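Semaphore is also a context manager, so when the same thread both acquires and releases a slot, `with` is the safer form (a sketch; the worker function is made up). Note this does not fit the example above, where the producer acquires and the spider thread releases.

import threading
import time

sem = threading.Semaphore(3)  # at most 3 workers inside the block at once

def worker(i):
    with sem:  # acquired on entry, released on exit, even if an exception is raised
        print("worker {} running".format(i))
        time.sleep(1)

for i in range(10):
    threading.Thread(target=worker, args=(i,)).start()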
4. Thread Pools with concurrent.futures
Why do we need a thread pool?
- The main thread can check the state of a worker thread or task and obtain its return value
- The main thread knows immediately when a task completes
from concurrent.futures import ThreadPoolExecutor
import time

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=2)  # max_workers sets the maximum number of worker threads

# submit schedules the function on the pool and returns a Future immediately
task1 = executor.submit(get_html, 3)
task2 = executor.submit(get_html, 2)

# done() reports whether a task has finished
print(task1.done())
# cancel() tries to cancel a task; it fails if the task is already running
print(task2.cancel())
time.sleep(4)
print(task1.done())
# result() blocks until the task finishes and returns its value
print(task1.result())
Output:
False
False
get page 2 success
get page 3 success
True
3
cancel() returned False: the task was not cancelled, because a submitted task starts running as soon as a worker is free, and a running task cannot be cancelled. With max_workers set to 1, task2 is still queued when cancel() is called, so cancelling succeeds:
executor = ThreadPoolExecutor(max_workers=1)
Output:
False
True
get page 3 success
True
3
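If the program should react as soon as a task finishes, a Future also accepts a completion callback through add_done_callback (a minimal sketch; the name on_done is made up):

from concurrent.futures import ThreadPoolExecutor
import time

def get_html(times):
    time.sleep(times)
    return times

def on_done(future):
    # called in the worker thread as soon as the future completes
    print("task finished, result: {}".format(future.result()))

executor = ThreadPoolExecutor(max_workers=2)
task = executor.submit(get_html, 2)
task.add_done_callback(on_done)
executor.shutdown(wait=True)  # wait for all pending tasks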
Fetching the results of completed tasks with as_completed:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=1)

# yield each task's future as soon as it completes
urls = [2, 3, 4]
all_task = [executor.submit(get_html, url) for url in urls]
for future in as_completed(all_task):
    data = future.result()
    print("get {} page success".format(data))
Output:
get page 2 success
get 2 page success
get page 3 success
get 3 page success
get page 4 success
get 4 page success
Using executor.map to get the values of completed tasks:
from concurrent.futures import ThreadPoolExecutor
import time

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=2)
urls = [2, 3, 4]
# map submits the tasks itself and yields results in the order of urls,
# not in completion order
for data in executor.map(get_html, urls):
    print("get {} page".format(data))
Using wait() to block until certain tasks finish before continuing:
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED
import time

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=2)
urls = [3, 2, 4]
all_task = [executor.submit(get_html, url) for url in urls]
wait(all_task, return_when=FIRST_COMPLETED)  # return_when defaults to ALL_COMPLETED
print("main")
Output:
get page 2 success
main
get page 3 success
get page 4 success
5. Process Pools
Multiprocessing:
import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    print("sub_progress success")
    return n

if __name__ == "__main__":
    progress = multiprocessing.Process(target=get_html, args=(2,))
    print(progress.pid)  # None: the child process has not been spawned yet
    progress.start()
    print(progress.pid)
    progress.join()
    print("main progress end")
Output:
None
15356
sub_progress success
main progress end
Process pool:
import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    print("sub_progress success")
    return n

if __name__ == "__main__":
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    result = pool.apply_async(get_html, args=(3,))
    pool.close()  # stop the pool from accepting new tasks; required before join()
    pool.join()
    print(result.get())
Output:
sub_progress success
3
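Pool also supports the with statement; note that leaving the block calls terminate(), so results must be collected inside it (a minimal sketch under that assumption):

import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    return n

if __name__ == "__main__":
    # exiting the with-block calls pool.terminate(), so collect results inside it
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        results = [pool.apply_async(get_html, args=(n,)) for n in (1, 2)]
        print([r.get() for r in results])  # get() blocks until each result is ready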
The imap method:
import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    print("sub_progress success")
    return n

if __name__ == "__main__":
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for result in pool.imap(get_html, [1, 5, 3]):
        print("{} sleep success".format(result))
Output:
sub_progress success
1 sleep success
sub_progress success
sub_progress success
5 sleep success
3 sleep success
The results are printed in submission order.
imap_unordered:
import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    print("sub_progress success")
    return n

if __name__ == "__main__":
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for result in pool.imap_unordered(get_html, [1, 5, 3]):
        print("{} sleep success".format(result))
Output:
sub_progress success
1 sleep success
sub_progress success
3 sleep success
sub_progress success
5 sleep success
The results are printed in completion order.
6. Inter-process Communication
(1) Using Manager

A plain multiprocessing.Queue cannot be shared with Pool workers (it can only be inherited by directly spawned processes), so with a Pool we use the queue returned by Manager().Queue() instead.
import time
from multiprocessing import Pool, Manager

def producer(queue):
    queue.put("a")
    time.sleep(2)

def consumer(queue):
    time.sleep(2)
    data = queue.get()
    print(data)

if __name__ == "__main__":
    queue = Manager().Queue(10)
    pool = Pool(2)
    pool.apply_async(producer, args=(queue,))
    pool.apply_async(consumer, args=(queue,))
    pool.close()
    pool.join()
Output:
a
(2) Using Pipe
from multiprocessing import Process, Pipe

def producer(pipe):
    pipe.send("bobby")

def consumer(pipe):
    print(pipe.recv())

if __name__ == "__main__":
    # a Pipe connects exactly two processes
    receive_pipe, send_pipe = Pipe()
    my_producer = Process(target=producer, args=(send_pipe,))
    my_consumer = Process(target=consumer, args=(receive_pipe,))
    my_producer.start()
    my_consumer.start()
    my_producer.join()
Output:
bobby
(3) Shared data structures

Besides Queue, Manager also provides shared versions of common types such as dict, list, Lock, Semaphore, Condition, and Event.
from multiprocessing import Process, Manager

def add_data(p_dict, key, value):
    p_dict[key] = value

if __name__ == "__main__":
    progress_dict = Manager().dict()
    first_progress = Process(target=add_data, args=(progress_dict, "bobby1", 22))
    second_progress = Process(target=add_data, args=(progress_dict, "bobby2", 23))
    first_progress.start()
    second_progress.start()
    first_progress.join()
    second_progress.join()
    print(progress_dict)
Output:
{'bobby1': 22, 'bobby2': 23}