1. Python Multithreaded Programming
(1) By instantiating the Thread class
import time
import threading

def get_detail_html(url):
    print("get detail html started")
    time.sleep(2)
    print("get detail html end")

def get_detail_url(url):
    print("get detail url started")
    time.sleep(4)
    print("get detail url end")

if __name__ == "__main__":
    thread1 = threading.Thread(target=get_detail_html, args=("",))
    thread2 = threading.Thread(target=get_detail_url, args=("",))
    # thread1.setDaemon(True)
    thread2.setDaemon(True)  # mark as a daemon thread (equivalent to thread2.daemon = True)
    start_time = time.time()
    thread1.start()  # start the thread
    thread2.start()
    thread1.join()  # wait for the thread to finish
    thread2.join()
    print("\nlast time:{}".format(time.time() - start_time))
Output:
get detail html started
get detail url started
get detail html end
get detail url end
last time:4.006971120834351
In this code we create two threads, but the program actually runs three: the main thread runs alongside them. setDaemon(True) marks a thread as a daemon thread, which is killed when the main thread exits; join() makes the main thread wait for a child thread to finish before executing the code that follows. Because both threads are joined here, the daemon flag has no visible effect on the output.
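To see the daemon flag by itself, here is a minimal sketch (the name background_task is hypothetical): without join(), the main thread exits immediately and the daemon thread is killed before it can print.

import threading
import time

def background_task():
    time.sleep(1)
    print("this line never prints")  # the daemon is killed when the main thread exits

t = threading.Thread(target=background_task)
t.daemon = True  # same effect as setDaemon(True), which is deprecated since Python 3.10
t.start()
print("main thread exits immediately")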
(2) By subclassing Thread
class GetDetailHtml(threading.Thread):
    def __init__(self, name):
        super().__init__(name=name)

    def run(self):
        print("get detail html started")
        time.sleep(2)
        print("get detail html end")

class GetDetailUrl(threading.Thread):
    def __init__(self, name):
        super().__init__(name=name)

    def run(self):
        print("get detail url started")
        time.sleep(4)
        print("get detail url end")

if __name__ == "__main__":
    thread1 = GetDetailHtml("get_detail_html")
    thread2 = GetDetailUrl("get_detail_url")
    start_time = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    # daemon child threads are killed when the main thread exits
    print("last time: {}".format(time.time() - start_time))
Output:
get detail html started
get detail url started
get detail html end
get detail url end
last time: 4.009343147277832
2. Inter-thread Communication
(1) Via a shared variable
import time
import threading

detail_url_list = []

def get_detail_html():
    # crawl article detail pages
    while True:
        if len(detail_url_list):
            url = detail_url_list.pop()
            print("get detail html started")
            time.sleep(2)
            print("get detail html end")

def get_detail_url():
    # crawl the article list page
    while True:
        print("get detail url started")
        time.sleep(4)
        for i in range(20):
            detail_url_list.append("http://projectsedu.com/{id}".format(id=i))
        print("get detail url end")

if __name__ == "__main__":
    thread_detail_url = threading.Thread(target=get_detail_url)
    thread_detail_url.start()
    for i in range(10):
        html_thread = threading.Thread(target=get_detail_html)
        html_thread.start()
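Note that the check-then-pop above is racy: another consumer may empty the list between the length check and pop(). A minimal sketch of guarding the shared list with a Lock (the helper pop_url and the name list_lock are made up for illustration):

import threading

detail_url_list = []
list_lock = threading.Lock()

def pop_url():
    # check and pop atomically, under one lock
    with list_lock:
        if detail_url_list:
            return detail_url_list.pop()
    return None

This is exactly the bookkeeping that Queue does for us internally, which is why the Queue version below is preferred.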
(2) Via Queue
from queue import Queue
import time
import threading
def get_detail_html(queue):
    # crawl article detail pages
    while True:
        url = queue.get()  # get a url; blocks while the queue is empty
        print("get detail html started")
        time.sleep(2)
        print("get detail html end")
        queue.task_done()  # pair every get() with task_done() so join() can return

def get_detail_url(queue):
    # crawl the article list page
    while True:
        print("get detail url started")
        time.sleep(4)
        for i in range(20):
            queue.put("http://projectsedu.com/{id}".format(id=i))
        print("get detail url end")

if __name__ == "__main__":
    detail_url_queue = Queue(maxsize=1000)
    thread_detail_url = threading.Thread(target=get_detail_url, args=(detail_url_queue,))
    thread_detail_url.start()
    for i in range(10):
        html_thread = threading.Thread(target=get_detail_html, args=(detail_url_queue,))
        html_thread.start()
    start_time = time.time()
    detail_url_queue.join()  # blocks until task_done() has been called for every put()
    print("last time: {}".format(time.time() - start_time))
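Since the workers above loop forever, the program never exits on its own. A common shutdown pattern (a sketch; the STOP sentinel is made up) is to put one sentinel per consumer and break on it; the consumer above could be replaced with:

STOP = object()  # hypothetical sentinel value

def get_detail_html(queue):
    while True:
        url = queue.get()
        if url is STOP:  # sentinel received: stop this consumer
            queue.task_done()
            break
        print("get detail html started")
        time.sleep(2)
        print("get detail html end")
        queue.task_done()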
3. Thread Synchronization
(1) Lock
Let's first look at how these two functions execute:
a = 0

def add1(a):
    a += 1

def desc1(a):
    a -= 1

import dis
print(dis.dis(add1))
print(dis.dis(desc1))
28 0 LOAD_FAST 0 (a)
2 LOAD_CONST 1 (1)
4 INPLACE_ADD
6 STORE_FAST 0 (a)
8 LOAD_CONST 0 (None)
10 RETURN_VALUE
None
31 0 LOAD_FAST 0 (a)
2 LOAD_CONST 1 (1)
4 INPLACE_SUBTRACT
6 STORE_FAST 0 (a)
8 LOAD_CONST 0 (None)
10 RETURN_VALUE
None
The bytecode above does the following:
- load a
- load 1
- perform the operation
- assign the result back to a
add
"""
1. load a        a = 0
2. load 1        1
3. +             1
4. assign to a   a = 1
"""
desc
"""
1. load a        a = 0
2. load 1        1
3. -             -1
4. assign to a   a = -1
"""
If a thread switch happens between these steps, both threads can load the same initial value of a; whichever stores last wins, so a ends up as 1 or -1 rather than 0.
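Because a switch can occur between any two of these bytecode steps, the unlocked version below (a minimal sketch) usually prints a nonzero total:

import threading

total = 0

def add():
    global total
    for i in range(100000):
        total += 1  # not atomic: LOAD, ADD and STORE can interleave

def desc():
    global total
    for i in range(100000):
        total -= 1

thread1 = threading.Thread(target=add)
thread2 = threading.Thread(target=desc)
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print(total)  # typically nonzero, and different on every run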
Using a thread lock fixes this:
from threading import Lock

total = 0
lock = Lock()

def add():
    global total
    for i in range(100000):
        lock.acquire()  # acquire the lock
        total += 1
        lock.release()  # release the lock

def desc():
    global total
    for i in range(100000):
        lock.acquire()
        total -= 1
        lock.release()

import threading
thread1 = threading.Thread(target=add)
thread2 = threading.Thread(target=desc)
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print(total)
Output:
0
- Locks hurt performance
- Locks can cause deadlock

A deadlock scenario: threads A and B both need resources a and b. A acquires a first and B acquires b first; now A must wait for B to release b, while B must wait for A to release a. Each waits on the other, neither releases, and both block forever.
A(a, b)
    acquire(a)
    acquire(b)
B(a, b)
    acquire(b)
    acquire(a)
Another situation that causes deadlock is acquiring the same lock twice:
from threading import Lock

total = 0
lock = Lock()

def add(lock):
    global total
    for i in range(10000):
        lock.acquire()     # acquire the lock
        dosomething(lock)  # acquires the same lock again: deadlock
        total += 1
        lock.release()

def dosomething(lock):
    lock.acquire()
    # do something
    lock.release()
(2) RLock
To solve this problem, Python provides RLock, a reentrant lock. Within the same thread, acquire can be called multiple times in a row, but the number of acquire calls must equal the number of release calls.
from threading import RLock

total = 0
lock = RLock()

def add(lock):
    global total
    for i in range(10000):
        lock.acquire()
        lock.acquire()  # re-acquiring in the same thread is fine with RLock
        total += 1
        lock.release()
        lock.release()
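A typical use case is a function that holds the lock while calling other code that takes the same lock, recursion being the simplest example (a sketch; the factorial function is made up for illustration):

from threading import RLock

lock = RLock()

def factorial(n):
    # every recursive call re-acquires the same RLock in the same thread
    with lock:
        if n <= 1:
            return 1
        return n * factorial(n - 1)

print(factorial(5))  # 120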
(3) Condition
Let's make two voice assistants hold a conversation using threads.
import threading

class XiaoAi(threading.Thread):
    def __init__(self, lock):
        super().__init__(name="小爱")
        self.lock = lock

    def run(self):
        self.lock.acquire()
        print("{} : 在 ".format(self.name))
        self.lock.release()

        self.lock.acquire()
        print("{} : 好啊 ".format(self.name))
        self.lock.release()

class TianMao(threading.Thread):
    def __init__(self, lock):
        super().__init__(name="天猫精灵")
        self.lock = lock

    def run(self):
        self.lock.acquire()
        print("{} : 小爱同学 ".format(self.name))
        self.lock.release()

        self.lock.acquire()
        print("{} : 我们来对古诗吧 ".format(self.name))
        self.lock.release()

if __name__ == "__main__":
    lock = threading.Lock()
    xiaoai = XiaoAi(lock)
    tianmao = TianMao(lock)
    tianmao.start()
    xiaoai.start()
Output:
天猫精灵 : 小爱同学
天猫精灵 : 我们来对古诗吧
小爱 : 在
小爱 : 好啊
Written this way we don't get a conversation: one thread runs to completion before the other starts. To solve this we need Condition.
import threading

class XiaoAi(threading.Thread):
    def __init__(self, cond):
        super().__init__(name="小爱")
        self.cond = cond

    def run(self):
        with self.cond:
            self.cond.wait()
            print("{} : 在 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 好啊 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 君住长江尾 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 共饮长江水 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 此恨何时已 ".format(self.name))
            self.cond.notify()

            self.cond.wait()
            print("{} : 定不负相思意 ".format(self.name))
            self.cond.notify()

class TianMao(threading.Thread):
    def __init__(self, cond):
        super().__init__(name="天猫精灵")
        self.cond = cond

    def run(self):
        with self.cond:
            print("{} : 小爱同学 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 我们来对古诗吧 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 我住长江头 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 日日思君不见君 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 此水几时休 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

            print("{} : 只愿君心似我心 ".format(self.name))
            self.cond.notify()
            self.cond.wait()

if __name__ == "__main__":
    cond = threading.Condition()
    xiaoai = XiaoAi(cond)
    tianmao = TianMao(cond)
    xiaoai.start()  # XiaoAi starts first and goes into the waiting state
    tianmao.start() # TianMao then wakes XiaoAi and waits to be woken in turn
Output:
天猫精灵 : 小爱同学
小爱 : 在
天猫精灵 : 我们来对古诗吧
小爱 : 好啊
天猫精灵 : 我住长江头
小爱 : 君住长江尾
天猫精灵 : 日日思君不见君
小爱 : 共饮长江水
天猫精灵 : 此水几时休
小爱 : 此恨何时已
天猫精灵 : 只愿君心似我心
小爱 : 定不负相思意
- The startup order matters: XiaoAi must be waiting before TianMao calls notify
- wait and notify can only be called after acquiring the condition, i.e. inside `with cond` (or between acquire and release)
- Condition works with two layers of locks: the underlying lock is released when a thread calls wait; each wait call allocates a second lock that is placed into the condition's waiting queue until a notify call wakes it (a wait_for sketch follows)
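Besides pairing wait and notify by hand, Condition also provides wait_for, which blocks until a predicate becomes true; a minimal producer/consumer sketch (the names are made up):

import threading

cond = threading.Condition()
items = []

def consumer():
    with cond:
        # releases the lock while waiting, re-checks the predicate on every wakeup
        cond.wait_for(lambda: len(items) > 0)
        print("got", items.pop())

def producer():
    with cond:
        items.append("data")
        cond.notify()

c = threading.Thread(target=consumer)
c.start()
threading.Thread(target=producer).start()
c.join()

Because wait_for re-checks the predicate, this version works regardless of which thread runs first, avoiding the fragile startup ordering of the example above.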
(4) Semaphore
Semaphore is a lock that controls how many threads can enter at once. For file access, for example, writing is usually restricted to a single thread, while several threads may be allowed to read.
import threading
import time

class HtmlSpider(threading.Thread):
    def __init__(self, url, sem):
        super().__init__()
        self.url = url
        self.sem = sem

    def run(self):
        time.sleep(2)
        print("got html text success")
        self.sem.release()

class UrlProducer(threading.Thread):
    def __init__(self, sem):
        super().__init__()
        self.sem = sem

    def run(self):
        for i in range(20):
            self.sem.acquire()
            html_thread = HtmlSpider("https://baidu.com/{}".format(i), self.sem)
            html_thread.start()

if __name__ == "__main__":
    sem = threading.Semaphore(3)  # at most 3 spiders at a time
    url_producer = UrlProducer(sem)
    url_producer.start()
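Semaphore is also a context manager, so when the same thread both acquires and releases a slot, `with` is the safer form (a sketch; the worker function is made up). Note this does not fit the example above, where the producer acquires and the spider thread releases.

import threading
import time

sem = threading.Semaphore(3)  # at most 3 workers inside the block at once

def worker(i):
    with sem:  # acquired on entry, released on exit, even if an exception is raised
        print("worker {} running".format(i))
        time.sleep(1)

for i in range(10):
    threading.Thread(target=worker, args=(i,)).start()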
4. Thread Pools with concurrent.futures
Why do we need a thread pool?
- The main thread can check the state of a worker thread or task and obtain its return value
- The main thread knows immediately when a task completes
from concurrent.futures import ThreadPoolExecutor
import time

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=2)  # max_workers sets the maximum number of worker threads

# submit schedules the function on the pool and returns a Future immediately
task1 = executor.submit(get_html, 3)
task2 = executor.submit(get_html, 2)

# done() reports whether a task has finished
print(task1.done())
# cancel() tries to cancel a task; it fails if the task is already running
print(task2.cancel())
time.sleep(4)
print(task1.done())
# result() blocks until the task finishes and returns its value
print(task1.result())
Output:
False
False
get page 2 success
get page 3 success
True
3
cancel() returned False: the task was not cancelled, because a submitted task starts running as soon as a worker is free, and a running task cannot be cancelled. With max_workers set to 1, task2 is still queued when cancel() is called, so cancelling succeeds:
executor = ThreadPoolExecutor(max_workers=1)
Output:
False
True
get page 3 success
True
3
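If the program should react as soon as a task finishes, a Future also accepts a completion callback through add_done_callback (a minimal sketch; the name on_done is made up):

from concurrent.futures import ThreadPoolExecutor
import time

def get_html(times):
    time.sleep(times)
    return times

def on_done(future):
    # called in the worker thread as soon as the future completes
    print("task finished, result: {}".format(future.result()))

executor = ThreadPoolExecutor(max_workers=2)
task = executor.submit(get_html, 2)
task.add_done_callback(on_done)
executor.shutdown(wait=True)  # wait for all pending tasks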
Fetching the results of completed tasks with as_completed:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=1)

# yield each task's future as soon as it completes
urls = [2, 3, 4]
all_task = [executor.submit(get_html, url) for url in urls]
for future in as_completed(all_task):
    data = future.result()
    print("get {} page success".format(data))
Output:
get page 2 success
get 2 page success
get page 3 success
get 3 page success
get page 4 success
get 4 page success
Using executor.map to get the values of completed tasks:
from concurrent.futures import ThreadPoolExecutor
import time

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=2)
urls = [2, 3, 4]
# map submits the tasks itself and yields results in the order of urls,
# not in completion order
for data in executor.map(get_html, urls):
    print("get {} page".format(data))
Using wait() to block until certain tasks finish before continuing:
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED
import time

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=2)
urls = [3, 2, 4]
all_task = [executor.submit(get_html, url) for url in urls]
wait(all_task, return_when=FIRST_COMPLETED)  # return_when defaults to ALL_COMPLETED
print("main")
Output:
get page 2 success
main
get page 3 success
get page 4 success
5. Process Pools
Multiprocessing:
import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    print("sub_progress success")
    return n

if __name__ == "__main__":
    progress = multiprocessing.Process(target=get_html, args=(2,))
    print(progress.pid)  # None: the child process has not been spawned yet
    progress.start()
    print(progress.pid)
    progress.join()
    print("main progress end")
Output:
None
15356
sub_progress success
main progress end
Process pool:
import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    print("sub_progress success")
    return n

if __name__ == "__main__":
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    result = pool.apply_async(get_html, args=(3,))
    pool.close()  # stop the pool from accepting new tasks; required before join()
    pool.join()
    print(result.get())
Output:
sub_progress success
3
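Pool also supports the with statement; note that leaving the block calls terminate(), so results must be collected inside it (a minimal sketch under that assumption):

import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    return n

if __name__ == "__main__":
    # exiting the with-block calls pool.terminate(), so collect results inside it
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        results = [pool.apply_async(get_html, args=(n,)) for n in (1, 2)]
        print([r.get() for r in results])  # get() blocks until each result is ready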
The imap method:
import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    print("sub_progress success")
    return n

if __name__ == "__main__":
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for result in pool.imap(get_html, [1, 5, 3]):
        print("{} sleep success".format(result))
Output:
sub_progress success
1 sleep success
sub_progress success
sub_progress success
5 sleep success
3 sleep success
The results are printed in submission order.
imap_unordered:
import multiprocessing
import time

def get_html(n):
    time.sleep(n)
    print("sub_progress success")
    return n

if __name__ == "__main__":
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for result in pool.imap_unordered(get_html, [1, 5, 3]):
        print("{} sleep success".format(result))
Output:
sub_progress success
1 sleep success
sub_progress success
3 sleep success
sub_progress success
5 sleep success
The results are printed in completion order.
6. Inter-process Communication
(1) Using Manager

A plain multiprocessing.Queue cannot be shared with Pool workers (it can only be inherited by directly spawned processes), so with a Pool we use the queue returned by Manager().Queue() instead.
import time
from multiprocessing import Pool, Manager

def producer(queue):
    queue.put("a")
    time.sleep(2)

def consumer(queue):
    time.sleep(2)
    data = queue.get()
    print(data)

if __name__ == "__main__":
    queue = Manager().Queue(10)
    pool = Pool(2)
    pool.apply_async(producer, args=(queue,))
    pool.apply_async(consumer, args=(queue,))
    pool.close()
    pool.join()
Output:
a
(2) Using Pipe
from multiprocessing import Process, Pipe

def producer(pipe):
    pipe.send("bobby")

def consumer(pipe):
    print(pipe.recv())

if __name__ == "__main__":
    # a Pipe connects exactly two processes
    receive_pipe, send_pipe = Pipe()
    my_producer = Process(target=producer, args=(send_pipe,))
    my_consumer = Process(target=consumer, args=(receive_pipe,))
    my_producer.start()
    my_consumer.start()
    my_producer.join()
Output:
bobby
(3) Shared data structures

Besides Queue, Manager also provides shared versions of common types such as dict, list, Lock, Semaphore, Condition, and Event.
from multiprocessing import Process, Manager

def add_data(p_dict, key, value):
    p_dict[key] = value

if __name__ == "__main__":
    progress_dict = Manager().dict()
    first_progress = Process(target=add_data, args=(progress_dict, "bobby1", 22))
    second_progress = Process(target=add_data, args=(progress_dict, "bobby2", 23))
    first_progress.start()
    second_progress.start()
    first_progress.join()
    second_progress.join()
    print(progress_dict)
Output:
{'bobby1': 22, 'bobby2': 23}