1、创建一个简单的线程
# -*- encoding:utf-8 -*-
# 如果中文运行不了加上上面那个注释
# 导入线程库
import threading
# 准备一个函数
def say_hello(arg1, arg2):
    """Greet both arguments on stdout, one line each."""
    for name in (arg1, arg2):
        print(f'hello {name}!')
# Positional args go through the `args` tuple, keyword args through `kwargs`.
t = threading.Thread(
    target=say_hello,
    args=('张三',),
    kwargs={'arg2': '李四'},
)
# Equivalent form:
# t = threading.Thread(target=say_hello, args=('张三', '李四'))
t.start()   # launch the thread
t.join()    # wait for it to finish
threading.Thread 中:
- 形参 args 需传入一个元组,为函数提供位置实参
- 形参 kwargs 需传入一个字典,键是函数的形参名,值是对应的实参
- 若写成 args=('张三', '李四'),则必须删除 kwargs,否则 arg2 会被重复赋值而报错
2、用网络爬虫体现线程加速
利用Python去爬取某个网站50个页面的数据
分别用单线程和多线程来完成,并记录从开始到结束的时间
# -*- encoding:utf-8 -*-
# 如果中文运行不了加上上面那个注释
import threading
import requests
import time
# Links for the 50 pages to crawl.
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]
# Fetch a single page (network I/O via the `requests` package).
def craw(url):
    """Download *url*; result is discarded — used only for timing."""
    response = requests.get(url)
    # print(url, len(response.text))
# Crawl every page sequentially on the calling thread.
def single_thread():
    """Fetch all pages in `urls` one after another."""
    print('start single thread')
    for page_url in urls:
        craw(page_url)
    print('end single thread')
# Crawl with one thread per page.
def multi_thread():
    """Fetch all pages in `urls` concurrently, one thread per URL."""
    # Fixed typo: the original printed 'tart multi thread'.
    print('start multi thread')
    workers = [threading.Thread(target=craw, args=(url,)) for url in urls]
    for worker in workers:
        worker.start()
    # Join after all starts so the fetches actually overlap.
    for worker in workers:
        worker.join()
    print('end multi thread')
# Rough wall-clock comparison of the two approaches.
begin = time.time()
single_thread()
elapsed = time.time() - begin
print('single:' + "%.2f" % elapsed + 's\n')

begin = time.time()
multi_thread()
elapsed = time.time() - begin
print('multi:' + "%.2f" % elapsed + 's')
3、队列实现线程同步
3.1、阻塞的方式
用于多线程同步的queue,当队列为空时,取元素会被阻塞
import queue
# 创建队列
# Create the queue (a thread-safe FIFO for cross-thread handoff).
q = queue.Queue()
# Put an element on the queue.
q.put('123')
# Take an element off the queue.
# get() blocks while the queue is empty.
item = q.get()
生产者负责往队列里放元素
消费者负责从队列里取元素
本程序先启动消费者,由于队列此时没有元素,会被阻塞,直到生产者往里面添加元素后才运行
# -*- encoding:utf-8 -*-
import random
import threading
import time
import queue
def produce(num, proc_queue: queue.Queue):
    """Producer: forever build products tagged with *num* and enqueue them.

    Runs until the process exits (infinite loop); sleeps 3-5 s per item.
    """
    while True:
        product = f'This is number {num} product'
        # threading.currentThread() is deprecated and removed in
        # Python 3.13 — use threading.current_thread() instead.
        print(f'{threading.current_thread().name} '
              f'sizep:{proc_queue.qsize()}')
        time.sleep(random.randint(3, 5))
        proc_queue.put(product)
def consume(proc_queue: queue.Queue):
    """Consumer: forever take products off the queue and print them.

    get() blocks while the queue is empty, which is what synchronizes
    the consumers with the producers.
    """
    while True:
        product = proc_queue.get()
        # threading.currentThread() is deprecated and removed in
        # Python 3.13 — use threading.current_thread() instead.
        print(f'{threading.current_thread().name} '
              f'sizec:{proc_queue.qsize()}')
        print(product)
        time.sleep(random.randint(1, 2))
product_queue = queue.Queue()
# Start the 4 consumers first: they block on the empty queue until the
# producers (started 2 s later) begin adding items.
for cid in range(4):
    threading.Thread(target=consume,
                     args=(product_queue,),
                     name=f'consume:{cid}').start()
time.sleep(2)
for pid in range(2):
    threading.Thread(target=produce,
                     args=(pid, product_queue),
                     name=f'producer:{pid}').start()
3.2、不阻塞的方式
使用get(block=False)并且忽略队列为空异常
# -*- encoding:utf-8 -*-
import threading
import time
import queue
def produce(proc_queue: queue.Queue):
    """Put the integers 1..10 on the queue, one per second, then report."""
    for item in range(1, 11):
        time.sleep(1)
        proc_queue.put(item)
    print('over')
def consume(proc_queue: queue.Queue):
    """Poll the queue forever; print each item, ignore empty polls.

    Uses get(block=False) + queue.Empty so the loop never blocks.
    """
    while True:
        time.sleep(0.5)
        try:
            item = proc_queue.get(block=False)
        except queue.Empty:
            # Nothing available this tick — just poll again.
            pass
        else:
            print(item)
        print('consume')
product_queue = queue.Queue()
# One polling consumer, one finite producer; wait for the producer.
consumer = threading.Thread(target=consume, args=(product_queue,))
consumer.start()
producer = threading.Thread(target=produce, args=(product_queue,))
producer.start()
producer.join()
4、加锁实现线程安全
先看个例子
模拟取钱,正常情况要取的金额小于余额才能取钱成功
# -*- encoding:utf-8 -*-
import threading
import time
class Account:
    """A bank account holding a mutable balance."""

    def __init__(self, balance):
        # Funds currently available for withdrawal.
        self.balance = balance
# Simulate a withdrawal (deliberately NOT thread-safe).
def take_money(acc: Account, num):
    """Withdraw *num* from *acc* if the balance covers it."""
    if acc.balance < num:
        print("余额不足")
        return
    # The sleep widens the window between the balance check and the
    # deduction, making the race between threads easy to observe.
    time.sleep(1)
    print("取钱成功")
    acc.balance -= num
    print(f"余额为:{acc.balance}")
if __name__ == "__main__":
account = Account(1000)
t1 = threading.Thread(target=take_money,
args=(account, 800))
t2 = threading.Thread(target=take_money,
args=(account, 800))
t1.start()
t2.start()
由于两个线程都会在余额被扣减之前通过余额检查,两次取钱都会"成功",余额最终变成 -600,这可以通过加锁来避免
加锁有两种方式:
- try-finally
- with
import threading

# Pattern 1: try/finally — release() always runs, even if the
# critical section raises. (The original snippet had comment-only
# bodies, which is a SyntaxError; `...` makes it runnable.)
lock = threading.Lock()
lock.acquire()
try:
    ...  # critical section
finally:
    lock.release()

# Pattern 2: with-statement — acquire/release handled automatically.
lock = threading.Lock()
with lock:
    ...  # critical section
- 继续上面的例子
# -*- encoding:utf-8 -*-
import threading
import time
lock = threading.Lock()
class Account:
    """Bank account with a plain numeric balance."""

    def __init__(self, balance):
        self.balance = balance  # funds available to withdraw
def take_money(acc: Account, num):
    """Withdraw *num* from *acc*; the shared lock serializes the
    check-then-deduct sequence so only one thread can run it at a time."""
    with lock:
        if acc.balance < num:
            print("余额不足")
            return
        time.sleep(1)
        print("取钱成功")
        acc.balance -= num
        print(f"余额为:{acc.balance}")
if __name__ == "__main__":
account = Account(1000)
t1 = threading.Thread(target=take_money,
args=(account, 800))
t2 = threading.Thread(target=take_money,
args=(account, 800))
t1.start()
t2.start()
这个例子不是很好,上面的程序加了锁以后就相当于变成单线程了
和直接调用两次take_money没有区别
5、线程池
使用线程池的好处:
- 提升性能,减去了大量新建/终止线程的开销
- 适合处理突发的大量请求,但实际的任务处理时间很短的场景
5.1、map函数
map输出的结果和传入的参数顺序对应
必须要给线程传入参数,即使该线程不需要传参
# -*- encoding:utf-8 -*-
import concurrent.futures as cf
import requests
# The 50 page links to crawl.
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]
# Fetch one page; returns (response, url) so callers can pair each
# result with the URL it came from.
def craw(url):
    """Download *url* and return the response together with the url."""
    response = requests.get(url)
    return response, url
    # print(url, len(response.text))
# Hand the whole job to a thread pool.
# map(func, iterable) needs the argument list prepared up front and
# yields results in the same order as the inputs.
with cf.ThreadPoolExecutor() as pool:
    results = pool.map(craw, urls)
    # craw returns a (response, url) pair per page.
    for response, page_url in results:
        print(page_url, len(response.text))
5.2、submit函数
如果线程不需要传参,可以不传
submit 返回的是 Future 对象(任务的"凭证"),需要调用 future.result() 才能拿到 craw 的返回值
这与 map 直接按传入顺序产出返回值不同
# -*- encoding:utf-8 -*-
import concurrent.futures as cf
import requests
# Build the 50 page links to crawl.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 51)
]
# Fetch one page and return (response, url).
def craw(url):
    """Download *url*; the url is echoed back for result pairing."""
    response = requests.get(url)
    return response, url
    # print(url, len(response.text))
# Hand tasks to the pool one at a time.
# submit(func, *args) schedules a single call and immediately returns
# a Future for it.
with cf.ThreadPoolExecutor() as pool:
    futures = [pool.submit(craw, url) for url in urls]
    # as_completed yields futures in *completion* order, i.e. whichever
    # task finishes first comes out first — not submission order.
    for future in cf.as_completed(futures):
        print(future.result())
    # To keep submission order instead, iterate the list directly:
    # for future in futures:
    #     response, page_url = future.result()
    #     print(response, page_url)
参考:【2021最新版】Python 并发编程实战
原作者:蚂蚁学Python