线程
操作线程的模块:
- _thread
- threading
创建线程
一个进程里面必然有一个主线程
import threading
import time


def job():
    """Demo task: show how many threads exist and which thread is current."""
    print("这是一个需要执行的任务。。。。。")
    # active_count() includes the main thread itself.
    print("当前线程的个数:", threading.active_count())
    print("当前线程的信息:", threading.current_thread())
    # Keep the thread alive for a long time so its state can be observed.
    time.sleep(100)


if __name__ == '__main__':
    job()
线程的生命周期图
实现多线程
方法一:实例化对象方式
import threading
import time


def job():
    """Demo task executed by each worker thread."""
    print("这是一个需要执行的任务。。。。。")
    print("当前线程的个数:", threading.active_count())
    time.sleep(1)
    print("当前线程的信息:", threading.current_thread())


if __name__ == '__main__':
    # When creating a thread, specify the task it should run via `target`.
    t1 = threading.Thread(target=job)
    t2 = threading.Thread(target=job)
    t1.start()
    t2.start()
    print(threading.active_count())
    print("程序执行结束.....")
出现的问题:主线程执行结束, 但是子线程还在运行。
解决问题:
import threading
import time


def job():
    """Demo task executed by each worker thread."""
    print("这是一个需要执行的任务。。。。。")
    print("当前线程的个数:", threading.active_count())
    time.sleep(1)
    print("当前线程的信息:", threading.current_thread())


if __name__ == '__main__':
    # When creating a thread, specify the task it should run via `target`.
    t1 = threading.Thread(target=job)
    t2 = threading.Thread(target=job)
    t1.start()
    t2.start()
    print(threading.active_count())
    # Wait for both worker threads to finish before the main thread continues.
    t1.join()
    t2.join()
    print("程序执行结束.....")
未使用多线程案例
import time
import json
from urllib.request import urlopen


def timeit(f):
    """Decorator that prints the wrapped function's wall-clock run time."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = f(*args, **kwargs)
        end_time = time.time()
        print("%s函数运行时间:%.2f" % (f.__name__, end_time - start_time))
        return res
    return wrapper


def get_addr(ip):
    """Query ip-api.com and print the city/country of *ip*."""
    url = "http://ip-api.com/json/%s" % (ip)
    urlObj = urlopen(url)
    pageContent = urlObj.read().decode('utf-8')
    dict_data = json.loads(pageContent)
    print("""
%s
所在城市: %s
所在国家: %s
""" % (ip, dict_data['city'], dict_data['country']))


@timeit
def main():
    """Look up 10 IPs sequentially (slow: each request blocks the next)."""
    ips = ['12.13.14.%s' % (i + 1) for i in range(10)]
    for ip in ips:
        get_addr(ip)


if __name__ == '__main__':
    main()
未使用多线程时依次访问url,运行速度较慢:
使用多线程(实例化对象)案例
import json
import threading
import time
from urllib.request import urlopen


def timeit(f):
    """Decorator that prints the wrapped function's wall-clock run time."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = f(*args, **kwargs)
        end_time = time.time()
        print('%s函数运行时间:%.2f' % (f.__name__, end_time - start_time))
        return res
    return wrapper


def get_addr(ip):
    """Query ip-api.com and print the city/country of *ip*."""
    url = 'http://ip-api.com/json/%s' % (ip)
    urlObj = urlopen(url)
    pageContent = urlObj.read().decode('utf-8')
    dict_pageContent = json.loads(pageContent)
    print("""
%s
所在城市:%s
所在国家:%s
""" % (ip, dict_pageContent['city'], dict_pageContent['country']))


@timeit
def main():
    """Look up 10 IPs concurrently, one thread per IP, then wait for all."""
    ips = ['12.13.14.%s' % (i + 1) for i in range(10)]
    threads = []
    for ip in ips:
        t = threading.Thread(target=get_addr, args=(ip,))
        threads.append(t)
        t.start()
    [thread.join() for thread in threads]


if __name__ == '__main__':
    main()
使用多线程时访问url不需要逐个访问,运行速度更快:
方法二:继承方式
import threading
import time


class MyThread(threading.Thread):
    """Thread subclass whose task lives in run()."""

    def __init__(self, jobName):
        # run() takes no arguments, so per-thread data is passed via __init__.
        super(MyThread, self).__init__()
        self.jobName = jobName

    def run(self):
        # start() invokes run(); whatever the thread should do goes here.
        print("这是一个需要执行的任务%s。。。。。" % (self.jobName))
        print("当前线程的个数:", threading.active_count())
        time.sleep(1)
        print("当前线程的信息:", threading.current_thread())


if __name__ == '__main__':
    t1 = MyThread("name1")
    t2 = MyThread("name2")
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    print("程序执行结束.....")
使用多线程(继承)案例
import json
import threading
import time
from urllib.request import urlopen


def timeit(f):
    """Decorator that prints the wrapped function's wall-clock run time."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = f(*args, **kwargs)
        end_time = time.time()
        print('%s函数运行时间:%.2f' % (f.__name__, end_time - start_time))
        return res
    return wrapper


class MyThread(threading.Thread):
    """Thread that looks up one IP's location via ip-api.com."""

    def __init__(self, ip):
        super(MyThread, self).__init__()
        self.ip = ip

    def run(self):
        url = 'http://ip-api.com/json/%s' % (self.ip)
        urlObj = urlopen(url)
        pageContent = urlObj.read().decode('utf-8')
        dict_pageContent = json.loads(pageContent)
        print("""
%s
所在城市:%s
所在国家:%s
""" % (self.ip, dict_pageContent['city'], dict_pageContent['country']))


@timeit
def main():
    """Look up 10 IPs concurrently using the Thread subclass."""
    ips = ['12.13.14.%s' % (i + 1) for i in range(10)]
    threads = []
    for ip in ips:
        t = MyThread(ip)
        threads.append(t)
        t.start()
    [thread.join() for thread in threads]


if __name__ == '__main__':
    main()
多线程案例
import threading
import time


def timeit(f):
    """Decorator that prints the wrapped function's wall-clock run time."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = f(*args, **kwargs)
        end_time = time.time()
        print('%s函数运行时间:%.2f' % (f.__name__, end_time - start_time))
        return res
    return wrapper


def download_music(name):
    """Pretend to download a song (2 seconds of work)."""
    time.sleep(2)
    print('download music %s ......' % (name))


def download_movie(name):
    """Pretend to download a movie (2 seconds of work)."""
    time.sleep(2)
    print('download movie %s ......' % (name))


@timeit
def main():
    # Run both downloads concurrently: total time is ~2s instead of ~4s.
    t1 = threading.Thread(target=download_music, args=('Someone like you',))
    t2 = threading.Thread(target=download_movie, args=('无名之辈',))
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    print('download finish ......')


if __name__ == '__main__':
    main()
线程与GIL
python使用多线程, 一定运行速度快么? 为什么?
多线程的应用场景: I/O密集型(input, output) ---- 爬虫
不建议使用多线程的场景:计算密集型(cpu一直占用)
因为传说中的GIL(全局解释锁)的存在,导致了同一时刻内,python的线程只有一条在CPU里面运行。
GIL锁:
- GIL(global interpreter lock);
- 作用:限制多线程同时执行,保证同一时间内只有一个线程在执行;
- 执行过程:
- 1). 设置一个GIL;
- 2). 切换线程去准备执行任务(Runnale就绪状态);
- 3). 运行;
- 4). 可能出现的状态:
- 线程任务执行结束
- time.sleep()
- 需要获取其他的信息才能继续执行(eg:读取文件,需要从网络下载html网页)
- 5). 将线程设置为睡眠状态;
- 6). 解GIL的锁;
import threading
from mytimeit import timeit  # timing decorator defined in a local module


def job(li):
    return sum(li)


@timeit
def use_thread():
    """Sum the list in five threads (no speedup for CPU-bound work: GIL)."""
    li = [i for i in range(1, 10001)]
    threads = []
    for i in range(5):
        t = threading.Thread(target=job, args=(li,))
        t.start()
        threads.append(t)
    [thread.join() for thread in threads]


@timeit
def use_no_thread():
    """Sum the same range five times sequentially, for comparison."""
    li = range(1, 10001)
    for i in range(5):
        job(li)


if __name__ == '__main__':
    use_thread()
    use_no_thread()
线程与队列
- 理论上多线程执行任务是不能获取返回结果的, 因此需要一个容器来存储产生的数据;
- 容器该如何选择?
    - list ---- 列表既可以当队列(先进先出)使用,也可以当栈(后进先出)使用,但本身不区分两种语义;
    - tuple ---- 元组是不可变数据类型, 不可使用;
    - set ---- 集合默认会去重, 所以不选择;
    - dict ---- 返回结果不一定就是字典类型;
- 最终选择队列类型存储(FIFO ==== first in first out,先进先出)
import threading
from queue import Queue

from mytimeit import timeit  # timing decorator defined in a local module


def job(li, queue):
    # Threads cannot return values directly; push the result onto the queue.
    queue.put(sum(li))


@timeit
def use_thread():
    # One queue collects every thread's result: put() enqueues, get() dequeues.
    q = Queue()
    lis = [range(5), range(2, 10), range(1000, 20000), range(3000, 10000)]
    threads = []
    for li in lis:
        t = threading.Thread(target=job, args=(li, q))
        t.start()
        threads.append(t)
    [thread.join() for thread in threads]
    # Drain one result per submitted task.
    results = [q.get() for li in lis]
    print(results)


if __name__ == '__main__':
    use_thread()
生产者-消费者模型
什么是生产者-消费者模型?
生产者:某个模块专门负责生产数据,可以认为是工厂;
消费者:另一个模块负责对生产的数据进行处理,可以认为是顾客;
缓冲区:在生产者和消费者之间加个缓冲区(队列queue实现),可以认为是商店;
生产者 ----> 缓冲区 -----> 消费者
优点:
1). 解耦:生产者和消费者的依赖关系减少;
2). 支持并发:是两个独立的个体, 可并发执行。
实现生产者-消费者模型:
需求: 给定200个ip地址,可能开放端口为80、443、7001、7002、8000、8080、9000(flask)、9001,以http://ip:port形式访问页面以判断是否正常访问
分析:
实现页面访问:
1). 构建所有的url地址;===存储到一个数据结构中
2). 依次判断url址是否可以成功访问
实现多线程(两种方法):
1). 实例化对象threading.Thread
2). 自定义类, 继承threading.Thread, 重写run方法(存储任务程序)
类的继承实现生产者-消费者模型:
import threading
import time
from queue import Queue
from urllib.request import urlopen


def create_data():
    """Create test data: write 200 IPs into doc/ips.txt."""
    with open('doc/ips.txt', 'w') as f:
        for i in range(200):
            f.write('172.25.254.%s\n' % (i + 1))
    print('测试数据创建完成!')


class Producer(threading.Thread):
    """Producer: builds http://ip:port test URLs and puts them on the queue."""

    def __init__(self, queue):
        super(Producer, self).__init__()
        self.q = queue

    def run(self):
        ports = [80, 443, 7001, 7002, 8000, 8080, 9000]
        with open('doc/ips.txt') as f:
            for line in f:
                ip = line.strip()
                for port in ports:
                    url = 'http://%s:%s' % (ip, port)
                    time.sleep(2)
                    self.q.put(url)
                    print('生产者生产url:%s' % (url))


class Consumer(threading.Thread):
    """Consumer: takes ONE url off the queue and probes whether it responds."""

    def __init__(self, queue):
        super(Consumer, self).__init__()
        self.q = queue

    def run(self):
        url = self.q.get()
        try:
            urlObj = urlopen(url)
        except Exception as e:
            print('%s不可访问' % (url))
        else:
            pageContentSize = len(urlObj.read().decode('utf-8'))
            print('%s可以访问,页面大小为%s' % (url, pageContentSize))


def main():
    # NOTE(review): the producer generates 200 ips x 7 ports = 1400 urls but
    # only 400 one-shot consumers are started, so some urls are never
    # consumed -- confirm whether this mismatch is intentional.
    q = Queue()
    p = Producer(q)
    p.start()
    for i in range(400):
        c = Consumer(q)
        c.start()


if __name__ == '__main__':
    # Create the test data before the first run.
    # create_data()
    main()
线程同步之线程锁
为什么需要线程锁?
多个线程对同一个数据进行修改时, 可能会出现不可预料的情况.
示例:模拟银行存钱和取钱的过程
import threading


def add():
    # `money` lives at module level; declare it global so we can mutate it.
    global money
    for i in range(100000):
        money += 1


def reduce():
    global money
    for i in range(100000):
        money -= 1


if __name__ == '__main__':
    money = 0
    t1 = threading.Thread(target=add)
    t2 = threading.Thread(target=reduce)
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    # Without a lock the interleaved += / -= can lose updates, so the
    # final value is usually NOT the expected 0.
    print('当前金额:%s' % (money))
当对money加100000再减100000后,money应仍为初值0,但是结果却有些出乎意料:
如何实现线程锁?
- 实例化一个锁对象:lock = threading.Lock()
- 操作变量之前进行加锁:lock.acquire()
- 操作变量之后进行解锁:lock.release()
实现线程锁
实例化对象方式
import threading


def add():
    # `money` lives at module level; declare it global so we can mutate it.
    global money
    for i in range(100000):
        # 2. Acquire the lock before touching the shared variable.
        lock.acquire()
        money += 1
        # 3. Release the lock after the update.
        lock.release()


def reduce():
    global money
    for i in range(100000):
        lock.acquire()
        money -= 1
        lock.release()


if __name__ == '__main__':
    money = 0
    # 1. Create a single lock object shared by both threads.
    lock = threading.Lock()
    t1 = threading.Thread(target=add)
    t2 = threading.Thread(target=reduce)
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    print('当前金额:%s' % (money))
类的继承方式
import threading


class AddThread(threading.Thread):
    """Adds 1 to the global `money` 100000 times, under the shared lock."""

    def __init__(self, lock):
        super(AddThread, self).__init__()
        self.lock = lock

    def run(self):
        global money
        for i in range(100000):
            self.lock.acquire()
            money += 1
            self.lock.release()


class ReduceThread(threading.Thread):
    """Subtracts 1 from the global `money` 100000 times, under the shared lock."""

    def __init__(self, lock):
        super(ReduceThread, self).__init__()
        self.lock = lock

    def run(self):
        global money
        for i in range(100000):
            self.lock.acquire()
            money -= 1
            self.lock.release()


if __name__ == '__main__':
    money = 0
    # Both threads must share the SAME lock object.
    lock = threading.Lock()
    t1 = AddThread(lock)
    t2 = ReduceThread(lock)
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    print('当前金额:%s' % (money))
多线程下载器
下载器
from urllib.request import urlopen

DOWNLOAD_DIR = 'doc'


def download(url):
    """Download *url* and save it under DOWNLOAD_DIR, named after the URL's
    last path component.  Prints an error and gives up on any failure."""
    try:
        imgContent = urlopen(url, timeout=3).read()
    except Exception as e:
        print('download %s error: ' % (url), e)
        imgContent = None
    else:
        # File name: everything after the last '/' of the URL.
        filename = url.split('/')[-1]
        # 'wb': the payload is binary data (image, video, pdf, ...).
        with open('%s/%s' % (DOWNLOAD_DIR, filename), 'wb') as f:
            f.write(imgContent)
        print('%s下载成功' % (filename))


if __name__ == '__main__':
    # BUGFIX: the demo call originally ran at module level, so merely
    # importing this file triggered a network download; guard it instead.
    url = 'http://pic34.photophoto.cn/20150127/0006019093196381_b.jpg'
    download(url)
doc目录下找下载成功的图片:0006019093196381_b.jpg
当文件特别大时,可以分块下载:
from urllib.request import urlopen

DOWNLOAD_DIR = 'doc'


def download(url):
    """Download *url* in fixed-size chunks so huge files don't fill memory."""
    try:
        urlObj = urlopen(url, timeout=3)
    except Exception as e:
        print('download %s error: ' % (url), e)
    else:
        filename = url.split('/')[-1]
        # BUGFIX: the file handle is opened exactly once, so 'wb' truncates
        # only once and the chunk writes below append within the same handle.
        # The original 'ab' (justified by a wrong claim that 'wb' would
        # overwrite earlier chunks) appended to any leftover file from a
        # previous run, producing a corrupted download.
        with open('%s/%s' % (DOWNLOAD_DIR, filename), 'wb') as f:
            # Read a bounded chunk at a time to keep memory usage flat.
            while True:
                imgContentChunk = urlObj.read(1024 * 3)
                if not imgContentChunk:
                    break
                f.write(imgContentChunk)
        print('%s下载成功' % (filename))


if __name__ == '__main__':
    # Guarded so importing this module performs no network I/O.
    url = 'http://pic34.photophoto.cn/20150127/0006019093196381_b.jpg'
    download(url)
实现多线程下载器
当你创建用户界面并想保持界面的可用性时,线程就特别有用,没有线程,用户界面将变得迟钝;
当你下载一个大文件或者执行一个庞大的数据库查询命令时用户界面会长时间无响应。
为了防止这样情况发生,你可以使用多线程来处理运行时间长的进程并且在完成后返回界面进行交互。
from urllib.request import urlopen
import threading

DOWNLOAD_DIR = 'doc'


class DownloadThread(threading.Thread):
    """Thread that downloads one URL in chunks into DOWNLOAD_DIR."""

    def __init__(self, url):
        super(DownloadThread, self).__init__()
        self.url = url

    def run(self):
        try:
            urlObj = urlopen(self.url, timeout=3)
        except Exception as e:
            print('download %s error\n' % (self.url), e)
        else:
            # File name: everything after the last '/' of the URL.
            filename = self.url.split('/')[-1]
            with open('%s/%s' % (DOWNLOAD_DIR, filename), 'ab') as f:
                # Chunked reads keep memory usage flat for large files.
                while True:
                    imgContentChunk = urlObj.read(1024 * 3)
                    if not imgContentChunk:
                        break
                    f.write(imgContentChunk)
            print('%s下载成功' % (filename))


if __name__ == '__main__':
    # BUGFIX: the original started the download threads at module level, so
    # merely importing this file kicked off network I/O; guard the demo and
    # join the threads so the script waits for every download to finish.
    url1 = 'ftp://172.25.254.250/pub/book/python/01_python.pdf'
    url2 = 'ftp://172.25.254.250/pub/book/python/01_interpy-zh.pdf'
    url3 = 'ftp://172.25.254.250/pub/book/python/01_python-doc-27-34.pdf'
    urls = [url1, url2, url3]
    threads = []
    for url in urls:
        thread = DownloadThread(url)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
线程池
线程池: 看作是一个池子, 只放指定个线程(10个线程)
为什么要使用线程池?
线程在创建和销毁过程中需要耗费资源和时间,使用线程池减少在创建和销毁线程上所花的时间以及系统资源的开销
submit方法
# python3.2版本之后才有的;
from concurrent.futures import ThreadPoolExecutor
def job(num):
# 需要执行的任务
print("这是一个%s任务" %(num))
return "执行结果:%s" %(num)
if __name__ == '__main__':
# 1. 实例化线程池对象,线程池里面包含5个线程执行任务
pool = ThreadPoolExecutor(max_workers=5)
futures = []
for i in range(10):
# 2. 往线程池里面扔需要执行的任务, 返回的是一个对象(_base.Future())
f1 = pool.submit(job, i)
futures.append(f1)
# 判断第一个任务是否执行结束
print(futures[0].done())
# 获取第一个任务的执行结果
print(futures[0].result())
任务执行结果:
submit应用
线程池里面的线程越多越好吗?
示例:多线程获取ip的地理位置
import json
import time
from concurrent.futures import ThreadPoolExecutor, wait
from urllib.request import urlopen


def timeit(f):
    """Decorator that prints the wrapped function's wall-clock run time."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = f(*args, **kwargs)
        end_time = time.time()
        print('%s函数运行时间:%.8f' % (f.__name__, end_time - start_time))
        return res
    return wrapper


def get_area(ip):
    """Query ip-api.com and print the city/country for *ip*."""
    url = 'http://ip-api.com/json/%s' % (ip)
    urlObj = urlopen(url)
    pageContent = urlObj.read().decode('utf-8')
    dict_pageContent = json.loads(pageContent)
    print("""
%s
所在城市:%s
所在国家:%s
""" % (ip, dict_pageContent['city'], dict_pageContent['country']))


@timeit
def use_ten_thread():
    """Resolve 30 IPs with a pool of 10 worker threads."""
    pool = ThreadPoolExecutor(max_workers=10)
    futures = []
    for i in range(30):
        ip = '12.13.14.%s' % (i + 1)
        f = pool.submit(get_area, ip)
        futures.append(f)
    # Like join(): block until every submitted future has finished.
    wait(futures)


@timeit
def use_hundred_thread():
    """Resolve 30 IPs with 100 workers (more threads isn't always faster)."""
    pool = ThreadPoolExecutor(max_workers=100)
    futures = []
    for i in range(30):
        ip = '12.13.14.%s' % (i + 1)
        f = pool.submit(get_area, ip)
        futures.append(f)
    wait(futures)


if __name__ == '__main__':
    use_ten_thread()
    use_hundred_thread()
10个线程:
100个线程:
map方法
from concurrent.futures import ThreadPoolExecutor


def job(num):
    """Task run by the pool; returns a printable result string."""
    print('这是任务%s' % (num))
    return '执行结果:%s' % (num)


if __name__ == '__main__':
    pool = ThreadPoolExecutor(max_workers=5)
    nums = [item for item in range(10)]
    # map() schedules job(n) for every n, like the builtin map.
    pool.map(job, nums)
map应用
import time
import json
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen


def timeit(f):
    """Decorator that prints the wrapped function's wall-clock run time."""
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = f(*args, **kwargs)
        end_time = time.time()
        print("%s函数运行时间:%.2f" % (f.__name__, end_time - start_time))
        return res
    return wrapper


def get_area(ip):
    """Query ip-api.com and print the city/country for *ip*."""
    url = "http://ip-api.com/json/%s" % (ip)
    urlObj = urlopen(url)
    pageContent = urlObj.read().decode('utf-8')
    dict_data = json.loads(pageContent)
    print("""
%s
所在城市: %s
所在国家: %s
""" % (ip, dict_data['city'], dict_data['country']))


@timeit
def use_ten_thread():
    """Resolve 30 IPs via map() on a pool of 10 worker threads."""
    # NOTE(review): map() submits all tasks but this function returns without
    # consuming the results, so the printed time may not cover task
    # completion -- confirm whether that is intended for the benchmark.
    pool = ThreadPoolExecutor(max_workers=10)
    ips = ['12.13.14.%s' % (ip + 1) for ip in range(30)]
    pool.map(get_area, ips)


@timeit
def use_hundred_thread():
    """Resolve 30 IPs via map() on a pool of 100 worker threads."""
    pool = ThreadPoolExecutor(max_workers=100)
    ips = ['12.13.14.%s' % (ip + 1) for ip in range(30)]
    pool.map(get_area, ips)


if __name__ == '__main__':
    use_ten_thread()
    use_hundred_thread()
线程池实现生产者-消费者模型
from urllib.request import urlopen
from concurrent.futures import ThreadPoolExecutor


def create_data():
    """Create test data: write 200 IPs into doc/ips.txt."""
    with open('doc/ips.txt', 'w') as f:
        for i in range(200):
            f.write('172.25.254.%s\n' % (i + 1))
    print('测试数据创建完成!')


def producer(url):
    """Produce one url to test (http://ip:port) and hand it back."""
    print('生产者生产url:%s' % (url))
    return url


def consumer(future):
    """Done-callback: read the producer's url off the future and probe it."""
    url = future.result()
    try:
        urlObj = urlopen(url)
    except Exception as e:
        print('%s不可访问' % (url))
    else:
        pageContentSize = len(urlObj.read().decode('utf-8'))
        print('%s可以访问,页面大小为%s' % (url, pageContentSize))


def main():
    pool = ThreadPoolExecutor(max_workers=5)
    ports = [80, 443, 7001, 7002, 8000, 8080, 9000]
    with open('doc/ips.txt') as f:
        for line in f:
            ip = line.strip()
            for port in ports:
                url = 'http://%s:%s' % (ip, port)
                # BUGFIX: Future.add_done_callback() returns None, so the
                # original `res = pool.submit(...).add_done_callback(...)`
                # always bound None.  Keep the Future and register the
                # callback on it as two explicit steps.
                future = pool.submit(producer, url)
                future.add_done_callback(consumer)
    # Wait for all queued work to finish before returning.
    pool.shutdown(wait=True)


if __name__ == '__main__':
    # Create the test data before the first run.
    # create_data()
    main()