Python慢的主要原因
- 解释性语言, 需要边解释, 边执行
- GIL限制, 不能发挥多核cpu的性能
GIL
全局解释器锁 (Global Interpreter Lock)
- GIL是cpython解释器遗留的问题, 比如Jython解释器就没有GIL
- GIL使每个线程在执行的过程中都需要先获取GIL, 保证同一时刻只有一个线程可以执行代码.
- 线程释放GIL的情况: 在I/O操作等可能会引起阻塞的system call之前,可以暂时释放GIL, 阻塞完毕后, 必须重新获取GIL才能继续执行, Python3使用计时器(当执行时间达到阈值后, 当前线程释放GIL)
- Python中使用多进程可以利用多核CPU资源
例子: 内存回收机制的引用计数器和多线程冲突
针对python
多进程可以利用多核资源, 但是也占用了更多的资源, 进程之间的资源不能共享
多线程, 协程均只用了cpu的一个核, 占用更少的资源, 可以缓解i/o等待(网络/读写(cpu和磁盘速度不匹配问题))问题
计算密集型 : 选择多进程
i/o密集型 : 选择线程, 协程
进程和线程:
进程包含线程,一个进程中包含多个线程.
线程是cpu调度和分配的基本单位,进程是操作系统进行资源分配(cpu,内存,硬盘io等)的最小单位.
单核cpu:
实现多进程依靠于操作系统的进程调度算法,比如时间片轮转算法,比如有3个正在运行的程序(即三个进程),操作系统会让单核cpu轮流来运行这些进程,然后一个进程只运行2ms,这样看起来就像多个进程同时在运行,从而实现多进程.
多线程其实是最大限度的利用cpu资源.一个拥有两个线程的进程的执行时间可能比一个线程的进程执行两遍的时间还长一点,因为线程的切换也需要时间.即采用多线程可能不会提高程序的运行速度,反而会降低速度,但是对于用户来说,可以减少用户的响应时间.
多核cpu:
什么是多核cpu?多核cpu是一枚处理器中集成多个完整的计算引擎(内核).
多核cpu和单核cpu对于进程来说都是并发,并不是并行.
但是多核cpu每一个核心都可以独立执行一个线程,所以多核cpu可以真正实现多线程的并行.比如四核可以把线程1234分配给核心1234,如果还有线程567就要等待cpu的调度.线程1234属于并行;如果一会核心1停止执行线程1改为执行线程5,那线程15属于并发.
线程
多线程, 多线程通信, 生产者消费者模型
import queue
import time
import random
import threading
import requests
from bs4 import BeautifulSoup

# Target urls: pages 1-9 of the cnblogs site home (9 urls, not 50 as the
# original note claimed).
urls = [
    f"https://www.cnblogs.com/sitehome/p/{page}"
    for page in range(1, 10)
]


def craw(url):
    """Download *url* and return the raw html text."""
    r = requests.get(url)
    return r.text


def parse(html):
    """Extract (href, title) pairs from anchors with class post-item-title."""
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]


def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    """Producer: pull urls from url_queue, fetch them, push html to html_queue."""
    while True:
        url = url_queue.get()
        html = craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, f"craw {url}",
              "url_queue.size=", url_queue.qsize())
        time.sleep(random.randint(1, 2))


def do_parse(html_queue: queue.Queue, fout):
    """Consumer: pull html from html_queue, parse it, write results to fout."""
    while True:
        html = html_queue.get()
        results = parse(html)
        for result in results:
            fout.write(str(result) + "\n")
        print(threading.current_thread().name, f"results.size", len(results),
              "html_queue.size=", html_queue.qsize())
        time.sleep(random.randint(1, 2))


if __name__ == "__main__":
    # queue.Queue is the thread-safe channel between producers and consumers.
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in urls:
        url_queue.put(url)
    # 3 producer threads fetching pages.
    for idx in range(3):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
                             name=f"craw{idx}")
        t.start()
    # 2 consumer threads parsing and persisting results.
    # NOTE(review): the workers loop forever, so fout is never closed and the
    # process never exits on its own -- acceptable only as a demo.
    fout = open("02.data.txt", "w", encoding="utf-8")
    for idx in range(2):
        t = threading.Thread(target=do_parse, args=(html_queue, fout),
                             name=f"parse{idx}")
        t.start()
使用锁保证线程安全
import threading
import time

# One global lock guarding all balance mutations.
lock = threading.Lock()


class Account:
    """A bank account holding a mutable balance."""

    def __init__(self, balance):
        self.balance = balance


def draw(account, amount):
    """Withdraw *amount* from *account*; the lock makes the check-then-act
    sequence atomic across threads."""
    with lock:
        # Guard clause: insufficient funds -> report and bail out.
        if account.balance < amount:
            print(threading.current_thread().name,
                  "取钱失败,余额不足")
            return
        # Deliberate pause inside the critical section: without the lock this
        # window lets two threads both pass the balance check.
        time.sleep(0.1)
        print(threading.current_thread().name,
              "取钱成功")
        account.balance -= amount
        print(threading.current_thread().name,
              "余额", account.balance)
线程池
import concurrent.futures
import requests
from bs4 import BeautifulSoup

# Target urls: pages 1-9 of the cnblogs site home (9 urls, not 50 as the
# original note claimed).
urls = [
    f"https://www.cnblogs.com/sitehome/p/{page}"
    for page in range(1, 10)
]


def craw(url):
    """Download *url* and return the raw html text."""
    r = requests.get(url)
    return r.text


def parse(html):
    """Extract (href, title) pairs from anchors with class post-item-title."""
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]


# --- crawl stage ---
with concurrent.futures.ThreadPoolExecutor() as pool:
    # Way 1: map() submits every url and yields results in input order.
    htmls = pool.map(craw, urls)
    htmls = list(zip(urls, htmls))
    for url, html in htmls:
        print(url, len(html))
print("craw over")

# --- parse stage ---
with concurrent.futures.ThreadPoolExecutor() as pool:
    futures = {}
    for url, html in htmls:
        # Way 2: submit() returns one Future per task.
        future = pool.submit(parse, html)
        futures[future] = url
    # In submission order:
    # for future, url in futures.items():
    #     print(url, future.result())
    # In completion order -- whichever task finishes first is yielded first:
    for future in concurrent.futures.as_completed(futures):
        url = futures[future]
        print(url, future.result())
线程池加速web服务
import flask
import json
import time
from concurrent.futures import ThreadPoolExecutor

app = flask.Flask(__name__)
pool = ThreadPoolExecutor()


def read_file():
    """Simulated file read (0.1s)."""
    time.sleep(0.1)
    return "file result"


def read_db():
    """Simulated database query (0.2s)."""
    time.sleep(0.2)
    return "db result"


def read_api():
    """Simulated remote api call (0.3s)."""
    time.sleep(0.3)
    return "api result"


@app.route("/")
def index():
    """Run the three slow reads concurrently; with the pool the total latency
    is roughly the slowest single read instead of their sum."""
    futures = {
        "result_file": pool.submit(read_file),
        "result_db": pool.submit(read_db),
        "result_api": pool.submit(read_api),
    }
    return json.dumps({key: fut.result() for key, fut in futures.items()})


if __name__ == "__main__":
    app.run()
进程
对于cpu密集型单线程, 多线程, 多进程对比
import math
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time

# 20 copies of a large prime: a purely cpu-bound workload.
PRIMES = [112272535095293] * 20


def is_prime(n):
    """Return True if *n* is prime, by trial division over odd candidates."""
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    sqrt_n = int(math.floor(math.sqrt(n)))
    # Prime iff no odd divisor up to sqrt(n) divides n.
    return all(n % i != 0 for i in range(3, sqrt_n + 1, 2))


def single_thread():
    """Baseline: check every number sequentially."""
    for number in PRIMES:
        is_prime(number)


def multi_thread():
    """Threads: no speedup for cpu-bound work because of the GIL."""
    with ThreadPoolExecutor() as pool:
        pool.map(is_prime, PRIMES)


def multi_process():
    """Processes: true parallelism across cpu cores."""
    with ProcessPoolExecutor() as pool:
        pool.map(is_prime, PRIMES)


if __name__ == "__main__":
    for label, fn in (("single_thread", single_thread),
                      ("multi_thread", multi_thread),
                      ("multi_process", multi_process)):
        start = time.time()
        fn()
        end = time.time()
        print(f"{label}, cost:", end - start, "seconds")
    # Sample results:
    # single_thread, cost: 11.919013500213623 seconds
    # multi_thread, cost: 12.123728036880493 seconds
    # multi_process, cost: 3.5287539958953857 seconds
进程池加速web服务
import flask
from concurrent.futures import ProcessPoolExecutor
import math
import json

app = flask.Flask(__name__)


def is_prime(n):
    """Return True if *n* is prime, by trial division over odd candidates."""
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    limit = int(math.floor(math.sqrt(n)))
    return all(n % i != 0 for i in range(3, limit + 1, 2))


# Example:
# http://127.0.0.1:5000/is_prime/1111111111112111111111111111111111111,22222222222222222222222222
@app.route("/is_prime/<numbers>")
def api_is_prime(numbers):
    """Check a comma-separated list of numbers for primality, fanned out
    across the process pool."""
    number_list = [int(x) for x in numbers.split(",")]
    results = process_pool.map(is_prime, number_list)
    return json.dumps(dict(zip(number_list, results)))


if __name__ == "__main__":
    # The pool must be created before app.run() starts serving requests.
    process_pool = ProcessPoolExecutor()
    app.run()
协程
协程, 多线程, 单线程对比
import asyncio
import aiohttp
import time
import threading
import requests
from bs4 import BeautifulSoup

# Target urls: pages 1-49 of the cnblogs site home (49 urls, not 50 as the
# original note claimed).
urls = [
    f"https://www.cnblogs.com/sitehome/p/{page}"
    for page in range(1, 50)
]


# "async def" declares a coroutine.
async def async_craw(url):
    """Fetch *url* with aiohttp and print the length of the response body."""
    print("craw url: ", url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # "await" marks a point that may block; the event loop can
            # switch to another coroutine while this one waits.
            result = await resp.text()
            print(f"craw url: {url}, {len(result)}")


# Obtain the event loop and schedule one task per url.
loop = asyncio.get_event_loop()
tasks = [loop.create_task(async_craw(url)) for url in urls]

# --- coroutine timing ---
start = time.time()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print("asyncio cost:", end - start, "seconds")


def craw(url):
    """Download *url* and return the raw html text (blocking)."""
    r = requests.get(url)
    return r.text


def parse(html):
    """Extract (href, title) pairs from anchors with class post-item-title."""
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]


def single_thread():
    """Fetch all urls sequentially in the main thread."""
    print("single_thread begin")
    for url in urls:
        craw(url)
    print("single_thread end")


def multi_thread():
    """Fetch every url in its own thread, then wait for all of them."""
    print("multi_thread begin")
    threads = []
    for url in urls:
        threads.append(
            threading.Thread(target=craw, args=(url,))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")


# --- single-thread timing ---
start = time.time()
single_thread()
end = time.time()
print("single thread cost:", end - start, "seconds")

# --- multi-thread timing ---
start = time.time()
multi_thread()
end = time.time()
print("multi thread cost:", end - start, "seconds")
# Compared with coroutines, switching between threads has extra overhead.
使用Semaphore控制协程并发个数
import asyncio
import aiohttp
import time

# Allow at most 10 coroutines to fetch concurrently.
# NOTE(review): on Python >= 3.10 creating the Semaphore before the event
# loop is running can bind it to a different loop -- confirm against the
# target interpreter version.
semaphore = asyncio.Semaphore(10)

# Target urls: pages 1-49 of the cnblogs site home (49 urls, not 50 as the
# original note claimed).
urls = [
    f"https://www.cnblogs.com/sitehome/p/{page}"
    for page in range(1, 50)
]


async def async_craw(url):
    """Fetch *url*; the semaphore caps how many fetches run at once."""
    async with semaphore:
        print("craw url: ", url)
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                result = await resp.text()
                # Hold the semaphore for a while so the 10-at-a-time
                # batching is visible in the output.
                await asyncio.sleep(5)
                print(f"craw url: {url}, {len(result)}")


loop = asyncio.get_event_loop()
tasks = [loop.create_task(async_craw(url)) for url in urls]

start = time.time()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print("use time seconds: ", end - start)
参考文献: https://www.bilibili.com/video/BV1bK411A7tV?p=7