python asyncio模块
1.事件循环
asyncio是python用于解决异步io编程的一整套解决方案
事件循环+回调(驱动生成器)+epoll(io多路复用)
基于asyncio有tornado、gevent、twisted(scrapy、django channels)
tornado是事件循环+协程,实现了web服务器
django+flask不提供web服务器,会加(uwsgi、gunicorn+nginx)
tornado可以直接部署,但还会加nginx
# 使用asyncio
# Basic asyncio usage: drive several coroutines concurrently on one event loop.
import asyncio
import time


async def get_html(url):
    """Simulated fetch: asyncio.sleep stands in for network I/O."""
    print("start get url")
    # An awaitable must be awaited inside a coroutine; a bare
    # asyncio.sleep(2) would only create the coroutine object and warn.
    await asyncio.sleep(2)
    print("end get url")


if __name__ == "__main__":
    begin = time.time()
    event_loop = asyncio.get_event_loop()
    coros = [get_html("http://www.imooc.com") for _ in range(10)]
    event_loop.run_until_complete(asyncio.wait(coros))
    print(time.time() - begin)
注意不能使用 time.sleep(),这种同步阻塞的接口不能用在协程里面,协程是单线程的,用同步阻塞就真的全阻塞了,和顺序执行差不多
wait如果ctrl+b进入后会发现是一个协程,满足协程wait驱动的协程后会执行loop.run_until_complete后的代码
协程有点像线程池,可以理解为协程池,协程也应该能获取任务状态
Future类型是对于具体的Runnable或者Callable任务的执行结果进行取消、查询是否完成、获取结果的接口。必要时可以通过get方法获取执行结果,该方法会阻塞直到任务返回结果。
# Attaching a completion callback to a task and reading the task's result.
import asyncio
import time
from functools import partial


async def get_html(url):
    """Simulated fetch that resolves to a value readable via task.result()."""
    print("start get url")
    await asyncio.sleep(2)
    return "bobby"


def callback(url, future):
    # add_done_callback always passes the finished future as the LAST
    # argument, so extra arguments bound with partial must come first.
    print(url)
    print("send email to bobby")


if __name__ == "__main__":
    begin = time.time()
    event_loop = asyncio.get_event_loop()
    # create_task returns a Task (a Future subclass) already registered on
    # this thread's loop; asyncio.ensure_future would be equivalent here.
    task = event_loop.create_task(get_html("http://www.imooc.com"))
    task.add_done_callback(partial(callback, "http://www.imooc.com"))
    event_loop.run_until_complete(task)  # accepts a task or a plain future
    print(task.result())  # the coroutine's return value
start get url
http://www.imooc.com
send email to bobby
bobby
关于wait和gather
# wait vs gather: gather is higher level — it preserves order, supports
# grouping awaitables, and a whole group can be cancelled at once.
import asyncio
import time


async def get_html(url):
    """Simulated fetch used by both groups."""
    print("start get url")
    await asyncio.sleep(2)
    print("end get url")


if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    # gather takes positional awaitables, hence the * when passing a list.
    group1 = asyncio.gather(*[get_html("http://projectsedu.com") for i in range(2)])
    group2 = asyncio.gather(*[get_html("http://www.imooc.com") for i in range(2)])
    group2.cancel()
    # BUGFIX: without return_exceptions=True, the cancelled group2 makes the
    # outer gather raise CancelledError out of run_until_complete, so the
    # timing line below would never run. (The original also built a `tasks`
    # list of 10 coroutines that was never awaited — dead code removed.)
    loop.run_until_complete(asyncio.gather(group1, group2, return_exceptions=True))
    print(time.time() - start_time)
2.协程取消与嵌套
1. run_until_complete
# Illustration only: the two ways to drive an event loop.
import asyncio
loop = asyncio.get_event_loop()
loop.run_forever() # runs until loop.stop() is called — never stops on its own
loop.run_until_complete() # stops once the given awaitable finishes (illustrative — it actually requires an argument)
2. loop本身也会被放到任务的future中,目的是当执行完future任务之后启动一个回调函数,
回调函数停止future中的loop;这里有个环,loop里面有future,future里面有loop
3. 取消future(task)
# Cancelling pending tasks on KeyboardInterrupt, then letting the loop run
# one more time so the cancellations are actually processed before close().
async def get_html(sleep_times):
print("waiting")
await asyncio.sleep(sleep_times)
print("done after {}s".format(sleep_times))
if __name__ == "__main__":
task1 = get_html(2)
task2 = get_html(3)
task3 = get_html(3)
tasks = [task1, task2, task3]
loop = asyncio.get_event_loop()
try:
loop.run_until_complete(asyncio.wait(tasks))
except KeyboardInterrupt as e:
# NOTE(review): Task.all_tasks() was deprecated in 3.7 and removed in
# 3.9 — on modern Python use asyncio.all_tasks(loop) instead.
all_tasks = asyncio.Task.all_tasks()
print(all_tasks)
for task in all_tasks:
print(task)
print("cancel task")
print(task.cancel())  # True if the cancellation request was delivered
loop.stop()
loop.run_forever() # must run the loop again here, otherwise close() raises
finally:
loop.close()
命令行执行,ctrl+c打断
<Task pending coro=<wait() running at C:\Program Files\Python37\lib\asyncio\tasks.py:389> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x000001482C779A08>()]>>
cancel task
True
<Task pending coro=<get_html() running at temp.py:6> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x000001482C779B28>()]> cb=[_wait.<locals>._on_completion() at C:\Program Files\Python37\lib\asyncio\tasks.py:466]>
cancel task
True
<Task pending coro=<get_html() running at temp.py:6> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x000001482C794138>()]> cb=[_wait.<locals>._on_completion() at C:\Program Files\Python37\lib\asyncio\tasks.py:466]>
cancel task
True
<Task pending coro=<get_html() running at temp.py:6> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x000001482C7C13A8>()]> cb=[_wait.<locals>._on_completion() at C:\Program Files\Python37\lib\asyncio\tasks.py:466]>
cancel task
True
关于嵌套协程,
await就像 yield from,为调用方和子生成器建立了通道,当子生成器实例一个awaitable的对象,可以由通道将执行权还给loop
3.asyncio中的其他函数
call_soon
# call_soon: schedule a plain (non-coroutine) callback for the next loop pass.
import asyncio


def callback(sleep_times):
    print("sleep {} success".format(sleep_times))


if __name__ == '__main__':
    event_loop = asyncio.get_event_loop()
    # Appended to the ready queue — runs "soon" (next iteration), not immediately.
    event_loop.call_soon(callback, 2)
    event_loop.run_forever()  # note: callback is an ordinary function, not a coroutine
call_soon会在下一个循环立即执行,不是马上执行
执行完任务后,loop不会停止。停止需要
# Stopping the loop from a callback: stoploop is queued right after callback.
import asyncio


def callback(sleep_times):
    print("sleep {} success".format(sleep_times))


def stoploop(loop):
    loop.stop()


if __name__ == '__main__':
    event_loop = asyncio.get_event_loop()
    event_loop.call_soon(callback, 2)
    event_loop.call_soon(stoploop, event_loop)
    event_loop.run_forever()  # returns once stoploop has run
call_later
指定时间后运行
call_soon可以在call_later前运行
# call_later schedules by delay; already-queued call_soon callbacks run first.
# (callback/stoploop are reused from the previous snippet.)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.call_later(1, callback, 1) # fires after 1 second
loop.call_later(3, callback, 3) # fires after 3 seconds
loop.call_soon(callback, 2)
# loop.call_soon(stoploop, loop)
loop.run_forever() # plain callbacks again, not coroutines
sleep 2 success
sleep 1 success
sleep 3 success
call_at
系统时间
# call_at schedules against the loop's internal monotonic clock (loop.time()).
def callback(sleep_times, loop):
    print("loop time {} success".format(loop.time()))


def stoploop(loop):
    loop.stop()


if __name__ == "__main__":
    event_loop = asyncio.get_event_loop()
    base = event_loop.time()
    # Registration order does not matter — only the deadlines do.
    for delay in (2, 1, 3):
        event_loop.call_at(base + delay, callback, delay, event_loop)
    # event_loop.call_soon(stoploop, event_loop)
    event_loop.call_soon(callback, 4, event_loop)
    event_loop.run_forever()
loop time 7305.859 success
loop time 7306.859 success
loop time 7307.859 success
loop time 7308.859 success
call_soon_threadsafe
线程安全
在选择的线程的loop的任务列表尾部插入任务
用于变量共享
以上方法比较底层
4.threadpoolexecutor+asyncio
线程+协程,牛逼大了
异步io包括多线程、协程、进程,async提供了完整解决方案
使用多线程
之前socket的一篇有过一段是用socket实现请求url的,这里会用到
# Using a thread pool: integrate blocking socket I/O into the event loop.
import asyncio
from concurrent.futures import ThreadPoolExecutor
import socket
from urllib.parse import urlparse


def get_url(url):
    """Fetch *url* over a blocking socket and print the response body.

    Runs inside a ThreadPoolExecutor worker, so the blocking connect/recv
    calls stall only that worker thread, never the event loop.
    """
    parts = urlparse(url)
    host = parts.netloc
    path = parts.path or "/"
    # BUGFIX: the socket was never closed on error paths; the context
    # manager guarantees close() even if connect/recv raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.connect((host, 80))  # blocking, but only blocks this worker
        client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
        data = b""
        while True:
            chunk = client.recv(1024)
            if not chunk:
                break
            data += chunk
    # BUGFIX: split("\r\n\r\n")[1] raised IndexError on header-only replies
    # and truncated any body containing a blank line; partition returns the
    # whole remainder after the first header/body separator.
    html_data = data.decode("utf8").partition("\r\n\r\n")[2]
    print(html_data)


if __name__ == "__main__":
    import time
    start_time = time.time()
    loop = asyncio.get_event_loop()
    executor = ThreadPoolExecutor(3)
    tasks = []
    for i in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(i)
        # run_in_executor wraps the pool's future into an asyncio future
        # registered on this loop — no second loop is needed.
        tasks.append(loop.run_in_executor(executor, get_url, url))
    loop.run_until_complete(asyncio.wait(tasks))
    print("last time:{}".format(time.time()-start_time))
确实是多线程了
看看源码
# CPython source excerpt (BaseEventLoop.run_in_executor) — quoted for study.
def run_in_executor(self, executor, func, *args):
self._check_closed()
if self._debug:
self._check_callback(func, 'run_in_executor')
if executor is None:
executor = self._default_executor
if executor is None:
executor = concurrent.futures.ThreadPoolExecutor()
self._default_executor = executor # lazily create and cache a default thread pool
return futures.wrap_future(
executor.submit(func, *args), loop=self) # wrap the concurrent future as an asyncio future on this loop
协程套线程
把一个网页的获取完全包装在一个线程里
5.asyncio模拟http请求
如果纯手打,可能要 事件循环+回调(处理send和recv)+协程(多个url)+线程(内部建立连接)
如果是 I/O 密集型,且 I/O 请求比较耗时的话,使用协程。
如果是 I/O 密集型,且 I/O 请求比较快的话,使用多线程。
如果是 计算 密集型,考虑可以使用多核 CPU,使用多进程。
asyncio没有提供http级别的接口
aiohttp可以
下面模拟aiohttp
注意上上一段代码单个url的获取是在一条完整线程内的,包括请求网页和处理网页,其中请求网页有io操作,这里又可以外包给协程
所以层次是:协程套线程套协程
好消息是asyncio提供了一个建立连接的方法await asyncio.open_connection()
open_connection中的重要逻辑
# CPython source excerpt: the key logic inside asyncio.open_connection —
# builds a StreamReader/StreamWriter pair on top of loop.create_connection.
if loop is None:
loop = events.get_event_loop()
reader = StreamReader(limit=limit, loop=loop)
protocol = StreamReaderProtocol(reader, loop=loop)
transport, _ = await loop.create_connection(
lambda: protocol, host, port, **kwds)
writer = StreamWriter(transport, protocol, reader, loop)
return reader, writer
封装了一个reader类和writer类并返回,主要还是await loop.create_connection,看看其主要逻辑
其中有
infos = await self._ensure_resolved(
(host, port), family=family,
type=socket.SOCK_STREAM, proto=proto, flags=flags, loop=self)
关于ensure_resolved又有
# CPython source excerpt: _ensure_resolved short-circuits when the host is
# already an IP literal; otherwise it falls back to getaddrinfo.
async def _ensure_resolved(self, address, *,
family=0, type=socket.SOCK_STREAM,
proto=0, flags=0, loop):
host, port = address[:2]
info = _ipaddr_info(host, port, family, type, proto, *address[2:])
if info is not None:
# "host" is already a resolved IP.
return [info]
else:
# Name resolution needed — delegate to the loop's getaddrinfo.
return await loop.getaddrinfo(host, port, family=family, type=type,
proto=proto, flags=flags)
其中的loop.getaddrinfo可以看到
# CPython source excerpt: DNS resolution is blocking, so the loop pushes it
# into the default thread pool via run_in_executor.
async def getaddrinfo(self, host, port, *,
family=0, type=0, proto=0, flags=0):
if self._debug:
getaddr_func = self._getaddrinfo_debug
else:
getaddr_func = socket.getaddrinfo
return await self.run_in_executor(
None, getaddr_func, host, port, family, type, proto, flags)
loop.getaddrinfo解析url
最后的run_in_executor上面已经有了,会把线程池包装成未来对象放入loop日程
这里io会比较快,选线程比较好
这个未来对象作为ensure_resolved的返回值给了info
info所在函数接着运行,最终还是有
await self.sock_connect(sock, address)
给这个躲不开的建立连接的时间放了协程
使用await asyncio.open_connection()
:
# Pure-coroutine version: asyncio.open_connection replaces the raw socket,
# so no thread pool is needed at all.
import asyncio
from concurrent.futures import ThreadPoolExecutor
import socket
from urllib.parse import urlparse


async def get_url(url):
    """Fetch *url* through asyncio streams and return the raw response text."""
    parts = urlparse(url)
    host = parts.netloc
    path = parts.path or "/"
    # open_connection yields a (StreamReader, StreamWriter) pair.
    reader, writer = await asyncio.open_connection(host, 80)
    writer.write("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
    # async for drives the reader's __anext__ coroutine, yielding one raw
    # line per iteration until the connection closes.
    all_lines = [raw_line.decode("utf8") async for raw_line in reader]
    return "\n".join(all_lines)


if __name__ == "__main__":
    import time
    start_time = time.time()
    event_loop = asyncio.get_event_loop()
    pending = []
    for i in range(20):
        pending.append(get_url("http://shop.projectsedu.com/goods/{}/".format(i)))
    event_loop.run_until_complete(asyncio.wait(pending))
    print("last time:{}".format(time.time()-start_time))
last time:0.5226032733917236
快了大概几倍
tasks里边存的是协程对象
如果要在外边输出,需要将协程对象包装成未来对象,未来对象可以获取结果
if __name__ == "__main__":
    import time
    start_time = time.time()
    event_loop = asyncio.get_event_loop()
    # ensure_future wraps each coroutine in a Task (a Future), so each
    # result can still be read after the loop finishes.
    futures = []
    for i in range(20):
        futures.append(asyncio.ensure_future(
            get_url("http://shop.projectsedu.com/goods/{}/".format(i))))
    event_loop.run_until_complete(asyncio.wait(futures))
    for fut in futures:
        print(fut.result())
    print("last time:{}".format(time.time()-start_time))
......
last time:0.43235325813293457
快的一笔
如果想获取一个html就打印一个html呢?
async def main():
    """Print each page as soon as its fetch completes, not in submit order."""
    pending = [asyncio.ensure_future(
        get_url("http://shop.projectsedu.com/goods/{}/".format(i)))
        for i in range(20)]
    # as_completed mirrors the thread-pool API: it yields awaitables in
    # completion order; awaiting one gives that task's result directly.
    for fut in asyncio.as_completed(pending):
        print(await fut)


if __name__ == "__main__":
    import time
    start_time = time.time()
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())
    print("last time:{}".format(time.time()-start_time))
6.关于future和task
asyncio的future和线程的future差不多一样
future是一个结果容器,task是future的子类,是协程和future之间桥梁,解决了协程的激活问题,处理了stopiteration的值
保持了接口的一致
7.asyncio的同步和通信
asyncio不需要锁
# Single-threaded coroutines need no lock here: neither coroutine ever
# awaits, so each runs to completion without interleaving — net result 0.
total = 0


async def add():
    global total
    for _ in range(1000000):
        total += 1


async def desc():
    global total
    for _ in range(1000000):
        total -= 1


if __name__ == '__main__':
    import asyncio
    jobs = [add(), desc()]
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(asyncio.wait(jobs))
    print(total)
0
由于是单线程,不涉及await操作/io操作,都会执行完上一段代码再执行下一段代码
但有时候我们需要lock来进行同步,比如避免一个网址同一时间被访问两次的可能
import asyncio
from asyncio import Lock
import aiohttp

cache = {}
lock = Lock()


async def get_stuff(url):
    """Fetch *url* at most once; concurrent callers for the same URL hit the cache.

    BUGFIX: the original acquired the lock manually and returned from the
    cache-hit branch WITHOUT releasing it, deadlocking every later caller.
    Lock implements the async context-manager protocol, so `async with`
    guarantees release on every exit path (return or exception).
    """
    async with lock:
        if url in cache:
            return cache[url]
        stuff = await aiohttp.request('GET', url)
        cache[url] = stuff
        return stuff


async def parse_stuff(stuff):
    url_stuff = await get_stuff(stuff)


async def use_stuff(stuff):
    url_stuff = await get_stuff(stuff)
lock的源码:
内部申请了一个未来对象,并加入waiters队列,然后用await fut阻塞控制权,控制权会交由上一级,即lock锁住的代码;直到release被调用,waiters中的fut才被置为true,acquire的代码才继续运行到完成
# CPython source excerpt: asyncio.Lock.acquire — quoted for study.
async def acquire(self):
"""Acquire a lock.
This method blocks until the lock is unlocked, then sets it to
locked and returns True.
"""
if not self._locked and all(w.cancelled() for w in self._waiters):
self._locked = True
return True
fut = self._loop.create_future()
self._waiters.append(fut)
# Finally block should be called before the CancelledError
# handling as we don't want CancelledError to call
# _wake_up_first() and attempt to wake up itself.
try:
try:
# Suspends here — control returns to the loop until release()
# resolves this future and wakes the waiter up.
await fut
finally:
self._waiters.remove(fut)
except futures.CancelledError:
if not self._locked:
self._wake_up_first()
raise
self._locked = True
return True
就是加了一层await使得控制权不会被交直接给loop,从而保证顺序执行lock锁住的代码
asyncio有自己的Queue
由于是单线程,如果不是为了限流,声明一个全局的queue就好
8.aiohttp爬虫
由于jobbole已经挂了,以下代码仅供参考
# aiohttp crawler example (jobbole.com is offline — for reference only).
import aiohttp
import aiomysql
from pyquery import PyQuery
import asyncio
import re
stopping = False
start_url = "http://www.jobbole.com"
waitting_urls = []
seen_urls = set() # dedupe set of URLs already fetched or scheduled
sem = asyncio.Semaphore(3) # cap: at most 3 concurrent HTTP requests
async def fetch(url, session):
    """GET *url* under the global semaphore; return body text, or None on
    error / non-2xx status."""
    async with sem:
        try:
            async with session.get(url) as resp:
                print('url status: ', resp.status)
                if resp.status not in (200, 201):
                    return None
                return await resp.text()
        except Exception as e:
            print(e)
def extract_urls(html):
    """Collect unseen absolute links from *html*, queue them on
    waitting_urls, and return them."""
    found = []
    doc = PyQuery(html)
    for anchor in doc.items("a"):
        href = anchor.attr("href")
        if href and href.startswith("http") and href not in seen_urls:
            found.append(href)
            waitting_urls.append(href)
    return found
async def init_urls(url, session):
    """Seed the crawl: fetch *url*, mark it seen, queue its outgoing links."""
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)
# 真正逻辑
async def consumer(pool):
    """Main crawl loop: drain waitting_urls forever — article pages go to
    article_handler, everything else is re-seeded through init_urls."""
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if not waitting_urls:
                # Queue is empty: yield to the loop, then poll again.
                await asyncio.sleep(0.5)
                continue
            url = waitting_urls.pop()
            print("start get url: {} ".format(url))
            # Article detail pages match the numeric-id pattern.
            if re.match('http://.*?jobbole.com/\d+/', url):
                if url not in seen_urls:
                    asyncio.ensure_future(article_handler(url, session, pool))
                    # await asyncio.sleep(30)  # debugging aid
            elif url not in seen_urls:
                asyncio.ensure_future(init_urls(url, session))
async def article_handler(url, session, pool):
    """Fetch an article page, queue its outgoing links, and persist its title."""
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)
    pq = PyQuery(html)
    title = pq("title").text()
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute("SELECT 42;")
            # BUGFIX: "insert to" is invalid SQL (must be INSERT INTO), and
            # str.format-ing the title into the statement allowed SQL
            # injection — use a parameterized query instead.
            await cur.execute(
                "insert into article_test(title) values(%s)", (title,))
async def main(loop):
    """Create the MySQL pool, seed the crawl from start_url, start consumer."""
    # charset/autocommit are the two easy-to-miss settings: without them you
    # get garbled text and inserts that are never committed.
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                      user='root', password='',
                                      db='aiomysql_test', loop=loop,
                                      charset='utf8', autocommit=True,
                                      )
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        extract_urls(html)
        asyncio.ensure_future(consumer(pool))
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    # BUGFIX: ensure_future only SCHEDULES main(); without running the loop
    # nothing ever executes. run_forever actually drives the crawler.
    loop.run_forever()
到这里,所有菜鸟进阶已经完成;
下一步,你可以:
1.自行查看学习python官方文档https://docs.python.org/zh-tw/3/index.html
2.开始python爬虫和各种框架的学习
3.继续深入学习计算机知识,学习源码,开发自己的库
😙❤
2020.8.3