asyncio和 aiohttp
asyncio(即 Asynchronous I/O)是 Python 中用来处理并发(concurrent)事件的包,是很多 Python 异步架构的基础,多用于处理高并发网络请求方面的问题。
为了简化并更好地标识异步IO,从Python 3.5开始引入了新的语法async和await,可以让coroutine的代码更简洁易读。
asyncio 被用作多个提供高性能 Python 异步框架的基础,包括网络和网站服务,数据库连接库,分布式任务队列等等。
asyncio 往往是构建 IO 密集型和高层级结构化网络代码的最佳选择。
import asyncio
async def task(i):
    """Print a start marker, pause for one second, then print an end marker.

    Args:
        i: Label embedded in both messages so concurrent tasks can be told
            apart in the output.
    """
    pause_seconds = 1
    print(f"task {i} start")
    # Awaiting the sleep suspends only this coroutine.
    await asyncio.sleep(pause_seconds)
    print(f"task {i} end")
# Create the event loop explicitly: asyncio.get_event_loop() is deprecated
# when no loop is running.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    # Wrap each coroutine in a Task before handing it to asyncio.wait();
    # passing bare coroutine objects to wait() is rejected since Python 3.11.
    tasks = [loop.create_task(task(1)), loop.create_task(task(2))]
    # asyncio.wait collects the tasks (similar in purpose to asyncio.gather).
    # run_until_complete blocks until every task has finished.
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    # Always release the loop, even if a task raised.
    loop.close()
Task
: 任务,是对协程对象的进一步封装,包含任务的各个状态。asyncio.Task 是 Future 的一个子类,用于实现协作式多任务;Task 对象不应由用户手动实例化,而应通过 loop.create_task() 或 asyncio.ensure_future() 这两个函数创建。
import asyncio, time
async def work(i, n):
    """Simulate a job that waits ``n`` seconds and then yields a result.

    Args:
        i: Job number shown in the printed messages.
        n: Number of seconds to sleep.

    Returns:
        The sum ``i + n``.
    """
    waiting_msg = '任务{}等待: {}秒'.format(i, n)
    print(waiting_msg)
    await asyncio.sleep(n)  # suspend this coroutine for n seconds
    done_msg = '任务{}在{}秒后返回结束运行'.format(i, n)
    print(done_msg)
    return i + n
start_time = time.time()  # wall-clock start

# Create the loop first and bind every future to it explicitly. Calling
# asyncio.ensure_future() before any loop exists relies on the deprecated
# implicit loop lookup.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    tasks = [asyncio.ensure_future(work(1, 1), loop=loop),
             asyncio.ensure_future(work(2, 2), loop=loop),
             asyncio.ensure_future(work(3, 3), loop=loop)]
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    # Close the loop even when one of the jobs raised.
    loop.close()

print('运行时间: ', time.time() - start_time)
# Results stay available on the Task objects after the loop is closed.
for task in tasks:
    print('任务执行结果: ', task.result())
3.7版本+ 特性
asyncio.run()
运行协程
asyncio.create_task()
创建task
asyncio.gather()
并发运行多个可等待对象,并按传入顺序收集返回值
import asyncio, time
async def work(i, n):
    """Announce the job, sleep for ``n`` seconds, and return ``i + n``.

    Args:
        i: Identifier of the job, used in the printed messages.
        n: Sleep duration in seconds.

    Returns:
        ``i + n``.
    """
    report = print
    report('任务{}等待: {}秒'.format(i, n))
    # Sleeping through asyncio lets other tasks run in the meantime.
    await asyncio.sleep(n)
    report('任务{}在{}秒后返回结束运行'.format(i, n))
    outcome = i + n
    return outcome
# Module-level holder: main() rebinds it so the code that runs after
# asyncio.run() can still read each task's result.
tasks = []


async def main():
    """Start three ``work`` jobs as tasks and wait until all have finished.

    The created tasks are published through the module-level ``tasks`` list.
    """
    global tasks
    tasks = [asyncio.create_task(work(number, number)) for number in (1, 2, 3)]
    # Suspends main() until every task is done.
    await asyncio.wait(tasks)
# Time the whole run, then report every task's return value.
start_time = time.time()
asyncio.run(main())
elapsed = time.time() - start_time
print('运行时间: ', elapsed)
for task in tasks:
    outcome = task.result()
    print('任务执行结果: ', outcome)
asyncio.create_task() 函数在 Python 3.7 中被加入。
asyncio.gather
方法
# 用gather()收集返回值
import asyncio, time
async def work(i, n):
    """Pretend to do ``n`` seconds of I/O for job ``i``.

    Args:
        i: Job number (appears in the log lines).
        n: Seconds to wait before finishing.

    Returns:
        ``i + n``.
    """
    job, delay = i, n
    print('任务{}等待: {}秒'.format(job, delay))
    await asyncio.sleep(delay)  # non-blocking pause
    print('任务{}在{}秒后返回结束运行'.format(job, delay))
    return job + delay
async def main():
    """Run three ``work`` jobs concurrently and print their results."""
    pending = [asyncio.create_task(work(number, number)) for number in (1, 2, 3)]
    # gather() waits for every task and returns the results in argument order.
    response = await asyncio.gather(*pending)
    print("异步任务结果:", response)
# Measure how long the three concurrent jobs take in total.
start_time = time.time()
asyncio.run(main())
elapsed = time.time() - start_time
print('运行时间: ', elapsed)
aiohttp
爬虫最常用的模块是 requests,但它以阻塞方式发起请求:每次请求发出后都必须等待响应返回,期间不能做其他事情。本文要介绍的 aiohttp 可以理解为与 requests 对应的 Python 异步网络请求库,它是基于 asyncio 的异步模块,可用于实现异步爬虫,优点就是比基于 requests 的同步爬虫更快。安装方式:pip install aiohttp。
aiohttp 是一个基于 asyncio、为 Python 提供异步 HTTP 客户端/服务端编程能力的异步库。asyncio 可以实现单线程并发 IO 操作,其实现了 TCP、UDP、SSL 等协议,aiohttp 就是基于 asyncio 实现的 HTTP 框架,使用方式如下。
import aiohttp
import asyncio
async def main():
    """Request http://httpbin.org/headers and print the response body as text."""
    # One statement enters both context managers: the session first, then
    # the response obtained from it.
    async with aiohttp.ClientSession() as session, \
            session.get("http://httpbin.org/headers") as response:
        body = await response.text()
        print(body)


asyncio.run(main())
案例
import asyncio
import os
import aiohttp
import time
from utils.aiorequests import aiorequest
from lxml import etree
# Request headers carrying a desktop-Chrome User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}
# NOTE(review): this host differs from base_url, which the crawl helpers
# prepend to every extracted link - confirm which start page is intended.
url = "https://www.pkdoutu.com/photo/list/"
# Prefix concatenated in front of every href/src taken from the pages.
base_url = "https://www.xr02.vip/"
async def get_home_url():
    """Fetch the start page and return the links listed on it.

    Returns:
        A lazy iterable of URLs: every ``href`` found under
        ``//ul/li[@class="i_list list_n2"]/a`` prefixed with ``base_url``.
    """
    async with aiohttp.ClientSession() as session:
        # Fetch base_url rather than the module-level ``url``: the XPath and
        # the link prefix below both target base_url's site, while ``url``
        # points at a different host.
        async with session.get(base_url, headers=headers, ssl=False) as resp:
            res = await resp.content.read()
    selector = etree.HTML(res)
    urls = selector.xpath('//ul/li[@class="i_list list_n2"]/a/@href')
    return map(lambda i: base_url + i, urls)
async def get_page_url(urls):
    """Download one page and return the pagination links found on it.

    Args:
        urls: URL of the single page to fetch (the name is historical).

    Returns:
        A lazy iterable of the distinct ``//div[@class="page"]/a`` hrefs,
        each prefixed with ``base_url``.
    """
    def absolute(href):
        return base_url + href

    async with aiohttp.ClientSession() as session, \
            session.get(urls, headers=headers, ssl=False) as resp:
        payload = await resp.content.read()
    tree = etree.HTML(payload)
    unique_hrefs = set(tree.xpath('//div[@class="page"]/a/@href'))
    return map(absolute, unique_hrefs)
async def get_img_url(urls):
    """Download one page and extract its title and image links.

    Args:
        urls: URL of the single page to fetch (the name is historical).

    Returns:
        A ``(name, image_urls)`` pair: the first ``<h1>`` text with the
        "[XiuRen秀人网]" marker removed, and a lazy iterable of the
        ``//p/img`` ``src`` values prefixed with ``base_url``.

    Raises:
        IndexError: If the page contains no ``<h1>`` text.
    """
    def absolute(src):
        return base_url + src

    async with aiohttp.ClientSession() as session, \
            session.get(urls, headers=headers, ssl=False) as resp:
        payload = await resp.content.read()
    tree = etree.HTML(payload)
    heading = tree.xpath("//h1/text()")[0]
    name = heading.replace("[XiuRen秀人网]", '')
    sources = tree.xpath('//p/img/@src')
    return name, map(absolute, sources)
async def download_img(urls, base_name):
    """Download one image and store it in the ``./imgs`` directory.

    Args:
        urls: URL of the single image to fetch (the name is historical).
        base_name: Prefix joined with the URL's basename by ``_`` to build
            the file name.

    Returns:
        ``"success"`` when the file was written, ``"failed"`` otherwise.
    """
    name = base_name + '_' + os.path.basename(urls)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(urls, headers=headers, ssl=False) as resp:
                # Reject 4xx/5xx answers so an error page is never saved
                # as if it were the image.
                resp.raise_for_status()
                res = await resp.content.read()
        with open(f"./imgs/{name}", "wb") as f:
            f.write(res)
    except Exception as exc:
        # Best effort: one broken image must not stop the other downloads.
        # ``Exception`` (unlike a bare except) lets task cancellation and
        # KeyboardInterrupt propagate.
        print(f"url: {urls} 下载失败: {exc!r}")
        return "failed"
    print(f"url: {urls} 下载成功,存储文件为{name}")
    return "success"
async def main():
    """Crawl in three concurrent stages: page links, image links, downloads."""
    # Stage 1: collect the pagination links of every start-page entry.
    home_links = await get_home_url()
    page_jobs = [asyncio.create_task(get_page_url(link)) for link in home_links]
    page_batches = await asyncio.gather(*page_jobs)
    page_links = [page for batch in page_batches for page in batch]

    # Stage 2: resolve each page into (name, image urls).
    image_jobs = [asyncio.create_task(get_img_url(page)) for page in page_links]
    albums = await asyncio.gather(*image_jobs)

    # Stage 3: one download task per image.
    download_jobs = [
        asyncio.create_task(download_img(image, title))
        for title, images in albums
        for image in images
    ]
    await asyncio.gather(*download_jobs)
if __name__ == '__main__':
    # Make sure the download directory exists before any task writes to it.
    target_dir = "./imgs"
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)
    start = time.time()
    asyncio.run(main())
    # Total crawl duration in seconds.
    elapsed = time.time() - start
    print(elapsed)
通过这个案例可以看到一个问题:每次使用 aiohttp 都需要写一堆重复代码,整个代码结构看起来也比较复杂。作为一名高级开发,必须学会减少重复代码的编写,将其模块化并封装起来。
优化方案
utils/aiorequests.py(文件路径需与下文的 from utils.aiorequests import aiorequest 保持一致)
from typing import Any, Dict, Optional, Union

import aiohttp


class AioRequest:
    """Small helper that hides the aiohttp session boilerplate.

    Every call opens a fresh ``aiohttp.ClientSession``, performs a single
    request and closes the session again. Certificate verification is
    disabled (``ssl=False``) unless the caller supplies its own ``ssl``
    argument.
    """

    async def request(self, method: str, url: str,
                      data: Union[Dict, bytes, None] = None,
                      **kwargs: Any) -> Any:
        """Send one HTTP request and return the decoded response body.

        Args:
            method: HTTP method name, e.g. ``"GET"``.
            url: Target URL.
            data: Optional request body.
            **kwargs: Extra arguments forwarded to ``session.request``.

        Returns:
            ``bytes`` when the Content-Type mentions ``image`` or ``video``,
            the parsed JSON when it mentions ``application/json``, and the
            body as text otherwise (including when the header is absent).

        Raises:
            Exception: If the response status is not 200.
        """
        # Keep the default, but let callers override it; hard-coding ssl
        # next to **kwargs would raise TypeError if a caller passed ssl too.
        kwargs.setdefault("ssl", False)
        async with aiohttp.ClientSession() as session:
            async with session.request(method, url, data=data, **kwargs) as response:
                if response.status != 200:
                    raise Exception(f"{method.upper()} request failed with status {response.status}")
                # The response object cannot be returned as-is; its body
                # is consumed here, inside the context manager.
                # A missing header becomes '' so the substring tests below
                # never run against None.
                content_type = (response.headers.get('Content-Type') or '').lower()
                if 'image' in content_type or 'video' in content_type:
                    return await response.read()  # raw binary payload
                if 'application/json' in content_type:
                    return await response.json()
                return await response.text()

    async def get(self, url: str, **kwargs: Any):
        """Shortcut for ``request("GET", url, **kwargs)``."""
        return await self.request("GET", url, **kwargs)

    async def post(self, url: str, data: Union[Dict, bytes], **kwargs: Any):
        """Shortcut for ``request("POST", url, data=data, **kwargs)``."""
        return await self.request("POST", url, data=data, **kwargs)

    async def save_binary_content(self, url: str, file_path: str,
                                  headers: Optional[Dict[str, str]] = None,
                                  chunk_size: int = 1024,
                                  **kwargs: Any):
        """Stream a (possibly large) download to ``file_path`` chunk by chunk.

        Args:
            url: Resource to download with GET.
            file_path: Destination file, opened in binary write mode.
            headers: Optional default headers for the session.
            chunk_size: Number of bytes requested per read (default 1024).
            **kwargs: Extra arguments forwarded to ``session.get``.

        Raises:
            Exception: If the response status is not 200.
        """
        kwargs.setdefault("ssl", False)
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url, **kwargs) as response:
                if response.status != 200:
                    raise Exception(f"GET request failed with status {response.status}")
                with open(file_path, 'wb') as f:
                    # Never hold the whole body in memory at once.
                    async for chunk in response.content.iter_chunked(chunk_size):
                        f.write(chunk)


# Shared module-level instance so callers do not create one per request.
aiorequest = AioRequest()
使用 aiorequest 后,代码就简洁明了多了:
import asyncio
import os
import time
from utils.aiorequests import aiorequest
from lxml import etree
# Request headers carrying a desktop-Chrome User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}
# Site root: fetched as the start page and prepended to extracted links.
base_url = "https://www.xr02.vip/"
# Maps a page title to the list of image URLs collected for it; filled by
# get_img_url() and read by main().
img_urls_dict = dict()
async def get_home_url():
    """Fetch the start page and return the links listed on it.

    Returns:
        A lazy iterable of URLs: every ``href`` found under
        ``//ul/li[@class="i_list list_n2"]/a`` prefixed with ``base_url``.
    """
    # Send the module's User-Agent headers; without them the dict defined
    # above is never used and requests go out with aiohttp's defaults.
    res = await aiorequest.get(base_url, headers=headers)
    selector = etree.HTML(res)
    urls = selector.xpath('//ul/li[@class="i_list list_n2"]/a/@href')
    return map(lambda i: base_url + i, urls)
async def get_page_url(urls):
    """Process one page and return the other pagination links on it.

    The page's own images are recorded through ``get_img_url`` so the
    caller does not have to fetch this page a second time.

    Args:
        urls: URL of the single page to fetch (the name is historical).

    Returns:
        A list of distinct ``//div[@class="page"]/a`` hrefs prefixed with
        ``base_url``, in document order, excluding ``urls`` itself.
    """
    res = await aiorequest.get(urls, headers=headers)
    await get_img_url(res)
    selector = etree.HTML(res)
    hrefs = selector.xpath('//div[@class="page"]/a/@href')
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    candidates = dict.fromkeys(base_url + href for href in hrefs)
    # Filtering instead of list.remove(): remove() raises ValueError when
    # the current page is not among its own pagination links.
    return [link for link in candidates if link != urls]
async def get_img_url(res):
    """Extract the title and image links from page markup and record them.

    The links are appended to ``img_urls_dict`` under the page title; the
    function returns nothing.

    Args:
        res: Page markup as accepted by ``etree.HTML``.

    Raises:
        IndexError: If the markup contains no ``<h1>`` text.
    """
    tree = etree.HTML(res)
    heading = tree.xpath("//h1/text()")[0]
    name = heading.replace("[XiuRen秀人网]", '')
    found = [base_url + src for src in tree.xpath('//p/img/@src')]
    # Creates the entry on first sight of the title, extends it afterwards.
    img_urls_dict.setdefault(name, []).extend(found)
async def get_imgs_url(urls):
    """Fetch one page and record its image links via ``get_img_url``.

    Args:
        urls: URL of the single page to fetch (the name is historical).
    """
    # Pass the module's User-Agent headers, which were otherwise unused.
    res = await aiorequest.get(urls, headers=headers)
    await get_img_url(res)
async def download_img(urls, base_name):
    """Download one image and store it in the ``./imgs_2`` directory.

    Args:
        urls: URL of the single image to fetch (the name is historical).
        base_name: Prefix joined with the URL's basename by ``_`` to build
            the file name.

    Returns:
        ``"success"`` when the file was written, ``"failed"`` otherwise.
    """
    name = base_name + '_' + os.path.basename(urls)
    try:
        res = await aiorequest.get(urls, headers=headers)
        # aiorequest returns bytes only for image/video content types;
        # anything else (text, parsed JSON) is not an image.
        if not isinstance(res, (bytes, bytearray)):
            raise TypeError(f"expected binary image data, got {type(res).__name__}")
        with open(f"./imgs_2/{name}", "wb") as f:
            f.write(res)
    except Exception as exc:
        # Best effort: one broken image must not stop the other downloads.
        # ``Exception`` (unlike a bare except) lets task cancellation and
        # KeyboardInterrupt propagate.
        print(f"url: {urls} 下载失败: {exc!r}")
        return "failed"
    print(f"url: {urls} 下载成功,存储文件为{name}")
    return "success"
async def main():
    """Crawl in three concurrent stages and download every image found.

    Stage 1 visits each start-page link and collects further page links,
    stage 2 visits those pages to fill ``img_urls_dict``, and stage 3
    downloads every recorded image.
    """
    tasks_1 = [asyncio.create_task(get_page_url(i)) for i in await get_home_url()]
    result_1 = await asyncio.gather(*tasks_1)
    result_list = [page for pages in result_1 for page in pages]

    # gather() accepts an empty argument list, whereas asyncio.wait()
    # raises ValueError when given no tasks. return_exceptions=True keeps
    # wait()'s best-effort behaviour: a failing page does not abort the rest.
    tasks_2 = [asyncio.create_task(get_imgs_url(i)) for i in result_list]
    await asyncio.gather(*tasks_2, return_exceptions=True)

    tasks_3 = [
        asyncio.create_task(download_img(img, name))
        for name, img_list in img_urls_dict.items()
        for img in img_list
    ]
    await asyncio.gather(*tasks_3, return_exceptions=True)
if __name__ == '__main__':
    # Make sure the download directory exists before any task writes to it.
    target_dir = "./imgs_2"
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)
    start = time.time()
    asyncio.run(main())
    # Total crawl duration in seconds.
    elapsed = time.time() - start
    print(elapsed)