协程和多任务异步协程
一般情况下,当程序执行IO操作或其他阻塞调用时(例如time.sleep()、requests.get()、input()),线程处于阻塞状态。
在单线程条件下,协程指:当程序遇到IO操作时,可以选择切换执行别的任务。
微观上是任务间的切换(切换条件一般为IO操作);宏观上是多任务一起执行,即多任务异步操作。
#编写协程
import asyncio
import time
async def func1():
    """Demo coroutine: print a start marker, pause 2 seconds, print an end marker.

    ``async`` declares the function as asynchronous, so calling it returns a
    coroutine object instead of running the body.
    """
    pause_seconds = 2
    print("async func1")
    # A synchronous call such as time.sleep(2) here would block the thread and
    # break the asynchronous switching; awaiting asyncio.sleep suspends this
    # coroutine so another task can run in the meantime.
    await asyncio.sleep(pause_seconds)
    print("async func1..")
async def func2():
    """Demo coroutine: print a start marker, pause 4 seconds, print an end marker."""
    pause_seconds = 4
    print("async func2")
    # Awaiting asyncio.sleep (rather than calling time.sleep) suspends only
    # this coroutine and leaves the thread free for other tasks.
    await asyncio.sleep(pause_seconds)
    print("async func2..")
async def func3():
    """Demo coroutine: print a start marker, pause 3 seconds, print an end marker."""
    pause_seconds = 3
    print("async func3")
    # Awaiting asyncio.sleep (rather than calling time.sleep) suspends only
    # this coroutine and leaves the thread free for other tasks.
    await asyncio.sleep(pause_seconds)
    print("async func3..")
if __name__ == '__main__':
    # Less common style: create the coroutine objects by hand and submit them
    # to the event loop together.
    f1 = func1()  # calling an async function only returns a coroutine object
    f2 = func2()
    f3 = func3()
    print(f1)
    tasks = [f1, f2, f3]

    async def _wait_all(coros):
        """Wrap every coroutine in a Task and wait until all of them finish."""
        # asyncio.wait() no longer accepts bare coroutine objects (deprecated
        # in Python 3.8, rejected with TypeError since 3.11), so each one is
        # turned into a Task from inside the running loop first.
        await asyncio.wait([asyncio.create_task(coro) for coro in coros])

    t1 = time.time()
    # asyncio.run drives the event loop; all three coroutines start together.
    asyncio.run(_wait_all(tasks))
    t2 = time.time()
    # Elapsed wall-clock time for the whole batch.
    print(t2 - t1)
常用以下写法
import asyncio
import time
async def func1():
    """Demo coroutine: print a start marker, pause 2 seconds, print an end marker.

    ``async`` declares the function as asynchronous.
    """
    pause_seconds = 2
    print("async func1")
    # await suspends this coroutine so the loop can switch to another task.
    await asyncio.sleep(pause_seconds)
    print("async func1..")
async def func2():
    """Demo coroutine: print a start marker, pause 4 seconds, print an end marker."""
    pause_seconds = 4
    print("async func2")
    # await suspends this coroutine so the loop can switch to another task.
    await asyncio.sleep(pause_seconds)
    print("async func2..")
async def func3():
    """Demo coroutine: print a start marker, pause 3 seconds, print an end marker."""
    pause_seconds = 3
    print("async func3")
    # await suspends this coroutine so the loop can switch to another task.
    await asyncio.sleep(pause_seconds)
    print("async func3..")
async def main():
    """Schedule func1, func2 and func3 as Tasks and wait for all of them.

    This is the recommended way to start several coroutines together.
    """
    workers = (func1, func2, func3)
    # create_task schedules each coroutine on the running loop right away.
    tasks = [asyncio.create_task(worker()) for worker in workers]
    await asyncio.wait(tasks)
if __name__ == "__main__":
    # Measure the wall-clock time of the whole batch.
    started_at = time.time()
    # Start all coroutine tasks in one go through the main() entry coroutine.
    asyncio.run(main())
    finished_at = time.time()
    print(finished_at - started_at)
在爬虫领域的应用,即模板
# Crawler usage (template): the pages to fetch concurrently.
urls = [
    "https://www.baidu.com",
    "https://www.163.com",
    "https://www.bilibili.com",
]
async def download(url):
    """Template for one download job; the network request is simulated.

    Args:
        url: Address of the page to download (not used by this placeholder).
    """
    simulated_latency = 2
    print("准备下载")
    # Placeholder for the real network request: an awaitable HTTP call
    # belongs here in place of a blocking requests.get().
    await asyncio.sleep(simulated_latency)
    print("下载完成")
async def main():
    """Create one download Task per entry in ``urls`` and wait for all of them."""
    # Each coroutine is wrapped in a Task so it is scheduled immediately.
    tasks = [asyncio.create_task(download(url)) for url in urls]
    await asyncio.wait(tasks)
if __name__ == "__main__":
    # Script entry point: run the main() coroutine on a fresh event loop.
    entry = main()
    asyncio.run(entry)
异步http请求aiohttp模块
requests.get()是同步(阻塞)代码,在协程中应改用异步的aiohttp模块;安装:pip install aiohttp
由于软件源服务器在国外,网络连接较慢,如果长时间连接不上或下载不成功,pip就会报超时错误。
所以增加连接等待的时长:pip --default-timeout=100 install aiohttp
套用上述模板。
例子:异步请求下载图片
import asyncio
import aiohttp
import aiofiles
# Images to download concurrently.
urls = [
    "https://img2.woyaogexing.com/2023/02/24/422577f830a9e7f88d90e9734891c647.jpg",
    "https://img2.woyaogexing.com/2023/02/23/3ccbcf1b3d39a674ba91226973434e16.jpeg",
    "https://img2.woyaogexing.com/2023/02/22/79c87c0c0674085124011591b3e050a4.jpeg",
]
async def aioDownloadPic(url):
    """Download the image at *url* and save it under ``attachment/``.

    Steps: send the request, read the image bytes, write them to a file
    named after the last path segment of the URL.

    Args:
        url: Address of the image; must contain at least one ``/``.
    """
    import os  # local import: only needed to create the output directory

    # Split once from the right on "/" and keep the trailing part as file name.
    name = url.rsplit("/", 1)[1]

    # ClientSession plays the role of the ``requests`` module, and
    # session.get() is the awaitable counterpart of requests.get().
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # Reading the body is asynchronous too; the raw bytes correspond
            # to ``resp.content`` in requests (not to the decoded text).
            payload = await resp.content.read()

    # The body is read before the file is opened, so a failed download does
    # not leave an empty file behind.  The directory is created on first use
    # so the write does not fail when it is missing.
    os.makedirs('attachment', exist_ok=True)
    # aiofiles.open is the asynchronous counterpart of the built-in open().
    async with aiofiles.open('attachment/' + name, mode='wb') as f:
        await f.write(payload)
    print(name, " is OK!")
async def main():
    """Create one image-download Task per entry in ``urls`` and wait for all of them."""
    # Each coroutine is wrapped in a Task so it is scheduled immediately.
    tasks = [asyncio.create_task(aioDownloadPic(url)) for url in urls]
    await asyncio.wait(tasks)
if __name__ == "__main__":
    # Script entry point: run the main() coroutine on a fresh event loop.
    entry = main()
    asyncio.run(entry)
例子:爬小说
#爬百度小说
import json
import requests
import asyncio
import aiohttp
import aiofiles
#所有章节目录的url
# url1 = 'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"4306063500"}' #得到章节的cid和名称
#章节里的正文的url,根据上述cid得到下述cid:book_id|上述cid,所以根据url1拼接url2下载内容
# url2 = "http://dushu.baidu.com/api/pc/getChapterContent?data={'book_id':'4306063500','cid':'4306063500|1569782244','need_bookinfo':1}"
async def getCatalog(book_id):
    """Fetch a book's chapter catalog and download every chapter concurrently.

    Args:
        book_id: Book identifier as a string; it is concatenated into the
            catalog URL.
    """
    url = 'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + book_id + '"}'

    # The catalog is a single request made before this function creates any
    # task, so it is fetched synchronously.  The context manager closes the
    # response even if parsing fails, and releases the connection before the
    # chapter downloads start.
    with requests.get(url) as resp:
        items = resp.json()['data']['novel']['items']

    # One Task per chapter; each item supplies the chapter id and title.
    tasks = [
        asyncio.create_task(aioDownloadContent(book_id, item['cid'], item['title']))
        for item in items
    ]
    # asyncio.wait() raises ValueError on an empty collection, so an empty
    # catalog is skipped.
    if tasks:
        await asyncio.wait(tasks)
async def aioDownloadContent(book_id, cid, title):
    """Download one chapter and save its text as ``attachment/西游记/<title>.txt``.

    Args:
        book_id: Book identifier.
        cid: Chapter id as listed in the catalog; the API expects it in the
            form ``"<book_id>|<cid>"``.
        title: Chapter title, used as the output file name.
    """
    import os  # local import: only needed to create the output directory

    data = {
        "book_id": book_id,
        "cid": f"{book_id}|{cid}",
        "need_bookinfo": 1,
    }
    chapterUrl = f"http://dushu.baidu.com/api/pc/getChapterContent?data={json.dumps(data)}"

    async with aiohttp.ClientSession() as session:
        async with session.get(chapterUrl) as resp:
            dic = await resp.json()
    content = dic['data']['novel']['content']

    # The file is written after the HTTP session has been released.  The
    # directory is created on first use so the write does not fail when it
    # is missing.
    os.makedirs('attachment/西游记', exist_ok=True)
    async with aiofiles.open('attachment/西游记/' + title + '.txt', mode='wb') as f:
        # The file is opened in binary mode, so the text is encoded to bytes.
        await f.write(content.encode())
    print(title, " ok!")
if __name__ == "__main__":
    # Script entry point: crawl every chapter of the book with this id.
    book_id = "4306063500"
    catalog_job = getCatalog(book_id)
    asyncio.run(catalog_job)