Python 协程爬虫
准备工作:
安装:
pip install aiohttp
(asyncio 是 Python 3.4+ 标准库的一部分,无需 pip 安装)
引入:
import re
import time
import asyncio
import aiohttp
需要引入上述模块,其中 aiohttp 是第三方异步 HTTP 库需要额外安装,asyncio 为标准库自带
第一步:
# Link generator: build the 50 forum page URLs, then crawl them.
arr = []
if __name__ == '__main__':
    # range's stop bound is exclusive, so range(1, 51) yields pages 1..50.
    # (The original range(1, 50) only produced 49 links despite the
    # comment claiming 50.)
    for i in range(1, 51):
        url = f'https://bbs.tiepi.top/forum-9-{i}.html'  # a forum site
        # append() adds the single URL directly; extend({url}) built a
        # throwaway one-element set for no benefit.
        arr.append(url)
    asyncio.run(main(arr))
    print(arr)
第二步:
async def main(urls):
    """Fetch every URL concurrently and print each page's <title> matches."""
    # Browser-like User-Agent so the forum serves normal pages.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    }
    # One coroutine per URL — comparable to spawning many threads, but all
    # scheduled cooperatively on a single event loop.
    jobs = [fetch_html(link, headers) for link in urls]
    # gather() runs them concurrently and yields results in input order.
    for html in await asyncio.gather(*jobs):
        print(re.findall(r'<title>(.*?)</title>', html))  # print the page titles
第三步:
async def fetch_html(url, headers):
    """Asynchronously GET *url* and return the response body as text."""
    # Both the session and the response are async context managers,
    # so this single request never leaks a connection.
    async with aiohttp.ClientSession() as http:
        async with http.get(url=url, headers=headers) as resp:
            return await resp.text()
每一个需要异步执行的函数定义前要加上 async 关键字
等待异步操作的结果时要用 await,例如 await response.text()
展示一下效果:
正常网络请求:
单线程正常网络请求50页拿到数据花了61秒!
协程的网络请求:
单线程协程网络请求50页拿到数据花了4秒!
同样是单线程,协程版本快了约15倍(61秒 → 4秒)!
协程完整代码:
import re
import time
import asyncio
import aiohttp
starts = time.time()  # start timing: wall-clock start for the whole script
async def fetch_html(url, headers, session=None):
    """Fetch the page at *url* and return its body as text.

    Args:
        url: Page URL to request.
        headers: HTTP headers dict (e.g. a browser-like User-Agent).
        session: Optional shared aiohttp.ClientSession. Reusing one
            session across many requests lets aiohttp pool connections;
            when None (the default, backward compatible) a temporary
            session is created just for this request.
    """
    if session is not None:
        async with session.get(url=url, headers=headers) as response:
            return await response.text()
    # Fallback: open (and automatically close) a one-off session.
    # NOTE: for 50 concurrent requests this pays session setup 50 times;
    # prefer passing a shared session.
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as response:
            return await response.text()
async def main(urls):
    """Fan out one fetch coroutine per URL, then print each page's titles."""
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    }
    # Run all fetches concurrently; gather() preserves input order.
    pages = await asyncio.gather(*(fetch_html(u, headers) for u in urls))
    for page in pages:
        titles = re.findall(r'<title>(.*?)</title>', page)
        print(titles)  # print the extracted page titles
# Entry point: build the 50 page URLs, crawl them, and report the elapsed time.
arr = []
if __name__ == '__main__':
    # range's stop bound is exclusive, so use 51 to get pages 1..50
    # (range(1, 50) only produced 49 pages).
    for i in range(1, 51):
        url = f'https://bbs.tiepi.top/forum-9-{i}.html'
        # append() the URL directly; extend({url}) built a pointless
        # one-element set first.
        arr.append(url)
    asyncio.run(main(arr))
    print(arr)
    end = time.time()  # stop timing
    print("执行:", end - starts, "秒")