aiohttp异步请求提高爬虫效率
- aiohttp可以理解为一个异步的requests,使用aiohttp发送请求可大大提高爬虫效率。
- 基本语法可参考 https://www.jianshu.com/p/63d9741b0bee
async定义一个协程
# Fix: the original line read "improt async" -- a typo, and also the wrong
# module: async/await are language keywords, while the event-loop helpers
# live in the standard-library module `asyncio`.
import asyncio

# `async def` declares a coroutine function: calling test() does not run the
# body, it returns a coroutine object that must be awaited or scheduled on
# an event loop.
async def test():
    pass
requests发送请求
import requests

# Synchronous fetch: requests.get() blocks until the whole response has
# arrived, and .text decodes the body to str.
# NOTE(review): `url` is assumed to be defined by the reader of the tutorial.
response = requests.get(url)
html_str = response.text
print(html_str)
aiohttp发送请求
import asyncio
import aiohttp

# aiohttp counterpart of the requests snippet above.
# Fixes two defects in the original:
#   1. `async with` is only legal inside a coroutine, so the request is
#      wrapped in `fetch` and driven with asyncio.run().
#   2. The original printed the undefined name `html`; the variable it
#      assigned was `html_str`.
async def fetch(url):
    # One ClientSession per logical job; it manages the connection pool.
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url) as response:
            # read() returns the raw response body as bytes.
            html_str = await response.read()
            print(html_str)

# NOTE(review): `url` is assumed to be defined by the reader of the tutorial.
asyncio.run(fetch(url))
使用requests单线程爬取糗事百科13页数据
import time
import requests
import lxml.html
def make_url_list():
    """Yield the listing-page URLs for pages 1 through 13."""
    template = 'https://www.qiushibaike.com/8hr/page/{}/'
    for page in range(1, 14):
        yield template.format(page)
def getHTMLText(url):
    """Fetch `url` synchronously and return the decoded response body.

    A browser-like User-Agent header is sent -- presumably to avoid being
    served a blocked/degraded page (TODO: confirm the site requires it).
    A timeout is added because `requests.get` has no default timeout, so a
    stalled connection would otherwise hang the crawl forever.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    }
    html_str = requests.get(url=url, headers=headers, timeout=10).text
    return html_str
def printTitle(html_str):
    """Parse `html_str` and return the recommended-content link texts.

    NOTE(review): despite the name, this function returns the titles rather
    than printing them; the caller does the printing.
    """
    doc = lxml.html.fromstring(str(html_str))
    return doc.xpath('//a[@class="recmd-content"]/text()')
def run():
    """Crawl every listing page sequentially and print its titles."""
    for page_url in make_url_list():
        page_html = getHTMLText(page_url)
        print(printTitle(page_html))
# Time the full sequential crawl so it can be compared against the
# concurrent aiohttp version below.
start = time.time()
run()
elapsed = time.time() - start
print('耗时:', elapsed)
打印结果
使用aiohttp异步爬取糗事百科13页数据
import asyncio
import time
import aiohttp
import lxml.html
from queue import Queue
class QiuBai:
    """Crawl the qiushibaike listing pages concurrently with aiohttp."""

    def __init__(self):
        # Listing-page URL template; .format() fills in the page number.
        self.base_url = 'https://www.qiushibaike.com/8hr/page/{}/'
        # Browser-like UA -- presumably needed to avoid being blocked;
        # TODO(review): confirm against the site.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        }
        # NOTE(review): never read or written anywhere else in this class;
        # kept only so any external code touching it keeps working.
        self.html_queue = Queue()

    def make_url_list(self):
        """Yield the URLs of listing pages 1 through 13."""
        for i in range(1, 14):
            yield self.base_url.format(i)

    async def getHTMLText(self, url):
        """Fetch one page, extract its recommended titles, and print them.

        NOTE(review): opening a ClientSession per URL works but is wasteful;
        sharing one session across all tasks would reuse connections.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(url=url, headers=self.headers) as response:
                # Raw bytes; decoded leniently in case of stray bad bytes.
                html_str = await response.read()
                html = lxml.html.fromstring(html_str.decode('utf-8', errors='ignore'))
                title = html.xpath('//a[@class="recmd-content"]/text()')
                print(title)

    def run(self):
        """Fetch all pages concurrently and block until every one finishes.

        Fixes two defects in the original:
        - asyncio.get_event_loop() is deprecated for this use; asyncio.run()
          creates and closes a fresh loop.
        - asyncio.wait() no longer accepts bare coroutines (TypeError on
          Python 3.11+), so the coroutines are combined with asyncio.gather().
        """
        async def _main():
            tasks = [self.getHTMLText(url) for url in self.make_url_list()]
            await asyncio.gather(*tasks)
        asyncio.run(_main())
if __name__ == '__main__':
    # Time the full concurrent crawl for comparison with the sequential
    # requests version.
    crawler = QiuBai()
    started = time.time()
    crawler.run()
    finished = time.time()
    print('耗时:', finished - started)
打印结果
可以看出使用aiohttp异步爬取速度有明显的提升。