I. Asynchronous crawler with a thread pool
1. Code
# Principle: the thread pool handles operations that are blocking and relatively time-consuming
# The imports and request headers below are assumed (they were not shown in the original notes)
import re

import requests
from lxml import etree
from multiprocessing.dummy import Pool  # thread pool

headers = {
    'User-Agent': 'Mozilla/5.0'  # placeholder; use a real browser User-Agent
}

# Request the url below and parse out the detail-page url and the name of each video
url = 'https://www.pearvideo.com/category_5'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
urls = []  # stores every video's link and name
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    # request the detail-page url
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    # parse the video address (url) out of the detail page
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_url = re.findall(ex, detail_page_text)[0]
    dic = {
        'name': name,
        'url': video_url
    }
    urls.append(dic)

# Request a video link to fetch the video's binary data, then persist it
def get_video_data(dic):
    url = dic['url']
    print(dic['name'], 'downloading......')
    data = requests.get(url=url, headers=headers).content
    # persist to disk
    with open(dic['name'], 'wb') as fp:
        fp.write(data)
    print(dic['name'], 'downloaded!')

# Use the thread pool to request the video data (the time-consuming blocking operation)
pool = Pool(4)
pool.map(get_video_data, urls)
pool.close()
pool.join()
2. Drawbacks
The degree of concurrency is capped by the pool size, which is usually tied to the number of CPU cores, and the pool workers themselves carry significant overhead.
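A minimal sketch (an illustrative example, not from the original notes) of how the pool caps concurrency: with a pool of 2 workers, 4 one-second blocking tasks take about 2 seconds rather than 1, because only as many tasks run at once as there are workers.

import time
from multiprocessing.dummy import Pool  # thread pool

def blocking_task(n):
    time.sleep(1)  # simulate a blocking download
    return n

start = time.time()
pool = Pool(2)  # only 2 workers, regardless of how many tasks are submitted
pool.map(blocking_task, range(4))
pool.close()
pool.join()
print("elapsed:", time.time() - start)  # roughly 2 seconds, not 1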
II. Asynchronous crawler with coroutines
1. Python's built-in coroutine module: asyncio
See the asyncio introduction at the end.
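As a quick reference, a minimal sketch (illustrative, not from the original notes) of the asyncio pattern used in the code below: coroutines are declared with async def, suspend at await points, and are driven concurrently by the event loop.

import asyncio

async def say_after(delay, msg):
    await asyncio.sleep(delay)  # non-blocking sleep; control returns to the event loop
    print(msg)

async def main():
    # both coroutines run concurrently, so this takes about 1 second, not 2
    await asyncio.gather(say_after(1, "hello"), say_after(1, "world"))

asyncio.run(main())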
2. Code
Asynchronously crawl three websites and persist the responses.
import asyncio
import aiohttp

urls = [
    "http://www.smilenow.top",
    "http://www.baidu.com",
    "http://www.163.com"
]

async def get_cont(url):
    print("about to download:", url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            name = url.split(".")[1] + ".html"
            text = await response.text()
            with open(name, "w", encoding="utf-8") as fp:
                fp.write(text)
    print(url, "downloaded")

async def main():
    # wrap each coroutine in a Task; passing bare coroutines to asyncio.wait is deprecated
    tasks = [asyncio.create_task(get_cont(url)) for url in urls]
    done, pending = await asyncio.wait(tasks)  # done: finished tasks, pending: unfinished tasks

asyncio.run(main())
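One caveat: the file write in get_cont uses the ordinary blocking open/write, which stalls the event loop while the file is written. A minimal sketch using the third-party aiofiles library (an assumption; it is not part of the original code) keeps the write non-blocking:

import asyncio
import aiohttp
import aiofiles  # third-party: pip install aiofiles (assumed dependency)

async def get_cont(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            name = url.split(".")[1] + ".html"
            text = await response.text()
            # asynchronous file write: does not block the event loop
            async with aiofiles.open(name, "w", encoding="utf-8") as fp:
                await fp.write(text)

asyncio.run(get_cont("http://www.baidu.com"))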