Scraping images too slowly?
Without further ado, here is the code, tested and working (careful not to crash a flimsy site by scraping too hard).
Method 1: asynchronous requests with aiohttp. Drawback: adding a proxy IP is a bit less convenient, but you can cap the number of concurrent downloads yourself so you don't overwhelm the site or get your IP banned. Advantage: it's fast.
import time
import aiohttp
import asyncio
import os

# Limit the maximum number of concurrent downloads
semaphore = asyncio.Semaphore(3)

# Create the destination folder, handling same-name folders
def judge_path(path, d=1):
    if not os.path.exists(path):
        makedir(path)
    else:
        print("Folder {} already exists".format(path))
        if d == 1:
            path = path + '({})'.format(d)
        else:
            path = path.replace('({})'.format(d - 1), '({})'.format(d))
        d = d + 1
        path = judge_path(path, d)
    return path

def makedir(path):
    os.makedirs(path)
    print("Created folder {}".format(path))

async def img_download(session, img_url, path, semaphore):
    # Download one image while holding a semaphore slot
    async with semaphore:
        name = img_url.split('/')[-1]  # derive the file name from img_url
        response = await session.get(img_url)
        content = await response.read()  # read the response body as bytes
        with open(os.path.join(path, str(name)), 'wb') as f:
            # write the bytes to disk
            f.write(content)
        print(f'{name} downloaded successfully!')
        return str(img_url)

async def main(img_urls, path, semaphore):
    # Create one session shared by all downloads
    async with aiohttp.ClientSession() as session:
        # Create one task per URL and wait for all of them to finish
        tasks = [asyncio.create_task(img_download(session, img_url, path, semaphore)) for img_url in img_urls]
        await asyncio.gather(*tasks)

if __name__ == "__main__":
    img_urls = ["", ""]  # fill in the image URLs
    path = r"D:\your_folder"  # destination folder path; note it is a folder, not a file
    path = judge_path(path)  # create the folder (or a numbered variant if it already exists)
    start = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(img_urls, path, semaphore))
    end = time.time()
    print("Download finished in {} seconds".format(round(end - start)))
Method 2: ThreadPoolExecutor. Drawback: not quite as fast, although in practice it's fast enough. Advantage: setting a proxy IP and request headers is very convenient.
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from fake_useragent import UserAgent
import os

# Create the destination folder, handling same-name folders
def judge_path(path, d=1):
    if not os.path.exists(path):
        makedir(path)
    else:
        print("Folder {} already exists".format(path))
        if d == 1:
            path = path + '({})'.format(d)
        else:
            path = path.replace('({})'.format(d - 1), '({})'.format(d))
        d = d + 1
        path = judge_path(path, d)
    return path

def makedir(path):
    os.makedirs(path)
    print("Created folder {}".format(path))

def img_downloader(path, img_url):
    requests.packages.urllib3.disable_warnings()  # silence the InsecureRequestWarning triggered by verify=False
    proxy = {"http": "60.188.5.196:80"}
    headers = {
        'User-Agent': UserAgent().random
    }
    try:
        r = requests.get(img_url, headers=headers, proxies=proxy, verify=False)
        name = img_url.split('-')[-1]  # derive the file name from img_url
        with open(os.path.join(path, name), 'wb') as f:
            f.write(r.content)
        print("{} downloaded successfully!\n".format(name))
    except Exception as e:
        print("Download failed: {}".format(e))

if __name__ == "__main__":
    img_urls = []  # fill in the image URLs
    path = r"D:\your_folder"  # destination folder path; note it is a folder, not a file
    path = judge_path(path)  # create the folder (or a numbered variant if it already exists)
    with ThreadPoolExecutor() as pool:
        # submit() accepts the function and its arguments directly, so no lambda wrapper is needed
        futures = [pool.submit(img_downloader, path, img_url) for img_url in img_urls]
        for future in as_completed(futures):
            future.result()  # re-raise any exception from the worker thread
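The thread-pool version can also be throttled so it doesn't hammer the target site: capping max_workers plays the same role as the semaphore in method 1. A minimal sketch (max_workers=3 is just an illustrative value):
# Cap the pool at 3 worker threads, analogous to Semaphore(3) in method 1
with ThreadPoolExecutor(max_workers=3) as pool:
    futures = [pool.submit(img_downloader, path, img_url) for img_url in img_urls]
    for future in as_completed(futures):
        future.result()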