The images to download are girl photos: all the pictures on a single gallery page (http://www.meizitu.com/a/5593.html) of meizitu.com. The URL list below was scraped with Python.
urlList = [
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/01.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/02.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/03.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/04.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/05.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/06.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/07.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/08.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/09.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/10.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/11.jpg',
]
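The actual scraper isn't shown, but here is a hedged sketch of how such a list could be collected; the regex and the assumption that every gallery image is a .jpg hosted on mm.chinasareview.com are mine, not the original scraper's:

import re
import requests

page_url = 'http://www.meizitu.com/a/5593.html'
html = requests.get(page_url).text
# assumption: the gallery images are the only mm.chinasareview.com .jpg
# links on the page; the real page structure may differ
urlList = re.findall(r'http://mm\.chinasareview\.com/[^"\']+?\.jpg', html)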
To download these images, any of the following approaches works.
- Multithreading
import requests
from concurrent import futures
import time

headers = {
    "Referer": "http://www.meizitu.com/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv: 11.0) like Gecko'
}
# the same list as above, built from the shared URL prefix
urlList = ['http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/{:02d}.jpg'.format(i)
           for i in range(1, 12)]

def download_one(url):
    # fetch one image and save it under its original file name
    resp = requests.get(url, headers=headers)
    file_name = url.split("/")[-1]
    with open(file_name, 'wb') as file:
        file.write(resp.content)

start = time.time()
# one thread per URL; the threads spend almost all their time waiting on I/O
with futures.ThreadPoolExecutor(len(urlList)) as executor:
    executor.map(download_one, urlList)
print(time.time() - start)
# output: 20.165
It takes about twenty seconds.
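One caveat with executor.map: an exception raised inside download_one is only re-raised when the result iterator is consumed, and the code above never consumes it, so a failed download passes silently. A minimal sketch that surfaces errors per task, reusing the download_one and urlList defined above:

with futures.ThreadPoolExecutor(len(urlList)) as executor:
    # submit returns one Future per URL instead of a lazy result iterator
    task_map = {executor.submit(download_one, url): url for url in urlList}
    for future in futures.as_completed(task_map):
        url = task_map[future]
        try:
            future.result()  # re-raises any exception from download_one
        except Exception as exc:
            print("download of {} failed: {}".format(url, exc))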
- Multiprocessing
import requests
from concurrent import futures
import time

headers = {
    "Referer": "http://www.meizitu.com/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv: 11.0) like Gecko'
}
urlList = ['http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/{:02d}.jpg'.format(i)
           for i in range(1, 12)]

def download_one(url):
    # fetch one image and save it under its original file name
    resp = requests.get(url, headers=headers)
    file_name = url.split("/")[-1]
    with open(file_name, 'wb') as file:
        file.write(resp.content)

start = time.time()
# four worker processes, one per CPU core
with futures.ProcessPoolExecutor(4) as executor:
    executor.map(download_one, urlList)
print(time.time() - start)
# output: 36.158297061920166
Since the machine has only four cores, the pool was set to four processes. This result comes from running on Linux; on Windows the script as written fails, because ProcessPoolExecutor spawns child processes that re-import the module, so the pool has to be created under an if __name__ == '__main__': guard (see the sketch below).
The timing fluctuates between twenty-five and forty seconds; this is a dual-boot machine and the hardware is old, so it runs a bit sluggishly.
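A minimal sketch of the Windows-safe layout, reusing the headers, urlList, and download_one defined above:

def main():
    start = time.time()
    with futures.ProcessPoolExecutor(4) as executor:
        executor.map(download_one, urlList)
    print(time.time() - start)

if __name__ == '__main__':
    # only the parent process runs this block; the children the pool
    # spawns re-import the module without trying to create their own pool
    main()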
- Coroutines
import requests
import asyncio
import aiohttp
import random
import time

ua_list = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv: 11.0) like Gecko',
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"
]
headers = {
    "Referer": "http://www.meizitu.com/",
    'User-Agent': random.choice(ua_list)
}
urlList = ['http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/{:02d}.jpg'.format(i)
           for i in range(1, 12)]

def get_proxy():
    # fetch a fresh proxy address from a local proxy-pool service
    resp = requests.get('http://127.0.0.1:5010/get/')
    return resp.text
async def download_one(urlList):
    async with aiohttp.ClientSession(headers=headers) as session:
        for url in urlList:
            for i in range(15):  # retry each image up to 15 times
                try:
                    print("request #{} for {}".format(i + 1, url))
                    # note: get_proxy() blocks the event loop while it runs;
                    # see the run_in_executor sketch below
                    proxy = 'http://' + get_proxy()
                    resp = await session.get(url, proxy=proxy, timeout=5)
                    if resp.status != 200:
                        print("bad status code: {}".format(resp.status))
                        continue
                    content = await resp.read()
                    file_name = url.split("/")[-1]
                    with open(file_name, 'wb') as file:
                        file.write(content)
                    print("{} downloaded successfully".format(url))
                    break
                except asyncio.TimeoutError:
                    print("request #{} for {} timed out".format(i + 1, url))
                except Exception as e:
                    print("request #{} for {} raised an unexpected exception: {}".format(i + 1, url, e))
start = time.time()
loop = asyncio.get_event_loop()
# this drives the whole list through a single coroutine, so the images
# are still fetched one after another; a concurrent variant follows below
loop.run_until_complete(download_one(urlList))
loop.close()
print(time.time() - start)
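One wrinkle in download_one: get_proxy() uses requests, a blocking call that stalls the whole event loop while it waits for the proxy pool. A minimal sketch of a non-blocking variant (assuming the same local pool at http://127.0.0.1:5010/get/) hands the blocking call to the loop's default thread pool:

async def get_proxy_async():
    # run the blocking requests call in a worker thread so the
    # event loop stays free to service other coroutines
    loop = asyncio.get_event_loop()
    resp = await loop.run_in_executor(None, requests.get,
                                      'http://127.0.0.1:5010/get/')
    return resp.text

Inside download_one, the proxy line then becomes proxy = 'http://' + await get_proxy_async().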
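Note that download_one walks the URL list sequentially inside a single coroutine, so it gains little over plain synchronous code. A minimal sketch of the truly concurrent variant, one task per image fanned out with asyncio.wait (same headers and urlList as above; proxy handling is omitted for brevity, and it runs in place of the sequential runner):

async def download_one_url(url):
    # each image gets its own coroutine; all of them share the event loop
    async with aiohttp.ClientSession(headers=headers) as session:
        resp = await session.get(url, timeout=5)
        content = await resp.read()
        with open(url.split("/")[-1], 'wb') as file:
            file.write(content)

loop = asyncio.get_event_loop()
to_do = [download_one_url(url) for url in urlList]
# asyncio.wait returns a tuple of two sets: the finished futures
# and the still-pending ones
done, pending = loop.run_until_complete(asyncio.wait(to_do))
loop.close()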
A second, self-contained coroutine example downloads small country-flag images from flupy.org, one task per flag (run it as a separate script):

import sys
import asyncio
import aiohttp

cc_list = 'cn in us id br pk ng bd ru jp mx ph vn et eg de ir tr cd fr'.split(" ")
base_url = 'http://flupy.org/data/flags/{}/{}'

async def get_flag(url):
    # fetch one flag image; a new session per request is wasteful but simple
    async with aiohttp.ClientSession() as session:
        resp = await session.get(url)
        return await resp.read()

def show(text):
    # print progress on a single line as downloads finish
    print(text, end=' ')
    sys.stdout.flush()

def save_flag(img, filename):
    with open(filename, 'wb') as file:
        file.write(img)

async def download_one(cc):
    # build the URL from the country code, then fetch and save the image
    url = base_url.format(cc, cc + '.gif')
    img = await get_flag(url)
    save_flag(img, cc + '.gif')
    show(cc)
    return cc

loop = asyncio.get_event_loop()
to_do = [download_one(cc) for cc in cc_list]
res, _ = loop.run_until_complete(asyncio.wait(to_do))
loop.close()
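Opening a new ClientSession per request, as get_flag does, throws away connection reuse. A minimal sketch that shares one session across all downloads and caps concurrency with a semaphore (the limit of 5 is an arbitrary assumption); run it in place of the four runner lines above:

async def download_all(cc_list, limit=5):
    # one shared session keeps connections alive; the semaphore caps
    # how many downloads are in flight at once
    semaphore = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        async def fetch(cc):
            url = base_url.format(cc, cc + '.gif')
            async with semaphore:
                resp = await session.get(url)
                img = await resp.read()
            save_flag(img, cc + '.gif')
            show(cc)
            return cc
        return await asyncio.gather(*(fetch(cc) for cc in cc_list))

loop = asyncio.get_event_loop()
res = loop.run_until_complete(download_all(cc_list))
loop.close()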