# Without further ado, here is the code:
import os
import re
import urllib

import requests
from bs4 import BeautifulSoup

try:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve
    from urlparse import urljoin
def cbk(a, b, c):
    """Print download progress as a percentage.

    The signature matches the report hook that ``urlretrieve`` calls while
    a transfer is in progress.

    @a: number of data blocks downloaded so far
    @b: size of one data block
    @c: size of the remote file
    """
    if c <= 0:
        # Without a positive total size no percentage can be computed:
        # 0 would raise ZeroDivisionError and a negative size (reported when
        # the server does not announce the length) would print a negative
        # percentage, so nothing is reported.
        return
    # a * b can overshoot c on the final block, hence the cap at 100.
    per = min(100.0 * a * b / c, 100.0)
    print('%.2f%%' % per)
print(" ")

# Gallery index whose numbered sub-pages are crawled, and the browser-like
# headers sent with every page request.
url = 'http://www.ivsky.com/tupian/meishishijie/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3427.400 QQBrowser/9.6.12513.400',
    'Referer': 'http://www.ivsky.com/tupian/qita/index_11.html',
}
save_dir = "E:/img/"

# Captures the src value of every `src="....jpg" alt` occurrence in a page.
# Compiled once here instead of once per <img> tag.
imgre = re.compile(r'src="(.*?\.jpg)" alt')

# urlretrieve does not create missing directories.
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

# One counter for the whole crawl: restarting it for each page would make
# every page overwrite the 0.jpg, 1.jpg, ... written for the previous one.
x = 0
for i in range(0, 12):
    # url already ends with '/'; strip it so the path does not contain '//'.
    link = url.rstrip('/') + '/index_' + str(i) + '.html'
    # The timeout keeps a stalled connection from hanging the crawl forever.
    html = requests.get(link, headers=headers, timeout=30)
    if html.status_code != 200:
        # NOTE(review): some of the index_<i>.html pages may not exist on
        # the site -- confirm the real page numbering.
        continue

    mess = BeautifulSoup(html.text, 'html.parser')
    # Only pages that contain at least one image inside a <ul class="ali">
    # list are scanned for image URLs.
    has_thumbnails = any(
        page.find('img') is not None
        for page in mess.find_all('ul', class_='ali')
    )
    if not has_thumbnails:
        continue

    for imgurl in imgre.findall(html.text):
        work_path = save_dir + str(x) + ".jpg"
        try:
            # urljoin leaves absolute URLs untouched and resolves relative
            # or scheme-less ones against the page they were found on.
            urlretrieve(urljoin(link, imgurl), work_path, cbk)
        except IOError as err:
            # A single broken image should not abort the remaining downloads.
            print('skip %s: %s' % (imgurl, err))
            continue
        x += 1