# 话不多说,上代码(本代码仅供学习参考)
# 亲测有效!
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 3 15:44:17 2020
@author: lenovo
"""
import sys
import os
import time
import requests
from bs4 import BeautifulSoup
class Url(object):
    """Interactively build the list of wallpaper index-page URLs to crawl.

    Prompts the user (via ``input``) for a category and a page range and
    stores the resulting per-page URLs in ``self.url_list``.
    """

    def __init__(self):
        self.url = ''  # chosen category start URL
        # candidate categories: label -> listing URL
        self.select_dic = {'动漫': 'http://www.netbian.com/dongman/',
                           '风景': 'http://www.netbian.com/fengjing/',
                           '美女': 'http://www.netbian.com/meinv/',
                           '游戏': 'http://www.netbian.com/youxi/'}
        self.url_list = []  # index-page URLs to download from
        self.set_url()
        self.set_url_list()

    def set_url(self):
        """Prompt for a category; exits the program on invalid input."""
        self.url = ''
        if not self.select_dic:
            print('没有备选网址')
            sys.exit()
        print('选择下载网址')
        options = list(self.select_dic.items())
        for idx, (name, link) in enumerate(options):
            print(str(idx) + '.' + name + ': ' + link)
        try:
            select = int(input('你的选择:'))
            if select not in range(len(options)):
                raise ValueError
        except ValueError:
            print('输入非选项,退出~')
            sys.exit()
        self.url = options[select][1]
        print('起始网址为:' + self.url)

    def set_url_list(self):
        """Prompt for a page range and build ``self.url_list``.

        The prompt promises start >= 1; enforce it (page 0 would produce a
        broken 'index_0.htm' URL) and reject reversed ranges, which would
        otherwise silently download nothing.
        """
        self.url_list = []
        if not self.url:
            return
        try:
            start = int(input('起始页数(>=1):'))
            end = int(input('终止页数:'))
            if start < 1 or end < start:
                raise ValueError
        except ValueError:
            print('输入错误,退出~')
            sys.exit()
        for i in range(start, end + 1):
            # page 1 is index.htm; subsequent pages are index_<n>.htm
            if i == 1:
                self.url_list.append(self.url + 'index.htm')
            else:
                self.url_list.append(self.url + 'index_' + str(i) + '.htm')
def mkdir(dir_name):
# 在当前路径下创建目录
path = os.getcwd() + '\\' + dir_name
try:
os.mkdir(path)
print('目录创建成功' + path)
except FileExistsError:
print('目录已经存在,创建失败')
return path
def down_pic(url, header, path):
    """Download every wallpaper listed on one index page of netbian.com.

    url    -- the index page to scrape
    header -- HTTP headers (User-Agent) sent with every request
    path   -- directory the .jpg files are written into

    Increments the module-global ``count_of_pic`` once per saved image.
    Exits the program if the index page itself cannot be fetched; skips
    individual images that fail.
    """
    global count_of_pic
    # timeout added: without it a stalled connection hangs the script forever
    resp = requests.get(url, headers=header, timeout=30)
    resp.encoding = resp.apparent_encoding  # honour the detected charset (Chinese pages)
    if resp.status_code != 200:
        print('爬取错误')
        sys.exit()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # each thumbnail links to its detail page via <a target="_blank">
    a_list = soup.find('div', class_='list').find_all('a', target='_blank')
    for a in a_list:
        # skip absolute links — they point at http://pic.netbian.com
        if a.attrs['href'][0:4] == 'http':
            continue
        url_a = 'http://www.netbian.com' + a.attrs['href']
        res = requests.get(url_a, headers=header, timeout=30)
        res.encoding = res.apparent_encoding
        if res.status_code != 200:
            print('爬取错误')
            continue
        s = BeautifulSoup(res.text, 'html.parser')
        s = s.find('div', class_='pic').find('img')
        img = s['src']
        file_name = s['title']
        r = requests.get(img, headers=header, timeout=30)
        if r.status_code != 200:
            print('爬取错误')
            continue
        # images are binary, so write bytes; 'with' guarantees the handle
        # is closed even if the write fails (original leaked it on error)
        with open(os.path.join(path, file_name + '.jpg'), 'wb') as f:
            f.write(r.content)
        count_of_pic = count_of_pic + 1
        print(str(count_of_pic) + '.文件:' + file_name + ' 下载成功')
        time.sleep(0.2)  # throttle: be polite to the server
count_of_pic = 0  # total images saved across all pages (updated by down_pic)

if __name__ == '__main__':
    dir_name = '彼岸壁纸1'  # output folder name
    # NOTE: the original UA string was corrupted by a bad search-and-replace
    # ('Mozilla/5.0.html', 'Chrome/39.0.html.2171.71'); restored to a valid UA.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'}
    print('开始爬取图片')
    print('时间:' + time.strftime('%Y/%m/%d %H:%M:%S'))
    time_start = time.time()
    path = mkdir(dir_name)
    url = Url()  # interactive: pick category + page range
    for u in url.url_list:
        down_pic(u, header, path)
    time_end = time.time()
    print('下载完成')
    print('用时:{:.2f}'.format(time_end - time_start))