最近看了小甲鱼的课程，动手实战了一下，成功了还是很开心的。
废话不多说，直接上源码（本人比较懒，所以没有写注释）：
import urllib.request as ureq
import os
import time
def url_open(url):
    """Fetch ``url`` and return the raw response body.

    The request is sent with a desktop Firefox ``User-Agent`` header and a
    ``Referer`` header set to ``url`` itself.

    Args:
        url: Address to fetch; any URL that ``urllib.request`` can open.

    Returns:
        bytes: The undecoded response body.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
    """
    req = ureq.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    )
    req.add_header('Referer', url)
    # Close the response deterministically instead of leaving the open
    # connection to the garbage collector.
    with ureq.urlopen(req) as response:
        return response.read()
def get_pages(url):
    """Collect the raw ``data-original`` entries found on a page.

    The page at ``url`` is downloaded and decoded as UTF-8. For every
    ``data-original=`` occurrence, the text between the character following
    the attribute's opening quote and the next ``' width`` is collected.

    Args:
        url: Address of the page to scan.

    Returns:
        list[str]: The extracted fragments, in document order.
    """
    attr = 'data-original='
    # Skip the attribute name plus the single quote character that opens
    # its value.
    skip = len(attr) + 1
    text = url_open(url).decode('utf-8')
    entries = []
    start = text.find(attr)
    while start >= 0:
        end = text.find("' width", start)
        if end == -1:
            # No terminator after this attribute: resume just past it.
            resume = start + skip
        else:
            entries.append(text[start + skip:end])
            resume = end
        start = text.find(attr, resume)
    return entries
def find_imgs(url):
    """Return the address of the first ``.jpg`` image referenced on a page.

    The page at ``url`` is downloaded and decoded as UTF-8, then scanned for
    the first ``src=`` attribute that has ``.jpg`` within the following 255
    characters.

    Args:
        url: Address of the page to scan.

    Returns:
        str: The text from just after the attribute's opening quote up to and
        including the ``.jpg`` suffix.

    Raises:
        ValueError: If the page contains no such ``src=`` attribute. The
            previous implementation looped forever in this case because its
            search restarted near the beginning of the page.
    """
    html = url_open(url).decode('utf-8')
    start = html.find('src=')
    while start != -1:
        end = html.find('.jpg', start, start + 255)
        if end != -1:
            # +5 skips 'src=' and its opening quote; +4 keeps '.jpg'.
            return html[start + 5:end + 4]
        start = html.find('src=', start + 5)
    raise ValueError('no .jpg image address found at %s' % url)
def save_img(img_addrs):
    """Download one image and write it into the current working directory.

    Args:
        img_addrs: Address of the image. Its last ``/``-separated component
            is used as the local file name.
    """
    filename = img_addrs.split('/')[-1]
    # Download before opening the file so that a failed request does not
    # leave an empty (or truncated pre-existing) file behind.
    img = url_open(img_addrs)
    with open(filename, 'wb') as f:
        f.write(img)
def maxnum(url, maxPageInd):
    """Read the number that immediately precedes a marker on a page.

    The page at ``url`` is downloaded and decoded as UTF-8. The text between
    the last ``<span>`` before ``maxPageInd`` and ``maxPageInd`` itself is
    parsed as an integer.

    Args:
        url: Address of the page to scan.
        maxPageInd: Marker string that directly follows the wanted number.

    Returns:
        int: The parsed number.

    Raises:
        ValueError: If the marker or a preceding ``<span>`` is missing, or if
            the text between them is not an integer.
    """
    html = url_open(url).decode('utf-8')
    marker_pos = html.find(maxPageInd)
    if marker_pos == -1:
        # Without this check the -1 would be used as a slice bound and
        # arbitrary text would be handed to int().
        raise ValueError('page-count marker not found at %s' % url)
    # Search backwards from the marker so the result does not depend on how
    # many digits the number has.
    span_pos = html.rfind('<span>', 0, marker_pos)
    if span_pos == -1:
        raise ValueError('no <span> precedes the page-count marker at %s' % url)
    return int(html[span_pos + 6:marker_pos])
def meizi():
    """Interactively download every image pack listed on the start page.

    Steps:
      1. Ask for a target folder, create it if needed and change into it.
      2. Collect the pack entries from the start page with ``get_pages``.
      3. For each pack, determine its image count with ``maxnum``, create a
         sub-folder named after the last quote-separated part of the entry,
         and save one image per numbered sub-page via ``find_imgs`` and
         ``save_img``.
      4. Pause five seconds after each pack.

    The process working directory is changed as a side effect; after each
    pack it is restored to the target folder, even when a download fails.
    """
    folder = input("请输入保存图包的文件夹名称:")
    # exist_ok allows re-using an existing folder while no longer hiding
    # other creation failures (the old code swallowed every OSError).
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    root = os.getcwd()
    url = 'https://www.mzitu.com/'
    # NOTE(review): the answer to this prompt is never used by the code
    # below; the prompt is kept so the interactive flow is unchanged.
    input("请输入你想要下载的页数:")
    print('开始获取页面信息...\n')
    page_nums = get_pages(url)
    print('检测完毕,当前页面共有%d个图包,即将分别爬取...\n' % len(page_nums))
    for count, each in enumerate(page_nums, start=1):
        # Each entry is split on single quotes: the first part is an address
        # whose file name starts with the pack id, the last part is used as
        # the folder name.
        temp = each.split("'")
        packnum = temp[0].split('/')[-1]
        name = packnum.split('_')[0]
        newfolder = temp[-1]
        page_url = url + name
        page_url2 = page_url + '/2'
        # Marker handed to maxnum(): the number right before it is taken as
        # the pack's image count.
        maxPageInd = "</span></a><a href='" + page_url2 + "'><span>"
        every_maxnum = maxnum(page_url, maxPageInd)
        print('第%d个图包的总图片数为%d张...\n' % (count, every_maxnum))
        # Re-running the script must not crash on an already existing pack
        # folder.
        os.makedirs(newfolder, exist_ok=True)
        os.chdir(newfolder)
        try:
            for i in range(every_maxnum):
                page_url3 = page_url + '/' + str(i + 1)
                print("正在获取第%d张图片地址..." % int(i+1))
                img_addrs = find_imgs(page_url3)
                save_img(img_addrs)
        finally:
            # Always return to the target folder so a failure does not leave
            # the process inside a pack folder.
            os.chdir(root)
        print("等待5秒后,开始爬取下一个图包。")
        time.sleep(5)
# Start the interactive downloader only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    meizi()