主要通过BeautifulSoup来实现爬取。定位各级目录,然后做路径拼接。
# -*- coding: utf-8 -*-
import requests
import time
from bs4 import BeautifulSoup
import os
# Fetch the site's home page and pull out its navigation markup: the anchors
# with class "nav-li-item" (column_list) and the "sonnav" div containers
# (sub_column_list) whose links are read further down the script.
parent_url = r'https://www.umei.cc/'
# A timeout keeps the script from blocking forever on an unresponsive server.
resp = requests.get(parent_url, timeout=10)
# The rest of the script parses this page, so stop here on an HTTP error
# instead of parsing an error document.
resp.raise_for_status()
resp.encoding = 'utf-8'
soup_ = BeautifulSoup(resp.text, 'html.parser')
column_list = soup_.find_all('a', class_="nav-li-item")
sub_column_list = soup_.find_all('div', class_="sonnav")
# Walk the top-level navigation anchors, keeping the label and target of the
# one currently visited.
# NOTE(review): `name` and `url` are overwritten on every pass and nothing in
# the visible script reads them before they are reassigned — confirm whether
# this loop is still needed.
for column_name in column_list:
    name = column_name.string
    url = column_name.get('href')
# Build one inner list per "sonnav" container. Every anchor inside a container
# becomes a one-item dict mapping its link text to its href with the first
# character removed (presumably the leading "/", since the href is later
# appended to parent_url, which already ends with one).
name_list = [
    [
        {anchor.string: anchor.get('href')[1:]}
        for anchor in nav_block.find_all('a')
    ]
    for nav_block in sub_column_list
]
# Crawl one navigation category: for each of its sub-columns, open the
# sub-column listing page, visit every album found on it and save the album's
# images as ./img/<sub-column name>/<image name>.jpg.
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely when none is given


def _fetch(url):
    """Return the response for ``url``, or None if the request failed or was not 2xx."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('request failed for %s: %s' % (url, exc))
        return None
    if not response.ok:
        print('request failed for %s: HTTP %s' % (url, response.status_code))
        return None
    return response


def _fetch_soup(url):
    """Return ``url`` parsed as UTF-8 HTML, or None if it could not be fetched."""
    response = _fetch(url)
    if response is None:
        return None
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


# NOTE(review): index 4 picks the fifth "sonnav" container and therefore
# depends on the site's menu order — confirm it is still the wanted category.
msg_li = name_list[4]
for entry in msg_li:
    # Each entry is a one-item dict of {sub-column name: relative href}.
    name, rel_href = next(iter(entry.items()))
    if not name:
        # Without link text there is no usable directory name.
        continue
    target_dir = './img/' + name
    # makedirs (unlike mkdir) also creates ./img itself on the first run.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    sub_url = parent_url + rel_href
    print(sub_url)
    sub_soup_ = _fetch_soup(sub_url)
    if sub_soup_ is None:
        continue

    for ele in sub_soup_.find_all('div', class_='item masonry_brick'):
        a = ele.find('a')
        if a is None or not a.get('href'):
            continue
        # Drop the first character of the href (presumably a leading "/")
        # because parent_url already ends with one.
        sub2_url = parent_url + a.get('href')[1:]
        sub2_soup_ = _fetch_soup(sub2_url)
        if sub2_soup_ is None:
            continue

        # The "尾页" link inside the "pages" div points at the album's last
        # page; the number after the final "_" in its file name is used as the
        # page count.
        sub2_div = sub2_soup_.find('div', class_='pages')
        sub2_a = sub2_div.find('a', string='尾页') if sub2_div is not None else None
        final_href = sub2_a.get('href') if sub2_a is not None else None
        if not final_href:
            print('skip %s: no last-page link found' % sub2_url)
            continue
        # Look only at the file name so dots in a host or directory part of an
        # absolute href cannot break the split.
        pages = final_href.rsplit('/', 1)[-1].split('.')[0].split('_')[-1]
        try:
            page_count = int(pages)
        except ValueError:
            print('skip %s: cannot read page count from %s' % (sub2_url, final_href))
            continue

        # NOTE(review): numbering starts at 2, so the image shown on the
        # album's first page (sub2_url itself) is never downloaded — confirm
        # whether that is intended.
        for j in range(2, page_count + 1):
            # sub2_url[:-4] strips a trailing ".htm"; this assumes album URLs
            # end with exactly that suffix.
            img_web_url = sub2_url[:-4] + '_' + str(j) + '.htm'
            img_soup = _fetch_soup(img_web_url)
            if img_soup is None:
                continue
            img_div = img_soup.find('div', class_='big-pic')
            img = img_div.find('img') if img_div is not None else None
            img_url = img.get('src') if img is not None else None
            if not img_url:
                continue
            # Image name = last path component of the text between the final
            # two dots of the URL, i.e. the file name without its extension.
            img_name = img_url.split('.')[-2].split('/')[-1]
            final_img = _fetch(img_url)
            if final_img is None:
                # Never write an error response to disk as a .jpg.
                continue
            with open(target_dir + '/' + img_name + '.jpg', 'wb') as f:
                f.write(final_img.content)
            print('图片%s下载完成' % img_name)