80行代码爬尽各种风格美女图片

资料加载中

已于 2023-11-20 20:31:56 修改

阅读量723

点赞数

分类专栏： python 文章标签： python html

于 2023-04-02 08:48:43 首次发布

本文链接：https://blog.csdn.net/qq_44091004/article/details/129906628

版权

python 专栏收录该内容

34 篇文章

订阅专栏

主要通过 BeautifulSoup 来实现爬取：先定位各级目录，然后做路径拼接。

# -*- coding: utf-8 -*-

import requests
import time
from bs4 import BeautifulSoup
import os

# Root of the site being crawled; relative hrefs found in the pages are
# appended to this prefix further down in the script.
parent_url = r'https://www.umei.cc/'

# Fetch the home page. A timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops early with a clear HTTP
# error instead of parsing an error page and failing later on.
resp = requests.get(parent_url, timeout=10)
resp.raise_for_status()
resp.encoding = 'utf-8'

soup_ = BeautifulSoup(resp.text, 'html.parser')

# Top-level navigation anchors, and the <div class="sonnav"> containers that
# hold the sub-menu links.
column_list = soup_.find_all('a', class_="nav-li-item")
sub_column_list = soup_.find_all('div', class_="sonnav")


# Walk the top-level navigation links, reading each link target and label.
# NOTE(review): `url` and `name` are not consumed by the rest of the visible
# script (`name` is reassigned by the loops below), so this loop currently
# has no visible effect - confirm whether it can be dropped.
for column_name in column_list:
    url = column_name.get('href')
    name = column_name.string

# Build name_list: one inner list per sub-menu container, each holding
# single-entry dicts of the form {link label: href without a leading '/'}.
name_list = []
for sub_column_names in sub_column_list:
    sub_list = []
    for sub_column_name in sub_column_names.find_all('a'):
        name = sub_column_name.string
        href = sub_column_name.get('href')
        # Anchors without a plain-text label or without an href cannot be
        # turned into a directory name / URL later on, so skip them instead
        # of failing on None.
        if name is None or href is None:
            continue
        # Drop only a real leading slash (the result is appended to a base
        # URL that already ends in '/'); never chop a meaningful character.
        if href.startswith('/'):
            href = href[1:]
        sub_list.append({name: href})

    name_list.append(sub_list)
# Index into name_list of the sub-menu whose categories are crawled.
CATEGORY_INDEX = 4
# Seconds to wait on a single HTTP request before giving up, so a stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """Download *url* and return its body parsed as UTF-8 HTML."""
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    page.encoding = 'utf-8'
    return BeautifulSoup(page.text, 'html.parser')


def _page_count(gallery_soup):
    """Return the number of pages of a gallery, read from its pager.

    The count is taken from the numeric suffix of the '尾页' ("last page")
    link, e.g. ``12345_10.htm`` -> 10. Returns 1 when the pager or the link
    is missing, or when the link carries no numeric ``_N`` suffix.
    """
    pager = gallery_soup.find('div', class_='pages')
    if pager is None:
        return 1
    last_link = pager.find('a', string='尾页')
    if last_link is None or not last_link.get('href'):
        return 1
    # Look only at the file name so underscores or dots in the directory
    # part (or in a host name) cannot be mistaken for the page suffix.
    file_stem = last_link.get('href').rsplit('/', 1)[-1].rsplit('.', 1)[0]
    if '_' not in file_stem:
        return 1
    suffix = file_stem.split('_')[-1]
    return int(suffix) if suffix.isdigit() else 1


def _save_image(page_soup, target_dir):
    """Save the picture inside ``div.big-pic`` of one page into *target_dir*.

    Pages without such a picture are skipped silently; a failed image
    download is reported and skipped rather than written to disk.
    """
    img_div = page_soup.find('div', class_='big-pic')
    img = img_div.find('img') if img_div is not None else None
    img_url = img.get('src') if img is not None else None
    if not img_url:
        return
    # File name = last path component of the image URL without extension.
    img_name = img_url.split('.')[-2].split('/')[-1]
    final_img = requests.get(img_url, timeout=REQUEST_TIMEOUT)
    if not final_img.ok:
        # Do not store an HTTP error body under a .jpg name.
        print('图片%s下载失败' % img_name)
        return

    with open(target_dir + '/' + img_name + '.jpg', 'wb') as f:
        f.write(final_img.content)
    print('图片%s下载完成' % img_name)


msg_li = name_list[CATEGORY_INDEX]
for category in msg_li:
    # Each entry is a single-item dict: {category label: relative href}.
    name = next(iter(category))
    target_dir = './img/' + name
    # makedirs also creates the parent './img' directory on the first run.
    os.makedirs(target_dir, exist_ok=True)

    sub_url = parent_url + category[name]
    print(sub_url)

    sub_soup_ = _fetch_soup(sub_url)
    for ele in sub_soup_.find_all('div', class_='item masonry_brick'):
        a = ele.find('a')
        if a is None or not a.get('href'):
            continue
        # NOTE(review): assumes the href is site-relative with a leading
        # '/', as the original slicing did - confirm against the live site.
        sub2_url = parent_url + a.get('href')[1:]
        sub2_soup_ = _fetch_soup(sub2_url)

        # The gallery's landing page is itself page 1; save its picture too
        # instead of starting at page 2.
        # NOTE(review): assumes page 1 uses the same 'big-pic' layout as the
        # following pages; _save_image skips it harmlessly if it does not.
        _save_image(sub2_soup_, target_dir)

        # Following pages are named "<stem>_<n><ext>", e.g. 123_2.htm.
        url_root, url_ext = os.path.splitext(sub2_url)
        for j in range(2, _page_count(sub2_soup_) + 1):
            img_web_url = url_root + '_' + str(j) + url_ext
            _save_image(_fetch_soup(img_web_url), target_dir)