Scraping ZOL Wallpapers

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Created  :  2021/6/2 15:02
# File     :  zolWallpaper.py
# IDE      :  PyCharm

import os
import requests
import urllib.error
import urllib.request
from bs4 import BeautifulSoup


def get_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/91.0.4472.77 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)
    try:
        response = urllib.request.urlopen(request)
        # ZOL serves its pages in a GBK-family encoding; gb18030 is a
        # superset of GBK/GB2312, so it covers every character the site emits.
        content = response.read().decode('gb18030')
        return content
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError, so it must be caught first.
        print("HTTPError", e)
    except urllib.error.URLError as e:
        print("URLError", e)
    except UnicodeDecodeError as e:
        # Decoding (not encoding) is what can fail when reading the response.
        print("decode error", e)


def mkdir(path):
    """Create a directory (and any missing parents) if it does not exist."""
    path = path.strip().rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)


class ZolWallpaper:
    def __init__(self):
        self.base_url = "https://desk.zol.com.cn"

    def get_category_path(self):
        """Map each category's display name to its URL path segment."""
        category = {}
        content = get_html(self.base_url)
        soup = BeautifulSoup(content, 'lxml')

        # The first brand-sel-box <dd> holds the category links; the first
        # <a> is skipped because it is not a concrete category.
        dds = soup.find('dd', class_='brand-sel-box clearfix')
        links = dds.find_all('a')[1:]
        for link in links:
            category[link.string] = link['href'].replace('/', '')

        return category

    def get_size_path(self):
        """Collect the available size (resolution) URL path segments."""
        size = []
        content = get_html(self.base_url)
        soup = BeautifulSoup(content, 'lxml')

        # The second brand-sel-box <dd> holds the size links; again the
        # first <a> is skipped.
        dds = soup.find_all('dd', class_='brand-sel-box clearfix')[1]
        links = dds.find_all('a')[1:]
        for link in links:
            size.append(link['href'].replace('/', ''))

        return size
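
# Quick usage sketch (assumes desk.zol.com.cn still serves the 2021-era
# markup these selectors target):
#   zol = ZolWallpaper()
#   categories = zol.get_category_path()  # {display name: URL path segment}
#   sizes = zol.get_size_path()           # e.g. ['4096x2160', ...]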


class PhotoList(ZolWallpaper):
    def __init__(self, path):
        super(PhotoList, self).__init__()
        # Available wallpaper sizes
        self.all_size = self.get_size_path()
        # Available wallpaper categories
        self.all_category = self.get_category_path()

        # Size currently being crawled
        self.now_size = ''
        # Root directory for saved files
        self.base_path = path
        # Save directory for the current category/size
        self.save_path = ''

        if not os.path.exists(self.base_path):
            os.mkdir(self.base_path)

    def get_album_url(self, url):
        """
        获取所有专辑地址
        :param url:
        :return:
        """
        photo_url = []
        content = get_html(url)
        soup = BeautifulSoup(content, 'lxml')

        links = soup.find_all('a', class_='pic')
        for link in links:
            photo_url.append(self.base_url + link['href'])

        return photo_url

    def get_next_page_url(self, url):
        """
        获取下一页的地址
        :param url:
        :return:
        """
        content = get_html(url)
        soup = BeautifulSoup(content, 'lxml')
        try:
            link = soup.find('a', id='pageNext')
            return self.base_url + link['href']
        except TypeError:
            # find() returned None: there is no next page, stop paginating.
            return ''

    def get_album_photo_url(self, url):
        """
        获取专辑中所有图片地址
        :param url: 专辑地址
        :return:
        """
        urls = []
        content = get_html(url)
        soup = BeautifulSoup(content, 'lxml')
        ul = soup.find('ul', id='showImg')
        links = ul.find_all('a')
        for link in links:
            urls.append(self.base_url + link['href'])

        return urls
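
    # From here the chain is: album page (above) -> per-photo "show" page for
    # the requested resolution (get_show_image_url) -> the <img> tag on that
    # page that holds the real file URL (get_image_url).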

    def get_show_image_url(self, url):
        """
        获取图片显示地址
        :param url:
        :return:
        """
        content = get_html(url)
        soup = BeautifulSoup(content, 'lxml')
        try:
            # Resolution links are anchored by id, e.g. id="4096x2160".
            link = soup.find('a', id=self.now_size)
            return self.base_url + link['href']
        except TypeError:
            # find() returned None: the photo has no link for this size.
            print("This photo is not available at {}".format(self.now_size))
        return ''

    def get_image_url(self, url):
        """
        从图片显示地址中获取图片真实地址
        :param url: 专辑地址
        :return:
        """
        content = get_html(url)
        soup = BeautifulSoup(content, 'lxml')
        img_url = soup.find('img')['src']
        return img_url

    def save_picture(self, url):
        """
        Download an image and save it under the current save path.
        :param url: direct image URL
        :return:
        """
        filename = os.path.basename(url)
        file_path = os.path.join(self.save_path, filename)
        if not os.path.exists(file_path):
            try:
                response = requests.get(url, timeout=5)
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                print("Saved {} to {}".format(filename, self.save_path))
            except requests.exceptions.RequestException as e:
                # Catch the common base class of all requests exceptions;
                # the bare exceptions module cannot be used in an except clause.
                print(e)
        else:
            print(file_path, "already exists")

    def spiderPhoto(self, category, size):
        # Remember the size currently being crawled
        self.now_size = size
        # Build the save path for this category/size
        self.save_path = os.path.join(self.base_path, category, size)
        print(self.save_path)
        # Create the folder if needed
        if not os.path.exists(self.save_path):
            mkdir(self.save_path)
        # Build the first list page URL: base/category-path/size/
        tem_list = [self.base_url, self.all_category[category], size]
        root_url = "/".join(tem_list) + '/'
        while root_url != '':
            print('Next page to crawl:', root_url)
            album_urls = self.get_album_url(root_url)
            for album_url in album_urls:
                print("Crawling album {}".format(album_url))
                # All photo page URLs in this album
                photo_urls = self.get_album_photo_url(album_url)
                for photo_url in photo_urls:
                    # Show page for the requested resolution
                    show_url = self.get_show_image_url(photo_url)
                    # Real image URL, then download
                    if show_url != '':
                        url = self.get_image_url(show_url)
                        self.save_picture(url)
                print("Finished album {}".format(album_url))
            print('Finished page {}'.format(root_url))
            root_url = self.get_next_page_url(root_url)


if __name__ == '__main__':
    photo = PhotoList("E:\\")
    print("Categories:")
    for key in photo.all_category.keys():
        print(key, end=" ")
    print("")
    print("Sizes:")
    for size in photo.all_size:
        print(size, end=" ")
    print("")
    size = input("Enter the size you want (default: 4096x2160): ").strip()
    category = input("Enter the category you want (default: 模特): ").strip()

    if size == "":
        size = "4096x2160"
    if category == "":
        category = "模特"

    if size not in photo.all_size:
        print("The size {} is not supported".format(size))
        exit()
    # Categories are keyed by display name, so check the keys, not the values.
    if category not in photo.all_category:
        print("The category {} is not supported".format(category))
        exit()

    print("You chose {} {}; crawling now".format(category, size))
    photo.spiderPhoto(category, size)
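
# Non-interactive use is also possible (a sketch; the category and size must
# match values scraped from the site, and the path is illustrative):
#   photo = PhotoList("E:\\wallpapers")
#   photo.spiderPhoto("模特", "4096x2160")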

The code was written on 2021/6/2 and worked correctly at that time.
