Ultra-HD Wallpaper Scraper


Overview

This article shows how to download ultra-HD wallpapers with a Python scraper.
Approach: request the target pages with the requests module, extract the image links with XPath (via lxml), and save the binary image data to disk in a folder named after the search keyword, created in the current directory.
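That fetch-parse-save flow is the skeleton of the whole script. Here is a minimal sketch of it, assuming a placeholder URL and XPath purely for illustration (the site's real selectors appear in the full source below):

import requests
from lxml import etree

resp = requests.get("https://example.com/wallpapers.html", timeout=10)  # step 1: fetch the page
resp.encoding = "utf-8"
tree = etree.HTML(resp.text)                  # step 2: parse the HTML into an etree object
img_urls = tree.xpath("//img/@src")           # extract image URLs with an XPath expression
if img_urls:
    img_bytes = requests.get(img_urls[0], timeout=10).content  # .content gives the raw bytes
    with open("wallpaper.jpg", mode="wb") as f:                # step 3: write binary data to disk
        f.write(img_bytes)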


Note: the full source code follows, for reference only.

1. Searching for and downloading ultra-HD wallpapers

import os

try:
    from lxml import etree
    import requests
except ImportError:
    print("Installing required libraries, please wait...")
    os.popen('pip install lxml -i https://pypi.doubanio.com/simple/').read()
    os.popen('pip install requests -i https://pypi.doubanio.com/simple/').read()


# import again now that the libraries are guaranteed to be available
import requests
from lxml import etree


def meitu_search_engine(keyword='美女', page_num=1):
    """ Search for images and automatically download them to the current directory """
    url_1 = f"https://www.bizhizu.cn/search/{keyword}/{page_num}.html"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
    }

    response = requests.get(url=url_1, headers=headers, timeout=10)  # request the search-results page
    response.encoding = 'utf-8'
    if response.status_code == 200:
        print("\tPage fetched successfully!")
    page_text_1 = response.text

    tree_1 = etree.HTML(page_text_1)    # parse the page into an etree object
    list_a = tree_1.xpath('//div[@class="imgcont"]/ul/li/a/text()')         # wallpaper titles
    list_a_href = tree_1.xpath('//div[@class="imgcont"]/ul/li/a[2]/@href')  # links to the detail pages

    list_meitu_source_code = []
    for link in list_a_href:
        # retry until the detail page responds (note: loops forever on a permanently dead link)
        while True:
            try:
                response_2 = requests.get(url=link, headers=headers, timeout=3)
            except requests.RequestException:
                continue
            else:
                break

        response_2.encoding = 'utf-8'
        page_text_2 = response_2.text
        tree_2 = etree.HTML(page_text_2)
        list_link_2 = tree_2.xpath('//p[@class="text_con" and @id="photoDesc"]/a[1]/@href')
        print("\tScraped image page link:", list_link_2)

        # retry until the full-size image page responds
        while True:
            try:
                page_text_3 = requests.get(url=f"https://www.bizhizu.cn{list_link_2[0]}", headers=headers,
                                           timeout=3).text
            except requests.RequestException:
                continue
            else:
                break
        tree_3 = etree.HTML(page_text_3)
        list_link_3 = tree_3.xpath('//div[@class="show-pages-imgshow"]/img/@src')  # direct image URL

        # retry until the image itself downloads; .content returns the raw binary data
        while True:
            try:
                page_text_4 = requests.get(url=list_link_3[0], headers=headers, timeout=3).content
            except requests.RequestException:
                continue
            else:
                break

        list_meitu_source_code.append(page_text_4)

    list_name = list_a
    print(f"\tNumber of photo titles scraped = {len(list_name)}")
    print(f"\tNumber of photos downloaded = {len(list_meitu_source_code)}")
    for name, data in zip(list_name, list_meitu_source_code):
        with open(f"./{keyword}/{name}.jpg", mode='wb') as obj:
            obj.write(data)

    return


def check_folder(file_name):
    """ Create the folder if it does not already exist """
    if not os.path.isdir(file_name):
        os.mkdir(file_name)


if __name__ == '__main__':
    while True:
        # prompt until a non-empty keyword is entered
        while True:
            keyword = input("Enter a search keyword: ").strip()
            if keyword:
                break
            print("\tInvalid input!")

        # prompt until a positive integer page count is entered
        while True:
            try:
                page_num = int(input("How many pages of images do you want? (>0): "))
            except ValueError:
                print("\tInvalid input!")
                continue
            if page_num > 0:
                break
            print("\tInvalid input!")
        check_folder(file_name=keyword)
        print('\t-------------------------------')
        for i in range(1, page_num + 1):
            print(f"\tScraping page {i}")
            try:
                meitu_search_engine(keyword=keyword, page_num=i)
            except Exception:
                print("\tNo results found for this page~")
            print('\t-------------------------------')

        select = input("\n\tKeep searching for more? (y/n): ")
        if select in ['n', 'N', 'NO', 'No']:
            break
        print()
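
One design note: the three while True retry loops in meitu_search_engine retry forever, so a single permanently dead link will hang the script. A bounded retry is a common alternative; the sketch below uses a hypothetical helper, fetch_with_retry, which is not part of the original script and simply gives up after a fixed number of attempts:

import requests

def fetch_with_retry(url, headers, timeout=3, max_retries=5):
    """Return the Response for url, retrying up to max_retries times."""
    for attempt in range(max_retries):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            continue  # timeout or connection error: try the next attempt
    raise RuntimeError(f"failed to fetch {url} after {max_retries} attempts")

A similar hardening step would be to strip characters such as / and ? from the scraped titles before using them as file names, since the open() call in meitu_search_engine will fail on titles containing path separators.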