超清壁纸爬虫
概述
本文介绍如何使用python爬虫实现超清壁纸的下载。
思路:通过requests模块对网页发起url请求,使用xpath解析提取图片链接,将图片的二进制数据保存到当前目录下以搜索关键字命名的文件夹中。
提示:以下是实现源码,仅供参考。
一、超清壁纸搜索并下载
import os
import subprocess
import sys

# Bootstrap: try to import the third-party dependencies; on failure,
# install them with pip and import again.  `sys.executable -m pip` targets
# the interpreter actually running this script (os.popen with a bare "pip"
# may install into a different Python), and catching ImportError instead of
# a bare `except` avoids masking unrelated errors (e.g. KeyboardInterrupt).
try:
    from lxml import etree
    import requests
except ImportError:
    print("正在安装必需的数据库, 请稍等...")
    subprocess.run(
        [sys.executable, "-m", "pip", "install",
         "-i", "https://pypi.doubanio.com/simple/", "lxml", "requests"],
        check=False,  # best-effort: the re-import below fails loudly if this didn't work
    )
    from lxml import etree
    import requests
def meitu_search_engine(keyword='美女', page_num=1):
    """ 搜索美图, 并自动下载到当前目录 """
    # Search-result page for this keyword / result page number.
    url_1 = f"https://www.bizhizu.cn/search/{keyword}/{page_num}.html"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
    }

    def _fetch(url, tries=5, timeout=3):
        # Bounded retry: the original used three copies of an unbounded
        # `while True` + bare `except`, which could spin forever on a dead
        # link.  Returns the response, or None when every attempt fails.
        for _ in range(tries):
            try:
                resp = requests.get(url=url, headers=headers, timeout=timeout)
                resp.encoding = 'utf-8'
                return resp
            except requests.RequestException:
                continue
        return None

    response = requests.get(url=url_1, headers=headers, timeout=10)  # fetch the search page
    response.encoding = 'utf-8'
    if response.status_code != 200:
        return  # nothing to do when the search page itself is unreachable
    print("\t网页访问成功!")
    tree_1 = etree.HTML(response.text)  # parse once, query with xpath below
    # Picture titles and the matching detail-page links.
    list_a = tree_1.xpath('//div[@class="imgcont"]/ul/li/a/text()')
    list_a_href = tree_1.xpath('//div[@class="imgcont"]/ul/li/a[2]/@href')

    list_meitu_source_code = []
    for link in list_a_href:
        # Step 1: detail page -> link to the full-size photo page.
        response_2 = _fetch(link)
        if response_2 is None:
            continue
        tree_2 = etree.HTML(response_2.text)
        list_link_2 = tree_2.xpath('//p[@class="text_con" and @id="photoDesc"]/a[1]/@href')
        print("\t爬取图片地址链接:", list_link_2)
        if not list_link_2:  # guard: original indexed [0] unconditionally
            continue
        # Step 2: photo page -> direct image URL.
        response_3 = _fetch(f"https://www.bizhizu.cn{list_link_2[0]}")
        if response_3 is None:
            continue
        tree_3 = etree.HTML(response_3.text)
        list_link_3 = tree_3.xpath('//div[@class="show-pages-imgshow"]/img/@src')
        if not list_link_3:
            continue
        # Step 3: download the image bytes.
        response_4 = _fetch(list_link_3[0])
        if response_4 is None:
            continue
        list_meitu_source_code.append(response_4.content)

    list_name = list_a
    print(f"\t爬取的照片数量 = {len(list_name)}")
    print(f"\t爬取的照片链接数量 = {len(list_meitu_source_code)}")
    # zip() stops at the shorter list, so a partially failed crawl still
    # writes what was downloaded.  Path separators are stripped from the
    # title so a name like "a/b" cannot escape the keyword folder.
    for name, image_bytes in zip(list_name, list_meitu_source_code):
        safe_name = name.replace('/', '_').replace('\\', '_')
        with open(f"./{keyword}/{safe_name}.jpg", mode='wb') as obj:
            obj.write(image_bytes)
    return
def check_floder(file_name):
    """Create the directory *file_name* if it does not already exist.

    Replaces the original listdir-then-mkdir probe (race-prone, bare
    `except`, and unable to create missing parents) with the idiomatic
    `os.makedirs(..., exist_ok=True)`.
    """
    os.makedirs(file_name, exist_ok=True)
    return
if __name__ == '__main__':
    while True:
        # Prompt until a non-empty keyword is entered.  The original forced
        # a ZeroDivisionError (`8/0`) inside a bare try/except to loop —
        # a plain condition check says the same thing directly.
        while True:
            keyword = input("请输入搜索关键字:").strip()
            if keyword:
                break
            print("\t您输入的不合法!")
        # Prompt until a positive integer page count is entered; only the
        # int() conversion can raise, so only ValueError is caught.
        while True:
            try:
                page_num = int(input("你打算取多少页图片啊?(>0):"))
            except ValueError:
                print("\t您输入的不合法!")
                continue
            if page_num > 0:
                break
            print("\t您输入的不合法!")
        check_floder(file_name=keyword)
        print('\t-------------------------------')
        for i in range(1, page_num + 1):
            print(f"\t当前爬取的是第{i}页")
            try:
                meitu_search_engine(keyword=keyword, page_num=i)
            except Exception:
                # Best-effort per page: one failed page should not abort
                # the remaining pages.
                print("\t没有搜到当前页的数据~")
        print('\t-------------------------------')
        select = input("\n\t您打算继续搜索更多的吗?(y/n):")
        # Case-insensitive check also accepts "no"/"nO", which the original
        # literal list ['n', 'N', 'NO', 'No'] missed.
        if select.strip().lower() in ('n', 'no'):
            break
        print()