Required packages
BeautifulSoup4
requests
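If they are not already installed, both can typically be installed with pip:
pip install beautifulsoup4 requests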
Known bugs
- Downloads sometimes fail, possibly due to network issues.
- The page-analysis logic does not yet work for every page. In testing, scraping images from the homepage works fine, but scraping by keyword is still somewhat buggy.
- The page-analysis step takes a very long time; the cause has not been tracked down yet (see the sketch after this list for one guess).
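One plausible contributor to the slow analysis is that every request opens a brand-new HTTP connection. Below is a minimal sketch, assuming connection setup is the bottleneck, that reuses a single requests.Session so connections are pooled; the header dict here is a stand-in for the fake_header defined in the full code.

import requests

# Assumption: one shared Session keeps TCP/TLS connections alive across
# requests instead of re-establishing them for every page and image.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})  # stand-in for fake_header

def get_website_by_get(destination_path):
    resp = session.get(destination_path, timeout=10)
    resp.encoding = "utf-8"
    return resp.text
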
Full code
import os

from bs4 import BeautifulSoup
import requests  # imported under its own name; aliasing it as "re" shadowed the stdlib re module

# Base URL of the site
website_url_base = "https://wallhaven.cc/"
# website_url_base = "https://wallhaven.cc/search?q=Genshin+Impact"
# Header: replace the User-Agent with your own
fake_header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64"}
# Search keyword; empty by default, in which case the homepage is scraped
key_word = ""
max_page = ""
# False: skip the interactive prompts; True: prompt for keyword and page count
key_word_switch = True

# Convert the keyword into the site's search format
def get_search_format(key_word):
    if key_word != "":
        return key_word.replace(' ', '%20')
    # Return "" rather than None so the string concatenation in splice_url cannot fail
    return ""

# Build the search URL
def splice_url(key_word, page=None):
    url = website_url_base + "search?q=" + get_search_format(key_word)
    if page is not None:
        # str() handles both int and str page values
        url = url + "&page=" + str(page)
    print("Built URL:", url)
    return url

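# Example of the resulting URL, derived from the concatenation above:
# splice_url("Genshin Impact", 2)
#   -> "https://wallhaven.cc/search?q=Genshin%20Impact&page=2"
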
# GET helper
def get_website_by_get(destination_path):
    resp = requests.get(url=destination_path, headers=fake_header)
    resp.encoding = "utf-8"
    return resp.text

# POST helper, currently unused
def get_website_by_post(destination_path):
    resp = requests.post(url=destination_path, headers=fake_header)
    resp.encoding = "utf-8"
    return resp.text

# Parse a listing page and collect the wallpaper detail-page URLs
def analyze_website(destination_path):
    print("Collecting wallpaper detail URLs")
    resq_html = get_website_by_get(destination_path)
    # Pass "html.parser" explicitly, otherwise BeautifulSoup emits a warning
    soup = BeautifulSoup(resq_html, "html.parser")
    raw_pic_urls_list = []
    global key_word
    if key_word != "":
        # Search-result pages link to detail pages through <a class="preview"> tags
        for item in soup.select(".preview"):
            raw_pic_urls_list.append(item.attrs["href"])
    else:
        # The homepage uses .lg-thumb / .sm-thumb thumbnail blocks instead
        for item in soup.select(".lg-thumb"):
            raw_pic_urls_list.append(item.a.attrs["href"])
        for item in soup.select(".sm-thumb"):
            raw_pic_urls_list.append(item.a.attrs["href"])
    print("Finished collecting wallpaper detail URLs")
    return raw_pic_urls_list

# Visit each detail URL from the previous step and extract the full-size image URL
def get_pic_url(raw_pic_urls_list):
    print("Fetching image URLs")
    if len(raw_pic_urls_list) == 0:
        print("Warning: no raw_pic_url was parsed")
        # Return an empty list instead of None so callers can iterate safely
        return []
    pic_urls_list = []
    for raw_pic_url in raw_pic_urls_list:
        resq_html = get_website_by_get(raw_pic_url)
        soup = BeautifulSoup(resq_html, "html.parser")
        pic_url = soup.select("#wallpaper")[0].attrs["src"]
        print(pic_url)
        pic_urls_list.append(pic_url)
    print("Finished fetching image URLs")
    return pic_urls_list

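# Note: each detail page costs one extra HTTP request, so a listing of N
# thumbnails triggers N additional round trips; this is likely the main
# reason the analysis step feels slow.
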
# Downloader
def download_pic(pic_url):
    print("Downloading image")
    # Fetch the binary content of the image
    def get_pic_content():
        resp = requests.get(url=pic_url, headers=fake_header)
        return resp.content
    save_path = "images"
    # makedirs with exist_ok replaces the explicit exists()/mkdir() check
    os.makedirs(save_path, exist_ok=True)
    # Take the file name from the URL itself rather than a fixed-length slice,
    # which could cut into the name for longer or shorter URLs
    file_path = os.path.join(save_path, os.path.basename(pic_url))
    with open(file=file_path, mode="wb") as file:
        file.write(get_pic_content())
    print("Saved to the", save_path, "folder next to this script")
    print("Download complete")

# Main entry point
def main():
    print("Starting")
    search_urls = []
    if not key_word_switch:
        print("Keyword search disabled")
        search_urls.append(website_url_base)
    else:
        print("Keyword search enabled")
        global key_word, max_page
        key_word = input("Enter your keyword: ")
        max_page = int(input("Enter the maximum page to fetch: "))
        for page in range(1, max_page + 1):
            search_urls.append(splice_url(key_word, page))
    for search_url in search_urls:
        raw_pic_urls_list = analyze_website(search_url)
        pic_urls_list = get_pic_url(raw_pic_urls_list)
        for pic_url in pic_urls_list:
            download_pic(pic_url)

if __name__ == "__main__":
    main()
    # print(get_pic_url(["https://wallhaven.cc/w/1pd1o9"]))