Required packages
BeautifulSoup4
requests
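If they are not already installed, both can typically be installed with pip:
pip install beautifulsoup4 requests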
Known bugs
- Downloads sometimes fail, possibly due to network issues.
- The page-analysis logic does not yet work for every page. In testing, scraping images from the homepage works fine, but scraping by keyword is still somewhat buggy.
- The page-analysis step takes a very long time; the cause has not been tracked down yet (see the sketch after this list for one guess).
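One plausible contributor to the slow analysis is that every request opens a brand-new HTTP connection. Below is a minimal sketch, assuming connection setup is the bottleneck, that reuses a single requests.Session so connections are pooled; the header dict here is a stand-in for the fake_header defined in the full code.

import requests

# Assumption: one shared Session keeps TCP/TLS connections alive across
# requests instead of re-establishing them for every page and image.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})  # stand-in for fake_header

def get_website_by_get(destination_path):
    resp = session.get(destination_path, timeout=10)
    resp.encoding = "utf-8"
    return resp.text
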
Full code
import os

from bs4 import BeautifulSoup
import requests  # imported under its own name; aliasing it as "re" shadowed the stdlib re module

# Base URL of the site
website_url_base = "https://wallhaven.cc/"
# website_url_base = "https://wallhaven.cc/search?q=Genshin+Impact"
# Header: replace the User-Agent with your own
fake_header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.64"}
# Search keyword; empty by default, in which case the homepage is scraped
key_word = ""
max_page = ""
# False: skip the interactive prompts; True: prompt for keyword and page count
key_word_switch = True

# Convert the keyword into the site's search format
def get_search_format(key_word):
    if key_word != "":
        return key_word.replace(' ', '%20')
    # Return "" rather than None so the string concatenation in splice_url cannot fail
    return ""

# Build the search URL
def splice_url(key_word, page=None):
    url = website_url_base + "search?q=" + get_search_format(key_word)
    if page is not None:
        # str() handles both int and str page values
        url = url + "&page=" + str(page)
    print("Built URL:", url)
    return url

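# Example of the resulting URL, derived from the concatenation above:
# splice_url("Genshin Impact", 2)
#   -> "https://wallhaven.cc/search?q=Genshin%20Impact&page=2"
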
# GET helper
def get_website_by_get(destination_path):
    resp = requests.get(url=destination_path, headers=fake_header)
    resp.encoding = "utf-8"
    return resp.text

# POST helper, currently unused
def get_website_by_post(destination_path):
    resp = requests.post(url=destination_path, headers=fake_header)
    resp.encoding = "utf-8"
    return resp.text

# Parse a listing page and collect the wallpaper detail-page URLs
def analyze_website(destination_path):
    print("Collecting wallpaper detail URLs")
    resq_html = get_website_by_get(destination_path)
    # Pass "html.parser" explicitly, otherwise BeautifulSoup emits a warning
    soup = BeautifulSoup(resq_html, "html.parser")
    raw_pic_urls_list = []
    global key_word
    if key_word != "":
        # Search-result pages link to detail pages through <a class="preview"> tags
        for item in soup.select(".preview"):
            raw_pic_urls_list.append(item.attrs["href"])
    else:
        # The homepage uses .lg-thumb / .sm-thumb thumbnail blocks instead
        for item in soup.select(".lg-thumb"):
            raw_pic_urls_list.append(item.a.attrs["href"])
        for item in soup.select(".sm-thumb"):
            raw_pic_urls_list.append(item.a.attrs["href"])
    print("Finished collecting wallpaper detail URLs")
    return raw_pic_urls_list

# Visit each detail URL from the previous step and extract the full-size image URL
def get_pic_url(raw_pic_urls_list):
    print("Fetching image URLs")
    if len(raw_pic_urls_list) == 0:
        print("Warning: no raw_pic_url was parsed")
        # Return an empty list instead of None so callers can iterate safely
        return []
    pic_urls_list = []
    for raw_pic_url in raw_pic_urls_list:
        resq_html = get_website_by_get(raw_pic_url)
        soup = BeautifulSoup(resq_html, "html.parser")
        pic_url = soup.select("#wallpaper")[0].attrs["src"]
        print(pic_url)
        pic_urls_list.append(pic_url)
    print("Finished fetching image URLs")
    return pic_urls_list

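# Note: each detail page costs one extra HTTP request, so a listing of N
# thumbnails triggers N additional round trips; this is likely the main
# reason the analysis step feels slow.
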
# Downloader
def download_pic(pic_url):
    print("Downloading image")
    # Fetch the binary content of the image
    def get_pic_content():
        resp = requests.get(url=pic_url, headers=fake_header)
        return resp.content
    save_path = "images"
    # makedirs with exist_ok replaces the explicit exists()/mkdir() check
    os.makedirs(save_path, exist_ok=True)
    # Take the file name from the URL itself rather than a fixed-length slice,
    # which could cut into the name for longer or shorter URLs
    file_path = os.path.join(save_path, os.path.basename(pic_url))
    with open(file=file_path, mode="wb") as file:
        file.write(get_pic_content())
    print("Saved to the", save_path, "folder next to this script")
    print("Download complete")

# Main entry point
def main():
    print("Starting")
    search_urls = []
    if not key_word_switch:
        print("Keyword search disabled")
        search_urls.append(website_url_base)
    else:
        print("Keyword search enabled")
        global key_word, max_page
        key_word = input("Enter your keyword: ")
        max_page = int(input("Enter the maximum page to fetch: "))
        for page in range(1, max_page + 1):
            search_urls.append(splice_url(key_word, page))
    for search_url in search_urls:
        raw_pic_urls_list = analyze_website(search_url)
        pic_urls_list = get_pic_url(raw_pic_urls_list)
        for pic_url in pic_urls_list:
            download_pic(pic_url)

if __name__ == "__main__":
    main()
    # print(get_pic_url(["https://wallhaven.cc/w/1pd1o9"]))