背景:通过 Python 脚本以多线程方式从壁纸网站 https://wallhaven.cc 批量下载图片;如需下载其它站点的图片,请根据目标站点的 HTML 结构调整脚本中的 CSS 选择器。
系统环境:Windows 10
Python 版本:3.8
完整脚本如下:
# -*- coding: UTF-8 -*-
import os
import threading
import time
from queue import Queue

import requests
from requests_html import HTMLSession
# Wall-clock start time; the entry point prints the seconds elapsed since it.
now = time.time()
# Wallpaper detail-page links accumulated by get_picturs_url().
get_links_list = []
# Single HTMLSession reused for every HTML page request in this script.
session = HTMLSession()
# url_1 is the listing page that is fetched to read the page count; urls2 is
# the same search URL with the page number left off so it can be appended.
url_1 = 'https://wallhaven.cc/search?q=id%3A24972&sorting=random&ref=fp&seed=WbEycL&page=2'
urls2 = 'https://wallhaven.cc/search?q=id%3A24972&sorting=random&ref=fp&seed=WbEycL&page='
# Maximum number of listing pages to crawl (30 = crawl up to page 30); if the
# search has fewer pages than this, every page is crawled.
page_number = 30
# Save one image into the Wallpapers folder.
def save_image(url, title):
    """Download the image at ``url`` and store it as ``Wallpapers/<title>.png``.

    The ``Wallpapers`` directory is resolved relative to the current working
    directory and is created when it does not exist yet.

    Args:
        url: Direct URL of the image file.
        title: File name (without extension) to save the image under.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the directory or the file cannot be written.
    """
    # os.path.join replaces the original '.\Wallpapers\\' literal, which
    # contained the invalid escape sequence "\W" and only worked on Windows.
    target_dir = os.path.join('.', 'Wallpapers')
    # exist_ok lets several download threads call this concurrently.
    os.makedirs(target_dir, exist_ok=True)

    # Without a timeout a stalled connection would block its thread forever.
    img_response = requests.get(url, timeout=30)
    # Do not write an HTML error page to disk as if it were an image.
    img_response.raise_for_status()

    # NOTE(review): the file is always named *.png, whatever the real image
    # format is; kept unchanged so existing file names stay the same.
    with open(os.path.join(target_dir, title + '.png'), 'wb') as file:
        file.write(img_response.content)
# Read the page count from a listing page.
def get_page(url_1):
    """Return the last space-separated token of the first 'Page' heading.

    Fetches ``url_1`` through the shared session and walks the elements
    matching ``div > section > header > h2`` in the order ``find`` returns
    them. The first heading whose text contains ``'Page'`` decides the result.

    Args:
        url_1: URL of the listing page to inspect.

    Returns:
        The text after the last space of the matching heading, as a string,
        or None when no heading contains ``'Page'``.
    """
    response = session.get(url_1)
    headings = response.html.find('div > section > header > h2')
    for heading in headings:
        text = heading.text
        if 'Page' not in text:
            continue
        # Only the trailing token is wanted, so split once from the right.
        return str(text).rsplit(' ', 1)[-1]
    return None
# Collect the wallpaper detail-page links.
def get_picturs_url(page, urls2):
    """Collect wallpaper detail-page links from the search listing pages.

    Crawls listing pages ``1 .. min(page, page_number)`` and appends every
    link found under ``div > section > ul > li > figure > a`` to the
    module-level ``get_links_list``.

    Args:
        page: Total number of listing pages (int, or a string holding one).
        urls2: Listing URL ending in ``page=`` so the page number can be
            appended to it.

    Returns:
        The module-level ``get_links_list`` after the new links were added.

    Raises:
        TypeError, ValueError: If ``page`` cannot be converted to an int.
    """
    page = int(page)
    if page <= page_number:
        print(f'当前页码数是:{page} ')

    # Both original branches ran the same loop; only the upper bound differed.
    last_page = min(page, page_number)

    # Page numbers in the URL are taken to be 1-based (url_1 itself uses
    # page=2). The original range(n) requested page 0 and never reached the
    # final page.
    for current in range(1, last_page + 1):
        r = session.get(f'{urls2}{current}')
        for anchor in r.html.find('div > section > ul > li > figure > a'):
            # absolute_links is a set of URLs: add its members directly
            # instead of stringifying the set and stripping braces/quotes,
            # which produced 'set()' for an empty set and 'a, b' for several.
            get_links_list.extend(anchor.absolute_links)
    return get_links_list
# Download the images of one wallpaper detail page.
def get_picture(links, output_q):
    """Save every image found on a wallpaper detail page.

    Fetches ``links`` through the shared session, and for each element
    matching ``body > main > section > div > img`` prints its ``src``
    followed by its ``data-wallpaper-id`` and passes both to ``save_image``.

    Args:
        links: URL of the wallpaper detail page.
        output_q: Not used by this function.

    Raises:
        KeyError: If an image element lacks ``src`` or ``data-wallpaper-id``.
    """
    detail_page = session.get(links).html
    for image in detail_page.find('body > main > section > div > img'):
        attributes = image.attrs
        url, title = attributes['src'], attributes['data-wallpaper-id']
        print(url + title)
        save_image(url, title)
# Entry point: collect the detail-page links, then download each wallpaper in
# its own thread.
if __name__ == '__main__':
    page = get_page(url_1)
    if page is None:
        # get_page() found no 'Page' heading; fail with a clear message
        # instead of the TypeError that int(None) would raise further down.
        raise SystemExit(f'Could not read the page count from {url_1}')
    get_picturs_url(page, urls2)

    workers = []
    for links in get_links_list:
        # Start one thread every two seconds (presumably to avoid hammering
        # the site with simultaneous requests).
        time.sleep(2)
        t = threading.Thread(target=get_picture, args=(links, Queue()))
        t.start()
        workers.append(t)

    # Wait for every download to finish; without the joins the elapsed time
    # below was printed while downloads were still running.
    for t in workers:
        t.join()
    print(time.time() - now)
模块安装:
pip install requests-html
pip install requests
参考文档:
https://docs.python-requests.org/projects/requests-html/en/latest/
https://docs.python-requests.org/en/latest/