爬取彼岸图网(pic.netbian.com)壁纸主页
这是一个按类型爬取壁纸的爬虫程序,可以爬取每种不同类型的壁纸并指定爬取页数(类型、页数均由用户输入):
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import trange
class Gain_pictures:
    """Scrape wallpapers of one category from pic.netbian.com using Selenium.

    On construction it asks the user for the number of pages, opens a Chrome
    browser, navigates to the category page, and records its base URL.
    Call ``write_picture()`` to download every image of every requested page.
    """

    def __init__(self, type):
        # NOTE: `type` shadows the builtin name; kept for backward compatibility.
        self.type = type
        # Create the destination folder, ignoring "already exists" errors.
        os.makedirs(f'../彼岸花网壁纸/{self.type}', exist_ok=True)
        self.page = int(input('请输入爬取页数:'))
        options = webdriver.ChromeOptions()
        user_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 '
            'Safari/537.36')
        options.add_argument('user-agent=%s' % user_agent)  # spoof a regular browser UA
        options.add_experimental_option('useAutomationExtension', False)  # disable the automation extension
        options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the "controlled by automated software" banner
        self.browser = webdriver.Chrome(options=options)
        self.browser.implicitly_wait(10)
        self.browser.get('https://pic.netbian.com/new/')  # open the site's "new" landing page
        # Click the category link whose title matches the requested wallpaper type.
        self.browser.find_element(By.CSS_SELECTOR, f'div[id="main"] a[title="{type}"]').click()
        self.url = self.browser.current_url  # base URL of the chosen category

    def gain_url(self, page):
        """Navigate to result page `page` of the category and return its <img> elements."""
        if page > 1:
            # Page 1 is already loaded after the category click; later pages
            # follow the site's index_<N>.html naming pattern.
            self.browser.get(self.url + f'index_{page}.html')
        return self.browser.find_elements(By.XPATH, '//div[@class="slist"]//img')

    def write_picture(self):
        """Download every thumbnail on each requested page into the category folder."""
        print('客官,请稍等,正在爬取!')
        try:
            for page in trange(1, self.page + 1, desc="Training"):  # progress bar over pages
                time.sleep(0.1)
                for picture in self.gain_url(page):
                    picture_url = picture.get_attribute('src')  # image URL of the thumbnail
                    # Fix: add a timeout so a stalled download cannot hang forever.
                    response = requests.get(picture_url, timeout=30)
                    # Fix: skip failed responses instead of saving error pages as .jpg.
                    if response.status_code != 200:
                        continue
                    with open(f'../彼岸花网壁纸/{self.type}/{picture.get_attribute("alt")}.jpg', 'wb') as f:
                        f.write(response.content)  # write the image bytes
                time.sleep(2)  # be polite to the server between pages
        finally:
            # Fix: always release the browser, even if scraping raised an exception.
            self.browser.quit()
if __name__ == '__main__':
    # Ask the user which wallpaper category to fetch, then run the scraper.
    category = input(
        '请输入爬取图片类型:'
        '[4K新年图片,4K游戏图片,4K动漫图片,4K美女图片,4K风景图片,4K影视图片,4K汽车图片,4K动物图片,4K背景图片,平板壁纸图片,4K独家图片,4K手机壁纸图片]\n'
    )
    scraper = Gain_pictures(category)
    scraper.write_picture()
    print('彼岸花网壁纸爬取完毕!')
Tips:本代码不可用于商业用途,仅供学习测试
如需交流,请关注后私信并说明来意