pixabay 网站图片下载
准备
目标网站:https://pixabay.com/
-
下载这个网站的图片需要先登录,可以用临时邮箱去注册一个账号。
-
使用浏览器插件Cookie-Editor导出json格式的cookies。
-
导入前需要删除 cookies 数据中的 `"sameSite": null` 字段,否则 setCookie 会报错。
这个网站api请求会返回 Cloudflare 验证页面,它提示启用 JavaScript 和 cookies 以继续浏览,这是 Cloudflare 的安全机制,用于保护网站免受恶意访问。
没有办法跳过,于是使用pyppeteer模拟浏览器操作。
编码
import asyncio
import json
from pyppeteer import launch
from spider import Spider
# https://cdn.pixabay.com/photo/2023/08/19/05/50/wolf-8199785_1280.png
# https://pixabay.com/images/download/wolf-8199785.png
async def parse_one_page(url):
    """Open one pixabay search-result page in Chrome (via pyppeteer), scroll
    to the bottom to trigger lazy loading, and collect a download URL for
    every image thumbnail on the page.

    Args:
        url: search-result page URL, e.g.
             ``https://pixabay.com/photos/search/face/?pagi=1``

    Returns:
        list[str]: download URLs such as
        ``https://pixabay.com/images/download/wolf-8199785.png``
    """
    options = {
        "executablePath": r"C:\Program Files\Google\Chrome\Application\chrome.exe",
        'userDataDir': './temp',
        "headless": False,
        "defaultViewport": {"width": 1920, "height": 1080},
        "autoClose": True,
        "dumpio": True,
        "args": [
            '--disable-infobars',
            '--no-sandbox',
            '--start-maximized',
        ],
    }
    browser = await launch(**options)
    try:
        page = await browser.newPage()
        # Remove "sameSite": null entries from cookies.json before importing,
        # otherwise page.setCookie rejects them.
        with open('cookies.json', 'r', encoding='utf8') as f:
            cookies_data = json.load(f)
        for cookie in cookies_data:
            await page.setCookie(cookie)
        try:
            await page.goto(url, options={'timeout': 30000})
        except Exception as exc:
            # Best-effort: the page often keeps loading ads/trackers past the
            # timeout while the DOM we need is already present, so log and
            # continue instead of aborting the whole page.
            print(f'page load did not finish cleanly, continuing: {exc}')
        # Scroll one viewport at a time until the scroll offset stops
        # changing, so every lazily-loaded image enters the DOM.
        last_top = None
        while True:
            top = await page.evaluate('document.documentElement.scrollTop')
            if top == last_top:
                break
            print('滚动条刷新')
            last_top = top
            await page.evaluate('window.scrollBy(0, document.documentElement.clientHeight)')
            await asyncio.sleep(2)
        src_list = await page.xpath('//a[@class="link--WHWzm"]/img/@src')
        download_url_list = []
        for src in src_list:
            img_url = await (await src.getProperty('textContent')).jsonValue()
            print(img_url)
            # e.g. https://cdn.pixabay.com/photo/.../wolf-8199785_1280.png
            #   -> pid "wolf-8199785"; keep the original file extension
            #   (the old code forced ".jpg", breaking .png downloads — see the
            #   example URLs at the top of the file).
            filename = img_url.split('/')[-1]
            pid = filename.split('_')[0]
            ext = filename.rsplit('.', 1)[-1]
            download_url = f"https://pixabay.com/images/download/{pid}.{ext}"
            download_url_list.append(download_url)
        print(download_url_list)
        return download_url_list
    finally:
        # Always close the browser so Chrome processes don't accumulate
        # across the 400+ pages the main loop visits.
        await browser.close()
async def parse_download_url(url, save_path):
    """Resolve a pixabay download link to its final (redirected) CDN URL.

    The URL is opened in a real browser so the login cookies and Cloudflare
    checks are honoured; the response URL after redirects is the direct
    image address.

    Args:
        url: download link, e.g.
             ``https://pixabay.com/images/download/wolf-8199785.jpg``
        save_path: directory the image should later be saved into.

    Returns:
        tuple[str, str]: (final image URL, local path to save the file as)
    """
    options = {
        # "executablePath": r"C:\Program Files\Google\Chrome\Application\chrome.exe",
        'userDataDir': './temp',
        "headless": False,
        "defaultViewport": {"width": 1920, "height": 1080},
        "autoClose": True,
        "dumpio": True,
        "args": [
            '--disable-infobars',
            '--no-sandbox',
            '--start-maximized',
        ],
    }
    browser = await launch(**options)
    try:
        page = await browser.newPage()
        with open('cookies.json', 'r', encoding='utf8') as f:
            cookies_data = json.load(f)
        for cookie in cookies_data:
            await page.setCookie(cookie)
        response = await page.goto(url)
        print(response.url)
    finally:
        # Close even when goto raises, so Chrome processes don't leak.
        await browser.close()
    # Download bookkeeping: derive the local file name from the link.
    pic_name = url.split('/')[-1]
    # os.makedirs(save_path, exist_ok=True)
    save_name = f'{save_path}/{pic_name}'
    return response.url, save_name
if __name__ == '__main__':
    keyword = 'face'
    # Fetch the event loop once and reuse it, instead of re-fetching it on
    # every page and every download URL (the old loop1/loop2 pattern);
    # asyncio.get_event_loop is deprecated for repeated ad-hoc use.
    loop = asyncio.get_event_loop()
    for page_no in range(1, 406):
        print(f'下载第{page_no}页数据')
        page_url = f'https://pixabay.com/photos/search/{keyword}/?pagi={page_no}'
        download_url_list = loop.run_until_complete(parse_one_page(page_url))
        # Resolve each download link to (final URL, local save path) pairs
        # that the threaded Spider consumes.
        task_list = [
            loop.run_until_complete(parse_download_url(download_url, f'./{keyword}'))
            for download_url in download_url_list
        ]
        spider = Spider(
            task_list=task_list,
            thread_num=3,
        )
        spider.run()