Pexels Image Scraper
Target site
https://www.pexels.com/zh-cn/search/
Download goal
Given a search keyword, automatically download every image returned for that keyword.
Steps
Enter a keyword and search, then scroll down the results page so that more images load. In the browser's developer tools (Network panel), the request endpoint behind the page turns out to be https://www.pexels.com/zh-cn/api/v3/search/photos.
Inspecting those requests shows that two headers must be sent along: User-Agent and Secret-Key.
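Before writing the full scraper, the endpoint can be confirmed with a quick request. The snippet below is only a minimal sketch: the query value 'cat' is a placeholder, the extra filter parameters are omitted, and the Secret-Key value is the one captured from the page's own requests, which may change over time.

    # Minimal check of the endpoint found in the Network panel (sketch only)
    import requests

    url = 'https://www.pexels.com/zh-cn/api/v3/search/photos'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
        'Secret-Key': 'H2jk9uKnhRmL6WPwh89zBezWvr',  # captured from the page; may rotate
    }
    params = {'page': 1, 'per_page': 24, 'query': 'cat'}  # 'cat' is just an example keyword
    resp = requests.get(url, params=params, headers=headers).json()
    print(resp['pagination']['total_pages'])                          # total result pages
    print(resp['data'][0]['attributes']['image']['download_link'])    # first image's download link

If this prints a page count and a download link, the endpoint and headers are correct.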
With that confirmed, here is the full code.
import os
import time
from urllib.parse import parse_qs, urlparse

import requests

from spider import Spider  # multi-threaded downloader (separate module)


class PexelsSpider:
    def __init__(self):
        self.url = 'https://www.pexels.com/zh-cn/api/v3/search/photos'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
            'Secret-Key': 'H2jk9uKnhRmL6WPwh89zBezWvr',
        }

    def get_total_page(self, query):
        # Get the total number of result pages for the keyword
        params = {
            'page': 1,
            'per_page': 24,
            'query': query,
            'orientation': 'all',
            'size': 'all',
            'color': 'all',
            'seo_tags': 'true'
        }
        resp = requests.get(url=self.url, params=params, headers=self.headers).json()
        total_page = resp['pagination']['total_pages']
        print(f'total_page: {total_page}')
        return total_page

    def parse_one_page(self, page, query):
        # Parse one result page into (download_url, save_path) tuples
        result = []
        params = {
            'page': page,
            'per_page': 24,
            'query': query,
            'orientation': 'all',
            'size': 'all',
            'color': 'all',
            'seo_tags': 'true'
        }
        resp = requests.get(url=self.url, params=params, headers=self.headers).json()
        data = resp['data']
        for item in data:
            url = item['attributes']['image']['download_link']
            # The original filename is carried in the "dl" query parameter of the download link
            filename = parse_qs(urlparse(url).query)['dl'][0]
            save_path = os.path.join('pexels', query, filename)
            result.append((url, save_path))
        return result

    def run(self, query):
        total_page = self.get_total_page(query)
        for i in range(1, total_page + 1):
            print(f'##################### {i} ####################')
            task_list = self.parse_one_page(i, query)
            spider = Spider(
                task_list=task_list,
                thread_num=3
            )
            spider.run()
            time.sleep(5)  # pause between pages to avoid hitting the API too hard


if __name__ == '__main__':
    key = input('Enter a keyword: ')
    os.makedirs(f'./pexels/{key}', exist_ok=True)
    ps = PexelsSpider()
    ps.run(key)
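One note on the import at the top: Spider comes from a separate spider module (a small multi-threaded downloader) that is not shown in this post. From the way it is called, it is assumed to take a task_list of (url, save_path) tuples and a thread_num, with a run() method that downloads each URL to its path. A minimal sketch under those assumptions, not the actual module, could look like this:

    # Sketch of the assumed spider.Spider interface: a thread-pooled downloader
    # that saves each URL in task_list to its corresponding local path.
    import os
    from concurrent.futures import ThreadPoolExecutor

    import requests


    class Spider:
        def __init__(self, task_list, thread_num=3):
            self.task_list = task_list      # list of (url, save_path) tuples
            self.thread_num = thread_num    # number of concurrent download threads

        def download(self, task):
            url, save_path = task
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            resp = requests.get(url, timeout=30)
            with open(save_path, 'wb') as f:
                f.write(resp.content)
            print(f'saved {save_path}')

        def run(self):
            # The with-block waits for all downloads to finish before returning
            with ThreadPoolExecutor(max_workers=self.thread_num) as pool:
                pool.map(self.download, self.task_list)

With a module like this on the Python path, running the script and entering a keyword fills ./pexels/<keyword>/ with the downloaded images, page by page.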