# Python scraper for Photoshop assets (brushes8.com); see the comments below for the detailed approach.
import requests
import re
import os
import random
import time
from lxml import etree
# 获取response信息
def get_text(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    As a side effect this (re)binds the module-level ``headers`` dict, which
    other functions in this file reuse for their own requests.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request raised a
        ``requests.RequestException`` (the error is printed).
    """
    global headers
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.169 Safari/537.36'
    )
    headers = {'User-Agent': user_agent}
    body = ''
    try:
        page = requests.get(url, headers=headers, timeout=30)
        # Force UTF-8 instead of whatever charset requests would pick.
        page.encoding = 'utf-8'
        body = page.text
    except requests.RequestException as err:
        print(err)
    return body
# 获取页数
def get_length(url):
    """Return the number of listing pages for the category at *url*.

    The count is read from the second-to-last link label inside the
    ``#zan-page`` pagination list.

    Args:
        url: Address of the first listing page of a category.

    Returns:
        The page count as an int; ``1`` when the page has no usable
        pagination list (fewer than two links), and ``0`` when the page
        could not be fetched or parsed at all.

    Raises:
        ValueError: If the second-to-last pagination label is not a number.
    """
    text = get_text(url)
    # get_text() returns '' on a failed request; there is nothing to parse.
    if not text:
        return 0
    ehtml = etree.HTML(text)
    if ehtml is None:
        return 0
    labels = ehtml.xpath('//*[@id="zan-page"]/ul/li/a/text()')
    # Without at least two pagination links, indexing [-2] would raise
    # IndexError; treat that as a single-page category.
    if len(labels) < 2:
        return 1
    return int(labels[-2])
# 处理url,直接得到下载页面链接
def url_modified(string):
    """Turn a detail-page URL into the matching download-page URL.

    Every ``brushes8.com`` occurrence gains the ``/xiazaiyemian6`` path
    segment and every ``.html`` occurrence is removed.
    """
    substitutions = (
        ('brushes8.com', 'brushes8.com/xiazaiyemian6'),
        ('.html', ''),
    )
    for old, new in substitutions:
        string = string.replace(old, new)
    return string
# 获得列表页的相关信息
def get_detail_page(url):
    """Collect titles, download-page URLs and thumbnail URLs from a listing page.

    Args:
        url: Address of one listing page.

    Returns:
        A ``(titles, urls, img_urls)`` tuple of lists. The three lists are
        gathered by independent selectors; no alignment between them is
        enforced here. All three are empty when the page could not be
        fetched or parsed.
    """
    text = get_text(url)
    # get_text() returns '' on a failed request; skip parsing in that case.
    if not text:
        return [], [], []
    ehtml = etree.HTML(text)
    if ehtml is None:
        return [], [], []
    titles = ehtml.xpath('//div[@id="containere"]//a/@title')
    # The dot before "html" is escaped so it only matches a literal ".".
    detail_links = re.findall(
        r'<a href="(https://brushes8\.com/\d+\.html)" title=',
        text)
    # Rewrite each detail link so it points straight at the download page.
    urls = [url_modified(link) for link in detail_links]
    img_urls = ehtml.xpath('//div[@id="containere"]//a//img/@src')
    return titles, urls, img_urls
# 下载程序
def download_file(title, img_url, file_url, file_path):
    """Download one asset's thumbnail and archive into the keyword folder.

    Files are stored in ``<file_path>/Photoshop Download/<Keyword>/`` as
    ``<title>.jpg`` and ``<title>.7z``. Network and file-system errors are
    reported on stdout and never propagate.

    Relies on module globals: ``count`` (incremented on every call),
    ``keyword`` (names the target folder) and ``headers`` (the request
    headers bound by ``get_text``).

    Args:
        title: Base name used for both saved files.
        img_url: Address of the thumbnail image.
        file_url: Address of the asset archive.
        file_path: Root directory under which the download folder is created.
    """
    global count
    count += 1
    # os.path.join instead of a hard-coded backslash keeps the layout
    # identical on Windows and correct on other platforms.
    path = os.path.join(file_path, 'Photoshop Download', keyword.capitalize())
    os.makedirs(path, exist_ok=True)
    # Full paths are used below instead of os.chdir(), so the process
    # working directory is left alone and a relative file_path keeps
    # resolving to the same place on every call.
    _download_thumbnail(title, img_url, path)
    _download_archive(title, file_url, path)


def _download_thumbnail(title, img_url, directory):
    """Save the preview image as ``<title>.jpg`` unless it already exists."""
    target = os.path.join(directory, title + '.jpg')
    if os.path.exists(target):
        print(f'{title} 缩略图已经存在啦')
        return
    try:
        resp = requests.get(img_url, headers=headers, timeout=30)
        resp.raise_for_status()
        # Open the file only after the fetch succeeded; otherwise a failed
        # request would leave an empty .jpg that is skipped on later runs.
        with open(target, 'wb') as img:
            img.write(resp.content)
        print(f'正在下载第{count}个缩略图: {title}')
    except (requests.RequestException, OSError) as err:
        print(f'{title}------缩略图下载失败', err)


def _download_archive(title, file_url, directory):
    """Stream the asset archive to ``<title>.7z``, skipping complete copies."""
    file_name = os.path.join(directory, title + '.7z')
    try:
        # stream=True defers the body so large archives are written chunk by
        # chunk; the with-block releases the connection on every exit path.
        with requests.get(file_url, headers=headers, stream=True,
                          timeout=30) as resp:
            resp.raise_for_status()
            try:
                file_size = int(resp.headers['content-length'])
            except (KeyError, ValueError):
                # Header absent or malformed: the size cannot be verified.
                file_size = None
            if os.path.exists(file_name) and (
                    file_size is None
                    or os.path.getsize(file_name) == file_size):
                print(f'{title} 素材包已经存在啦')
                return
            print(f'正在下载第{count}个素材包: {title}')
            # Write to a temporary name and rename on success, so an
            # interrupted transfer never looks like a finished archive.
            partial = file_name + '.part'
            size = 0
            with open(partial, 'wb') as file:
                for chunk in resp.iter_content(chunk_size=512 * 1024):
                    if not chunk:
                        continue
                    file.write(chunk)
                    size += len(chunk)
                    if file_size:
                        print('\r当前下载进度为{:.1%}'.
                              format(size / file_size), end='')
            os.replace(partial, file_name)
            print(f'\n第{count}个素材包: {title}------下载完成')
        # Random 1-3 s pause after each real download.
        time.sleep(2 * random.random() + 1)
        print('')
    except (requests.RequestException, OSError) as err:
        print(f'{title}------下载失败', err)
def run(key_word):
    """Crawl every listing page of a brush category and download its assets.

    Args:
        key_word: Category slug inserted into the listing URL
            (``.../photoshop-brushes/<key_word>-brushes``).
    """
    global keyword, count
    # download_file() reads the module globals `keyword` and `count`.
    # Keep `keyword` in step with the category actually being crawled and
    # make sure `count` exists even when this module is imported.
    keyword = key_word
    count = globals().get('count', 0)

    start_url = 'https://brushes8.com/category/photoshop-brushes/' \
                f'{key_word}-brushes'
    length = get_length(start_url)
    print(f'当前素材一共有{length}页\n')
    url_template = 'https://brushes8.com/category/photoshop-brushes/' \
                   '{}-brushes/page/{}'
    page_urls = [url_template.format(key_word, i)
                 for i in range(1, length + 1)]
    file_path = os.getcwd()
    # A character class is required here: a bare "(|)" in a regex is an
    # empty group, not literal parentheses. Both ASCII and full-width
    # punctuation are stripped.
    title_noise = re.compile('[、,,。;;()()]|下载')
    for page_url in page_urls:
        titles, detail_urls, imgs = get_detail_page(page_url)
        for title, detail_url, img in zip(titles, detail_urls, imgs):
            title = title_noise.sub('', title)
            text = get_text(detail_url)
            if text == '':
                continue
            ehtml = etree.HTML(text)
            if ehtml is None:
                continue
            links = ehtml.xpath('//ul[@class="xzyemul"]/li[1]/a/@href')
            # No download link on the page: skip this asset. Checking the
            # list explicitly avoids masking errors raised by download_file.
            if not links:
                continue
            download_file(title, img, links[0], file_path)
if __name__ == '__main__':
    # Running download counter incremented by download_file().
    count = 0
    # Category to crawl. This could become a list of keywords so that
    # several categories are scraped with multiple processes.
    keyword = 'light'
    run(keyword)