Scraping Photoshop (PS) Brush Assets with Python -- Downloading Large Files


Python code for scraping Photoshop brush assets; the detailed approach is explained in the comments.
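The key point in the title is the large-file download: requests can stream a response with stream=True and write it to disk chunk by chunk via iter_content, instead of reading the whole body into memory at once. A minimal, standalone sketch of just that idea (the URL and file name below are placeholders, not taken from the site):

import requests

# Stream a large file to disk without loading it into memory all at once.
url = 'https://example.com/big.7z'  # placeholder URL
resp = requests.get(url, stream=True, timeout=30)
total = int(resp.headers.get('content-length', 0))
done = 0
with open('big.7z', 'wb') as f:
    for chunk in resp.iter_content(chunk_size=512 * 1024):  # 512 KB per chunk
        if chunk:
            f.write(chunk)
            done += len(chunk)
            if total:
                print(f'\rProgress: {done / total:.1%}', end='')
print()

The full script below applies the same pattern to every brush package in a brushes8.com category.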

import requests
import re
import os
import random
import time
from lxml import etree


# Request headers shared by every request in this script
HEADERS = {'User-Agent':
           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}


# Fetch a page and return its decoded text ('' on failure)
def get_text(url):
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException as err:
        print(err)
        return ''


# Get the total number of listing pages from the pagination bar
def get_length(url):
    text = get_text(url)
    ehtml = etree.HTML(text)
    # The second-to-last pagination link holds the last page number
    length = ehtml.xpath('//*[@id="zan-page"]/ul/li/a/text()')[-2]
    return int(length)


# Rewrite a post URL so it points directly at its download page
def url_modified(string):
    return string.replace(
        'brushes8.com', 'brushes8.com/xiazaiyemian6').replace('.html', '')


# Extract titles, download-page URLs and thumbnail URLs from a listing page
def get_detail_page(url):
    text = get_text(url)
    ehtml = etree.HTML(text)
    titles = ehtml.xpath('//div[@id="containere"]//a/@title')
    urls = re.findall(
        r'<a href="(https://brushes8\.com/\d+\.html)" title=',
        text)
    urls = list(map(url_modified, urls))
    img_urls = ehtml.xpath('//div[@id="containere"]//a//img/@src')
    return titles, urls, img_urls


# Download the thumbnail and the brush package for one item
def download_file(title, img_url, file_url, file_path):
    global count
    count += 1
    # Save everything under "Photoshop Download/<Keyword>"
    directory = os.path.join('Photoshop Download', keyword.capitalize())
    path = os.path.join(file_path, directory)
    if not os.path.exists(path):
        os.makedirs(path)
    os.chdir(path)

    # Download the thumbnail
    try:
        if os.path.exists(title + '.jpg'):
            print(f'Thumbnail for {title} already exists')
        else:
            with open(title + '.jpg', "wb") as img:
                img.write(requests.get(img_url, headers=HEADERS).content)
                print(f'Downloading thumbnail #{count}: {title}')
    except (requests.RequestException, PermissionError, IOError):
        pass

    # Download the brush package (streamed, since these files can be large)
    try:
        # stream=True defers downloading the body until we iterate over it
        resp = requests.get(file_url, headers=HEADERS, stream=True)
        file_size = int(resp.headers['content-length'])
        file_name = os.path.join(path, title + '.7z')
        if os.path.exists(file_name) and os.path.getsize(file_name) == file_size:
            print(f'Package {title} already exists')
        else:
            with open(file_name, 'wb') as file:
                size = 0
                print(f'Downloading package #{count}: {title}')
                # Large files are written chunk by chunk instead of all at once
                for chunk in resp.iter_content(chunk_size=512 * 1024):
                    if chunk:
                        file.write(chunk)
                        size += len(chunk)
                        print('\rCurrent progress: {:.1%}'.
                              format(size / file_size), end='')
                print(f'\nPackage #{count}: {title} ------ download finished')
                time.sleep(2 * random.random() + 1)
        print('')
    except (requests.RequestException, KeyError, PermissionError, IOError) as err:
        print(f'{title} ------ download failed', err)


def run(key_word):
    start_url = f'https://brushes8.com/category/photoshop-brushes/' \
                f'{key_word}-brushes'
    length = get_length(start_url)
    print(f'This category has {length} pages in total\n')

    url_template = 'https://brushes8.com/category/photoshop-brushes/' \
                   '{}-brushes/page/{}'
    page_urls = [url_template.format(key_word, i) for i in range(1, length + 1)]
    file_path = os.getcwd()

    for page_url in page_urls:
        titles, detail_urls, imgs = get_detail_page(page_url)
        for title, url, img in zip(titles, detail_urls, imgs):
            # Strip punctuation and the word "下载" (download) so the title is a valid file name
            title = re.sub('、|,|。|;|(|)|下载', '', title)
            text = get_text(url)
            if text == '':
                continue
            ehtml = etree.HTML(text)
            try:
                file_url = ehtml.xpath(
                    '//ul[@class="xzyemul"]/li[1]/a/@href')[0]
                download_file(title, img, file_url, file_path)
            except IndexError:
                continue


if __name__ == '__main__':
    keyword = 'light'
    # keyword could be turned into a list and crawled with multiprocessing (see the sketch after the script).
    count = 0
    run(keyword)
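As the comment above suggests, keyword can be turned into a list of categories and the categories crawled in parallel. A rough sketch of that idea, replacing the if __name__ == '__main__' block above (the extra category names and the process count are placeholders, not verified against the site):

from multiprocessing import Pool


def crawl(key_word):
    # Each worker process has its own copy of the module globals used by download_file
    global keyword, count
    keyword = key_word
    count = 0
    run(key_word)


if __name__ == '__main__':
    keywords = ['light', 'grunge', 'splatter']  # placeholder category list
    with Pool(processes=3) as pool:
        pool.map(crawl, keywords)

Each category gets its own process, and since download_file builds the folder name from the keyword global, downloads for different categories land in separate folders.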

