Python爬取ps笔刷素材--大文件下载

最新推荐文章于 2020-12-05 14:31:20 发布

Sound_of_ Silence

最新推荐文章于 2020-12-05 14:31:20 发布

阅读量285

点赞数 1

分类专栏： Python 爬虫 request

本文链接：https://blog.csdn.net/weixin_44521703/article/details/102215991

版权

Python 同时被 3 个专栏收录

92 篇文章 5 订阅

订阅专栏

爬虫

33 篇文章 0 订阅

订阅专栏

request

16 篇文章 0 订阅

订阅专栏

使用 Python 爬取 Photoshop 笔刷素材的代码，详细思路见代码中的注释。

import requests
import re
import os
import random
import time
from lxml import etree


# 获取response信息
def get_text(url):
    """Download *url* and return its body as text decoded as UTF-8.

    As a side effect the module-level ``headers`` dict is (re)assigned on
    every call; ``download_file`` reads that global for its own requests.

    Returns an empty string when the request raises
    ``requests.RequestException`` (the error is printed, not re-raised).
    """
    global headers
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.169 Safari/537.36'
    )
    headers = {'User-Agent': user_agent}
    try:
        page = requests.get(url, headers=headers, timeout=30)
        # Force UTF-8 instead of relying on the encoding requests guesses.
        page.encoding = 'utf-8'
        body = page.text
    except requests.RequestException as err:
        print(err)
        body = ''
    return body


# 获取页数
def get_length(url):
    """Return the number of listing pages for the category at *url*.

    The count is read from the second-to-last link text inside the
    ``#zan-page`` pagination element (presumably the last link is a
    "next page" control -- confirm against the live markup).

    Returns:
        int: ``0`` when the page could not be fetched or parsed, ``1`` when
        the page has fewer than two pagination links (treated as a
        single-page category), otherwise the parsed page number.

    Raises:
        ValueError: if the second-to-last pagination label is not an integer.
    """
    text = get_text(url)
    if not text:
        # get_text returns '' on a failed request; there is nothing to crawl.
        return 0
    ehtml = etree.HTML(text)
    if ehtml is None:
        # Defensive: the parser may yield no tree for blank documents.
        return 0
    labels = ehtml.xpath('//*[@id="zan-page"]/ul/li/a/text()')
    if len(labels) < 2:
        # No usable pagination block: indexing [-2] would raise IndexError.
        return 1
    return int(labels[-2])


# 处理url，直接得到下载页面链接
def url_modified(string):
    """Rewrite a detail-page URL into the corresponding download-page URL.

    Every ``brushes8.com`` occurrence gains a ``/xiazaiyemian6`` suffix and
    every ``.html`` occurrence is removed. Strings containing neither are
    returned unchanged.
    """
    with_download_segment = string.replace(
        'brushes8.com', 'brushes8.com/xiazaiyemian6')
    without_suffix = with_download_segment.replace('.html', '')
    return without_suffix


# 获得列表页的相关信息
def get_detail_page(url):
    """Extract item titles, download-page URLs and thumbnail URLs from a listing page.

    Args:
        url: address of one category listing page.

    Returns:
        tuple: ``(titles, urls, img_urls)``, three lists. ``urls`` holds the
        detail links already rewritten by ``url_modified``. All three lists
        are empty when the page could not be fetched or parsed.

    NOTE(review): the three lists come from independent queries (two XPath
    expressions and one regex), so they only line up item-by-item if the page
    lists them in the same order -- confirm against the live markup.
    """
    text = get_text(url)
    if not text:
        # get_text returns '' on a failed request; skip parsing entirely.
        return [], [], []
    ehtml = etree.HTML(text)
    if ehtml is None:
        # Defensive: the parser may yield no tree for blank documents.
        return [], [], []

    titles = ehtml.xpath('//div[@id="containere"]//a/@title')
    # The dot before "html" is escaped so it matches only a literal ".".
    detail_links = re.findall(
        r'<a href="(https://brushes8\.com/\d+\.html)" title=',
        text)
    urls = [url_modified(link) for link in detail_links]
    img_urls = ehtml.xpath('//div[@id="containere"]//a//img/@src')
    return titles, urls, img_urls


# 下载程序
def download_file(title, img_url, file_url, file_path):
    """Download one item's thumbnail and its archive into the keyword folder.

    Files are stored under ``<file_path>/Photoshop Download/<Keyword>/`` as
    ``<title>.jpg`` and ``<title>.7z``.

    Relies on three module-level globals: ``count`` (incremented on every
    call and used in progress messages), ``keyword`` (names the target
    sub-folder) and ``headers`` (assigned by ``get_text``, so ``get_text``
    must have run at least once before this function is called).

    Args:
        title: item title, used as the base file name.
        img_url: URL of the thumbnail image.
        file_url: URL of the archive to download.
        file_path: base directory under which the download folder is created.

    Network and file-system errors are reported with ``print`` and do not
    propagate; the function always returns ``None``.
    """
    global count
    count += 1

    # Build the path with os.path.join rather than a hard-coded backslash so
    # the folder nests correctly on every platform. The working directory is
    # deliberately left untouched; all files are addressed by full path.
    path = os.path.join(file_path, 'Photoshop Download', keyword.capitalize())
    os.makedirs(path, exist_ok=True)

    # --- Thumbnail ---------------------------------------------------------
    img_name = os.path.join(path, title + '.jpg')
    if os.path.exists(img_name):
        print(f'{title} 缩略图已经存在啦')
    else:
        try:
            # Fetch first, then open the file: a failed request must not
            # leave an empty .jpg that later looks like a finished download.
            img_resp = requests.get(img_url, headers=headers, timeout=30)
            img_resp.raise_for_status()
            with open(img_name, 'wb') as img:
                img.write(img_resp.content)
            print(f'正在下载第{count}个缩略图: {title}')
        except (requests.RequestException, OSError) as err:
            print(f'{title} 缩略图下载失败', err)

    # --- Archive -----------------------------------------------------------
    file_name = os.path.join(path, title + '.7z')
    downloaded = False
    try:
        # stream=True defers the body so large files are written chunk by
        # chunk; the with-block releases the connection in every case.
        with requests.get(file_url, headers=headers, stream=True,
                          timeout=30) as resp:
            resp.raise_for_status()
            # 0 means the server did not announce a size.
            file_size = int(resp.headers.get('content-length', 0))
            already_complete = (
                file_size > 0
                and os.path.exists(file_name)
                and os.path.getsize(file_name) == file_size
            )
            if already_complete:
                print(f'{title} 素材包已经存在啦')
            else:
                # Missing file, or size mismatch from an interrupted earlier
                # run: (re)download from scratch.
                print(f'正在下载第{count}个素材包: {title}')
                size = 0
                with open(file_name, 'wb') as file:
                    for chunk in resp.iter_content(chunk_size=512 * 1024):
                        if not chunk:
                            continue
                        file.write(chunk)
                        size += len(chunk)
                        if file_size > 0:
                            print('\r当前下载进度为{:.1%}'.
                                  format(size / file_size), end='')
                print(f'\n第{count}个素材包： {title}------下载完成')
                downloaded = True
        print('')
    except (requests.RequestException, OSError, ValueError) as err:
        # ValueError covers a malformed content-length header.
        print(f'{title}------下载失败', err)

    if downloaded:
        # Random 1-3 s pause between real downloads to go easy on the server.
        time.sleep(2 * random.random() + 1)


def run(key_word):
    """Crawl every listing page of a brush category and download each item.

    Args:
        key_word: category keyword; the category address is
            ``https://brushes8.com/category/photoshop-brushes/<key_word>-brushes``.

    For each listing page the item titles, download-page URLs and thumbnails
    are collected with ``get_detail_page``; each download page is fetched,
    the first link inside ``ul.xzyemul`` is taken as the archive URL, and the
    item is handed to ``download_file``. Items whose download page cannot be
    fetched or contains no such link are skipped.
    """
    category_url = ('https://brushes8.com/category/photoshop-brushes/'
                    f'{key_word}-brushes')
    length = get_length(category_url)
    print(f'当前素材一共有{length}页\n')

    # Resolve the base directory once, before any download, so every item
    # lands under the same root even if the working directory changes later.
    file_path = os.getcwd()

    for page_no in range(1, length + 1):
        page_url = f'{category_url}/page/{page_no}'
        titles, detail_urls, imgs = get_detail_page(page_url)
        for title, detail_url, img in zip(titles, detail_urls, imgs):
            # Strip full-width punctuation and the word "下载" from the title
            # before it is used as a file name.
            title = re.sub('、|，|。|；|（|）|下载', '', title)
            text = get_text(detail_url)
            if text == '':
                continue
            ehtml = etree.HTML(text)
            if ehtml is None:
                continue
            hrefs = ehtml.xpath('//ul[@class="xzyemul"]/li[1]/a/@href')
            if not hrefs:
                # No download link on this page; checked explicitly so that
                # errors raised inside download_file are not masked.
                continue
            download_file(title, img, hrefs[0], file_path)


if __name__ == '__main__':
    # Both names below are module-level globals read by download_file:
    # `count` numbers the downloads, `keyword` names the target sub-folder.
    count = 0
    keyword = 'light'
    # keyword could be turned into a list so that several categories are
    # crawled in parallel with multiple processes.
    run(keyword)

Sound_of_ Silence

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
1
评论
Python爬取ps笔刷素材--大文件下载

python 爬取Photoshop素材代码，url还是加密防水了~~import requestsimport reimport osimport randomimport timefrom lxml import etreedef get_text(url): global headers headers = {'User-Agent': ...
复制链接

扫一扫