I've redacted the address of the site being crawled here; showing it directly wouldn't be a good idea.
Anyone who hunts for wallpapers regularly can probably guess which site I crawled 🙄 (couldn't find the doge emoji)
from typing import Any, List, Union
# import requests
import random
import string
import urllib.request
import os
from bs4 import BeautifulSoup
from util.accessWebContent import accessWebContent
class _4kpicSpider:
    # Download helper (not implemented yet)
    def download(self):
        pass

    # Visit the site and crawl anime wallpapers from list page `page`
    def linkWebSit(self, page):
        result: List[Union[str, Any]] = []
        _base_url = '.....'
        file_path = 'D:/book/img'
        if not os.path.exists(file_path):
            # Create the download directory
            os.makedirs(file_path)
        if (page is None) or (page == 1):
            # First page of the list
            url = '.....'
        else:
            url = '.....' + str(page) + '.html'
        content = accessWebContent().accessContent(url)
        soup = BeautifulSoup(content, 'html.parser')
        pics = soup.find('ul', class_='clearfix').find_all('img')
        for pic in pics:
            next_url = _base_url + pic.attrs['src']
            result.append(next_url)
            # Download the image
            # pic_resp = requests.get(next_url, timeout=10)
            # Random 10-character filename (sample() draws without repeats)
            ran_str = ''.join(random.sample(string.ascii_letters + string.digits, 10))
            filename = 'x' + ran_str + '.jpg'
            print(filename)
            # urllib.request.urlretrieve(next_url, filename=filename)
            with urllib.request.urlopen(next_url, timeout=30) as response, \
                    open(file_path + '/' + filename, 'wb') as f_save:
                f_save.write(response.read())
        ''' The block below follows each list entry and tries to grab a
        higher-resolution version, but it failed: requests cannot fetch
        content rendered with JS, so doing this properly needs Selenium
        (see the sketch after this listing).
        linkList = soup.find('ul', class_='clearfix').find_all('a')
        for link in linkList:
            next_url = _base_url + link.attrs['href']
            result.append(next_url)
            next_content = accessWebContent().accessContent(next_url)
            next_html = BeautifulSoup(next_content, 'html.parser')
            imgEle = next_html.select_one('#img')
            print(imgEle)
        '''
        # print(result)
        return result
if __name__ == "__main__":
    spider = _4kpicSpider()
    for i in range(1, 147):
        res = spider.linkWebSit(i)
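
The commented-out block in linkWebSit fails because the detail pages fill in the high-resolution image with JavaScript, which plain requests/urllib never executes. Below is a minimal Selenium sketch of that idea, assuming Chrome plus a matching chromedriver are installed; the '#img' selector is carried over from the commented-out code, and detail_url stands in for the redacted site's detail-page URL:

from selenium import webdriver
from selenium.webdriver.common.by import By

def fetch_rendered_img_src(detail_url):
    # Run Chrome headless so no browser window pops up
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        # Selenium executes the page's JS, unlike requests/urllib
        driver.get(detail_url)
        # Locate the element the spider tried to reach with '#img'
        img = driver.find_element(By.CSS_SELECTOR, '#img')
        return img.get_attribute('src')
    finally:
        driver.quit()

The returned src could then be fed into the same urlopen-and-write download loop used above.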
import requests
import logging
class accessWebContent:
    # Fetch a page without custom request headers
    def accessContent(self, url):
        req = requests.get(url)
        if req.encoding == 'ISO-8859-1':
            # requests fell back to its default encoding; detect the real one
            # from the page content, or from the raw bytes as a last resort
            encodings = requests.utils.get_encodings_from_content(req.text)
            if encodings:
                encoding = encodings[0]
            else:
                encoding = req.apparent_encoding
            # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace')
            # With 'replace', undecodable bytes become the Unicode replacement character
            encode_content = req.content.decode(encoding, 'replace')
        else:
            encode_content = req.text
        # The default log level is WARNING, so this only shows with DEBUG enabled
        logging.debug(encode_content)
        return encode_content
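
As the comment notes, Python's root logger defaults to WARNING, so the logging.debug(encode_content) call above prints nothing out of the box. A minimal usage sketch (the URL here is just a placeholder) that lowers the level so the fetched HTML actually shows up:

import logging
from util.accessWebContent import accessWebContent

# Lower the root logger to DEBUG so logging.debug() output becomes visible
logging.basicConfig(level=logging.DEBUG)
html = accessWebContent().accessContent('https://example.com')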
Here's what the crawl results look like: