分别用类、函数与普通的方法爬取天堂网图片，并将其分类保存到文件夹中

最新推荐文章于 2020-05-29 21:52:20 发布

dayun555

最新推荐文章于 2020-05-29 21:52:20 发布

阅读量949

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/dayun555/article/details/79460814

版权

python 专栏收录该内容

74 篇文章 2 订阅

订阅专栏

1.普通方法

引入需要的包

# 当创建文件夹时，用到此包
import os
# 网络请求包
from urllib import request, parse
# 正则
import re
from fake_useragent import UserAgent
useragent = UserAgent()

准备url地址

# Listing page of the "ziranfengguang" section; its HTML is scanned further down for category links.
url = 'http://www.ivsky.com/tupian/ziranfengguang/'

构造请求对象

# Build the request object, sending a User-Agent header whose value comes
# from fake_useragent's `random` attribute.
req = request.Request(url=url,
                      headers={'User-Agent': useragent.random})

发起请求，接收响应

# Send the request; the returned response object is read in the next step.
# NOTE(review): this script never closes the response explicitly.
response = request.urlopen(req)

将返回的字节转换为str类型

# Read the raw body bytes and decode them as UTF-8 to obtain the page source as str.
html = response.read().decode('utf-8')

准备正则，进行匹配

# Two capture groups: the href and the title of the <a> that follows an
# il_img <div>. Without re.DOTALL, '.' does not cross line breaks.
pattern = re.compile('<div class="il_img".*?<a href="(.*?)" title="(.*?)"')
# With several groups, findall yields one (href, title) tuple per match.
res = pattern.findall(html)

for循环遍历列表，取出每一个图片分类的链接及标题

# The image pattern is the same for every category, so compile it once
# instead of on each loop iteration.
detail_pat = re.compile('<div class="il_img.*?<img src="(.*?)"')

# Each entry of `res` is a (link, title) tuple for one picture category.
for info in res:
    link, title = info
    # exist_ok=True replaces the racy "check exists, then create" sequence.
    os.makedirs(title, exist_ok=True)
    # urljoin handles site-relative links as well as already-absolute ones,
    # which plain string concatenation does not.
    detail_url = parse.urljoin('http://www.ivsky.com', link)
    # Send the same kind of User-Agent header as the index request; the
    # original fetched detail pages with urllib's default agent.
    detail_req = request.Request(url=detail_url,
                                 headers={'User-Agent': useragent.random})
    # The context manager closes the connection once the body has been read.
    with request.urlopen(detail_req) as response:
        detail_html = response.read().decode('utf-8')
    detail_res = detail_pat.findall(detail_html)
    for src in detail_res:
        # The last path segment of the image URL doubles as the file name.
        name = src.split('/')[-1]
        print('图片名称：{}  图片链接：{}'.format(name, src))
        # Resolve src against the page URL so relative or protocol-relative
        # sources still download; absolute sources pass through unchanged.
        request.urlretrieve(parse.urljoin(detail_url, src), title+'/'+name)

2.用函数的方法爬取天堂网风景图片

引入包

import os
# 网络请求包
from urllib import request, parse
# 正则
import re
from fake_useragent import UserAgent
useragent = UserAgent()

定义一个发送请求、获取html源码的函数

def get_html(url):
    """
    Request *url* and return the response body decoded as UTF-8.

    :param url: address to request
    :return: the HTML source as str
    """
    # Build the request with a User-Agent header taken from fake_useragent.
    req = request.Request(
        url=url,
        headers={'User-Agent': useragent.random}
    )
    # The context manager closes the connection even when read() or
    # decode() raises, which the original never did.
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')

用正则在传过来的html数据中解析详情url和分类标题

def get_detail(html):
    """
    Extract every category link and title from *html*, create a folder
    under ``images/`` for each title and download that category's images.

    :param html: source code of the listing page
    :return: None
    """
    # Two capture groups: the detail-page href and the category title.
    pattern = re.compile('<div class="il_img".*?<a href="(.*?)" title="(.*?)"')
    for link, title in pattern.findall(html):
        path = 'images/'+title
        # exist_ok=True replaces the racy "check exists, then create" sequence.
        os.makedirs(path, exist_ok=True)
        # urljoin handles site-relative links as well as already-absolute
        # ones, which plain string concatenation does not.
        detail_url = parse.urljoin('http://www.ivsky.com', link)
        # Fetch the detail page and store its images in the new folder.
        get_img_src(detail_url, path)

根据正则获取图片地址，下载图片

def get_img_src(url, path):
    """
    Download every image referenced inside an il_img block of the page at *url*.

    :param url: address of the category detail page
    :param path: directory the images are written into (not created here)
    :return: None
    """
    # Fetch the source of the detail page.
    html = get_html(url)
    pattern = re.compile('<div class="il_img.*?<img src="(.*?)"')
    for src in pattern.findall(html):
        print(src)
        # The last path segment of the image URL doubles as the file name.
        name = src.split('/')[-1]
        # Resolve src against the page URL so relative or protocol-relative
        # sources still download; absolute sources pass through unchanged.
        request.urlretrieve(parse.urljoin(url, src), path+'/'+name)

爬虫的主函数：准备起始url，获取列表页源码并交给解析函数处理

def main():
    """Entry point: fetch the listing page and pass its source to get_detail."""
    start_url = 'http://www.ivsky.com/tupian/ziranfengguang/'
    get_detail(get_html(start_url))

# Run the crawler only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()

3.用类的方法爬取天堂网风景图片

# -*- coding:utf-8 -*-

import os
# 网络请求包
from urllib import request, parse
# 正则
import re

from fake_useragent import UserAgent
useragent = UserAgent()


# Crawler class for the ivsky.com scenery picture listing
class IvskySpider(object):
    """Fetch the listing page, then download the images of every category.

    State is passed between methods through attributes: ``url`` is the
    address ``get_html`` fetches, ``html`` the most recently fetched page
    source, and ``path`` the folder images are currently saved into.
    """

    def __init__(self):
        # Address fetched by get_html(). It starts as the listing page and
        # is repointed at each detail page by parse_detail(), so after a
        # crawl it holds the last detail page URL.
        self.url = 'http://www.ivsky.com/tupian/ziranfengguang/'
        self.headers = {'User-Agent': useragent.random}
        self.html = ''
        # Folder of the category being downloaded; declared here so the
        # attribute exists before parse_detail() first assigns it.
        self.path = ''

    def get_html(self):
        """Fetch ``self.url`` and store the UTF-8 decoded body in ``self.html``."""
        req = request.Request(self.url, headers=self.headers)
        # The context manager closes the connection even when read() or
        # decode() raises.
        with request.urlopen(req) as response:
            self.html = response.read().decode('utf-8')

    def parse_detail(self):
        """Find each category link and title in ``self.html`` and download it."""
        # Two capture groups: the detail-page href and the category title.
        pattern = re.compile('<div class="il_img".*?<a href="(.*?)" title="(.*?)"')
        for link, title in pattern.findall(self.html):
            path = 'images/' + title
            # exist_ok=True replaces the racy "check exists, then create" sequence.
            os.makedirs(path, exist_ok=True)
            # urljoin handles site-relative links as well as already-absolute
            # ones, which plain string concatenation does not.
            self.url = parse.urljoin('http://www.ivsky.com', link)
            self.path = path
            # Called through the historical name so existing subclasses
            # that override it keep working.
            self.parse_src_domnload()

    def parse_src_download(self):
        """Fetch the page at ``self.url`` and save its images into ``self.path``."""
        self.get_html()
        pattern = re.compile('<div class="il_img.*?<img src="(.*?)"')
        for src in pattern.findall(self.html):
            print(src)
            # The last path segment of the image URL doubles as the file name.
            name = src.split('/')[-1]
            # Resolve src against the page URL so relative or
            # protocol-relative sources still download.
            request.urlretrieve(parse.urljoin(self.url, src),
                                self.path + '/' + name)

    # The method was originally published under this misspelled name; keep
    # it as an alias so existing callers do not break.
    parse_src_domnload = parse_src_download

    def start(self):
        """Run the crawl: fetch the listing page, then process every category."""
        self.get_html()
        self.parse_detail()


# 是否在当前文件直接运行
if __name__ == '__main__':
    ivsky = IvskySpider()
    ivsky.start()