Scraping Every Meme Image from a Meme Site with Python

It's me again~~~
Lately I keep seeing people scraping meme images with Python, and it made my fingers itch, so I put one together myself.

Libraries used: BeautifulSoup, requests (plus lxml, which the code uses as its HTML parser)

pip install beautifulsoup4 requests lxml

Analysis:

First, open the site: https://www.doutula.com/photo/list/
and check the total number of pages:
[screenshot: the pagination at the bottom of the list page]
Whoa, 3,000+ pages. No panic. Click through to the next page and look at how the URL is built:
[screenshot: the address bar showing the URL of page 2]
There's a clear pattern: each page's URL is just https://www.doutula.com/photo/list/?page= followed by the page number, which gives us the following starting code:

# coding: utf-8

from bs4 import BeautifulSoup
from requests import get
from os import makedirs, chdir
from sys import exit
from time import sleep


class BiaoQingBao(object):

    def __init__(self):
        self.first_page_link = 'https://www.doutula.com/photo/list/?page='
        self.limit_page = 3425
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36 '
        }

(I was a bit lazy here: the total page count is hard-coded and has to be updated by hand, rather than being fetched up front. A sketch of how you could fetch it automatically follows.)
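
If you'd rather not hard-code the limit, something like this could pull the total page count from page 1. Note that the ul.pagination selector is my assumption about the site's markup, so verify it in DevTools before relying on it:

from bs4 import BeautifulSoup
from requests import get

def get_total_pages():
    # Fetch page 1 and take the largest number among the pagination links.
    # NOTE: 'ul.pagination a' is an assumed selector -- check the real markup.
    html = get('https://www.doutula.com/photo/list/',
               headers={'User-Agent': 'Mozilla/5.0'}).text
    soup = BeautifulSoup(html, 'lxml')
    numbers = [int(a.text.strip()) for a in soup.select('ul.pagination a')
               if a.text.strip().isdigit()]
    return max(numbers) if numbers else 1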

Now that we have the URL pattern, the most exciting step: finding the image addresses.
Right-click an image and choose Inspect:
[screenshot: the img element in DevTools]
The page structure is simple. You can see that three attributes all hold an image URL, so which one is the real one?

In fact, I tried them all, and data-backup is the attribute that holds the working image URL (try it yourself; with the other two, the address changes once you scrape it. My guess is the site lazy-loads its images, so those attributes hold placeholders or get rewritten by JavaScript, but I haven't confirmed this. Can anyone who knows explain?)
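
A quick way to check for yourself is to print the candidate attributes side by side. A minimal sketch (I'm assuming the other two attributes are src and data-original; adjust to whatever DevTools actually shows):

from bs4 import BeautifulSoup
from requests import get

html = get('https://www.doutula.com/photo/list/?page=1',
           headers={'User-Agent': 'Mozilla/5.0'}).text
soup = BeautifulSoup(html, 'lxml')
for img in soup.find_all('img', referrerpolicy='no-referrer')[:3]:
    # Compare the URL-bearing attributes of the first few images.
    print('src:          ', img.get('src'))
    print('data-original:', img.get('data-original'))
    print('data-backup:  ', img.get('data-backup'))
    print('---')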

Based on that discovery, add the following code:

    def request(self, url):
        return get(url, headers=self.headers)

    def next_page(self):
        if self.page_num < self.limit_page:
            self.page_num += 1
        else:
            print('All pages downloaded, exiting...')
            exit()

    def get_picture(self, limit=3420):
        print('Creating the download directory...')
        makedirs('d:\\biaoqingbao', exist_ok=True)

        for i in range(limit):
            print('Creating the directory for page %d...' % self.page_num)
            makedirs('d:\\biaoqingbao\\' + str(self.page_num), exist_ok=True)
            print('Fetching the page...')
            req = self.request(self.first_page_link + str(self.page_num))
            sleep(1)
            soup = BeautifulSoup(req.content.decode('utf-8'), 'lxml')
            picture_tags = soup.find_all('img', referrerpolicy="no-referrer")
            picture_links = [tag['data-backup'] for tag in picture_tags]
            picture_names = [tag['alt'] for tag in picture_tags]
            picture_dict = dict(zip(picture_names, picture_links))
            for name in picture_dict:
                # When resuming, skip images until we reach the last recorded title.
                if self.page_title is not None:
                    if name != self.page_title:
                        continue
                    self.page_title = None  # resume point found, download from here
                chdir('d:\\表情包下载器\\')
                self.write_last_download(self.page_num, name)
                chdir('d:\\biaoqingbao\\' + str(self.page_num))
                print('Downloading: %s' % name)
                req_ = self.request(picture_dict[name])
                sleep(0.5)
                with open(name + '.png', 'wb') as f:
                    f.write(req_.content)
            print('Turning the page...')
            self.next_page()
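
One caveat: the alt text becomes the file name, and Windows forbids characters like \ / : * ? " < > | in file names, so a stray title would crash the open() call (dict(zip(...)) also silently drops images whose alt text repeats within a page). A small helper of my own, not in the original code, could guard against that:

import re

def safe_filename(name):
    # Replace characters that Windows forbids in file names.
    return re.sub(r'[\\/:*?"<>|]', '_', name)

You would then write open(safe_filename(name) + '.png', 'wb') in get_picture.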

Because we found the URL pattern above, we can simply increment the page number by 1 and append it to the URL prefix to turn the page.

With that many memes, should the user really have to start over from scratch after quitting?
No. So let's write two more functions to read and write a small config file:

    @classmethod
    def write_last_download(cls, download_page, download_title):
        # Record the current page and image title so we can resume later.
        with open('settings.ini', 'w', encoding='utf-8') as f:
            f.write('last download page: %d\n' % download_page)
            f.write('last download title: %s\n' % download_title)

    @classmethod
    def load_last_download(cls):
        # Read the resume point back; start from page 1 on the first run.
        try:
            with open('settings.ini', 'r', encoding='utf-8') as f:
                lines = f.read().splitlines()
            page = int(lines[0].split(':', 1)[1].strip())
            title = lines[1].split(':', 1)[1].strip()
            return page, title
        except (IOError, IndexError, ValueError):
            return 1, None
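
With these two functions, settings.ini ends up looking like this (the second line holds whatever alt text was last recorded):

last download page: 12
last download title: (alt text of the last downloaded image)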

(Please create the directory 表情包下载器 in the root of your D: drive yourself, or change the path in the source; a sketch for creating it automatically follows.)
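
If you'd rather not create it by hand, a small addition of mine (not in the original code) creates it on startup:

from os import makedirs

# Create the working directory for settings.ini if it doesn't exist yet.
makedirs('d:\\表情包下载器', exist_ok=True)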

The complete code:

# coding: utf-8

from bs4 import BeautifulSoup
from requests import get
from os import makedirs, chdir
from sys import exit
from time import sleep


class BiaoQingBao(object):

    def __init__(self):
        self.page_num, self.page_title = self.load_last_download()
        self.first_page_link = 'https://www.doutula.com/photo/list/?page='
        self.limit_page = 3425
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36 '
        }

    @classmethod
    def write_last_download(cls, download_page, download_title):
        # Record the current page and image title so we can resume later.
        with open('settings.ini', 'w', encoding='utf-8') as f:
            f.write('last download page: %d\n' % download_page)
            f.write('last download title: %s\n' % download_title)

    @classmethod
    def load_last_download(cls):
        # Read the resume point back; start from page 1 on the first run.
        try:
            with open('settings.ini', 'r', encoding='utf-8') as f:
                lines = f.read().splitlines()
            page = int(lines[0].split(':', 1)[1].strip())
            title = lines[1].split(':', 1)[1].strip()
            return page, title
        except (IOError, IndexError, ValueError):
            return 1, None

    def request(self, url):
        return get(url, headers=self.headers)

    def next_page(self):
        if self.page_num < self.limit_page:
            self.page_num += 1
        else:
            print('All pages downloaded, exiting...')
            exit()

    def get_picture(self, limit=3420):
        print('Creating the download directory...')
        makedirs('d:\\biaoqingbao', exist_ok=True)

        for i in range(limit):
            print('Creating the directory for page %d...' % self.page_num)
            makedirs('d:\\biaoqingbao\\' + str(self.page_num), exist_ok=True)
            print('Fetching the page...')
            req = self.request(self.first_page_link + str(self.page_num))
            sleep(1)
            soup = BeautifulSoup(req.content.decode('utf-8'), 'lxml')
            picture_tags = soup.find_all('img', referrerpolicy="no-referrer")
            picture_links = [tag['data-backup'] for tag in picture_tags]
            picture_names = [tag['alt'] for tag in picture_tags]
            picture_dict = dict(zip(picture_names, picture_links))
            for name in picture_dict:
                # When resuming, skip images until we reach the last recorded title.
                if self.page_title is not None:
                    if name != self.page_title:
                        continue
                    self.page_title = None  # resume point found, download from here
                chdir('d:\\表情包下载器\\')
                self.write_last_download(self.page_num, name)
                chdir('d:\\biaoqingbao\\' + str(self.page_num))
                print('Downloading: %s' % name)
                req_ = self.request(picture_dict[name])
                sleep(0.5)
                with open(name + '.png', 'wb') as f:
                    f.write(req_.content)
            print('Turning the page...')
            self.next_page()


if __name__ == '__main__':
    biaoqingbao = BiaoQingBao()
    while True:
        print('How many pages of memes do you want to grab?')
        answer = input('>>> ')
        if answer:
            try:
                if 1 <= int(answer) <= 3425:
                    break
                else:
                    print('Invalid input! Enter an integer between 1 and 3425.')
            except ValueError:
                print('Please enter an integer!')
        else:
            print('Input cannot be empty!')
    biaoqingbao.get_picture(int(answer))
    print('Thanks for using! You will find the results in d:\\biaoqingbao.')

Results:
[screenshots: the downloaded meme folders and images]
If you liked this, leave a like before you go~~~
