简单python爬表情包

简单python爬表情包

使用前请先配置好环境:需要 Python 3,并安装 requests 和 beautifulsoup4(pip install requests beautifulsoup4)。

# coding=utf-8
'''
[小试牛刀]爬取斗图网的图片
'''
import os
import time
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup
def get_page(url, timeout=10):
    """Fetch *url* and return its body parsed as a BeautifulSoup document.

    The request is sent with a fixed set of browser-like headers. The HTTP
    status code is printed, and the body is parsed regardless of that status.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would hang the whole crawl.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    # NOTE(review): 'authority', 'method', 'path' and 'scheme' look like
    # HTTP/2 pseudo-header names copied from browser dev tools; here they are
    # sent as ordinary headers. The cookie is a hard-coded captured session
    # and has presumably expired -- confirm whether the site needs it at all.
    request_heads = {
        'authority': 'www.doutula.com',
        'method': 'GET',
        'path': '/',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': '__cfduid=d87014a95c888749a0e7e3558aa029e391542875442; UM_distinctid=1673a8af4ec947-0fe5c7933d2931-4313362-144000-1673a8af4ed561; _ga=GA1.2.997708735.1542875444; _gid=GA1.2.289917732.1542875445; yjs_id=c5141fe916285e06748acc082e344c44; ctrl_time=1; CNZZDATA1256911977=719530834-1542871248-null%7C1542882055; XSRF-TOKEN=eyJpdiI6Ilc3N1wvUGdhXC8rcWZVdXY4R1U1UUxIQT09IiwidmFsdWUiOiI3SGNZbk1Ic3ZYWVNHY045ZFNYakxiM21KMnU2bHdmN2ZkQVl0MFwvejJyalpkSnI1dG9XNzlldVN1aHFtTlA4dSIsIm1hYyI6IjQzMmY0NGI2Y2QxNjFmYWRkYWNkYjM3ODc4ZmYyNzJlNTcyY2U4MzgzYWMxMzUwNTExMDA4MzVmYTQxMDE4MzAifQ%3D%3D; doutula_session=eyJpdiI6IklJWldMQ2d3Q2s5cm9hdXlOcWJhZWc9PSIsInZhbHVlIjoiS1B3OHNNXC9XQmRCeVpJcDZqc1puU3pjbXZyaEVhUFBsOVdlN0o2aGo5MEk5Q3ZUNlRLVUltQk54OE9Malg5aU8iLCJtYWMiOiJmNDE4NDVlNjVkNzkwMmE0NGIzM2E0NTU1NmExYTdhMTY0YjJmNTFiNDM3Mzg5ZTc3ZmExMTczMDBlNmI5MzQ2In0%3D',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    # Send browser-like headers so the request resembles a normal page visit.
    response = requests.get(url, headers=request_heads, timeout=timeout)
    print(response.status_code)
    html = BeautifulSoup(response.text, 'html.parser')
    return html
# print html.prettify().encode('GBK','ignore')
'''爬取网站表情包'''
def _collect_pic_urls(html, container_selector, img_selector):
    """Return ``(url, kind)`` tuples for the first matching image per container."""
    found = []
    for container in html.select(container_selector):
        imgs = container.select(img_selector)
        if not imgs:
            # Container without the expected <img>: skip instead of raising
            # IndexError on imgs[0].
            continue
        pic_url = imgs[0].get("data-original")
        if not pic_url:
            # Missing or empty lazy-load attribute: nothing to download.
            continue
        # Classification is by the last character only, as in the original:
        # anything ending in "g" is labelled 'jpg', anything ending in "f"
        # is labelled 'gif'; every other URL is ignored.
        if pic_url.endswith("g"):
            found.append((pic_url, 'jpg'))
        elif pic_url.endswith("f"):
            found.append((pic_url, 'gif'))
    return found


def get_pic_url(html):
    """Collect the image addresses found on a parsed page.

    Two groups of containers are scanned, in this order: ``div`` elements and
    then ``a`` elements whose class attribute is exactly
    ``"col-xs-6 col-sm-3"``. For each container the ``data-original``
    attribute of its first matching ``img`` is taken as the image URL.

    Args:
        html: Parsed page exposing a CSS ``select`` method (the object
            returned by ``get_page``).

    Returns:
        A list of ``(url, kind)`` tuples where ``kind`` is ``'jpg'`` or
        ``'gif'``. Containers with no matching image, or whose image has no
        usable ``data-original`` value, are skipped.
    """
    # First section: images wrapped in <div> containers.
    picUrl = _collect_pic_urls(
        html,
        'div[class="col-xs-6 col-sm-3"]',
        'img[class="lazy image_dtb img-responsive"]',
    )
    # Second section: images wrapped in <a> containers.
    picUrl.extend(_collect_pic_urls(
        html,
        'a[class="col-xs-6 col-sm-3"]',
        'img[class="img-responsive lazy image_dta"]',
    ))
    return picUrl
def get_pic(picUrl, i, save_dir="/home/run/文档/pa1", delay=5):
    """Download every image in *picUrl* into *save_dir*.

    Files are named ``<i><j>.<kind>`` where ``j`` is the entry's position in
    *picUrl*. Entries whose kind is neither ``'jpg'`` nor ``'gif'`` are not
    downloaded but still consume a position and a delay.

    Args:
        picUrl: Sequence of ``(url, kind)`` entries as produced by
            ``get_pic_url``.
        i: Page number, used as the first part of each file name.
        save_dir: Directory that receives the files; created if missing.
            Defaults to the path the script always used.
        delay: Seconds to pause after each entry.

    Returns:
        Always ``0``.

    Raises:
        OSError: If *save_dir* cannot be created or a file cannot be written.
        urllib.error.URLError: If a download fails.
    """
    # Create the target directory up front; otherwise urlretrieve fails when
    # it does not exist yet.
    os.makedirs(save_dir, exist_ok=True)
    for j, entry in enumerate(picUrl):
        address, kind = entry[0], entry[1]
        if kind in ('jpg', 'gif'):
            # NOTE(review): i and j are concatenated without a separator, so
            # e.g. (i=1, j=12) and (i=11, j=2) map to the same file name.
            # Kept as-is to preserve the existing naming scheme.
            target = os.path.join(save_dir, "%s%s.%s" % (i, j, kind))
            # Save the image using urllib.
            urlretrieve(address, filename=target)
            print("downloding...%s" % address)
        time.sleep(delay)
    return 0
def turn_page(url, i):
    """Build the address of listing page *i* under the site root *url*.

    Args:
        url: Site root, with or without a trailing slash.
        i: Page number to request.

    Returns:
        ``<url>/article/list/?page=<i>``. Any trailing slash on *url* is
        dropped first so the result never contains ``//article``.
    """
    return "%s/article/list/?page=%s" % (url.rstrip("/"), i)
def main():
    """Crawl three pages and download the images found on each.

    The first pass fetches the site root; each later pass fetches the listing
    page whose number is the previous pass's counter (root, page 1, page 2).
    The counter ``i`` (1..3) is passed to ``get_pic`` as the file-name prefix.
    """
    # No trailing slash, so turn_page never produces "//article/...".
    base_url = 'http://www.doutula.com'
    page_url = base_url
    for i in range(1, 4):  # i is the pass number
        html = get_page(page_url)
        picUrl = get_pic_url(html)
        get_pic(picUrl, i)
        # Always derive the next address from the site root; feeding the
        # previous page address back in would stack "/article/list/?page=N"
        # segments onto each other.
        page_url = turn_page(base_url, i)


# Run only when executed as a script. The call used to sit inside main()
# itself, so the crawl never started and main() would have recursed forever.
if __name__ == "__main__":
    main()


  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值