python爬虫爬取斗图网上图片

#-*- coding:utf-8 -*-

"""
最基础的python爬取图片的程序,目前在学习阶段,程序思路:
①首先得到网页上的图片的分类
②根据图片的分类url,进入到相应的网页,得到这个分类的所有图片Url
③下载图片
"""
import re, requests
import os

def getHtml(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the raw response body.

    The request carries a desktop-Chrome ``User-Agent`` header.

    Args:
        url: Address to request.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout error. Without it a stalled connection would block the
            whole crawl indefinitely.

    Returns:
        The undecoded response body (``Response.content``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36'}
    return requests.get(url, headers=headers, timeout=timeout).content

# Collect the URLs of the image groups linked from a listing page.
def getImgGroupUrls(url):
    """Return the ``href`` of every ``list-group-item`` anchor found at ``url``.

    Args:
        url: Address of the listing page to scan.

    Returns:
        A list with one captured ``href`` value per matching ``<a>`` tag, in
        page order. Empty when nothing matches.
    """
    html = getHtml(url)
    # The body arrives as bytes; on Python 3 a text pattern cannot search
    # bytes (TypeError), so decode first. Assumes the page is UTF-8 --
    # undecodable bytes are replaced instead of aborting the crawl.
    if isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')
    return re.findall(r'<a class="list-group-item" href="(.*?)">', html)



# Download every image belonging to one group page.
def downloadGroupImgs(url):
    """Find the images on the group page at ``url`` and save each of them.

    An image is recognised by an ``<img>`` tag with ``src``, ``alt`` and
    ``onerror="this.src='...'"`` attributes in exactly that order; the
    captured ``src`` is handed to ``saveImage``.

    Args:
        url: Address of the group page to scan.
    """
    html = getHtml(url)
    # The body arrives as bytes; on Python 3 a text pattern cannot search
    # bytes (TypeError), so decode first. Assumes the page is UTF-8.
    if isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')
    # The attribute name must be plain ASCII "onerror". The previous pattern
    # spelled it with Greek omicron characters (U+03BF), so it could never
    # match real markup and no image was ever downloaded.
    pattern = re.compile(r'<img src="(.*?)" alt=".*?" onerror="this.src=\'.*?\'">')
    for imgUrl in pattern.findall(html):
        # Protocol-relative sources ("//host/path") need a scheme; URLs that
        # already carry one must not get a second "http:" glued in front.
        if imgUrl.startswith('//'):
            imgUrl = 'http:' + imgUrl
        saveImage(imgUrl)


# Save one image to disk.
imgIndex = 0  # running count of saved images; doubles as the next file name


def saveImage(url):
    """Download ``url`` and store it as ``image/<imgIndex>.jpg``.

    Creates the ``image`` directory when it does not exist yet, prints a
    progress line, writes the downloaded bytes and then increments the
    module-level ``imgIndex`` counter.

    Args:
        url: Address of the image to download.
    """
    global imgIndex
    dirName = 'image'
    if not os.path.exists(dirName):
        os.makedirs(dirName)
    filePath = dirName + ('/%s.jpg' % imgIndex)
    # Single-argument parenthesised print: valid on Python 3 and identical
    # in output to the print statement on Python 2.
    print(u'正在下载第 %d 张图片' % imgIndex)
    # Fetch before opening the target so a failed request does not leave an
    # empty file behind.
    data = getHtml(url)
    with open(filePath, 'wb') as fileWrite:
        fileWrite.write(data)
    imgIndex += 1

def downloadPage(url):
    """Download the images of every group linked from one listing page.

    Prints the listing URL, collects the group links with
    ``getImgGroupUrls`` and passes each one to ``downloadGroupImgs``.

    Args:
        url: Address of the listing page to process.
    """
    # Single-argument parenthesised print: valid on Python 3 and identical
    # in output to the print statement on Python 2.
    print(url)
    for groupUrl in getImgGroupUrls(url=url):
        downloadGroupImgs(groupUrl)

if __name__ == '__main__':
    # Listing pages are addressed by number, for example
    # https://www.doutula.com/article/list/?page=2
    pageTemplate = 'https://www.doutula.com/article/list/?page={}'
    # Crawl pages firstPage .. stopPage - 1; as written that is page 1 only.
    firstPage, stopPage = 1, 2
    pageNumber = firstPage
    while pageNumber < stopPage:
        downloadPage(pageTemplate.format(pageNumber))
        pageNumber += 1


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值