#-*- coding:utf-8 -*-
"""
最基础的python爬取图片的程序,目前在学习阶段,程序思路:
①首先得到网页上的图片的分类
②根据图片的分类url,进入到相应的网页,得到这个分类的所有图片Url
③下载图片
"""
import re, requests
import os
def getHtml(url, timeout=10):
    """Fetch *url* and return the raw response body as bytes.

    Sends a desktop-browser User-Agent because the target site rejects
    the default requests UA.

    :param url: absolute URL to fetch (page HTML or image binary).
    :param timeout: seconds to wait for connect/read before giving up;
        the original code had no timeout and could hang forever.
    :return: response body as bytes (``requests`` ``.content``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36'}
    return requests.get(url, headers=headers, timeout=timeout).content
# Collect the URLs of every image group linked from the listing page.
def getImgGroupUrls(url):
    """Return a list of image-group page URLs found on the listing page at *url*.

    ``getHtml`` returns bytes; decode to text before matching so the str
    regex works on Python 3 as well as Python 2 (the original ran a str
    pattern directly over bytes, which raises TypeError on Python 3).
    """
    html = getHtml(url).decode('utf-8', 'ignore')
    # Each group is an <a class="list-group-item"> anchor; capture its href.
    pattern = re.compile(r'<a class="list-group-item" href="(.*?)">')
    return pattern.findall(html)
# Download every image belonging to one image-group page.
def downloadGroupImgs(url):
    """Find all image URLs on the group page at *url* and save each one.

    Bug fixed: the original pattern spelled ``onerror`` with Greek
    omicron characters (U+03BF), so it could never match the ASCII
    ``onerror`` attribute in real HTML. Also decodes the bytes returned
    by ``getHtml`` so the str regex works on Python 3.
    """
    html = getHtml(url).decode('utf-8', 'ignore')
    pattern = re.compile(r'<img src="(.*?)" alt=".*?" onerror="this.src=\'.*?\'">')
    imgUrls = pattern.findall(html)
    for imgUrl in imgUrls:
        # The site emits protocol-relative src values ("//..."), so
        # prefix the scheme before downloading.
        saveImage('http:' + imgUrl)
# Save one image to disk. (Original comment had a typo: 保持 -> 保存.)
imgIndex = 0  # running counter used to name downloaded files 0.jpg, 1.jpg, ...
def saveImage(url):
    """Download the image at *url* and write it to ``image/<n>.jpg``.

    Uses the module-level ``imgIndex`` counter for the filename and
    increments it after a successful write. Creates the ``image``
    directory on first use.
    """
    dirName = 'image'
    global imgIndex
    if not os.path.exists(dirName):
        os.makedirs(dirName)
    # os.path.join instead of string concatenation; also avoid shadowing
    # the builtin name ``file`` that the original used.
    filePath = os.path.join(dirName, '%s.jpg' % imgIndex)
    # Single-argument call form works on both Python 2 and Python 3
    # (the original bare print statement is a SyntaxError on Python 3).
    print(u'正在下载第 %d 张图片' % imgIndex)
    with open(filePath, 'wb') as fileWrite:
        fileWrite.write(getHtml(url))
    imgIndex += 1
def downloadPage(url):
    """Download every image group found on the listing page at *url*.

    Logs the page URL, collects the group links, then downloads each
    group in turn.
    """
    # print() call form is valid on both Python 2 and Python 3; the
    # original ``print url`` statement is a SyntaxError on Python 3.
    print(url)
    imgGroupUrls = getImgGroupUrls(url=url)
    for item in imgGroupUrls:
        downloadGroupImgs(item)
if __name__ == '__main__':
    # Listing pages are paginated, e.g.
    # https://www.doutula.com/article/list/?page=2
    pageUrlTemplate = 'https://www.doutula.com/article/list/?page={}'
    # range(1, 2) -> only page 1; widen the range to crawl more pages.
    for pageNo in range(1, 2):
        downloadPage(pageUrlTemplate.format(pageNo))
# --- page-scrape residue (article title / publish date), commented out so the file parses ---
# python爬虫爬取斗图网上图片
# 最新推荐文章于 2024-04-05 14:08:36 发布