import os
import re
from urllib import request
from urllib.parse import urljoin
class IvskySpider(object):
    """Download the thumbnail images listed on an ivsky.com gallery section.

    Constructing the spider fetches the section's landing page, derives an
    output directory name from the page ``<title>`` and creates that
    directory.  ``run`` then walks the paginated listing pages and saves
    every thumbnail it finds into the directory as ``<counter>.jpg``.
    """

    # Extracts the text of the page <title> element.
    _TITLE_PATTERN = re.compile(r'<title.*?>(.*?)</title>', re.S)
    # Extracts the src of the first <img> inside each "il_img" thumbnail div.
    _IMAGE_PATTERN = re.compile(r'<div class="il_img.*?<img src="(.*?)"', re.S)
    # Path separators, characters reserved in Windows file names and ASCII
    # control characters: none of them may end up in a directory name.
    _UNSAFE_DIR_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')
    # Directory name used when the page has no usable title.
    _DEFAULT_TITLE = 'images'

    def __init__(self):
        """Set up crawl state and create the output directory.

        Note that this performs a network request (through
        ``create_directry``) to read the landing page title.
        """
        self.url = 'http://www.ivsky.com/tupian/ziranfengguang/'
        self.html = ''
        self.title = 'images'
        self.count = 0
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        # URL of the page currently held in ``self.html``; relative image
        # links are resolved against it.
        self._page_url = self.url
        # Create the output directory.
        self.create_directry()

    @classmethod
    def _safe_dir_name(cls, raw_title):
        """Return ``raw_title`` reduced to a name usable as one directory."""
        cleaned = cls._UNSAFE_DIR_CHARS.sub('_', raw_title.strip())
        # Windows silently drops trailing dots/spaces; strip them up front so
        # the name we remember is the name that exists on disk.  This also
        # turns "." / ".." into the empty string, handled by the fallback.
        cleaned = cleaned.strip(' .')
        return cleaned or cls._DEFAULT_TITLE

    def create_directry(self):
        """Fetch ``self.url`` and create a directory named after its title.

        The page title, made safe for use as a directory name, is stored in
        ``self.title``.  If the page has no ``<title>`` the current value of
        ``self.title`` is kept.  An already existing directory is reused.
        """
        # 1. Fetch the HTML source of the landing page.
        self.get_html(self.url)
        # 2. Extract the title and remember it as the directory name.
        found = self._TITLE_PATTERN.search(self.html)
        if found:
            self.title = self._safe_dir_name(found.group(1))
        # 3. Create the directory unless it is already there.
        os.makedirs(self.title, exist_ok=True)

    def get_html(self, url):
        """Fetch ``url`` and store its decoded source in ``self.html``.

        The request is sent with ``self.headers``.  The body is decoded as
        UTF-8 and undecodable bytes are dropped.
        """
        req = request.Request(url=url, headers=self.headers)
        # The context manager closes the connection even if reading fails.
        with request.urlopen(req) as response:
            # Remember the final URL (after any redirect) as the base for
            # relative links found in this page.
            self._page_url = response.geturl()
            self.html = response.read().decode('utf-8', 'ignore')

    def parse_html(self):
        """Download every thumbnail referenced in ``self.html``.

        Each image is saved as ``<self.title>/<n>.jpg`` where ``n`` is the
        running value of ``self.count``, which is incremented per image.
        """
        for link in self._IMAGE_PATTERN.findall(self.html):
            self.count += 1
            print('正在下载第%s张图片,请稍后....' % self.count)
            # Files are numbered rather than named after the remote file.
            path = os.path.join(self.title, '%s.jpg' % self.count)
            # src may be relative or protocol-relative ("//host/x.jpg");
            # urlretrieve needs an absolute URL.
            request.urlretrieve(urljoin(self._page_url, link), path)

    def run(self):
        """Crawl ``index_1.html`` .. ``index_10.html`` under ``self.url``."""
        for page_no in range(1, 11):
            page_url = self.url + 'index_{}.html'.format(page_no)
            self.get_html(page_url)
            self.parse_html()
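# Ivsky ("Heaven Pictures") image download example.
# Footer from the source article: "latest recommended article published 2021-07-10 09:15:52".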