作为一名 Python 新手,我简单地编写了一小段爬虫程序:获取网页中的图片数据并下载到本地。如果有更优化的方法,欢迎分享!本人小白一名。
#!/usr/bin/env python2.7
# -*- coding:utf-8 -*-
import re
import urllib
import os
# Local directory where the downloaded images are stored.
path = "C:/Users/Administrator/Desktop/image"
# Address of the web page to crawl.
url = "http://www.地址.com/"
def get_content(url):
    """Fetch the raw body of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as read from the connection (a byte
        string; on Python 2 that is a plain ``str``).

    Raises:
        IOError: If the resource cannot be opened or read.
    """
    # Local import so the function runs on both Python 2 (urllib.urlopen)
    # and Python 3 (urllib.request.urlopen).
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()
def get_image_info(info, path, base_url=None):
    """Find image URLs in a page and download each one into ``path``.

    The pattern targets ``<img>`` tags such as::

        <div class="imgppt relative"><img src="/ppt/spic/2019/08/18/hkzohxu4g5k.png" alt=""><i class="down-icon icon-vip"></i></div>

    Adapt the regular expression to the markup of the page being crawled.
    Files are saved as ``<index>.<extension>`` with the index starting at 0.

    Args:
        info: HTML of the page, as text or as UTF-8 encoded bytes.
        path: Directory to save the images into; created when missing.
        base_url: Optional address of the page. When given, every matched
            ``src`` is resolved against it, so relative values such as
            ``/ppt/spic/a.png`` can be downloaded. When omitted, ``src``
            values are fetched exactly as they appear in the page.

    Returns:
        The list of matched ``src`` values, in document order.
    """
    # Local imports so the function runs on both Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve  # Python 3
        from urllib.parse import urljoin
    except ImportError:
        from urllib import urlretrieve  # Python 2
        from urlparse import urljoin

    # On Python 3 a socket read yields bytes, which a text pattern cannot
    # search. On Python 2 ``bytes`` is ``str`` and this branch is skipped.
    if isinstance(info, bytes) and not isinstance(info, str):
        info = info.decode('utf-8', 'ignore')

    # Create the target directory when it does not exist yet.
    if not os.path.isdir(path):
        os.makedirs(path)

    # Capture the src of any <img> tag that ends in a known image extension.
    # ``\ssrc`` keeps attributes like ``data-src`` from matching.
    pattern = re.compile(
        r'<img\b[^>]*?\ssrc="([^"]+\.(?:png|jpe?g|gif))"', re.IGNORECASE)
    img_info = pattern.findall(info)
    known_suffixes = ('jpg', 'jpeg', 'png', 'gif')
    total = len(img_info)

    for index, img_url in enumerate(img_info):
        # splitext handles 3- and 4-letter extensions alike ("jpeg").
        suffix = os.path.splitext(img_url)[1][1:].lower()
        if suffix not in known_suffixes:
            # Fallback for custom patterns that match other extensions.
            suffix = 'jpg'
        target = os.path.join(path, '%d.%s' % (index, suffix))
        source = urljoin(base_url, img_url) if base_url else img_url
        urlretrieve(source, target)
        # Report progress as a percentage of images handled so far.
        print('%.2f%%' % (100.0 * (index + 1) / total))
    return img_info
# Run the crawl only when executed as a script, so importing this module
# does not trigger a network download.
if __name__ == '__main__':
    # Page content
    content = get_content(url)
    # Image paths found in (and downloaded from) the page
    img_info = get_image_info(content, path)
    # Call form of print, as already used inside get_image_info; it is
    # valid on both Python 2 and Python 3.
    print(img_info)
二、改进版(按照当前日期创建目录进行保存)
#!/usr/bin/env python2.7
# -*- coding:utf-8 -*-
import re
import urllib
import os
import time
import random
# Local directory under which the dated image folders are created.
path = "C:/Users/Administrator/Desktop/image"
# Address of the web page to crawl (fill in before running).
url = ""
def get_content(url):
    """Fetch the raw body of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The response body exactly as read from the connection (a byte
        string; on Python 2 that is a plain ``str``).

    Raises:
        IOError: If the resource cannot be opened or read.
    """
    # Local import so the function runs on both Python 2 (urllib.urlopen)
    # and Python 3 (urllib.request.urlopen).
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()
def get_image_info(info, path, base_url=None):
    """Find image URLs in a page and download them into a dated folder.

    Images are saved under ``path/<YYYYMMDD>/`` (today's local date). The
    pattern targets ``<img>`` tags such as::

        <div class="imgppt relative"><img src="/2019/08/18/hkzohxu4g5k.png" alt=""><i class="down-icon icon-vip"></i></div>

    Adapt the regular expression to the markup of the page being crawled.

    Args:
        info: HTML of the page, as text or as UTF-8 encoded bytes.
        path: Root directory; the dated sub-directory is created when
            missing.
        base_url: Optional address of the page. When given, every matched
            ``src`` is resolved against it, so relative values such as
            ``/2019/08/18/a.png`` can be downloaded. When omitted, ``src``
            values are fetched exactly as they appear in the page.

    Returns:
        The list of matched ``src`` values, in document order.
    """
    # Local imports so the function runs on both Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve  # Python 3
        from urllib.parse import urljoin
    except ImportError:
        from urllib import urlretrieve  # Python 2
        from urlparse import urljoin

    # On Python 3 a socket read yields bytes, which a text pattern cannot
    # search. On Python 2 ``bytes`` is ``str`` and this branch is skipped.
    if isinstance(info, bytes) and not isinstance(info, str):
        info = info.decode('utf-8', 'ignore')

    # One sub-directory per day; create it when it does not exist yet.
    path = os.path.join(path, time.strftime('%Y%m%d', time.localtime()))
    if not os.path.isdir(path):
        os.makedirs(path)

    # Capture the src of any <img> tag that ends in a known image extension.
    # ``\ssrc`` keeps attributes like ``data-src`` from matching.
    pattern = re.compile(
        r'<img\b[^>]*?\ssrc="([^"]+\.(?:png|jpe?g|gif))"', re.IGNORECASE)
    img_info = pattern.findall(info)
    known_suffixes = ('jpg', 'jpeg', 'png', 'gif')
    total = len(img_info)

    for index, img_url in enumerate(img_info):
        # splitext handles 3- and 4-letter extensions alike ("jpeg").
        suffix = os.path.splitext(img_url)[1][1:].lower()
        if suffix not in known_suffixes:
            # Fallback for custom patterns that match other extensions.
            suffix = 'jpg'
        # Timestamp uses %M (minutes); %m would repeat the month.
        stamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
        # The zero-padded index makes names unique within this call; the
        # random part only separates calls that run in the same second.
        file_name = '%s%04d%04d.%s' % (
            stamp, index, random.randint(0, 9999), suffix)
        source = urljoin(base_url, img_url) if base_url else img_url
        urlretrieve(source, os.path.join(path, file_name))
        # Report progress as a percentage of images handled so far.
        print('%.2f%%' % (100.0 * (index + 1) / total))
    return img_info
# Run the crawl only when executed as a script, so importing this module
# does not trigger a network download.
if __name__ == '__main__':
    # Page content
    content = get_content(url)
    # Image paths found in (and downloaded from) the page
    img_info = get_image_info(content, path)
    # Call form of print, as already used inside get_image_info; it is
    # valid on both Python 2 and Python 3.
    print(img_info)