#coding=utf-8
import re
import time
import os
import _osx_support
import urllib.request
# Destination directory for downloaded images.  Raw string: the original
# non-raw literal relied on '\i' being an *invalid* escape (kept literally),
# which raises SyntaxWarning on modern CPython; the value is unchanged.
# NOTE(review): mixed '\' and '/' separators preserved from the original.
baseFiledirs = r'D:\img\images/'
# baseFiledir='D:\img/'
# Fetch a web page.
def getHtml(url):
    """Fetch the page at *url* and return its body decoded as UTF-8.

    Raises urllib.error.URLError on network failure and UnicodeDecodeError
    if the page body is not valid UTF-8.
    """
    # Context manager guarantees the HTTP response is closed even when
    # read()/decode() raises — the original leaked the open connection.
    with urllib.request.urlopen(url) as page:
        return page.read().decode('UTF-8')
# Extract and download images.
def getImg(html):
    """Find every .jpg/.png <img src="..."> URL in *html*, download each
    image into ``baseFiledirs`` (named after the URL's last path segment),
    and return the string "success".

    A failed download is logged ("errorurl: ...") and skipped rather than
    aborting the whole run.
    """
    # Capture the src attribute of <img> tags pointing at a .jpg/.png.
    # The original pattern (r'img .+?\ src="(...)"* ') contained a stray
    # escaped space and a misplaced '*' quantifier plus a trailing space,
    # so it matched almost nothing.
    imgre = re.compile(r'<img[^>]*?src="(.+?\.(?:jpg|png))"')
    imglist = imgre.findall(html)
    print(imglist)
    num = 0  # count of successfully saved images
    for imgurl in imglist:
        print(imgurl)
        try:
            # Protocol-relative URLs ("//host/...") need a scheme before
            # urlretrieve can fetch them (the original wrote "httP:").
            if imgurl.startswith("//"):
                imgurl = "http:" + imgurl
            # exist_ok avoids the check-then-create race of the original
            # `os.path.exists(...) == False` test.
            os.makedirs(baseFiledirs, exist_ok=True)
            # Save every image into the target directory.
            urllib.request.urlretrieve(imgurl, baseFiledirs + imgurl.split('/')[-1])
            num += 1
        except Exception:
            # Narrowed from BaseException so Ctrl-C / SystemExit still
            # propagate; one bad URL must not abort the crawl.
            print("errorurl:", imgurl)
    return "success"
# Script entry: fetch one page and download every image it references.
# Alternative target pages tried during development:
# html = getHtml("http://tieba.baidu.com/p/1569069059")
# html = getHtml("http://www.cankaoxiaoxi.com/roll10/bd/20170425/1926008.shtml")
# html = getHtml("http://www.bilibili.com")
# html = getHtml("http://www.bilibili.com/blackboard/activity-B1bzUVG0l.html")
html = getHtml("http://news.baidu.com/")
# debug: dump the fetched HTML
# print("html内容:",html)
print(getImg(html))
# Python web image scraping, part 02
# (original blog post last published 2020-08-24 22:52:50)