# 通过 Python 3.x 实现一个简单的爬虫功能:用代码把网页上想要的图片爬取到本地。
#coding=utf-8
import re,codecs,sys,os,time
import urllib.request
def createFile():
    """Ensure the ``photos`` output directory exists.

    The directory is created relative to the current working directory.
    Calling this function when the directory already exists is a no-op.

    Raises:
        FileExistsError: If ``photos`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, which left a
    # window in which another process could create the directory first and
    # make os.mkdir() fail.
    os.makedirs('photos', exist_ok=True)
def extract(text, sub1, sub2):
    """Return the part of *text* between the first *sub1* and the next *sub2*.

    Args:
        text: String to search.
        sub1: Marker that precedes the wanted substring.
        sub2: Marker that follows the wanted substring.

    Returns:
        The text found after the first occurrence of *sub1* and before the
        following occurrence of *sub2*. If *sub2* does not occur after
        *sub1*, everything after *sub1* is returned. If *sub1* does not
        occur at all, an empty string is returned so callers can test the
        result for truthiness.
    """
    _, marker, tail = text.partition(sub1)
    if not marker:
        # Without this guard the whole text would be treated as the "tail"
        # and an unrelated prefix of it would be returned.
        return ""
    return tail.partition(sub2)[0]
def getHtml(url):
    """Download *url* and return its body decoded to text.

    The ``photos`` output directory is created first (via ``createFile``).

    The character encoding is taken from the ``Content-Type`` response
    header when it carries a charset, otherwise from the first
    ``charset=...`` declaration found in the document body. If neither
    yields a codec known to Python, ``"Encoding type not found!"`` is
    printed and the body is decoded as UTF-8. Undecodable bytes are
    replaced rather than raising.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page content as ``str``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    createFile()
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(url) as fp:
        raw = fp.read()
        encoding = fp.headers.get_content_charset()

    if not encoding:
        # Search the raw bytes: str(bytes) would give the "b'...'" repr, and
        # the value may be quoted with either quote style or not at all
        # (e.g. <meta charset="utf-8"> or content="text/html; charset=gbk").
        found = re.search(rb'charset\s*=\s*["\']?\s*([\w.:-]+)', raw, re.I)
        if found:
            encoding = found.group(1).decode('ascii')

    html = None
    if encoding:
        try:
            html = raw.decode(encoding, errors='replace')
        except LookupError:
            # The page declares a codec Python does not know; fall through
            # to the default below.
            html = None
    if html is None:
        print("Encoding type not found!")
        html = raw.decode('utf-8', errors='replace')
    return html
def getImg(html):
    """Download every image referenced in *html* into ``photos/``.

    Image addresses are found with a regular expression written for one
    specific page layout (a ``src="...jpg..."`` attribute followed by
    ``data_org_bimg``); adapt the pattern to the markup of the target page.

    Each file is saved as ``photos/<timestamp>_<index>.jpg``. The index keeps
    names unique, because several images are normally fetched within the
    same second and a timestamp alone would make them overwrite each other.

    Args:
        html: Page source as text.

    Returns:
        ``"loading failed"`` when no image address was found, otherwise
        ``"loading successed"`` once all images have been downloaded.

    Raises:
        urllib.error.URLError: If an image cannot be downloaded.
    """
    # The pattern below matches the image addresses; it can be adjusted to
    # the actual markup of the page.
    reg = r'.+src="(.+?\.jpg.+)" data_org_bimg'
    imgre = re.compile(reg, re.I)
    imglist = imgre.findall(html)
    if not imglist:
        return "loading failed"

    # Make sure the target directory exists even when this function is used
    # without a preceding getHtml() call.
    os.makedirs('photos', exist_ok=True)

    # One timestamp per batch; the per-image index guarantees distinct names.
    now = int(time.time())
    for index, imgurl in enumerate(imglist):
        print(imgurl)
        savepath = 'photos/' + str(now) + '_' + str(index) + '.jpg'
        print(savepath)
        urllib.request.urlretrieve(imgurl, savepath)
    return "loading successed"
# Script entry: fetch the gallery page, download the images it references,
# and report whether anything was downloaded.
html = getHtml("http://photo.poco.cn/lastphoto-htx-id-4877933-p-0.xhtml")
outcome = getImg(html)
print(outcome)