该爬虫只是简单地爬取了一个小网站上的图片:
# encoding:utf-8
import urllib
import re
import os
def gethtml(url):
    """Fetch ``url`` and return the raw body of the response.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``read()`` yields for the response: a byte string
        (``str`` on Python 2, ``bytes`` on Python 3).
    """
    # urlopen lives in ``urllib`` on Python 2 and in ``urllib.request`` on
    # Python 3; resolve it locally so the function works on both.
    try:
        from urllib import urlopen
    except ImportError:
        from urllib.request import urlopen
    from contextlib import closing

    # closing() releases the connection even when read() raises; the
    # Python 2 response object cannot be used in a ``with`` block directly.
    with closing(urlopen(url)) as page:
        return page.read()
def getimagesurl(html):
    """Return the ``.jpg`` URLs found in ``src="..."`` attributes of ``html``.

    Args:
        html: Page source, either text or (on Python 3) undecoded ``bytes``.

    Returns:
        A list of the matched ``http://...jpg`` URLs in document order;
        empty when nothing matches.
    """
    # On Python 3 a downloaded page arrives as bytes, which a text pattern
    # cannot search. On Python 2 ``bytes`` is ``str``, so this branch is
    # never taken there.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode('utf-8', 'replace')
    # Raw string keeps ``\.`` a regex escape. ``[^"\n]`` stops a match from
    # running past the closing quote of a non-jpg ``src`` into a later
    # attribute on the same line that happens to end in ``.jpg"``.
    reg = r'src="(http://[^"\n]*?\.jpg)"'
    return re.findall(reg, html)
def createFileWithFileName(localPathParam, fileName):
    """Ensure an (initially empty) file exists and return its path.

    Args:
        localPathParam: Directory that should contain the file; it is
            created, together with missing parents, when absent.
        fileName: Name of the file inside that directory.

    Returns:
        The joined path of the file.
    """
    # os.path.join uses the platform separator instead of a hard-coded
    # backslash, which only forms a directory path on Windows.
    totalPath = os.path.join(localPathParam, fileName)
    # Opening the file fails if its directory is missing, so create it first.
    if localPathParam and not os.path.isdir(localPathParam):
        os.makedirs(localPathParam)
    if not os.path.exists(totalPath):
        # Append mode creates the file without truncating existing content.
        with open(totalPath, 'a+'):
            pass
    return totalPath
def saveimages(imageurllist, localPath='E:\\tupian'):
    """Download every image URL into ``localPath`` as ``0.jpg``, ``1.jpg``, ...

    Args:
        imageurllist: Iterable of image URLs to download.
        localPath: Destination directory. Defaults to the location that
            used to be hard-coded inside the loop.
    """
    # urlretrieve lives in ``urllib`` on Python 2 and in ``urllib.request``
    # on Python 3; resolve it locally so the function works on both.
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    for name, imageurl in enumerate(imageurllist):
        # The single-argument call form prints the same on Python 2 and 3.
        print(imageurl)
        target = createFileWithFileName(localPath, '{}.jpg'.format(name))
        urlretrieve(imageurl, target)
# Script entry point: download the listing page, pull the image links out
# of it and store every image on disk.
START_URL = "http://www.ivsky.com/tupian/ziranfengguang"
html = gethtml(START_URL)
imageurllist = getimagesurl(html)
saveimages(imageurllist)
打印结果:
http://img.ivsky.com/img/tupian/li/201604/07/haishang_riluo_fengjing-001.jpg
http://img.ivsky.com/img/tupian/li/201603/31/weimei_de_chengshi_yese-003.jpg
http://img.ivsky.com/img/tupian/li/201603/25/tianye-002.jpg
http://img.ivsky.com/img/tupian/li/201603/25/shulin-001.jpg
http://img.ivsky.com/img/tupian/li/201603/06/hailang-002.jpg
http://img.ivsky.com/img/tupian/li/201603/05/heliu.jpg
http://img.ivsky.com/img/tupian/li/201603/05/hailang-007.jpg
http://img.ivsky.com/img/tupian/li/201602/27/richu-003.jpg
http://img.ivsky.com/img/tupian/li/201602/27/caihong-002.jpg
http://img.ivsky.com/img/tupian/li/201602/17/haibian_jiaoshi-001.jpg
http://img.ivsky.com/img/tupian/li/201602/17/jinse_de_maitian.jpg
http://img.ivsky.com/img/tupian/li/201602/19/chuchun-002.jpg
http://img.ivsky.com/img/tupian/li/201602/16/yuzhou-006.jpg
http://img.ivsky.com/img/tupian/li/201602/16/titian.jpg
http://img.ivsky.com/img/tupian/li/201602/13/yunhai-004.jpg
http://img.ivsky.com/img/tupian/li/201602/14/bingchuan-002.jpg
http://img.ivsky.com/img/tupian/li/201602/01/aurora_borealis.jpg
http://img.ivsky.com/img/tupian/li/201601/30/sea-007.jpg
这里我们同时将每个链接所对应的图片写到了本地文件。