下载一个网页的图片:
#-*- coding= utf-8 -*-
import urllib
import re
def getHtml(url):
    """Download the resource at *url* and return its raw body.

    The response handle is closed before returning, whether or not the
    read succeeds.  Errors raised by ``urllib.urlopen`` or by ``read`` are
    not caught here and propagate to the caller.

    url -- address to fetch, passed unchanged to ``urllib.urlopen``.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection now instead of leaving it to the
        # garbage collector.
        page.close()
def getImg(html, save_dir=r"D://picture/"):
    """Download every ``.jpg`` image referenced in *html*.

    An image reference is a ``src="....jpg"`` attribute that is directly
    followed by `` pic_ext``.  Matches are saved as ``0.jpg``, ``1.jpg``,
    ... (in match order) inside *save_dir*, and the value returned by
    ``urllib.urlretrieve`` is printed after each download.

    html     -- page source to scan.
    save_dir -- destination directory; created when it does not exist.
                Defaults to the directory this script has always used.

    Returns None.  When nothing matches, nothing is downloaded or printed
    and no directory is created.
    """
    # Local import: the header of this script only imports urllib and re.
    import os

    # Pattern that picks the image addresses out of the page.
    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    imgList = imgre.findall(html)
    if not imgList:
        return

    # urlretrieve() cannot write into a directory that is not there.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    for index, imgurl in enumerate(imgList):
        # The statement that performs the actual download.
        img = urllib.urlretrieve(imgurl,
                                 os.path.join(save_dir, "%s.jpg" % index))
        print(img)
# Address of the page whose images are downloaded.
html = getHtml("http://tieba.baidu.com/p/2460150866")
# getImg() has no return value, so printing its result would only emit
# "None"; just run it for its downloads.
getImg(html)
#-*- coding=utf-8 -*-
import urllib2
import urllib
import re
import HTMLParser
import time,os
# Site root; prepended to the link targets taken from the parsed pages.
host = "http://desk.zol.com.cn"

# Address of the gallery currently being walked; reassigned by the parser.
startImageUrl = ""

# Directory prefix the downloaded images are written under.
localSavePath = "D:\\picture\\"

# strftime() pattern used to build the names of the saved files.
ISOTIMEFORMAT = "%Y%m%d%H%M%S"
def downloadImage(url):
    """Save the image at *url* under ``localSavePath``.

    Only addresses containing ``.jpg`` (optionally preceded by digits) are
    downloaded; for anything else "NO match" is printed and nothing is
    fetched.  The file is named after the current time formatted with
    ``ISOTIMEFORMAT``.  When a file with that name already exists, a
    numeric ``_1``, ``_2``, ... suffix is added so that images fetched
    within the same second do not overwrite each other.

    url -- address of the image.

    Returns None.  Errors raised by ``urllib.urlretrieve`` propagate.
    """
    if not re.search(r'[0-9]*\.jpg', url):
        print("NO match")
        return

    print("%s %s" % ("Downloading image begin", url))

    # urlretrieve() cannot write into a directory that is not there.
    if not os.path.isdir(localSavePath):
        os.makedirs(localSavePath)

    stamp = time.strftime(ISOTIMEFORMAT)
    filename = localSavePath + stamp + r'.jpg'
    # The timestamp only has one-second resolution; pick a free name
    # rather than clobbering an image saved during the same second.
    attempt = 0
    while os.path.exists(filename):
        attempt += 1
        filename = "%s%s_%d.jpg" % (localSavePath, stamp, attempt)

    urllib.urlretrieve(url, filename)
def getImageUrlByHtmlUrl(htmlUrl):
    """Fetch the page at *htmlUrl* and run it through ``MyHtmlParser(False)``.

    The parser is created in non-index mode, in which its start-tag
    handler downloads the ``bigImg`` image and follows the ``next`` link,
    so one call may trigger further nested calls of this function.

    A ``urllib2.URLError`` raised while fetching is reported by printing
    its ``reason``; the page is then skipped.

    htmlUrl -- address of the page to fetch and parse.

    Returns None.
    """
    request = urllib2.Request(htmlUrl)
    try:
        response = urllib2.urlopen(request)
        try:
            content = response.read()
        finally:
            # Parsing below can recurse into many further pages; close
            # this connection first so open handles do not pile up.
            response.close()
    except urllib2.URLError as e:
        print(e.reason)
        return

    MyHtmlParser(False).feed(content)
class MyHtmlParser(HTMLParser.HTMLParser):
    """HTML parser that drives the wallpaper download.

    The parser works in one of two modes, chosen at construction time:

    * index mode (``isIndex`` true): every ``<a>`` tag with exactly four
      attributes whose first attribute is ``class="pic"`` is treated as a
      link to a gallery; the value of its second attribute is appended to
      ``host`` and the resulting page is processed.
    * page mode (``isIndex`` false): the ``<img>`` tag whose first
      attribute is ``id="bigImg"`` is downloaded (the address is the value
      of its second attribute), and an ``<a>`` tag with exactly four
      attributes whose second attribute is ``class="next"`` is followed
      (the target is the value of its third attribute).

    Attributes are matched by position, exactly as the pages list them.
    """

    def __init__(self, isIndex):
        """Create a parser; *isIndex* selects index mode when true."""
        self.isIndex = isIndex
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Handle one start tag; see the class docstring for the rules.

        tag   -- tag name as passed in by the base parser.
        attrs -- list of ``(name, value)`` pairs in document order.
        """
        # Must precede every use of the name in this function: declaring
        # it after the assignment below is a SyntaxWarning on Python 2
        # and a SyntaxError on Python 3.
        global startImageUrl

        if self.isIndex:
            if tag == 'a' and len(attrs) == 4 and attrs[0] == ('class', 'pic'):
                newUrl = host + attrs[1][1]
                print("%s %s" % ("Find a image site: ", newUrl))
                # Remember where this gallery starts so that following
                # "next" links stops once a link points back to it.
                # Original author's note: without this assignment only
                # one page's images are downloaded.
                startImageUrl = newUrl
                getImageUrlByHtmlUrl(newUrl)
            return

        if tag == 'img':
            # The length check keeps <img> tags with fewer than two
            # attributes from raising IndexError.
            if len(attrs) >= 2 and attrs[0] == ('id', 'bigImg'):
                imgUrl = attrs[1][1]
                print("%s %s" % (" one image : ", imgUrl))
                downloadImage(imgUrl)
        elif tag == 'a':
            if len(attrs) == 4 and attrs[1] == ('class', 'next'):
                nextUrl = host + attrs[2][1]
                print("%s %s" % ("Find a next image Link", nextUrl))
                if nextUrl != startImageUrl:
                    getImageUrlByHtmlUrl(nextUrl)
if __name__ == "__main__":
    # Entry point: fetch the category index page and let an index-mode
    # parser process every gallery linked from it.
    indexUrl = "http://desk.zol.com.cn/meinv/"
    response = urllib2.urlopen(indexUrl)
    try:
        page = response.read()
    finally:
        # The body is fully read; close the connection before the
        # parsing below starts fetching further pages.
        response.close()
    parseIndex = MyHtmlParser(True)
    parseIndex.feed(page)
API: http://blog.csdn.net/tianxicool/article/details/5942523