urlretrieve(url, filename=None, reporthook=None, data=None)
urlretrieve 方法直接将远程数据下载到本地。参数 filename 指定了保存到本地的路径（如果未指定该参数，urllib 会生成一个临时文件来保存数据）；参数 reporthook 是一个回调函数，当连接上服务器以及相应的数据块传输完毕的时候会触发该回调；参数 data 指 POST 到服务器的数据。
Python 2.x 版本：
这里写个爬虫小程序，可以把百度贴吧 http://tieba.baidu.com/p/2236567282 网页上的 jpg 图片依次下载下来。
import re
import urllib
def getHtml(url):
    """Fetch ``url`` and return the raw response body (Python 2 ``urllib``).

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``read()`` yields for the opened URL; no decoding is applied.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # The object returned by Python 2's urlopen() is not a context
        # manager, so close it explicitly to release the connection even
        # when read() raises.
        response.close()
def getJpg(html):
    """Download every JPG referenced by a ``src="http://...jpg"`` attribute.

    The images are written relative to the current working directory as
    ``0.jpg``, ``1.jpg``, ... in the order the URLs appear in ``html``.

    Args:
        html: Page source to scan for image URLs.

    Returns:
        None. When no URL matches, nothing is downloaded.
    """
    imgre = re.compile(r'src="(http://.*?\.jpg)"')
    # enumerate() supplies the running file index that was previously kept
    # in a hand-maintained counter.
    for index, imgurl in enumerate(imgre.findall(html)):
        urllib.urlretrieve(imgurl, '%s.jpg' % index)
# Script entry: fetch the thread page, then download the JPGs it references.
TIEBA_URL = "http://tieba.baidu.com/p/2236567282"
html = getHtml(TIEBA_URL)
getJpg(html)
Python 3.x 版本：
import urllib.request
import re
def getHtml(url):
    """Fetch ``url``, decode the body as UTF-8, print it and return it.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The with-block closes the response even if read() or decode() raises;
    # previously the connection was left open.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf8')
    # Echo the page source, as the original snippet did.
    print(html)
    return html
def getJpg(html, save_dir='e://wangjiansheng'):
    """Download every JPG referenced by a ``src="http://...jpg"`` attribute.

    The images are saved as ``<save_dir>/0.jpg``, ``<save_dir>/1.jpg``, ...
    in the order the URLs appear in ``html``.

    Args:
        html: Page source (``str``) to scan for image URLs.
        save_dir: Directory that receives the files. Defaults to the
            location that was previously hard-coded in the format string.

    Returns:
        None. When no URL matches, nothing is downloaded and ``save_dir``
        is left untouched.
    """
    import os

    imgre = re.compile(r'src="(http://.*?\.jpg)"')
    imgList = imgre.findall(html)
    if not imgList:
        return
    # The download failed when the target directory did not exist yet,
    # so create it before the first download.
    os.makedirs(save_dir, exist_ok=True)
    for index, imgurl in enumerate(imgList):
        urllib.request.urlretrieve(imgurl, '%s/%s.jpg' % (save_dir, index))
# Script entry: fetch the thread page, then download the JPGs it references.
TIEBA_URL = "http://tieba.baidu.com/p/2236567282"
html = getHtml(TIEBA_URL)
getJpg(html)
注意：在 Python 3 中，urlopen(url).read() 返回的是 bytes，需要先调用 .decode('UTF-8') 解码成 str，才能用上面的字符串正则进行匹配。