场景:
1.有时候需要下载某个网站上提供的所有pdf文件,貌似没发现哟下载工具提供。
#! encoding=utf-8
import urllib2
import re
import os
def Download(url,output):
print "downloading..."+url
response = urllib2.urlopen(url)
resourceFile = open(output,"wb")
resourceFile.write(response.read())
resourceFile.close()
print "downloaded"
def Action(url,ext = "pdf",output = "."):
#1.domain
index = url.rfind("/");
domain = url[0:index+1];
print domain
request = urllib2.Request(url)
response = urllib2.urlopen(request)
#2.content
content = response.read()
# print content
#3.resource
mode = '\"([^\"]+'+ext+')\"'
pattern = re.compile(mode)
strMatch = pattern.findall(content)
size = len(strMatch)
print "file num: "+str(size)
for i in range(0,size,1):
# print strMatch[i]
one = strMatch[i]
partIndex = one.rfind('/')
if not one.startswith('http://'):
if -1!=partIndex:
directDir = one[0:partIndex+1]
else:
directDir = ""
# print directDir
try:
os.makedirs(output+"/"+directDir)
except Exception,e:
pass
fileUrl = domain+one
fileOutput = output+"/"+one
print fileUrl
print fileOutput
Download(fileUrl,fileOutput)
else:
print one
print "........."
print one[partIndex:]
fileOutput = output+"/"+one[partIndex:]
print fileOutput
Download(one,fileOutput)
#5.download
if __name__=='__main__':
print "download"
url = "http://compgeom.cs.uiuc.edu/~jeffe/teaching/algorithms/";
Action("http://tech.qq.com/","jpg");