#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
# Local directory under which downloaded files are stored, mirroring the
# path each file has on the site.
downloadDirectory = "downloaded"
# Scheme + host (without "www.") that a resource URL must contain to be
# downloaded. Must name the same site as the page that is scraped.
baseUrl = "http://pythonscraping.com"
def getAbsoluteURL(baseUrl, source):
    """Turn a ``src`` attribute value into an absolute URL on ``baseUrl``.

    The "www." host prefix is removed so the result can be compared against
    ``baseUrl``, which is expected to be written without it.

    Args:
        baseUrl: Scheme and host of the site, e.g. ``"http://example.com"``.
        source: Raw value of a ``src`` attribute.

    Returns:
        The normalised absolute URL, or ``None`` when the URL does not
        contain ``baseUrl`` (i.e. it points at another site).

    Note:
        Only the ``http://`` and ``www.`` prefixes are recognised. Any other
        value (including ``https://`` URLs) is treated as a path relative to
        ``baseUrl``.
    """
    if source.startswith("http://www."):
        # Remove only the "www." label and keep the scheme; without the
        # scheme the result could never contain baseUrl.
        url = "http://" + source[len("http://www."):]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        url = "http://" + source[len("www."):]
    else:
        # Relative path: strip leading slashes so that "/img/a.png" does not
        # become "<baseUrl>//img/a.png".
        url = baseUrl + "/" + source.lstrip("/")
    if baseUrl not in url:
        return None
    return url
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map an absolute URL to a local file path and create its directory.

    The local path is ``downloadDirectory`` followed by whatever remains of
    ``absoluteUrl`` once every "www." and every occurrence of ``baseUrl``
    have been removed.

    Args:
        baseUrl: Scheme and host of the site, e.g. ``"http://example.com"``.
        absoluteUrl: Absolute URL of the resource to be saved.
        downloadDirectory: Local directory prefix for downloaded files.

    Returns:
        The local path the resource should be written to. Its parent
        directory exists when the function returns.
    """
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    # os.makedirs("") raises, so skip creation when the path has no
    # directory part; exist_ok avoids the check-then-create race.
    if directory:
        os.makedirs(directory, exist_ok=True)
    return path
# Fetch the landing page and parse it; the context manager closes the HTTP
# response once the document has been read into the parser.
with urlopen("http://www.pythonscraping.com") as html:
    bsObj = BeautifulSoup(html, "html.parser")
# Every tag that carries a ``src`` attribute (images, scripts, ...).
downloadLists = bsObj.find_all(src=True)
for download in downloadLists:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    # getAbsoluteURL returns None for resources hosted on other sites.
    if fileUrl is not None:
        print(fileUrl)
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))