import urllib.request
import os
import urllib
import bs4
import time
def get_html(url):
    """Fetch *url* over HTTP and return the parsed BeautifulSoup document.

    Args:
        url: Absolute URL (must include a scheme, e.g. ``http://``).

    Returns:
        A ``bs4.BeautifulSoup`` tree parsed with the built-in ``html.parser``.
    """
    # Use a context manager so the HTTP response is always closed,
    # even if .read() raises — the original leaked the connection.
    with urllib.request.urlopen(url) as response:
        content = response.read()
    return bs4.BeautifulSoup(content, "html.parser")
def get_img(soup, url):
    """Download every image referenced by ``<a data-original=...>`` tags in *soup*.

    Saves files into a fixed local directory, naming each one after the tag's
    ``title`` attribute (or a timestamp when the title is missing), plus a
    running counter and the URL's file suffix. Prints one line per download
    and a final count.

    Args:
        soup: Parsed ``bs4.BeautifulSoup`` document to scan.
        url:  Base URL of the page, used to resolve relative image paths.
    """
    local_dir = 'E:\\pythonDownload\\image'  # destination directory for images
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)  # create the directory on first use

    img_tag = 'a'                # tag that carries the image info on this site
    attr_src = 'data-original'   # attribute holding the image URL
    attr_name = 'title'          # attribute holding the image's display name

    # BUG FIX: the original built three parallel lists (src / name / href) but
    # appended to `name` for every tag while appending to `src` only for tags
    # that had `data-original` — so names and URLs drifted out of alignment.
    # Processing each tag as a unit keeps name and URL paired correctly.
    num = 0
    for dom in soup.findAll(img_tag):
        if not dom.has_attr(attr_src):
            continue  # tag carries no image — skip it entirely
        img_url = urllib.parse.urljoin(url, dom[attr_src])  # relative -> absolute

        if dom.has_attr(attr_name):
            name = dom[attr_name]
        else:
            # No title: fall back to a timestamp-based name.
            name = time.strftime("%Y%m%d%H%M%S", time.localtime())

        # Strip characters that are illegal in Windows filenames so that
        # urlretrieve cannot fail on a title such as 'a/b: c?'.
        safe_name = "".join(c for c in name if c not in '\\/:*?"<>|').strip()
        if not safe_name:
            safe_name = 'image'

        file_suffix = img_url.split(".")[-1]  # crude suffix from the URL
        filename = local_dir + os.sep + safe_name + '_' + str(num) + '.' + file_suffix
        print(filename + " " + img_url)  # progress line: target path + source URL
        urllib.request.urlretrieve(img_url, filename)  # download the image
        num += 1

    print('下载图片:' + str(num) + '张')
# Script entry: fetch the page, then scan it for downloadable images.
# BUG FIX: the original used "www.baidu.com" with no scheme;
# urllib.request.urlopen raises ValueError ("unknown url type") on
# scheme-less URLs, so the script could never run.
path = "http://www.baidu.com"  # page whose images should be downloaded
html = get_html(path)          # fetch and parse the page
get_img(html, path)            # find image links and download them
print('done')