Straight to the code:
# coding=utf-8
from urllib import request
from bs4 import BeautifulSoup
import requests
# import urllib
import os
import time
import chardet
import collections

# Target address
url = "https://echarts.baidu.com/"

# Request headers used for the GET request to the target URL
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
# Alternative Chrome-style header (not used below)
chromHeaders = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER'}
# A pool of User-Agent strings to choose from when downloading images
user_agents = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'},
    {'User-Agent': 'Opera/9.25 (Windows NT 5.1; U; en)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'},
    {'User-Agent': 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)'},
    {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12'},
    {'User-Agent': 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'},
    {'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7"},
    {'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "},
]

# Attach a browser User-Agent so the request is not blocked
req = request.Request(url=url, headers=headers)
# req = request.Request(url)

# Send the request and read the returned HTML
response = request.urlopen(req)
html = response.read()
chardit1 = chardet.detect(html)  # detect the text encoding
"""
if "gb2312" == chardit1['encoding'].lower():
    print(html)
else:
    print(html.decode(chardit1['encoding']))
"""

# Save the raw page for inspection
with open("test2.html", "wb") as fout:
    fout.write(html)

# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')
# Find all <img> tags in the page with find_all
all_img = soup.find_all('img')
print("all_img:")
print(all_img)

# Iterate over every <img> tag found
for img in all_img:
    # Local directory the images are saved to
    root = "D://download//d//"

    # Prefer the lazy-load data-src attribute, fall back to src, skip tags with neither
    if img.get('data-src') is not None:
        scr = img['data-src']
    elif img.get('src') is not None:
        scr = img['src']
    else:
        continue

    # The image address taken from the <img> tag
    img_url = scr
    print("img_url:")
    print(img_url)
    print("img:")
    print(img)

    # Build the full download address by joining it with the base URL
    img_full_url = url + img_url
    print("img_full_url:")
    print(img_full_url)

    # Use the alt text as the file name; fall back to a timestamp when it is missing
    if img.get('alt'):
        alt = img['alt']
    else:
        alt = str(time.time())
    print("alt:" + alt)
    """
    # Earlier attempt at getting the image name, kept for reference
    for alt in all_img:
        print('alt = ', alt)
        img = collections.defaultdict(img)
        img.setdefault('alt', '0')
        alt = img['alt']
        if alt == '0':
            alt = str(time.time())
    """

    # Decide the stored file name and format
    # path = root + img_full_url.split('/')[-1] + ".jpg"  # use the last part of the URL as the name
    suffix = img_url.split(".")[-1]            # take the extension after the last dot
    allowed = ['jpg', 'jpeg', 'png', 'gif']
    if suffix not in allowed:
        suffix = "png"
    path = root + alt + "." + suffix

    try:
        # Create the target directory if it does not exist
        if not os.path.exists(root):
            print('Directory does not exist, creating it')
            os.makedirs(root)
        # Skip images that have already been downloaded
        if not os.path.exists(path):
            print('Image not found in the folder, downloading')
            # Use the src value directly when it is already an absolute URL
            if "http://" in img_url or "https://" in img_url:
                reqUrl = img_url
            else:
                reqUrl = img_full_url
            print("reqUrl:")
            print(reqUrl)
            r = requests.get(url=reqUrl, headers=user_agents[7])
            # Raise an error if the request failed
            r.raise_for_status()
            # "wb" writes a binary file; r.content holds the binary response body
            with open(path, "wb") as f:
                f.write(r.content)
            print('Download finished\n\n')
        else:
            print('Image already exists')
    except Exception as e:
        print('Download failed: ' + str(e))
        continue
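One detail worth pointing out is how the full image address is built: the script concatenates the base URL with the src value and falls back to the src itself when it already starts with http:// or https://. A more general way to resolve relative paths is the standard library's urllib.parse.urljoin; a minimal sketch (the image paths below are made up for illustration):

from urllib.parse import urljoin

base = "https://echarts.baidu.com/"
# urljoin resolves relative paths, root-relative paths and absolute URLs uniformly
print(urljoin(base, "images/logo.png"))                # https://echarts.baidu.com/images/logo.png
print(urljoin(base, "/asset/banner.jpg"))              # https://echarts.baidu.com/asset/banner.jpg
print(urljoin(base, "https://cdn.example.com/a.png"))  # absolute URLs pass through unchanged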
That is the whole script. A note on the third-party libraries it imports:
In the Scripts directory under your Python installation path, run: pip install beautifulsoup4
If the installation succeeds, pip reports the installed package.
The other third-party libraries (requests, chardet) are installed the same way.
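To confirm the install worked, you can import the package in a Python shell and parse a trivial document. This is only a quick sanity check, not part of the crawler:

from bs4 import BeautifulSoup

# If this runs without an ImportError, beautifulsoup4 is installed correctly
soup = BeautifulSoup("<html><body><img src='a.png'></body></html>", "html.parser")
print(soup.find_all("img"))  # expected output: [<img src="a.png"/>]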
The code is adapted from https://blog.csdn.net/yihongyuantufei/article/details/81745565.
On top of that version the main changes are:
1. setting a browser User-Agent so the request is not blocked;
2. automatically determining the image file extension;
3. detecting the site's character encoding;
4. improved handling of the image request URL (absolute vs. relative src).
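For point 2, the script takes whatever follows the last dot in the src value and falls back to png when it is not a known image extension. A less ad hoc option is to map the response's Content-Type header to an extension with the standard-library mimetypes module. A rough sketch, assuming the server sets that header correctly; guess_suffix is a hypothetical helper, not part of the script above:

import mimetypes
import requests

def guess_suffix(resp, default="png"):
    # Hypothetical helper: map the Content-Type header (e.g. "image/png") to a
    # file extension via mimetypes; fall back to the default when it is unknown.
    content_type = resp.headers.get("Content-Type", "").split(";")[0].strip()
    ext = mimetypes.guess_extension(content_type)
    return ext.lstrip(".") if ext else default

# In the main loop, the path could then be built after the download:
# r = requests.get(reqUrl, headers=user_agents[7])
# path = root + alt + "." + guess_suffix(r)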