First, the code uses the threading module to run the downloads in multiple threads. threading ships with the Python standard library, so it needs no separate installation; the third-party dependency here is BeautifulSoup, which can be installed with `pip install beautifulsoup4`.
The image-crawling code is as follows (with comments):
```python
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
from urllib.parse import urljoin
import threading

# Request headers that mimic a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
}
count = 0      # number of images found so far
threads = []   # all download threads that have been started


def imageSpider(start_url):
    global threads
    global count
    try:
        urls = []  # URLs already scheduled, to avoid duplicate downloads
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        # Guess the page encoding (UTF-8 or GBK) and decode it
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, features="html.parser")
        # Find every <img> tag on the page
        images = soup.select("img")
        for image in images:
            try:
                # Extract the image path
                src = image["src"]
                # Combine the page URL with the image path into an absolute URL
                url = urljoin(start_url, src)
                # Skip the image if it has already been scheduled
                if url not in urls:
                    urls.append(url)
                    print(url)
                    count = count + 1
                    # Spawn a thread that runs download() with these arguments
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False  # non-daemon: the interpreter waits for it to finish
                    T.start()
                    threads.append(T)  # keep the thread so it can be joined later
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)


def download(url, count):
    try:
        if url[len(url) - 4] == ".":
            # handles xxx.jpg, xxx.png and other 3-letter extensions
            ext = url[len(url) - 4:]
        elif url[len(url) - 5] == ".":
            # handles xxx.jpeg and other 4-letter extensions
            ext = url[len(url) - 5:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        # Write the image bytes to images\<count><ext>
        fobj = open("images\\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)


# The site to crawl
start_url = "http://travel.163.com/"
imageSpider(start_url)
for t in threads:
    t.join()  # block the main thread until every download thread finishes
print("The end")
```
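As an aside, the standard library's concurrent.futures module offers a pool-based alternative to starting one thread per image. The sketch below is my own illustration, not part of the original program; it assumes the download() function defined above and a list of image URLs, and it caps the number of simultaneous downloads while waiting for all workers automatically:

```python
from concurrent.futures import ThreadPoolExecutor

def download_all(urls, max_workers=8):
    # Submit each (url, index) pair to a bounded thread pool; leaving the
    # with-block waits for every queued download to finish (an implicit join).
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for i, url in enumerate(urls, start=1):
            pool.submit(download, url, i)
```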
In addition, you need to create an images folder in the same directory as the script, as shown in the figure:
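If you would rather not create the folder by hand, a one-line guard at the top of the script does it automatically. This is an optional addition, not part of the original program:

```python
import os

# Create the images folder next to the script if it does not exist yet;
# exist_ok=True makes the call a no-op when the folder is already there.
os.makedirs("images", exist_ok=True)
```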
Run picture.py and check the images folder; you should see results like those in the figure:
You can change the start_url address to try crawling images from a different site (empty the images folder first).
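To empty the folder between runs, you can delete the files by hand or use a short helper like this sketch (my addition, assuming the images folder sits next to the script):

```python
import os

# Delete every file inside images/ so the next run starts from a clean slate
for name in os.listdir("images"):
    os.remove(os.path.join("images", name))
```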