URL to crawl:
https://fabiaoqing.com/biaoqing/lists/page/1.html
The site has 200 pages with 45 memes per page, so in theory about 9,000 memes in total.
Since these are static pages, BeautifulSoup alone is enough to filter out the image URLs (a quick verification sketch follows the library list below).
- requests: high-level HTTP library for crawling
- bs4: web-page selector/parser
- threading: standard-library multithreading
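Before writing the full crawler, it is worth confirming the page structure. The following is a minimal verification sketch; the CSS class ui image lazy and the data-original attribute are the same ones the crawler below relies on, and it assumes lxml is installed and the network is reachable:

import requests
from bs4 import BeautifulSoup

# Fetch page 1 and count the lazy-loaded <img> tags
url = 'https://fabiaoqing.com/biaoqing/lists/page/1.html'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
html = requests.get(url, headers=headers).text
soup = BeautifulSoup(html, 'lxml')
imgs = soup.find_all('img', class_='ui image lazy')
print(len(imgs))  # expected: 45
if imgs:
    print(imgs[0].get('title'), imgs[0].get('data-original'))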
import os
import requests
from bs4 import BeautifulSoup
from queue import Queue
from threading import Thread

# Worker thread class
class Download_Image(Thread):
    # Override the constructor
    def __init__(self, queue, path):
        Thread.__init__(self)
        # Instance attributes
        self.queue = queue
        self.path = path
        if not os.path.exists(path):
            os.mkdir(path)

    # Override the run method
    def run(self):
        while True:
            # Block until a page URL is available
            url = self.queue.get()
            try:
                download_img(url, self.path)
            except Exception as e:
                print(e)
            finally:
                # Mark the task done so queue.join() can return
                self.queue.task_done()
def download_img(url, path):
    # Pretend to be a browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    response = requests.get(url, headers=headers).text
    # Filter the data
    """
    BeautifulSoup takes two arguments: the HTML page and a parser
    (it converts the HTML into Python objects).
    Available parsers include lxml and html.parser (built in).
    """
    soup = BeautifulSoup(response, 'lxml')
    # Static page analysis
    img_list = soup.find_all('img', class_='ui image lazy')
    for img in img_list:
        image_name = img.get('title')
        image_url = img.get('data-original')
        image_type = os.path.splitext(image_url)[-1]
        image_data = requests.get(image_url, headers=headers).content
        # Save the image
        try:
            with open(path + image_name + image_type, 'wb') as file:
                file.write(image_data)
            print('%s downloaded successfully' % image_name)
        except Exception as e:
            print(e)
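One caveat: the image titles come straight from the page and may contain characters that are invalid in filenames (such as / or ?), which is part of why the save is wrapped in try/except. Here is a hedged sketch of a sanitizing helper; the name sanitize and the replacement character are my own choices, not part of the original code:

import re

def sanitize(name):
    # Replace characters that are illegal in Windows/Unix filenames
    return re.sub(r'[\\/:*?"<>|]', '_', name)

# Usage inside download_img:
# with open(path + sanitize(image_name) + image_type, 'wb') as file: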
# Single-threaded download (not called)
# def download_all():
#     path = './pictures/'
#     if not os.path.exists(path):
#         os.mkdir(path)
#     # 45 * 200, roughly 9,000 images in total
#     for i in range(1, 201):
#         url = 'https://fabiaoqing.com/biaoqing/lists/page/%s.html' % i
#         download_img(url, path)
if __name__ == '__main__':
    _url = 'https://fabiaoqing.com/biaoqing/lists/page/{pageNum}.html'
    # urls holds the 200 page links
    urls = [_url.format(pageNum=num) for num in range(1, 201)]
    queue = Queue()
    path = './threading_images/'
    # Create 10 worker threads
    for i in range(10):
        worker = Download_Image(queue, path)
        # Mark workers as daemon threads: run() loops forever, so without
        # this flag the blocked workers would keep the process alive
        worker.daemon = True
        worker.start()
    # Put the page URLs into the queue
    for url in urls:
        queue.put(url)
    # Block until every queued URL has been processed;
    # the daemon workers then die when the main thread exits
    queue.join()
    print('All downloads finished...')
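For comparison, the same fan-out can be expressed with concurrent.futures.ThreadPoolExecutor from the standard library, which replaces the manual queue, daemon flags, and join bookkeeping. A minimal sketch reusing download_img with the same pool size of 10:

import os
from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    path = './threading_images/'
    # download_img does not create the directory itself
    os.makedirs(path, exist_ok=True)
    urls = ['https://fabiaoqing.com/biaoqing/lists/page/%s.html' % i
            for i in range(1, 201)]
    with ThreadPoolExecutor(max_workers=10) as pool:
        # Consuming the map iterator surfaces any worker exceptions;
        # the with-block waits for all tasks to finish
        list(pool.map(lambda u: download_img(u, path), urls))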