百度飞桨PaddlePaddle-21天零基础实践深度学习-图像分类
摘要
介绍了几种经典的图像分类模型,分别是LeNet, AlexNet, VGG, GoogLeNet和ResNet,并将它们应用到眼疾数据集上。除了LeNet不适合大尺寸的图像分类问题之外,其它几个模型在此数据集上损失函数都能显著下降,在验证集上的预测精度在94%左右。
数据集介绍
眼疾分类数据集iChallenge-PM
iChallenge-PM是百度大脑和中山大学中山眼科中心联合举办的iChallenge比赛中,提供的关于病理性近视(Pathologic Myopia,PM)的医疗类数据集,包含1200个受试者的眼底视网膜图片,训练、验证和测试数据集各400张。
下载地址:
https://aistudio.baidu.com/aistudio/datasetdetail/19065?_=1597918741595
数据准备
training.zip:包含训练中的图片和标签
validation.zip:包含验证集的图片
valid_gt.zip:包含验证集的标签
#解压代码
!unzip -o -q training.zip -d /.../
!unzip -o -q PALM-Training400.zip
!unzip -o -q validation.zip -d /.../
!unzip -o -q valid_gt.zip -d /.../
非病理性患者(Normal,High myopia)的图片作为负样本,标签为0。
病理性患者(PM)的图片作为正样本,标签为1;
数据读取器定义
使用OpenCV从磁盘读入图片,每张图缩放到224×224大小,并且将像素值调整到[−1,1]之间.
import cv2
import random
import numpy as np
import os
# Preprocess a raw image read from disk.
def transform_img(img):
    """Resize to 224x224, convert HWC -> CHW, and scale pixels to [-1, 1]."""
    # Resize to the fixed network input size.
    img = cv2.resize(img, (224, 224))
    # Images are read as [H, W, C]; transpose to channel-first [C, H, W].
    img = np.transpose(img, (2, 0, 1))
    img = img.astype('float32')
    # Map uint8 pixel values [0, 255] to floats in [-1.0, 1.0].
    return img / 255. * 2.0 - 1.0
# Training-set data reader: the label is derived from the file-name prefix.
def data_loader(datadir, batch_size=10, mode='train'):
    """Create a batched reader over the image files in ``datadir``.

    Args:
        datadir: directory whose files are the sample images.
        batch_size: number of samples per yielded mini-batch.
        mode: 'train' shuffles the file list on every pass; any other
            value keeps the on-disk order.

    Returns:
        A generator function; each call yields (imgs, labels) where imgs
        is float32 of shape [N, 3, 224, 224] and labels float32 [N, 1].

    Raises:
        ValueError: if a file name does not start with 'H', 'N' or 'P'.
    """
    # Every file under datadir is treated as one sample.
    filenames = os.listdir(datadir)
    def reader():
        if mode == 'train':
            # Randomize sample order for each training epoch.
            random.shuffle(filenames)
        batch_imgs = []
        batch_labels = []
        for name in filenames:
            filepath = os.path.join(datadir, name)
            img = cv2.imread(filepath)
            img = transform_img(img)
            if name[0] == 'H' or name[0] == 'N':
                # 'H' = high myopia, 'N' = normal vision: both are
                # non-pathologic, i.e. negative samples (label 0).
                label = 0
            elif name[0] == 'P':
                # 'P' = pathologic myopia: positive sample (label 1).
                label = 1
            else:
                # BUG FIX: the original `raise('...')` raised a bare string,
                # which is a TypeError in Python 3. Raise a real exception.
                raise ValueError('Not expected file name: ' + name)
            # Accumulate the sample into the current mini-batch.
            batch_imgs.append(img)
            batch_labels.append(label)
            if len(batch_imgs) == batch_size:
                # A full mini-batch is ready: emit it as one reader output.
                imgs_array = np.array(batch_imgs).astype('float32')
                labels_array = np.array(batch_labels).astype('float32').reshape(-1, 1)
                yield imgs_array, labels_array
                batch_imgs = []
                batch_labels = []
        if len(batch_imgs) > 0:
            # Emit the final, possibly smaller, mini-batch.
            imgs_array = np.array(batch_imgs).astype('float32')
            labels_array = np.array(batch_labels).astype('float32').reshape(-1, 1)
            yield imgs_array, labels_array
    return reader
# Validation-set data reader: labels come from a CSV file rather than
# from the file names.
def valid_data_loader(datadir, csvfile, batch_size=10, mode='valid'):
    """Create a batched reader over the validation images in ``datadir``.

    ``csvfile`` has one sample per row in the format:
        ID,imgName,Label,Fovea_X,Fovea_Y
        1,V0001.jpg,0,1157.74,1019.87
        2,V0002.jpg,1,1285.82,1080.47
    Column 2 is the image file name and column 3 its label; the Fovea
    coordinates are irrelevant for classification.

    Returns:
        A generator function; each call yields (imgs, labels) where imgs
        is float32 of shape [N, 3, 224, 224] and labels float32 [N, 1].
    """
    # FIX: use a context manager so the CSV file handle is closed
    # promptly instead of being leaked until garbage collection.
    with open(csvfile) as f:
        filelists = f.readlines()
    def reader():
        batch_imgs = []
        batch_labels = []
        # Skip the header row.
        for line in filelists[1:]:
            line = line.strip().split(',')
            name = line[1]
            label = int(line[2])
            # Load the referenced image and preprocess it.
            filepath = os.path.join(datadir, name)
            img = cv2.imread(filepath)
            img = transform_img(img)
            # Accumulate the sample into the current mini-batch.
            batch_imgs.append(img)
            batch_labels.append(label)
            if len(batch_imgs) == batch_size:
                # A full mini-batch is ready: emit it as one reader output.
                imgs_array = np.array(batch_imgs).astype('float32')
                labels_array = np.array(batch_labels).astype('float32').reshape(-1, 1)
                yield imgs_array, labels_array
                batch_imgs = []
                batch_labels = []
        if len(batch_imgs) > 0:
            # Emit the final, possibly smaller, mini-batch.
            imgs_array = np.array(batch_imgs).astype('float32')
            labels_array = np.array(batch_labels).astype('float32').reshape(-1, 1)
            yield imgs_array, labels_array
    return reader
数据输出形状
查看数据形状代码
# Inspect the batch shapes produced by the data readers.
DATADIR = '...'
# Build a training reader (shuffled) and pull one batch.
train_loader = data_loader(DATADIR,
batch_size=10, mode='train')
data_reader = train_loader()
data = next(data_reader)
# Expect ((10, 3, 224, 224), (10, 1)).
data[0].shape, data[1].shape
# Build an eval reader (no shuffling, since mode != 'train').
eval_loader = data_loader(DATADIR,
batch_size=10, mode='eval')
data_reader = eval_loader()
data = next(data_reader)
data[0].shape, data[1].shape
数据形状
((10, 3, 224, 224), (10, 1))
卷积神经网络
LeNet
LeNet是最早的卷积神经网络之一。1998年,第一次将LeNet卷积神经网络应用到图像分类上,通过连续使用卷积和池化层的组合提取图像特征,在手写数字识别任务中取得了巨大成功。
LeNet-5网络 (3conv+2fc)
#LeNet网络代码
# 导入需要的包
import paddle
import paddle.fluid as fluid
import numpy as np
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
# LeNet network definition.
class LeNet(fluid.dygraph.Layer):
    """Classic LeNet: three sigmoid conv layers with 2x2 max pooling after
    the first two, followed by two fully connected layers."""

    def __init__(self, num_classes=1):
        super(LeNet, self).__init__()
        # Conv/pool feature extractor; sigmoid activations follow the
        # original LeNet design.
        self.conv1 = Conv2D(num_channels=1, num_filters=6, filter_size=5, act='sigmoid')
        self.pool1 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        self.conv2 = Conv2D(num_channels=6, num_filters=16, filter_size=5, act='sigmoid')
        self.pool2 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        # Third conv layer; its output is flattened for the dense head.
        self.conv3 = Conv2D(num_channels=16, num_filters=120, filter_size=4, act='sigmoid')
        # Dense head: 64 hidden units, then one output unit per class.
        self.fc1 = Linear(input_dim=120, output_dim=64, act='sigmoid')
        self.fc2 = Linear(input_dim=64, output_dim=num_classes)

    # Forward pass: conv/pool stack, flatten, then classify.
    def forward(self, x):
        feat = self.pool1(self.conv1(x))
        feat = self.pool2(self.conv2(feat))
        feat = self.conv3(feat)
        flat = fluid.layers.reshape(feat, [feat.shape[0], -1])
        return self.fc2(self.fc1(flat))
LeNet识别手写数字
# -*- coding: utf-8 -*-
# LeNet 识别手写数字
import os
import random
import paddle
import paddle.fluid as fluid
import numpy as np
# Training procedure for LeNet on MNIST.
def train(model):
    """Train ``model`` for 5 epochs on MNIST with momentum SGD, run a
    validation pass (accuracy and loss) after every epoch, and finally
    save the parameters under the name 'mnist'."""
    print('start training ... ')
    model.train()
    epoch_num = 5
    opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9,
                                   parameter_list=model.parameters())
    # Paddle's built-in readers for the MNIST train/test splits.
    train_loader = paddle.batch(paddle.dataset.mnist.train(), batch_size=10)
    valid_loader = paddle.batch(paddle.dataset.mnist.test(), batch_size=10)
    for epoch in range(epoch_num):
        for batch_id, data in enumerate(train_loader()):
            # Reshape flat pixels to [N, 1, 28, 28]; labels to int64 [N, 1].
            x_data = np.array([item[0] for item in data], dtype='float32').reshape(-1, 1, 28, 28)
            y_data = np.array([item[1] for item in data], dtype='int64').reshape(-1, 1)
            # Wrap the numpy arrays as dygraph tensors.
            img = fluid.dygraph.to_variable(x_data)
            label = fluid.dygraph.to_variable(y_data)
            # Forward pass and softmax cross-entropy on the logits.
            logits = model(img)
            loss = fluid.layers.softmax_with_cross_entropy(logits, label)
            avg_loss = fluid.layers.mean(loss)
            if batch_id % 1000 == 0:
                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, avg_loss.numpy()))
            # Backward pass, parameter update, gradient reset.
            avg_loss.backward()
            opt.minimize(avg_loss)
            model.clear_gradients()

        # Per-epoch validation pass.
        model.eval()
        accuracies = []
        losses = []
        for batch_id, data in enumerate(valid_loader()):
            x_data = np.array([item[0] for item in data], dtype='float32').reshape(-1, 1, 28, 28)
            y_data = np.array([item[1] for item in data], dtype='int64').reshape(-1, 1)
            img = fluid.dygraph.to_variable(x_data)
            label = fluid.dygraph.to_variable(y_data)
            logits = model(img)
            pred = fluid.layers.softmax(logits)
            loss = fluid.layers.softmax_with_cross_entropy(logits, label)
            acc = fluid.layers.accuracy(pred, label)
            accuracies.append(acc.numpy())
            losses.append(loss.numpy())
        print("[validation] accuracy/loss: {}/{}".format(np.mean(accuracies), np.mean(losses)))
        model.train()

    # Persist the trained parameters.
    fluid.save_dygraph(model.state_dict(), 'mnist')
if __name__ == '__main__':
    # Build the 10-class LeNet inside a dygraph guard and train it.
    with fluid.dygraph.guard():
        model = LeNet(num_classes=10)
        train(model)
结果
...........
epoch: 4, batch_id: 0, loss is: [0.23572874]
epoch: 4, batch_id: 1000, loss is: [0.23122503]
epoch: 4, batch_id: 2000, loss is: [0.22874115]
epoch: 4, batch_id: 3000, loss is: [0.06003483]
epoch: 4, batch_id: 4000, loss is: [0.1360332]
epoch: 4, batch_id: 5000, loss is: [0.15419601]
[validation] accuracy/loss: 0.9325999617576599/0.2355729043483734
LeNet 识别眼疾图片
# Key snippets only; the rest mirrors the MNIST training code above.
# Dataset directories and the validation label CSV.
DATADIR = '/.../palm/PALM-Training400/PALM-Training400'
DATADIR2 = '/.../palm/PALM-Validation400'
CSVFILE = '/.../labels.csv'
# Training-procedure differences vs. the digit-recognition version:
# the two datasets are structured differently, so the custom readers
# defined earlier are used here.
train_loader = data_loader(DATADIR, batch_size=10, mode='train')
valid_loader = valid_data_loader(DATADIR2, CSVFILE)
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_loader()):
x_data, y_data = data
img = fluid.dygraph.to_variable(x_data)
label = fluid.dygraph.to_variable(y_data)
# Validation-reader section.
# Binary classification: the sigmoid output is split into two classes
# at the 0.5 threshold.
# Sigmoid probability is computed for accuracy; the loss works on logits.
pred = fluid.layers.sigmoid(logits)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(logits, label)
# Probability of the negative class: 1 - p.
pred2 = pred * (-1.0) + 1.0
# Concatenate both class probabilities along the channel dimension.
pred = fluid.layers.concat([pred2, pred], axis=1)
acc = fluid.layers.accuracy(pred, fluid.layers.cast(label, dtype='int64'))
accuracies.append(acc.numpy())
losses.append(loss.numpy())
定义 LeNet 网络结构—不同于数字识别
输入input:[10 3 224 224]
shape:
conv1 [10 6 220 220]→pool1 [10 6 110 110]→
conv2 [10 16 106 106]→ pool2 [10 16 53 53]→
conv3 [10 120 50 50]→ fc1 [10 64]→fc2 [10 1]
fc_1的输入input_dims=300,000是conv3.shape中 120x50x50=300,000
结果:LeNet的loss很难下降,模型没有收敛。
..........
epoch: 4, batch_id: 0, loss is: [0.7160613]
epoch: 4, batch_id: 10, loss is: [0.6806122]
epoch: 4, batch_id: 20, loss is: [0.6809079]
epoch: 4, batch_id: 30, loss is: [0.6780921]
[validation] accuracy/loss: 0.5275000333786011/0.691761314868927
说明在图片尺寸比较大时,LeNet在图像分类任务上存在局限性。
AlexNet
硬件性能 → GPU并行计算提高,复杂神经网络的计算容易
大数据的涌现
导致大量研究人员开始专门针对神经网络做算法和模型的优化
AlexNet获得了2012年ImageNet比赛冠军
AlexNet与LeNet相比,具有更深的网络结构,包含5层conv和3层fc,同时使用了如下三种方法改进模型的训练过程:
1.数据增广:通过对训练随机加一些变化,比如平移、缩放、裁剪、旋转、翻转或者增减亮度等,扩大训练数据集。通过这种方式,避免过度依赖于某些属性,能从一定程度上抑制过拟合。
2.使用Dropout抑制过拟合
3.使用ReLU代替Sigmoid激活函数减少梯度消失现象
AlexNet-8网络(5conv+3fc)
input [10 3 224 224] → conv1 [10 96 56 56] → pool1 [10 96 28 28] →conv2 [10 256 28 28] → pool2 [10 256 14 14] → conv3 [10 384 14 14] → conv4 [10 384 14 14] → conv5 [10 256 14 14] → pool5 [10 256 7 7] → fc1 [10 4096] → fc2 [10 4096] → fc3 [10 1]
fc_1的input_dims=12544 是 pool5 [10 256 7 7] 中256x7x7=12544。
# -*- coding:utf-8 -*-
# 导入需要的包
import paddle
import paddle.fluid as fluid
import numpy as np
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
# AlexNet network definition.
class AlexNet(fluid.dygraph.Layer):
    """AlexNet: five conv layers plus three fully connected layers, using
    ReLU activations and dropout after the first two dense layers."""

    def __init__(self, num_classes=1):
        super(AlexNet, self).__init__()
        # Conv/pool feature extractor; ReLU replaces LeNet's sigmoid.
        self.conv1 = Conv2D(num_channels=3, num_filters=96, filter_size=11, stride=4, padding=5, act='relu')
        self.pool1 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        self.conv2 = Conv2D(num_channels=96, num_filters=256, filter_size=5, stride=1, padding=2, act='relu')
        self.pool2 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        self.conv3 = Conv2D(num_channels=256, num_filters=384, filter_size=3, stride=1, padding=1, act='relu')
        self.conv4 = Conv2D(num_channels=384, num_filters=384, filter_size=3, stride=1, padding=1, act='relu')
        self.conv5 = Conv2D(num_channels=384, num_filters=256, filter_size=3, stride=1, padding=1, act='relu')
        self.pool5 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        # Dense head: 256*7*7 = 12544 inputs after pool5 (224x224 input).
        self.fc1 = Linear(input_dim=12544, output_dim=4096, act='relu')
        self.drop_ratio1 = 0.5
        self.fc2 = Linear(input_dim=4096, output_dim=4096, act='relu')
        self.drop_ratio2 = 0.5
        self.fc3 = Linear(input_dim=4096, output_dim=num_classes)

    def forward(self, x):
        feat = self.pool1(self.conv1(x))
        feat = self.pool2(self.conv2(feat))
        feat = self.conv5(self.conv4(self.conv3(feat)))
        feat = self.pool5(feat)
        flat = fluid.layers.reshape(feat, [feat.shape[0], -1])
        # Dropout after each of the first two dense layers curbs overfitting.
        hidden = fluid.layers.dropout(self.fc1(flat), self.drop_ratio1)
        hidden = fluid.layers.dropout(self.fc2(hidden), self.drop_ratio2)
        return self.fc3(hidden)
AlexNet在眼疾数据集上应用
训练
with fluid.dygraph.guard():
    # Instantiate AlexNet (single-logit binary classifier) and train it.
    model = AlexNet()
    train(model)
结果
.........
epoch: 4, batch_id: 0, loss is: [0.30062953]
epoch: 4, batch_id: 10, loss is: [0.44626054]
epoch: 4, batch_id: 20, loss is: [0.07915]
epoch: 4, batch_id: 30, loss is: [0.28443167]
[validation] accuracy/loss: 0.9274999499320984/0.1915486752986908
使用AlexNet,loss能有效下降,在验证集上的准确率可以达到93%左右。
VGG
2014年提出了VGG网络结构,结构简单、应用性极强,是当前最流行的卷积神经网络之一。VGG通过使用一系列大小为3x3的小尺寸卷积核和pooling层构造深度卷积神经网络,并取得了较好的效果。
VGG-16网络(13conv+3fc)
VGG网络的设计严格使用3×3的卷积层和池化层来提取特征,并在网络的最后面使用三层全连接层,将最后一层全连接层的输出作为分类的预测。 在VGG中每层卷积将使用ReLU作为激活函数,在全连接层之后添加dropout来抑制过拟合。
使用小的卷积核能够有效地减少参数的个数,使得训练和测试变得更加有效。由于卷积核比较小,可以堆叠更多的卷积层,加深网络的深度,这对于图像分类任务来说是有利的。VGG模型的成功证明了增加网络的深度,可以更好的学习图像中的特征模式。
首先定义vgg_block块,包含多层3x3的卷积和1层2x2最大池化层
VGG一共有5个vgg_block,每个block里面的卷积层数目和输出通道数由conv_arch指定
# -*- coding:utf-8 -*-
# VGG模型代码
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
# VGG block: several 3x3 conv layers followed by one 2x2 max pool.
class vgg_block(fluid.dygraph.Layer):
    def __init__(self, num_convs, in_channels, out_channels):
        """
        num_convs: number of 3x3 conv layers in this block.
        in_channels: channel count of the block's input.
        out_channels: output channel count shared by every conv layer
            within the same block.
        """
        super(vgg_block, self).__init__()
        self.conv_list = []
        for idx in range(num_convs):
            layer = self.add_sublayer(
                'conv_' + str(idx),
                Conv2D(num_channels=in_channels, num_filters=out_channels,
                       filter_size=3, padding=1, act='relu'))
            self.conv_list.append(layer)
            # Each subsequent conv consumes the previous conv's output.
            in_channels = out_channels
        self.pool = Pool2D(pool_stride=2, pool_size=2, pool_type='max')

    def forward(self, x):
        # Run every conv in order, then downsample with the max pool.
        for conv in self.conv_list:
            x = conv(x)
        return self.pool(x)
class VGG(fluid.dygraph.Layer):
    """VGG-16: five vgg_blocks whose conv counts and widths come from
    ``conv_arch``, followed by three dense layers with dropout."""

    def __init__(self, conv_arch=((2, 64),
                                  (2, 128), (3, 256), (3, 512), (3, 512))):
        super(VGG, self).__init__()
        self.vgg_blocks = []
        # Input channel count seen by each successive block.
        in_channels = [3, 64, 128, 256, 512, 512]
        # Register the five vgg_blocks as sublayers.
        for iter_id, (num_convs, num_channels) in enumerate(conv_arch):
            block = self.add_sublayer(
                'block_' + str(iter_id),
                vgg_block(num_convs, in_channels=in_channels[iter_id],
                          out_channels=num_channels))
            self.vgg_blocks.append(block)
        # Dense head: the feature map is 512x7x7 for a 224x224 input.
        self.fc1 = Linear(input_dim=512*7*7, output_dim=4096,
                          act='relu')
        self.drop1_ratio = 0.5
        self.fc2 = Linear(input_dim=4096, output_dim=4096,
                          act='relu')
        self.drop2_ratio = 0.5
        self.fc3 = Linear(input_dim=4096, output_dim=1)

    def forward(self, x):
        # Feature extraction through all five blocks, then classify.
        for block in self.vgg_blocks:
            x = block(x)
        x = fluid.layers.reshape(x, [x.shape[0], -1])
        x = fluid.layers.dropout(self.fc1(x), self.drop1_ratio)
        x = fluid.layers.dropout(self.fc2(x), self.drop2_ratio)
        return self.fc3(x)
VGG在眼疾数据集上应用
with fluid.dygraph.guard():
    # Instantiate VGG with the default 16-layer configuration and train it.
    model = VGG()
    train(model)
结果
.........
epoch: 1, batch_id: 0, loss is: [0.6305684]
epoch: 1, batch_id: 10, loss is: [0.29560965]
epoch: 1, batch_id: 20, loss is: [0.23282667]
epoch: 1, batch_id: 30, loss is: [0.3495158]
[validation] accuracy/loss: 0.9350000619888306/0.23027324676513672
使用VGG,loss能有效的下降,在验证集上的准确率可以达到94%左右。
GoogLeNet
GoogLeNet在2014年ImageNet比赛冠军
主要特点是网络不仅有深度,还在横向上具有“宽度”。
由于图像信息在空间尺寸上的巨大差异,如何选择合适的卷积核大小来提取特征很重要。空间分布范围更广的图像信息适合用较大的卷积核来提取其特征,而空间分布范围较小的图像信息用较小的卷积核来提取其特征。
GoogLeNet提出了Inception模块。
Inception模块的设计思想:使用3个不同大小的卷积核对输入图片进行卷积操作,并附加最大池化,将这4个操作的输出沿着通道这一维度进行拼接,构成的输出特征图将会包含经过不同大小的卷积核提取出来的特征。Inception模块采用多通路(multi-path)的设计形式。conv1x1 → conv 3x3 → conv 5x5 → maxpool 3x3
为了减小参数量,Inception模块在每个3x3和5x5的卷积层之前,增加1x1的卷积层来控制输出通道数;在最大池化层后面增加1x1卷积层减小输出通道数。
conv1x1 → conv1x1 → conv 3x3→ conv1x1 → conv 5x5 → maxpool 3x3 → conv1x1
Inception模块
# -*- coding:utf-8 -*-
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
# Inception block definition.
class Inception(fluid.dygraph.Layer):
    def __init__(self, c0, c1, c2, c3, c4, **kwargs):
        '''
        Inception block with four parallel branches.
        c0: number of input channels, an int.
        c1: output channels of branch 1 (a single 1x1 conv), an int.
        c2: output channels of branch 2, a tuple/list where c2[0] is the
            1x1 bottleneck conv and c2[1] the following 3x3 conv.
        c3: output channels of branch 3, a tuple/list where c3[0] is the
            1x1 bottleneck conv and c3[1] the following 5x5 conv.
        c4: output channels of branch 4 (3x3 max pool + 1x1 conv), an int.
        '''
        super(Inception, self).__init__()
        # Branch 1: 1x1 conv only.
        self.p1_1 = Conv2D(num_channels=c0, num_filters=c1,
                           filter_size=1, act='relu')
        # Branch 2: 1x1 bottleneck then 3x3 conv.
        self.p2_1 = Conv2D(num_channels=c0, num_filters=c2[0],
                           filter_size=1, act='relu')
        self.p2_2 = Conv2D(num_channels=c2[0], num_filters=c2[1],
                           filter_size=3, padding=1, act='relu')
        # Branch 3: 1x1 bottleneck then 5x5 conv.
        self.p3_1 = Conv2D(num_channels=c0, num_filters=c3[0],
                           filter_size=1, act='relu')
        self.p3_2 = Conv2D(num_channels=c3[0], num_filters=c3[1],
                           filter_size=5, padding=2, act='relu')
        # Branch 4: 3x3 max pool then 1x1 conv to shrink channels.
        self.p4_1 = Pool2D(pool_size=3,
                           pool_stride=1, pool_padding=1,
                           pool_type='max')
        self.p4_2 = Conv2D(num_channels=c0, num_filters=c4,
                           filter_size=1, act='relu')

    def forward(self, x):
        branch1 = self.p1_1(x)
        branch2 = self.p2_2(self.p2_1(x))
        branch3 = self.p3_2(self.p3_1(x))
        branch4 = self.p4_2(self.p4_1(x))
        # Concatenate all branch outputs along the channel dimension.
        return fluid.layers.concat([branch1, branch2, branch3, branch4], axis=1)
GoogLeNet-22网络
主体卷积部分中使用5个block,每个block间使用 maxpool 3x3。
<1>使用一个conv 7x7(64)
<2>使用2个卷积层:conv 1 × 1(64) + conv 3 × 3(192)
<3>串联2个Inception块
<4>串联5个Inception块
<5>串联2 个Inception块
紧跟输出层,使用全局平均池化层,最后接上一个输出个数为标签类别数的全连接层。
class GoogLeNet(fluid.dygraph.Layer):
    """GoogLeNet: five stages separated by 3x3 max pools, with Inception
    blocks in stages 3-5, global average pooling, and a one-logit FC head."""

    def __init__(self):
        super(GoogLeNet, self).__init__()
        # Stage 1: a single 7x7 conv.
        self.conv1 = Conv2D(num_channels=3, num_filters=64, filter_size=7,
                            padding=3, act='relu')
        self.pool1 = Pool2D(pool_size=3, pool_stride=2,
                            pool_padding=1, pool_type='max')
        # Stage 2: 1x1 conv followed by 3x3 conv.
        self.conv2_1 = Conv2D(num_channels=64, num_filters=64,
                              filter_size=1, act='relu')
        self.conv2_2 = Conv2D(num_channels=64, num_filters=192,
                              filter_size=3, padding=1, act='relu')
        self.pool2 = Pool2D(pool_size=3, pool_stride=2,
                            pool_padding=1, pool_type='max')
        # Stage 3: two Inception blocks.
        self.block3_1 = Inception(192, 64, (96, 128), (16, 32), 32)
        self.block3_2 = Inception(256, 128, (128, 192), (32, 96), 64)
        self.pool3 = Pool2D(pool_size=3, pool_stride=2,
                            pool_padding=1, pool_type='max')
        # Stage 4: five Inception blocks.
        self.block4_1 = Inception(480, 192, (96, 208), (16, 48), 64)
        self.block4_2 = Inception(512, 160, (112, 224), (24, 64), 64)
        self.block4_3 = Inception(512, 128, (128, 256), (24, 64), 64)
        self.block4_4 = Inception(512, 112, (144, 288), (32, 64), 64)
        self.block4_5 = Inception(528, 256, (160, 320), (32, 128), 128)
        self.pool4 = Pool2D(pool_size=3, pool_stride=2,
                            pool_padding=1, pool_type='max')
        # Stage 5: two Inception blocks.
        self.block5_1 = Inception(832, 256, (160, 320), (32, 128), 128)
        self.block5_2 = Inception(832, 384, (192, 384), (48, 128), 128)
        # Global average pooling; pool_stride has no effect when
        # global_pooling=True.
        self.pool5 = Pool2D(pool_stride=1,
                            global_pooling=True, pool_type='avg')
        self.fc = Linear(input_dim=1024, output_dim=1, act=None)

    def forward(self, x):
        x = self.pool1(self.conv1(x))
        x = self.pool2(self.conv2_2(self.conv2_1(x)))
        x = self.pool3(self.block3_2(self.block3_1(x)))
        x = self.block4_3(self.block4_2(self.block4_1(x)))
        x = self.pool4(self.block4_5(self.block4_4(x)))
        x = self.pool5(self.block5_2(self.block5_1(x)))
        x = fluid.layers.reshape(x, [x.shape[0], -1])
        return self.fc(x)
GoogLeNet在眼疾数据集上应用
with fluid.dygraph.guard():
    # Instantiate GoogLeNet (single-logit binary classifier) and train it.
    model = GoogLeNet()
    train(model)
使用GoogLeNet在眼疾数据集上,loss能有效的下降,在验证集上的准确率可以达到95%左右。
ResNet
2015年提出了ResNet,通过引入残差模块加深网络层数,在ImageNet数据集上的错误率降低到3.6%,超越了人眼识别水平。
残差设计的思想:输入x通过跨层连接,能更快的向前传播数据,或者向后传播梯度。
残差块结构示意图(瓶颈结构)
ResNet-50网络(49conv+1fc)
首先定义卷积批归一化块
再定义残差块,每个残差块会对输入图片做三次卷积,然后跟输入图片进行短接,如果残差块中第三次卷积输出特征图的形状与输入不一致,则对输入图片做1x1卷积,将其输出形状调整成一致输出通道*4
最后定义ResNet模型,ResNet-50包含多个模块,其中第2到第5个模块包含的残差块 depth = [3 4 6 3]
# Convolution + batch-normalization building block.
class ConvBNLayer(fluid.dygraph.Layer):
    def __init__(self,
                 num_channels,
                 num_filters,
                 filter_size,
                 stride=1,
                 groups=1,
                 act=None):
        """
        num_channels: input channel count of the convolution.
        num_filters: output channel count of the convolution.
        filter_size: kernel size; 'same'-style padding is derived from it.
        stride: convolution stride.
        groups: group count for grouped convolution (1 = ordinary conv).
        act: activation applied by the BatchNorm layer (None = linear).
        """
        super(ConvBNLayer, self).__init__()
        # Bias is disabled because BatchNorm provides the shift term.
        self._conv = Conv2D(
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            bias_attr=False)
        # The activation (if any) is fused into the BatchNorm layer.
        self._batch_norm = BatchNorm(num_filters, act=act)

    def forward(self, inputs):
        # conv -> batch norm (-> activation inside BatchNorm).
        return self._batch_norm(self._conv(inputs))
定义残差块
# Residual bottleneck block.
# Applies three convolutions (1x1 -> 3x3 -> 1x1) to the input and adds the
# result to a shortcut; when the output shape differs from the input, the
# shortcut is a strided 1x1 ConvBNLayer projection.
class BottleneckBlock(fluid.dygraph.Layer):
    def __init__(self,
                 num_channels,
                 num_filters,
                 stride,
                 shortcut=True):
        super(BottleneckBlock, self).__init__()
        # 1x1 conv: reduce the channel count.
        self.conv0 = ConvBNLayer(
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=1,
            act='relu')
        # 3x3 conv: carries this block's stride.
        self.conv1 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu')
        # 1x1 conv: expand channels by a factor of 4, no activation yet.
        self.conv2 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None)
        # shortcut=True means the input already matches conv2's output
        # shape; otherwise project the input with a strided 1x1 conv.
        if not shortcut:
            self.short = ConvBNLayer(
                num_channels=num_channels,
                num_filters=num_filters * 4,
                filter_size=1,
                stride=stride)
        self.shortcut = shortcut
        # Channel count this block feeds to its successor.
        self._num_channels_out = num_filters * 4

    def forward(self, inputs):
        residual = self.conv2(self.conv1(self.conv0(inputs)))
        # Identity shortcut when shapes match; 1x1 projection otherwise.
        identity = inputs if self.shortcut else self.short(inputs)
        out = fluid.layers.elementwise_add(x=identity, y=residual)
        # Apply the final ReLU via a LayerHelper-appended activation.
        helper = LayerHelper(self.full_name(), act='relu')
        return helper.append_activation(out)
定义ResNet模型
# ResNet model definition.
class ResNet(fluid.dygraph.Layer):
    def __init__(self, layers=50, class_dim=1):
        """
        layers: network depth; one of 50, 101 or 152.
        class_dim: number of output classes.
        """
        super(ResNet, self).__init__()
        self.layers = layers
        supported_layers = [50, 101, 152]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, layers)
        # Residual-block counts for stages c2..c5 of each depth variant:
        # ResNet-50 -> 3/4/6/3, ResNet-101 -> 3/4/23/3, ResNet-152 -> 3/8/36/3.
        if layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        # Base output channels of each stage's bottleneck convolutions.
        num_filters = [64, 128, 256, 512]
        # Stage c1: 7x7 stride-2 conv followed by a 3x3 stride-2 max pool.
        self.conv = ConvBNLayer(
            num_channels=3,
            num_filters=64,
            filter_size=7,
            stride=2,
            act='relu')
        self.pool2d_max = Pool2D(
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')
        # Stages c2..c5: chains of bottleneck blocks.
        self.bottleneck_block_list = []
        num_channels = 64
        for block in range(len(depth)):
            shortcut = False
            for i in range(depth[block]):
                bottleneck_block = self.add_sublayer(
                    'bb_%d_%d' % (block, i),
                    BottleneckBlock(
                        num_channels=num_channels,
                        num_filters=num_filters[block],
                        # c3/c4/c5 downsample in their first block only;
                        # every other block keeps stride 1.
                        stride=2 if i == 0 and block != 0 else 1,
                        shortcut=shortcut))
                num_channels = bottleneck_block._num_channels_out
                self.bottleneck_block_list.append(bottleneck_block)
                shortcut = True
        # Global average pooling over the c5 feature map.
        self.pool2d_avg = Pool2D(pool_size=7, pool_type='avg', global_pooling=True)
        import math
        # stdv bounds the uniform random init of the final FC weights.
        stdv = 1.0 / math.sqrt(2048 * 1.0)
        # Final fully connected layer: one output per class.
        self.out = Linear(input_dim=2048, output_dim=class_dim,
                          param_attr=fluid.param_attr.ParamAttr(
                              initializer=fluid.initializer.Uniform(-stdv, stdv)))

    def forward(self, inputs):
        y = self.pool2d_max(self.conv(inputs))
        for bottleneck_block in self.bottleneck_block_list:
            y = bottleneck_block(y)
        y = self.pool2d_avg(y)
        y = fluid.layers.reshape(y, [y.shape[0], -1])
        return self.out(y)
ResNet在眼疾数据集上应用
with fluid.dygraph.guard():
    # Instantiate ResNet-50 (single-logit binary classifier) and train it.
    model = ResNet()
    train(model)
结果
使用ResNet在眼疾数据集上,loss能有效的下降,在验证集上的准确率可以达到95%左右。