图像分类
图像分类是根据图像的语义信息对不同类别图像进行区分,是计算机视觉中重要的基础问题,是物体检测、图像分割、物体跟踪、行为分析、人脸识别等其他高层视觉任务的基础。
本节使用LeNet和AlexNet解决图像分类问题。
LeNet:Yann LeCun等人于1998年第一次将卷积神经网络应用到图像分类任务上,在手写数字识别任务中取得了巨大成功。LeNet通过连续使用卷积层和池化层的组合提取图像特征,其网络结构如图所示,这里展示的是作者论文中的LeNet-5模型。
图:LeNet模型网络结构示意图
- 第一轮卷积和池化:卷积提取图像中包含的特征模式(激活函数使用sigmoid),图像尺寸从32减小到28.经过池化层可以降低输出特征图对空间位置的敏感性,图像尺寸减到14.
- 第二轮卷积和池化:卷积操作使得图像尺寸减小到10,经过池化后变成5.
- 第三轮卷积:将经过第3次卷积提取到的特征图输入到全连接层。第一个全连接层的输出神经元的个数是64,第二个全连接层的输出神经元个数是分类标签的类别数,对于手写数字识别其大小是10。然后使用softmax激活函数即可计算出每个类别的预测概率。
卷积层的输出特征图的输出数据格式是[N,C,H,W],当输入全连接层时,会自动将数据拉平,也就是对每个样本,自动将其转化为长度为K的向量,其中K=C*H*W.一个mini-batch的数据维度变成了N*K的二维向量。
LeNet在手写数字识别上的应用
LeNet网络的实现代码如下:
#导入需要的包
import paddle
import paddle.fluid as fluid
import numpy as np
from paddle.fluid.dygraph.nn import Conv2D , Pool2D,Linear
#定义LeNet网络结构
# Define the LeNet network structure
class LeNet(fluid.dygraph.Layer):
    """LeNet-5 style CNN for single-channel (grayscale) image classification.

    Fixed: the base class was misspelled ``fluid.dygrapy.Layer`` in the
    original, which raises AttributeError at class-definition time.
    """

    def __init__(self, name_scope, num_classes=1):
        super(LeNet, self).__init__(name_scope)
        # Conv + pool stages: each convolution uses a sigmoid activation and is
        # followed by a 2x2 max pooling with stride 2.
        self.conv1 = Conv2D(num_channels=1, num_filters=6, filter_size=5, act='sigmoid')
        self.pool1 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        self.conv2 = Conv2D(num_channels=6, num_filters=16, filter_size=5, act='sigmoid')
        self.pool2 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        # Third convolution layer
        self.conv3 = Conv2D(num_channels=16, num_filters=120, filter_size=4, act='sigmoid')
        # Fully connected layers: the first outputs 64 neurons, the second
        # outputs one neuron per class label.
        self.fc1 = Linear(input_dim=120, output_dim=64, act='sigmoid')
        self.fc2 = Linear(input_dim=64, output_dim=num_classes)

    # Forward pass of the network
    def forward(self, x):
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        # Flatten [N, C, H, W] feature maps to [N, C*H*W] before the FC layers
        x = fluid.layers.reshape(x, [x.shape[0], -1])
        x = self.fc1(x)
        x = self.fc2(x)
        return x
下面的程序使用随机数作为输入,查看经过LeNet-5的每一层作用之后,输出数据的形状。
# Input data shape is [N, 1, H, W]
# Create a random array with np.random to use as the input
x = np.random.randn(*[3, 1, 28, 28])
x = x.astype('float32')
with fluid.dygraph.guard():
    # Create a LeNet instance, specifying the model name and number of classes
    m = LeNet('LeNet', num_classes=10)
    # sublayers() (inherited by LeNet) lists the sub-layers the model contains
    print(m.sublayers())
    # Fixed: the original called fluid.dygraph.to_varible (typo) -> to_variable
    x = fluid.dygraph.to_variable(x)
    for item in m.sublayers():
        # item is one sub-layer of LeNet; inspect the output shape after it
        try:
            x = item(x)
        except:
            # FC layers need 2-D input: flatten [N, C, H, W] to [N, C*H*W] and retry
            x = fluid.layers.reshape(x, [x.shape[0], -1])
            x = item(x)
        if len(item.parameters()) == 2:
            # Conv and FC layers have two parameters:
            # parameters()[0] is the weight w, parameters()[1] is the bias b
            print(item.full_name(), x.shape, item.parameters()[0].shape, item.parameters()[1].shape)
        else:
            # Pooling layers have no parameters
            print(item.full_name(), x.shape)
代码结果如下所示:
接下来就使用LeNet进行手写数字识别
#LeNet 识别手写数字
import os
import random
import paddle
import paddle.fluid as fluid
import numpy as np
# Define the training loop
def train(model):
    """Train `model` on MNIST for 5 epochs and validate after each epoch.

    Saves the trained parameters under the prefix 'mnist'.
    (The original listing had its indentation stripped; structure restored.)
    """
    print('start training ...')
    model.train()
    epoch_num = 5
    # Momentum optimizer with a fixed learning rate
    opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9, parameter_list=model.parameters())
    # Use Paddle's built-in MNIST readers
    train_loader = paddle.batch(paddle.dataset.mnist.train(), batch_size=10)
    valid_loader = paddle.batch(paddle.dataset.mnist.test(), batch_size=10)
    for epoch in range(epoch_num):
        for batch_id, data in enumerate(train_loader()):
            # Reshape/cast inputs: images to [N, 1, 28, 28] float32, labels to [N, 1] int64
            x_data = np.array([item[0] for item in data], dtype='float32').reshape(-1, 1, 28, 28)
            y_data = np.array([item[1] for item in data], dtype='int64').reshape(-1, 1)
            # Convert numpy.ndarray to Tensor
            img = fluid.dygraph.to_variable(x_data)
            label = fluid.dygraph.to_variable(y_data)
            # Forward pass
            logits = model(img)
            # Softmax cross-entropy loss on the raw logits
            loss = fluid.layers.softmax_with_cross_entropy(logits, label)
            avg_loss = fluid.layers.mean(loss)
            if batch_id % 1000 == 0:
                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, avg_loss.numpy()))
            # Backward pass, parameter update, gradient reset
            avg_loss.backward()
            opt.minimize(avg_loss)
            model.clear_gradients()
        # Validate after each epoch
        model.eval()
        accuracies = []
        losses = []
        for batch_id, data in enumerate(valid_loader()):
            # Reshape/cast inputs exactly as for training
            x_data = np.array([item[0] for item in data], dtype='float32').reshape(-1, 1, 28, 28)
            y_data = np.array([item[1] for item in data], dtype='int64').reshape(-1, 1)
            # Convert numpy.ndarray to Tensor
            img = fluid.dygraph.to_variable(x_data)
            label = fluid.dygraph.to_variable(y_data)
            # Forward pass
            logits = model(img)
            pred = fluid.layers.softmax(logits)
            # Loss and accuracy on this validation batch
            loss = fluid.layers.softmax_with_cross_entropy(logits, label)
            acc = fluid.layers.accuracy(pred, label)
            accuracies.append(acc.numpy())
            losses.append(loss.numpy())
        print("[validation] accuracy/loss: {}/{}".format(np.mean(accuracies), np.mean(losses)))
        model.train()
    # Save the model parameters
    fluid.save_dygraph(model.state_dict(), 'mnist')
if __name__ == '__main__':
    # Create the model inside a dygraph guard and start training
    with fluid.dygraph.guard():
        model = LeNet("LeNet", num_classes=10)
        # Launch the training loop
        train(model)
结果如下所示:
start training ...
Cache file /home/aistudio/.cache/paddle/dataset/mnist/train-images-idx3-ubyte.gz not found, downloading https://dataset.bj.bcebos.com/mnist/train-images-idx3-ubyte.gz
Begin to download
Download finished
Cache file /home/aistudio/.cache/paddle/dataset/mnist/train-labels-idx1-ubyte.gz not found, downloading https://dataset.bj.bcebos.com/mnist/train-labels-idx1-ubyte.gz
Begin to download
........
Download finished
Cache file /home/aistudio/.cache/paddle/dataset/mnist/t10k-images-idx3-ubyte.gz not found, downloading https://dataset.bj.bcebos.com/mnist/t10k-images-idx3-ubyte.gz
Begin to download
Download finished
Cache file /home/aistudio/.cache/paddle/dataset/mnist/t10k-labels-idx1-ubyte.gz not found, downloading https://dataset.bj.bcebos.com/mnist/t10k-labels-idx1-ubyte.gz
Begin to download
..
Download finished
epoch: 0, batch_id: 0, loss is: [2.8019154]
epoch: 0, batch_id: 1000, loss is: [2.2887921]
epoch: 0, batch_id: 2000, loss is: [2.3337946]
epoch: 0, batch_id: 3000, loss is: [2.2778637]
epoch: 0, batch_id: 4000, loss is: [2.2619343]
epoch: 0, batch_id: 5000, loss is: [2.3197517]
[validation] accuracy/loss: 0.3272000253200531/2.2573952674865723
epoch: 1, batch_id: 0, loss is: [2.2457855]
epoch: 1, batch_id: 1000, loss is: [2.1932871]
epoch: 1, batch_id: 2000, loss is: [2.232579]
epoch: 1, batch_id: 3000, loss is: [1.9601866]
epoch: 1, batch_id: 4000, loss is: [1.4367704]
epoch: 1, batch_id: 5000, loss is: [1.6037204]
[validation] accuracy/loss: 0.7105000019073486/1.0988653898239136
epoch: 2, batch_id: 0, loss is: [0.9293951]
epoch: 2, batch_id: 1000, loss is: [0.7314283]
epoch: 2, batch_id: 2000, loss is: [0.7332459]
epoch: 2, batch_id: 3000, loss is: [0.48880863]
epoch: 2, batch_id: 4000, loss is: [0.38987648]
epoch: 2, batch_id: 5000, loss is: [0.6705998]
[validation] accuracy/loss: 0.8681000471115112/0.49554571509361267
epoch: 3, batch_id: 0, loss is: [0.438795]
epoch: 3, batch_id: 1000, loss is: [0.33228877]
epoch: 3, batch_id: 2000, loss is: [0.29587844]
epoch: 3, batch_id: 3000, loss is: [0.15542315]
epoch: 3, batch_id: 4000, loss is: [0.18254852]
epoch: 3, batch_id: 5000, loss is: [0.3203889]
[validation] accuracy/loss: 0.9107999801635742/0.3208836317062378
epoch: 4, batch_id: 0, loss is: [0.29746038]
epoch: 4, batch_id: 1000, loss is: [0.24129651]
epoch: 4, batch_id: 2000, loss is: [0.20082739]
epoch: 4, batch_id: 3000, loss is: [0.0723183]
epoch: 4, batch_id: 4000, loss is: [0.11514989]
epoch: 4, batch_id: 5000, loss is: [0.1840646]
[validation] accuracy/loss: 0.929099977016449/0.24536828696727753
通过运行结果可以看出,LeNet在手写数字识别任务中的准确率高达92%以上。LeNet在手写数字识别上效果好,是否在其他数据集上效果也同样好?下面我们通过眼疾识别数据集iChallenge-PM来验证一下。
AlexNet:Alex Krizhevsky等人在2012年提出了AlexNet,并应用在大尺度数据集ImageNet上,并获得比赛冠军。
LeNet在眼疾识别数据集iChallenge-PM上的应用:
iChallenge-PM数据集包含1200个受试者的眼底视网膜图片,训练、验证和测试数据集各400张。将病理性患者的图片作为正样本,标签为1;非病理患者的图片作为负样本,标签为0。
- training.zip:包含训练中的图片和标签
- validation.zip:包含验证集的图片
- valid_gt.zip:包含验证集的标签
#解压数据集
!unzip -o -q -d /home/aistudio/work/palm /home/aistudio/data/data19065/training.zip
%cd /home/aistudio/work/palm/PALM-Training400/
!unzip -o -q PALM-Training400.zip
!unzip -o -q -d /home/aistudio/work/palm /home/aistudio/data/data19065/validation.zip
!unzip -o -q -d /home/aistudio/work/palm /home/aistudio/data/data19065/valid_gt.zip
从数据集中选取两张图片,通过LeNet提取特征,构建分类器,对正负样本进行分类,并将图片显示出来
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
# Directory holding the PALM training images
DATADIR = '/home/aistudio/work/palm/PALM-Training400/PALM-Training400'
# Filenames starting with N are normal fundus images; P are pathological ones
file1 = 'N0012.jpg'
file2 = 'P0095.jpg'
# Load one sample of each class
img1 = Image.open(os.path.join(DATADIR, file1))
img1 = np.array(img1)
img2 = Image.open(os.path.join(DATADIR,file2))
img2 = np.array(img2)
# Display the two images side by side
plt.figure(figsize=(16, 8))
f = plt.subplot(121)
f.set_title('Normal', fontsize=20)
plt.imshow(img1)
f = plt.subplot(122)
f.set_title('PM', fontsize=20)
plt.imshow(img2)
plt.show()
# Inspect the raw image shapes, layout [H, W, C]
img1.shape, img2.shape
代码结果:
((2056, 2124, 3), (2056, 2124, 3))
定义数据读取器:
将读取的图片每张缩放到224*224大小,并且将像素值调整到[-1, 1]之间,代码如下所示:
import cv2
import random
import numpy as np
# Preprocess one input image
def transform_img(img):
    """Resize to 224x224, transpose [H,W,C] -> [C,H,W], scale pixels to [-1, 1].

    Returns a float32 ndarray of shape (3, 224, 224).
    """
    # Resize the image to 224 x 224
    img = cv2.resize(img, (224, 224))
    # Incoming layout is [H, W, C]; transpose to [C, H, W]
    img = np.transpose(img, (2, 0, 1))
    img = img.astype('float32')
    # Scale pixel values from [0, 255] to [-1.0, 1.0].
    # Fixed: the original divided by 225, which leaves values slightly
    # outside [-1, 1] and contradicts the stated normalization.
    img = img / 255.
    img = img * 2.0 - 1.0
    return img
# Training-set data loader
def data_loader(datadir, batch_size=10, mode='train'):
    """Return a reader() generator yielding (images, labels) mini-batches.

    Labels are derived from the filename prefix: H/N are negative (0),
    P is positive (1). Fixed from the original: missing ':' on the def
    line, `fatadir` typo (NameError), and raising a plain string
    (a TypeError in Python 3) replaced with ValueError.
    """
    # List every file under datadir; each file is one sample
    filenames = os.listdir(datadir)
    def reader():
        if mode == 'train':
            # Shuffle sample order during training
            random.shuffle(filenames)
        batch_imgs = []
        batch_labels = []
        for name in filenames:
            filepath = os.path.join(datadir, name)
            img = cv2.imread(filepath)
            img = transform_img(img)
            if name[0] == 'H' or name[0] == 'N':
                # H prefix = high myopia, N prefix = normal vision;
                # neither is pathological, so both are negative samples (label 0)
                label = 0
            elif name[0] == 'P':
                # P prefix = pathological myopia, positive sample (label 1)
                label = 1
            else:
                raise ValueError('Not expected file name')
            # Append this sample to the batch under construction
            batch_imgs.append(img)
            batch_labels.append(label)
            if len(batch_imgs) == batch_size:
                # Batch is full: emit it as one generator output
                imgs_array = np.array(batch_imgs).astype('float32')
                labels_array = np.array(batch_labels).astype('float32').reshape(-1, 1)
                yield imgs_array, labels_array
                batch_imgs = []
                batch_labels = []
        if len(batch_imgs) > 0:
            # Remaining samples (fewer than batch_size) form a final mini-batch
            imgs_array = np.array(batch_imgs).astype('float32')
            labels_array = np.array(batch_labels).astype('float32').reshape(-1, 1)
            yield imgs_array, labels_array
    return reader
# Validation-set data loader
def valid_data_loader(datadir, csvfile, batch_size=10, mode='valid'):
    """Return a reader() yielding validation (images, labels) mini-batches.

    Unlike the training loader, labels come from the csv file rather than
    from the image filenames. (The original listing had its indentation
    stripped; structure restored.)
    """
    # Read the csv holding the validation labels; row 0 is the header
    filelists = open(csvfile).readlines()
    def reader():
        batch_imgs = []
        batch_labels = []
        for line in filelists[1:]:
            # csv columns: [index, filename, label, ...]
            line = line.strip().split(',')
            name = line[1]
            label = int(line[2])
            # Load and preprocess the image named in this csv row
            filepath = os.path.join(datadir, name)
            img = cv2.imread(filepath)
            img = transform_img(img)
            # Append this sample to the batch under construction
            batch_imgs.append(img)
            batch_labels.append(label)
            if len(batch_imgs) == batch_size:
                # Batch is full: emit it as one generator output
                imgs_array = np.array(batch_imgs).astype('float32')
                labels_array = np.array(batch_labels).astype('float32').reshape(-1, 1)
                yield imgs_array, labels_array
                batch_imgs = []
                batch_labels = []
        if len(batch_imgs) > 0:
            # Remaining samples (fewer than batch_size) form a final mini-batch
            imgs_array = np.array(batch_imgs).astype('float32')
            labels_array = np.array(batch_labels).astype('float32').reshape(-1, 1)
            yield imgs_array, labels_array
    return reader
查看数据形状
# Check the shapes of one training mini-batch
DATADIR = '/home/aistudio/work/palm/PALM-Training400/PALM-Training400'
train_loader = data_loader(DATADIR,batch_size=10,mode='train')
data_reader = train_loader()
data = next(data_reader)
# Images are [10, 3, 224, 224] (batch_size x CHW from transform_img),
# labels are [10, 1]
data[0].shape,data[1].shape
代码结果如下所示:
启动训练:
#LeNet 识别眼疾图片
import os
import random
import paddle
import paddle.fluid as fluid
import numpy as np
# DATADIR: training images; DATADIR2: validation images; CSVFILE: validation labels
DATADIR = '/home/aistudio/work/palm/PALM-Training400/PALM-Training400'
DATADIR2 = '/home/aistudio/work/palm/PALM-Validation400'
CSVFILE = '/home/aistudio/work/palm/PALM-Validation-GT/labels.csv'
# Define the training loop for the iChallenge-PM binary classifier
def train(model):
    """Train `model` on iChallenge-PM for 5 epochs, validating each epoch.

    Fixed from the original: `model,train()` (comma instead of dot),
    and two `mode.` NameErrors that should be `model.`.
    """
    with fluid.dygraph.guard():
        print('start training ... ')
        model.train()
        epoch_num = 5
        # Momentum optimizer with a fixed learning rate
        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9, parameter_list=model.parameters())
        # Readers for the training and validation sets
        train_loader = data_loader(DATADIR, batch_size=10, mode='train')
        valid_loader = valid_data_loader(DATADIR2, CSVFILE)
        for epoch in range(epoch_num):
            for batch_id, data in enumerate(train_loader()):
                x_data, y_data = data
                img = fluid.dygraph.to_variable(x_data)
                label = fluid.dygraph.to_variable(y_data)
                # Forward pass to get the predicted logits
                logits = model(img)
                # Binary classification: sigmoid cross-entropy on the logits
                loss = fluid.layers.sigmoid_cross_entropy_with_logits(logits, label)
                avg_loss = fluid.layers.mean(loss)
                if batch_id % 10 == 0:
                    print("epoch: {},batch_id:{},loss is:{}".format(epoch, batch_id, avg_loss.numpy()))
                # Backward pass, update weights, clear gradients
                avg_loss.backward()
                opt.minimize(avg_loss)
                model.clear_gradients()
            # Validate after each epoch
            model.eval()
            accuracies = []
            losses = []
            for batch_id, data in enumerate(valid_loader()):
                x_data, y_data = data
                img = fluid.dygraph.to_variable(x_data)
                label = fluid.dygraph.to_variable(y_data)
                # Forward pass to get the predicted logits
                logits = model(img)
                # Binary classification: the sigmoid probability is
                # thresholded at 0.5 to separate the two classes
                pred = fluid.layers.sigmoid(logits)
                loss = fluid.layers.sigmoid_cross_entropy_with_logits(logits, label)
                # Probability of the negative class (1 - p)
                pred2 = pred * (-1.0) + 1.0
                # Concatenate [P(class 0), P(class 1)] along axis 1 so that
                # fluid.layers.accuracy can pick the argmax class
                pred = fluid.layers.concat([pred2, pred], axis=1)
                acc = fluid.layers.accuracy(pred, fluid.layers.cast(label, dtype='int64'))
                accuracies.append(acc.numpy())
                losses.append(loss.numpy())
            print("[validation] accuracy/loss: {}/{}".format(np.mean(accuracies), np.mean(losses)))
            model.train()
        # Save model parameters and optimizer state.
        # NOTE(review): the checkpoint prefix 'mnist' looks copied from the
        # MNIST example; kept to preserve behavior — consider renaming (e.g. 'palm').
        fluid.save_dygraph(model.state_dict(), 'mnist')
        fluid.save_dygraph(opt.state_dict(), 'mnist')
# Define the evaluation loop
def evaluation(model, params_file_path):
    # Load trained parameters from params_file_path, run the model over an
    # evaluation reader, and print mean loss and accuracy.
    with fluid.dygraph.guard():
        print('start evaluation ......')
        # Load the saved model parameters (second return value, the optimizer
        # state, is discarded)
        model_state_dict, __= fluid.load_dygraph(params_file_path)
        model.load_dict(model_state_dict)
        model.eval()
        # NOTE(review): load_data is not defined anywhere in this file —
        # presumably it comes from another part of the tutorial; verify
        # before running.
        eval_loader = load_data('eval')
        acc_set = []
        avg_loss_set = []
        for batch_id, data in enumerate(eval_loader()):
            x_data, y_data = data
            img = fluid.dygraph.to_variable(x_data)
            label = fluid.dygraph.to_variable(y_data)
            # Compute prediction and accuracy.
            # NOTE(review): this calls model(img, label) and expects two return
            # values, but the LeNet.forward defined in this file takes only x
            # and returns a single tensor — this looks copied from a different
            # model; confirm the intended model class.
            prediction, acc = model(img, label)
            # Cross-entropy loss (expects `prediction` to be probabilities)
            loss = fluid.layers.cross_entropy(input=prediction, label=label)
            avg_loss = fluid.layers.mean(loss)
            acc_set.append(float(acc.numpy()))
            avg_loss_set.append(float(avg_loss.numpy()))
        # Report mean accuracy and loss over all evaluation batches
        acc_val_mean = np.array(acc_set).mean()
        avg_loss_val_mean = np.array(avg_loss_set).mean()
        print('loss={}, acc={}'.format(avg_loss_val_mean, acc_val_mean))
#导入需要的包
import paddle
import paddle.fluid as fluid
import numpy as np
from paddle.fluid.dygraph.nn import Conv2D,Pool2D,Linear
# Define the LeNet network structure (3-channel variant for 224x224 inputs)
class LeNet(fluid.dygraph.Layer):
    """LeNet adapted to 3-channel 224x224 images (iChallenge-PM).

    (The original listing had its indentation stripped; structure restored.)
    """

    def __init__(self, name_scope, num_classes=1):
        super(LeNet, self).__init__(name_scope)
        # Conv + pool stages: each convolution uses a sigmoid activation and is
        # followed by a 2x2 max pooling with stride 2.
        self.conv1 = Conv2D(num_channels=3, num_filters=6, filter_size=5, act='sigmoid')
        self.pool1 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        self.conv2 = Conv2D(num_channels=6, num_filters=16, filter_size=5, act='sigmoid')
        self.pool2 = Pool2D(pool_size=2, pool_stride=2, pool_type='max')
        # Third convolution layer
        self.conv3 = Conv2D(num_channels=16, num_filters=120, filter_size=4, act='sigmoid')
        # Fully connected layers: for a 224x224 input the conv3 output is
        # [N, 120, 50, 50], so the flattened feature length is 120*50*50 = 300000.
        self.fc1 = Linear(input_dim=300000, output_dim=64, act='sigmoid')
        self.fc2 = Linear(input_dim=64, output_dim=num_classes)

    # Forward pass of the network
    def forward(self, x):
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        # Flatten [N, C, H, W] feature maps to [N, C*H*W] before the FC layers
        x = fluid.layers.reshape(x, [x.shape[0], -1])
        x = self.fc1(x)
        x = self.fc2(x)
        return x
if __name__ == '__main__':
    # Create the model and start training
    # (num_classes=1: a single-logit binary classifier)
    with fluid.dygraph.guard():
        model = LeNet("LeNet_", num_classes=1)
        train(model)
代码结果如下所示:
start training ...
epoch: 0, batch_id: 0, loss is: [0.7495805]
epoch: 0, batch_id: 10, loss is: [1.636447]
epoch: 0, batch_id: 20, loss is: [0.723161]
epoch: 0, batch_id: 30, loss is: [0.71356595]
[validation] accuracy/loss: 0.5275000333786011/0.6923364400863647
epoch: 1, batch_id: 0, loss is: [0.6789056]
epoch: 1, batch_id: 10, loss is: [0.66355455]
epoch: 1, batch_id: 20, loss is: [0.6578954]
epoch: 1, batch_id: 30, loss is: [0.65324485]
[validation] accuracy/loss: 0.5275000333786011/0.6920405626296997
epoch: 2, batch_id: 0, loss is: [0.68819726]
epoch: 2, batch_id: 10, loss is: [0.6819553]
epoch: 2, batch_id: 20, loss is: [0.7818572]
epoch: 2, batch_id: 30, loss is: [0.70672405]
[validation] accuracy/loss: 0.5275000333786011/0.6923947334289551
epoch: 3, batch_id: 0, loss is: [0.69647765]
epoch: 3, batch_id: 10, loss is: [0.6738097]
epoch: 3, batch_id: 20, loss is: [0.6926111]
epoch: 3, batch_id: 30, loss is: [0.714018]
[validation] accuracy/loss: 0.5275000333786011/0.6918395161628723
epoch: 4, batch_id: 0, loss is: [0.67984945]
epoch: 4, batch_id: 10, loss is: [0.6825234]
epoch: 4, batch_id: 20, loss is: [0.69415]
epoch: 4, batch_id: 30, loss is: [0.6940319]
[validation] accuracy/loss: 0.5275000333786011/0.698182225227356
通过运行结果可以看出,在眼疾筛查数据集上,LeNet的loss很难下降,模型没有收敛。这是因为MNIST数据集的图片尺寸较小(28*28),但在眼疾数据集上尺寸较大(原始尺寸约为2000*2000,经过缩放之后变成224*224),LeNet模型很难进行有效分类。这说明在图片尺寸较大时,LeNet在图像分类任务上存在局限性。