Introduction to the SIGNS Dataset
The SIGNS dataset contains photos of hand gestures representing the digits 0 through 5:
- Training set: 1080 images (64 x 64 pixels) of hand signs for the digits 0 to 5 (180 images per digit).
- Test set: 120 images (64 x 64 pixels) of hand signs for the digits 0 to 5 (20 images per digit).
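Before writing any loading code, it helps to peek at the raw h5 files to confirm these counts and image sizes. A minimal sanity check, assuming the files sit under datasets/ and use the same keys referenced in the loading code below:

import h5py

# Inspect the training file: available keys, number of images, and image resolution
with h5py.File('datasets/train_signs.h5', 'r') as f:
    print(list(f.keys()))          # should include 'train_set_x' and 'train_set_y'
    print(f['train_set_x'].shape)  # expected: (1080, 64, 64, 3)
    print(f['train_set_y'].shape)  # expected: (1080,)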
Dataset Preparation
The SIGNS dataset used here is stored in h5 format. We first convert it into numpy arrays with the following code:
import h5py
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt

# Load the h5-format dataset into numpy arrays
def load_dataset():
    train_dataset = h5py.File('datasets/train_signs.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # train set labels
    test_dataset = h5py.File('datasets/test_signs.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # test set labels
    classes = np.array(test_dataset["list_classes"][:])  # the list of classes
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

train_data, train_label, test_data, test_label, classes = load_dataset()
When training with PyTorch, the tensors fed into the network should have shape (batch size, channels, height, width). Printing train_data.shape, however, gives (1080, 64, 64, 3): the channel dimension is last, so it has to be moved to the second position. For numpy arrays this reordering can be done with np.transpose:
# print(train_data.shape)  # (1080, 64, 64, 3)
# print(train_label.shape)  # (1, 1080)
train_data = np.transpose(train_data, (0, 3, 1, 2))  # PyTorch expects the channel dimension before the spatial dimensions
test_data = np.transpose(test_data, (0, 3, 1, 2))
# print(train_data.shape)  # (1080, 3, 64, 64)
# print(test_data.shape)  # (120, 3, 64, 64)
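The same reordering can also be done after the data has been converted to tensors, using permute. This is only an equivalent alternative, not part of the pipeline below:

# Equivalent channel reordering on a torch tensor: NHWC -> NCHW
x = torch.randn(1080, 64, 64, 3)
x = x.permute(0, 3, 1, 2).contiguous()
print(x.shape)  # torch.Size([1080, 3, 64, 64])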
Finally, convert the numpy arrays to tensors, then pack the images and labels together and wrap them in DataLoaders:
# Convert numpy arrays to tensors; flatten the (1, N) label arrays to shape (N,)
train_data = torch.from_numpy(train_data)
train_label = torch.from_numpy(train_label).reshape(-1)
test_data = torch.from_numpy(test_data)
test_label = torch.from_numpy(test_label).reshape(-1)
# Pack images and labels together and wrap them in DataLoaders
train_xy = TensorDataset(train_data, train_label)
test_xy = TensorDataset(test_data, test_label)
train = DataLoader(train_xy, batch_size=64, shuffle=True)
test = DataLoader(test_xy, batch_size=64)
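To verify that the DataLoader delivers batches in the expected layout, you can pull out the first batch (a quick check, not part of training):

images, labels = next(iter(train))
print(images.shape)  # expected: torch.Size([64, 3, 64, 64])
print(labels.shape)  # expected: torch.Size([64])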
Building the Network
Since the task is to distinguish the six gestures 0 through 5, the network's final layer must output 6 values (one logit per class). The input has shape 3 x 64 x 64. The network built here first passes the input through four convolutional blocks, reducing the spatial size to 4 x 4 with 256 channels, and then feeds the flattened features into fully connected layers. The model is defined as follows:
class signs_cnn(nn.Module):
    def __init__(self):
        super(signs_cnn, self).__init__()
        self.model_conv = nn.Sequential(  # input spatial size n = 64
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),    # 64
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 32
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),   # 32
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 16
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),  # 16
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 8
            nn.Dropout(0.5),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=5, stride=1, padding=2), # 8
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)   # 4
        )
        self.model_linear = nn.Sequential(
            nn.Linear(4*4*256, 256),
            nn.ReLU(),
            nn.Linear(256, 16),
            nn.ReLU(),
            nn.Linear(16, 6)
        )

    def forward(self, x):
        x = self.model_conv(x)
        x = x.view(x.shape[0], -1)  # flatten: one row per image
        x = self.model_linear(x)
        return x
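A dummy forward pass is a quick way to confirm the shape bookkeeping in the comments above: each MaxPool2d halves the spatial size (64 → 32 → 16 → 8 → 4), so the flattened feature vector has 4*4*256 = 4096 entries and the final output has 6 logits per image. A minimal check:

net = signs_cnn()
dummy = torch.randn(2, 3, 64, 64)   # a fake batch of two RGB 64x64 images
print(net.model_conv(dummy).shape)  # torch.Size([2, 256, 4, 4])
print(net(dummy).shape)             # torch.Size([2, 6])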
Training the Model
Set the hyperparameters, train the model, and track the loss and accuracy. Two details to note: the images (inputs) must be cast to float before being fed into the network, and the labels must be cast to long before computing the loss.
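The reason for the two casts is that nn.CrossEntropyLoss expects floating-point logits and integer (long) class indices as targets. A standalone illustration with made-up values:

ce_loss = nn.CrossEntropyLoss()
logits = torch.randn(4, 6)            # float logits: 4 samples, 6 classes
targets = torch.tensor([0, 2, 5, 1])  # class indices, dtype long
print(ce_loss(logits, targets))       # a scalar loss value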
# Use the GPU if one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = signs_cnn()
model.to(device)
loss_fn = nn.CrossEntropyLoss()
loss_fn.to(device)
learning_rate = 0.0001
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)

train_acc_list = []
train_loss_list = []
test_acc_list = []
test_loss_list = []

epochs = 150
for epoch in range(epochs):
    print("----- Epoch {} training starts -----".format(epoch + 1))
    train_loss = 0.0
    test_loss = 0.0
    train_sum, train_cor, test_sum, test_cor = 0, 0, 0, 0

    # Training phase
    model.train()
    for batch_idx, data in enumerate(train):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        labels = labels.long()           # CrossEntropyLoss expects long targets
        optimizer.zero_grad()
        outputs = model(inputs.float())  # cast images to float before the forward pass
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        # accumulate the training loss for this epoch
        train_loss += loss.item()
        # accumulate training accuracy: the predicted class is the column with the largest value
        _, predicted = torch.max(outputs.data, 1)
        train_cor += (predicted == labels).sum().item()  # number of correct predictions
        train_sum += labels.size(0)

    # Evaluation phase
    model.eval()
    with torch.no_grad():
        for batch_idx1, data in enumerate(test):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            labels = labels.long()
            outputs = model(inputs.float())
            loss = loss_fn(outputs, labels)
            test_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            test_cor += (predicted == labels).sum().item()
            test_sum += labels.size(0)

    print("Train loss:{} Train accuracy:{}% Test loss:{} Test accuracy:{}%".format(train_loss / len(train),
                                                                                   100 * train_cor / train_sum,
                                                                                   test_loss / len(test),
                                                                                   100 * test_cor / test_sum))
    train_loss_list.append(train_loss / len(train))
    train_acc_list.append(100 * train_cor / train_sum)
    test_acc_list.append(100 * test_cor / test_sum)
    test_loss_list.append(test_loss / len(test))

# Save the trained network
torch.save(model, "signs_epoch{}.pth".format(epochs))
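Once training has finished, the saved model can be reloaded for inference. A minimal sketch, assuming the file name produced by the code above and that the signs_cnn class definition is available when unpickling (recent PyTorch versions may additionally require weights_only=False in torch.load):

# Reload the pickled model and classify a single test image
model = torch.load("signs_epoch{}.pth".format(epochs), map_location=device)
model.eval()
with torch.no_grad():
    img = test_data[0:1].float().to(device)  # one image, shape (1, 3, 64, 64)
    pred = model(img).argmax(dim=1).item()
print("predicted class:", pred, "true class:", int(test_label[0]))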
Plotting the Curves
Finally, plot the loss and accuracy curves recorded over the training epochs:
fig = plt.figure()
plt.plot(range(len(train_loss_list)), train_loss_list, 'blue')
plt.plot(range(len(test_loss_list)), test_loss_list, 'red')
plt.legend(['Train Loss', 'Test Loss'], fontsize=14, loc='best')
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Loss', fontsize=14)
plt.grid()
plt.savefig('figLOSS_SIGNS_epoch{}'.format(epochs))
plt.show()

fig = plt.figure()
plt.plot(range(len(train_acc_list)), train_acc_list, 'blue')
plt.plot(range(len(test_acc_list)), test_acc_list, 'red')
plt.legend(['Train Accuracy', 'Test Accuracy'], fontsize=14, loc='best')
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Accuracy(%)', fontsize=14)
plt.grid()
plt.savefig('figAccuracy_SIGNS_epoch{}'.format(epochs))
plt.show()