一、问题分析
图像分类问题,训练集2160张图片,共12类,每类180张;测试集240张,不确定有多少种类别。
图像为彩色图像,大部分是24位色彩,个别是8位和32位,这个问题导致后面在训练过程中一直出bug,会导致Tensor的shape不一致。
所有训练集在一个文件夹下,label信息在一个单独的txt中。
二、问题解决流程
1、框架选择
选择的是百度Paddle框架,动态图模式(也能够支持静态图,这部分的区别理解还不是很深刻)。数据结构为tensor,所以所有的训练集、测试集都应该转为tensor格式后再进行训练。
2、数据、标签读取;
(1)数据解压
本地一般没什么问题,在线平台上训练需要通过代码形式解压
!unzip -qn /home/aistudio/data/data8136/cat_data_sets_models.zip
(2)label信息读取
label存在单个的txt中,需要利用Python进行读取
# Read the label file: each line is "<image path> <class id>".
# Produces data_list, a list of [path, class_id_string] pairs.
data_list = []
# 'with' guarantees the file is closed even if parsing raises.
with open("data_sets/cat_12/train_list.txt", encoding='gbk') as f:
    for line in f:
        # str.split() already returns strings, so the original
        # list(map(str, ...)) wrapper was redundant.
        fields = line.split()
        if fields:  # skip blank lines instead of appending []
            data_list.append(fields)
(3)图像样本读取
样本集在一个文件夹中,对其读取时需要根据样本名把对应的label也写进去。形成“特征”——“标签”形式的数据存储,为下一步训练做准备。
# Batch-load the training images and pair each with its label.
import os
from PIL import Image

# Build a path -> class-id map ONCE so each image is matched in O(1)
# instead of scanning data_list for every file (original was O(n*m)).
label_by_path = {entry[0]: entry[1] for entry in data_list}

train_dir = 'data_sets/cat_12/cat_12_train/'
train_image = []
for img_name in os.listdir(train_dir):
    pic = Image.open(train_dir + img_name)
    # Resize to one spatial size; convert to RGB because the sources mix
    # 8/24/32-bit color and mismatched channel counts break tensor shapes
    # during training.
    pic = pic.resize((32, 32))
    pic = pic.convert("RGB")

    # One [feature_tensor, label_tensor] pair per sample.
    # NOTE(review): VF is presumably paddle.vision.transforms.functional
    # — confirm against the notebook's import cell.
    pair = [VF.to_tensor(pic)]
    cls = label_by_path.get('cat_12_train/' + img_name)
    if cls is not None:  # same as the original: label only when listed
        pair.append(paddle.to_tensor(int(cls)))
    train_image.append(pair)
3、自定义数据集类
paddle框架训练过程中会用到DataLoader函数,其要求数据集符合下述要求
class MyDataset(Dataset):
    """Thin dataset wrapper required by paddle.io.DataLoader.

    Wraps a pre-built list of [feature_tensor, label_tensor] pairs and
    exposes the (data, label) access protocol the loader expects.
    """

    def __init__(self, num_samples, data):
        """Store the declared sample count and the pair list."""
        super(MyDataset, self).__init__()
        self.num_samples = num_samples  # dataset size reported by __len__
        self.data = data                # list of [feature, label] pairs

    def __getitem__(self, index):
        """Return the (feature, label) pair stored at *index*."""
        sample = self.data[index]
        return sample[0], sample[1]

    def __len__(self):
        """Return the dataset size declared at construction time."""
        return self.num_samples
4、模型创建
class MyNet(paddle.nn.Layer):
    """Small CNN for 32x32 RGB images: three conv(+pool) stages, then
    two linear layers producing raw class logits (no softmax)."""

    def __init__(self, num_classes):
        super(MyNet, self).__init__()
        # 32x32x3 -> conv 3x3 -> 30x30x32 -> pool 2x2 -> 15x15x32
        self.conv1 = paddle.nn.Conv2D(in_channels=3, out_channels=32,
                                      kernel_size=(3, 3))
        self.pool1 = paddle.nn.MaxPool2D(kernel_size=2, stride=2)
        # 15x15x32 -> conv 3x3 -> 13x13x64 -> pool 2x2 -> 6x6x64
        self.conv2 = paddle.nn.Conv2D(in_channels=32, out_channels=64,
                                      kernel_size=(3, 3))
        self.pool2 = paddle.nn.MaxPool2D(kernel_size=2, stride=2)
        # 6x6x64 -> conv 3x3 -> 4x4x64 = 1024 features after flatten,
        # which is why linear1 has in_features=1024.
        self.conv3 = paddle.nn.Conv2D(in_channels=64, out_channels=64,
                                      kernel_size=(3, 3))
        self.flatten = paddle.nn.Flatten()
        self.linear1 = paddle.nn.Linear(in_features=1024, out_features=64)
        self.linear2 = paddle.nn.Linear(in_features=64,
                                        out_features=num_classes)

    # BUG FIX: the original carried a half-commented-out
    # @paddle.jit.to_static decorator whose continuation line was left
    # uncommented — a syntax error. Removed entirely; paddle.jit.save with
    # an explicit input_spec achieves static-graph export at save time.
    def forward(self, x):
        """Forward pass; x is expected to be (batch, 3, 32, 32) float32."""
        x = NF.relu(self.conv1(x))
        x = self.pool1(x)
        x = NF.relu(self.conv2(x))
        x = self.pool2(x)
        x = NF.relu(self.conv3(x))
        x = self.flatten(x)
        x = NF.relu(self.linear1(x))
        return self.linear2(x)
5、模型训练
def train(model):
    """Train *model* for epoch_num epochs, evaluating after each epoch.

    Relies on module-level globals: learning_rate, batch_size, epoch_num,
    mydata (the training MyDataset), val_acc_history, val_loss_history.
    """
    print('start training ... ')
    model.train()
    opt = paddle.optimizer.Adam(learning_rate=learning_rate,
                                parameters=model.parameters())
    train_loader = paddle.io.DataLoader(mydata,
                                        shuffle=True,
                                        batch_size=batch_size,
                                        num_workers=0)
    for epoch in range(epoch_num):
        print("进入循环")  # debug trace kept from the original
        # BUG FIX: the original called model.eval() at the end of each
        # epoch and never switched back, so every epoch after the first
        # trained with the network stuck in eval mode.
        model.train()
        for batch_id, data in enumerate(train_loader()):
            x_data, y_data = data[0], data[1]
            logits = model(x_data)
            loss = NF.cross_entropy(logits, y_data)
            if batch_id % 10 == 0:
                print("epoch: {}, batch_id: {}, loss is: {}".format(
                    epoch, batch_id, loss.numpy()))
            loss.backward()
            opt.step()
            opt.clear_grad()

        # Evaluate after the epoch.
        # NOTE(review): this evaluates on the *training* loader (the
        # validation loader was commented out in the original), so the
        # "[validation]" numbers are really training-set metrics — wire in
        # a held-out split for an honest generalization estimate.
        model.eval()
        accuracies = []
        losses = []
        for batch_id, data in enumerate(train_loader()):
            x_data = data[0]
            y_data = paddle.to_tensor(data[1])
            logits = model(x_data)
            loss = NF.cross_entropy(logits, y_data)
            acc = paddle.metric.accuracy(logits, y_data)
            accuracies.append(acc.numpy())
            losses.append(loss.numpy())
        avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)
        print("[validation] accuracy/loss: {}/{}".format(avg_acc, avg_loss))
        val_acc_history.append(avg_acc)
        val_loss_history.append(avg_loss)
6、模型保存
# Export the trained network as a static-graph model so inference can
# reload it without the Python class definition being importable.
save_spec = [InputSpec(shape=[None, 3, 32, 32], dtype='float32')]
paddle.jit.save(layer=model,
                path="./work/example.dy_model/linear",
                input_spec=save_spec)
7、模型推理
# Reload the exported static-graph model for inference.
path = "./work/example.dy_model/linear"
loaded_layer = paddle.jit.load(path)
loaded_layer.eval()

# Wrap the test samples and iterate them one at a time, in order
# (shuffle=False keeps predictions aligned with the file-name list).
mydata = MyDataset(len(test_image), test_image)
test_loader = paddle.io.DataLoader(mydata,
                                   shuffle=False,
                                   batch_size=1,
                                   num_workers=0)

reslis = []
for batch_id, data in enumerate(test_loader()):
    pred = loaded_layer(data[0])  # logits for the single sample
    # topk(pred, 1) returns (values, indices); the index of the largest
    # logit is the predicted class id.
    top_index = paddle.topk(pred, 1)[1][0]
    reslis.append(int(top_index.numpy()))
# namelis holds the test-sample names and reslis the predicted labels;
# the loader did not shuffle, so the two lists line up index-for-index.
output = dict(zip(namelis, reslis))

# Write the predictions out as "<name>,<label>" rows.
import csv
fileName = "./work/result002.csv"
# BUG FIX: the csv module requires the file be opened with newline=""
# (otherwise each row gains a spurious blank line on Windows); text mode
# "w" is correct — "wb" would break csv.writer, which expects str.
with open(fileName, "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    for key, value in output.items():
        writer.writerow([key, value])