目录
1 项目构建
构建项目文件夹Banknote,以及子文件夹与py文件。
名称 | 作用 |
---|---|
data目录 | 存放原始数据与预处理后切分为训练集、验证集、测试集的数据 |
log目录 | 训练过程中使用tensorboardX保存的指标数值,如损失、精确度等 |
model_save目录 | 存放不同训练阶段的模型,最后找出个最优的用于测试集 |
config.py | 保存超参数 |
dataset_banknote.py | Banknote数据类,用于训练时获取数据 |
inference.py | 挑选模型在测试集上运行 |
model.py | 算法模型 |
preprocess.py | 对原始数据进行预处理,划分为训练集、验证集、测试集 |
trainer.py | 模型训练代码 |
2 超参数设置 config.py
config.py文件如下:
# banknote classification config
import torch


class Hyperparameter():
    """Central hyperparameter store for the banknote classification project.

    Import the shared instance elsewhere with: ``from config import HP``.
    """
    # ###################################################################
    # Data
    # ###################################################################
    # Fall back to CPU so the project also runs on machines without CUDA
    # (the original hard-coded 'cuda' and crashed on CPU-only hosts).
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    data_dir = './data/'
    data_path = './data/data_banknote_authentication.txt'  # raw dataset file
    trainset_path = './data/train.txt'
    devset_path = './data/dev.txt'
    testset_path = './data/test.txt'
    in_feature = 4   # input feature dim
    out_dim = 2      # output feature dim (classes number)
    seed = 1234      # random seed, shared by every script for reproducibility
    # ###################################################################
    # Model Structure
    # ###################################################################
    # Layer widths of the MLP: input -> hidden layers -> output logits.
    layer_list = [in_feature, 64, 128, 64, out_dim]
    # ###################################################################
    # Experiment
    # ###################################################################
    batch_size = 64
    init_lr = 1e-3       # initial learning rate
    epoch = 100
    verbose_step = 32    # steps between dev-loss evaluations / log prints
    save_step = 200      # steps between checkpoint saves


HP = Hyperparameter()
在其他py文件中导入config中的HP实例即可获取到超参数。
from config import HP
3 数据集预处理 preprocess.py
Banknote数据集约有1300条数据,前四列为四个特征值,最后一列为标签。
使用preprocess.py文件将原始数据data_banknote_authentication.txt切分为训练集、验证集、测试集(train.txt、dev.txt、test.txt)。
# preprocess original data:
# shuffle, then split into trainset / devset / testset (70% / 20% / 10%)
import numpy as np
from config import HP
import os

trainset_ratio = 0.7
devset_ratio = 0.2
testset_ratio = 0.1

# Use the shared project seed (was a hard-coded 1234) so the split is
# reproducible and consistent with the rest of the code base.
np.random.seed(HP.seed)

dataset = np.loadtxt(HP.data_path, delimiter=',')
np.random.shuffle(dataset)

n_items = len(dataset)
trainset_num = int(n_items * trainset_ratio)
devset_num = int(n_items * devset_ratio)
# The remaining rows (~testset_ratio of the data) become the test set.
trainset = dataset[:trainset_num]
devset = dataset[trainset_num:trainset_num + devset_num]
testset = dataset[trainset_num + devset_num:]

np.savetxt(fname=os.path.join(HP.data_dir, 'train.txt'), X=trainset, delimiter=',')
np.savetxt(fname=os.path.join(HP.data_dir, 'dev.txt'), X=devset, delimiter=',')
np.savetxt(fname=os.path.join(HP.data_dir, 'test.txt'), X=testset, delimiter=',')
4 数据集类构建 dataset_banknote.py
创建BanknoteDataset类,继承torch.utils.data中的Dataset类,重写__getitem__与__len__方法。
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from config import HP
class BanknoteDataset(Dataset):
    """Dataset wrapper around one of the preprocessed CSV split files."""

    def __init__(self, data_path):
        """data path : trainset, devset or testset file."""
        self.dataset = np.loadtxt(data_path, delimiter=',')

    def __getitem__(self, idx):
        """Return the (features, label) pair stored at row ``idx``."""
        row = self.dataset[idx]
        features = row[:HP.in_feature]
        label = row[HP.in_feature:]
        x = torch.Tensor(features).float().to(HP.device)
        y = torch.Tensor(label).squeeze().long().to(HP.device)
        return x, y

    def __len__(self):
        """Number of samples in this split."""
        return self.dataset.shape[0]
if __name__ == '__main__':
    # Smoke test: index one sample, then iterate full batches.
    test_ds = BanknoteDataset(HP.testset_path)
    sample_x, sample_y = test_ds[0]
    loader = DataLoader(test_ds, batch_size=HP.batch_size, shuffle=True, drop_last=True)
    for batch_x, batch_y in loader:
        print('x:', batch_x.shape)
        print('y:', batch_y.shape)
5 模型构建 model.py
定义模型架构,使用多层感知机,结构为layer_list=[4(in_feature), 64, 128, 64, 2(out_dim)]。
import torch
import torch.nn as nn
from config import HP
class BanknoteClasificationModel(nn.Module):
    """Multi-layer perceptron classifier that returns raw logits.

    Args:
        layer_list: widths of all layers ``[in, hidden..., out]``.  Defaults
            to ``HP.layer_list`` so all existing callers are unaffected.
    """

    def __init__(self, layer_list=None):
        super(BanknoteClasificationModel, self).__init__()
        dims = HP.layer_list if layer_list is None else layer_list
        self.linear_layer = nn.ModuleList(
            [nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(dims[:-1], dims[1:])]
        )

    def forward(self, input_x):
        """Forward pass: Linear+ReLU blocks, with NO activation on the output.

        BUG FIX: the original applied ReLU after the final layer as well,
        clamping negative logits to zero before CrossEntropyLoss and
        weakening the training signal.
        """
        last = len(self.linear_layer) - 1
        for i, layer in enumerate(self.linear_layer):
            input_x = layer(input_x)
            if i < last:
                input_x = torch.relu(input_x)
        return input_x
if __name__ == '__main__':
    # Smoke test: a random batch of 32 samples through the network.
    dummy_input = torch.randn(size=(32, 4)).float().to(HP.device)
    net = BanknoteClasificationModel().to(HP.device)
    logits = net(dummy_input)
    print(logits.shape)
6 训练trainer.py
在训练过程中,每个批次batch的训练,算为一个step。在超参数verbose_step和save_step中设定了计算验证集损失和保存模型的step间隔。
import os.path
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
from tensorboardX import SummaryWriter
from dataset_banknote import BanknoteDataset
from model import BanknoteClasificationModel
from config import HP
from argparse import ArgumentParser
from inference import accuracy
# Seed every RNG source so results are reproducible (HP.seed is the single
# source of truth shared across the project).
random.seed(HP.seed)
torch.manual_seed(HP.seed)
torch.cuda.manual_seed(HP.seed)
np.random.seed(HP.seed)
# TensorBoard event writer; inspect with: tensorboard --logdir=./log
logger=SummaryWriter(logdir='./log')
def evaluate(model, dev_dataloader, crit):
    """Return the mean loss of ``model`` over ``dev_dataloader``.

    Gradients are disabled for the pass; the model is restored to training
    mode before returning so the caller's loop can continue.
    """
    model.eval()  # switch to evaluation mode
    total = 0.
    with torch.no_grad():
        for x, y in dev_dataloader:
            total += crit(model(x), y).item()
    model.train()  # hand the model back in training mode
    return total / len(dev_dataloader)
def save_checkpoint(model_, optim_, epoch_, checkpoint_path):
    """Persist model weights, optimizer state and the epoch index to disk."""
    torch.save(
        {
            'model_state_dict': model_.state_dict(),
            'optim_state_dict': optim_.state_dict(),
            'epoch': epoch_,
        },
        checkpoint_path,
    )
def train(acc=False):
    """Run the full training loop.

    Command line: ``--c <checkpoint>`` resumes from a saved checkpoint.
    BUG FIX: the old default pointed at 'model_save/model_40_600.pth', so a
    fresh checkout always tried to resume and crashed; the default is now
    empty, meaning train from scratch.

    Args:
        acc: when True, report train/dev/test accuracy after every epoch.
    """
    parser = ArgumentParser(description='Model Training')
    parser.add_argument('--c', default='', type=str,
                        help='resume training from this checkpoint (empty = train from scratch)')
    args = parser.parse_args()
    # model instance
    model = BanknoteClasificationModel().to(HP.device)
    # loss function (expects raw logits)
    criterion = nn.CrossEntropyLoss()
    # optimizer
    opt = optim.Adam(model.parameters(), lr=HP.init_lr)
    # train dataloader
    train_set = BanknoteDataset(HP.trainset_path)
    train_dataloader = DataLoader(train_set, batch_size=HP.batch_size, shuffle=True, drop_last=True)
    # dev dataloader (evaluation)
    dev_set = BanknoteDataset(HP.devset_path)
    dev_dataloader = DataLoader(dev_set, batch_size=HP.batch_size, shuffle=True, drop_last=True)
    # test dataloader (one full-size batch)
    test_set = BanknoteDataset(HP.testset_path)
    test_dataloader = DataLoader(test_set, batch_size=len(test_set))

    start_epoch, step = 0, 0
    if args.c:
        checkpoint = torch.load(args.c)  # fixed 'checkpiont' typo
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optim_state_dict'])
        start_epoch = checkpoint['epoch']
    else:
        print('Training From Scratch !')

    os.makedirs('model_save', exist_ok=True)  # checkpoint dir may not exist yet
    eval_loss = float('inf')  # defined even before the first dev evaluation
    model.train()
    for epoch in range(start_epoch, HP.epoch):
        print('Epoch : %d' % (epoch))
        for x, y in train_dataloader:
            opt.zero_grad()            # clear gradients
            pred = model(x)            # forward pass
            loss = criterion(pred, y)  # loss calc
            loss.backward()            # backward pass
            opt.step()
            logger.add_scalar('Loss/Train', loss.item(), step)
            if not step % HP.verbose_step:  # periodic dev-loss evaluation
                eval_loss = evaluate(model, dev_dataloader, criterion)
                logger.add_scalar('Loss/Dev', eval_loss, step)
            if not step % HP.save_step:     # periodic checkpointing
                mode_path = 'model_%d_%d.pth' % (epoch, step)
                save_checkpoint(model, opt, epoch, os.path.join('model_save', mode_path))
                print('Epoch:[%d,%d] Step:%d Train Loss:%0.3f Dev Loss:%0.3f' %
                      (epoch + 1, HP.epoch, step, loss.item(), eval_loss))
            step += 1
        if acc:  # per-epoch accuracy on all three splits
            train_acc = accuracy(model, train_dataloader)
            dev_acc = accuracy(model, dev_dataloader)
            test_acc = accuracy(model, test_dataloader)
            print('Epoch:%d Train Accuracy:%0.3f' % (epoch, train_acc))
            print('Epoch:%d Dev Accuracy:%0.3f' % (epoch, dev_acc))
            print('Epoch:%d Test Accuracy:%0.3f' % (epoch, test_acc))
            logger.add_scalar('Acccuracy/Train', train_acc, step)
            logger.add_scalar('Acccuracy/Dev', dev_acc, step)
            logger.add_scalar('Acccuracy/Test', test_acc, step)
        logger.flush()  # push pending events to disk
    logger.close()
if __name__=='__main__':
    # Entry point: launch training (parses --c from the command line).
    train()
可以发现,使用随机初始化的模型参数,一开始损失就比较低了。
随着训练的进行,损失更是微乎其微,说明模型拟合得不错。
6.1 查看损失曲线
在模型训练完之后,可以在Terminal中进入log目录,执行tensorboard --logdir=./,粘贴网址到浏览器,即可观察到保存的损失曲线与精确度曲线(注!由于当前使用的是pytorch框架,必须在Terminal中进入装有tensorflow的环境,才能正常执行该命令)。
上图分别是验证集和训练集的损失曲线,我们可以在验证集损失曲线中找一个损失刚进入平稳状态的step,来确定在测试集上使用哪个保存的模型(过高的step可能会过拟合)。
7 在测试集上运行 inference.py
选择模型在测试集上运行,计算精确度。
# 选择合适的模型,在测试集运行
import torch
from torch.utils.data import Dataset,DataLoader
from config import HP
from dataset_banknote import BanknoteDataset
from model import BanknoteClasificationModel
import os
def accuracy(model, data_loader):
    """Return the classification accuracy of ``model`` over ``data_loader``.

    Fixes vs. original:
      * returns a plain Python float instead of a 0-dim tensor, so callers
        can format/log it without implicit tensor conversion;
      * restores training mode on exit, consistent with ``evaluate`` in
        trainer.py (the original left the model in eval mode when called
        mid-training);
      * empty loader yields 0.0 instead of ZeroDivisionError.
    """
    model.eval()
    correct = 0
    total_cnt = 0
    with torch.no_grad():
        for x, y in data_loader:
            pred = model(x)
            correct += (torch.argmax(pred, dim=1) == y).sum().item()
            total_cnt += y.shape[0]
    model.train()  # hand the model back in training mode
    return correct / total_cnt if total_cnt else 0.0
if __name__ == '__main__':
    # Evaluate a hand-picked checkpoint on the held-out test split.
    testset = BanknoteDataset(HP.testset_path)
    test_dataloader = DataLoader(testset, batch_size=len(testset))
    model = BanknoteClasificationModel()
    # map_location lets a GPU-trained checkpoint load on a CPU-only machine
    # (without it, torch.load fails when CUDA is unavailable).
    checkpoint = torch.load(os.path.join('model_save', 'model_26_400.pth'),
                            map_location=HP.device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(HP.device)
    test_acc = accuracy(model, test_dataloader)
    print('test acc:%0.3f' % test_acc)
可以发现精确度已经到达了100%。
test acc:1.000
链接:https://pan.baidu.com/s/1QH41vyr2EfbqUMmooqWrwg?pwd=ohph 提取码:ohph