目录:
1、基于crnn图像序列预测-pytorch代码实现——加载自己的数据集
2、基于crnn图像序列预测-pytorch代码实现——模型介绍
3、基于crnn图像序列预测-pytorch代码实现——训练过程及常见错误
在这里以VGG_LSTM为例,优化算法选的是Adam,损失函数是CrossEntropyLoss(),详细训练代码如下:
def run_epoch(model, loader, loss_func, device, optimizer=None):
    """Run one pass over ``loader`` and return summed statistics.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in evaluation mode and
    gradients are disabled.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        loader: iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        loss_func: criterion returning the *mean* loss over a batch
            (the default reduction of ``nn.CrossEntropyLoss``).
        device: ``torch.device`` the batches are moved to.
        optimizer: optimizer to step, or ``None`` for evaluation.

    Returns:
        ``(total_loss, total_correct, num_samples)`` where ``total_loss`` is
        the per-sample loss summed over all samples seen.
    """
    training = optimizer is not None
    # Switch mode explicitly on every pass: evaluation leaves the model in
    # eval mode, which would otherwise leak into the next training epoch.
    model.train(training)
    total_loss = 0.0
    total_correct = 0
    num_samples = 0
    with torch.set_grad_enabled(training):
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            out = model(batch_x)
            loss = loss_func(out, batch_y)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            batch_len = batch_y.size(0)
            # The criterion returns a batch mean; weight it by the batch size
            # so that dividing by the sample count yields a per-sample mean.
            total_loss += loss.item() * batch_len
            total_correct += (out.argmax(dim=1) == batch_y).sum().item()
            num_samples += batch_len
    return total_loss, total_correct, num_samples


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = VGG_LSTM()
    print(model)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    for epoch in range(100):
        print('epoch {}'.format(epoch + 1))
        train_loss, train_acc, train_seen = run_epoch(
            model, train_loader, loss_func, device, optimizer=optimizer)
        # max(..., 1) guards against an empty loader.
        print('Train Loss: {:.6f}, Acc: {:.6f}'.format(
            train_loss / max(train_seen, 1), float(train_acc) / max(train_seen, 1)))
        # -----------------------evaluation--------------------------------
        eval_loss, eval_acc, eval_seen = run_epoch(
            model, test_loader, loss_func, device)
        print('Test Loss: {:.6f}, Acc: {:.6f}'.format(
            eval_loss / max(eval_seen, 1), float(eval_acc) / max(eval_seen, 1)))
整个过程,包括数据加载,模型到训练的详细代码如下:
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision.models as models
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import torch.optim as optim
BATCH_SIZE = 4
learning_rate = 0.0001

# Training-time preprocessing: a random crop rescaled to 224x224 (acts as
# data augmentation), followed by conversion to a tensor.
# NOTE(review): no Normalize step is applied although the VGG16 backbone is
# loaded with pretrained weights — confirm that omitting it is intentional.
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
])

# Evaluation-time preprocessing must be deterministic so that test metrics
# are reproducible: resize, then take the central 224x224 crop instead of a
# random one.
val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])
def default_loader(path):
    """Load the image stored at ``path`` and return it as an RGB PIL image."""
    # Open the file ourselves so the handle is closed deterministically;
    # ``convert`` forces the pixel data to be read before the file is closed.
    with open(path, 'rb') as fh:
        return Image.open(fh).convert('RGB')


class MyDataset(Dataset):
    """Dataset yielding windows of consecutive frames from an index file.

    Every non-blank line of the index file has the form
    ``<image path> <integer label>``. Entries are sorted by path, and each
    sample is a window of ``num_samples_per_iteration`` consecutive entries
    labelled with the label of the entry that immediately follows the window.

    Args:
        txt: path of the index file.
        transform: optional callable applied to every loaded frame.
        target_transform: optional callable applied to the returned label.
        loader: callable turning an image path into an image object.
    """

    def __init__(self, txt, transform=None, target_transform=None, loader=default_loader):
        imgs = []
        # The context manager guarantees the index file is closed.
        with open(txt, 'r') as fh:
            for line in fh:
                words = line.split()
                if not words:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                imgs.append((words[0], int(words[1])))
        # Sort by path so that neighbouring entries form a frame sequence.
        imgs.sort(key=lambda item: item[0])
        self.num_samples = len(imgs)
        self.num_samples_per_iteration = 9
        self.imgs = imgs
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __getitem__(self, index):
        """Return ``(frames, label)`` for a randomly positioned window.

        NOTE: ``index`` is not used; the window position is drawn at random
        on every call, so repeated calls with the same index differ.

        Raises:
            ValueError: if the index file holds too few entries to form one
                window plus the entry that provides its label.
        """
        window = self.num_samples_per_iteration
        if self.num_samples <= window:
            raise ValueError(
                'need more than {} entries to sample a window, got {}'.format(
                    window, self.num_samples))
        # Position of the entry right after the window; its label is the target.
        current_index = np.random.randint(window, self.num_samples)
        current_label = self.imgs[current_index][1]
        current_imgs = []
        for fn, _ in self.imgs[current_index - window:current_index]:
            img = self.loader(fn)
            if self.transform is not None:
                img = self.transform(img)
            current_imgs.append(img)
        # Stack along a new leading axis: [window, *frame_shape].
        batch_cur_imgs = np.stack(current_imgs, axis=0)
        if self.target_transform is not None:
            current_label = self.target_transform(current_label)
        return batch_cur_imgs, current_label

    def __len__(self):
        """Return the number of entries read from the index file."""
        return len(self.imgs)
# Build the training and test datasets from their index files.
train_data = MyDataset(txt='trainset256.txt', transform=train_transforms)
test_data = MyDataset(txt='testset256.txt', transform=val_transforms)

# Wrap each dataset in a loader that yields BATCH_SIZE samples per step.
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)

# Report how many index entries each split contains.
print('num_of_trainData:', len(train_data))
print('num_of_testData:', len(test_data))
class VGG_LSTM(nn.Module):
    """VGG16 frame encoder followed by an LSTM and a 3-way classifier head.

    The input is a batch of frame sequences. Every frame is encoded by VGG16
    with its fully connected classifier removed, the per-frame features are
    fed to an LSTM, and the LSTM's final hidden state is mapped to 3 scores.

    Args:
        lstm_hidden_size: hidden size of the LSTM.
        num_lstm_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
    """

    def __init__(self, lstm_hidden_size=256, num_lstm_layers=1, bidirectional=True):
        super(VGG_LSTM, self).__init__()
        net = models.vgg16(pretrained=True)
        # Replace VGG's classifier with an empty module so the network
        # outputs the convolutional features instead of class scores.
        net.classifier = nn.Sequential()
        self.num_directions = 2 if bidirectional else 1
        self.num_lstm_layers = num_lstm_layers
        self.lstm_hidden_size = lstm_hidden_size
        # Per-frame encoder; the LSTM below expects 512 * 7 * 7 features
        # per frame.
        self.features = net
        self.lstm1 = nn.LSTM(input_size=512 * 7 * 7,
                             hidden_size=lstm_hidden_size,
                             num_layers=num_lstm_layers,
                             batch_first=True,
                             # Dropout only acts between stacked layers; with
                             # a single layer it has no effect and only
                             # triggers a warning.
                             dropout=0.5 if num_lstm_layers > 1 else 0.0,
                             bidirectional=bidirectional)
        self.linear1 = nn.Sequential(
            nn.Linear(lstm_hidden_size * self.num_directions * num_lstm_layers, 64),
            nn.ReLU(inplace=True))
        self.output_layer = nn.Linear(64, 3)

    def init_hidden(self, x):
        """Return zero-filled ``(h_0, c_0)`` for a batch shaped like ``x``.

        Both states have shape
        ``[num_layers * num_directions, x.size(0), lstm_hidden_size]`` and
        share the dtype and device of ``x``, so the model works on CPU and
        GPU alike.
        """
        shape = (self.num_directions * self.num_lstm_layers,
                 x.size(0),
                 self.lstm_hidden_size)
        return x.new_zeros(shape), x.new_zeros(shape)

    def forward(self, x):
        """Classify a batch of frame sequences.

        Args:
            x: tensor of shape ``[B, T, C, H, W]``
                (e.g. ``[B, 9, 3, 224, 224]``).

        Returns:
            Tensor of shape ``[B, 3]`` with one score per class.

        Raises:
            ValueError: if ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                'expected input of shape [B, T, C, H, W], got {}'.format(tuple(x.shape)))
        batch_size, seq_len = x.size(0), x.size(1)
        # Fold time into the batch axis so every frame is encoded independently.
        frames = x.reshape(batch_size * seq_len, *x.shape[2:])
        feats = self.features(frames)
        # Unfold back to [B, T, features]; rows are already ordered as
        # (batch, time), so a plain reshape keeps each sequence together.
        feats = feats.reshape(batch_size, seq_len, -1)
        h0, c0 = self.init_hidden(feats)
        # h_n: [num_layers * num_directions, B, lstm_hidden_size]
        _, (h_n, _) = self.lstm1(feats, (h0, c0))
        # -> [B, num_layers * num_directions * lstm_hidden_size]
        h_n = h_n.transpose(0, 1).reshape(batch_size, -1)
        output = self.linear1(h_n)  # [B, 64]
        output = self.output_layer(output)  # [B, 3]
        return output
def run_epoch(model, loader, loss_func, device, optimizer=None):
    """Run one pass over ``loader`` and return summed statistics.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in evaluation mode and
    gradients are disabled.

    Args:
        model: module mapping a batch of inputs to per-class scores.
        loader: iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        loss_func: criterion returning the *mean* loss over a batch
            (the default reduction of ``nn.CrossEntropyLoss``).
        device: ``torch.device`` the batches are moved to.
        optimizer: optimizer to step, or ``None`` for evaluation.

    Returns:
        ``(total_loss, total_correct, num_samples)`` where ``total_loss`` is
        the per-sample loss summed over all samples seen.
    """
    training = optimizer is not None
    # Switch mode explicitly on every pass: evaluation leaves the model in
    # eval mode, which would otherwise leak into the next training epoch.
    model.train(training)
    total_loss = 0.0
    total_correct = 0
    num_samples = 0
    with torch.set_grad_enabled(training):
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            out = model(batch_x)
            loss = loss_func(out, batch_y)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            batch_len = batch_y.size(0)
            # The criterion returns a batch mean; weight it by the batch size
            # so that dividing by the sample count yields a per-sample mean.
            total_loss += loss.item() * batch_len
            total_correct += (out.argmax(dim=1) == batch_y).sum().item()
            num_samples += batch_len
    return total_loss, total_correct, num_samples


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = VGG_LSTM()
    print(model)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    for epoch in range(100):
        print('epoch {}'.format(epoch + 1))
        train_loss, train_acc, train_seen = run_epoch(
            model, train_loader, loss_func, device, optimizer=optimizer)
        # max(..., 1) guards against an empty loader.
        print('Train Loss: {:.6f}, Acc: {:.6f}'.format(
            train_loss / max(train_seen, 1), float(train_acc) / max(train_seen, 1)))
        # -----------------------evaluation--------------------------------
        eval_loss, eval_acc, eval_seen = run_epoch(
            model, test_loader, loss_func, device)
        print('Test Loss: {:.6f}, Acc: {:.6f}'.format(
            eval_loss / max(eval_seen, 1), float(eval_acc) / max(eval_seen, 1)))
常见错误:
1、报错:size mismatch,如下图:
解决办法:报这样的错误是因为网络中相邻层之间的维度不匹配。要明确每一步输入、输出的size,可以通过debug查看每一步的size。另外还要注意LSTM的输入输出格式,详细可以参考pytorch官网文档。
2、报错:out of memory,如下图:
解决办法:
一、调小batch_size的值;
二、输入图片变小一些,可以使用resize;
三、换显卡。
另外需要注意的是,并不是卷积部分网络越深越好,像在我的任务中,卷积部分迁移VGG、RESNET准确率反而下降。所以我们需要根据具体的任务来调整网络结构。