基于crnn图像序列预测-pytorch代码实现——训练过程及常见错误

最新推荐文章于 2022-10-23 20:35:45 发布

置顶 hnu_zzt

最新推荐文章于 2022-10-23 20:35:45 发布

阅读量4.6k

点赞数 1

分类专栏：CRNN图像序列预测　文章标签：图像序列预测、卷积循环神经网络

本文链接：https://blog.csdn.net/hnu_zzt/article/details/86519448

版权

CRNN图像序列预测专栏收录该内容

8 篇文章 3 订阅

订阅专栏

目录：
1、基于crnn图像序列预测-pytorch代码实现——加载自己的数据集
2、基于crnn图像序列预测-pytorch代码实现——模型介绍
3、基于crnn图像序列预测-pytorch代码实现——训练过程及常见错误

在这里以VGG_LSTM为例，优化算法选的是Adam，损失函数是CrossEntropyLoss()，详细训练代码如下：

if __name__ == "__main__":
    # Train VGG_LSTM with Adam and cross-entropy for 100 epochs, evaluating on
    # the test loader after every epoch.
    # Use the GPU when one is available, otherwise fall back to the CPU, and
    # keep the model and every batch on that same device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = VGG_LSTM()
    print(model)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    for epoch in range(100):
        print('epoch {}'.format(epoch + 1))
        # model.eval() is called at the end of every epoch, so training mode
        # has to be restored explicitly before each training pass.
        model.train()
        train_loss = 0.
        train_acc = 0.
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            out = model(batch_x)
            loss = loss_func(out, batch_y)
            # The loss is a per-batch mean; weight it by the batch size so that
            # dividing by the dataset size below yields a per-sample average.
            train_loss += loss.item() * batch_y.size(0)
            pred = torch.max(out, 1)[1]
            train_acc += (pred == batch_y).sum().item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Train Loss: {:.6f}, Acc: {:.6f}'.format(train_loss / (len(
            train_data)), train_acc / (len(train_data))))

        # -----------------------evaluation--------------------------------
        model.eval()
        eval_loss = 0.
        eval_acc = 0.
        # no_grad() replaces the removed ``volatile=True`` flag: no autograd
        # graph is built, which saves memory during evaluation.
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                out = model(batch_x)
                loss = loss_func(out, batch_y)
                eval_loss += loss.item() * batch_y.size(0)
                pred = torch.max(out, 1)[1]
                eval_acc += (pred == batch_y).sum().item()
        print('Test Loss: {:.6f}, Acc: {:.6f}'.format(eval_loss / (len(
            test_data)), eval_acc / (len(test_data))))

整个过程，包括数据加载，模型到训练的详细代码如下：

import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision.models as models
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import torch.optim as optim

# Mini-batch size passed to the DataLoaders and step size passed to Adam.
BATCH_SIZE = 4
learning_rate = 0.0001

# Training pipeline: random scale/aspect crop resized to 224x224, then
# conversion to a float tensor. No Normalize step is applied.
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
])
# Evaluation pipeline: deterministic resize + centre crop, so the same image
# always produces the same tensor (a random crop here would make the reported
# test metrics depend on the random state).
val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])

def default_loader(path):
    """Open the image stored at ``path`` and return it converted to RGB mode."""
    image = Image.open(path)
    return image.convert('RGB')


class MyDataset(Dataset):
    """Dataset of fixed-length image sequences described by a listing file.

    Every non-blank line of ``txt`` holds an image path followed by an integer
    label, separated by whitespace. Entries are sorted by path. A sample is a
    window of ``num_samples_per_iteration`` consecutive entries together with
    the label of the entry that immediately follows the window.

    Args:
        txt: Path of the listing file.
        transform: Optional callable applied to every loaded image.
        target_transform: Optional callable applied to the returned label.
        loader: Callable that turns an image path into an image object.
    """

    def __init__(self, txt, transform=None, target_transform=None, loader=default_loader):
        imgs = []
        # The context manager guarantees the listing file is closed again.
        with open(txt, 'r') as fh:
            for line in fh:
                words = line.split()
                if not words:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                imgs.append((words[0], int(words[1])))
        # Sort by path only, so that consecutive entries form a sequence.
        imgs.sort(key=lambda item: item[0], reverse=False)
        self.num_samples = len(imgs)
        self.num_samples_per_iteration = 9
        self.imgs = imgs
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __getitem__(self, index):
        """Return ``(stacked_images, label)`` for a randomly positioned window.

        NOTE: ``index`` is ignored; the window end is drawn uniformly at
        random on every call, so repeated calls with the same index generally
        return different samples.

        Raises:
            ValueError: If the listing has too few entries to form one window
                plus its target entry.
        """
        window = self.num_samples_per_iteration
        if self.num_samples <= window:
            raise ValueError(
                'need more than {} entries to build a sequence, got {}'.format(
                    window, self.num_samples))
        # Uniform draw from [window, num_samples) without materialising the range.
        current_index = np.random.randint(window, self.num_samples)
        current_label = self.imgs[current_index][1]
        current_imgs = []
        for fn, _ in self.imgs[current_index - window:current_index]:
            img = self.loader(fn)
            if self.transform is not None:
                img = self.transform(img)
            current_imgs.append(img)
        if self.target_transform is not None:
            current_label = self.target_transform(current_label)
        # e.g. [9, 3, 224, 224] when the transform yields 3x224x224 tensors
        batch_cur_imgs = np.stack(current_imgs, axis=0)
        return batch_cur_imgs, current_label

    def __len__(self):
        """Return the number of entries in the listing file."""
        return len(self.imgs)


# Build both datasets from their listing files, then wrap each one in a
# DataLoader that batches it without shuffling.
train_data = MyDataset(transform=train_transforms, txt='trainset256.txt')
test_data = MyDataset(transform=val_transforms, txt='testset256.txt')

train_loader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_loader = DataLoader(
    test_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# Report how many entries each listing contains.
print('num_of_trainData:', len(train_data))
print('num_of_testData:', len(test_data))


class VGG_LSTM(nn.Module):
    """Pretrained VGG16 feature extractor followed by an LSTM and a 3-way head.

    Every frame of an input sequence is passed through VGG16 (with its
    classifier removed), the per-frame features are fed to the LSTM as a
    sequence, and the final hidden state is mapped to 3 output logits.

    Args:
        lstm_hidden_size: Hidden size of the LSTM.
        num_lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
    """

    def __init__(self, lstm_hidden_size=256, num_lstm_layers=1, bidirectional=True):
        super(VGG_LSTM, self).__init__()
        net = models.vgg16(pretrained=True)
        # Replace the classifier with an empty Sequential so that the network
        # is used purely as a feature extractor.
        net.classifier = nn.Sequential()
        self.num_directions = 2 if bidirectional else 1
        self.num_lstm_layers = num_lstm_layers
        self.lstm_hidden_size = lstm_hidden_size
        # [B, 3, 224, 224] -> [B, 512, 7, 7]
        self.features = net
        self.lstm1 = nn.LSTM(input_size=512 * 7 * 7,
                             hidden_size=lstm_hidden_size,
                             num_layers=num_lstm_layers,
                             batch_first=True,
                             # nn.LSTM applies dropout only between stacked
                             # layers; with a single layer it does nothing
                             # except emit a warning, so it is disabled then.
                             dropout=0.5 if num_lstm_layers > 1 else 0,
                             bidirectional=bidirectional)  # out: [B, T, num_directions * lstm_hidden_size]
        self.linear1 = nn.Sequential(nn.Linear(lstm_hidden_size * self.num_directions * num_lstm_layers, 64),
                                     nn.ReLU(inplace=True))
        self.output_layer = nn.Linear(64, 3)

    def init_hidden(self, x):
        """Return zero-filled ``(h0, c0)`` LSTM states for the batch in ``x``.

        The states are created with the dtype and device of ``x``, so the
        model works on the CPU as well as on the GPU.
        """
        batch_size = x.size(0)
        shape = (self.num_directions * self.num_lstm_layers, batch_size, self.lstm_hidden_size)
        return x.new_zeros(shape), x.new_zeros(shape)

    def forward(self, x):
        """Map a batch of frame sequences ``[B, T, C, H, W]`` to logits ``[B, 3]``."""
        # x shape: [B, T, 3, 224, 224] (T was fixed to 9 originally)
        B, T = x.size(0), x.size(1)
        # Fold the time axis into the batch axis so VGG sees ordinary images.
        x = x.reshape(B * T, *x.shape[2:])
        output = self.features(x)  # [B*T, 512, 7, 7] worth of features per frame
        # Unfold back into sequences; this equals the former
        # transpose/contiguous/view/permute chain without the extra copy.
        output = output.reshape(B, T, -1)  # -> [B, T, 512*7*7]
        h, c = self.init_hidden(output)
        output, (h, c) = self.lstm1(output, (h, c))  # h: (num_layers * num_directions, batch, lstm_hidden_size)
        # Out-of-place transpose: do not modify the LSTM's returned state.
        h = h.transpose(0, 1).contiguous().view(B, -1)  # -> [B, num_layers * num_directions*lstm_hidden_size]
        output = self.linear1(h)  # [B, 64]
        output = self.output_layer(output)  # [B, 3]
        return output


if __name__ == "__main__":
    # Train VGG_LSTM with Adam and cross-entropy for 100 epochs, evaluating on
    # the test loader after every epoch.
    # Use the GPU when one is available, otherwise fall back to the CPU, and
    # keep the model and every batch on that same device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = VGG_LSTM()
    print(model)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    for epoch in range(100):
        print('epoch {}'.format(epoch + 1))
        # model.eval() is called at the end of every epoch, so training mode
        # has to be restored explicitly before each training pass.
        model.train()
        train_loss = 0.
        train_acc = 0.
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            out = model(batch_x)
            loss = loss_func(out, batch_y)
            # The loss is a per-batch mean; weight it by the batch size so that
            # dividing by the dataset size below yields a per-sample average.
            train_loss += loss.item() * batch_y.size(0)
            pred = torch.max(out, 1)[1]
            train_acc += (pred == batch_y).sum().item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Train Loss: {:.6f}, Acc: {:.6f}'.format(train_loss / (len(
            train_data)), train_acc / (len(train_data))))

        # -----------------------evaluation--------------------------------
        model.eval()
        eval_loss = 0.
        eval_acc = 0.
        # no_grad() replaces the removed ``volatile=True`` flag: no autograd
        # graph is built, which saves memory during evaluation.
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                out = model(batch_x)
                loss = loss_func(out, batch_y)
                eval_loss += loss.item() * batch_y.size(0)
                pred = torch.max(out, 1)[1]
                eval_acc += (pred == batch_y).sum().item()
        print('Test Loss: {:.6f}, Acc: {:.6f}'.format(eval_loss / (len(
            test_data)), eval_acc / (len(test_data))))

常见错误：
1、报错：size mismatch，如下图：
在这里插入图片描述
解决办法：报这样的错误是因为网络中层与层之间的维度不匹配。要明确每一步输入、输出的size，可以通过debug查看每一步的size。另外还要注意LSTM的输入输出格式，详细可以参考pytorch官网。

2、报错：out of memory，如下图：
在这里插入图片描述
解决办法：
一、调小batch_size的值；

二、输入图片变小一些，可以使用resize；

三、换显卡。

另外需要注意的是，并不是卷积部分网络越深越好，像在我的任务中，卷积部分迁移VGG、RESNET准确率反而下降。所以我们需要根据具体的任务来调整网络结构。

hnu_zzt

关注

1
点赞
踩
20

收藏

觉得还不错? 一键收藏
3
评论
基于crnn图像序列预测-pytorch代码实现——训练过程及常见错误

在这里主要介绍三种模型，包括简单的一层卷积层的conv1_LSTM，以及卷积部分迁移学习VGG和RESNET模型的VGG_LSTM、RESNET_LSTM。常见错误：1、报错：size mismatch，如下图，报这样的错误是因为卷积层与层之间的维度不匹配。要明确每一步的输入输出的size。...
复制链接

扫一扫