ResNet + BiLSTM Action Classification

Importing the libraries

import os
import glob
import numpy as np
import cv2
from PIL import Image
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import scipy.io
from torch.utils.data import Dataset, DataLoader

Loading the dataset

HMDB: download the HMDB51 dataset from the large human motion database and extract the RAR files into a folder named "hmdb51_org". The dataset contains about 7,000 clips across 51 classes, roughly 2 GB of video, with actions such as "drink", "run", and "shake_hands".
After extracting the RAR files, use the function below to collect the video file paths and labels.

def hmdb51Files(dataFolder):
    # Collect the paths of all video files
    videoPaths = glob.glob(os.path.join(dataFolder, '*', '*.avi'))
    # The parent folder name of each video is its class label
    labels = [os.path.basename(os.path.dirname(p)) for p in videoPaths]
    # Map label names to integer indices
    labelToIdx = {label: idx for idx, label in enumerate(sorted(set(labels)))}
    labels = [labelToIdx[label] for label in labels]
    # Pair each video path with its label so they stay aligned when shuffled
    files = list(zip(videoPaths, labels))
    # Shuffle the (path, label) pairs
    np.random.shuffle(files)
    # Rebuild the label list from the shuffled pairs so it matches files
    labels = [f[1] for f in files]
    return files, labels
dataFolder = "hmdb51_org"
files, labels = hmdb51Files(dataFolder)
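
As a quick sanity check (a small addition, not part of the original pipeline), we can print how many clips and classes were found and inspect a few shuffled (path, label) pairs:

print("%d clips across %d classes" % (len(files), len(set(labels))))
for path, label in files[:3]:
    print(label, path)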

Loading the ResNet model

We load a ResNet model directly from torchvision.models and remove its last layer, so that the 512-dimensional input to the original fully connected layer becomes the model's output:

# Load a pretrained ResNet-18 model
# (in torchvision >= 0.13, pretrained=True is deprecated in favor of
#  weights=models.ResNet18_Weights.DEFAULT)
resnet = models.resnet18(pretrained=True).cuda()

# Drop the final fully connected layer to use the model as a feature extractor
resnet_feat = torch.nn.Sequential(*list(resnet.children())[:-1])
# Switch to inference mode so BatchNorm uses its running statistics
resnet_feat.eval()
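
A quick shape check (a sketch; the dummy batch size of 4 is arbitrary) confirms that the truncated network maps each 224x224 frame to a 512-dimensional feature:

dummy = torch.randn(4, 3, 224, 224).cuda()
with torch.no_grad():
    print(resnet_feat(dummy).shape)  # torch.Size([4, 512, 1, 1])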

The structure of the ResNet-18 model is shown below:

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer2): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer3): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc): Linear(in_features=512, out_features=1000, bias=True)
)

Converting videos into time series

def get_tensor_from_video(video_path):
    """
    :param video_path: path to the video file
    :return: PyTorch tensor of preprocessed frames
    """
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    if not os.access(video_path, os.F_OK):
        print('Video file does not exist')
        return

    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    frames_list = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        else:
            # OpenCV reads frames in BGR order; convert them to RGB
            frame_count += 1
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = transform(frame).cuda()
            frames_list.append(frame)
    cap.release()

    # Stack the preprocessed frames into a single tensor
    result_frames = torch.stack(frames_list, dim=0)
    result_frames = result_frames.float()
    # Note: result_frames has shape [num_frames, channels, height, width]
    return result_frames
# Create an empty list to store the features and labels
samples = []

numFiles = len(files)
for i in range(numFiles):
    print("Reading file %d of %d...\n" % (i,numFiles))
    
    video = get_tensor_from_video(files[i][0])
    
    # Extract the features using the pre-trained ResNet-18 model
    with torch.no_grad():
        features_tensor = resnet_feat(video)
        # print(features_tensor.shape)
    # Flatten the features tensor
    features_tensor = torch.flatten(features_tensor, start_dim=1)
    # Convert the features tensor to a numpy array
    features = features_tensor.cpu().numpy()
    # Append the features and label to the samples list
    samples.append((features, files[i][1]))

# Shuffle the samples list
np.random.shuffle(samples)

# Split the samples into training and testing sets
split_idx = int(0.9 * len(samples))
train_samples = samples[:split_idx]
test_samples = samples[split_idx:]

Using the ResNet network loaded earlier, each video is converted into a sequence of 512-dimensional feature vectors and stored in the samples list, which is then split into training and validation sets at a 9:1 ratio.
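
One practical caveat: the loop above pushes every frame of a clip through ResNet at once, which can run out of GPU memory on long videos. Below is a minimal sketch of chunked extraction (the chunk size of 64 is an arbitrary choice):

def extract_features_chunked(frames, chunk=64):
    # Process frames in fixed-size chunks to bound peak GPU memory
    feats = []
    with torch.no_grad():
        for i in range(0, frames.size(0), chunk):
            f = resnet_feat(frames[i:i + chunk])
            feats.append(torch.flatten(f, start_dim=1))
    return torch.cat(feats, dim=0)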

Saving the time series

Save the training and validation sets as separate .npy files:

# Unpack the (features, label) pairs and convert them to numpy arrays.
# The feature sequences have different lengths (one row per frame),
# so they are stored as object arrays.
train_features = np.array([s[0] for s in train_samples], dtype=object)
train_labels = np.array([s[1] for s in train_samples])
test_features = np.array([s[0] for s in test_samples], dtype=object)
test_labels = np.array([s[1] for s in test_samples])


# Print the shapes of the features and labels arrays
print("Train Features shape:", train_features.shape)
print("Train Labels shape:", train_labels.shape)
print("Test Features shape:", test_features.shape)
print("Test Labels shape:", test_labels.shape)


# Save the features and labels to numpy arrays
np.save('train_features.npy', train_features)
np.save('train_labels.npy', train_labels)
np.save('test_features.npy', test_features)
np.save('test_labels.npy', test_labels)
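
A quick round-trip check (a small addition; object arrays must be loaded with allow_pickle=True):

tf = np.load('train_features.npy', allow_pickle=True)
print(tf.shape, tf[0].shape)  # (num_train_clips,) and (num_frames, 512)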

Defining the Bi-LSTM network structure

class MultiLayerBiLSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # *2 to account for the bidirectional LSTM
        self.BN = nn.BatchNorm1d(18)  # normalizes over the time dimension; assumes a fixed length of 18 frames

    def forward(self, x):
        # Initialize the hidden state and cell state with zeros
        h0 = torch.zeros(2 * self.num_layers, x.size(0), self.hidden_size).to(x.device)  # *2 for bidirectional
        c0 = torch.zeros(2 * self.num_layers, x.size(0), self.hidden_size).to(x.device)  # *2 for bidirectional
        # Forward propagate the bidirectional LSTM
        out, _ = self.lstm(x, (h0, c0))
        out = self.BN(out)
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        # Return raw logits; nn.CrossEntropyLoss applies log-softmax internally,
        # so an explicit softmax here would hurt training
        return out

Here, input_size must match the dimensionality of each feature vector in the input sequence (512 for the ResNet-18 features), num_classes is the number of action classes, and hidden_size can be chosen freely.
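
A minimal shape check on the CPU (a sketch; the batch size of 4 and the 18-frame length match the dataset code below):

m = MultiLayerBiLSTMClassifier(input_size=512, hidden_size=196, num_layers=2, num_classes=51)
x = torch.randn(4, 18, 512)
print(m(x).shape)  # torch.Size([4, 51])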

# Define the LSTM parameters
input_size = 512
hidden_size = 196
num_classes = 51

# Instantiate the LSTM model
model = MultiLayerBiLSTMClassifier(input_size, hidden_size, 2, num_classes).cuda()

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Train the LSTM model
num_epochs = 50
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []

Loading the .npy data

We subclass Dataset to load the .npy files saved earlier; my code is shown below, with batch_size set to 32.
Training set:

class trainDataset(Dataset):
    def __init__(self, filepath):
        t_f = np.load('train_features.npy', allow_pickle=True)
        # Keep only the first 18 frames of each video for training
        # (assumes every clip has at least 18 frames)
        lt_f = t_f.shape[0]
        for i in range(lt_f):
            t_f[i] = t_f[i][0:18].astype(float)

        # Stack the per-clip features into one tensor and load the labels
        train_features = torch.tensor(np.stack(list(t_f)), dtype=torch.float32)
        train_labels = torch.from_numpy(np.load('train_labels.npy', allow_pickle=True))
        idx = np.random.permutation(len(train_features))
        train_features, train_labels = train_features[idx], train_labels[idx]
        self.len = train_features.shape[0]
        self.x_data = train_features
        self.y_data = train_labels

    # Return one (features, label) pair by index
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # Return the number of samples
    def __len__(self):
        return self.len

dataset = trainDataset("")

train_loader = DataLoader(dataset=dataset,
                          batch_size=32,
                          shuffle=True)
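
Peeking at one batch (a quick check) should show features of shape (32, 18, 512) and labels of shape (32,):

xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)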

Validation set:

class testDataset(Dataset):
    def __init__(self, filepath):
        te_f = np.load('test_features.npy', allow_pickle=True)
        # Keep only the first 18 frames of each video, as for training
        lte_f = te_f.shape[0]
        for i in range(lte_f):
            te_f[i] = te_f[i][0:18].astype(float)

        # Stack the per-clip features into one tensor and load the labels
        test_features = torch.tensor(np.stack(list(te_f)), dtype=torch.float32)
        test_labels = torch.from_numpy(np.load('test_labels.npy'))
        idx = np.random.permutation(len(test_features))
        test_features, test_labels = test_features[idx], test_labels[idx]
        self.len = test_features.shape[0]
        self.x_data = test_features
        self.y_data = test_labels

    # Return one (features, label) pair by index
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # Return the number of samples
    def __len__(self):
        return self.len

test_dataset = testDataset("")

test_loader = DataLoader(dataset=test_dataset,
                          batch_size=32,
                          shuffle=True)

Training the network

t_test_acc = 0

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    train_total = 0
    train_correct = 0
    for _,train_features in enumerate(train_loader, 0):
        # Get a batch of 18-frame feature sequences and labels
        batch_features = train_features[0].cuda()
        batch_labels = train_features[1].cuda()
        # Zero the parameter gradients
        optimizer.zero_grad()
        # Forward + backward + optimize
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        # Accumulate training loss (loss.item() is the batch mean, so weight it by the batch size)
        train_loss += loss.item() * batch_features.size(0)
        _, predicted = torch.max(outputs, 1)
        train_total += batch_features.shape[0]
        train_correct += (predicted == batch_labels).sum().item()
    train_loss /= train_total
    train_accuracy = 100 * train_correct / train_total
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    # Testing
    model.eval()
    test_loss = 0.0
    test_total = 0
    test_correct = 0
    with torch.no_grad():
        for _,test_features in enumerate(test_loader, 0):
            # Get a batch of 18-frame feature sequences and labels
            batch_features = test_features[0].cuda()
            batch_labels = test_features[1].cuda()
            # Forward
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            # Accumulate testing loss (weighted by batch size, as in training)
            test_loss += loss.item() * batch_features.size(0)
            _, predicted = torch.max(outputs, 1)
            test_total += batch_features.shape[0]
            test_correct += (predicted == batch_labels).sum().item()
        # Compute epoch-level testing loss and accuracy
        test_loss /= test_total
        test_accuracy = 100 * test_correct / test_total
        test_losses.append(test_loss)
        test_accuracies.append(test_accuracy)
        if(test_accuracy > t_test_acc):
            t_test_acc = test_accuracy
            torch.save(model.state_dict(), "cnn_bilstm_highest_acc.pth")
            print("save successfully,t_acc is {:.4f}%".format(t_test_acc))
            
        # Print testing loss and accuracy
        print('Train Loss: {:.4f}, Train Accuracy: {:.2f}%  Test Loss: {:.4f}, Test Accuracy: {:.2f}%'.format( train_loss, train_accuracy, test_loss, test_accuracy))

During training the model overfits quite quickly; the best classification accuracy on the validation set reaches 59%. As training proceeds, we also save the model with the highest validation accuracy.
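The recorded loss and accuracy lists can be plotted with a few lines of matplotlib (a minimal sketch):

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(train_losses, label='train loss')
ax1.plot(test_losses, label='test loss')
ax1.set_xlabel('epoch')
ax1.legend()
ax2.plot(train_accuracies, label='train accuracy (%)')
ax2.plot(test_accuracies, label='test accuracy (%)')
ax2.set_xlabel('epoch')
ax2.legend()
plt.show()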

Visualizing the results

Next, we draw a heatmap of the model's predictions. First, load the saved model:

model1 = MultiLayerBiLSTMClassifier(input_size, hidden_size, 2, num_classes).cuda()
model1.load_state_dict(torch.load("cnn_bilstm_highest_acc.pth"))
model1.eval()  # inference mode so BatchNorm uses its running statistics

We build a confusion matrix and print it:

conf_matrix = torch.zeros(num_classes, num_classes)
with torch.no_grad():
    for _, test_features in enumerate(test_loader, 0):
        # Get a batch of 18-frame feature sequences and labels
        batch_features = test_features[0].cuda()
        batch_labels = test_features[1].cuda()
        # Forward
        outputs = model1(batch_features)
        _, p = torch.max(outputs, 1)
        # Accumulate the confusion matrix: rows are predictions, columns are ground-truth labels
        length = p.shape[0]
        for i in range(length):
            conf_matrix[p[i].item(), batch_labels[i].item()] += 1

conf_matrix = np.array(conf_matrix.cpu())  # convert the confusion matrix to a numpy array

corrects = conf_matrix.diagonal(offset=0)  # correct predictions per class (the diagonal)
per_kinds = conf_matrix.sum(axis=0)  # total test samples per ground-truth class (columns are ground truth, so sum over axis 0)

print("混淆矩阵总元素个数:{0},测试集总个数:{1}".format(int(np.sum(conf_matrix)),len(test_features)))
np.set_printoptions(threshold=np.inf)
print(conf_matrix)
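
From corrects and per_kinds it is also easy to list the weakest classes (a small addition, not in the original):

per_class_acc = corrects / np.maximum(per_kinds, 1)  # guard against empty classes
for cls in np.argsort(per_class_acc)[:5]:
    print("class %d: %.1f%% correct" % (cls, 100 * per_class_acc[cls]))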

Finally, draw this confusion matrix as a heatmap; the code is as follows:

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(10, 10))
plt.imshow(conf_matrix, cmap=plt.cm.Blues)

thresh = conf_matrix.max() / 2  # color threshold: cells above it get white text for contrast
for x in range(51):
    for y in range(51):
        # Note the indexing here is matrix[y, x], not matrix[x, y]
        info = int(conf_matrix[y, x])
        plt.text(x, y, info,
                 verticalalignment='center',
                 horizontalalignment='center',
                 color="white" if info > thresh else "black")

plt.tight_layout()  # keep the figure elements from overlapping
plt.show()

The resulting heatmap shows that the model predicts most classes fairly accurately, while certain specific classes leave room for improvement; for example, class 32 (ride_horse) is often misclassified as class 49 (walk).

The full code for this article is available on GitHub.
Note that the code should be placed at the same directory level as the "hmdb51_org" folder.
