PyTorch + CNN + LSTM + word embeddings

Video classification with word-embedding label targets
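The script below builds a frame-level CNN encoder for two UCF101 classes (ApplyEyeMakeup, BandMarching). Instead of classifying with cross-entropy over class logits, it compresses each clip into a 100-dim vector and regresses that vector onto a fixed word embedding of the clip's class name (a torch.nn.Embedding lookup) using SmoothL1Loss. The CRNN decoder from the original pipeline is still constructed, but it is not used in the training loss.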

# -*- coding: utf-8 -*-
"""
Created on Fri Nov  6 12:53:02 2020

@author: HUANGYANGLAI
"""

import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision
from torch.autograd import Variable
import matplotlib.pyplot as plt
from functions import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
import pickle

data_path='I:\\test\\image'
save_model_path = "I:\\test\\CRNN_ckpt\\"

# EncoderCNN architecture
CNN_fc_hidden1, CNN_fc_hidden2 = 1024, 768  # encoder FC hidden layer sizes
CNN_embed_dim = 512      # latent dim extracted by 2D CNN
img_x, img_y = 256, 342  # resize video 2D frame size
dropout_p = 0.3          # dropout probability


# DecoderRNN architecture
RNN_hidden_layers = 3    # number of stacked RNN layers
RNN_hidden_nodes = 512   # hidden nodes per layer
RNN_FC_dim = 256


# training parameters
k = 2               # number of target categories (used by DecoderRNN below)
epochs = 10         # training epochs
batch_size = 1
learning_rate = 1e-2
log_interval = 10   # interval (in batches) for displaying training info

# Select which frames to use from each video
begin_frame, end_frame, skip_frame = 1, 90, 2  # np.arange(1, 90, 2) -> 45 frames per clip

# flag intended to freeze the word embeddings after the first pass (unused below)
number1 = 0

class EncoderCNN1(nn.Module):
    def __init__(self, img_x=90, img_y=120, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        super(EncoderCNN1, self).__init__()

        self.img_x = img_x
        self.img_y = img_y
        self.CNN_embed_dim = CNN_embed_dim

        # CNN architectures
        self.ch1, self.ch2, self.ch3, self.ch4 = 32, 64, 128, 256
        self.k1, self.k2, self.k3, self.k4 = (5, 5), (3, 3), (3, 3), (3, 3)      # 2d kernel size
        self.s1, self.s2, self.s3, self.s4 = (2, 2), (2, 2), (2, 2), (2, 2)      # 2d strides
        self.pd1, self.pd2, self.pd3, self.pd4 = (0, 0), (0, 0), (0, 0), (0, 0)  # 2d padding

        # conv2D output shapes
        self.conv1_outshape = conv2D_output_size((self.img_x, self.img_y), self.pd1, self.k1, self.s1)  # Conv1 output shape
        self.conv2_outshape = conv2D_output_size(self.conv1_outshape, self.pd2, self.k2, self.s2)
        self.conv3_outshape = conv2D_output_size(self.conv2_outshape, self.pd3, self.k3, self.s3)
        self.conv4_outshape = conv2D_output_size(self.conv3_outshape, self.pd4, self.k4, self.s4)
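        # conv2D_output_size is star-imported from functions.py. A minimal sketch of
        # what it is assumed to compute (the standard Conv2d output-shape formula):
        #   def conv2D_output_size(img_size, padding, kernel_size, stride):
        #       return (int((img_size[0] + 2 * padding[0] - kernel_size[0]) / stride[0] + 1),
        #               int((img_size[1] + 2 * padding[1] - kernel_size[1]) / stride[1] + 1))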

        # fully connected layer hidden nodes
        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=self.ch1, kernel_size=self.k1, stride=self.s1, padding=self.pd1),
            nn.BatchNorm2d(self.ch1, momentum=0.01),
            nn.ReLU(inplace=True),                      
            # nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2, stride=self.s2, padding=self.pd2),
            nn.BatchNorm2d(self.ch2, momentum=0.01),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch2, out_channels=self.ch3, kernel_size=self.k3, stride=self.s3, padding=self.pd3),
            nn.BatchNorm2d(self.ch3, momentum=0.01),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch3, out_channels=self.ch4, kernel_size=self.k4, stride=self.s4, padding=self.pd4),
            nn.BatchNorm2d(self.ch4, momentum=0.01),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.drop = nn.Dropout2d(self.drop_p)
        self.pool = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(self.ch4 * self.conv4_outshape[0] * self.conv4_outshape[1], self.fc_hidden1)   # fully connected layer on the flattened conv output
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.CNN_embed_dim)   # output = CNN embedding latent variables
        # fc4 collapses the per-frame embedding sequence into one clip-level vector
        self.fc4 = nn.Sequential(
            nn.Linear(23040, 100),
            nn.Tanh(),
            )
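        # Note: 23040 is hard-coded as 45 frames x 512 embedding dims (np.arange(1, 90, 2)
        # selects 45 frames); changing the frame settings or CNN_embed_dim means updating it.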

    def forward(self, x_3d):
        cnn_embed_seq = []
        print('x_3d shape', x_3d.size())  # torch.Size([1, 45, 3, 256, 342]) with the frame settings above
        print('number of frames', x_3d.size(1))
        for t in range(x_3d.size(1)):
            # CNNs
            #print('x_3d[:, t, :, :, :]', x_3d[:, t, :, :, :].size())  # torch.Size([1, 3, 256, 342])
            x = self.conv1(x_3d[:, t, :, :, :])  # torch.Size([1, 32, 126, 169])
            #print('self.conv1(x_3d[:, t, :, :, :])',x.size())
            x = self.conv2(x)
            #print('self.conv2(x_3d[:, t, :, :, :])',x.size())#torch.Size([1, 64, 62, 84])
            x = self.conv3(x)
            #print('self.conv3(x_3d[:, t, :, :, :])',x.size())#torch.Size([1, 128, 30, 41])
            x = self.conv4(x)
            #print('x shape', x.size())  # torch.Size([1, 256, 14, 20])
            x = x.view(x.size(0), -1)           # flatten the conv output
            #print('flattened x shape', x.size())  # torch.Size([1, 71680])
            # FC layers
            x = F.relu(self.fc1(x))
            #print('F.relu(self.fc1(x))', x.size())#torch.Size([1, 1024])
            # x = F.dropout(x, p=self.drop_p, training=self.training)
            x = F.relu(self.fc2(x))
            #print('x = F.relu(self.fc2(x))',x.size())#torch.Size([1, 768])
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)
            #print('x = self.fc3(x)',x.size())#torch.Size([1, 512])
            cnn_embed_seq.append(x)

        # swap time and sample dims so the shape is (batch, time, CNN latent dim)
        #print('cnn_embed_seq', cnn_embed_seq)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        print('cnn_embed_seq1', cnn_embed_seq.size())  # torch.Size([1, 45, 512])
        cnn_embed_seq = cnn_embed_seq.reshape(1, 23040)  # flatten the whole sequence: 45 frames x 512 dims
        cnn_embed_seq = F.relu(self.fc4(cnn_embed_seq))
        print('cnn_embed_seq2', cnn_embed_seq.size())  # torch.Size([1, 100])
        # returned cnn_embed_seq: shape = (batch, 100)

        return cnn_embed_seq
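
# A minimal shape check for the encoder (an illustrative sketch on dummy data,
# not part of the original training run):
#   enc = EncoderCNN1(img_x=256, img_y=342, fc_hidden1=1024, fc_hidden2=768,
#                     drop_p=0.3, CNN_embed_dim=512)
#   out = enc(torch.randn(1, 45, 3, 256, 342))   # (batch, frames, C, H, W)
#   print(out.size())                            # -> torch.Size([1, 100])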


def train(log_interval, model, device, train_loader, optimizer, epoch):
    # set models to training mode
    cnn_encoder, rnn_decoder = model
    cnn_encoder.train()
    rnn_decoder.train()

    losses = []
    scores = []
    N_count = 0   # number of samples trained so far in this epoch
    for batch_idx, (X, y) in enumerate(train_loader):
        print('batch index', batch_idx)
        print('X shape', X.size())  # torch.Size([1, 45, 3, 256, 342])
        print('label y', y)         # e.g. tensor([[1]])
        # build the word-embedding target for this label
        yy = y.clone()
        yy = yy.squeeze()
        yy = yy.numpy().tolist()
        print('label index', yy)
        if yy == 0:
            y_embed = yd0
        elif yy == 1:
            y_embed = yd1

        # distribute data to the device (GPU or CPU)
        X, y = X.to(device), y.to(device).view(-1, )
        

        N_count += X.size(0)


        output = cnn_encoder(X)

        loss = loss_func(output, y_embed)
        print('output shape', output.size())  # torch.Size([1, 100])
        '''
        # original classification path, kept for reference:
        output = rnn_decoder(cnn_encoder(X))   # output has dim = (batch, number of classes)
        # print('output', output)              # e.g. tensor([[-0.0016, -0.0368]])
        # print('cnn_encoder(X)', cnn_encoder(X).size())  # (1, 45, 512)
        loss = F.cross_entropy(output, y)
        '''
        losses.append(loss.item())
        '''
        # to compute accuracy (classification path only):
        y_pred = torch.max(output, 1)[1]   # predicted class indices, y_pred != output
        step_score = accuracy_score(y.cpu().data, y_pred.cpu().data)
        # accuracy_score with normalize=True (default) returns the fraction classified correctly
        scores.append(step_score)          # computed on CPU
        '''
  
        optimizer.zero_grad()                # the usual three steps: zero grads,
        loss.backward(retain_graph=True)     # backprop (retain_graph: y_embed's graph is reused across batches),
        optimizer.step()                     # and update

        # show information once every log_interval batches
        if (batch_idx + 1) % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch + 1, N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item()))

    return losses, scores



def validation(model, device, optimizer, test_loader):
    # set models to evaluation mode
    cnn_encoder, rnn_decoder = model
    cnn_encoder.eval()
    rnn_decoder.eval()

    test_loss = 0
    all_y = []
    all_y_pred = []
    ko = 0
    with torch.no_grad():  # no gradients needed during evaluation
        for X, y in test_loader:
            # distribute data to device
            X, y = X.to(device), y.to(device).view(-1, )

            output = rnn_decoder(cnn_encoder(X))
            # print('predicted output', output)
            # print('cnn_encoder(X)', cnn_encoder(X))
            loss = F.cross_entropy(output, y, reduction='sum')  # sum the loss over the batch

            print('batch loss', loss)
            ko = ko + 1
            print('ko', ko)
            test_loss += loss.item()                 # sum up batch loss
            y_pred = output.max(1, keepdim=True)[1]  # (y_pred != output) index of the max log-probability
            # print('predicted labels', y_pred)
            # print('true labels', y)
            # collect all y and y_pred in all batches
            all_y.extend(y)
            all_y_pred.extend(y_pred)

    print('test_loss',test_loss)
    test_loss = test_loss/len(test_loader.dataset)
    
    # compute accuracy
    all_y = torch.stack(all_y, dim=0)            # stack per-batch labels into one tensor
    all_y_pred = torch.stack(all_y_pred, dim=0)
    print('all_y_pred', all_y_pred)
    print('test_loss', test_loss)
    print('len(test_loader.dataset)', len(test_loader.dataset))
    test_score = accuracy_score(all_y.cpu().data, all_y_pred.cpu().data)
    # accuracy_score with normalize=True (default) returns the fraction classified correctly

    # show information
    print('\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(len(all_y), test_loss, 100 * test_score))

    # save PyTorch models of the best record
    #torch.save(cnn_encoder.state_dict(), os.path.join(save_model_path, 'cnn_encoder_epoch{}.pth'.format(epoch + 1)))  # save spatial encoder
    #state_dict holds the conv/FC parameters; when the network contains batchnorm (e.g. VGG), it also stores the batchnorm running_mean
    #torch.save(rnn_decoder.state_dict(), os.path.join(save_model_path, 'rnn_decoder_epoch{}.pth'.format(epoch + 1)))  # save motion encoder
    #torch.save(optimizer.state_dict(), os.path.join(save_model_path, 'optimizer_epoch{}.pth'.format(epoch + 1)))      # save optimizer
    print("Epoch {} model saved!".format(epoch + 1))  # note: relies on a global `epoch`

    return test_loss, test_score
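# Note: validation() above is defined but never invoked below; it still uses the
# old rnn_decoder + cross-entropy path rather than the embedding-regression loss.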

# Detect devices
use_cuda = torch.cuda.is_available()                   # check if GPU exists
device = torch.device("cuda" if use_cuda else "cpu")   # use CPU or GPU

# Data loading parameters
#params = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 4, 'pin_memory': True} if use_cuda else {}
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 0, 'pin_memory': True} 

'''
# load UCF101 action names
with open(action_name_path, 'rb') as f:
    action_names = pickle.load(f)
'''


action_names=['ApplyEyeMakeup','BandMarching']



# convert labels -> category
le = LabelEncoder()
le.fit(action_names)
print('LabelEncoder:', le.fit(action_names))
# show the class names
list(le.classes_)
print('list(le.classes_)', list(le.classes_))

# convert category -> 1-hot
action_category = le.transform(action_names).reshape(-1, 1)  # encode string labels as integers
print('action_category', action_category)
enc = OneHotEncoder()     # turns each categorical value into a one-hot vector
enc.fit(action_category)  # fitting here prepares the encoder for later enc.transform calls
print('OneHotEncoder:', enc.fit(action_category))
actions = []
fnames = os.listdir(data_path)  # list all files under the data path

all_names = []
for f in fnames:
    loc1 = f.find('v_')
    loc2 = f.find('_g')
    actions.append(f[(loc1 + 2): loc2])

    all_names.append(f)


# list all data files
all_X_list = all_names                  # all video folder names
all_y_list = labels2cat(le, actions)    # the label for each video folder
print('\n')
print(all_X_list)
print(all_y_list)
print(actions)
print('\n')
train_list, test_list, train_label, test_label = train_test_split(all_X_list, all_y_list, test_size=0.5, random_state=42)
print('train_list', train_list)    # ['v_ApplyEyeMakeup_g01_c01', 'v_BandMarching_g01_c01']
print('test_list', test_list)      # ['v_ApplyEyeMakeup_g01_c02', 'v_BandMarching_g01_c02']
print('train_label', train_label)  # [0, 1]
print('test_label', test_label)    # [0, 1]

transform = transforms.Compose([transforms.Resize([img_x, img_y]),  # resize frames
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])


selected_frames = np.arange(begin_frame, end_frame, skip_frame).tolist()  # pick 45 frames per clip
print('selected_frames',selected_frames)



train_set, valid_set = Dataset_CRNN(data_path, train_list, train_label, selected_frames, transform=transform), \
                       Dataset_CRNN(data_path, test_list, test_label, selected_frames, transform=transform)
#print('train_set', train_set)  # <functions.Dataset_CRNN object at 0x0000027A97384488>
train_loader = data.DataLoader(train_set, **params)
#print('train_loader', train_loader)
valid_loader = data.DataLoader(valid_set, **params)

cnn_encoder = EncoderCNN1(img_x=img_x, img_y=img_y, fc_hidden1=CNN_fc_hidden1, fc_hidden2=CNN_fc_hidden2,
                         drop_p=dropout_p, CNN_embed_dim=CNN_embed_dim).to(device)

rnn_decoder = DecoderRNN(CNN_embed_dim=CNN_embed_dim, h_RNN_layers=RNN_hidden_layers, h_RNN=RNN_hidden_nodes, 
                         h_FC_dim=RNN_FC_dim, drop_p=dropout_p, num_classes=k).to(device)

#crnn_params = list(cnn_encoder.parameters())
#optimizer = torch.optim.Adam(cnn_encoder.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(cnn_encoder.parameters(), lr=learning_rate)  # only the CNN encoder's parameters are optimized
#loss_func = torch.nn.MSELoss()
loss_func = torch.nn.SmoothL1Loss()
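# SmoothL1Loss regresses the encoder's (1, 100) clip vector onto the (1, 100) word
# embedding of the true class built below; no softmax or cross-entropy is involved.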

epoch_train_losses = []
epoch_train_scores = []

############################################# word embeddings
word_to_ix = {'ApplyEyeMakeup': 0, 'BandMarching': 1}
idex_to_word = {word_to_ix[word]: word for word in word_to_ix}
embeds = torch.nn.Embedding(2, 100)  # one 100-dim vector per class
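# The embedding table is randomly initialized and its weights are never given to the
# optimizer, so the two class vectors act as fixed regression targets within a run.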

y_idx0 = torch.LongTensor([word_to_ix['ApplyEyeMakeup']])
print('y_idx0', y_idx0.size())
y_embed00 = embeds(y_idx0)
print('y_embed00', y_embed00.size())  # torch.Size([1, 100])

yd0 = y_embed00

y_idx1 = torch.LongTensor([word_to_ix['BandMarching']])  # fixed: originally looked up 'ApplyEyeMakeup', making both targets identical
print('y_idx1', y_idx1.size())
y_embed1 = embeds(y_idx1)
print('y_embed1', y_embed1.size())  # torch.Size([1, 100])

yd1 = y_embed1
###########################################
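# The script never decodes the regressed vector back to a class label. One plain way
# to do that (an illustrative addition, not part of the original code) is a
# nearest-embedding lookup over the two fixed class vectors:
def predict_label(output):
    # return the index of the class embedding closest to the encoder output (L2 distance)
    dists = [torch.dist(output, e).item() for e in (yd0, yd1)]
    return int(np.argmin(dists))
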
for epoch in range(epochs):
    train_losses, train_scores = train(log_interval, [cnn_encoder, rnn_decoder], device, train_loader, optimizer, epoch)
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)
    A = np.array(epoch_train_losses)
    B = np.array(epoch_train_scores)
    
    
fig = plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.plot(np.arange(1, epochs + 1), A[:, -1])  # train loss (on epoch end)
#plt.plot(np.arange(1, epochs + 1), C)         #  test loss (on epoch end)
plt.title("model loss")
plt.xlabel('epochs')
plt.ylabel('loss')
#plt.legend(['train', 'test'], loc="upper left")
plt.legend(['train'], loc="upper left")
'''
# 2nd figure
plt.subplot(122)
plt.plot(np.arange(1, epochs + 1), B[:, -1])  # train accuracy (on epoch end)
#plt.plot(np.arange(1, epochs + 1), D)         #  test accuracy (on epoch end)
plt.title("training scores")
plt.xlabel('epochs')
plt.ylabel('accuracy')
#plt.legend(['train', 'test'], loc="upper left")
title = "./fig_UCF101_CRNN.png"
plt.savefig(title, dpi=600)
# plt.close(fig)
plt.show()
'''