# -*- coding: utf-8 -*-
"""
Video classification with word-vector (embedding) targets.

A 2D-CNN frame encoder is trained so that its per-video output regresses
onto a fixed 100-dim word embedding of the video's action class.

Created on Fri Nov  6 12:53:02 2020
@author: HUANGYANGLAI
"""
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision
from torch.autograd import Variable
import matplotlib.pyplot as plt
from functions import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
import pickle

data_path = 'I:\\test\\image'              # root folder of extracted video frames
save_model_path = "I:\\test\\CRNN_ckpt\\"  # checkpoint output folder

# EncoderCNN architecture
CNN_fc_hidden1, CNN_fc_hidden2 = 1024, 768  # encoder FC hidden-layer sizes
CNN_embed_dim = 512       # latent dim extracted by the 2D CNN per frame
img_x, img_y = 256, 342   # resized video 2D frame size
dropout_p = 0.3           # dropout probability

# DecoderRNN architecture
RNN_hidden_layers = 3     # number of stacked RNN layers
RNN_hidden_nodes = 512    # hidden nodes per RNN layer
RNN_FC_dim = 256

# training parameters
# k = 2                   # number of target categories
epochs = 10               # training epochs
batch_size = 1
learning_rate = 1e-2
log_interval = 10         # interval (in batches) for displaying training info

# Select which frame to begin & end in videos
begin_frame, end_frame, skip_frame = 1, 90, 2  # frames 1..89, every 2nd frame

# used to pin down the word vectors on the first pass
number1 = 0
class EncoderCNN1(nn.Module):
    """2D-CNN frame encoder producing one flattened video embedding.

    Each frame of the input clip is passed through four conv blocks and
    three FC layers to a ``CNN_embed_dim`` vector; the per-frame vectors
    are stacked over time, flattened per sample, and projected by ``fc4``
    to a 100-dim output (matched to the word-embedding targets).

    NOTE(review): ``fc4`` assumes time_steps * CNN_embed_dim == 23040
    (e.g. 45 frames x 512 dims with the defaults used by this script) —
    other frame counts / embed dims will fail at the Linear layer.
    """

    def __init__(self, img_x=90, img_y=120, fc_hidden1=512, fc_hidden2=512,
                 drop_p=0.3, CNN_embed_dim=300):
        super(EncoderCNN1, self).__init__()

        self.img_x = img_x
        self.img_y = img_y
        self.CNN_embed_dim = CNN_embed_dim

        # CNN architecture: per-layer channels, kernels, strides, padding
        self.ch1, self.ch2, self.ch3, self.ch4 = 32, 64, 128, 256
        self.k1, self.k2, self.k3, self.k4 = (5, 5), (3, 3), (3, 3), (3, 3)  # 2D kernel sizes
        self.s1, self.s2, self.s3, self.s4 = (2, 2), (2, 2), (2, 2), (2, 2)  # 2D strides
        self.pd1, self.pd2, self.pd3, self.pd4 = (0, 0), (0, 0), (0, 0), (0, 0)  # 2D padding

        # conv2D output shapes (helper from functions.py)
        self.conv1_outshape = conv2D_output_size((self.img_x, self.img_y), self.pd1, self.k1, self.s1)
        self.conv2_outshape = conv2D_output_size(self.conv1_outshape, self.pd2, self.k2, self.s2)
        self.conv3_outshape = conv2D_output_size(self.conv2_outshape, self.pd3, self.k3, self.s3)
        self.conv4_outshape = conv2D_output_size(self.conv3_outshape, self.pd4, self.k4, self.s4)

        # fully connected layer hidden nodes
        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=self.ch1, kernel_size=self.k1,
                      stride=self.s1, padding=self.pd1),
            nn.BatchNorm2d(self.ch1, momentum=0.01),
            nn.ReLU(inplace=True),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2,
                      stride=self.s2, padding=self.pd2),
            nn.BatchNorm2d(self.ch2, momentum=0.01),
            nn.ReLU(inplace=True),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch2, out_channels=self.ch3, kernel_size=self.k3,
                      stride=self.s3, padding=self.pd3),
            nn.BatchNorm2d(self.ch3, momentum=0.01),
            nn.ReLU(inplace=True),
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch3, out_channels=self.ch4, kernel_size=self.k4,
                      stride=self.s4, padding=self.pd4),
            nn.BatchNorm2d(self.ch4, momentum=0.01),
            nn.ReLU(inplace=True),
        )

        self.drop = nn.Dropout2d(self.drop_p)
        self.pool = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(self.ch4 * self.conv4_outshape[0] * self.conv4_outshape[1], self.fc_hidden1)
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.CNN_embed_dim)  # per-frame CNN embedding

        # Projects the flattened (time * embed) sequence to the 100-dim word-vector space.
        self.fc4 = nn.Sequential(
            nn.Linear(23040, 100),  # 23040 = time_steps * CNN_embed_dim with this script's config
            nn.Tanh(),
        )

    def forward(self, x_3d):
        """Encode a clip.

        x_3d: (batch, time, 3, H, W) frame tensor.
        Returns: (batch, 100) embedding after fc4 + ReLU.
        """
        cnn_embed_seq = []
        for t in range(x_3d.size(1)):
            # CNN feature extraction for frame t
            x = self.conv1(x_3d[:, t, :, :, :])
            x = self.conv2(x)
            x = self.conv3(x)
            x = self.conv4(x)
            x = x.view(x.size(0), -1)  # flatten the conv output

            # FC layers
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)
            cnn_embed_seq.append(x)

        # swap time and sample dims -> (batch, time, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # Bug fix: flatten per sample instead of hard-coding batch size 1
        # (was: reshape(1, 23040)).
        cnn_embed_seq = cnn_embed_seq.reshape(cnn_embed_seq.size(0), -1)
        cnn_embed_seq = F.relu(self.fc4(cnn_embed_seq))
        return cnn_embed_seq
def train(log_interval, model, device, train_loader, optimizer, epoch):
    """Run one training epoch of the CNN encoder against word-vector targets.

    model: [cnn_encoder, rnn_decoder]; only the encoder is optimized here —
        the decoder is set to train mode for interface parity but not called.
    Returns (losses, scores): per-batch loss values and an (empty) score
        list kept for interface compatibility with the original caller.

    Relies on module globals ``yd0``/``yd1`` (class embedding targets) and
    ``loss_func``.
    """
    cnn_encoder, rnn_decoder = model
    cnn_encoder.train()  # enable dropout/batchnorm training behavior
    rnn_decoder.train()

    losses = []
    scores = []  # accuracy is not computed for the regression loss
    N_count = 0  # samples seen so far in this epoch

    for batch_idx, (X, y) in enumerate(train_loader):
        # Map the integer class label to its fixed word-embedding target.
        label = int(y.clone().squeeze())
        if label == 0:
            y_embed = yd0
        if label == 1:
            y_embed = yd1

        # distribute data to device (GPU or CPU)
        X, y = X.to(device), y.to(device).view(-1, )
        N_count += X.size(0)

        output = cnn_encoder(X)           # (batch, 100) predicted embedding
        loss = loss_func(output, y_embed)
        losses.append(loss.item())

        optimizer.zero_grad()
        # retain_graph=True: yd0/yd1 come from a shared nn.Embedding forward,
        # whose graph must survive across batches.
        loss.backward(retain_graph=True)
        optimizer.step()

        # show information every log_interval batches
        if (batch_idx + 1) % log_interval == 0:
            # Bug fix: the original format string referenced ``step_score``,
            # which was only defined inside a disabled code block (NameError).
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch + 1, N_count, len(train_loader.dataset),
                100. * (batch_idx + 1) / len(train_loader), loss.item()))

    return losses, scores
def validation(model, device, optimizer, test_loader):
    """Evaluate the CNN encoder + RNN decoder classifier on test data.

    model: [cnn_encoder, rnn_decoder].
    Returns (test_loss, test_score): mean cross-entropy loss over the test
        set and the accuracy of the argmax predictions.
    """
    cnn_encoder, rnn_decoder = model
    cnn_encoder.eval()  # disable dropout / use running batchnorm stats
    rnn_decoder.eval()

    test_loss = 0
    all_y = []
    all_y_pred = []
    with torch.no_grad():  # no gradients needed during evaluation
        for X, y in test_loader:
            # distribute data to device
            X, y = X.to(device), y.to(device).view(-1, )

            output = rnn_decoder(cnn_encoder(X))
            loss = F.cross_entropy(output, y, reduction='sum')  # sum over batch
            test_loss += loss.item()

            # index of the max log-probability = predicted class
            y_pred = output.max(1, keepdim=True)[1]

            # collect all y and y_pred across batches
            all_y.extend(y)
            all_y_pred.extend(y_pred)

    test_loss = test_loss / len(test_loader.dataset)

    # compute accuracy over the whole test set
    all_y = torch.stack(all_y, dim=0)
    all_y_pred = torch.stack(all_y_pred, dim=0)
    test_score = accuracy_score(all_y.cpu().data, all_y_pred.cpu().data)

    # Bug fix: report the averaged test_loss, not the last batch's summed loss.
    print('\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
        len(all_y), test_loss, 100 * test_score))

    # Checkpointing kept for reference. NOTE(review): it needs an ``epoch``
    # argument — the original printed "Epoch {} model saved!" using a free
    # variable ``epoch`` that is not in this function's scope.
    # torch.save(cnn_encoder.state_dict(), os.path.join(save_model_path, 'cnn_encoder_epoch{}.pth'.format(epoch + 1)))
    # torch.save(rnn_decoder.state_dict(), os.path.join(save_model_path, 'rnn_decoder_epoch{}.pth'.format(epoch + 1)))
    # torch.save(optimizer.state_dict(), os.path.join(save_model_path, 'optimizer_epoch{}.pth'.format(epoch + 1)))

    return test_loss, test_score
# Detect devices
use_cuda = torch.cuda.is_available()                  # check if GPU exists
device = torch.device("cuda" if use_cuda else "cpu")  # use CPU or GPU

# DataLoader parameters (num_workers=0 loads in the main process; Windows-safe)
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 0, 'pin_memory': True}

# Action (class) names. The original UCF101 pickle loading is kept for reference:
# with open(action_name_path, 'rb') as f:
#     action_names = pickle.load(f)
action_names = ['ApplyEyeMakeup', 'BandMarching']

# Bug fix: ``k`` (number of target categories) was referenced by the
# DecoderRNN construction below but never defined (it was commented out at
# the top of the file); derive it from the class list.
k = len(action_names)

# convert labels -> category indices
le = LabelEncoder()
le.fit(action_names)
print('list(le.classes_)', list(le.classes_))

# convert category -> one-hot (fit prepares the encoder for later transforms)
action_category = le.transform(action_names).reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(action_category)

# Parse video folder names like 'v_<action>_g01_c01' into action labels.
actions = []
fnames = os.listdir(data_path)
all_names = []
for f in fnames:
    loc1 = f.find('v_')
    loc2 = f.find('_g')
    actions.append(f[(loc1 + 2): loc2])
    all_names.append(f)

all_X_list = all_names                # all video file names
all_y_list = labels2cat(le, actions)  # numeric label per video folder

train_list, test_list, train_label, test_label = train_test_split(
    all_X_list, all_y_list, test_size=0.5, random_state=42)

transform = transforms.Compose([
    transforms.Resize([img_x, img_y]),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

selected_frames = np.arange(begin_frame, end_frame, skip_frame).tolist()

train_set = Dataset_CRNN(data_path, train_list, train_label, selected_frames, transform=transform)
valid_set = Dataset_CRNN(data_path, test_list, test_label, selected_frames, transform=transform)
train_loader = data.DataLoader(train_set, **params)
valid_loader = data.DataLoader(valid_set, **params)

cnn_encoder = EncoderCNN1(img_x=img_x, img_y=img_y, fc_hidden1=CNN_fc_hidden1,
                          fc_hidden2=CNN_fc_hidden2, drop_p=dropout_p,
                          CNN_embed_dim=CNN_embed_dim).to(device)
rnn_decoder = DecoderRNN(CNN_embed_dim=CNN_embed_dim, h_RNN_layers=RNN_hidden_layers,
                         h_RNN=RNN_hidden_nodes, h_FC_dim=RNN_FC_dim,
                         drop_p=dropout_p, num_classes=k).to(device)

# Optimize only the CNN encoder (train() does not call the decoder).
# optimizer = torch.optim.Adam(cnn_encoder.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(cnn_encoder.parameters(), lr=learning_rate)
# loss_func = torch.nn.MSELoss()
loss_func = torch.nn.SmoothL1Loss()

epoch_train_losses = []
epoch_train_scores = []

# ----------------------- word-vector targets -----------------------
word_to_ix = {'ApplyEyeMakeup': 0, 'BandMarching': 1}
idex_to_word = {word_to_ix[word]: word for word in word_to_ix}
embeds = torch.nn.Embedding(2, 100)  # one 100-dim vector per class

yd0 = embeds(torch.LongTensor([word_to_ix['ApplyEyeMakeup']]))  # target for class 0
# Bug fix: the class-1 target previously looked up 'ApplyEyeMakeup' again,
# which gave both classes the identical embedding target.
yd1 = embeds(torch.LongTensor([word_to_ix['BandMarching']]))    # target for class 1
# Training loop (validation is currently disabled; see validation()).
for epoch in range(epochs):
    train_losses, train_scores = train(log_interval, [cnn_encoder, rnn_decoder],
                                       device, train_loader, optimizer, epoch)
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)

A = np.array(epoch_train_losses)  # per-epoch list of per-batch losses
B = np.array(epoch_train_scores)  # per-epoch score history (may be empty)

# Plot the last-batch training loss of each epoch.
fig = plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.plot(np.arange(1, epochs + 1), A[:, -1])  # train loss (on epoch end)
plt.title("model loss")
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(['train'], loc="upper left")

# 2nd figure (training accuracy) kept for reference but disabled — the
# original wrapped it in a (broken) triple-quoted string, and train() does
# not currently populate scores.
# plt.subplot(122)
# plt.plot(np.arange(1, epochs + 1), B[:, -1])  # train accuracy (on epoch end)
# plt.title("training scores")
# plt.xlabel('epochs')
# plt.ylabel('accuracy')
# plt.legend(['train'], loc="upper left")
# title = "./fig_UCF101_CRNN.png"
# plt.savefig(title, dpi=600)
# plt.show()