本博文主要介绍vad训练用到的模型结构,可以采用dnn之类网络,亦可以采用cnn网络实现,这里采用的实现方式是基于cnn的,网络架构在一定程度上影响着模型的精度,但是更多的是数据起着决定性的作用。实现过程也比较简单,直接上代码,具体实现如下:
import torch.nn as nn
import math
import torch
from .dfsmn import DFSMN
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
import six
from .tdnn import*
class dnn(nn.Module):
    """Frame-level VAD classifier built from a stack of TDNN layers.

    Layer 0 splices a symmetric input context of ``left_context * 2 + 1``
    frames; all other layers operate frame-by-frame (context_size 1). The
    final layer is a plain affine projection to ``n_classes`` log-probs.

    Args:
        idim: input feature dimension per frame.
        hidm: hidden dimension of the intermediate TDNN layers.
        n_layers: total number of TDNN layers (including the output layer).
        dropout: dropout probability for all but the output layer.
        left_context / right_context: context half-windows in frames.
        n_classes: number of output classes (speech / non-speech by default).
    """

    def __init__(self, idim, hidm, n_layers, dropout,
                 left_context=11, right_context=11, n_classes=2):
        super(dnn, self).__init__()
        self.left_context = left_context
        self.right_context = right_context
        self.n_layers = n_layers
        self.softmax = nn.LogSoftmax(dim=-1)
        self.tdnn = nn.ModuleList([])
        for i in range(n_layers):
            if i == 0:
                # First layer sees the full spliced context window.
                cfg = dict(input_dim=idim, output_dim=hidm,
                           context_size=left_context * 2 + 1,
                           batch_norm=True, dropout_p=dropout,
                           bias=False, act=True)
            elif i == n_layers - 1:
                # Output layer: affine projection only, no BN/activation.
                # Bug fix: context_size is now explicitly 1 here; the
                # original left it at whatever the previous iteration set,
                # so a 2-layer network wrongly reused the wide input
                # context (left_context * 2 + 1) on its output layer.
                cfg = dict(input_dim=hidm, output_dim=n_classes,
                           context_size=1,
                           batch_norm=False, dropout_p=0,
                           bias=True, act=False)
            else:
                cfg = dict(input_dim=hidm, output_dim=hidm,
                           context_size=1,
                           batch_norm=True, dropout_p=dropout,
                           bias=False, act=True)
            self.tdnn.append(TDNNLayer(stride=1, dilation=1, **cfg))

    def forward(self, x, ilens):
        """Run the TDNN stack and return frame log-probabilities.

        Args:
            x: (batch, time, feat) input features -- assumed from the
               TDNN layout; TODO confirm against the caller.
            ilens: per-utterance input lengths.

        Returns:
            (log_probs, new_lens) where new_lens is shortened by however
            many frames the unpadded convolutions consumed.
        """
        T0 = x.shape[1]
        for layer in self.tdnn:
            x = layer(x)
        x = self.softmax(x)
        T1 = x.shape[1]
        return x, ilens - (T0 - T1)
class fsmn(nn.Module):
    """VAD classifier: BN -> 1x1 conv -> two DFSMN memory blocks -> 1x1 conv.

    Expects channel-first input (batch, inp, time), as required by
    BatchNorm1d/Conv1d below.

    Args:
        inp: input feature dimension (channels).
        hid: hidden channel width.
        svd_size: low-rank (memory) dimension inside each DFSMN block.
        left_frame: history frames seen by each DFSMN memory conv.
        right_frame: future frames seen by each DFSMN memory conv.
        num_classes: output classes (speech / non-speech by default).
        drop: dropout probability after each DFSMN block.
    """

    def __init__(self, inp, hid, svd_size, left_frame=0, right_frame=0,
                 num_classes=2, drop=0.3):
        super(fsmn, self).__init__()
        self.inp = inp
        self.hid = hid
        self.svd_size = svd_size
        self.left_frame = left_frame
        self.right_frame = right_frame
        self.num_classes = num_classes
        self.dropout = drop
        self.bn0 = nn.BatchNorm1d(self.inp)
        self.conv0 = nn.Conv1d(self.inp, self.hid, kernel_size=1, bias=False)
        # Bug fix: right_frame was stored but never forwarded -- the original
        # hard-coded right_frames=0 in both DFSMN calls, silently ignoring any
        # requested future context. Default right_frame=0 keeps old behavior.
        self.fsmn1 = DFSMN(self.hid, self.svd_size, self.hid,
                           self.left_frame, 1, self.right_frame, 1)
        self.dropout1 = nn.Dropout(self.dropout)
        self.fsmn2 = DFSMN(self.hid, self.svd_size, self.hid,
                           self.left_frame, 1, self.right_frame, 1)
        self.dropout2 = nn.Dropout(self.dropout)
        self.conv1 = nn.Conv1d(self.hid, self.num_classes,
                               kernel_size=1, bias=False)
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return per-frame class scores, shape (batch, num_classes, time)."""
        out = self.bn0(x)
        out = self.conv0(out)
        out = self.act(out)
        out, _ = self.fsmn1(out)
        out = self.act(out)
        out = self.dropout1(out)
        out, _ = self.fsmn2(out)
        out = self.act(out)
        out = self.dropout2(out)
        out = self.conv1(out)
        return out
tdnn的实现:
import torch
import torch.nn as nn
import torch.nn.functional as F
class TDNN(nn.Module):
    def __init__(
            self,
            input_dim=23,
            output_dim=512,
            context_size=5,
            stride=1,
            dilation=1,
            batch_norm=True,
            dropout_p=0.0
    ):
        '''
        TDNN as defined by https://www.danielpovey.com/files/2015_interspeech_multisplice.pdf
        Affine transformation not applied globally to all frames but smaller windows with local context
        batch_norm: True to include batch normalisation after the non linearity
        Context size and dilation determine the frames selected
        (although context size is not really defined in the traditional sense)
        For example:
            context size 5 and dilation 1 is equivalent to [-2,-1,0,1,2]
            context size 3 and dilation 2 is equivalent to [-2, 0, 2]
            context size 1 and dilation 1 is equivalent to [0]
        '''
        super(TDNN, self).__init__()
        self.context_size = context_size
        self.stride = stride
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dilation = dilation
        self.dropout_p = dropout_p
        self.batch_norm = batch_norm
        # One affine map shared across all context windows.
        self.kernel = nn.Linear(input_dim * context_size, output_dim)
        self.nonlinearity = nn.ReLU()
        if self.batch_norm:
            self.bn = nn.BatchNorm1d(output_dim)
        if self.dropout_p:
            self.drop = nn.Dropout(p=self.dropout_p)

    def forward(self, x):
        '''
        input: size (batch, seq_len, input_features)
        output: size (batch, new_seq_len, output_features)
        '''
        _, _, d = x.shape
        assert (d == self.input_dim), 'Input dimension was wrong. Expected ({}), got ({})'.format(self.input_dim, d)
        x = x.unsqueeze(1)  # (batch, 1, seq_len, input_dim) for 2-D unfold
        # Unfold the sequence into overlapping windows of `context_size`
        # frames. Bug fix: the original passed a scalar `padding`, which
        # F.unfold applies to BOTH spatial dims -- it zero-padded the feature
        # axis too, so every window carried leading zeros and dropped the
        # last feature columns. Pad only the time axis.
        # (With dilation > 1 the time padding here is still context_size // 2,
        # which no longer preserves seq_len -- matches original behavior.)
        x = F.unfold(
            x,
            (self.context_size, self.input_dim),
            stride=(1, self.input_dim),
            padding=(int(self.context_size / 2), 0),
            dilation=(self.dilation, 1)
        )
        # (batch, context_size*input_dim, new_seq_len) -> (batch, new_seq_len, context_size*input_dim)
        x = x.transpose(1, 2)
        x = self.kernel(x)
        x = self.nonlinearity(x)
        if self.dropout_p:
            x = self.drop(x)
        if self.batch_norm:
            # BatchNorm1d expects (batch, channels, seq_len).
            x = x.transpose(1, 2)
            x = self.bn(x)
            x = x.transpose(1, 2)
        return x
if __name__ == '__main__':
    # Smoke test: push a random (batch, frames, feats) batch through a
    # default-configured TDNN layer and show the resulting shape.
    layer = TDNN(input_dim=26)
    feats = torch.randn(64, 384, 26)
    result = layer(feats)
    print('out.shape = ', result.shape)
fsmn的实现:
# coding=utf-8
import torch
import torch.nn as nn
import torch.nn.functional as F
class DFSMN(nn.Module):
    def __init__(
            self,
            input_dim,
            hidden_dim,
            output_dim,
            left_frames=1,
            left_dilation=1,
            right_frames=1,
            right_dilation=1,
    ):
        '''
        Deep FSMN block: 1x1 projection, depthwise memory convolutions over
        past (left) and future (right) frames, then a 1x1 output projection.

        input_dim as it's name ....
        hidden_dim means the dimension or channles num of the memory
        left means history
        right means future

        NOTE(review): the chosen padding keeps the sequence length only when
        kernel_size * dilation is even-pad-compatible (e.g. odd frame counts
        with dilation 1); other combinations change the length and break the
        `out + left + right` sum -- confirm frame counts at call sites.
        '''
        super(DFSMN, self).__init__()
        self.left_frames = left_frames
        self.right_frames = right_frames
        self.in_conv = nn.Conv1d(input_dim, hidden_dim, kernel_size=1)
        if left_frames > 0:
            # Depthwise (groups=hidden_dim) conv acting as the history memory.
            self.left_conv = nn.Conv1d(hidden_dim, hidden_dim,
                                       kernel_size=left_frames,
                                       padding=int(left_frames * left_dilation / 2),
                                       dilation=left_dilation,
                                       bias=False, groups=hidden_dim)
        if right_frames > 0:
            # Bug fix: the original used left_dilation for both the padding
            # and the dilation of this *future*-context conv (copy-paste).
            self.right_conv = nn.Conv1d(hidden_dim, hidden_dim,
                                        kernel_size=right_frames,
                                        padding=int(right_frames * right_dilation / 2),
                                        dilation=right_dilation,
                                        bias=False, groups=hidden_dim)
        self.out_conv = nn.Conv1d(hidden_dim, output_dim, kernel_size=1)
        # Scalar gate for chaining blocks through `hidden`. Bug fix: forward()
        # references self.weight, but the original left this definition
        # commented out, so any call with hidden != None raised AttributeError.
        self.weight = nn.Parameter(torch.Tensor([0]), requires_grad=True)

    def forward(self, inputs, hidden=None):
        '''
        inputs: (batch, input_dim, time) channel-first features.
        hidden: optional memory state from a previous block, added to the
                gated ReLU of this block's memory sum.
        Returns (output, memory): the projected output and the pre-projection
        memory tensor (batch, hidden_dim, time) for chaining.
        '''
        out = self.in_conv(inputs)
        if self.left_frames > 0:
            left = self.left_conv(out)
        else:
            left = 0
        if self.right_frames > 0:
            right = self.right_conv(out)
        else:
            right = 0.
        out_p = out + left + right
        if hidden is not None:
            out_p = hidden + F.relu(out_p) * self.weight
        out = self.out_conv(out_p)
        return out, out_p
if __name__ == '__main__':
    # Smoke test: (batch, channels, frames) through a history-only DFSMN.
    # Bug fix: left_frames must give length-preserving padding (odd count
    # with dilation 1). The original used left_frames=10, whose padding of
    # int(10/2)=5 made left_conv emit 200 frames against 199, so the demo
    # crashed on `out + left` with a broadcast error.
    inputs = torch.randn(10, 257, 199)
    net = DFSMN(257, 128, 137, left_frames=11, left_dilation=1,
                right_frames=0, right_dilation=1)
    print(net(inputs)[0].shape)
水平有限,不当之处还请指教,谢谢!