Topic
I previously wrote a simpler and clearer comparison of the two; take a look if you are interested.
Create the data
In [ ]:
import torch
import torch.nn.functional as F

# data is a batch of three sequences; each is padded to three words (each word has three features)
data = torch.tensor(
    [
        [[1, 2, 30], [0, 0, 0], [0, 0, 0]],   # one word, rest zero-padded
        [[1, 2, 30], [4, 5, 60], [0, 0, 0]],  # two words, rest zero-padded
        [[1, 2, 30], [4, 5, 60], [7, 8, 90]]  # three words, no padding
    ], dtype=torch.float32, requires_grad=True
)
data
Out[ ]:
tensor([[[ 1.,  2., 30.],
         [ 0.,  0.,  0.],
         [ 0.,  0.,  0.]],

        [[ 1.,  2., 30.],
         [ 4.,  5., 60.],
         [ 0.,  0.,  0.]],

        [[ 1.,  2., 30.],
         [ 4.,  5., 60.],
         [ 7.,  8., 90.]]], requires_grad=True)
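As an aside (a sketch, not part of the original cells; seqs and padded are illustrative names), the same padded batch can be built from variable-length sequences with torch.nn.utils.rnn.pad_sequence instead of writing the zeros out by hand.
In [ ]:
from torch.nn.utils.rnn import pad_sequence

# the three variable-length sentences, before padding
seqs = [
    torch.tensor([[1., 2., 30.]]),
    torch.tensor([[1., 2., 30.], [4., 5., 60.]]),
    torch.tensor([[1., 2., 30.], [4., 5., 60.], [7., 8., 90.]]),
]
# pad to the longest sequence with zeros -> shape [3, 3, 3], same values as `data`
padded = pad_sequence(seqs, batch_first=True, padding_value=0.0)
print(torch.equal(padded, data.detach()))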
draw the plot to show the data
In [ ]:
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.figure(num=1, figsize=(15, 5))
for i in range(len(data)):
    plt.subplot(1, 3, i + 1)
    for j in range(len(data[i])):
        plt.plot([0, 1, 2], data[i][j].detach().numpy(), label=f"features of word {j}")
    plt.legend(loc='best')
    plt.xlabel(f'sentence {i}')
plt.show()
Here we can see that each word's feature values differ greatly across dimensions, which makes gradient-based optimization awkward, so let's try normalization.
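To quantify this, a small check (a sketch added here; the reshape flattens the batch so the spread of each feature dimension can be compared directly) shows that the third feature dimension is roughly an order of magnitude larger than the other two.
In [ ]:
# spread of each of the three feature dimensions across all words in the batch
data.detach().reshape(-1, 3).std(dim=0)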
Layer Normalization
In [ ]:
# quick check: subtracting a scalar broadcasts over every element of the tensor
torch.tensor([0, 0, 1]) - .2
Out[ ]:
tensor([-0.2000, -0.2000, 0.8000])
In [ ]:
def LN(X, gamma, beta, mean, std, eps, momentum):
    '''
    X: input, shape=[batchsize, seqlen, wordnum, worddim]
    gamma and beta: scale and shift applied after normalization
    mean and std: running statistics (note: "std" here actually stores the variance)
    eps: avoids division by zero
    momentum: controls the update of the running statistics
    '''
    if not torch.is_grad_enabled():  # evaluation: use the running statistics
        X_hat = (X - mean) / torch.sqrt(std + eps)
    else:  # training: use the statistics of the current batch
        # only suits 4-dim input; averaging over every dim except dim 1 gives per-sentence statistics
        mean_temp = X.mean(dim=(0, 2, 3), keepdim=True)
        std_temp = ((X - mean_temp) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        X_hat = (X - mean_temp) / torch.sqrt(std_temp + eps)
        # update the running statistics
        mean = momentum * mean_temp + (1.0 - momentum) * mean
        std = momentum * std_temp + (1.0 - momentum) * std
    Y = gamma * X_hat + beta
    return Y, mean, std

# data.unsqueeze(0).shape
ln, mean_ln, std_ln = LN(data.unsqueeze(0), 0.9, 0.1, torch.tensor(0), 1, 1e-5, 0.9)
ln = ln.squeeze()
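As a sanity check (a sketch; builtin_ln and ours are illustrative names), PyTorch's built-in nn.LayerNorm over the last two dimensions computes the same per-sentence statistics, so undoing the gamma/beta used above should reproduce it closely.
In [ ]:
# compare our LN against torch.nn.LayerNorm, which normalizes each sentence over its words and features
builtin_ln = torch.nn.LayerNorm([3, 3], elementwise_affine=False, eps=1e-5)
ref = builtin_ln(data)       # shape [3, 3, 3], per-sentence normalization
ours = (ln - 0.1) / 0.9      # undo the gamma/beta used above
print(torch.allclose(ours, ref, atol=1e-4))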
draw the plot after ln
In [ ]:
plt.figure(num=2, figsize=(15, 5))
for i in range(len(ln)):
    plt.subplot(1, 3, i + 1)
    for j in range(len(ln[i])):
        plt.plot([0, 1, 2], ln[i][j].detach().numpy(), label=f"features of word {j}")
    plt.legend(loc='best')
    plt.xlabel(f'sentence {i}')
plt.show()
Batch Normalization
In [ ]:
def BN(X, gamma, beta, mean, std, eps, momentum):
    '''
    X: input, shape=[batchsize, seqlen, wordnum, worddim]
    gamma and beta: scale and shift applied after normalization
    mean and std: running statistics (note: "std" here actually stores the variance)
    eps: avoids division by zero
    momentum: controls the update of the running statistics
    '''
    if not torch.is_grad_enabled():  # evaluation: use the running statistics
        X_hat = (X - mean) / torch.sqrt(std + eps)
    else:  # training: use the statistics of the current batch
        # only suits 4-dim input; averaging over every dim except dim 2 gives per-word-position statistics shared across sentences
        mean_temp = X.mean(dim=(0, 1, 3), keepdim=True)
        std_temp = ((X - mean_temp) ** 2).mean(dim=(0, 1, 3), keepdim=True)
        X_hat = (X - mean_temp) / torch.sqrt(std_temp + eps)
        # update the running statistics
        mean = momentum * mean_temp + (1.0 - momentum) * mean
        std = momentum * std_temp + (1.0 - momentum) * std
    Y = gamma * X_hat + beta
    return Y, mean, std

# data.unsqueeze(0).shape
bn, mean_bn, std_bn = BN(data.unsqueeze(0), 0.9, 0.1, torch.tensor(0), 1, 1e-5, 0.9)
bn = bn.squeeze()
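A similar sanity check (a sketch; it relies on the observation that, with the axes chosen above, the word position plays the role of the channel dimension): nn.BatchNorm1d in training mode should match once the gamma/beta are undone.
In [ ]:
# with dims (0,1,3) above, each word position gets its own statistics,
# which corresponds to nn.BatchNorm1d treating the word position as the channel
builtin_bn = torch.nn.BatchNorm1d(3, affine=False, eps=1e-5)
builtin_bn.train()           # training mode: normalize with batch statistics
ref = builtin_bn(data)       # input interpreted as [batch=3, channels=3, length=3]
ours = (bn - 0.1) / 0.9      # undo the gamma/beta used above
print(torch.allclose(ours, ref, atol=1e-4))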
draw the plot after bn
In [ ]:
plt.figure(num=3, figsize=(15, 5))
for i in range(len(bn)):
    plt.subplot(1, 3, i + 1)
    for j in range(len(bn[i])):
        plt.plot([0, 1, 2], bn[i][j].detach().numpy(), label=f"features of word {j}")
    plt.legend(loc='best')
    plt.xlabel(f'sentence {i}')
plt.show()
In [ ]:
# LN statistics: average over everything except the sentence dim -> one mean per sentence
tmean_ln = data.unsqueeze(0).mean(dim=(0, 2, 3), keepdim=True)
# BN statistics (as used above): average over everything except the word-position dim -> one mean per word position
tmean_bn = data.unsqueeze(0).mean(dim=(0, 1, 3), keepdim=True)
tmean_ln, tmean_bn
Out[ ]:
(tensor([[[[ 3.6667]],

          [[11.3333]],

          [[23.0000]]]], grad_fn=<MeanBackward1>),
 tensor([[[[11.0000],
           [15.3333],
           [11.6667]]]], grad_fn=<MeanBackward1>))
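To make the contrast concrete, the shapes of the two mean tensors show where each method keeps a separate statistic (a small sketch using the variables defined above).
In [ ]:
# tmean_ln: one mean per sentence (shape [1, 3, 1, 1])
# tmean_bn: one mean per word position, shared across sentences (shape [1, 1, 3, 1])
print(tmean_ln.shape, tmean_bn.shape)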