Gates
Update gate: a mechanism for focusing attention
Reset gate: a mechanism for forgetting
Candidate hidden state
Hidden state
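These four quantities are computed as follows (notation matches the parameter names in the scratch implementation below; $\sigma$ is the sigmoid function and $\odot$ is elementwise multiplication):

$$Z_t = \sigma(X_t W_{xz} + H_{t-1} W_{hz} + b_z)$$
$$R_t = \sigma(X_t W_{xr} + H_{t-1} W_{hr} + b_r)$$
$$\tilde{H}_t = \tanh(X_t W_{xh} + (R_t \odot H_{t-1}) W_{hh} + b_h)$$
$$H_t = Z_t \odot H_{t-1} + (1 - Z_t) \odot \tilde{H}_t$$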
import torch
from torch import nn
from d2l import torch as d2l
batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)
# Initialize model parameters
def get_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    def three():
        return (normal((num_inputs, num_hiddens)),
                normal((num_hiddens, num_hiddens)),
                torch.zeros(num_hiddens, device=device))

    W_xz, W_hz, b_z = three()  # Update gate parameters
    W_xr, W_hr, b_r = three()  # Reset gate parameters
    W_xh, W_hh, b_h = three()  # Candidate hidden state parameters
    # Output layer parameters
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    # Attach gradients
    params = [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params
# Define the model
def init_gru_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device),)

def gru(inputs, state, params):
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        Z = torch.sigmoid((X @ W_xz) + (H @ W_hz) + b_z)  # update gate
        R = torch.sigmoid((X @ W_xr) + (H @ W_hr) + b_r)  # reset gate
        H_tilda = torch.tanh((X @ W_xh) + ((R * H) @ W_hh) + b_h)  # candidate state
        H = Z * H + (1 - Z) * H_tilda  # new hidden state
        Y = H @ W_hq + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)
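As a quick sanity check of the shapes, the sketch below runs gru() on dummy inputs; the sizes are illustrative and not part of the original notebook. Each time step consumes a (batch_size, vocab_size) one-hot slice, and the per-step outputs are concatenated along dim 0.

# Standalone shape check with made-up sizes (illustrative only)
demo_steps, demo_batch, demo_vocab, demo_hiddens = 5, 2, 28, 16
demo_device = d2l.try_gpu()
X = torch.zeros((demo_steps, demo_batch, demo_vocab), device=demo_device)
demo_params = get_params(demo_vocab, demo_hiddens, demo_device)
demo_state = init_gru_state(demo_batch, demo_hiddens, demo_device)
Y, (H,) = gru(X, demo_state, demo_params)
print(Y.shape)  # (demo_steps * demo_batch, demo_vocab) = torch.Size([10, 28])
print(H.shape)  # (demo_batch, demo_hiddens) = torch.Size([2, 16])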
# Training and prediction
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1
model = d2l.RNNModelScratch(len(vocab), num_hiddens, device, get_params,
                            init_gru_state, gru)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
d2l.plt.show()
perplexity 1.1, 23635.9 tokens/sec on cuda:0
time traveller which his foursm stmetting in in entllenbut upaca
travelleryou can show black is white by argument said filby
Concise Implementation of GRU
num_inputs = vocab_size
gru_layer = nn.GRU(num_inputs, num_hiddens)
model = d2l.RNNModel(gru_layer, len(vocab))
model = model.to(device)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
d2l.plt.show()
perplexity 1.0, 308896.5 tokens/sec on cuda:0
time travelleryou can show black is white by argument said filby
travelleryou can show black is white by argument said filby
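For reference, nn.GRU by default takes input of shape (num_steps, batch_size, input_size) and returns the per-step hidden states together with the final hidden state; d2l.RNNModel wraps it with one-hot encoding and an output layer. A minimal standalone sketch (sizes are illustrative):

gru_demo = nn.GRU(input_size=8, hidden_size=16)
X = torch.randn(5, 2, 8)  # (num_steps, batch_size, input_size)
output, h_n = gru_demo(X)
print(output.shape)  # torch.Size([5, 2, 16]): hidden state at every step
print(h_n.shape)     # torch.Size([1, 2, 16]): final hidden state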
Summary:
- GRUs can better capture dependencies across long time-step distances in a sequence.
- The reset gate helps capture short-term dependencies in the sequence.
- The update gate helps capture long-term dependencies in the sequence.
- When the reset gate is fully open, the GRU contains a vanilla RNN as a special case (see the worked extremes below).
- When the update gate is fully open, the GRU can skip entire subsequences.
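To make the last two bullets concrete, plug the extreme gate values into the update rules used in gru() above:

$$R_t = \mathbf{1}: \quad \tilde{H}_t = \tanh(X_t W_{xh} + H_{t-1} W_{hh} + b_h) \quad \text{(the vanilla RNN update)}$$
$$Z_t = \mathbf{1}: \quad H_t = H_{t-1} \quad \text{(the old state is carried through unchanged)}$$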