摘要
本文使用纯 Python 和 PyTorch 对比实现门控循环单元GRU及其反向传播.
相关
原理和详细解释, 请参考 :
门控循环单元GRUCell详解及反向传播的梯度求导
文章索引 :
https://blog.csdn.net/oBrightLamp/article/details/85067981
正文
1. GRUCell 类
文件目录 : vanilla_nn/grucell.py
import numpy as np
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    The computation goes through ``np.exp``, so ``x`` may be a scalar or a
    NumPy array; arrays are processed element-wise.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator
class GRUCell:
def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh):
self.weight_ih = weight_ih
self.weight_hh = weight_hh
self.bias_ih = bias_ih
self.bias_hh = bias_hh
self.dh_prev = None
self.weight_ih_grad_stack = []
self.weight_hh_grad_stack = []
self.bias_ih_grad_stack = []
self.bias_hh_grad_stack = []
self.x_stack = []
self.dx_list = []
self.h_prev_stack = []
self.h_next_stack = []
self.dh_prev_list = []
self.reset_gate_stack = []
self.update_gate_stack = []
self.cell_gate_stack = []
def __call__(self, x, h_prev):
    """Run one forward step of the GRU cell and cache what was computed.

    Args:
        x: Input for this step; multiplied by ``weight_ih.T``.
        h_prev: Previous hidden state. It must be at least 2-D, because
            its second dimension is read as the hidden size.
            NOTE(review): presumably shaped (batch, hidden) -- confirm.

    Returns:
        The next hidden state,
        ``(1 - update_gate) * cell_gate + update_gate * h_prev``.
    """
    input_proj = np.dot(x, self.weight_ih.T) + self.bias_ih
    hidden_proj = np.dot(h_prev, self.weight_hh.T) + self.bias_hh

    # Both projections are split column-wise: the first `hidden` columns
    # feed the reset gate, the next `hidden` the update gate, and the
    # remaining columns the candidate (cell) gate.
    hidden = np.shape(h_prev)[1]
    reset_cols = slice(0, hidden)
    update_cols = slice(hidden, 2 * hidden)
    cell_cols = slice(2 * hidden, None)

    reset_gate = sigmoid(input_proj[:, reset_cols] + hidden_proj[:, reset_cols])
    update_gate = sigmoid(input_proj[:, update_cols] + hidden_proj[:, update_cols])
    # The reset gate scales only the hidden-side term (bias_hh included).
    cell_gate = np.tanh(input_proj[:, cell_cols] + hidden_proj[:, cell_cols] * reset_gate)

    h_next = (1 - update_gate) * cell_gate + update_gate * h_prev

    # Record this step's values; backward() pops them in reverse order.
    cached = (
        (self.x_stack, x),
        (self.reset_gate_stack, reset_gate),
        (self.update_gate_stack, update_gate),
        (self.cell_gate_stack, cell_gate),
        (self.h_prev_stack, h_prev),
        (self.h_next_stack, h_next),
    )
    for stack, value in cached:
        stack.append(value)

    # Reset the stored hidden-state gradient to zeros on every forward step.
    self.dh_prev = np.zeros_like(h_next)
    return h_next
def backward(self, dh_next):
x = self.x_stack.pop()
h_prev = self.h_prev_stack.pop()
reset_gate = self.reset_gate_stack.pop()
update_gate = self.update_gate_stack.pop()
cell_gate = self.cell_gate_stack.pop()
h_size = np.shape(dh_next)[1]
wr = self.weight_ih[h_size *