Building your Recurrent Neural Network - Step by Step (Andrew Ng's course)
# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: rnn_cell_forward

def rnn_cell_forward(xt, a_prev, parameters):
    """
    Implements a single forward step of the RNN-cell as described in Figure (2)

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
        ba -- Bias, numpy array of shape (n_a, 1)
        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, a_prev, xt, parameters)
    """

    # Retrieve parameters from "parameters"
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    ### START CODE HERE ### (≈2 lines)
    # compute next activation state using the formula given above
    a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba)
    # compute output of the current cell using the formula given above
    yt_pred = softmax(np.dot(Wya, a_next) + by)
    ### END CODE HERE ###

    # store values you need for backward propagation in cache
    cache = (a_next, a_prev, xt, parameters)

    return a_next, yt_pred, cache
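As a quick sanity check (my own sketch, not part of the graded notebook), the cell can be run on random data. The notebook imports `softmax` from rnn_utils; a minimal stand-in is included here so the snippet is self-contained:

import numpy as np

def softmax(x):
    # column-wise softmax, shifted by the max for numerical stability
    e = np.exp(x - np.max(x, axis=0, keepdims=True))
    return e / np.sum(e, axis=0, keepdims=True)

np.random.seed(1)
n_x, n_a, n_y, m = 3, 5, 2, 10           # toy dimensions
xt = np.random.randn(n_x, m)
a_prev = np.random.randn(n_a, m)
parameters = {"Wax": np.random.randn(n_a, n_x),
              "Waa": np.random.randn(n_a, n_a),
              "Wya": np.random.randn(n_y, n_a),
              "ba":  np.random.randn(n_a, 1),
              "by":  np.random.randn(n_y, 1)}

a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
print(a_next.shape, yt_pred.shape)       # expected: (5, 10) (2, 10)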
# UNQ_C2 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: rnn_forward

def rnn_forward(x, a0, parameters):
    """
    Implement the forward propagation of the recurrent neural network described in Figure (3).

    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
        ba -- Bias, numpy array of shape (n_a, 1)
        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y_pred -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of caches, x)
    """

    # Initialize "caches" which will contain the list of all caches
    caches = []

    # Retrieve dimensions from shapes of x and parameters["Wya"]
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape

    ### START CODE HERE ###
    # initialize "a" and "y_pred" with zeros (≈2 lines)
    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))
    # Initialize a_next (≈1 line)
    a_next = a0

    # loop over all time-steps of the input 'x' (1 line)
    for t in range(T_x):
        # Update next hidden state, compute the prediction, get the cache (≈2 lines)
        xt = x[:, :, t]
        a_next, yt_pred, cache = rnn_cell_forward(xt, a_next, parameters)
        # Save the value of the new "next" hidden state in a (≈1 line)
        a[:, :, t] = a_next
        # Save the value of the prediction in y (≈1 line)
        y_pred[:, :, t] = yt_pred
        # Append "cache" to "caches" (≈1 line)
        caches.append(cache)
    ### END CODE HERE ###

    # store values needed for backward propagation in cache
    caches = (caches, x)

    return a, y_pred, caches
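Continuing the toy setup from the previous sketch (a hypothetical setup of mine, here with T_x = 4 time steps), rnn_forward simply threads a_next through the loop:

T_x = 4
x = np.random.randn(n_x, m, T_x)               # inputs for all time steps
a0 = np.random.randn(n_a, m)                   # initial hidden state

a, y_pred, caches = rnn_forward(x, a0, parameters)
print(a.shape, y_pred.shape, len(caches[0]))   # expected: (5, 10, 4) (2, 10, 4) 4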
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: lstm_cell_forward

def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """
    Implement a single forward step of the LSTM-cell as described in Figure (4)

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    c_prev -- Memory state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
        Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
        bf -- Bias of the forget gate, numpy array of shape (n_a, 1)
        Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
        bi -- Bias of the update gate, numpy array of shape (n_a, 1)
        Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
        bc -- Bias of the first "tanh", numpy array of shape (n_a, 1)
        Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
        bo -- Bias of the output gate, numpy array of shape (n_a, 1)
        Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    c_next -- next memory state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    Note: ft/it/ot stand for the forget/update/output gates, cct stands for the candidate value (c tilde),
          c stands for the cell state (memory)
    """

    # Retrieve parameters from "parameters"
    Wf = parameters["Wf"]  # forget gate weight
    bf = parameters["bf"]
    Wi = parameters["Wi"]  # update gate weight (notice the variable name)
    bi = parameters["bi"]  # (notice the variable name)
    Wc = parameters["Wc"]  # candidate value weight
    bc = parameters["bc"]
    Wo = parameters["Wo"]  # output gate weight
    bo = parameters["bo"]
    Wy = parameters["Wy"]  # prediction weight
    by = parameters["by"]

    # Retrieve dimensions from shapes of xt and Wy
    n_x, m = xt.shape
    n_y, n_a = Wy.shape

    ### START CODE HERE ###
    # Concatenate a_prev and xt (≈1 line)
    concat = np.concatenate((a_prev, xt), axis=0)

    # Compute values for ft (forget gate), it (update gate),
    # cct (candidate value), c_next (cell state),
    # ot (output gate), a_next (hidden state) (≈6 lines)
    ft = sigmoid(np.dot(Wf, concat) + bf)    # forget gate
    it = sigmoid(np.dot(Wi, concat) + bi)    # update gate
    cct = np.tanh(np.dot(Wc, concat) + bc)   # candidate value
    c_next = ft * c_prev + it * cct          # cell state
    ot = sigmoid(np.dot(Wo, concat) + bo)    # output gate
    a_next = ot * np.tanh(c_next)            # hidden state

    # Compute prediction of the LSTM cell (≈1 line)
    yt_pred = softmax(np.dot(Wy, a_next) + by)
    ### END CODE HERE ###

    # store values needed for backward propagation in cache
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    return a_next, c_next, yt_pred, cache
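For reference, this is the computation the cell above performs, written out (in the code, ft/it/ot/cct are $\Gamma_f, \Gamma_i, \Gamma_o, \tilde{c}$, and $\odot$ denotes element-wise multiplication):

$$
\begin{aligned}
\Gamma_f^{\langle t \rangle} &= \sigma\left(W_f\,[a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_f\right) \\
\Gamma_i^{\langle t \rangle} &= \sigma\left(W_i\,[a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_i\right) \\
\tilde{c}^{\langle t \rangle} &= \tanh\left(W_c\,[a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_c\right) \\
c^{\langle t \rangle} &= \Gamma_f^{\langle t \rangle} \odot c^{\langle t-1 \rangle} + \Gamma_i^{\langle t \rangle} \odot \tilde{c}^{\langle t \rangle} \\
\Gamma_o^{\langle t \rangle} &= \sigma\left(W_o\,[a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_o\right) \\
a^{\langle t \rangle} &= \Gamma_o^{\langle t \rangle} \odot \tanh\left(c^{\langle t \rangle}\right) \\
\hat{y}^{\langle t \rangle} &= \mathrm{softmax}\left(W_y\, a^{\langle t \rangle} + b_y\right)
\end{aligned}
$$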
# UNQ_C4 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: lstm_forward

def lstm_forward(x, a0, parameters):
    """
    Implement the forward propagation of the recurrent neural network using an LSTM-cell described in Figure (4).

    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
        Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
        bf -- Bias of the forget gate, numpy array of shape (n_a, 1)
        Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
        bi -- Bias of the update gate, numpy array of shape (n_a, 1)
        Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
        bc -- Bias of the first "tanh", numpy array of shape (n_a, 1)
        Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
        bo -- Bias of the output gate, numpy array of shape (n_a, 1)
        Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    c -- The value of the cell state, numpy array of shape (n_a, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of all the caches, x)
    """

    # Initialize "caches", which will track the list of all the caches
    caches = []

    ### START CODE HERE ###
    Wy = parameters['Wy']  # saving parameters['Wy'] in a local variable in case students use Wy instead of parameters['Wy']
    # Retrieve dimensions from shapes of x and parameters['Wy'] (≈2 lines)
    n_x, m, T_x = x.shape
    n_y, n_a = parameters['Wy'].shape

    # initialize "a", "c" and "y" with zeros (≈3 lines)
    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))

    # Initialize a_next and c_next (≈2 lines)
    a_next = a0
    c_next = np.zeros_like(a0)

    # loop over all time-steps
    for t in range(T_x):
        # Get the 2D slice 'xt' from the 3D input 'x' at time step 't'
        xt = x[:, :, t]
        # Update next hidden state, next memory state, compute the prediction, get the cache (≈1 line)
        a_next, c_next, yt, cache = lstm_cell_forward(xt, a_next, c_next, parameters)
        # Save the value of the new "next" hidden state in a (≈1 line)
        a[:, :, t] = a_next
        # Save the value of the next cell state (≈1 line)
        c[:, :, t] = c_next
        # Save the value of the prediction in y (≈1 line)
        y[:, :, t] = yt
        # Append the cache into caches (≈1 line)
        caches.append(cache)
    ### END CODE HERE ###

    # store values needed for backward propagation in cache
    caches = (caches, x)

    return a, y, c, caches
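A shape check of lstm_forward on random data (again my own sketch; it reuses the softmax helper above and adds a sigmoid stand-in for the one the notebook imports from rnn_utils):

def sigmoid(x):
    # element-wise logistic function
    return 1 / (1 + np.exp(-x))

np.random.seed(2)
n_x, n_a, n_y, m, T_x = 3, 5, 2, 10, 7   # toy dimensions
x = np.random.randn(n_x, m, T_x)
a0 = np.random.randn(n_a, m)
parameters = {"Wf": np.random.randn(n_a, n_a + n_x), "bf": np.random.randn(n_a, 1),
              "Wi": np.random.randn(n_a, n_a + n_x), "bi": np.random.randn(n_a, 1),
              "Wc": np.random.randn(n_a, n_a + n_x), "bc": np.random.randn(n_a, 1),
              "Wo": np.random.randn(n_a, n_a + n_x), "bo": np.random.randn(n_a, 1),
              "Wy": np.random.randn(n_y, n_a),       "by": np.random.randn(n_y, 1)}

a, y, c, caches = lstm_forward(x, a0, parameters)
print(a.shape, y.shape, c.shape)         # expected: (5, 10, 7) (2, 10, 7) (5, 10, 7)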
# UNGRADED FUNCTION: rnn_cell_backward

def rnn_cell_backward(da_next, cache):
    """
    Implements the backward pass for the RNN-cell (single time-step).

    Arguments:
    da_next -- Gradient of loss with respect to next hidden state
    cache -- tuple of values from the forward pass (output of rnn_cell_forward())
    Returns:
    gradients -- python dictionary containing:
        dxt -- Gradients of input data, of shape (n_x, m)
        da_prev -- Gradients of previous hidden state, of shape (n_a, m)
        dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
        dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
        dba -- Gradients of bias vector, of shape (n_a, 1)
    """

    # Retrieve values from cache
    (a_next, a_prev, xt, parameters) = cache

    # Retrieve values from parameters
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    ### START CODE HERE ###
    # compute the gradient of tanh with respect to a_next (≈1 line)
    dtanh = 1 - np.power(a_next, 2)

    # compute the gradient of the loss with respect to Wax (≈2 lines)
    dxt = np.dot(Wax.T, da_next * dtanh)
    dWax = np.dot(da_next * dtanh, xt.T)

    # compute the gradient with respect to Waa (≈2 lines)
    da_prev = np.dot(Waa.T, da_next * dtanh)
    dWaa = np.dot(da_next * dtanh, a_prev.T)

    # compute the gradient with respect to b (≈1 line)
    dba = np.sum(da_next * dtanh, axis=1, keepdims=True)
    ### END CODE HERE ###

    # Store the gradients in a python dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients
# UNGRADED FUNCTION: rnn_backward

def rnn_backward(da, caches):
    """
    Implement the backward pass for a RNN over an entire sequence of input data.

    Arguments:
    da -- Upstream gradients of all hidden states, of shape (n_a, m, T_x)
    caches -- tuple containing information from the forward pass (rnn_forward)
    Returns:
    gradients -- python dictionary containing:
        dx -- Gradient w.r.t. the input data, numpy-array of shape (n_x, m, T_x)
        da0 -- Gradient w.r.t the initial hidden state, numpy-array of shape (n_a, m)
        dWax -- Gradient w.r.t the input's weight matrix, numpy-array of shape (n_a, n_x)
        dWaa -- Gradient w.r.t the hidden state's weight matrix, numpy-array of shape (n_a, n_a)
        dba -- Gradient w.r.t the bias, of shape (n_a, 1)
    """

    ### START CODE HERE ###
    # Retrieve values from the first cache (t=1) of caches (≈2 lines)
    (caches, x) = caches
    (a1, a0, x1, parameters) = caches[0]

    # Retrieve dimensions from da's and x1's shapes (≈2 lines)
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # initialize the gradients with the right sizes (≈6 lines)
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))

    # Loop through all the time steps
    for t in reversed(range(T_x)):
        # Compute gradients at time step t.
        # Choose wisely the "da_next" and the "cache" to use in the backward propagation step. (≈1 line)
        gradients = rnn_cell_backward(da[:, :, t] + da_prevt, caches[t])
        # Retrieve derivatives from gradients (≈1 line)
        dxt, da_prevt, dWaxt, dWaat, dbat = gradients["dxt"], gradients["da_prev"], gradients["dWax"], gradients["dWaa"], gradients["dba"]
        # Increment global derivatives w.r.t parameters by adding their derivative at time-step t (≈4 lines)
        dx[:, :, t] = dxt
        dWax += dWaxt
        dWaa += dWaat
        dba += dbat

    # Set da0 to the gradient of a which has been backpropagated through all time-steps (≈1 line)
    da0 = da_prevt
    ### END CODE HERE ###

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients
# UNGRADED FUNCTION: lstm_cell_backward

def lstm_cell_backward(da_next, dc_next, cache):
    """
    Implement the backward pass for the LSTM-cell (single time-step).

    Arguments:
    da_next -- Gradients of next hidden state, of shape (n_a, m)
    dc_next -- Gradients of next cell state, of shape (n_a, m)
    cache -- cache storing information from the forward pass
    Returns:
    gradients -- python dictionary containing:
        dxt -- Gradient of input data at time-step t, of shape (n_x, m)
        da_prev -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
        dc_prev -- Gradient w.r.t. the previous memory state, of shape (n_a, m)
        dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
        dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
        dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
        dWo -- Gradient w.r.t. the weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
        dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
        dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
        dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
        dbo -- Gradient w.r.t. biases of the output gate, of shape (n_a, 1)
    """

    # Retrieve information from "cache"
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache

    ### START CODE HERE ###
    # Retrieve dimensions from xt's and a_next's shape (≈2 lines)
    n_x, m = xt.shape
    n_a, m = a_next.shape

    # Compute gates related derivatives. Their values can be found by looking carefully at equations (7) to (10) (≈4 lines)
    dot = da_next * np.tanh(c_next)
    dcct = (da_next * ot * (1 - np.power(np.tanh(c_next), 2)) + dc_next) * it
    dit = (da_next * ot * (1 - np.power(np.tanh(c_next), 2)) + dc_next) * cct
    dft = (da_next * ot * (1 - np.power(np.tanh(c_next), 2)) + dc_next) * c_prev

    # Code equations (7) to (10) (≈4 lines)
    dit = dit * it * (1 - it)
    dft = dft * ft * (1 - ft)
    dot = dot * ot * (1 - ot)
    dcct = dcct * (1 - np.power(cct, 2))

    # Compute parameters related derivatives. Use equations (11)-(14) (≈8 lines)
    concat = np.zeros((n_x + n_a, m))
    concat[:n_a, :] = a_prev
    concat[n_a:, :] = xt
    dWf = np.dot(dft, concat.T)
    dWi = np.dot(dit, concat.T)
    dWc = np.dot(dcct, concat.T)
    dWo = np.dot(dot, concat.T)
    dbf = np.sum(dft, axis=1, keepdims=True)
    dbi = np.sum(dit, axis=1, keepdims=True)
    dbc = np.sum(dcct, axis=1, keepdims=True)
    dbo = np.sum(dot, axis=1, keepdims=True)

    # Compute derivatives w.r.t previous hidden state, previous memory state and input. Use equations (15)-(17). (≈3 lines)
    da_prevx = np.dot(parameters['Wf'].T, dft) + np.dot(parameters['Wo'].T, dot) + np.dot(parameters['Wi'].T, dit) + np.dot(parameters['Wc'].T, dcct)
    da_prev = da_prevx[:n_a, :]
    dc_prev = (da_next * ot * (1 - np.power(np.tanh(c_next), 2)) + dc_next) * ft
    dxt = da_prevx[n_a:, :]
    ### END CODE HERE ###

    # Save gradients in dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev, "dWf": dWf, "dbf": dbf,
                 "dWi": dWi, "dbi": dbi, "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients
# UNGRADED FUNCTION: lstm_backward

def lstm_backward(da, caches):
    """
    Implement the backward pass for the RNN with LSTM-cell (over a whole sequence).

    Arguments:
    da -- Gradients w.r.t the hidden states, numpy-array of shape (n_a, m, T_x)
    caches -- cache storing information from the forward pass (lstm_forward)
    Returns:
    gradients -- python dictionary containing:
        dx -- Gradient of inputs, of shape (n_x, m, T_x)
        da0 -- Gradient w.r.t. the initial hidden state, numpy array of shape (n_a, m)
        dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
        dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
        dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
        dWo -- Gradient w.r.t. the weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
        dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
        dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
        dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
        dbo -- Gradient w.r.t. biases of the output gate, of shape (n_a, 1)
    """

    # Retrieve values from the first cache (t=1) of caches.
    (caches, x) = caches
    (a1, c1, a0, c0, f1, i1, cc1, o1, x1, parameters) = caches[0]

    ### START CODE HERE ###
    # Retrieve dimensions from da's and x1's shapes (≈2 lines)
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # initialize the gradients with the right sizes (≈12 lines)
    dx = np.zeros((n_x, m, T_x))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))
    dc_prevt = np.zeros((n_a, m))
    dWf = np.zeros((n_a, n_a + n_x))
    dWi = np.zeros((n_a, n_a + n_x))
    dWc = np.zeros((n_a, n_a + n_x))
    dWo = np.zeros((n_a, n_a + n_x))
    dbf = np.zeros((n_a, 1))
    dbi = np.zeros((n_a, 1))
    dbc = np.zeros((n_a, 1))
    dbo = np.zeros((n_a, 1))

    dc_next = dc_prevt
    da_next = da_prevt

    # loop back over the whole sequence
    for t in reversed(range(T_x)):
        # Compute all gradients using lstm_cell_backward
        gradients = lstm_cell_backward(da_next + da[:, :, t], dc_next, caches[t])
        # Store or add the gradient to the parameters' previous step's gradient
        dx[:, :, t] = gradients['dxt']
        dWf += gradients['dWf']   # accumulate parameter gradients across time steps
        dWi += gradients['dWi']
        dWc += gradients['dWc']
        dWo += gradients['dWo']
        dbf += gradients['dbf']
        dbi += gradients['dbi']
        dbc += gradients['dbc']
        dbo += gradients['dbo']
        dc_next = gradients['dc_prev']
        da_next = gradients['da_prev']

    # Set the first activation's gradient to the backpropagated gradient da_prev.
    da0 = gradients['da_prev']
    ### END CODE HERE ###

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients