All Jupyter notebook files for Week 3 of Andrew Ng's Coursera Machine Learning Specialization, Machine Learning: Unsupervised Learning, Recommenders, Reinforcement Learning:
This assignment (Deep Q-Learning for the Lunar Lander)
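The graded cells below rely on setup done in the lab's earlier, ungraded cells. As a minimal sketch of that setup, assuming the standard Gym LunarLander-v2 environment the lab uses (exact import style may differ from the lab's own cell):

import tensorflow as tf
import gym
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MSE

env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape  # (8,) -- 8 state variables
num_actions = env.action_space.n          # 4 discrete actions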
Exercise 1
# UNQ_C1
# GRADED CELL
# Create the Q-Network
q_network = Sequential([
    ### START CODE HERE ###
    tf.keras.Input(shape=state_size),  # specify input size
    Dense(64, activation='relu', name='layer1'),
    Dense(64, activation='relu', name='layer2'),
    Dense(num_actions, activation='linear', name='layer3')
    ### END CODE HERE ###
])
# Create the target Q^-Network
target_q_network = Sequential([
    ### START CODE HERE ###
    tf.keras.Input(shape=state_size),  # specify input size
    Dense(64, activation='relu', name='layer1'),
    Dense(64, activation='relu', name='layer2'),
    Dense(num_actions, activation='linear', name='layer3')
    ### END CODE HERE ###
])
### START CODE HERE ###
optimizer = tf.keras.optimizers.Adam()  # Adam defaults to learning_rate=1e-3, matching the lab's learning-rate setting
### END CODE HERE ###
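For context, the lab keeps target_q_network tracking q_network: a hard copy of the weights at initialization, then a soft update after each learning step (the lab does this in its ungraded utility code). A minimal sketch of that rule, where TAU is an assumed small constant:

# Hard copy at the start of training so both networks agree initially.
target_q_network.set_weights(q_network.get_weights())

# Soft update: w_target <- TAU * w_q + (1 - TAU) * w_target.
TAU = 1e-3  # assumed value for illustration
for target_w, q_w in zip(target_q_network.weights, q_network.weights):
    target_w.assign(TAU * q_w + (1.0 - TAU) * target_w)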
Exercise 2
# UNQ_C2
# GRADED FUNCTION: compute_loss
def compute_loss(experiences, gamma, q_network, target_q_network):
    """
    Calculates the loss.

    Args:
      experiences: (tuple) tuple of ["state", "action", "reward", "next_state", "done"] namedtuples
      gamma: (float) The discount factor.
      q_network: (tf.keras.Sequential) Keras model for predicting the q_values
      target_q_network: (tf.keras.Sequential) Keras model for predicting the targets

    Returns:
      loss: (TensorFlow Tensor(shape=(), dtype=float32)) the Mean-Squared Error between
            the y targets and the Q(s,a) values.
    """
    # Unpack the mini-batch of experience tuples
    states, actions, rewards, next_states, done_vals = experiences

    # Compute max Q^(s,a) over the next states using the target network
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)

    # Set y = R if the episode terminates, otherwise set y = R + γ max Q^(s,a).
    ### START CODE HERE ###
    y_targets = rewards * done_vals + (rewards + gamma * max_qsa) * (1 - done_vals)
    ### END CODE HERE ###

    # Get the q_values for the actions actually taken in each experience
    q_values = q_network(states)
    q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))

    # Compute the loss (MSE takes y_true first, then y_pred)
    ### START CODE HERE ###
    loss = MSE(y_targets, q_values)
    ### END CODE HERE ###

    return loss
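To show where compute_loss fits, here is a minimal sketch of the learning step built around it, assuming the q_network, target_q_network, optimizer, and TAU defined above. The lab wires this up in an ungraded cell; the function name agent_learn and the inline soft update here are illustrative:

@tf.function
def agent_learn(experiences, gamma):
    """Run one gradient step on q_network using a mini-batch of experiences."""
    with tf.GradientTape() as tape:
        loss = compute_loss(experiences, gamma, q_network, target_q_network)

    # Backpropagate the loss through the Q-network only.
    gradients = tape.gradient(loss, q_network.trainable_variables)
    optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))

    # Softly move the target network toward the updated Q-network.
    for target_w, q_w in zip(target_q_network.weights, q_network.weights):
        target_w.assign(TAU * q_w + (1.0 - TAU) * target_w)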