import numpy as np

class DoubleDQN:
    def learn(self):
        # Same as natural DQN: periodically copy the eval net's parameters
        # to the target net
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')
        # Sample a random minibatch of transitions from replay memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]
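        # Memory layout per row, as implied by the slicing below:
        # [ s (n_features) | action | reward | s_ (n_features) ]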
        # This part differs from natural DQN: run the NEXT observation s_
        # through both networks at once -- q_next from the target net,
        # q_eval4next from the eval net
        q_next, q_eval4next = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={self.s_: batch_memory[:, -self.n_features:],  # next observation (target net)
                       self.s: batch_memory[:, -self.n_features:]})  # next observation (eval net)
        # Current observation s through the eval net, used as the base for the targets
        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)  # actions taken
        reward = batch_memory[:, self.n_features + 1]                  # rewards received
        if self.double_q:  # Double DQN
            max_act4next = np.argmax(q_eval4next, axis=1)        # actions the eval net rates highest for s_
            selected_q_next = q_next[batch_index, max_act4next]  # target net's value of the eval net's chosen actions
        else:  # natural DQN
            selected_q_next = np.max(q_next, axis=1)             # target net's own maximum
        q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next
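        # Double DQN target:  y = r + gamma * Q_target(s_, argmax_a Q_eval(s_, a))
        # Natural DQN target: y = r + gamma * max_a Q_target(s_, a)
        # Selecting the action with one net but evaluating it with the other
        # decouples selection from evaluation, which is what curbs the
        # overestimation bias of the plain max.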
        # The rest is the same as natural DQN: one gradient step on the eval net
        _, self.cost = self.sess.run(
            [self._train_op, self.loss],
            feed_dict={self.s: batch_memory[:, :self.n_features],
                       self.q_target: q_target})
        self.cost_his.append(self.cost)

        # Anneal exploration and advance the step counter
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
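
# A minimal standalone sketch with made-up numbers (independent of the class
# above) showing what the double_q branch buys: the eval net PICKS the action,
# but the target net SCORES it, so an action the eval net overrates does not
# inflate the target.
q_eval4next = np.array([[1.0, 3.0]])  # eval net's Q(s_, a): overrates action 1
q_next = np.array([[0.8, 0.2]])       # target net's Q(s_, a) for the same s_

batch_index = np.arange(1)
max_act4next = np.argmax(q_eval4next, axis=1)  # eval net selects action 1
double_q = q_next[batch_index, max_act4next]   # target net scores it: [0.2]
natural_q = np.max(q_next, axis=1)             # plain max would give: [0.8]
print(double_q, natural_q)                     # Double DQN uses the smaller, less biased value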