Deep Reinforcement Learning: Academic Frontiers and Practical Applications - DDQN

import numpy as np

class DoubleDQN:
    def learn(self):
        # Same as DQN: periodically copy the eval-network weights to the target network
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')

        # Same as DQN: sample a minibatch of transitions from replay memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        # This part differs from DQN:
        # q_next      = Q values of the next state s' from the target network
        # q_eval4next = Q values of the same next state s' from the eval network
        q_next, q_eval4next = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={self.s_: batch_memory[:, -self.n_features:],  # next observation (target net)
                       self.s: batch_memory[:, -self.n_features:]})  # next observation (eval net)
        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})

        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]

        if self.double_q:   # Double DQN
            # The eval network selects the greedy action for s' ...
            max_act4next = np.argmax(q_eval4next, axis=1)
            # ... and the target network evaluates the value of that action
            selected_q_next = q_next[batch_index, max_act4next]
        else:               # natural DQN
            # Natural DQN takes the max over the target network's own Q values
            selected_q_next = np.max(q_next, axis=1)
        q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next

        # Same as DQN below: gradient step, loss logging, epsilon schedule
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory[:, :self.n_features],
                                                self.q_target: q_target})
        self.cost_his.append(self.cost)
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
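The key difference between the two branches above is how the bootstrap value for the next state s' is chosen. Natural DQN uses max_a Q_target(s', a), so the same network both selects and evaluates the action, which tends to overestimate Q values; Double DQN lets the eval network select the action and the target network evaluate it, i.e. y = r + gamma * Q_target(s', argmax_a Q_eval(s', a)). The following standalone NumPy sketch, using made-up Q arrays and hyperparameters that are not part of the class above, just illustrates the two target rules side by side.

import numpy as np

# Assumed toy setup: a batch of 4 transitions, 3 discrete actions, gamma = 0.9
batch_size, gamma = 4, 0.9
rng = np.random.default_rng(0)
reward = rng.uniform(-1, 1, size=batch_size)            # r for each transition
q_next = rng.uniform(0, 1, size=(batch_size, 3))        # Q_target(s', .) from the target net
q_eval4next = rng.uniform(0, 1, size=(batch_size, 3))   # Q_eval(s', .) from the eval net
batch_index = np.arange(batch_size)

# Natural DQN: the target net both selects and evaluates the action
natural_target = reward + gamma * np.max(q_next, axis=1)

# Double DQN: the eval net selects the action, the target net evaluates it
max_act4next = np.argmax(q_eval4next, axis=1)
double_target = reward + gamma * q_next[batch_index, max_act4next]

print('natural DQN targets:', natural_target)
print('double  DQN targets:', double_target)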

 
