Implementing DDQN and DQN with TensorFlow 2.0

Deep reinforcement learning: Double Deep Q-Learning and Deep Q-Learning implemented with TensorFlow 2.0

DQN implementation

First, build the network structure: a very simple model of three fully connected layers, with identically shaped eval and target networks.

from tensorflow.keras import layers, models


class Q_Network:
    def __init__(self, observation_n, action_n):
        self.observation_n = observation_n
        self.action_n = action_n
        self._build_model()

    def _build_model(self):
        # ------------------ build eval network -------------------------
        self.eval_model = models.Sequential(name="eval_network")
        self.eval_model.add(layers.Dense(64, activation="relu", input_shape=(self.observation_n,)))
        self.eval_model.add(layers.Dense(64, activation="relu"))
        self.eval_model.add(layers.Dense(self.action_n))

        # print(self.eval_model.summary())

        # ------------------ build target network ---------------------
        self.target_model = models.Sequential(name="target_network")
        self.target_model.add(layers.Dense(64, activation="relu", input_shape=(self.observation_n,)))
        self.target_model.add(layers.Dense(64, activation="relu"))
        self.target_model.add(layers.Dense(self.action_n))

        # print(self.target_model.summary())

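A quick sanity check of the two models (a minimal sketch, assuming the class above is saved as Network.py, as imported later, and using the CartPole-v1 dimensions of 4 observations and 2 actions):

import numpy as np
from Network import Q_Network

net = Q_Network(observation_n=4, action_n=2)
dummy_states = np.zeros((3, 4), dtype="float32")   # a batch of 3 fake observations
print(net.eval_model(dummy_states).shape)          # expected: (3, 2), one Q-value per action
print(net.target_model(dummy_states).shape)        # expected: (3, 2)
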
DQN agent

import tensorflow as tf
import numpy as np
from Network import Q_Network


class Deep_Q_Network:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None
    ):
        """

        :param n_actions: 动作种类个数
        :param n_features: 观察向量的维度
        :param learning_rate: 学习率
        :param reward_decay: 奖励衰减系数
        :param e_greedy: 贪心策略的epsilon
        :param replace_target_iter: 多少步替换依次权重
        :param memory_size: 内存表的大小
        :param batch_size: 神经网络训练的批次
        :param e_greedy_increment: 贪心选择策略中的epsilon的衰减
        """
        self.memory_counter = 0
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize replay memory: each row stores [s, a, r, done, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 3))

        self.q_target = Q_Network(observation_n=n_features, action_n=self.n_actions).target_model
        self.q_eval = Q_Network(observation_n=n_features, action_n=self.n_actions).eval_model
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)
        self.loss = tf.keras.losses.MeanSquaredError()

    def choose_action(self, state, eps=0.1):
        # epsilon-greedy action selection: with probability eps pick a random action
        state = tf.convert_to_tensor(state, dtype=tf.float32)
        if len(state.shape) == 1:
            state = tf.expand_dims(state, axis=0)
        if np.random.uniform() > eps:
            action_value = self.q_eval.predict(state)
            return np.argmax(action_value)
        else:
            return np.random.choice(np.arange(self.n_actions))

    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.soft_update(1)

        # sample batch memory from all memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)

        experience = self.memory[sample_index, :]
        states = np.array([e[:self.n_features] for e in experience]).astype("float32")
        actions = np.array([e[self.n_features] for e in experience]).astype("int32")
        rewards = np.array([e[self.n_features + 1] for e in experience]).astype("float32")
        next_states = np.array([e[-self.n_features:] for e in experience]).astype("float32")
        dones = np.array([e[self.n_features + 2] for e in experience]).astype("float32")

        q_target_values = self.q_target.predict(next_states)
        q_target_values = tf.reduce_max(q_target_values, axis=-1, keepdims=True)
        q_target_values = rewards + self.gamma * (1 - dones) * tf.squeeze(q_target_values)
        with tf.GradientTape() as tape:
            q_values = self.q_eval(states, training=True)
            enum_actions = list(enumerate(actions))
            q_values = tf.gather_nd(params=q_values, indices=enum_actions)
            loss = self.loss(q_target_values, q_values)

        grads = tape.gradient(loss, self.q_eval.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.q_eval.trainable_variables))
        print("gradient update")
        self.learn_step_counter += 1

    def store_transition(self, s, a, r, done, s_):
        transition = np.hstack((s, [a, r, done], s_))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def soft_update(self, tau):
        for target_param, local_param in zip(self.q_target.weights, self.q_eval.weights):
            target_param.assign(tau * local_param + (1. - tau) * target_param)

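For reference, store_transition packs each transition into one row of self.memory as [s, a, r, done, s_], and learn() slices it back apart. A minimal sketch of that layout for CartPole-v1 (n_features = 4), using made-up numbers:

import numpy as np

n_features = 4                                  # CartPole-v1 observation dimension
row = np.hstack((np.arange(4), [1, 1.0, 0.0], np.arange(4) + 10.0))
# row layout: [s0 s1 s2 s3 | a r done | s0' s1' s2' s3']
print(row[:n_features])       # state s
print(row[n_features])        # action a
print(row[n_features + 1])    # reward r
print(row[n_features + 2])    # done flag
print(row[-n_features:])      # next state s_
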
Main program

import gym
from DQN import Deep_Q_Network


def main():
    step = 0
    for episode in range(300):
        # initial observation
        observation = env.reset()

        while True:
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation and reward
            observation_, reward, done, info = env.step(action)

            RL.store_transition(observation, action, reward, done, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()
                RL.soft_update(1)

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1
        print("episode %d" % (episode + 1))
    # end of game
    print('game over')
    env.close()


def transformer_state(state):
    """将 position, velocity 通过线性转换映射到 [0, 40] 范围内"""
    pos, v = state
    pos_low, v_low = env.observation_space.low
    pos_high, v_high = env.observation_space.high
    pos = 40 * (pos - pos_low) / (pos_high - pos_low)
    v = 40 * (v - v_low) / (v_high - v_low)
    return int(pos), int(v)


if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    RL = Deep_Q_Network(n_actions=env.action_space.n, n_features=env.observation_space.shape[0])
    main()
    

DDQN algorithm

The network structure is the same as for DQN. DDQN differs from DQN only in how the TD target is computed, as sketched below; the full DDQN agent code follows.
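A minimal sketch of the two targets (assuming numpy is imported as np, and that q_eval, q_target, next_states, rewards, dones and gamma are defined as in the agent classes in this post, with predict returning numpy arrays):

# DQN target: the target network both selects and evaluates the next action
q_next_target = q_target.predict(next_states)                 # shape (batch, n_actions)
y_dqn = rewards + gamma * (1 - dones) * np.max(q_next_target, axis=1)

# DDQN target: the eval network selects the action, the target network evaluates it
best_actions = np.argmax(q_eval.predict(next_states), axis=1)
y_ddqn = rewards + gamma * (1 - dones) * q_next_target[np.arange(len(best_actions)), best_actions]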

import tensorflow as tf
import numpy as np
from Network import Q_Network


class Double_Deep_Q_Network:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None
    ):
        """

        :param n_actions: 动作种类个数
        :param n_features: 观察向量的维度
        :param learning_rate: 学习率
        :param reward_decay: 奖励衰减系数
        :param e_greedy: 贪心策略的epsilon
        :param replace_target_iter: 多少步替换依次权重
        :param memory_size: 内存表的大小
        :param batch_size: 神经网络训练的批次
        :param e_greedy_increment: 贪心选择策略中的epsilon的衰减
        """
        self.memory_counter = 0
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize replay memory: each row stores [s, a, r, done, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 3))

        self.q_target = Q_Network(observation_n=n_features, action_n=self.n_actions).target_model
        self.q_eval = Q_Network(observation_n=n_features, action_n=self.n_actions).eval_model
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)
        self.loss = tf.keras.losses.MeanSquaredError()

    def choose_action(self, state, eps=0.1):
        # epsilon-greedy action selection: with probability eps pick a random action
        state = tf.convert_to_tensor(state, dtype=tf.float32)
        if len(state.shape) == 1:
            state = tf.expand_dims(state, axis=0)
        if np.random.uniform() > eps:
            action_value = self.q_eval.predict(state)
            return np.argmax(action_value)
        else:
            return np.random.choice(np.arange(self.n_actions))

    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.soft_update(1)

        # sample batch memory from all memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)

        experience = self.memory[sample_index, :]
        states = np.array([e[:self.n_features] for e in experience]).astype("float32")
        actions = np.array([e[self.n_features] for e in experience]).astype("int32")
        rewards = np.array([e[self.n_features + 1] for e in experience]).astype("float32")
        next_states = np.array([e[-self.n_features:] for e in experience]).astype("float32")
        dones = np.array([e[self.n_features + 2] for e in experience]).astype("float32")

        q_target_values = self.q_target.predict(next_states)
        # action selection: this is the only place where the target differs from DQN --
        # the eval network picks the next action, the target network evaluates it
        q_eval_values = self.q_eval.predict(next_states)
        max_actions = np.argmax(q_eval_values, axis=1)
        enum_max_actions = list(enumerate(max_actions))
        q_target_values = tf.gather_nd(params=q_target_values, indices=enum_max_actions)
        q_target_values = rewards + self.gamma * (1 - dones) * tf.squeeze(q_target_values)

        # from here on the update is identical to DQN
        with tf.GradientTape() as tape:
            q_values = self.q_eval(states, training=True)
            enum_actions = list(enumerate(actions))
            q_values = tf.gather_nd(params=q_values, indices=enum_actions)
            loss = self.loss(q_target_values, q_values)

        grads = tape.gradient(loss, self.q_eval.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.q_eval.trainable_variables))
        self.learn_step_counter += 1

    # store_transition and soft_update are identical to the DQN implementation above
    def store_transition(self, s, a, r, done, s_):
        transition = np.hstack((s, [a, r, done], s_))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def soft_update(self, tau):
        for target_param, local_param in zip(self.q_target.weights, self.q_eval.weights):
            target_param.assign(tau * local_param + (1. - tau) * target_param)

The main program is also the same as for DQN.

import gym
from DDQN import Double_Deep_Q_Network


def main():
    step = 0
    for episode in range(300):
        # initial observation
        observation = env.reset()

        while True:
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation and reward
            observation_, reward, done, info = env.step(action)

            RL.store_transition(observation, action, reward, done, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()
                RL.soft_update(1)

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1
        print("episode %d" % (episode + 1))
    # end of game
    print('game over')
    env.close()


def transformer_state(state):
    """将 position, velocity 通过线性转换映射到 [0, 40] 范围内"""
    pos, v = state
    pos_low, v_low = env.observation_space.low
    pos_high, v_high = env.observation_space.high
    pos = 40 * (pos - pos_low) / (pos_high - pos_low)
    v = 40 * (v - v_low) / (v_high - v_low)
    return int(pos), int(v)


if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    RL = Double_Deep_Q_Network(n_actions=env.action_space.n, n_features=env.observation_space.shape[0])
    main()
